;******************************************************************************
;* VC1 deblocking optimizations
;* Copyright (c) 2009 David Conrad
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

cextern pw_4
cextern pw_5

section .text

;------------------------------------------------------------------------------
; UNPACK_8TO16 suffix, dst_low, dst_high (src), zero
;   %1: punpck element suffix (appended to punpckh/punpckl)
;   %2: register number receiving the low half
;   %3: register number holding the source; receives the high half
;   %4: register number that must hold zero
; Zero-extends one vector from 8 to 16 bits: the source is copied to m%2,
; then each half is interleaved with m%4.
;------------------------------------------------------------------------------
%macro UNPACK_8TO16 4
    mova        m%2, m%3
    punpckh%1   m%3, m%4
    punpckl%1   m%2, m%4
%endmacro

;------------------------------------------------------------------------------
; STORE_4_WORDS_MMX dst0, dst1, dst2, dst3, vec, tmp
;   %1-%4: destinations for the four lowest 16-bit words of %5, in order
;   %5:    vector register holding the words (clobbered: shifted right 32 bits)
;   %6:    general purpose scratch register (clobbered)
; Words leave the vector two at a time through movd and are then split with a
; 16-bit shift of the scratch register.
;------------------------------------------------------------------------------
%macro STORE_4_WORDS_MMX 6
    movd    %6d, %5             ; scratch = words 0 and 1
%if mmsize==16
    psrldq  %5, 4               ; bring words 2 and 3 down (byte shift on xmm)
%else
    psrlq   %5, 32              ; bring words 2 and 3 down (bit shift on mmx)
%endif
    mov     %1, %6w
    shr     %6, 16
    mov     %2, %6w
    movd    %6d, %5             ; scratch = words 2 and 3
    mov     %3, %6w
    shr     %6, 16
    mov     %4, %6w
%endmacro

;------------------------------------------------------------------------------
; STORE_4_WORDS_SSE4 dst0, dst1, dst2, dst3, vec, first
;   %1-%4: destinations for four consecutive 16-bit words of %5
;   %5:    vector register holding the words (left unmodified)
;   %6:    index of the first word to extract
;------------------------------------------------------------------------------
%macro STORE_4_WORDS_SSE4 6
    pextrw  %1, %5, %6+0
    pextrw  %2, %5, %6+1
    pextrw  %3, %5, %6+2
    pextrw  %4, %5, %6+3
%endmacro

;------------------------------------------------------------------------------
; VC1_LOOP_FILTER_A0 p1, p0, q0, q1
; in:  p1 p0 q0 q1 as 16-bit words, clobbers p0
; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
;------------------------------------------------------------------------------
%macro VC1_LOOP_FILTER_A0 4
    psubw   %1, %4              ; p1 - q1
    psubw   %2, %3              ; p0 - q0
    paddw   %1, %1              ; 2*(p1 - q1)
    pmullw  %2, [pw_5]          ; 5*(p0 - q0)
    psubw   %1, %2
    paddw   %1, [pw_4]          ; rounding term
    psraw   %1, 3
%endmacro

; in: p0 q0 a0 a1 a2
;     m0 m1 m7 m6 m5
; %1: size
; out: m0=p0' m1=q0'
;------------------------------------------------------------------------------
; VC1_FILTER size
;   %1: number of pixels filtered (4 or 8 in this file's uses)
; Register contract on entry:
;   m0 = p0, m1 = q0, m7 = a0, m6 = a1, m5 = a2   (16-bit words)
;   r2d = pq, as prepared by START_V_FILTER / START_H_FILTER
; On exit m0 = p0' and m1 = q0', packed back to unsigned bytes.
; m2-m7 are used as scratch. PABSW and PSIGNW must be %define'd by the user
; of this macro to the variants matching the target instruction set.
;------------------------------------------------------------------------------
%macro VC1_FILTER 1
    PABSW       m4, m7
    PABSW       m3, m6
    PABSW       m2, m5
    mova        m6, m4
    pminsw      m3, m2
    pcmpgtw     m6, m3          ; if (a2 < a0 || a1 < a0)
    psubw       m3, m4
    pmullw      m3, [pw_5]      ; 5*(a3 - a0)
    PABSW       m2, m3
    psraw       m2, 3           ; abs(d/8)
    pxor        m7, m3          ; d_sign ^= a0_sign

    pxor        m5, m5
    movd        m3, r2d
%if %1 > 4
    punpcklbw   m3, m3          ; double the pq bytes so 8 words can be formed
%endif
    punpcklbw   m3, m5          ; pq bytes -> words
    pcmpgtw     m3, m4          ; if (a0 < pq)
    pand        m6, m3

    mova        m3, m0
    psubw       m3, m1
    PABSW       m4, m3
    psraw       m4, 1
    pxor        m3, m7          ; d_sign ^ clip_sign
    psraw       m3, 15
    pminsw      m2, m4          ; min(d, clip)
    pcmpgtw     m4, m5
    pand        m6, m4          ; filt3 (C return value)

; each set of 4 pixels is not filtered if the 3rd is not
%if mmsize==16
    pshuflw     m4, m6, 0xaa
%if %1 > 4
    pshufhw     m4, m4, 0xaa
%endif
%else
    pshufw      m4, m6, 0xaa
%endif
    pandn       m3, m4
    pand        m2, m6
    pand        m3, m2          ; d final

    PSIGNW      m3, m7
    psubw       m0, m3
    paddw       m1, m3
    packuswb    m0, m0
    packuswb    m1, m1
%endmacro

;------------------------------------------------------------------------------
; VC1_V_LOOP_FILTER size, movsuffix
;   1st param: size of filter
;   2nd param: mov suffix equivalent to the filter size
; Expects the register setup done by START_V_FILTER:
;   r0 = first row of the lower group, r4 = first row of the upper group,
;   r1 = stride, r3 = 3*stride.
; Reads four rows from r4 and four rows from r0, and writes the two filtered
; rows back to [r4+r3] and [r0].
;------------------------------------------------------------------------------
%macro VC1_V_LOOP_FILTER 2
    pxor        m5, m5
    mov%2       m6, [r4]
    mov%2       m4, [r4+r1]
    mov%2       m7, [r4+2*r1]
    mov%2       m0, [r4+r3]
    punpcklbw   m6, m5
    punpcklbw   m4, m5
    punpcklbw   m7, m5
    punpcklbw   m0, m5

    VC1_LOOP_FILTER_A0 m6, m4, m7, m0
    mov%2       m1, [r0]
    mov%2       m2, [r0+r1]
    punpcklbw   m1, m5
    punpcklbw   m2, m5
    mova        m4, m0          ; A0 clobbers its 2nd operand; m0 must survive
    VC1_LOOP_FILTER_A0 m7, m4, m1, m2
    mov%2       m3, [r0+2*r1]
    mov%2       m4, [r0+r3]
    punpcklbw   m3, m5
    punpcklbw   m4, m5
    mova        m5, m1          ; A0 overwrites its 1st operand; m1 must survive
    VC1_LOOP_FILTER_A0 m5, m2, m3, m4

    VC1_FILTER  %1
    mov%2       [r4+r3], m0
    mov%2       [r0], m1
%endmacro

;------------------------------------------------------------------------------
; VC1_H_LOOP_FILTER size [, tmp]
;   1st param: size of filter
;       NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register
;   2nd (optional) param: temp register to use for storing words
; Expects the register setup done by START_H_FILTER:
;   r0 = first row, r1 = stride, r3 = 3*stride, and for size > 4
;   r4 = fifth row.
; Loads 8 bytes per row starting 4 bytes left of r0/r4, transposes them,
; filters, and writes two bytes per row at offset -1. When the temp register
; is supplied the stores go through STORE_4_WORDS_MMX, otherwise through
; STORE_4_WORDS_SSE4.
;------------------------------------------------------------------------------
%macro VC1_H_LOOP_FILTER 1-2
%if %1 == 4
    movq        m0, [r0-4]
    movq        m1, [r0+r1-4]
    movq        m2, [r0+2*r1-4]
    movq        m3, [r0+r3-4]
    TRANSPOSE4x4B 0, 1, 2, 3, 4
%else
    movq        m0, [r0-4]
    movq        m4, [r0+r1-4]
    movq        m1, [r0+2*r1-4]
    movq        m5, [r0+r3-4]
    movq        m2, [r4-4]
    movq        m6, [r4+r1-4]
    movq        m3, [r4+2*r1-4]
    movq        m7, [r4+r3-4]
    punpcklbw   m0, m4
    punpcklbw   m1, m5
    punpcklbw   m2, m6
    punpcklbw   m3, m7
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
    pxor        m5, m5

    UNPACK_8TO16 bw, 6, 0, 5
    UNPACK_8TO16 bw, 7, 1, 5
    VC1_LOOP_FILTER_A0 m6, m0, m7, m1
    UNPACK_8TO16 bw, 4, 2, 5
    mova        m0, m1          ; m0 = p0
    VC1_LOOP_FILTER_A0 m7, m1, m4, m2
    UNPACK_8TO16 bw, 1, 3, 5
    mova        m5, m4
    VC1_LOOP_FILTER_A0 m5, m2, m1, m3
    SWAP        1, 4            ; m1 = q0

    VC1_FILTER  %1
    punpcklbw   m0, m1          ; interleave p0'/q0' bytes into one word per row
%if %0 > 1
    STORE_4_WORDS_MMX [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
%if %1 > 4
    psrldq      m0, 4           ; STORE_4_WORDS_MMX already shifted by 4 bytes
    STORE_4_WORDS_MMX [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
%endif
%else
    STORE_4_WORDS_SSE4 [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
    STORE_4_WORDS_SSE4 [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
%endif
%endmacro


;------------------------------------------------------------------------------
; START_V_FILTER
; in:  r0 = src, r1 = stride, r2 = pq
; out: r4 = r0 - 4*r1, r3 = 3*r1, r2 = pq * 0x01010101
;      (the multiply copies pq into four bytes, presumably assuming pq < 256)
;------------------------------------------------------------------------------
%macro START_V_FILTER 0
    mov     r4, r0
    lea     r3, [4*r1]
    sub     r4, r3
    lea     r3, [r1+2*r1]
    imul    r2, 0x01010101
%endmacro

;------------------------------------------------------------------------------
; START_H_FILTER size
;   %1: size of filter
; in:  r0 = src, r1 = stride, r2 = pq
; out: r3 = 3*r1, r2 = pq * 0x01010101, and for size > 4 r4 = r0 + 4*r1
;------------------------------------------------------------------------------
%macro START_H_FILTER 1
    lea     r3, [r1+2*r1]
%if %1 > 4
    lea     r4, [r0+4*r1]
%endif
    imul    r2, 0x01010101
%endmacro

; I do not know why the sign extension is needed...
;------------------------------------------------------------------------------
; PSIGNW_SRA_MMX dst, sign
;   %1: words to be processed by PSIGNW_MMX
;   %2: sign source; clobbered (every word becomes a copy of its sign bit)
; Arithmetic-shifts %2 right by 15 and then hands both operands to PSIGNW_MMX,
; which is defined outside this file.
;------------------------------------------------------------------------------
%macro PSIGNW_SRA_MMX 2
    psraw       %2, 15
    PSIGNW_MMX  %1, %2
%endmacro


;------------------------------------------------------------------------------
; VC1_LF_MMX suffix
;   %1: suffix appended to every function name emitted here (mmx, mmx2)
; Emits the 4-pixel internal helpers plus the public 4- and 8-pixel vertical
; and horizontal loop filters. The 8-pixel variants run the 4-pixel helper
; twice. PABSW and PSIGNW must be %define'd before the macro is instantiated.
;
; NOTE(review): the C prototypes quoted below declare stride as int while the
; filter macros address memory with the full-width r1; confirm that the upper
; half of r1 is well defined on x86-64.
;------------------------------------------------------------------------------
%macro VC1_LF_MMX 1
INIT_MMX
; Internal helpers: reached via call from the wrappers below, which have
; already set up r0-r4 through START_V_FILTER / START_H_FILTER.
cglobal vc1_v_loop_filter_internal_%1
    VC1_V_LOOP_FILTER 4, d
    ret

cglobal vc1_h_loop_filter_internal_%1
    VC1_H_LOOP_FILTER 4, r4
    ret

; void ff_vc1_v_loop_filter4_%1(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4_%1, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal_%1
    RET

; void ff_vc1_h_loop_filter4_%1(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4_%1, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal_%1
    RET

; void ff_vc1_v_loop_filter8_%1(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8_%1, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal_%1
    add     r4, 4               ; step both row pointers 4 pixels to the right
    add     r0, 4
    call vc1_v_loop_filter_internal_%1
    RET

; void ff_vc1_h_loop_filter8_%1(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8_%1, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal_%1
    lea     r0, [r0+4*r1]       ; step down 4 rows for the second half
    call vc1_h_loop_filter_internal_%1
    RET
%endmacro

; MMX instantiation.
%define PABSW  PABSW_MMX
%define PSIGNW PSIGNW_SRA_MMX
VC1_LF_MMX mmx

; MMX2 instantiation: only PABSW changes, PSIGNW stays as defined above.
%define PABSW  PABSW_MMX2
VC1_LF_MMX mmx2

; SSE2: 8 pixels per pass in xmm registers, PABSW/PSIGNW still as set above.
INIT_XMM
; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8_sse2, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8_sse2, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5     ; r5 is the scratch GPR for the word stores
    RET

; Everything below uses the SSSE3 forms of PABSW and PSIGNW.
%define PABSW  PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3

INIT_MMX
; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4_ssse3, 3,5,0
    START_V_FILTER
    VC1_V_LOOP_FILTER 4, d
    RET

; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4_ssse3, 3,5,0
    START_H_FILTER 4
    VC1_H_LOOP_FILTER 4, r4
    RET

INIT_XMM
; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8_ssse3, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8_ssse3, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET

; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
; No temp register is passed, so VC1_H_LOOP_FILTER stores with pextrw.
cglobal vc1_h_loop_filter8_sse4, 3,5,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8
    RET