;******************************************************************************
;* VC1 deblocking optimizations
;* Copyright (c) 2009 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

cextern pw_4
cextern pw_5

section .text

; %1: punpck size suffix (bw), %2: dst_low, %3: dst_high (src), %4: zero
; zero-extends one vector from 8 to 16 bits
%macro UNPACK_8TO16 4
    mova      m%2, m%3
    punpckh%1 m%3, m%4
    punpckl%1 m%2, m%4
%endmacro

; store the four low words of %5 to the byte addresses %1-%4;
; %6 is the first pextrw index (sse4) or a scratch GPR (other CPUs)
%macro STORE_4_WORDS 6
%if cpuflag(sse4)
    pextrw %1, %5, %6+0
    pextrw %2, %5, %6+1
    pextrw %3, %5, %6+2
    pextrw %4, %5, %6+3
%else
    movd  %6d, %5
%if mmsize==16
    psrldq %5, 4
%else
    psrlq  %5, 32
%endif
    mov    %1, %6w
    shr    %6, 16
    mov    %2, %6w
    movd  %6d, %5
    mov    %3, %6w
    shr    %6, 16
    mov    %4, %6w
%endif
%endmacro

; in: p1 p0 q0 q1, clobbers p0
; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
%macro VC1_LOOP_FILTER_A0 4
    psubw  %1, %4
    psubw  %2, %3
    paddw  %1, %1
    pmullw %2, [pw_5]
    psubw  %1, %2
    paddw  %1, [pw_4]
    psraw  %1, 3
%endmacro

; in: p0 q0 a0 a1 a2
;     m0 m1 m7 m6 m5
; %1: size of filter (4 or 8 pixels)
; out: m0=p0' m1=q0'
%macro VC1_FILTER 1
    PABSW   m4, m7
    PABSW   m3, m6
    PABSW   m2, m5
    mova    m6, m4
    pminsw  m3, m2
    pcmpgtw m6, m3              ; if (a2 < a0 || a1 < a0)
    psubw   m3, m4
    pmullw  m3, [pw_5]          ; 5*(a3 - a0)
    PABSW   m2, m3
    psraw   m2, 3               ; abs(d/8)
    pxor    m7, m3              ; d_sign ^= a0_sign

    pxor    m5, m5
    movd    m3, r2d
%if %1 > 4
    punpcklbw m3, m3
%endif
    punpcklbw m3, m5
    pcmpgtw m3, m4              ; if (a0 < pq)
    pand    m6, m3

    mova    m3, m0
    psubw   m3, m1
    PABSW   m4, m3
    psraw   m4, 1
    pxor    m3, m7              ; d_sign ^ clip_sign
    psraw   m3, 15
    pminsw  m2, m4              ; min(d, clip)
    pcmpgtw m4, m5
    pand    m6, m4              ; filt3 (C return value)

; each set of 4 pixels is not filtered if the 3rd is not
%if mmsize==16
    pshuflw m4, m6, 0xaa
%if %1 > 4
    pshufhw m4, m4, 0xaa
%endif
%else
    pshufw  m4, m6, 0xaa
%endif
    pandn   m3, m4
    pand    m2, m6
    pand    m3, m2              ; d final

    psraw   m7, 15
    pxor    m3, m7
    psubw   m3, m7
    psubw   m0, m3
    paddw   m1, m3
    packuswb m0, m0
    packuswb m1, m1
%endmacro
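
; The two macros above together implement, per pixel column, roughly the
; following scalar logic (illustrative pseudocode only; the names P1..P8,
; a0..a3, d and clip are local to this comment and not taken from the sources;
; P4/P5 are the two pixels straddling the filtered edge):
;
;   a0   = (2*(P3 - P6) - 5*(P4 - P5) + 4) >> 3
;   a1   = (2*(P1 - P4) - 5*(P2 - P3) + 4) >> 3
;   a2   = (2*(P5 - P8) - 5*(P6 - P7) + 4) >> 3
;   a3   = min(|a1|, |a2|)
;   clip = |P4 - P5| >> 1
;   filter only if |a0| < pq, a3 < |a0| and clip != 0; then
;   d = |5 * (a3 - |a0|)| >> 3, carrying a sign derived from a0;
;   d is dropped if that sign disagrees with the sign of (P4 - P5), otherwise
;   it is clamped to clip and applied as P4 -= d, P5 += d.
;
; VC1_FILTER does this for 4 (MMX) or 8 (XMM) columns at once, with the extra
; rule (noted in the code) that each group of 4 columns is only filtered when
; its 3rd column passes the test.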

; 1st param: size of filter
; 2nd param: mov suffix matching the filter size (d for 4 pixels, q for 8)
%macro VC1_V_LOOP_FILTER 2
    pxor      m5, m5
    mov%2     m6, [r4]
    mov%2     m4, [r4+r1]
    mov%2     m7, [r4+2*r1]
    mov%2     m0, [r4+r3]
    punpcklbw m6, m5
    punpcklbw m4, m5
    punpcklbw m7, m5
    punpcklbw m0, m5

    VC1_LOOP_FILTER_A0 m6, m4, m7, m0
    mov%2     m1, [r0]
    mov%2     m2, [r0+r1]
    punpcklbw m1, m5
    punpcklbw m2, m5
    mova      m4, m0
    VC1_LOOP_FILTER_A0 m7, m4, m1, m2
    mov%2     m3, [r0+2*r1]
    mov%2     m4, [r0+r3]
    punpcklbw m3, m5
    punpcklbw m4, m5
    mova      m5, m1
    VC1_LOOP_FILTER_A0 m5, m2, m3, m4

    VC1_FILTER %1
    mov%2 [r4+r3], m0
    mov%2 [r0],    m1
%endmacro

; 1st param: size of filter
; NOTE: after UNPACK_8TO16 this many 8-bit values occupy half a register
; 2nd (optional) param: temp register to use for storing words
%macro VC1_H_LOOP_FILTER 1-2
%if %1 == 4
    movq      m0, [r0     -4]
    movq      m1, [r0+  r1-4]
    movq      m2, [r0+2*r1-4]
    movq      m3, [r0+  r3-4]
    TRANSPOSE4x4B 0, 1, 2, 3, 4
%else
    movq      m0, [r0     -4]
    movq      m4, [r0+  r1-4]
    movq      m1, [r0+2*r1-4]
    movq      m5, [r0+  r3-4]
    movq      m2, [r4     -4]
    movq      m6, [r4+  r1-4]
    movq      m3, [r4+2*r1-4]
    movq      m7, [r4+  r3-4]
    punpcklbw m0, m4
    punpcklbw m1, m5
    punpcklbw m2, m6
    punpcklbw m3, m7
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
    pxor      m5, m5

    UNPACK_8TO16 bw, 6, 0, 5
    UNPACK_8TO16 bw, 7, 1, 5
    VC1_LOOP_FILTER_A0 m6, m0, m7, m1
    UNPACK_8TO16 bw, 4, 2, 5
    mova      m0, m1              ; m0 = p0
    VC1_LOOP_FILTER_A0 m7, m1, m4, m2
    UNPACK_8TO16 bw, 1, 3, 5
    mova      m5, m4
    VC1_LOOP_FILTER_A0 m5, m2, m1, m3
    SWAP 1, 4                     ; m1 = q0

    VC1_FILTER %1
    punpcklbw m0, m1
%if %0 > 1
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
%if %1 > 4
    psrldq m0, 4
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
%endif
%else
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
%endif
%endmacro
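
; Register layout assumed by the filter macros and established by the START_*
; macros below (descriptive note derived from the code in this file):
;   r0 = src; the vertical filter reads 4 rows above the edge (via r4) and 4
;        rows below it, the horizontal filter reads 8 pixels starting at src-4
;        in each row
;   r1 = stride
;   r2 = pq, replicated into the four low bytes by the imul with 0x01010101 so
;        that a single movd + punpcklbw broadcasts it to every word lane
;   r3 = 3*stride
;   r4 = src - 4*stride (vertical) or src + 4*stride (horizontal, 8-pixel case)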

%macro START_V_FILTER 0
    mov  r4, r0
    lea  r3, [4*r1]
    sub  r4, r3
    lea  r3, [r1+2*r1]
    imul r2, 0x01010101
%endmacro

%macro START_H_FILTER 1
    lea  r3, [r1+2*r1]
%if %1 > 4
    lea  r4, [r0+4*r1]
%endif
    imul r2, 0x01010101
%endmacro

%macro VC1_LF 0
cglobal vc1_v_loop_filter_internal
    VC1_V_LOOP_FILTER 4, d
    ret

cglobal vc1_h_loop_filter_internal
    VC1_H_LOOP_FILTER 4, r4
    ret

; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal
    RET

; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal
    RET

; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal
    add  r4, 4
    add  r0, 4
    call vc1_v_loop_filter_internal
    RET

; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal
    lea  r0, [r0+4*r1]
    call vc1_h_loop_filter_internal
    RET
%endmacro

INIT_MMX mmxext
VC1_LF

INIT_XMM sse2
; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET

INIT_MMX ssse3
; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
    START_V_FILTER
    VC1_V_LOOP_FILTER 4, d
    RET

; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
    START_H_FILTER 4
    VC1_H_LOOP_FILTER 4, r4
    RET

INIT_XMM ssse3
; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET

INIT_XMM sse4
; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,5,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8
    RET
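
; Note: the sse4 variant passes no temp GPR to VC1_H_LOOP_FILTER because
; STORE_4_WORDS can use pextrw with an immediate word index on that CPU, so
; the extra scratch register is not needed.
;
; These entry points are expected to be wired into the matching VC1DSPContext
; function pointers by the x86 VC1 DSP init code based on the detected CPU
; flags, roughly as sketched below (illustrative only, not a verbatim copy of
; vc1dsp_init.c):
;
;   if (EXTERNAL_SSE2(cpu_flags)) {
;       dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2;
;       dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2;
;   }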