1;******************************************************************************
2;* VC1 deblocking optimizations
3;* Copyright (c) 2009 David Conrad
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
24cextern pw_4
25cextern pw_5
26
27section .text
28
; UNPACK_8TO16 suffix, dst_low, dst_high (src), zero
;   %1: punpck suffix selecting the element width (every caller here passes bw)
;   %2: number of the register that receives the low half
;   %3: number of the source register; it is overwritten with the high half
;   %4: number of a register holding zero
; Interleaving with a zero register zero-extends each element, so one vector
; of 8-bit values becomes two vectors of 16-bit values.
%macro UNPACK_8TO16 4
    mova      m%2, m%3      ; copy first: the source is overwritten below
    punpckl%1 m%2, m%4      ; low half  -> m%2
    punpckh%1 m%3, m%4      ; high half -> m%3, in place
%endmacro
36
; STORE_4_WORDS dst0, dst1, dst2, dst3, src, tmp_or_index
;   %1-%4: four word-sized memory destinations
;   %5:    vector register holding the words to store
;   %6:    SSE4 build:  constant index of the first word of %5 to store
;          other builds: scratch general-purpose register
; Without SSE4 the words pass through the scratch register two at a time and
; %5 is left shifted right by 32 bits as a side effect.
%macro STORE_4_WORDS 6
%if cpuflag(sse4)
    ; pextrw stores the selected word straight to memory
    pextrw %1, %5, %6+0
    pextrw %2, %5, %6+1
    pextrw %3, %5, %6+2
    pextrw %4, %5, %6+3
%else
    movd   %6d, %5          ; words 0 and 1 -> scratch
    mov    %1, %6w
    shr    %6, 16
    mov    %2, %6w
    ; bring words 2 and 3 down into the low dword of the vector
%if mmsize==16
    psrldq %5, 4
%else
    psrlq  %5, 32
%endif
    movd   %6d, %5          ; words 2 and 3 -> scratch
    mov    %3, %6w
    shr    %6, 16
    mov    %4, %6w
%endif
%endmacro
59
; VC1_LOOP_FILTER_A0 p1, p0, q0, q1
; in:  four distinct registers of 16-bit lanes; q0 and q1 are only read
; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
; clobbers p0 (left holding 5*(p0 - q0))
%macro VC1_LOOP_FILTER_A0 4
    ; inner pair
    psubw  %2, %3           ; p0 - q0
    pmullw %2, [pw_5]       ; 5*(p0 - q0)
    ; outer pair
    psubw  %1, %4           ; p1 - q1
    paddw  %1, %1           ; 2*(p1 - q1)
    ; combine, round, scale
    psubw  %1, %2
    paddw  %1, [pw_4]
    psraw  %1, 3
%endmacro
71
; Turns the three activity measures produced by VC1_LOOP_FILTER_A0 into a
; per-lane correction d and applies it to the two pixels next to the edge.
; All vectors hold signed 16-bit lanes.
; in: p0 q0 a0 a1 a2
;     m0 m1 m7 m6 m5
;     r2d = pq * 0x01010101 (set up by START_V_FILTER / START_H_FILTER)
; %1: size
; out: m0=p0' m1=q0', packed to bytes with unsigned saturation
; clobbers m2-m7
%macro VC1_FILTER 1
    PABSW   m4, m7  ; m4 = abs(a0)
    PABSW   m3, m6  ; m3 = abs(a1)
    PABSW   m2, m5  ; m2 = abs(a2)
    mova    m6, m4
    pminsw  m3, m2  ; a3 = min(abs(a1), abs(a2))
    pcmpgtw m6, m3  ; if (a2 < a0 || a1 < a0)
    psubw   m3, m4
    pmullw  m3, [pw_5]   ; 5*(a3 - a0)
    PABSW   m2, m3
    psraw   m2, 3   ; abs(d/8)
    pxor    m7, m3  ; d_sign ^= a0_sign

    ; build a vector with pq in every word lane
    pxor    m5, m5
    movd    m3, r2d
%if %1 > 4
    punpcklbw m3, m3 ; double each byte so 8 lanes can be filled instead of 4
%endif
    punpcklbw m3, m5 ; zero-extend bytes to words
    pcmpgtw m3, m4  ; if (a0 < pq)
    pand    m6, m3

    mova    m3, m0
    psubw   m3, m1  ; p0 - q0
    PABSW   m4, m3
    psraw   m4, 1   ; clip = abs(p0 - q0) >> 1
    pxor    m3, m7  ; d_sign ^ clip_sign
    psraw   m3, 15  ; spread the combined sign bit over the whole lane
    pminsw  m2, m4  ; min(d, clip)
    pcmpgtw m4, m5  ; clip > 0 (m5 is still zero)
    pand    m6, m4  ; filt3 (C return value)

; each set of 4 pixels is not filtered if the 3rd is not
; (0xaa copies word 2 of a group of four words into all four positions)
%if mmsize==16
    pshuflw m4, m6, 0xaa
%if %1 > 4
    pshufhw m4, m4, 0xaa
%endif
%else
    pshufw  m4, m6, 0xaa
%endif
    pandn   m3, m4  ; lanes whose sign mask is clear AND whose group's 3rd lane passed
    pand    m2, m6  ; zero the magnitude in lanes that failed their own tests
    pand    m3, m2  ; d final

    ; apply the sign in m7: (d ^ s) - s negates d where s is all ones
    psraw   m7, 15
    pxor    m3, m7
    psubw   m3, m7
    psubw   m0, m3  ; p0' = p0 - d
    paddw   m1, m3  ; q0' = q0 + d
    packuswb m0, m0 ; words -> bytes, unsigned saturation
    packuswb m1, m1
%endmacro
129
; Filters across an edge that lies between two rows: eight rows are read,
; the two rows adjacent to the edge are rewritten.
; 1st param: size of filter
; 2nd param: mov suffix equivalent to the filter size
; Registers expected on entry (as prepared by START_V_FILTER):
;   r0 = src (first row after the edge), r4 = src - 4*stride,
;   r1 = stride, r3 = 3*stride, r2d = pq * 0x01010101
; clobbers m0-m7; the general-purpose registers are left untouched
%macro VC1_V_LOOP_FILTER 2
    pxor      m5, m5            ; zero, used to widen bytes to words
    ; the four rows addressed through r4
    mov%2     m6, [r4]
    mov%2     m4, [r4+r1]
    mov%2     m7, [r4+2*r1]
    mov%2     m0, [r4+r3]       ; row next to the edge: p0 for VC1_FILTER
    punpcklbw m6, m5
    punpcklbw m4, m5
    punpcklbw m7, m5
    punpcklbw m0, m5

    VC1_LOOP_FILTER_A0 m6, m4, m7, m0   ; m6 = a1, from the four r4 rows
    mov%2     m1, [r0]          ; q0 for VC1_FILTER
    mov%2     m2, [r0+r1]
    punpcklbw m1, m5
    punpcklbw m2, m5
    mova      m4, m0            ; A0 clobbers its 2nd operand, so pass a copy of p0
    VC1_LOOP_FILTER_A0 m7, m4, m1, m2   ; m7 = a0, from the four rows around the edge
    mov%2     m3, [r0+2*r1]
    mov%2     m4, [r0+r3]
    punpcklbw m3, m5
    punpcklbw m4, m5
    mova      m5, m1            ; work on a copy so q0 survives in m1; the zero in m5 is no longer needed
    VC1_LOOP_FILTER_A0 m5, m2, m3, m4   ; m5 = a2, from the four r0 rows

    VC1_FILTER %1
    ; only the two rows adjacent to the edge are written back
    mov%2 [r4+r3], m0
    mov%2 [r0],    m1
%endmacro
161
; Filters across an edge that lies between two columns: for each row the
; 8 bytes starting at src-4 are read, and the two bytes at src-1 and src are
; rewritten.
; 1st param: size of filter
;     NOTE: this is also the number of 8-bit values UNPACK_8TO16 finds in
;     each half of a register
; 2nd (optional) param: temp register to use for storing words
;     (when omitted, STORE_4_WORDS is given constant word indices instead)
; Registers expected on entry (as prepared by START_H_FILTER):
;   r0 = src, r1 = stride, r3 = 3*stride, r2d = pq * 0x01010101,
;   and for sizes above 4 also r4 = src + 4*stride
; clobbers m0-m7 and the temp register, if one is given
%macro VC1_H_LOOP_FILTER 1-2
%if %1 == 4
    ; 4 rows of 8 bytes each
    movq      m0, [r0     -4]
    movq      m1, [r0+  r1-4]
    movq      m2, [r0+2*r1-4]
    movq      m3, [r0+  r3-4]
    TRANSPOSE4x4B 0, 1, 2, 3, 4
%else
    ; 8 rows: interleave pairs of rows bytewise, then transpose as words
    movq      m0, [r0     -4]
    movq      m4, [r0+  r1-4]
    movq      m1, [r0+2*r1-4]
    movq      m5, [r0+  r3-4]
    movq      m2, [r4     -4]
    movq      m6, [r4+  r1-4]
    movq      m3, [r4+2*r1-4]
    movq      m7, [r4+  r3-4]
    punpcklbw m0, m4
    punpcklbw m1, m5
    punpcklbw m2, m6
    punpcklbw m3, m7
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
    ; From how the halves are used below, each of m0-m3 now appears to hold
    ; two pixel columns, one per register half (the TRANSPOSE macros are
    ; defined outside this file).
    pxor      m5, m5

    UNPACK_8TO16 bw, 6, 0, 5
    UNPACK_8TO16 bw, 7, 1, 5
    VC1_LOOP_FILTER_A0 m6, m0, m7, m1   ; m6 = a1
    UNPACK_8TO16 bw, 4, 2, 5
    mova    m0, m1                      ; m0 = p0
    VC1_LOOP_FILTER_A0 m7, m1, m4, m2   ; m7 = a0
    UNPACK_8TO16 bw, 1, 3, 5
    mova    m5, m4                      ; work on a copy so q0 survives in m4
    VC1_LOOP_FILTER_A0 m5, m2, m1, m3   ; m5 = a2
    SWAP 1, 4                           ; m1 = q0

    VC1_FILTER %1
    punpcklbw m0, m1                    ; one (p0', q0') byte pair per row
%if %0 > 1
    ; a temp register was supplied: store through it
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
%if %1 > 4
    ; the non-SSE4 STORE_4_WORDS has already shifted m0 by 4 bytes; shift
    ; once more so the next unstored pair sits in the low dword
    psrldq m0, 4
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
%endif
%else
    ; no temp register: words 0-3 go to the r0 rows, words 4-7 to the r4 rows
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
%endif
%endmacro
213
214
; START_V_FILTER
; Register setup shared by every vertical-filter entry point.
; in:  r0 = src, r1d = stride, r2d = pq
; out: r1 = stride, sign-extended to pointer width
;      r3 = 3*stride
;      r4 = src - 4*stride
;      r2 = pq * 0x01010101 (pq copied into each byte of r2d, assuming pq
;           fits in one byte)
%macro START_V_FILTER 0
    ; The prototypes in this file declare stride as int. On x86-64 the upper
    ; half of a register carrying a 32-bit argument is unspecified, and r1 is
    ; used at full width in address arithmetic, so extend it first. This
    ; expands to nothing where r1 and r1d are the same register.
    movsxdifnidn r1, r1d
    mov  r4, r0
    lea  r3, [4*r1]
    sub  r4, r3
    lea  r3, [r1+2*r1]
    imul r2, 0x01010101
%endmacro
222
; START_H_FILTER size
; Register setup shared by every horizontal-filter entry point.
; %1:  size of the filter that will run
; in:  r0 = src, r1d = stride, r2d = pq
; out: r1 = stride, sign-extended to pointer width
;      r3 = 3*stride
;      r4 = src + 4*stride (only when size > 4; otherwise r4 is left alone)
;      r2 = pq * 0x01010101 (pq copied into each byte of r2d, assuming pq
;           fits in one byte)
%macro START_H_FILTER 1
    ; The prototypes in this file declare stride as int. On x86-64 the upper
    ; half of a register carrying a 32-bit argument is unspecified, and r1 is
    ; used at full width in address arithmetic, so extend it first. This
    ; expands to nothing where r1 and r1d are the same register.
    movsxdifnidn r1, r1d
    lea  r3, [r1+2*r1]
%if %1 > 4
    lea  r4, [r0+4*r1]
%endif
    imul r2, 0x01010101
%endmacro
230
; Emits the functions built from the 4-pixel filter bodies: two shared
; bodies plus the 4- and 8-pixel wrappers that call them. The 8-pixel
; wrappers run the 4-pixel body twice.
%macro VC1_LF 0
; Shared bodies. They are declared without argument/register counts, are
; reached with call from the wrappers below after START_*_FILTER has set up
; the registers, and return with a plain ret.
cglobal vc1_v_loop_filter_internal
    VC1_V_LOOP_FILTER 4, d
    ret

cglobal vc1_h_loop_filter_internal
    VC1_H_LOOP_FILTER 4, r4     ; r4 serves as the scratch register for the word stores
    ret

; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal
    RET

; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal
    RET

; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal     ; first 4 columns
    ; move both row pointers 4 bytes along the row
    add  r4, 4
    add  r0, 4
    call vc1_v_loop_filter_internal     ; next 4 columns
    RET

; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,5,0
    START_H_FILTER 4                    ; size 4: r4 is not set up, the body uses it as scratch
    call vc1_h_loop_filter_internal     ; first 4 rows
    lea  r0, [r0+4*r1]                  ; advance src by 4 rows
    call vc1_h_loop_filter_internal     ; next 4 rows
    RET
%endmacro

; instantiate the functions above for MMX registers with the mmxext flag set
INIT_MMX mmxext
VC1_LF
272
INIT_XMM sse2
; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
; Vertical filter over 8 columns in a single pass: each row is moved with an
; 8-byte (movq) load/store and widened to eight 16-bit lanes.
; cglobal counts: 3 arguments, 5 general-purpose registers (r0-r4),
; 8 vector registers (m0-m7).
cglobal vc1_v_loop_filter8, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET
279
; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
; Horizontal filter over 8 rows in a single pass.
; A sixth general-purpose register is requested because r4 holds
; src + 4*stride here, so r5 is passed as the scratch register for the
; word stores.
cglobal vc1_h_loop_filter8, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET
285
INIT_MMX ssse3
; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
; Vertical filter over 4 columns, with the macro body expanded inline
; instead of calling a shared internal function. It is the same source as
; the 4-pixel body in VC1_LF, assembled with the ssse3 flag set.
cglobal vc1_v_loop_filter4, 3,5,0
    START_V_FILTER
    VC1_V_LOOP_FILTER 4, d
    RET
292
; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
; Horizontal filter over 4 rows, expanded inline. START_H_FILTER leaves r4
; unset for size 4, so r4 is free to serve as the scratch register for the
; word stores.
cglobal vc1_h_loop_filter4, 3,5,0
    START_H_FILTER 4
    VC1_H_LOOP_FILTER 4, r4
    RET
298
INIT_XMM ssse3
; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
; Vertical filter over 8 columns in a single pass. It is the same source as
; the sse2 version, assembled with the ssse3 flag set.
cglobal vc1_v_loop_filter8, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET
305
; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
; Horizontal filter over 8 rows in a single pass. It is the same source as
; the sse2 version, assembled with the ssse3 flag set. r5 is the scratch
; register for the word stores because r4 holds src + 4*stride.
cglobal vc1_h_loop_filter8, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET
311
INIT_XMM sse4
; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
; Horizontal filter over 8 rows in a single pass. No scratch register is
; passed: with the sse4 flag set, STORE_4_WORDS extracts each word straight
; to memory with pextrw, so five general-purpose registers are enough.
cglobal vc1_h_loop_filter8, 3,5,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8
    RET
318