1;******************************************************************************
2;* VC1 deblocking optimizations
3;* Copyright (c) 2009 David Conrad
4;*
5;* This file is part of Libav.
6;*
7;* Libav is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* Libav is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with Libav; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "x86inc.asm"
23%include "x86util.asm"
24
25cextern pw_4
26cextern pw_5
27
28section .text
29
; %1: punpck size suffix (bw), %2: dst_low, %3: dst_high (src), %4: zero reg
; zero-extends one vector from 8 to 16 bits:
;   m%2 = zero-extended low  half of m%3
;   m%3 = zero-extended high half of m%3 (in place)
%macro UNPACK_8TO16 4
    mova      m%2, m%3
    punpckh%1 m%3, m%4          ; high elements interleaved with zeros
    punpckl%1 m%2, m%4          ; low  elements interleaved with zeros
%endmacro
37
; Scatter the 4 low words of vector %5 to the four (possibly unaligned)
; memory destinations %1..%4, going through GPR %6 (pass the bare register
; name; %6d/%6w select its 32/16-bit views).  Clobbers %5 and %6.
%macro STORE_4_WORDS_MMX 6
    movd  %6d, %5               ; grab words 0 and 1
%if mmsize==16
    psrldq %5, 4                ; advance to words 2-3 (xmm)
%else
    psrlq  %5, 32               ; advance to words 2-3 (mm)
%endif
    mov    %1, %6w              ; store word 0
    shr    %6, 16
    mov    %2, %6w              ; store word 1
    movd  %6d, %5               ; grab words 2 and 3
    mov    %3, %6w              ; store word 2
    shr    %6, 16
    mov    %4, %6w              ; store word 3
%endmacro
53
; Same effect as STORE_4_WORDS_MMX but via SSE4.1 pextrw-to-memory,
; needing no scratch GPR.  %5: source vector, %6: index of the first
; word to extract (words %6..%6+3 go to %1..%4).
%macro STORE_4_WORDS_SSE4 6
    pextrw %1, %5, %6+0
    pextrw %2, %5, %6+1
    pextrw %3, %5, %6+2
    pextrw %4, %5, %6+3
%endmacro
60
; in:  p1 p0 q0 q1 (word vectors), clobbers p0 (%2)
; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
; Reads pw_5/pw_4 constants from memory; result left in %1.
%macro VC1_LOOP_FILTER_A0 4
    psubw  %1, %4               ; p1 - q1
    psubw  %2, %3               ; p0 - q0
    paddw  %1, %1               ; 2*(p1 - q1)
    pmullw %2, [pw_5]           ; 5*(p0 - q0)
    psubw  %1, %2
    paddw  %1, [pw_4]           ; rounding bias before the shift
    psraw  %1, 3
%endmacro
72
; Core of the VC-1 deblocker: derive the clipped delta d from a0/a1/a2 and
; conditionally apply it to the pixel pair straddling the edge.
; in: p0 q0 a0 a1 a2
;     m0 m1 m7 m6 m5
; %1: size (number of edge pixels handled: 4 or 8)
; precondition: r2d holds the pq byte replicated (set up by START_*_FILTER)
; out: m0=p0' m1=q0', packed back down to bytes in the low half
%macro VC1_FILTER 1
    PABSW   m4, m7              ; m4 = |a0|
    PABSW   m3, m6              ; m3 = |a1|
    PABSW   m2, m5              ; m2 = |a2|
    mova    m6, m4              ; keep |a0|
    pminsw  m3, m2              ; m3 = a3 = min(|a1|, |a2|)
    pcmpgtw m6, m3  ; if (a2 < a0 || a1 < a0)
    psubw   m3, m4              ; a3 - |a0|
    pmullw  m3, [pw_5]   ; 5*(a3 - a0)
    PABSW   m2, m3
    psraw   m2, 3   ; abs(d/8)
    pxor    m7, m3  ; d_sign ^= a0_sign

    pxor    m5, m5
    movd    m3, r2d             ; replicated pq (bytes)
%if %1 > 4
    punpcklbw m3, m3            ; widen replication to 8 bytes
%endif
    punpcklbw m3, m5            ; zero-extend pq bytes to words
    pcmpgtw m3, m4  ; if (a0 < pq)
    pand    m6, m3

    mova    m3, m0
    psubw   m3, m1              ; p0 - q0
    PABSW   m4, m3
    psraw   m4, 1               ; clip = |p0 - q0| >> 1
    pxor    m3, m7  ; d_sign ^ clip_sign
    psraw   m3, 15              ; all-ones where the two signs differ
    pminsw  m2, m4  ; min(d, clip)
    pcmpgtw m4, m5              ; clip > 0
    pand    m6, m4  ; filt3 (C return value)

; each set of 4 pixels is not filtered if the 3rd is not
%if mmsize==16
    pshuflw m4, m6, 0xaa        ; broadcast word 2 of the low 4-word group
%if %1 > 4
    pshufhw m4, m4, 0xaa        ; ... and of the high group
%endif
%else
    pshufw  m4, m6, 0xaa
%endif
    pandn   m3, m4              ; only filter where d_sign and clip_sign agree
    pand    m2, m6
    pand    m3, m2  ; d final
    
    PSIGNW  m3, m7              ; give d its sign back
    psubw   m0, m3              ; p0' = p0 - d
    paddw   m1, m3              ; q0' = q0 + d
    packuswb m0, m0
    packuswb m1, m1
%endmacro
128
; Filter across a horizontal edge (vertical filter).
; preconditions (set up by START_V_FILTER): r0 = src (first row below the
; edge), r4 = src - 4*stride, r1 = stride, r3 = 3*stride, r2d = replicated pq.
; 1st param: size of filter
; 2nd param: mov suffix equivalent to the filter size (d = 4 px, q = 8 px)
%macro VC1_V_LOOP_FILTER 2
    pxor      m5, m5            ; zero reg for byte->word unpacking
    mov%2     m6, [r4]          ; 4 rows above the edge
    mov%2     m4, [r4+r1]
    mov%2     m7, [r4+2*r1]
    mov%2     m0, [r4+r3]       ; p0 (row just above the edge)
    punpcklbw m6, m5
    punpcklbw m4, m5
    punpcklbw m7, m5
    punpcklbw m0, m5

    VC1_LOOP_FILTER_A0 m6, m4, m7, m0   ; m6 = a1 (rows above)
    mov%2     m1, [r0]          ; q0 (row just below the edge)
    mov%2     m2, [r0+r1]
    punpcklbw m1, m5
    punpcklbw m2, m5
    mova      m4, m0            ; copy p0: A0 clobbers its 2nd arg
    VC1_LOOP_FILTER_A0 m7, m4, m1, m2   ; m7 = a0 (straddling the edge)
    mov%2     m3, [r0+2*r1]
    mov%2     m4, [r0+r3]
    punpcklbw m3, m5
    punpcklbw m4, m5
    mova      m5, m1            ; copy q0 likewise
    VC1_LOOP_FILTER_A0 m5, m2, m3, m4   ; m5 = a2 (rows below)

    VC1_FILTER %1
    mov%2 [r4+r3], m0           ; write back p0'
    mov%2 [r0],    m1           ; write back q0'
%endmacro
160
; Filter across a vertical edge (horizontal filter): load 4 or 8 rows of the
; 8 pixels straddling the edge, transpose so each register holds one column,
; run the same word-domain filter, then scatter p0'/q0' pairs back per row.
; preconditions (set up by START_H_FILTER): r0 = src, r1 = stride,
; r3 = 3*stride, r4 = src + 4*stride (8-tall case), r2d = replicated pq.
; 1st param: size of filter
;     NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register
; 2nd (optional) param: temp GPR for STORE_4_WORDS_MMX; when omitted the
;     SSE4.1 pextrw store path is used instead
%macro VC1_H_LOOP_FILTER 1-2
%if %1 == 4
    movq      m0, [r0     -4]   ; 4 rows x 8 bytes centred on the edge
    movq      m1, [r0+  r1-4]
    movq      m2, [r0+2*r1-4]
    movq      m3, [r0+  r3-4]
    TRANSPOSE4x4B 0, 1, 2, 3, 4
%else
    movq      m0, [r0     -4]   ; 8 rows x 8 bytes centred on the edge
    movq      m4, [r0+  r1-4]
    movq      m1, [r0+2*r1-4]
    movq      m5, [r0+  r3-4]
    movq      m2, [r4     -4]
    movq      m6, [r4+  r1-4]
    movq      m3, [r4+2*r1-4]
    movq      m7, [r4+  r3-4]
    punpcklbw m0, m4            ; interleave row pairs so a word transpose
    punpcklbw m1, m5            ; completes the 8x8 byte transpose
    punpcklbw m2, m6
    punpcklbw m3, m7
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
    pxor      m5, m5            ; zero reg for UNPACK_8TO16

    UNPACK_8TO16 bw, 6, 0, 5    ; columns to words, two columns per unpack
    UNPACK_8TO16 bw, 7, 1, 5
    VC1_LOOP_FILTER_A0 m6, m0, m7, m1   ; m6 = a1 (columns left of the edge)
    UNPACK_8TO16 bw, 4, 2, 5
    mova    m0, m1                      ; m0 = p0
    VC1_LOOP_FILTER_A0 m7, m1, m4, m2   ; m7 = a0 (straddling the edge)
    UNPACK_8TO16 bw, 1, 3, 5
    mova    m5, m4                      ; copy q0: A0 clobbers its 1st arg
    VC1_LOOP_FILTER_A0 m5, m2, m1, m3   ; m5 = a2 (columns right of the edge)
    SWAP 1, 4                           ; m1 = q0

    VC1_FILTER %1
    punpcklbw m0, m1            ; interleave to per-row (p0', q0') word pairs
%if %0 > 1
    STORE_4_WORDS_MMX [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
%if %1 > 4
    psrldq m0, 4                ; advance to the 4 lower rows
    STORE_4_WORDS_MMX [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
%endif
%else
    STORE_4_WORDS_SSE4 [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
    STORE_4_WORDS_SSE4 [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
%endif
%endmacro
212
213
; Common setup for the vertical filters (args: r0 = src, r1 = stride,
; r2 = pq).  On exit:
;   r4 = src - 4*stride (top of the filtered window)
;   r3 = 3*stride
;   r2d = pq replicated into all 4 low bytes
%macro START_V_FILTER 0
    mov  r4, r0
    lea  r3, [4*r1]
    sub  r4, r3
    lea  r3, [r1+2*r1]
    imul r2, 0x01010101         ; splat the pq byte
%endmacro
221
; Common setup for the horizontal filters; %1 = filter size.  On exit:
;   r3 = 3*stride
;   r4 = src + 4*stride (second 4-row half, 8-tall filters only)
;   r2d = pq replicated into all 4 low bytes
%macro START_H_FILTER 1
    lea  r3, [r1+2*r1]
%if %1 > 4
    lea  r4, [r0+4*r1]
%endif
    imul r2, 0x01010101         ; splat the pq byte
%endmacro
229
; psignw substitute for pre-SSSE3 CPUs; %2 is the sign source and is
; clobbered.
; NOTE(review): the psraw turns %2 into a full-width -1/0 mask before
; handing it to PSIGNW_MMX — presumably that emulation needs a mask, not
; just the sign bits; confirm against its definition in x86util.asm.
%macro PSIGNW_SRA_MMX 2
    psraw %2, 15
    PSIGNW_MMX %1, %2
%endmacro
235
236
; Instantiate the full set of MMX-width entry points for one CPU flavour;
; %1 = flavour suffix (mmx, mmx2) baked into the symbol names.
%macro VC1_LF_MMX 1
INIT_MMX
; shared 4-pixel filter bodies; the 8-pixel entry points call them twice
cglobal vc1_v_loop_filter_internal_%1
    VC1_V_LOOP_FILTER 4, d
    ret

cglobal vc1_h_loop_filter_internal_%1
    VC1_H_LOOP_FILTER 4, r4
    ret

; void ff_vc1_v_loop_filter4_<%1>(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4_%1, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal_%1
    RET

; void ff_vc1_h_loop_filter4_<%1>(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4_%1, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal_%1
    RET

; void ff_vc1_v_loop_filter8_<%1>(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8_%1, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal_%1
    add  r4, 4                  ; right half of the 8-pixel edge
    add  r0, 4
    call vc1_v_loop_filter_internal_%1
    RET

; void ff_vc1_h_loop_filter8_<%1>(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8_%1, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal_%1
    lea  r0, [r0+4*r1]          ; lower half of the 8-pixel edge
    call vc1_h_loop_filter_internal_%1
    RET
%endmacro
276
; baseline MMX: emulate both pabsw and psignw
%define PABSW PABSW_MMX
%define PSIGNW PSIGNW_SRA_MMX
VC1_LF_MMX mmx

; MMX2 flavour differs only in the PABSW emulation
%define PABSW PABSW_MMX2
VC1_LF_MMX mmx2
283
INIT_XMM
; SSE2: the 8-pixel filters run in one pass at xmm width
; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8_sse2, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8_sse2, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5     ; r5 = scratch GPR for the word stores
    RET
296
; SSSE3 and later use the native pabsw/psignw instructions
%define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3

INIT_MMX
; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4_ssse3, 3,5,0
    START_V_FILTER
    VC1_V_LOOP_FILTER 4, d
    RET

; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4_ssse3, 3,5,0
    START_H_FILTER 4
    VC1_H_LOOP_FILTER 4, r4
    RET
312
INIT_XMM
; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8_ssse3, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8_ssse3, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5     ; r5 = scratch GPR for the word stores
    RET
325
; SSE4.1: omitting the temp-GPR arg selects the pextrw store path, so one
; fewer register is needed than in the sse2/ssse3 versions
; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8_sse4, 3,5,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8
    RET
331