1;*****************************************************************************
2;* x86-optimized functions for yadif filter
3;*
4;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
5;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
6;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
7;*
8;* This file is part of FFmpeg.
9;*
10;* FFmpeg is free software; you can redistribute it and/or
11;* modify it under the terms of the GNU Lesser General Public
12;* License as published by the Free Software Foundation; either
13;* version 2.1 of the License, or (at your option) any later version.
14;*
15;* FFmpeg is distributed in the hope that it will be useful,
16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18;* Lesser General Public License for more details.
19;*
20;* You should have received a copy of the GNU Lesser General Public
21;* License along with FFmpeg; if not, write to the Free Software
22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23;******************************************************************************
24
25%include "libavutil/x86/x86util.asm"
26
27SECTION_RODATA
28
; Broadcast constants, 16 bytes each, used as memory operands below.
pw_1:    times 8 dw 0x0001          ; word lanes of 1: isolates bit 0 in CHECK
pw_8000: times 8 dw 0x8000          ; word bias added back after packssdw in PACK
pd_1:    times 4 dd 0x00000001      ; dword lanes of 1: used by FILTER and CHECK2
pd_8000: times 4 dd 0x00008000      ; dword bias subtracted before packssdw in PACK
33
34SECTION .text
35
; PABS dst, tmp
; Replace every signed dword lane of dst with its absolute value.
; tmp is overwritten on the pre-SSSE3 path only; with SSSE3 it is untouched.
%macro PABS 2
%if notcpuflag(ssse3)
    pxor    %2, %2                  ; tmp = 0
    pcmpgtd %2, %1                  ; tmp = -1 in lanes where dst < 0, else 0
    pxor    %1, %2                  ; invert the bits of the negative lanes
    psubd   %1, %2                  ; ...and add 1 to them: two's-complement negate
%else
    pabsd   %1, %1
%endif
%endmacro
46
; PACK reg
; Narrow the signed dwords of reg to words with unsigned saturation
; (0..0xFFFF). reg is used as both pack operands, so the packed words appear
; in both halves of the result.
%macro PACK 1
%if notcpuflag(sse4)
    ; No packusdw before SSE4.1: bias into the signed range, pack with signed
    ; saturation, then undo the bias on the resulting words.
    psubd    %1, [pd_8000]
    packssdw %1, %1
    paddw    %1, [pw_8000]
%else
    packusdw %1, %1
%endif
%endmacro
56
; PMINSD dst, src, tmp
; dst = per-lane signed dword minimum of dst and src.
; tmp is clobbered only when the instruction has to be emulated (pre-SSE4.1).
%macro PMINSD 3
%if notcpuflag(sse4)
    mova    %3, %2
    pcmpgtd %3, %1                  ; tmp = mask of lanes where src > dst
    pand    %1, %3                  ; keep dst where it is the smaller value
    pandn   %3, %2                  ; take src in all remaining lanes
    por     %1, %3
%else
    pminsd  %1, %2
%endif
%endmacro
68
; PMAXSD dst, src, tmp
; dst = per-lane signed dword maximum of dst and src.
; tmp is clobbered only when the instruction has to be emulated (pre-SSE4.1).
%macro PMAXSD 3
%if notcpuflag(sse4)
    mova    %3, %1
    pcmpgtd %3, %2                  ; tmp = mask of lanes where dst > src
    pand    %1, %3                  ; keep dst where it is the larger value
    pandn   %3, %2                  ; take src in all remaining lanes
    por     %1, %3
%else
    pmaxsd  %1, %2
%endif
%endmacro
80
; PMAXUW dst, src
; dst = per-lane unsigned word maximum of dst and src.
%macro PMAXUW 2
%if notcpuflag(sse4)
    ; max(a, b) == sat_sub(a, b) + b for unsigned values
    psubusw %1, %2
    paddusw %1, %2
%else
    pmaxuw  %1, %2
%endif
%endmacro
89
; CHECK off1, off0
; Evaluate one diagonal candidate for the spatial prediction.
;   off1 = sample offset applied to the line at curq+t1
;   off0 = sample offset applied to the line at curq+t0
; (samples are 2 bytes wide, hence the *2 in the addresses)
;
; In:    m7 = 0 (zero source for the word->dword unpacks)
; Out:   m2 = candidate score as dwords: sum of the absolute differences of
;             three horizontally adjacent sample pairs
;        m5 = candidate prediction as dwords: truncating average of the middle
;             sample pair
; Clobb: m3, m4
%macro CHECK 2
    movu      m2, [curq+t1+%1*2]
    movu      m3, [curq+t0+%2*2]

    ; m5 = (a + b) >> 1. pavgw rounds up, so subtract the bit lost by
    ; rounding, which is (a ^ b) & 1.
    mova      m5, m2
    mova      m4, m2
    pavgw     m5, m3
    pxor      m4, m3
    pand      m4, [pw_1]
    psubusw   m5, m4
    RSHIFT    m5, 2                 ; drop one sample: the middle pair moves to lane 0
    punpcklwd m5, m7

    ; m2 = |a - b| per word, built from the two saturated differences
    mova      m4, m2
    psubusw   m2, m3
    psubusw   m3, m4
    PMAXUW    m2, m3

    ; Widen the difference at sample +0, +1 and +2 and add them up.
    mova      m4, m2
    mova      m3, m2
    RSHIFT    m4, 4
    RSHIFT    m3, 2
    punpcklwd m2, m7
    punpcklwd m4, m7
    punpcklwd m3, m7
    paddd     m2, m3
    paddd     m2, m4
%endmacro
115
; CHECK1
; Accept the candidate produced by CHECK in the lanes where it scores lower.
; In:    m0 = best score so far, m1 = current prediction,
;        m2 = candidate score,   m5 = candidate prediction
; Out:   m0 = min(m0, m2)
;        m1 = m5 where the candidate won, old m1 elsewhere
;        m6 = per-lane mask, all ones where the candidate won (read by CHECK2)
; Clobb: m3, m5
%macro CHECK1 0
    mova    m3, m0
    pcmpgtd m3, m2                  ; m3 = (best > candidate)
    pand    m5, m3                  ; candidate prediction, winners only
    PMINSD  m0, m2, m6              ; m6 is only a scratch register here
    mova    m6, m3                  ; publish the mask for CHECK2
    pandn   m3, m1                  ; old prediction in the losing lanes
    por     m3, m5
    mova    m1, m3
%endmacro
126
; CHECK2
; Same selection as CHECK1, but the candidate is only competitive in lanes
; where the preceding CHECK1 accepted its candidate.
; In:    m6 = mask left by CHECK1 (-1 = accepted, 0 = rejected),
;        m0/m1/m2/m5 as for CHECK1
; Out:   m0 = updated best score, m1 = updated prediction
; Clobb: m2, m3, m4, m5, m6
%macro CHECK2 0
    ; mask + 1 is 0 for accepted lanes and 1 for rejected ones; shifted left
    ; by 30 it becomes a 0x40000000 penalty added to the candidate score.
    paddd   m6, [pd_1]
    pslld   m6, 30
    paddd   m2, m6

    mova    m3, m0
    pcmpgtd m3, m2                  ; m3 = (best > penalised candidate)
    pand    m5, m3                  ; candidate prediction, winners only
    PMINSD  m0, m2, m4
    pandn   m3, m1                  ; old prediction in the losing lanes
    por     m3, m5
    mova    m1, m3
%endmacro
139
140; This version of CHECK2 has 3 fewer instructions on sets older than SSE4 but I
141; am not sure whether it is any faster.  A rewrite or refactor of the filter
142; code should make it possible to eliminate the move instruction at the end.  It
143; exists to satisfy the expectation that the "score" values are in m1.
144
145; %macro CHECK2 0
146;     mova    m3, m0
147;     pcmpgtd m0, m2
148;     pand    m0, m6
149;     mova    m6, m0
150;     pand    m5, m6
151;     pand    m2, m0
152;     pandn   m6, m1
153;     pandn   m0, m3
154;     por     m6, m5
155;     por     m0, m2
156;     mova    m1, m6
157; %endmacro
158
; LOAD dst, mem
; Load half a register's worth of 16-bit samples from mem and widen them to
; dwords by interleaving with m7, which the caller must have zeroed.
%macro LOAD 2
    movh        %1, %2
    punpcklwd   %1, m7
%endmacro
163
; FILTER id, ptrA, ptrB
; Emits the complete per-line loop. Every iteration produces mmsize/4 output
; samples (16 bits each), computed as dwords.
;   id         = suffix that keeps the .loop/.end labels of each expansion unique
;   ptrA, ptrB = base pointers of the two lines that are averaged into d
;                (YADIF passes prevq,curq or curq,nextq)
; t1 and t0 hold the byte offsets of the two neighbouring lines; YADIF loads
; them from the mrefs and prefs arguments respectively.
;
; Stack slots: [rsp+ 0] = c    = cur[t1]
;              [rsp+16] = d    = (ptrA[0] + ptrB[0]) >> 1
;              [rsp+32] = e    = cur[t0]
;              [rsp+48] = diff = temporal difference bound
%macro FILTER 3
.loop%1:
    pxor         m7, m7             ; zero for the unpacks; m7 is reused as scratch later
    LOAD         m0, [curq+t1]      ; c
    LOAD         m1, [curq+t0]      ; e
    mova   [rsp+ 0], m0
    mova   [rsp+32], m1
    LOAD         m2, [%2]
    LOAD         m3, [%3]
    mova         m4, m3
    paddd        m3, m2
    psrad        m3, 1              ; d
    mova   [rsp+16], m3

    ; diff = max of three temporal differences, each halved
    psubd        m2, m4
    PABS         m2, m4             ; |ptrA[0] - ptrB[0]|
    LOAD         m3, [prevq+t1]
    LOAD         m4, [prevq+t0]
    psubd        m3, m0
    psubd        m4, m1
    PABS         m3, m5
    PABS         m4, m5
    paddd        m3, m4             ; |prev[t1] - c| + |prev[t0] - e|
    psrld        m2, 1
    psrld        m3, 1
    PMAXSD       m2, m3, m6
    LOAD         m3, [nextq+t1]
    LOAD         m4, [nextq+t0]
    psubd        m3, m0
    psubd        m4, m1
    PABS         m3, m5
    PABS         m4, m5
    paddd        m3, m4             ; |next[t1] - c| + |next[t0] - e|
    psrld        m3, 1
    PMAXSD       m2, m3, m6
    mova   [rsp+48], m2

    ; Initial spatial prediction m1 = (c + e) >> 1 and its score in m0.
    paddd        m1, m0             ; c + e
    paddd        m0, m0
    psubd        m0, m1             ; 2c - (c + e) = c - e
    psrld        m1, 1
    PABS         m0, m2

    ; score = |c - e| + |c[-1] - e[-1]| + |c[+1] - e[+1]| - 1
    movu         m2, [curq+t1-1*2]
    movu         m3, [curq+t0-1*2]
    mova         m4, m2
    psubusw      m2, m3
    psubusw      m3, m4
    PMAXUW       m2, m3             ; word-wise |c[-1+k] - e[-1+k]|
    mova         m3, m2
    RSHIFT       m3, 4              ; skip two samples: the +1 neighbour
    punpcklwd    m2, m7
    punpcklwd    m3, m7
    paddd        m0, m2
    paddd        m0, m3
    psubd        m0, [pd_1]

    ; Try the diagonal candidates; CHECK2 depends on the CHECK1 before it.
    CHECK -2, 0
    CHECK1
    CHECK -3, 1
    CHECK2
    CHECK 0, -2
    CHECK1
    CHECK 1, -3
    CHECK2

    mova         m6, [rsp+48]       ; diff
    cmp DWORD modem, 2
    jge .end%1                      ; mode >= 2: skip the extra bound below

    ; Widen diff using the lines two steps away (2*t1 and 2*t0).
    LOAD         m2, [%2+t1*2]
    LOAD         m4, [%3+t1*2]
    LOAD         m3, [%2+t0*2]
    LOAD         m5, [%3+t0*2]
    paddd        m2, m4
    paddd        m3, m5
    psrld        m2, 1              ; b = average at 2*t1
    psrld        m3, 1              ; f = average at 2*t0
    mova         m4, [rsp+ 0]       ; c
    mova         m5, [rsp+16]       ; d
    mova         m7, [rsp+32]       ; e (m7 stops being zero from here on)
    psubd        m2, m4             ; b - c
    psubd        m3, m7             ; f - e
    mova         m0, m5
    psubd        m5, m4             ; d - c
    psubd        m0, m7             ; d - e
    mova         m4, m2
    PMINSD       m2, m3, m7         ; min(b - c, f - e)
    PMAXSD       m3, m4, m7         ; max(b - c, f - e)
    PMAXSD       m2, m5, m7
    PMINSD       m3, m5, m7
    PMAXSD       m2, m0, m7         ; m2 = max(d - e, d - c, min(b - c, f - e))
    PMINSD       m3, m0, m7         ; m3 = min(d - e, d - c, max(b - c, f - e))
    pxor         m4, m4
    PMAXSD       m6, m3, m7
    psubd        m4, m2
    PMAXSD       m6, m4, m7         ; diff = max(diff, m3, -m2)

.end%1:
    ; Clamp the prediction to [d - diff, d + diff], pack and store.
    mova         m2, [rsp+16]
    mova         m3, m2
    psubd        m2, m6
    paddd        m3, m6
    PMAXSD       m1, m2, m7
    PMINSD       m1, m3, m7
    PACK         m1
    movh     [dstq], m1

    ; Advance all four line pointers by the bytes just produced.
    add       prevq, mmsize/2
    add        curq, mmsize/2
    add       nextq, mmsize/2
    add        dstq, mmsize/2
    sub   DWORD wm, mmsize/4        ; samples left; must stay directly before jg
    jg .loop%1
%endmacro
277
; YADIF
; Emits yadif_filter_line_16bit for the instruction set chosen by the
; preceding INIT_* line.
; Arguments, in order: dst, prev, cur, next, w, prefs, mrefs, parity, mode.
; cglobal is asked to load the first four (the line pointers) into registers;
; w, parity and mode are read through their argument slots. 80 bytes of stack
; are requested for FILTER's spill slots.
%macro YADIF 0
%if ARCH_X86_32
cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
    ; Only six GPRs are requested, so prefs and mrefs go into r4/r5.
    mov            r4, r5mp
    mov            r5, r6mp
    DECLARE_REG_TMP 4,5
%else
cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
    ; Sign-extend the 32-bit offsets so they can be used in 64-bit addresses.
    movsxd         r5, DWORD r5m
    movsxd         r6, DWORD r6m
    DECLARE_REG_TMP 5,6
%endif
    ; From here on t0 = prefs and t1 = mrefs on both architectures.

    cmp DWORD paritym, 0
    je .parity0
    FILTER 1, prevq, curq           ; parity != 0: average prev and cur
    jmp .ret

.parity0:
    FILTER 0, curq, nextq           ; parity == 0: average cur and next

.ret:
    RET
%endmacro

; One copy of the function per supported instruction set.
INIT_XMM sse4
YADIF
INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF
; The MMX-register variant is only assembled for 32-bit x86.
%if ARCH_X86_32
INIT_MMX mmxext
YADIF
%endif
318