1;******************************************************************************
2;* FFT transform with SSE/3DNow optimizations
3;* Copyright (c) 2008 Loren Merritt
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22; These functions are not individually interchangeable with the C versions.
23; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
24; in blocks as convenient to the vector size.
25; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
26
27%include "x86inc.asm"
28
29SECTION_RODATA
30
31%define M_SQRT1_2 0.70710678118654752440
32ps_root2: times 4 dd M_SQRT1_2
33ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
34ps_m1p1: dd 1<<31, 0
35
36%assign i 16
37%rep 13
38cextern ff_cos_ %+ i
39%assign i i<<1
40%endrep
41
42%ifdef ARCH_X86_64
43    %define pointer dq
44%else
45    %define pointer dd
46%endif
47
48%macro IF0 1+
49%endmacro
50%macro IF1 1+
51    %1
52%endmacro
53
54section .text align=16
55
56%macro T2_3DN 4 ; z0, z1, mem0, mem1
57    mova     %1, %3
58    mova     %2, %1
59    pfadd    %1, %4
60    pfsub    %2, %4
61%endmacro
62
63%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
64    mova     %5, %3
65    pfsub    %3, %4
66    pfadd    %5, %4 ; {t6,t5}
67    pxor     %3, [ps_m1p1 GLOBAL] ; {t8,t7}
68    mova     %6, %1
69    pswapd   %3, %3
70    pfadd    %1, %5 ; {r0,i0}
71    pfsub    %6, %5 ; {r2,i2}
72    mova     %4, %2
73    pfadd    %2, %3 ; {r1,i1}
74    pfsub    %4, %3 ; {r3,i3}
75    SWAP     %3, %6
76%endmacro
77
78; in:  %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
79; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
80%macro T4_SSE 3
81    mova     %3, %1
82    shufps   %1, %2, 0x64 ; {r0,i0,r3,i2}
83    shufps   %3, %2, 0xce ; {r1,i1,r2,i3}
84    mova     %2, %1
85    addps    %1, %3       ; {t1,t2,t6,t5}
86    subps    %2, %3       ; {t3,t4,t8,t7}
87    mova     %3, %1
88    shufps   %1, %2, 0x44 ; {t1,t2,t3,t4}
89    shufps   %3, %2, 0xbe ; {t6,t5,t7,t8}
90    mova     %2, %1
91    addps    %1, %3       ; {r0,i0,r1,i1}
92    subps    %2, %3       ; {r2,i2,r3,i3}
93    mova     %3, %1
94    shufps   %1, %2, 0x88 ; {r0,r1,r2,r3}
95    shufps   %3, %2, 0xdd ; {i0,i1,i2,i3}
96    SWAP     %2, %3
97%endmacro
98
99%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
100    mova     %5, %3
101    shufps   %3, %4, 0x44 ; {r4,i4,r6,i6}
102    shufps   %5, %4, 0xee ; {r5,i5,r7,i7}
103    mova     %6, %3
104    subps    %3, %5       ; {r5,i5,r7,i7}
105    addps    %6, %5       ; {t1,t2,t3,t4}
106    mova     %5, %3
107    shufps   %5, %5, 0xb1 ; {i5,r5,i7,r7}
108    mulps    %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7}
109    mulps    %5, [ps_root2 GLOBAL]
110    addps    %3, %5       ; {t8,t7,ta,t9}
111    mova     %5, %6
112    shufps   %6, %3, 0x36 ; {t3,t2,t9,t8}
113    shufps   %5, %3, 0x9c ; {t1,t4,t7,ta}
114    mova     %3, %6
115    addps    %6, %5       ; {t1,t2,t9,ta}
116    subps    %3, %5       ; {t6,t5,tc,tb}
117    mova     %5, %6
118    shufps   %6, %3, 0xd8 ; {t1,t9,t5,tb}
119    shufps   %5, %3, 0x8d ; {t2,ta,t6,tc}
120    mova     %3, %1
121    mova     %4, %2
122    addps    %1, %6       ; {r0,r1,r2,r3}
123    addps    %2, %5       ; {i0,i1,i2,i3}
124    subps    %3, %6       ; {r4,r5,r6,r7}
125    subps    %4, %5       ; {i4,i5,i6,i7}
126%endmacro
127
128; scheduled for cpu-bound sizes
129%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
130IF%1 mova    m4, Z(4)
131IF%1 mova    m5, Z(5)
132    mova     m0, %2 ; wre
133    mova     m2, m4
134    mova     m1, %3 ; wim
135    mova     m3, m5
136    mulps    m2, m0 ; r2*wre
137IF%1 mova    m6, Z(6)
138    mulps    m3, m1 ; i2*wim
139IF%1 mova    m7, Z(7)
140    mulps    m4, m1 ; r2*wim
141    mulps    m5, m0 ; i2*wre
142    addps    m2, m3 ; r2*wre + i2*wim
143    mova     m3, m1
144    mulps    m1, m6 ; r3*wim
145    subps    m5, m4 ; i2*wre - r2*wim
146    mova     m4, m0
147    mulps    m3, m7 ; i3*wim
148    mulps    m4, m6 ; r3*wre
149    mulps    m0, m7 ; i3*wre
150    subps    m4, m3 ; r3*wre - i3*wim
151    mova     m3, Z(0)
152    addps    m0, m1 ; i3*wre + r3*wim
153    mova     m1, m4
154    addps    m4, m2 ; t5
155    subps    m1, m2 ; t3
156    subps    m3, m4 ; r2
157    addps    m4, Z(0) ; r0
158    mova     m6, Z(2)
159    mova   Z(4), m3
160    mova   Z(0), m4
161    mova     m3, m5
162    subps    m5, m0 ; t4
163    mova     m4, m6
164    subps    m6, m5 ; r3
165    addps    m5, m4 ; r1
166    mova   Z(6), m6
167    mova   Z(2), m5
168    mova     m2, Z(3)
169    addps    m3, m0 ; t6
170    subps    m2, m1 ; i3
171    mova     m7, Z(1)
172    addps    m1, Z(3) ; i1
173    mova   Z(7), m2
174    mova   Z(3), m1
175    mova     m4, m7
176    subps    m7, m3 ; i2
177    addps    m3, m4 ; i0
178    mova   Z(5), m7
179    mova   Z(1), m3
180%endmacro
181
182; scheduled to avoid store->load aliasing
183%macro PASS_BIG 1 ; (!interleave)
184    mova     m4, Z(4) ; r2
185    mova     m5, Z(5) ; i2
186    mova     m2, m4
187    mova     m0, [wq] ; wre
188    mova     m3, m5
189    mova     m1, [wq+o1q] ; wim
190    mulps    m2, m0 ; r2*wre
191    mova     m6, Z(6) ; r3
192    mulps    m3, m1 ; i2*wim
193    mova     m7, Z(7) ; i3
194    mulps    m4, m1 ; r2*wim
195    mulps    m5, m0 ; i2*wre
196    addps    m2, m3 ; r2*wre + i2*wim
197    mova     m3, m1
198    mulps    m1, m6 ; r3*wim
199    subps    m5, m4 ; i2*wre - r2*wim
200    mova     m4, m0
201    mulps    m3, m7 ; i3*wim
202    mulps    m4, m6 ; r3*wre
203    mulps    m0, m7 ; i3*wre
204    subps    m4, m3 ; r3*wre - i3*wim
205    mova     m3, Z(0)
206    addps    m0, m1 ; i3*wre + r3*wim
207    mova     m1, m4
208    addps    m4, m2 ; t5
209    subps    m1, m2 ; t3
210    subps    m3, m4 ; r2
211    addps    m4, Z(0) ; r0
212    mova     m6, Z(2)
213    mova   Z(4), m3
214    mova   Z(0), m4
215    mova     m3, m5
216    subps    m5, m0 ; t4
217    mova     m4, m6
218    subps    m6, m5 ; r3
219    addps    m5, m4 ; r1
220IF%1 mova  Z(6), m6
221IF%1 mova  Z(2), m5
222    mova     m2, Z(3)
223    addps    m3, m0 ; t6
224    subps    m2, m1 ; i3
225    mova     m7, Z(1)
226    addps    m1, Z(3) ; i1
227IF%1 mova  Z(7), m2
228IF%1 mova  Z(3), m1
229    mova     m4, m7
230    subps    m7, m3 ; i2
231    addps    m3, m4 ; i0
232IF%1 mova  Z(5), m7
233IF%1 mova  Z(1), m3
234%if %1==0
235    mova     m4, m5 ; r1
236    mova     m0, m6 ; r3
237    unpcklps m5, m1
238    unpckhps m4, m1
239    unpcklps m6, m2
240    unpckhps m0, m2
241    mova     m1, Z(0)
242    mova     m2, Z(4)
243    mova   Z(2), m5
244    mova   Z(3), m4
245    mova   Z(6), m6
246    mova   Z(7), m0
247    mova     m5, m1 ; r0
248    mova     m4, m2 ; r2
249    unpcklps m1, m3
250    unpckhps m5, m3
251    unpcklps m2, m7
252    unpckhps m4, m7
253    mova   Z(0), m1
254    mova   Z(1), m5
255    mova   Z(4), m2
256    mova   Z(5), m4
257%endif
258%endmacro
259
260%macro PUNPCK 3
261    mova      %3, %1
262    punpckldq %1, %2
263    punpckhdq %3, %2
264%endmacro
265
266INIT_XMM
267
268%define Z(x) [r0+mmsize*x]
269
270align 16
271fft4_sse:
272    mova     m0, Z(0)
273    mova     m1, Z(1)
274    T4_SSE   m0, m1, m2
275    mova   Z(0), m0
276    mova   Z(1), m1
277    ret
278
279align 16
280fft8_sse:
281    mova     m0, Z(0)
282    mova     m1, Z(1)
283    T4_SSE   m0, m1, m2
284    mova     m2, Z(2)
285    mova     m3, Z(3)
286    T8_SSE   m0, m1, m2, m3, m4, m5
287    mova   Z(0), m0
288    mova   Z(1), m1
289    mova   Z(2), m2
290    mova   Z(3), m3
291    ret
292
293align 16
294fft16_sse:
295    mova     m0, Z(0)
296    mova     m1, Z(1)
297    T4_SSE   m0, m1, m2
298    mova     m2, Z(2)
299    mova     m3, Z(3)
300    T8_SSE   m0, m1, m2, m3, m4, m5
301    mova     m4, Z(4)
302    mova     m5, Z(5)
303    mova   Z(0), m0
304    mova   Z(1), m1
305    mova   Z(2), m2
306    mova   Z(3), m3
307    T4_SSE   m4, m5, m6
308    mova     m6, Z(6)
309    mova     m7, Z(7)
310    T4_SSE   m6, m7, m0
311    PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL]
312    ret
313
314
315INIT_MMX
316
317%macro FFT48_3DN 1
318align 16
319fft4%1:
320    T2_3DN   m0, m1, Z(0), Z(1)
321    mova     m2, Z(2)
322    mova     m3, Z(3)
323    T4_3DN   m0, m1, m2, m3, m4, m5
324    PUNPCK   m0, m1, m4
325    PUNPCK   m2, m3, m5
326    mova   Z(0), m0
327    mova   Z(1), m4
328    mova   Z(2), m2
329    mova   Z(3), m5
330    ret
331
332align 16
333fft8%1:
334    T2_3DN   m0, m1, Z(0), Z(1)
335    mova     m2, Z(2)
336    mova     m3, Z(3)
337    T4_3DN   m0, m1, m2, m3, m4, m5
338    mova   Z(0), m0
339    mova   Z(2), m2
340    T2_3DN   m4, m5, Z(4), Z(5)
341    T2_3DN   m6, m7, Z(6), Z(7)
342    pswapd   m0, m5
343    pswapd   m2, m7
344    pxor     m0, [ps_m1p1 GLOBAL]
345    pxor     m2, [ps_m1p1 GLOBAL]
346    pfsub    m5, m0
347    pfadd    m7, m2
348    pfmul    m5, [ps_root2 GLOBAL]
349    pfmul    m7, [ps_root2 GLOBAL]
350    T4_3DN   m1, m3, m5, m7, m0, m2
351    mova   Z(5), m5
352    mova   Z(7), m7
353    mova     m0, Z(0)
354    mova     m2, Z(2)
355    T4_3DN   m0, m2, m4, m6, m5, m7
356    PUNPCK   m0, m1, m5
357    PUNPCK   m2, m3, m7
358    mova   Z(0), m0
359    mova   Z(1), m5
360    mova   Z(2), m2
361    mova   Z(3), m7
362    PUNPCK   m4, Z(5), m5
363    PUNPCK   m6, Z(7), m7
364    mova   Z(4), m4
365    mova   Z(5), m5
366    mova   Z(6), m6
367    mova   Z(7), m7
368    ret
369%endmacro
370
371FFT48_3DN _3dn2
372
373%macro pswapd 2
374%ifidn %1, %2
375    movd [r0+12], %1
376    punpckhdq %1, [r0+8]
377%else
378    movq  %1, %2
379    psrlq %1, 32
380    punpckldq %1, %2
381%endif
382%endmacro
383
384FFT48_3DN _3dn
385
386
387%define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)]
388
389%macro DECL_PASS 2+ ; name, payload
390align 16
391%1:
392DEFINE_ARGS z, w, n, o1, o3
393    lea o3q, [nq*3]
394    lea o1q, [nq*8]
395    shl o3q, 4
396.loop:
397    %2
398    add zq, mmsize*2
399    add wq, mmsize
400    sub nd, mmsize/8
401    jg .loop
402    rep ret
403%endmacro
404
405INIT_XMM
406DECL_PASS pass_sse, PASS_BIG 1
407DECL_PASS pass_interleave_sse, PASS_BIG 0
408
409INIT_MMX
410%define mulps pfmul
411%define addps pfadd
412%define subps pfsub
413%define unpcklps punpckldq
414%define unpckhps punpckhdq
415DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
416DECL_PASS pass_interleave_3dn, PASS_BIG 0
417%define pass_3dn2 pass_3dn
418%define pass_interleave_3dn2 pass_interleave_3dn
419
420
421%macro DECL_FFT 2-3 ; nbits, cpu, suffix
422%xdefine list_of_fft fft4%2, fft8%2
423%if %1==5
424%xdefine list_of_fft list_of_fft, fft16%2
425%endif
426
427%assign n 1<<%1
428%rep 17-%1
429%assign n2 n/2
430%assign n4 n/4
431%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2
432
433align 16
434fft %+ n %+ %3%2:
435    call fft %+ n2 %+ %2
436    add r0, n*4 - (n&(-2<<%1))
437    call fft %+ n4 %+ %2
438    add r0, n*2 - (n2&(-2<<%1))
439    call fft %+ n4 %+ %2
440    sub r0, n*6 + (n2&(-2<<%1))
441    lea r1, [ff_cos_ %+ n GLOBAL]
442    mov r2d, n4/2
443    jmp pass%3%2
444
445%assign n n*2
446%endrep
447%undef n
448
449%ifidn __OUTPUT_FORMAT__,macho64
450section .rodata
451%endif
452
453align 8
454dispatch_tab%3%2: pointer list_of_fft
455
456section .text
457
458; On x86_32, this function does the register saving and restoring for all of fft.
459; The others pass args in registers and don't spill anything.
460cglobal fft_dispatch%3%2, 2,5,0, z, nbits
461    lea r2, [dispatch_tab%3%2 GLOBAL]
462    mov r2, [r2 + (nbitsq-2)*gprsize]
463    call r2
464    RET
465%endmacro ; DECL_FFT
466
467DECL_FFT 5, _sse
468DECL_FFT 5, _sse, _interleave
469DECL_FFT 4, _3dn
470DECL_FFT 4, _3dn, _interleave
471DECL_FFT 4, _3dn2
472DECL_FFT 4, _3dn2, _interleave
473
474