;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient for the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
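;
; As a rough illustration (SSE case, 4 floats per vector): a block of four
; FFTComplex values {r0,i0,r1,i1,r2,i2,r3,i3} is kept as the two vectors
;   [r0 r1 r2 r3] [i0 i1 i2 i3]
; and the interleave passes restore the C {re,im} ordering at the end.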

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

SECTION_RODATA 32

struc FFTContext
    .nbits:    resd 1
    .reverse:  resd 1
    .revtab:   pointer 1
    .tmpbuf:   pointer 1
    .mdctsize: resd 1
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
    .fftperm:  pointer 1
    .fftcalc:  pointer 1
    .imdctcalc:pointer 1
    .imdcthalf:pointer 1
endstruc
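; This struc must mirror the layout of the C FFTContext (libavcodec/fft.h)
; for every field accessed from assembly; keep the two in sync.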

%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509

ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8

ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0

perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1m1m1m1: times 4 dd 1<<31
ps_m1p1: dd 1<<31, 0

%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep
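; The loop above declares the external twiddle tables cos_16, cos_32, ...,
; cos_65536 (13 doublings starting at 16), which are defined in the C code.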

%if ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif

%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro
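; IF0/IF1 turn a macro parameter into a compile-time switch: "IF%1 mova ..."
; expands to the instruction when %1 is 1 and to nothing when %1 is 0
; (used by PASS_SMALL/PASS_BIG below).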

SECTION_TEXT

%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
    mova     %1, %3
    mova     %2, %1
    pfadd    %1, %4
    pfsub    %2, %4
%endmacro

%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova     %5, %3
    pfsub    %3, %4
    pfadd    %5, %4 ; {t6,t5}
    pxor     %3, [ps_m1p1] ; {t8,t7}
    mova     %6, %1
    PSWAPD   %3, %3
    pfadd    %1, %5 ; {r0,i0}
    pfsub    %6, %5 ; {r2,i2}
    mova     %4, %2
    pfadd    %2, %3 ; {r1,i1}
    pfsub    %4, %3 ; {r3,i3}
    SWAP     %3, %6
%endmacro

;  in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
;      %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
    vsubps     %5, %1, %2       ; v  = %1 - %2
    vaddps     %3, %1, %2       ; w  = %1 + %2
    vmulps     %2, %5, [ps_p1p1m1p1root2]  ; v *= vals1
    vpermilps  %2, %2, [perm1]
    vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
    vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
    vsubps     %4, %5, %1       ; s = r - q
    vaddps     %1, %5, %1       ; u = r + q
    vpermilps  %1, %1, [perm2]  ; k  = {u1,u2,u3,u4,u6,u5,u7,u8}
    vshufps    %5, %4, %1, 0xbb
    vshufps    %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1]  ; s *= {1,1,-1,-1,1,-1,-1,-1}
    vshufps    %2, %1, %4, 0xdd
    vshufps    %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02 ; v  = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13 ; w  = {k6,k8,s6,s8,k5,k7,s5,s7}
    vsubps     %5, %1, %3
    vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps     %2, %4, %1       ; %2 = v - w
    vaddps     %1, %4, %1       ; %1 = v + w
%endmacro

; In SSE mode do one fft4 transform
; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3
    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
    addps    %1, %1, %2       ; {t1,t2,t6,t5}
    xorps    %3, %3, [ps_p1p1m1p1]
    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
    subps    %3, %1, %2       ; {r2,i2,r3,i3}
    addps    %1, %1, %2       ; {r0,i0,r1,i1}
    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro

; In SSE mode do one FFT8
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
    addps    %6, %3, %4       ; {t1,t2,t3,t4}
    subps    %3, %3, %4       ; {r5,i5,r7,i7}
    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %4, %4, [ps_root2]
    addps    %3, %3, %4       ; {t8,t7,ta,t9}
    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
    subps    %3, %6, %4       ; {t6,t5,tc,tb}
    addps    %6, %6, %4       ; {t1,t2,t9,ta}
    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
    subps    %3, %1, %6       ; {r4,r5,r6,r7}
    addps    %1, %1, %6       ; {r0,r1,r2,r3}
    subps    %4, %2, %5       ; {i4,i5,i6,i7}
    addps    %2, %2, %5       ; {i0,i1,i2,i3}
%endmacro

; scheduled for cpu-bound sizes
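; Both PASS_SMALL and PASS_BIG compute the conjugate-pair split-radix
; butterfly; in effect, with w = wre + i*wim, a = z2*conj(w), b = z3*w:
;   z0' = z0 + (a+b)      z2' = z0 - (a+b)
;   z1' = z1 + i*(b-a)    z3' = z1 - i*(b-a)
; with real and imaginary parts kept in separate vectors throughout.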
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m1, %3 ; wim
    mulps    m2, m4, m0 ; r2*wre
IF%1 mova    m6, Z2(6)
    mulps    m3, m5, m1 ; i2*wim
IF%1 mova    m7, Z2(7)
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m1, m1, m6 ; r3*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
    mova  Z2(6), m4
    mova   Z(2), m3
    mova     m2, Z(3)
    addps    m3, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
    mova  Z2(7), m2
    mova   Z(3), m1
    subps    m4, m7, m3 ; i2
    addps    m3, m3, m7 ; i0
    mova   Z(5), m4
    mova   Z(1), m3
%endmacro

; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m0, [wq] ; wre
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m4, m0 ; r2*wre
    mova     m6, Z2(6) ; r3
    mulps    m3, m5, m1 ; i2*wim
    mova     m7, Z2(7) ; i3
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    mulps    m1, m1, m6 ; r3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
IF%1 mova Z2(6), m4
IF%1 mova  Z(2), m3
    mova     m2, Z(3)
    addps    m5, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
IF%1 mova Z2(7), m2
IF%1 mova  Z(3), m1
    subps    m6, m7, m5 ; i2
    addps    m5, m5, m7 ; i0
IF%1 mova  Z(5), m6
IF%1 mova  Z(1), m5
%if %1==0
    INTERL m1, m3, m7, Z, 2
    INTERL m2, m4, m0, Z2, 6

    mova     m1, Z(0)
    mova     m2, Z(4)

    INTERL m5, m1, m3, Z, 0
    INTERL m6, m2, m7, Z, 4
%endif
%endmacro

%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro

%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]
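
; For the standalone fft4/8/16/32 kernels below, r0 points at the in-place
; buffer and Z/Z2/ZH just index consecutive vectors (ZH being the high half
; of a slot); Z2 only becomes distinct after the redefinition further down.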

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
align 16
fft8_avx:
    mova      m0, Z(0)
    mova      m1, Z(1)
    T8_AVX    m0, m1, m2, m3, m4
    mova      Z(0), m0
    mova      Z(1), m1
    ret


align 16
fft16_avx:
    mova       m2, Z(2)
    mova       m3, Z(3)
    T4_SSE     m2, m3, m7

    mova       m0, Z(0)
    mova       m1, Z(1)
    T8_AVX     m0, m1, m4, m5, m7

    mova       m4, [ps_cos16_1]
    mova       m5, [ps_cos16_2]
    vmulps     m6, m2, m4
    vmulps     m7, m3, m5
    vaddps     m7, m7, m6
    vmulps     m2, m2, m5
    vmulps     m3, m3, m4
    vsubps     m3, m3, m2
    vblendps   m2, m7, m3, 0xf0
    vperm2f128 m3, m7, m3, 0x21
    vaddps     m4, m2, m3
    vsubps     m2, m3, m2
    vperm2f128 m2, m2, m2, 0x01
    vsubps     m3, m1, m2
    vaddps     m1, m1, m2
    vsubps     m5, m0, m4
    vaddps     m0, m0, m4
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    vextractf128   Z(2), m5, 0
    vextractf128  ZH(2), m3, 0
    vextractf128   Z(3), m5, 1
    vextractf128  ZH(3), m3, 1
    ret

align 16
fft32_avx:
    call fft16_avx

    mova m0, Z(4)
    mova m1, Z(5)

    T4_SSE      m0, m1, m4

    mova m2, Z(6)
    mova m3, Z(7)

    T8_SSE      m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}

    vperm2f128  m4, m0, m2, 0x20
    vperm2f128  m5, m1, m3, 0x20
    vperm2f128  m6, m0, m2, 0x31
    vperm2f128  m7, m1, m3, 0x31

    PASS_SMALL 0, [cos_32], [cos_32+32]

    ret

fft32_interleave_avx:
    call fft32_avx
    mov r2d, 32
.deint_loop:
    mova     m2, Z(0)
    mova     m3, Z(1)
    vunpcklps      m0, m2, m3
    vunpckhps      m1, m2, m3
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    add r0, mmsize*2
    sub r2d, mmsize/4
    jg .deint_loop
    ret

%endif

INIT_XMM sse

align 16
fft4_avx:
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova   Z(0), m0
    mova   Z(1), m1
    ret

align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    ret

align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z2(6)
    mova     m7, Z2(7)
    T4_SSE   m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret


%macro FFT48_3DNOW 0
align 16
fft4 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    PUNPCK   m0, m1, m4
    PUNPCK   m2, m3, m5
    mova   Z(0), m0
    mova   Z(1), m4
    mova   Z(2), m2
    mova   Z(3), m5
    ret

align 16
fft8 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(2), m2
    T2_3DNOW m4, m5,  Z(4),  Z(5)
    T2_3DNOW m6, m7, Z2(6), Z2(7)
    PSWAPD   m0, m5
    PSWAPD   m2, m7
    pxor     m0, [ps_m1p1]
    pxor     m2, [ps_m1p1]
    pfsub    m5, m0
    pfadd    m7, m2
    pfmul    m5, [ps_root2]
    pfmul    m7, [ps_root2]
    T4_3DNOW m1, m3, m5, m7, m0, m2
    mova   Z(5), m5
    mova  Z2(7), m7
    mova     m0, Z(0)
    mova     m2, Z(2)
    T4_3DNOW m0, m2, m4, m6, m5, m7
    PUNPCK   m0, m1, m5
    PUNPCK   m2, m3, m7
    mova   Z(0), m0
    mova   Z(1), m5
    mova   Z(2), m2
    mova   Z(3), m7
    PUNPCK   m4,  Z(5), m5
    PUNPCK   m6, Z2(7), m7
    mova   Z(4), m4
    mova   Z(5), m5
    mova  Z2(6), m6
    mova  Z2(7), m7
    ret
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnowext
FFT48_3DNOW

INIT_MMX 3dnow
FFT48_3DNOW
%endif

%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
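
; In the big passes Z(0/1), Z(2/3), Z(4/5) and Z2(6/7) address the four
; quarters of the transform buffer; zcq walks forward while the o1q/o3q
; strides (set up in DECL_PASS from the element count) select the quarters.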

%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS zc, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zcq, mmsize*2
    add  wq, mmsize
    sub  nd, mmsize/8
    jg .loop
    rep ret
%endmacro

%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH
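
; FFT_DISPATCH indexes dispatch_tab<suffix> with nbits-2 (the smallest entry
; is fft4) and calls the size-specific kernel; on PIC builds the section-
; relative table entry is rebased with the section start ($$) first.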

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
%macro INTERL_AVX 5
    vunpckhps      %3, %2, %1
    vunpcklps      %2, %2, %1
    vextractf128   %4(%5), %2, 0
    vextractf128  %4 %+ H(%5), %3, 0
    vextractf128   %4(%5 + 1), %2, 1
    vextractf128  %4 %+ H(%5 + 1), %3, 1
%endmacro

%define INTERL INTERL_AVX

DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0

cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET

%endif

INIT_XMM sse

%macro INTERL_SSE 5
    mova     %3, %2
    unpcklps %2, %1
    unpckhps %3, %1
    mova  %4(%5), %2
    mova  %4(%5+1), %3
%endmacro

%define INTERL INTERL_SSE

DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

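; ff_fft_calc_*(FFTContext *s, FFTComplex *z): in-place FFT through the
; interleave dispatch table. The smallest sizes use the plain fft4/fft8
; (and fft16) kernels, which leave data split as {re...}{im...}, so the
; tail loop below re-interleaves those cases in place before returning.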
%macro FFT_CALC_FUNC 0
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    PUSH    r1
    PUSH    r3
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP     rcx
    POP     r4
    cmp     rcx, 3+(mmsize/16)
    jg      .end
    mov     r2, -1
    add     rcx, 3
    shl     r2, cl
    sub     r4, r2
.loop:
%if mmsize == 8
    PSWAPD  m0, [r4 + r2 + 4]
    mova [r4 + r2 + 4], m0
%else
    movaps   xmm0, [r4 + r2]
    movaps   xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2],      xmm0
    movaps   [r4 + r2 + 16], xmm1
%endif
    add      r2, mmsize*2
    jl       .loop
.end:
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
FFT_CALC_FUNC
INIT_MMX 3dnowext
FFT_CALC_FUNC
%endif
INIT_XMM sse
FFT_CALC_FUNC

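; ff_fft_permute_sse(FFTContext *s, FFTComplex *z): applies the revtab
; bit-reversal permutation by scattering complex pairs into s->tmpbuf,
; then copying the permuted data back over z 32 bytes per iteration.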
cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
    mov     r5,  [r0 + FFTContext.tmpbuf]
    mov     ecx, [r0 + FFTContext.nbits]
    mov     r2, 1
    shl     r2, cl
    xor     r0, r0
%if ARCH_X86_32
    mov     r1, r1m
%endif
.loop:
    movaps  xmm0, [r1 + 8*r0]
    movzx   r6, word [r4 + 2*r0]
    movzx   r3, word [r4 + 2*r0 + 2]
    movlps  [r5 + 8*r6], xmm0
    movhps  [r5 + 8*r3], xmm0
    add     r0, 2
    cmp     r0, r2
    jl      .loop
    shl     r2, 3
    add     r1, r2
    add     r5, r2
    neg     r2
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
.loopcopy:
    movaps  xmm0, [r5 + r2]
    movaps  xmm1, [r5 + r2 + 16]
    movaps  [r1 + r2], xmm0
    movaps  [r1 + r2 + 16], xmm1
    add     r2, 32
    jl      .loopcopy
    REP_RET

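; ff_imdct_calc_*(FFTContext *s, FFTSample *output, const FFTSample *input):
; computes the full IMDCT as imdct_half (called through s->imdcthalf)
; followed by a loop that mirrors and sign-flips the half-size result
; across the rest of the output buffer.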
%macro IMDCT_CALC_FUNC 0
cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]
    mov     r4,  [r0 + FFTContext.imdcthalf]
    add     r1,  r3
    PUSH    r3
    PUSH    r1
%if ARCH_X86_32
    push    r2
    push    r1
    push    r0
%else
    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
%endif
    call    r4
%if ARCH_X86_32
    add     esp, 12
%else
    add     rsp, 8+32*WIN64
%endif
    POP     r1
    POP     r3
    lea     r0, [r1 + 2*r3]
    mov     r2, r3
    sub     r3, mmsize
    neg     r2
    mova    m2, [ps_m1m1m1m1]
.loop:
%if mmsize == 8
    PSWAPD  m0, [r1 + r3]
    PSWAPD  m1, [r0 + r2]
    pxor    m0, m2
%else
    mova    m0, [r1 + r3]
    mova    m1, [r0 + r2]
    shufps  m0, m0, 0x1b
    shufps  m1, m1, 0x1b
    xorps   m0, m2
%endif
    mova [r0 + r3], m1
    mova [r1 + r2], m0
    sub     r3, mmsize
    add     r2, mmsize
    jl      .loop
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
IMDCT_CALC_FUNC
INIT_MMX 3dnowext
IMDCT_CALC_FUNC
%endif

INIT_XMM sse
IMDCT_CALC_FUNC

%if ARCH_X86_32
INIT_MMX 3dnow
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define pass_3dnowext pass_3dnow
%define pass_interleave_3dnowext pass_interleave_3dnow
%endif

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif
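
; Under PIC the dispatch tables store each entry relative to the section
; start; FFT_DISPATCH adds $$ back at run time, avoiding text relocations.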
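; DECL_FFT nbits[, suffix]: emits the fft<N> entry points for N = 2^nbits
; through N = 65536 for the current instruction set, plus their dispatch
; table. Each generated fft<N> recurses as one fft<N/2> and two fft<N/4>
; over the buffer quarters, then tail-jumps into the combining pass with
; the cos_<N> twiddle table.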
%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
%else
%xdefine fullsuffix %2 %+ SUFFIX
%endif
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL

align 16
fft %+ n %+ fullsuffix:
    call fft %+ n2 %+ SUFFIX
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass %+ fullsuffix

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
%if ARCH_X86_32
INIT_MMX 3dnow
DECL_FFT 4
DECL_FFT 4, _interleave
INIT_MMX 3dnowext
DECL_FFT 4
DECL_FFT 4, _interleave
%endif

INIT_XMM sse
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

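; PREROTATER: pre-rotation step of imdct_half. It loads a mirrored pair of
; input vectors, complex-multiplies them by the split (tcos, tsin) tables
; and leaves the rotated pairs in m0/m2 (3DNow) or xmm0/xmm1 (SSE) for the
; revtab scatter in DECL_IMDCT.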
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
    PSWAPD     m0, [%3+%2*4]
    movq       m2, [%3+%1*4-8]
    movq       m3, m0
    punpckldq  m0, m2
    punpckhdq  m2, m3
    movd       m1, [%4+%1*2-4] ; tcos[j]
    movd       m3, [%4+%2*2]   ; tcos[n4-j-1]
    punpckldq  m1, [%5+%1*2-4] ; tsin[j]
    punpckldq  m3, [%5+%2*2]   ; tsin[n4-j-1]

    mova       m4, m0
    PSWAPD     m5, m1
    pfmul      m0, m1
    pfmul      m4, m5
    mova       m6, m2
    PSWAPD     m5, m3
    pfmul      m2, m3
    pfmul      m6, m5
%if cpuflag(3dnowext)
    pfpnacc    m0, m4
    pfpnacc    m2, m6
%else
    SBUTTERFLY dq, 0, 4, 1
    SBUTTERFLY dq, 2, 6, 3
    pxor       m4, m7
    pxor       m6, m7
    pfadd      m0, m4
    pfadd      m2, m6
%endif
%else
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    movaps   xmm2, xmm0
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]
    movaps   xmm2, xmm0
    movaps   xmm3, xmm1
    mulps    xmm0, xmm5
    mulps    xmm1, xmm4
    mulps    xmm2, xmm4
    mulps    xmm3, xmm5
    subps    xmm1, xmm0
    addps    xmm2, xmm3
    movaps   xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endif
%endmacro

%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    mulps      m6, %3, [%5+%1]
    mulps      m7, %2, [%5+%1]
    mulps      %2, %2, [%6+%1]
    mulps      %3, %3, [%6+%1]
    subps      %2, %2, m6
    addps      %3, %3, m7
%endmacro
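
; CMUL post-rotates the (%2, %3) register pair by the tables at %5/%6:
; %2' = %2*[%6+%1] - %3*[%5+%1] and %3' = %3*[%6+%1] + %2*[%5+%1],
; clobbering m6/m7 as scratch (%5 = tcos, %6 = tsin at the call sites).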

%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    vmovaps      ymm1,   [%3+%1*2]
    vmovaps      ymm0,   [%3+%1*2+0x20]
    vmovaps      ymm3,   [%3+%2*2]
    vmovaps      ymm2,   [%3+%2*2+0x20]

    CMUL         %1, ymm0, ymm1, %3, %4, %5
    CMUL         %2, ymm2, ymm3, %3, %4, %5
    vshufps      ymm1, ymm1, ymm1, 0x1b
    vshufps      ymm3, ymm3, ymm3, 0x1b
    vperm2f128   ymm1, ymm1, ymm1, 0x01
    vperm2f128   ymm3, ymm3, ymm3, 0x01
    vunpcklps    ymm6, ymm2, ymm1
    vunpckhps    ymm4, ymm2, ymm1
    vunpcklps    ymm7, ymm0, ymm3
    vunpckhps    ymm5, ymm0, ymm3

    vextractf128 [%3+%1*2],      ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1

    vextractf128 [%3+%2*2],      ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1
    sub      %2,   0x20
    add      %1,   0x20
    jl       .post
%endmacro

%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    movaps   xmm1, [%3+%1*2]
    movaps   xmm0, [%3+%1*2+0x10]
    CMUL     %1,   xmm0, xmm1, %3, %4, %5
    movaps   xmm5, [%3+%2*2]
    movaps   xmm4, [%3+%2*2+0x10]
    CMUL     %2,   xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub      %2,   0x10
    add      %1,   0x10
    jl       .post
%endmacro

%macro CMUL_3DNOW 6
    mova       m6, [%1+%2*2]
    mova       %3, [%1+%2*2+8]
    mova       %4, m6
    mova       m7, %3
    pfmul      m6, [%5+%2]
    pfmul      %3, [%6+%2]
    pfmul      %4, [%6+%2]
    pfmul      m7, [%5+%2]
    pfsub      %3, m6
    pfadd      %4, m7
%endmacro

%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    CMUL_3DNOW %3, %1, m0, m1, %4, %5
    CMUL_3DNOW %3, %2, m2, m3, %4, %5
    movd  [%3+%1*2+ 0], m0
    movd  [%3+%2*2+12], m1
    movd  [%3+%2*2+ 0], m2
    movd  [%3+%1*2+12], m3
    psrlq      m0, 32
    psrlq      m1, 32
    psrlq      m2, 32
    psrlq      m3, 32
    movd  [%3+%1*2+ 8], m0
    movd  [%3+%2*2+ 4], m1
    movd  [%3+%2*2+ 8], m2
    movd  [%3+%1*2+ 4], m3
    sub        %2, 8
    add        %1, 8
    jl         .post
%endmacro

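; imdct_half proceeds in three phases: the .pre loop applies PREROTATER to
; the input and scatters the rotated pairs through revtab straight into the
; output buffer, FFT_DISPATCH then runs the FFT in place on that buffer,
; and finally the POSROTATESHUF payload post-rotates and mirrors the result.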
%macro DECL_IMDCT 1
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r7
%define rtcos   r8
%define rtsin   r9
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov   r3d, [r0+FFTContext.mdctsize]
    add   r2, r3
    shr   r3, 1
    mov   rtcos, [r0+FFTContext.tcos]
    mov   rtsin, [r0+FFTContext.tsin]
    add   rtcos, r3
    add   rtsin, r3
%if ARCH_X86_64 == 0
    push  rtcos
    push  rtsin
%endif
    shr   r3, 1
    mov   rrevtab, [r0+FFTContext.revtab]
    add   rrevtab, r3
%if ARCH_X86_64 == 0
    push  rrevtab
%endif

%if mmsize == 8
    sub   r3, 2
%else
    sub   r3, 4
%endif
%if ARCH_X86_64 || mmsize == 8
    xor   r4, r4
    sub   r4, r3
%endif
%if notcpuflag(3dnowext) && mmsize == 8
    movd  m7, [ps_m1p1]
%endif
.pre:
%if ARCH_X86_64 == 0
;unspill
%if mmsize != 8
    xor   r4, r4
    sub   r4, r3
%endif
    mov   rtcos, [esp+8]
    mov   rtsin, [esp+4]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%if mmsize == 8
    mov    r6, [esp]                ; rrevtab = ptr+n8
    movzx  r5,  word [rrevtab+r4-2] ; rrevtab[j]
    movzx  r6,  word [rrevtab+r3]   ; rrevtab[n4-j-1]
    mova [r1+r5*8], m0
    mova [r1+r6*8], m2
    add    r4, 2
    sub    r3, 2
%else
%if ARCH_X86_64
    movzx  r5,  word [rrevtab+r4-4]
    movzx  r6,  word [rrevtab+r4-2]
    movzx  r10, word [rrevtab+r3]
    movzx  r11, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r10*8], xmm1
    movhps [r1+r11*8], xmm1
    add    r4, 4
%else
    mov    r6, [esp]
    movzx  r5, word [r6+r4-4]
    movzx  r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx  r5, word [r6+r3]
    movzx  r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub    r3, 4
%endif
    jns    .pre

    mov  r5, r0
    mov  r6, r1
    mov  r0, r1
    mov  r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH SUFFIX, r1

    mov  r0d, [r5+FFTContext.mdctsize]
    add  r6, r0
    shr  r0, 1
%if ARCH_X86_64 == 0
%define rtcos r2
%define rtsin r3
    mov  rtcos, [esp+8]
    mov  rtsin, [esp+4]
%endif
    neg  r0
    mov  r1, -mmsize
    sub  r1, r0
    %1 r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
    add esp, 12
%endif
%if mmsize == 8
    femms
%endif
    RET
%endmacro

DECL_IMDCT POSROTATESHUF

%if ARCH_X86_32
INIT_MMX 3dnow
DECL_IMDCT POSROTATESHUF_3DNOW

INIT_MMX 3dnowext
DECL_IMDCT POSROTATESHUF_3DNOW
%endif

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
DECL_IMDCT POSROTATESHUF_AVX
%endif
