/*
 * FFT transform with Altivec optimizations
 * Copyright (c) 2009 Loren Merritt
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * These functions are not individually interchangeable with the C versions.
 * While the C code works on arrays of FFTComplex, the Altivec code leaves
 * intermediate results in blocks sized to match the vector registers,
 * i.e. {4x real, 4x imaginary, 4x real, ...}
 *
 * These routines ignore the standard calling convention.
 * Instead, the following registers are treated as global constants:
 * v14: zero
 * v15..v18: cosines
 * v19..v29: permutations
 * r9: 16
 * r12: ff_cos_tabs
 * and the rest are free for local use.
 */
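
/*
 * For illustration: where the C code would hold the first eight values as
 *   FFTComplex z[8] = { {r0,i0}, {r1,i1}, ..., {r7,i7} }
 * the blocked layout used here stores the same data as
 *   { r0,r1,r2,r3,  i0,i1,i2,i3,  r4,r5,r6,r7,  i4,i5,i6,i7 }
 * i.e. one vector of four reals followed by one vector of four imaginaries.
 */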

#include "config.h"
#include "asm.S"

.text

.macro addi2 ra, imm // add 32-bit immediate
.if \imm & 0xffff
    addi \ra, \ra, \imm@l
.endif
.if (\imm+0x8000)>>16
    addis \ra, \ra, \imm@ha
.endif
.endm
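
/*
 * e.g. "addi2 r3, 0x12348000" expands to
 *     addi  r3, r3, 0x12348000@l    // adds -0x8000 (low half, sign-extended)
 *     addis r3, r3, 0x12348000@ha   // adds 0x1235 << 16, compensating for
 *                                   // the sign extension of the low half
 * Each instruction is skipped when its half of the immediate is zero.
 */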

.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vaddfp  \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vaddfp  \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
.endm
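
// FFT4 takes four complex values in interleaved order across \a0/\a1 and
// leaves the result split into \a2 = {r0..r3} and \a3 = {i0..i3}.
// FFT4x2 below is the same butterfly applied to two independent blocks,
// with the operations interleaved so they can overlap in the pipeline.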

.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vperm   \b2,\b0,\b1,v20
    vperm   \b3,\b0,\b1,v21
    vaddfp  \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vmrghw  \b2,\b0,\b1
    vperm   \b3,\b0,\b1,v22
    vaddfp  \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
    vperm   \b2,\b0,\b1,v23
    vperm   \b3,\b0,\b1,v24
.endm
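
// FFT8: the first half (\a0,\a1) goes through the FFT4 butterfly above, the
// second half (\b0,\b1) is butterflied and rotated by the 1/sqrt(2) twiddle
// constants in v17/v18, and the two halves are then combined into
// {r0..r3},{i0..i3} (\a0,\a1) and {r4..r7},{i4..i7} (\b0,\b1).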

.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
    vmrghw  \b2,\b0,\b1     // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
    vmrglw  \b3,\b0,\b1     // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
    vperm   \a2,\a0,\a1,v20         // FFT4 ...
    vperm   \a3,\a0,\a1,v21
    vaddfp  \b0,\b2,\b3                         // {t1,t3,t2,t4}
    vsubfp  \b1,\b2,\b3                         // {r5,r7,i5,i7}
    vperm   \b4,\b1,\b1,v25 // vcprm(2,3,0,1)   // {i5,i7,r5,r7}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
    vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
    vmrghw  \a2,\a0,\a1
    vperm   \a3,\a0,\a1,v22
    vperm   \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
    vperm   \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vaddfp  \b0,\b2,\b3                         // {t1,t2,t9,ta}
    vsubfp  \b1,\b2,\b3                         // {t6,t5,tc,tb}
    vperm   \a2,\a0,\a1,v23
    vperm   \a3,\a0,\a1,v24
    vperm   \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
    vperm   \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
    vsubfp  \b0,\a2,\b2                         // {r4,r5,r6,r7}
    vsubfp  \b1,\a3,\b3                         // {i4,i5,i6,i7}
    vaddfp  \a0,\a2,\b2                         // {r0,r1,r2,r3}
    vaddfp  \a1,\a3,\b3                         // {i0,i1,i2,i3}
.endm

.macro BF d0,d1,s0,s1 // butterfly: \d0 = \s0 + \s1, \d1 = \s0 - \s1
    vsubfp  \d1,\s0,\s1
    vaddfp  \d0,\s0,\s1
.endm

.macro zip d0,d1,s0,s1 // re-interleave a reals/imaginaries vector pair
    vmrghw  \d0,\s0,\s1
    vmrglw  \d1,\s0,\s1
.endm
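
// def_fft4/def_fft8/def_fft16 emit the leaf transforms. When \interleave is
// non-empty the result is zipped back into interleaved {re,im} order before
// being stored; otherwise it is left in the blocked format.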

.macro def_fft4 interleave
fft4\interleave\()_altivec:
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    FFT4   v0,v1,v2,v3
.ifnb \interleave
    zip    v0,v1,v2,v3
    stvx   v0, 0,r3
    stvx   v1,r9,r3
.else
    stvx   v2, 0,r3
    stvx   v3,r9,r3
.endif
    blr
.endm

.macro def_fft8 interleave
fft8\interleave\()_altivec:
    addi   r4,r3,32
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    lvx    v2, 0,r4
    lvx    v3,r9,r4
    FFT8   v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
    zip    v4,v5,v0,v1
    zip    v6,v7,v2,v3
    stvx   v4, 0,r3
    stvx   v5,r9,r3
    stvx   v6, 0,r4
    stvx   v7,r9,r4
.else
    stvx   v0, 0,r3
    stvx   v1,r9,r3
    stvx   v2, 0,r4
    stvx   v3,r9,r4
.endif
    blr
.endm

.macro def_fft16 interleave
fft16\interleave\()_altivec:
    addi   r5,r3,64
    addi   r6,r3,96
    addi   r4,r3,32
    lvx    v0, 0,r5
    lvx    v1,r9,r5
    lvx    v2, 0,r6
    lvx    v3,r9,r6
    FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    lvx    v2, 0,r4
    lvx    v3,r9,r4
    FFT8   v0,v1,v2,v3,v8,v9,v10,v11,v12
    vmaddfp   v8,v4,v15,v14 // r2*wre
    vmaddfp   v9,v5,v15,v14 // i2*wre
    vmaddfp  v10,v6,v15,v14 // r3*wre
    vmaddfp  v11,v7,v15,v14 // i3*wre
    vmaddfp   v8,v5,v16,v8  // i2*wim
    vnmsubfp  v9,v4,v16,v9  // r2*wim
    vnmsubfp v10,v7,v16,v10 // i3*wim
    vmaddfp  v11,v6,v16,v11 // r3*wim
    BF     v10,v12,v10,v8
    BF     v11,v13,v9,v11
    BF     v0,v4,v0,v10
    BF     v3,v7,v3,v12
    BF     v1,v5,v1,v11
    BF     v2,v6,v2,v13
.ifnb \interleave
    zip     v8, v9,v0,v1
    zip    v10,v11,v2,v3
    zip    v12,v13,v4,v5
    zip    v14,v15,v6,v7
    stvx    v8, 0,r3
    stvx    v9,r9,r3
    stvx   v10, 0,r4
    stvx   v11,r9,r4
    stvx   v12, 0,r5
    stvx   v13,r9,r5
    stvx   v14, 0,r6
    stvx   v15,r9,r6
.else
    stvx   v0, 0,r3
    stvx   v4, 0,r5
    stvx   v3,r9,r4
    stvx   v7,r9,r6
    stvx   v1,r9,r3
    stvx   v5,r9,r5
    stvx   v2, 0,r4
    stvx   v6, 0,r6
.endif
    blr
.endm

// void pass(float *z, float *wre, int n)
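// Recombines the sub-transforms of a split-radix step: each loop iteration
// takes 4 complex values from each quarter of z, scales the two upper
// quarters (z[o2], z[o3]) by the twiddle factors wre/wim and butterflies
// them against the two lower quarters.  r5 (n) is the iteration count; the
// sine twiddles are read backwards out of the cosine table, wim[i] = wre[4*n-i].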
.macro PASS interleave, suffix
fft_pass\suffix\()_altivec:
    mtctr  r5
    slwi   r0,r5,4
    slwi   r7,r5,6   // o2
    slwi   r5,r5,5   // o1
    add   r10,r5,r7  // o3
    add    r0,r4,r0  // wim
    addi   r6,r5,16  // o1+16
    addi   r8,r7,16  // o2+16
    addi  r11,r10,16 // o3+16
1:
    lvx    v8, 0,r4  // wre
    lvx   v10, 0,r0  // wim
    sub    r0,r0,r9
    lvx    v9, 0,r0
    vperm  v9,v9,v10,v19   // vcprm(s0,3,2,1) => wim[0 .. -3]
    lvx    v4,r3,r7        // r2 = z[o2]
    lvx    v5,r3,r8        // i2 = z[o2+16]
    lvx    v6,r3,r10       // r3 = z[o3]
    lvx    v7,r3,r11       // i3 = z[o3+16]
    vmaddfp  v10,v4,v8,v14 // r2*wre
    vmaddfp  v11,v5,v8,v14 // i2*wre
    vmaddfp  v12,v6,v8,v14 // r3*wre
    vmaddfp  v13,v7,v8,v14 // i3*wre
    lvx    v0, 0,r3        // r0 = z[0]
    lvx    v3,r3,r6        // i1 = z[o1+16]
    vmaddfp  v10,v5,v9,v10 // i2*wim
    vnmsubfp v11,v4,v9,v11 // r2*wim
    vnmsubfp v12,v7,v9,v12 // i3*wim
    vmaddfp  v13,v6,v9,v13 // r3*wim
    lvx    v1,r3,r9        // i0 = z[16]
    lvx    v2,r3,r5        // r1 = z[o1]
    BF     v12,v8,v12,v10
    BF     v13,v9,v11,v13
    BF     v0,v4,v0,v12
    BF     v3,v7,v3,v8
.if !\interleave
    stvx   v0, 0,r3
    stvx   v4,r3,r7
    stvx   v3,r3,r6
    stvx   v7,r3,r11
.endif
    BF     v1,v5,v1,v13
    BF     v2,v6,v2,v9
.if !\interleave
    stvx   v1,r3,r9
    stvx   v2,r3,r5
    stvx   v5,r3,r8
    stvx   v6,r3,r10
.else
    vmrghw v8,v0,v1
    vmrglw v9,v0,v1
    stvx   v8, 0,r3
    stvx   v9,r3,r9
    vmrghw v8,v2,v3
    vmrglw v9,v2,v3
    stvx   v8,r3,r5
    stvx   v9,r3,r6
    vmrghw v8,v4,v5
    vmrglw v9,v4,v5
    stvx   v8,r3,r7
    stvx   v9,r3,r8
    vmrghw v8,v6,v7
    vmrglw v9,v6,v7
    stvx   v8,r3,r10
    stvx   v9,r3,r11
.endif
    addi   r3,r3,32
    addi   r4,r4,16
    bdnz 1b
    sub    r3,r3,r5
    blr
.endm

#define M_SQRT1_2      0.70710678118654752440  /* 1/sqrt(2) */

#define WORD_0  0x00,0x01,0x02,0x03
#define WORD_1  0x04,0x05,0x06,0x07
#define WORD_2  0x08,0x09,0x0a,0x0b
#define WORD_3  0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f

#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d
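
// vcprm(a,b,c,d) assembles a vperm control vector: 0-3 select 32-bit words
// from the first source operand, s0-s3 from the second.  The fft_data table
// below supplies the constants that fft_calc loads into v14-v29: zeros (v14),
// the fft16 twiddles (v15/v16), the +/-1/sqrt(2) patterns used by FFT8
// (v17/v18), and the permutation masks (v19-v29).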

    .rodata
    .align 4
fft_data:
    .float  0, 0, 0, 0
    .float  1, 0.92387953, M_SQRT1_2, 0.38268343
    .float  0, 0.38268343, M_SQRT1_2, 0.92387953
    .float  -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2
    .float   M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
    vcprm(s0,3,2,1)
    vcprm(0,1,s2,s1)
    vcprm(2,3,s0,s3)
    vcprm(2,s3,3,s2)
    vcprm(0,1,s0,s1)
    vcprm(2,3,s2,s3)
    vcprm(2,3,0,1)
    vcprm(1,2,s3,s0)
    vcprm(0,3,s2,s1)
    vcprm(0,2,s1,s3)
    vcprm(1,3,s0,s2)

.macro lvm  b, r, regs:vararg // load vectors from consecutive 16-byte slots
    lvx     \r, 0, \b
    addi    \b, \b, 16
  .ifnb \regs
    lvm     \b, \regs
  .endif
.endm

.macro stvm b, r, regs:vararg // store vectors to consecutive 16-byte slots
    stvx    \r, 0, \b
    addi    \b, \b, 16
  .ifnb \regs
    stvm    \b, \regs
  .endif
.endm
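
// fft_calc expands to the public entry point (ff_fft_calc_altivec or
// ff_fft_calc_interleave_altivec).  It saves the nonvolatile vector registers
// v20-v29 and VRSAVE on the stack, loads the constants from fft_data, reads
// the log2 transform size from the context, calls the matching fftN routine
// through the dispatch table, and restores the saved registers.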

.macro fft_calc interleave
extfunc ff_fft_calc\interleave\()_altivec
    mflr    r0
    stp     r0, 2*PS(r1)
    stpu    r1, -(160+16*PS)(r1)
    get_got r11
    addi    r6, r1, 16*PS
    stvm    r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    mfvrsave r0
    stw     r0, 15*PS(r1)
    li      r6, 0xfffffffc
    mtvrsave r6

    movrel  r6, fft_data, r11
    lvm     r6, v14, v15, v16, v17, v18, v19, v20, v21
    lvm     r6, v22, v23, v24, v25, v26, v27, v28, v29

    li      r9, 16
    movrel  r12, X(ff_cos_tabs), r11

    movrel  r6, fft_dispatch_tab\interleave\()_altivec, r11
    lwz     r3, 0(r3)       // nbits = log2(n)
    subi    r3, r3, 2       // dispatch table starts at n = 4
    slwi    r3, r3, 2+ARCH_PPC64
    lpx     r3, r3, r6
    mtctr   r3
    mr      r3, r4
    bctrl

    addi    r6, r1, 16*PS
    lvm     r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    lwz     r6, 15*PS(r1)
    mtvrsave r6
    lp      r1, 0(r1)
    lp      r0, 2*PS(r1)
    mtlr    r0
    blr
.endm
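
// DECL_FFT emits fftN: the split-radix recursion.  It transforms the first
// half of z with fft(n/2) and each quarter of the second half with fft(n/4),
// then tail-calls fft_pass with ff_cos_tabs[bits] and n/16 loop iterations
// to combine the three sub-transforms.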

.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
    mflr  r0
    stp   r0,PS*(\bits-3)(r1)
    bl    fft\n2\()_altivec
    addi2 r3,\n*4
    bl    fft\n4\()_altivec
    addi2 r3,\n*2
    bl    fft\n4\()_altivec
    addi2 r3,\n*-6
    lp    r0,PS*(\bits-3)(r1)
    lp    r4,\bits*PS(r12)
    mtlr  r0
    li    r5,\n/16
    b     fft_pass\suffix\()_altivec
.endm

.macro DECL_FFTS interleave, suffix
    .text
    def_fft4  \suffix
    def_fft8  \suffix
    def_fft16 \suffix
    PASS \interleave, \suffix
    DECL_FFT \suffix, 5,   32,   16,    8
    DECL_FFT \suffix, 6,   64,   32,   16
    DECL_FFT \suffix, 7,  128,   64,   32
    DECL_FFT \suffix, 8,  256,  128,   64
    DECL_FFT \suffix, 9,  512,  256,  128
    DECL_FFT \suffix,10, 1024,  512,  256
    DECL_FFT \suffix,11, 2048, 1024,  512
    DECL_FFT \suffix,12, 4096, 2048, 1024
    DECL_FFT \suffix,13, 8192, 4096, 2048
    DECL_FFT \suffix,14,16384, 8192, 4096
    DECL_FFT \suffix,15,32768,16384, 8192
    DECL_FFT \suffix,16,65536,32768,16384

    fft_calc \suffix

    .rodata
    .align 3
fft_dispatch_tab\suffix\()_altivec:
    PTR fft4\suffix\()_altivec
    PTR fft8\suffix\()_altivec
    PTR fft16\suffix\()_altivec
    PTR fft32\suffix\()_altivec
    PTR fft64\suffix\()_altivec
    PTR fft128\suffix\()_altivec
    PTR fft256\suffix\()_altivec
    PTR fft512\suffix\()_altivec
    PTR fft1024\suffix\()_altivec
    PTR fft2048\suffix\()_altivec
    PTR fft4096\suffix\()_altivec
    PTR fft8192\suffix\()_altivec
    PTR fft16384\suffix\()_altivec
    PTR fft32768\suffix\()_altivec
    PTR fft65536\suffix\()_altivec
.endm
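
// Two sets of transforms are instantiated: the plain versions, which leave
// the output in the blocked format described at the top of the file, and the
// _interleave versions, which store the result in normal FFTComplex order.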

DECL_FFTS 0
DECL_FFTS 1, _interleave