/*
 * ARM NEON optimised FFT
 *
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2009 Naotoshi Nojiri
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// 1/sqrt(2); used to build the mppm constant vector
#define M_SQRT1_2 0.70710678118654752440

// transpose d0, d1, s0, s1
//   d0 = even-numbered lanes of s0 and s1, interleaved (trn1)
//   d1 = odd-numbered  lanes of s0 and s1, interleaved (trn2)
// With .2d operands this gives d0 = {s0[0], s1[0]} and d1 = {s0[1], s1[1]}.
// d0 is written before trn2 reads its sources, so d0 must not be the same
// register as s0 or s1.
.macro transpose d0, d1, s0, s1
        trn1            \d0, \s0, \s1
        trn2            \d1, \s0, \s1
.endm


// fft4_neon: in-place 4-point complex FFT.
//   x0: pointer to 4 complex values stored as interleaved float pairs
//       {r0,i0, r1,i1, r2,i2, r3,i3}; x0 is left unchanged
//   Clobbers v0-v7, v16, v17.
function fft4_neon
        ld1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0] // v0..v3 = z[0]..z[3]

        fadd            v4.2s,  v0.2s,  v1.2s   // r0+r1,i0+i1
        fsub            v6.2s,  v0.2s,  v1.2s   // r0-r1,i0-i1

        ext             v16.8b, v2.8b,  v3.8b,  #4      // i2,r3
        ext             v17.8b, v3.8b,  v2.8b,  #4      // i3,r2

        fadd            v5.2s,  v2.2s,  v3.2s   // r2+r3,i2+i3
        fsub            v7.2s,  v16.2s, v17.2s  // i2-i3,r3-r2  i.e. -i * (z[2]-z[3])

        fadd            v0.2s,  v4.2s,  v5.2s   // z[0]
        fsub            v2.2s,  v4.2s,  v5.2s   // z[2]
        fadd            v1.2s,  v6.2s,  v7.2s   // z[1]
        fsub            v3.2s,  v6.2s,  v7.2s   // z[3]

        st1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        ret
endfunc

// fft8_neon: in-place 8-point complex FFT.
//   x0:  pointer to 8 interleaved {re,im} float pairs
//   v28: must hold the mppm constant {-w,+w,+w,-w}, w = M_SQRT1_2
//        (ff_fft_calc_neon loads it)
//   On return x0 has advanced by 32 bytes and x1 holds the original x0.
//   Clobbers x1, v0-v7, v16-v27.
// The instruction order is hand-interleaved; keep it when editing.
function fft8_neon
        mov             x1,  x0                          // keep base for the final store
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32  // z[0]..z[3]
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]       // z[4]..z[7]
        ext             v22.8b, v2.8b,  v3.8b,  #4       // i2,r3
        ext             v23.8b, v3.8b,  v2.8b,  #4       // i3,r2
        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
        rev64           v27.2s, v28.2s                   // v27 = {+w,-w} (low half of mppm, swapped)
        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
        ext             v6.8b,  v4.8b,  v5.8b,  #4
        ext             v7.8b,  v5.8b,  v4.8b,  #4
        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
        fadd            v0.2s,  v20.2s, v21.2s
        fsub            v2.2s,  v20.2s, v21.2s
        fadd            v1.2s,  v22.2s, v23.2s
        rev64           v26.2s, v26.2s
        rev64           v27.2s, v27.2s
        fsub            v3.2s,  v22.2s, v23.2s
        fsub            v6.2s,  v6.2s,  v7.2s
        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
        fadd            v7.2s,  v4.2s,  v5.2s
        fsub            v18.2s, v2.2s,  v6.2s
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s,  v2.2s,  v6.2s
        fsub            v16.2s, v0.2s,  v7.2s
        fadd            v5.2s,  v25.2s, v24.2s
        fsub            v4.2s,  v26.2s, v27.2s
        fadd            v0.2s,  v0.2s,  v7.2s
        fsub            v17.2s, v1.2s,  v5.2s
        fsub            v19.2s, v3.2s,  v4.2s
        fadd            v3.2s,  v3.2s,  v4.2s
        fadd            v1.2s,  v1.2s,  v5.2s

        st1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]     // z[4]..z[7]
        st1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x1]     // z[0]..z[3]

        ret
endfunc

// fft16_neon: in-place 16-point complex FFT.
//   x0:  pointer to 16 interleaved {re,im} float pairs
//   x14: pointer to the twiddle table (ff_fft_calc_neon sets it to ff_cos_16);
//        the first four floats are read
//   v28: mppm, v29: pmmp, v30: trans4_float, v31: trans8_float
//        (all loaded by ff_fft_calc_neon)
//   On return x0 and x1 both point 128 bytes past the original x0.
//   Clobbers x1, v0-v7, v16-v27.
// Structure: an 8-point transform of z[0..7] (same sequence as fft8_neon),
// interleaved with the loads and two parallel 4-point transforms of
// z[8..11] and z[12..15], followed by one combining pass.
// The instruction order is hand-interleaved; keep it when editing.
function fft16_neon
        mov             x1,  x0                          // keep base for the final stores
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32  // z[0]..z[3]
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32  // z[4]..z[7]
        ext             v22.8b, v2.8b,  v3.8b,  #4       // i2,r3
        ext             v23.8b, v3.8b,  v2.8b,  #4       // i3,r2
        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
        rev64           v27.2s, v28.2s                   // v27 = {+w,-w} (low half of mppm, swapped)
        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
        ext             v6.8b,  v4.8b,  v5.8b,  #4
        ext             v7.8b,  v5.8b,  v4.8b,  #4
        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
        fadd            v0.2s,  v20.2s, v21.2s
        fsub            v2.2s,  v20.2s, v21.2s
        fadd            v1.2s,  v22.2s, v23.2s
        rev64           v26.2s, v26.2s
        rev64           v27.2s, v27.2s
        fsub            v3.2s,  v22.2s, v23.2s
        fsub            v6.2s,  v6.2s,  v7.2s
        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
        fadd            v7.2s,  v4.2s,  v5.2s
        fsub            v18.2s, v2.2s,  v6.2s
        ld1             {v20.4s,v21.4s}, [x0], #32      // z[8]..z[11]
        ld1             {v22.4s,v23.4s}, [x0], #32      // z[12]..z[15]
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s,  v2.2s,  v6.2s
        fsub            v16.2s, v0.2s,  v7.2s
        fadd            v5.2s,  v25.2s, v24.2s
        fsub            v4.2s,  v26.2s, v27.2s
        transpose       v24.2d, v25.2d, v20.2d, v22.2d  // {z[8],z[12]},  {z[9],z[13]}
        transpose       v26.2d, v27.2d, v21.2d, v23.2d  // {z[10],z[14]}, {z[11],z[15]}
        fadd            v0.2s,  v0.2s,  v7.2s
        fsub            v17.2s, v1.2s,  v5.2s
        fsub            v19.2s, v3.2s,  v4.2s
        fadd            v3.2s,  v3.2s,  v4.2s
        fadd            v1.2s,  v1.2s,  v5.2s
        ext             v20.16b, v21.16b, v21.16b,  #4  // i10,r11,i11,r10
        ext             v21.16b, v23.16b, v23.16b,  #4  // i14,r15,i15,r14

        zip1            v0.2d,  v0.2d,  v1.2d   // {z[0],   z[1]}
        zip1            v1.2d,  v2.2d,  v3.2d   // {z[2],   z[3]}
        zip1            v2.2d,  v16.2d, v17.2d  // {z[o1],  z[o1+1]}
        zip1            v3.2d,  v18.2d, v19.2d  // {z[o1+2],z[o1+3]}

        // 2 x fft4
        transpose       v22.2d, v23.2d, v20.2d, v21.2d  // {i10,r11,i14,r15}, {i11,r10,i15,r14}

        fadd            v4.4s,  v24.4s, v25.4s
        fadd            v5.4s,  v26.4s, v27.4s
        fsub            v6.4s,  v24.4s, v25.4s
        fsub            v7.4s,  v22.4s, v23.4s

        ld1             {v23.4s},  [x14]        // first four floats of the table at x14

        fadd            v24.4s, v4.4s,  v5.4s   // {z[o2+0],z[o2+1]}
        fsub            v26.4s, v4.4s,  v5.4s   // {z[o2+2],z[o2+3]}
        fadd            v25.4s, v6.4s,  v7.4s   // {z[o3+0],z[o3+1]}
        fsub            v27.4s, v6.4s,  v7.4s   // {z[o3+2],z[o3+3]}

        //fft_pass_neon_16
        rev64           v7.4s,  v25.4s
        fmul            v25.4s, v25.4s, v23.s[1]
        fmul            v7.4s,  v7.4s,  v29.4s
        fmla            v25.4s, v7.4s,  v23.s[3] // {t1a,t2a,t5a,t6a}

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
        fadd            v5.4s,  v21.4s, v22.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fsub            v20.4s, v0.4s,  v4.4s   // {z[o2],z[o2+1]}
        fadd            v16.4s, v0.4s,  v4.4s   // {z[0], z[1]}
        fsub            v22.4s, v2.4s,  v5.4s   // {z[o3],z[o3+1]}
        fadd            v18.4s, v2.4s,  v5.4s   // {z[o1],z[o1+1]}

//second half
        rev64           v6.4s,  v26.4s
        fmul            v26.4s, v26.4s, v23.s[2]
        rev64           v7.4s,  v27.4s
        fmul            v27.4s, v27.4s, v23.s[3]
        fmul            v6.4s,  v6.4s,  v29.4s
        fmul            v7.4s,  v7.4s,  v29.4s
        fmla            v26.4s, v6.4s,  v23.s[2] // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s,  v23.s[1] // {t1a,t2a,t5a,t6a}

        zip1            v24.4s, v26.4s, v27.4s
        zip2            v25.4s, v26.4s, v27.4s
        fneg            v26.4s, v24.4s
        fadd            v4.4s,  v25.4s, v24.4s
        fsub            v6.4s,  v24.4s, v25.4s  // just the second half
        fadd            v5.4s,  v25.4s, v26.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v17.4s, v1.4s, v4.4s    // {z[2], z[3]}
        fsub            v21.4s, v1.4s, v4.4s    // {z[o2+2],z[o2+3]}
        fadd            v19.4s, v3.4s, v5.4s    // {z[o1+2],z[o1+3]}
        fsub            v23.4s, v3.4s, v5.4s    // {z[o3+2],z[o3+3]}

        st1             {v16.4s,v17.4s}, [x1], #32      // z[0]..z[3]
        st1             {v18.4s,v19.4s}, [x1], #32      // z[4]..z[7]
        st1             {v20.4s,v21.4s}, [x1], #32      // z[8]..z[11]
        st1             {v22.4s,v23.4s}, [x1], #32      // z[12]..z[15]

        ret
endfunc


// tbl byte indices selecting 32-bit lanes 0, 2, 1, 3 of a single source
// register, i.e. swapping the two middle floats.
// ff_fft_calc_neon loads this table into v30.
const  trans4_float, align=4
        .byte    0,  1,  2,  3
        .byte    8,  9, 10, 11
        .byte    4,  5,  6,  7
        .byte   12, 13, 14, 15
endconst

// tbl byte indices into a two-register (32 byte) table selecting 32-bit
// lanes 6, 0, 7, 1: {second.s[2], first.s[0], second.s[3], first.s[1]}.
// ff_fft_calc_neon loads this table into v31.
const  trans8_float, align=4
        .byte   24, 25, 26, 27
        .byte    0,  1,  2,  3
        .byte   28, 29, 30, 31
        .byte    4,  5,  6,  7
endconst

// fft_pass_neon: combine the four quarters of z in place.
//   x0:  z
//   x2:  n; the quarters start at z, z[o1], z[o2], z[o3] with
//        o1 = 2n, o2 = 4n, o3 = 6n complex elements
//   x4:  wre, read forwards, two floats per iteration
//   x7:  post-index step applied to the wim pointer
//        (ff_fft_calc_neon sets it to -8)
//   v29: pmmp sign vector, v30: trans4_float, v31: trans8_float
//   wim starts at x4 + 2*n floats and is read backwards.
// Two complex values of every quarter are processed per iteration. The
// first iteration is peeled: z[o2] and z[o3] are used without a multiply.
// The loop at 1: then runs n-1 times, so n must be at least 2.
//   Clobbers x0-x6, v4-v7, v16, v17, v20-v27 and the condition flags.
function fft_pass_neon
        sub             x6,  x2,  #1            // n - 1, loop counter
        lsl             x5,  x2,  #3            // 2 * n * sizeof FFTSample
        lsl             x1,  x2,  #4            // 2 * n * sizeof FFTComplex
        add             x5,  x4,  x5            // wim
        add             x3,  x1,  x2,  lsl #5   // (2 + 4) * n * sizeof FFTComplex
        add             x2,  x0,  x2,  lsl #5   // &z[o2]
        add             x3,  x0,  x3            // &z[o3]
        add             x1,  x0,  x1            // &z[o1]
        ld1             {v20.4s},[x2]           // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]           // {z[o3],z[o3+1]}
        ld1             {v4.2s},  [x4], #8      // {wre[0],wre[1]}
        trn2            v25.2d, v20.2d, v22.2d  // {z[o2+1],z[o3+1]}
        sub             x5,  x5,  #4            // wim--
        trn1            v24.2d, v20.2d, v22.2d  // {z[o2],z[o3]}
        ld1             {v5.s}[0],  [x5], x7    // d5[0] = wim[-1]
        rev64           v7.4s,  v25.4s
        fmul            v25.4s, v25.4s, v4.s[1]
        ld1             {v16.4s}, [x0]          // {z[0],z[1]}
        fmul            v7.4s,  v7.4s,  v29.4s
        ld1             {v17.4s}, [x1]          // {z[o1],z[o1+1]}
        prfm            pldl1keep, [x2, #16]
        prfm            pldl1keep, [x3, #16]
        fmla            v25.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
        prfm            pldl1keep, [x0, #16]
        prfm            pldl1keep, [x1, #16]

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
        fadd            v5.4s,  v21.4s, v22.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
1:
        ld1             {v20.4s},[x2]    // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]    // {z[o3],z[o3+1]}
        ld1             {v4.2s}, [x4], #8       // {wre[0],wre[1]}
        transpose       v26.2d, v27.2d, v20.2d, v22.2d
        ld1             {v5.2s}, [x5], x7       // {wim[-1],wim[0]}
        rev64           v6.4s,  v26.4s
        fmul            v26.4s, v26.4s, v4.s[0]
        rev64           v7.4s,  v27.4s
        fmul            v27.4s, v27.4s, v4.s[1]
        fmul            v6.4s,  v6.4s,  v29.4s
        fmul            v7.4s,  v7.4s,  v29.4s
        ld1             {v16.4s},[x0]           // {z[0],z[1]}
        fmla            v26.4s, v6.4s,  v5.s[1] // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
        ld1             {v17.4s},[x1]           // {z[o1],z[o1+1]}

        subs            x6,  x6,  #1            // n--; flags stay live until b.ne below

        zip1            v20.4s, v26.4s, v27.4s
        zip2            v21.4s, v26.4s, v27.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
        fadd            v5.4s,  v21.4s, v22.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
        b.ne            1b

        ret
endfunc

// def_fft n, n2, n4: emit fft<n>_neon, an in-place n-point complex FFT built
// from one n2-point and two n4-point sub-transforms followed by
// fft_pass_neon. The offsets below assume n2 = 2 * n4 and n = 4 * n4.
//   x0: z, 8 bytes per complex element
// x28 and x30 are saved on the stack around the nested calls. x28 carries
// &z[n2] across them, which relies on every callee preserving x28.
// The final branch to fft_pass_neon is a tail call, so its ret returns
// directly to the caller of fft<n>_neon.
.macro  def_fft n, n2, n4
function fft\n\()_neon  align=6
        stp             x28, x30, [sp, #-16]!   // push x28 and lr in one pre-indexed store
        add             x28, x0,  #\n4*2*8      // x28 = &z[n2]
        bl              fft\n2\()_neon          // z[0 .. n2)
        mov             x0,  x28
        bl              fft\n4\()_neon          // z[n2 .. n2+n4)
        add             x0,  x28, #\n4*1*8
        bl              fft\n4\()_neon          // z[n2+n4 .. n)
        sub             x0,  x28, #\n4*2*8      // x0 = z again
        ldp             x28, x30, [sp], #16
        movrel          x4,  X(ff_cos_\n)       // wre argument of fft_pass_neon
        mov             x2,  #\n4>>1            // n argument of fft_pass_neon
        b               fft_pass_neon
endfunc
.endm

        // fft32_neon .. fft65536_neon; each size calls the two smaller ones,
        // with fft8_neon and fft16_neon as the hand-written base cases.
        def_fft    32,    16,     8
        def_fft    64,    32,    16
        def_fft   128,    64,    32
        def_fft   256,   128,    64
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384

// ff_fft_calc_neon: exported entry point; sets up the shared constants and
// tail-branches to the transform selected by nbits.
//   x0: context pointer; the 32-bit value at offset 0 is nbits
//   x1: z, the data to transform in place
// State established for the fft*_neon routines and fft_pass_neon:
//   x0  = z
//   x7  = -8 (post-index step for the wim pointer)
//   x14 = address of ff_cos_16
//   v28 = mppm, v29 = pmmp, v30 = trans4_float, v31 = trans8_float
// nbits is not range checked: it must select an entry of fft_tab_neon,
// i.e. 2 <= nbits <= 16.
function ff_fft_calc_neon, export=1
        prfm            pldl1keep, [x1]
        movrel          x10, trans4_float
        ldr             w2,  [x0]                // nbits
        movrel          x11, trans8_float
        sub             w2,  w2,  #2             // table index = nbits - 2
        movrel          x3,  fft_tab_neon
        ld1             {v30.16b}, [x10]         // trans4_float
        mov             x7,  #-8
        movrel          x12, pmmp
        ldr             x3,  [x3, x2, lsl #3]    // x3 = fft_tab_neon[nbits - 2]
        movrel          x13, mppm
        movrel          x14, X(ff_cos_16)
        ld1             {v31.16b}, [x11]         // trans8_float
        mov             x0,  x1
        ld1             {v29.4s},  [x12]         // pmmp
        ld1             {v28.4s},  [x13]         // mppm
        br              x3                       // tail call, returns to our caller
endfunc

// ff_fft_permute_neon: reorder z according to revtab, using tmp_buf as
// scratch space.
//   x0: context pointer; reads nbits (32 bit) at offset 0, the revtab
//       pointer at offset 8 and the tmp_buf pointer at offset 16
//   x1: z, 1 << nbits complex elements of 8 bytes each
// First loop: two elements per iteration. One 32-bit load from revtab yields
// two 16-bit destination indices (low half for the first element, high half
// for the second); each element is stored to tmp_buf[index].
// Second loop: tmp_buf is copied back over z, four elements (32 bytes) per
// iteration, so the element count is expected to be a multiple of 4.
//   Clobbers x0-x6, v0, v1 and the condition flags.
function ff_fft_permute_neon, export=1
        mov             x6,  #1
        ldr             w2,  [x0]       // nbits
        ldr             x3,  [x0, #16]  // tmp_buf
        ldr             x0,  [x0, #8]   // revtab
        lsl             x6,  x6, x2     // n = 1 << nbits, scatter loop counter
        mov             x2,  x6         // second copy of n for the copy-back loop
1:
        ld1             {v0.2s,v1.2s}, [x1], #16        // z[j], z[j+1]
        ldr             w4,  [x0], #4                   // two packed 16-bit indices
        uxth            w5,  w4                         // index for z[j]
        lsr             w4,  w4,  #16                   // index for z[j+1]
        add             x5,  x3,  x5,  lsl #3
        add             x4,  x3,  x4,  lsl #3
        st1             {v0.2s}, [x5]
        st1             {v1.2s}, [x4]
        subs            x6,  x6, #2
        b.gt            1b

        sub             x1,  x1,  x2,  lsl #3   // rewind x1 to the start of z
1:
        ld1             {v0.4s,v1.4s}, [x3], #32
        st1             {v0.4s,v1.4s}, [x1], #32
        subs            x2,  x2,  #4
        b.gt            1b

        ret
endfunc

// Transform entry points indexed by nbits - 2 (entry 0 is the 4-point
// transform). ff_fft_calc_neon loads one entry and branches to it.
const   fft_tab_neon
        .quad fft4_neon
        .quad fft8_neon
        .quad fft16_neon
        .quad fft32_neon
        .quad fft64_neon
        .quad fft128_neon
        .quad fft256_neon
        .quad fft512_neon
        .quad fft1024_neon
        .quad fft2048_neon
        .quad fft4096_neon
        .quad fft8192_neon
        .quad fft16384_neon
        .quad fft32768_neon
        .quad fft65536_neon
endconst

// Sign pattern {+1,-1,-1,+1}. ff_fft_calc_neon loads it into v29, where
// fft16_neon and fft_pass_neon multiply the rev64-swapped operands by it.
const   pmmp, align=4
        .float          +1.0, -1.0, -1.0, +1.0
endconst

// {-w,+w,+w,-w} with w = M_SQRT1_2. ff_fft_calc_neon loads it into v28,
// which fft8_neon and fft16_neon use for the sqrt(1/2) rotations.
const   mppm, align=4
        .float          -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
endconst
443