/*
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#include "asm-offsets.h"

// shuffle a, b, c, d
//
// Emits a 16-byte constant named shuffle_<a><b><c><d> for use as a TBL index
// vector. Each argument is a 32-bit lane number; the four bytes generated for
// it are lane*4 .. lane*4+3, so a single-register TBL with this vector places
// source lanes a, b, c, d into destination lanes 0, 1, 2, 3.
//
// The comma in front of align= is deliberate: assemblers that split macro
// arguments only on commas would otherwise treat "align=4" as part of the
// symbol name.
.macro shuffle a, b, c, d
const shuffle_\a\b\c\d, align=4
        .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3)
        .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3)
        .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3)
        .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3)
endconst
.endm

// The four lane permutations loaded by fft5_neon through x11-x14.
shuffle 0, 2, 1, 3
shuffle 1, 0, 3, 2
shuffle 2, 3, 0, 1
shuffle 3, 1, 2, 0


// fft5_neon: 5-point complex DFT on strided input; results stay in registers.
//
// In:    x1      = address of the first complex float input
//        x2      = distance between consecutive inputs in complex elements
//                  (converted to bytes by the lsl #3 below)
//        v15     = four twiddle floats (fft_b15_calc_neon loads them from fact5)
//        x11-x14 = addresses of the shuffle_0213 / 1032 / 2301 / 3120 vectors
//                  (set up by fft_b15_calc_neon)
// Out:   v0.2s   = sum of the five inputs (out[0], re then im)
//        v6.4s   = real parts of the four remaining outputs
//        v7.4s   = imaginary parts of the four remaining outputs
// Clobb: x1, x2, v16-v31
function fft5_neon
        lsl             x2,  x2,  #3
        ld1             {v24.2s},         [x1],  x2     // in[0]
        ld2             {v25.s,v26.s}[0], [x1],  x2     // in[1..4]: v25 = re, v26 = im
        ld2             {v25.s,v26.s}[1], [x1],  x2
        ld2             {v25.s,v26.s}[2], [x1],  x2
        ld2             {v25.s,v26.s}[3], [x1]
        dup             v6.4s,  v24.s[0]                // accumulators start at in[0]
        dup             v7.4s,  v24.s[1]

        faddp           v0.4s,  v25.4s, v26.4s
        // z[][0], z[][3]
        fmul            v16.4s, v25.4s, v15.s[0] // rr
        fmul            v17.4s, v25.4s, v15.s[1] // ri
        fmul            v18.4s, v26.4s, v15.s[0] // ir
        fmul            v19.4s, v26.4s, v15.s[1] // ii
        faddp           v0.4s,  v0.4s,  v0.4s
        // z[][1], z[][2]
        fmul            v20.4s, v25.4s, v15.s[2] // rr
        fmul            v21.4s, v25.4s, v15.s[3] // ri
        fmul            v22.4s, v26.4s, v15.s[2] // ir
        fmul            v23.4s, v26.4s, v15.s[3] // ii
        fadd            v0.2s,  v24.2s, v0.2s   // out[0]

        // z[0123][0], z[0123][3]
        fsub            v24.4s, v16.4s, v19.4s  //    (c).re =  rr - ii;
        fadd            v27.4s, v16.4s, v19.4s  //    (d).re =  rr + ii;
        ld1             {v16.16b},  [x11]       // TBL index vectors, loads are
        ld1             {v19.16b},  [x14]       // interleaved with the arithmetic
        fadd            v28.4s, v17.4s, v18.4s  //    (c).im =  ri + ir;
        fsub            v31.4s, v18.4s, v17.4s  //    (d).im = -ri + ir;
        ld1             {v17.16b},  [x12]
        // z[0123][1], z[0123][2]
        fsub            v25.4s, v20.4s, v23.4s  //    (c).re =  rr - ii;
        fadd            v26.4s, v20.4s, v23.4s  //    (d).re =  rr + ii;
        ld1             {v18.16b},  [x13]
        fadd            v29.4s, v21.4s, v22.4s  //    (c).im =  ri + ir;
        fsub            v30.4s, v22.4s, v21.4s  //    (d).im = -ri + ir;

        //real
        tbl             v20.16b, {v24.16b}, v16.16b
        tbl             v21.16b, {v25.16b}, v17.16b
        tbl             v22.16b, {v26.16b}, v18.16b
        tbl             v23.16b, {v27.16b}, v19.16b
        //imag
        tbl             v16.16b, {v28.16b}, v16.16b
        tbl             v17.16b, {v29.16b}, v17.16b
        tbl             v18.16b, {v30.16b}, v18.16b
        tbl             v19.16b, {v31.16b}, v19.16b

        // sum the four permuted products onto in[0]
        fadd            v6.4s,  v6.4s,  v20.4s
        fadd            v22.4s, v22.4s, v23.4s
        fadd            v7.4s,  v7.4s,  v16.4s
        fadd            v18.4s, v18.4s, v19.4s

        fadd            v21.4s, v21.4s, v22.4s
        fadd            v17.4s, v17.4s, v18.4s
        fadd            v6.4s,  v6.4s,  v21.4s
        fadd            v7.4s,  v7.4s,  v17.4s

        ret
endfunc

// fft15_neon: 15-point complex DFT assembled from three fft5_neon calls on the
// inputs starting at in + 1, in + 2 and in + 0 strides (each stepping by
// 3 * stride), combined with the twiddles held in v8-v14.
//
// In:    x0      = output; 15 complex floats are stored contiguously
//        x1      = input
//        x3      = input stride in complex elements
//        v8/v9   = exp[ 1- 4] re/im, v10/v11 = exp[ 6- 9] re/im,
//        v12/v13 = exp[11-14] re/im, v14 = exp[5,10].re, exp[5,10].im
//                  (layout produced by fft_b15_calc_neon)
//        v15, x11-x14 as required by fft5_neon
// Out:   x0 advanced by 15 * 8 bytes
// Clobb: x1, x2, x8, x9, x30, v0-v7, v16-v31
//        The return address is kept in x9 across the nested calls and the
//        function returns through it.
function fft15_neon
        mov             x8,  x1
        mov             x9,  x30
        add             x2,  x3,  x3,  lsl #1   // 3 * stride

        add             x1,  x8,  x3,  lsl #3   // in + 1 * stride
        bl              fft5_neon
        mov             v1.8b,   v0.8b          // keep results of the first fft5
        mov             v2.16b,  v6.16b
        mov             v3.16b,  v7.16b

        add             x1,  x8,  x3,  lsl #4   // in + 2 * stride
        add             x2,  x3,  x3,  lsl #1   // 3 * stride
        bl              fft5_neon
        zip1            v1.4s,   v1.4s,  v0.4s  // re of both out[0], then im of both
        mov             v4.16b,  v6.16b         // keep results of the second fft5
        mov             v5.16b,  v7.16b

        mov             x1,  x8                 // in + 0 * stride
        add             x2,  x3,  x3,  lsl #1   // 3 * stride
        bl              fft5_neon

        faddp           v20.4s, v1.4s,  v1.4s

        ext             v18.16b, v8.16b,  v8.16b,  #4
        ext             v19.16b, v9.16b,  v9.16b,  #4
        mov             v16.16b, v6.16b
        mov             v17.16b, v7.16b
        fadd            v20.2s, v20.2s, v0.2s

        uzp1            v18.4s, v18.4s, v10.4s  // exp[2,4,6,8].re
        uzp1            v19.4s, v19.4s, v11.4s  // exp[2,4,6,8].im

        st1             {v20.2s},  [x0], #8     // out[0]

        // complex multiply-accumulate: re += a.re*e.re - a.im*e.im,
        //                              im += a.re*e.im + a.im*e.re
        fmla            v16.4s, v2.4s,  v8.4s
        fmls            v16.4s, v3.4s,  v9.4s

        fmla            v17.4s, v2.4s,  v9.4s
        fmla            v17.4s, v3.4s,  v8.4s

        fmla            v16.4s, v4.4s,  v18.4s
        fmls            v16.4s, v5.4s,  v19.4s

        fmla            v17.4s, v4.4s,  v19.4s
        fmla            v17.4s, v5.4s,  v18.4s

        zip1            v18.4s, v16.4s, v17.4s  // interleave re/im for the store
        zip2            v19.4s, v16.4s, v17.4s

        // arrange exp[5]/exp[10] and the two saved out[0] values for out[5], out[10]
        rev64           v31.4s, v14.4s
        trn1            v28.2d, v1.2d,  v1.2d
        trn2            v29.2d, v1.2d,  v1.2d
        zip1            v30.2d, v14.2d, v31.2d
        zip2            v31.2d, v14.2d, v31.2d

        st1             {v18.4s,v19.4s},  [x0], #32 // out[1-4]

        fmul            v16.4s, v28.4s, v30.4s
        fmul            v17.4s, v29.4s, v30.4s
        fmls            v16.4s, v29.4s, v31.4s
        fmla            v17.4s, v28.4s, v31.4s
        faddp           v16.4s, v16.4s, v16.4s
        faddp           v17.4s, v17.4s, v17.4s
        zip1            v18.2s, v16.2s, v17.2s
        zip2            v19.2s, v16.2s, v17.2s

        fadd            v18.2s, v18.2s, v0.2s
        fadd            v0.2s,  v19.2s, v0.2s

        ext             v30.16b, v12.16b, v12.16b, #4
        ext             v31.16b, v13.16b, v13.16b, #4
        mov             v16.16b, v6.16b
        mov             v17.16b, v7.16b

        uzp1            v30.4s, v30.4s, v8.4s
        uzp1            v31.4s, v31.4s, v9.4s

        st1             {v18.2s},  [x0], #8     // out[5]

        fmla            v16.4s, v2.4s,  v10.4s
        fmls            v16.4s, v3.4s,  v11.4s

        fmla            v17.4s, v2.4s,  v11.4s
        fmla            v17.4s, v3.4s,  v10.4s

        fmla            v16.4s, v4.4s,  v30.4s
        fmls            v16.4s, v5.4s,  v31.4s

        fmla            v17.4s, v4.4s,  v31.4s
        fmla            v17.4s, v5.4s,  v30.4s

        zip1            v18.4s, v16.4s, v17.4s
        zip2            v19.4s, v16.4s, v17.4s

        ext             v30.16b, v10.16b, v10.16b, #4
        ext             v31.16b, v11.16b, v11.16b, #4

        fmla            v6.4s,  v2.4s,  v12.4s
        fmls            v6.4s,  v3.4s,  v13.4s

        st1             {v18.4s,v19.4s},  [x0], #32 // out[6-9]

        uzp1            v30.4s, v30.4s, v12.4s
        uzp1            v31.4s, v31.4s, v13.4s

        fmla            v7.4s,  v2.4s,  v13.4s
        fmla            v7.4s,  v3.4s,  v12.4s

        st1             {v0.2s},  [x0], #8     // out[10]

        fmla            v6.4s,  v4.4s,  v30.4s
        fmls            v6.4s,  v5.4s,  v31.4s

        fmla            v7.4s,  v4.4s,  v31.4s
        fmla            v7.4s,  v5.4s,  v30.4s

        zip1            v18.4s, v6.4s,  v7.4s
        zip2            v19.4s, v6.4s,  v7.4s

        st1             {v18.4s,v19.4s},  [x0], #32 // out[11-14]

        ret             x9
endfunc

// fft15_pass: in-place butterfly that merges two half-size transforms.
//
// x0: out, x1: out+len2, x2: exptab, x3: len2
//
// For every j in [0, len2):
//        t           = (out+len2)[j] * exptab[j]     (complex multiply)
//        (out+len2)[j] = out[j] - t
//        out[j]        = out[j] + t
//
// Structure:
//   - If len2 is not a multiple of 4, the first (len2 & 3) elements are done
//     one at a time. Element 0 is combined without a multiply and its exptab
//     entry is skipped (presumably exptab[0] is 1 + 0i; verify against the
//     table generator).
//   - The remainder is processed four complex values at a time in a loop that
//     is unrolled twice and software pipelined: loads for the next group are
//     issued while the current group is finished. Labels 4 and 0 are the two
//     drain paths. The vector part always handles at least one group of four.
//
// Clobb: x0-x6, v0-v7, v16-v23, flags
function fft15_pass
        ands            x6,  x3,  #3            // x6 = number of scalar head elements
        mov             x4,  x0                 // x4/x5 = store pointers, x0/x1 = load pointers
        mov             x5,  x1
        b.eq            9f
        ld1             {v0.2s},  [x0], #8
        ld1             {v1.2s},  [x1], #8
        sub             x3,  x3,  x6            // x3 = elements left for the vector loop
        subs            x6,  x6,  #1
        fadd            v2.2s,  v0.2s,  v1.2s
        fsub            v3.2s,  v0.2s,  v1.2s
        add             x2,  x2,  #8            // skip the first exptab entry
        st1             {v2.2s},  [x4], #8
        st1             {v3.2s},  [x5], #8
        b.eq            9f
1:
        subs            x6,  x6,  #1
        ldp             s4,  s5,  [x2], #8      // twiddle re, im
        ldp             s2,  s3,  [x1], #8      // upper half re, im
        ldp             s0,  s1,  [x0], #8      // lower half re, im

        fmul            s6,  s2,  s4
        fmul            s7,  s2,  s5
        fmls            s6,  s3,  v5.s[0]       // t.re = re*tw.re - im*tw.im
        fmla            s7,  s3,  v4.s[0]       // t.im = re*tw.im + im*tw.re

        fsub            s2,  s0,  s6
        fsub            s3,  s1,  s7
        fadd            s0,  s0,  s6
        fadd            s1,  s1,  s7

        stp             s2,  s3,  [x5], #8
        stp             s0,  s1,  [x4], #8
        b.gt            1b
9:
        // prime the pipeline with the first group of four
        ld1             {v4.4s,v5.4s}, [x2],  #32
        ld2             {v2.4s,v3.4s}, [x1],  #32
        uzp1            v6.4s,  v4.4s,  v5.4s   // twiddle re
        uzp2            v7.4s,  v4.4s,  v5.4s   // twiddle im
        ld2             {v0.4s,v1.4s}, [x0],  #32
8:
        subs            x3,  x3,  #8

        fmul            v4.4s,  v2.4s,  v6.4s
        fmul            v5.4s,  v2.4s,  v7.4s
        b.lt            4f                      // only the group in flight remains

        ld1             {v18.4s,v19.4s}, [x2],  #32

        fmls            v4.4s,  v3.4s,  v7.4s
        fmla            v5.4s,  v3.4s,  v6.4s

        ld2             {v22.4s,v23.4s}, [x1],  #32

        fsub            v2.4s,  v0.4s,  v4.4s
        fadd            v0.4s,  v0.4s,  v4.4s
        fsub            v3.4s,  v1.4s,  v5.4s
        fadd            v1.4s,  v1.4s,  v5.4s

        uzp1            v16.4s, v18.4s, v19.4s
        uzp2            v17.4s, v18.4s, v19.4s

        st2             {v2.4s,v3.4s}, [x5],  #32
        st2             {v0.4s,v1.4s}, [x4],  #32
        ld2             {v20.4s,v21.4s}, [x0],  #32

        fmul            v18.4s, v22.4s, v16.4s
        fmul            v19.4s, v22.4s, v17.4s
        b.eq            0f                      // flags still from the subs above

        ld1             {v4.4s,v5.4s}, [x2],  #32

        fmls            v18.4s, v23.4s, v17.4s
        fmla            v19.4s, v23.4s, v16.4s

        ld2             {v2.4s,v3.4s}, [x1],  #32

        fsub            v22.4s, v20.4s, v18.4s
        fadd            v20.4s, v20.4s, v18.4s
        fsub            v23.4s, v21.4s, v19.4s
        fadd            v21.4s, v21.4s, v19.4s

        uzp1            v6.4s,  v4.4s,  v5.4s
        uzp2            v7.4s,  v4.4s,  v5.4s

        st2             {v22.4s,v23.4s}, [x5],  #32
        st2             {v20.4s,v21.4s}, [x4],  #32
        ld2             {v0.4s,v1.4s}, [x0],  #32

        b               8b
4:
        // drain: finish the group held in v0-v7
        fmls            v4.4s,  v3.4s,  v7.4s
        fmla            v5.4s,  v3.4s,  v6.4s

        fsub            v2.4s,  v0.4s,  v4.4s
        fadd            v0.4s,  v0.4s,  v4.4s
        fsub            v3.4s,  v1.4s,  v5.4s
        fadd            v1.4s,  v1.4s,  v5.4s

        st2             {v2.4s,v3.4s}, [x5],  #32
        st2             {v0.4s,v1.4s}, [x4],  #32

        ret
0:
        // drain: finish the group held in v16-v23
        fmls            v18.4s, v23.4s, v17.4s
        fmla            v19.4s, v23.4s, v16.4s

        fsub            v22.4s, v20.4s, v18.4s
        fadd            v20.4s, v20.4s, v18.4s
        fsub            v23.4s, v21.4s, v19.4s
        fadd            v21.4s, v21.4s, v19.4s

        st2             {v22.4s,v23.4s}, [x5],  #32
        st2             {v20.4s,v21.4s}, [x4],  #32

        ret
endfunc

// fft30_neon: 30-point transform = two fft15_neon calls on the even and odd
// inputs followed by one fft15_pass butterfly.
//
// In:    x1  = output
//        x2  = input
//        x4  = input stride in complex elements
//        x10 = context pointer (only exptab[1] is read from it)
//        v8-v15, x11-x14 as required by fft15_neon
// Clobb: everything fft15_neon and fft15_pass clobber; x20-x22 are preserved
//
// The macro arguments are comma separated so that assemblers which do not
// split macro arguments on whitespace still see align=6 as its own argument.
function fft30_neon, align=6
        sub             sp,  sp,  #0x20
        stp             x20, x21, [sp]
        stp             x22, x30, [sp, #0x10]
        mov             x21, x1                 // x21 = out
        mov             x22, x2                 // x22 = in
        mov             x20, x4                 // x20 = stride
        mov             x0,  x21
        mov             x1,  x22
        lsl             x3,  x20, #1            // each half uses every second input
        bl              fft15_neon

        add             x0,  x21, #15*8         // second half of the output
        add             x1,  x22, x20,  lsl #3  // in + 1 * stride
        lsl             x3,  x20, #1
        bl              fft15_neon

        ldr             x2,  [x10, #(CELT_EXPTAB + 8)]  // s->exptab[1]
        add             x0,  x21, #0
        add             x1,  x21, #15*8
        mov             x3,  #15
        ldp             x20, x21, [sp]
        ldp             x22, x30, [sp, #0x10]
        add             sp,  sp,  #0x20
        b               fft15_pass              // tail call, returns to our caller
endfunc

// def_fft n, n2
//
// Defines fft<n>_neon as two calls to fft<n2>_neon (n2 = n / 2) on the even
// and odd inputs, followed by one fft15_pass butterfly of length n2.
//
// Register contract of the generated function:
//   In:    x1  = output
//          x2  = input
//          x3  = N, index of the exptab entry used for the final pass;
//                the half-size transforms are called with N - 1
//          x4  = input stride in complex elements
//          x10 = context pointer (exptab pointers are read from it)
//   x20-x24 are saved and restored on the stack.
//
// The macro arguments on the function line are comma separated so that
// assemblers which do not split macro arguments on whitespace still see
// align=6 as its own argument.
.macro  def_fft n, n2
function fft\n\()_neon, align=6
        sub             sp,  sp,  #0x30
        stp             x20, x21, [sp]
        stp             x22, x30, [sp, #0x10]
        stp             x23, x24, [sp, #0x20]
        mov             x21, x1                 // x21 = out
        mov             x22, x2                 // x22 = in
        mov             x23, x3                 // x23 = N
        mov             x20, x4                 // x20 = stride
        sub             x3,  x3,  #1
        lsl             x4,  x4,  #1            // each half uses every second input
        bl              fft\n2\()_neon

        add             x1,  x21, #(\n2 * 8)    // second half of the output
        add             x2,  x22, x20, lsl #3   // in + 1 * stride
        sub             x3,  x23, #1
        lsl             x4,  x20, #1
        bl              fft\n2\()_neon

        add             x5,  x10, #CELT_EXPTAB
        mov             x0,  x21
        ldr             x2,  [x5,  x23, lsl #3] // s->exptab[N]
        add             x1,  x21, #(\n2 * 8)
        mov             x3,  #\n2
        ldp             x20, x21, [sp]
        ldp             x22, x30, [sp, #0x10]
        ldp             x23, x24, [sp, #0x20]
        add             sp,  sp,  #0x30
        b               fft15_pass              // tail call, returns to our caller
endfunc
.endm

        def_fft    60,  30
        def_fft   120,  60
        def_fft   240, 120
        def_fft   480, 240
        def_fft   960, 480

// fft_b15_calc_neon: common entry that prepares the shared register state and
// dispatches to one of the transforms listed in fft_tab_neon.
//
// In:    x0  = context pointer (exptab[0] is read at CELT_EXPTAB)
//        x3  = index into fft_tab_neon
//        x1, x2, x4 are passed through untouched to the selected function
// Sets up for the callee:
//        x10     = context pointer
//        x11-x14 = addresses of the four shuffle_* vectors
//        v15     = the four floats of fact5
//        v8-v13  = exptab[0] entries 1-4, 6-9 and 11-14, split into re and im
//        v14     = exptab[0] entries 5 and 10 as re, re, im, im
// Saves and restores x20, x30 and d8-d15.
//
// NOTE(review): at the blr below x0 still holds the context pointer, x1 and x2
// are whatever the caller passed. fft30_neon and the def_fft functions take
// out/in/stride in x1/x2/x4, but table entry 0 (fft15_neon) takes them in
// x0/x1/x3. Calling with x3 == 0 therefore looks unsafe; confirm that callers
// never select entry 0.
function fft_b15_calc_neon
        sub             sp,  sp,  #0x50
        ldr             x8,  [x0,  #CELT_EXPTAB]    // s->exptab[0]
        movrel          x6,  fact5
        movrel          x11, shuffle_0213
        movrel          x12, shuffle_1032
        movrel          x13, shuffle_2301
        movrel          x14, shuffle_3120
        add             x8,  x8,  #8                // skip exp[0]
        movrel          x5,  fft_tab_neon
        stp             x20, x30, [sp]
        stp             d8,  d9,  [sp, #0x10]
        stp             d10, d11, [sp, #0x20]
        stp             d12, d13, [sp, #0x30]
        stp             d14, d15, [sp, #0x40]
        ld1             {v15.4s}, [x6]
        ld1             {v0.4s,v1.4s},   [x8],  #32 // exp[ 1 -  4]
        ld1             {v6.2s},  [x8],  #8         // exp[ 5]
        ld1             {v2.4s,v3.4s},   [x8],  #32 // exp[ 6 -  9]
        ld1             {v7.2s},  [x8],  #8         // exp[10]
        ld1             {v4.4s,v5.4s},   [x8],  #32 // exp[11 - 14]
        uzp1            v8.4s,  v0.4s,  v1.4s   // exp[ 1 -  4].re
        uzp2            v9.4s,  v0.4s,  v1.4s   // exp[ 1 -  4].im
        uzp1            v10.4s, v2.4s,  v3.4s   // exp[ 6 -  9].re
        uzp2            v11.4s, v2.4s,  v3.4s   // exp[ 6 -  9].im
        uzp1            v12.4s, v4.4s,  v5.4s   // exp[11 - 14].re
        uzp2            v13.4s, v4.4s,  v5.4s   // exp[11 - 14].im
        zip1            v14.4s, v6.4s,  v7.4s   // exp[5,10].re/exp[5,10].im
        add             x5,  x5,  x3,  lsl #3
        ldr             x5,  [x5]               // x5 = fft_tab_neon[x3]
        mov             x10, x0
        blr             x5
        ldp             x20, x30, [sp]
        ldp             d8,  d9,  [sp, #0x10]
        ldp             d10, d11, [sp, #0x20]
        ldp             d12, d13, [sp, #0x30]
        ldp             d14, d15, [sp, #0x40]
        add             sp,  sp,  #0x50
        ret
endfunc

// Entry points selected by fft_b15_calc_neon; index = value passed in x3.
//
// Every entry is an absolute code address that needs a load-time relocation
// in position-independent builds, so the table is requested with relocate=1
// instead of being placed with the plain read-only constants.
// NOTE(review): relies on the const macro from libavutil/aarch64/asm.S
// accepting a relocate= argument; confirm against that header.
const   fft_tab_neon, relocate=1
        .quad fft15_neon
        .quad fft30_neon
        .quad fft60_neon
        .quad fft120_neon
        .quad fft240_neon
        .quad fft480_neon
        .quad fft960_neon
endconst

// ff_celt_imdct_half_neon: exported entry point, three stages.
//
// In:    x0 = context pointer (fields read: CELT_LEN2 pair, CELT_TMP pair,
//             CELT_FFT_N, and after the FFT CELT_LEN4 and CELT_TWIDDLE)
//        x1 = output buffer
//        x2 = input coefficients
//        x3 = input stride in floats
//        s0 = scale applied to every output value
//
// Stage 1 (label 1/3): pre-rotation. Walks the input from both ends, two
//        strides per step, multiplies each pair by one complex twiddle and
//        writes interleaved complex values to the tmp buffer. Four complex
//        values per half iteration, software pipelined.
// Stage 2: fft_b15_calc_neon with x1 = output, x2 = tmp, x3 = fft_n, x4 = 1.
// Stage 3 (label 2): post-rotation in place on the output. Starts at byte
//        offset len4 * 4 in both the output and the twiddle table and walks
//        outwards in both directions, 16 bytes per step and direction,
//        applying the twiddle and the scale.
//
// Stack: 0x20 bytes; x21 and x30 at [sp], the scale at [sp, #0x10].
// After stage 2 the context pointer is taken from x10, which
// fft_b15_calc_neon sets before dispatching.
function ff_celt_imdct_half_neon, export=1
        sub             sp,  sp,  #0x20
        stp             x21, x30, [sp]
        str             s0, [sp, #0x10]         // scale, reloaded into s31 for stage 3

        ldp             w5,  w6,  [x0,  #CELT_LEN2] // w5 = len2, w6 = next word (CELT_LEN4)
        mov             x21, x1                 // x21 = output, survives the FFT call
        sub             w5,  w5,  #1
        lsl             x7,  x3,  #3            //  2 * stride * sizeof(float)
        sub             x8,  xzr, x3,  lsl #3   // -2 * stride * sizeof(float)
        mul             x5,  x5,  x3
        ldp             x9,  x10, [x0,  #CELT_TMP]  // x9 = tmp, x10 = next pointer (CELT_TWIDDLE)
        ldr             w3,  [x0, #CELT_FFT_N]
        add             x5,  x2,  x5,  lsl #2   // x5 = &in[(len2 - 1) * stride]
        mov             x11, x9                 // keep the start of tmp for the FFT

        sub             w6,  w6,  #4
        ld1             {v0.s}[0],  [x5], x8
        ld1             {v1.s}[0],  [x2], x7
        ld1             {v4.4s,v5.4s}, [x10], #32
        ld1             {v0.s}[1],  [x5], x8
        ld1             {v1.s}[1],  [x2], x7
        uzp1            v2.4s,  v4.4s,  v5.4s   // twiddle re
        ld1             {v0.s}[2],  [x5], x8
        ld1             {v1.s}[2],  [x2], x7
        uzp2            v3.4s,  v4.4s,  v5.4s   // twiddle im
        ld1             {v0.s}[3],  [x5], x8
        ld1             {v1.s}[3],  [x2], x7
1:
        subs            w6,  w6,  #4

        ld1             {v20.s}[0], [x5], x8
        ld1             {v21.s}[0], [x2], x7
        ld1             {v4.4s,v5.4s}, [x10], #32

        fmul            v6.4s,  v0.4s,  v2.4s
        fmul            v7.4s,  v0.4s,  v3.4s

        ld1             {v20.s}[1], [x5], x8
        ld1             {v21.s}[1], [x2], x7

        fmls            v6.4s,  v1.4s,  v3.4s   // re = back*tw.re - front*tw.im
        fmla            v7.4s,  v1.4s,  v2.4s   // im = back*tw.im + front*tw.re

        ld1             {v20.s}[2], [x5], x8
        ld1             {v21.s}[2], [x2], x7

        uzp1            v2.4s,  v4.4s,  v5.4s
        uzp2            v3.4s,  v4.4s,  v5.4s
        ld1             {v20.s}[3], [x5], x8
        ld1             {v21.s}[3], [x2], x7

        zip1            v4.4s,  v6.4s,  v7.4s
        zip2            v5.4s,  v6.4s,  v7.4s

        fmul            v6.4s,  v20.4s, v2.4s
        fmul            v7.4s,  v20.4s, v3.4s

        st1             {v4.4s,v5.4s}, [x9], #32

        fmls            v6.4s,  v21.4s, v3.4s
        fmla            v7.4s,  v21.4s, v2.4s

        b.eq            3f

        subs            w6,  w6,  #4
        ld1             {v4.4s,v5.4s}, [x10], #32
        ld1             {v0.s}[0],  [x5], x8
        ld1             {v1.s}[0],  [x2], x7
        uzp1            v2.4s,  v4.4s,  v5.4s
        ld1             {v0.s}[1],  [x5], x8
        ld1             {v1.s}[1],  [x2], x7
        uzp2            v3.4s,  v4.4s,  v5.4s
        ld1             {v0.s}[2],  [x5], x8
        ld1             {v1.s}[2],  [x2], x7
        zip1            v4.4s,  v6.4s,  v7.4s
        zip2            v5.4s,  v6.4s,  v7.4s
        ld1             {v0.s}[3],  [x5], x8
        ld1             {v1.s}[3],  [x2], x7

        st1             {v4.4s,v5.4s}, [x9], #32

        b.gt            1b

        fmul            v6.4s,  v0.4s,  v2.4s
        fmul            v7.4s,  v0.4s,  v3.4s
        fmls            v6.4s,  v1.4s,  v3.4s
        fmla            v7.4s,  v1.4s,  v2.4s
3:
        zip1            v4.4s,  v6.4s,  v7.4s
        zip2            v5.4s,  v6.4s,  v7.4s
        st1             {v4.4s,v5.4s}, [x9], #32

        mov             x2,  x11                // FFT input = tmp
        mov             x4,  #1                 // FFT input stride

        bl              fft_b15_calc_neon

        ldr             w5,  [x10, #CELT_LEN4]
        ldr             x6,  [x10, #CELT_TWIDDLE]
        ldr             s31, [sp, #0x10]        // scale

        add             x1,  x21, x5,  lsl #2
        add             x3,  x6,  x5,  lsl #2
        sub             x0,  x1,  #16
        sub             x2,  x3,  #16
        mov             x8,  #-16
        mov             x7,  #16
        mov             x10, x0                 // x10/x11 = store pointers,
        mov             x11, x1                 // x0/x1 = load pointers

        sub             w5,  w5,  #4

        ld1             {v0.4s},  [x0], x8
        ld1             {v1.4s},  [x1], x7
        ld1             {v2.4s},  [x2], x8
        ld1             {v3.4s},  [x3], x7

        uzp1            v4.4s,  v0.4s,  v1.4s   // z[-i-2, -i-1, +i, i+1].re
        uzp2            v6.4s,  v0.4s,  v1.4s   // z[-i-2, -i-1, +i, i+1].im

        uzp1            v5.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].re
        uzp2            v7.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].im

        fmul            v1.4s,  v6.4s,  v5.4s
        fmul            v0.4s,  v6.4s,  v7.4s
2:
        subs            w5,  w5,  #4

        ld1             {v20.4s}, [x0], x8

        fmla            v1.4s,  v4.4s,  v7.4s
        fmls            v0.4s,  v4.4s,  v5.4s

        ld1             {v21.4s}, [x1], x7

        ext             v1.16b, v1.16b, v1.16b, #8
        fmul            v0.4s,  v0.4s,  v31.s[0]

        ld1             {v2.4s},  [x2], x8

        rev64           v1.4s,  v1.4s           // with the ext above: reverse all four lanes
        fmul            v1.4s,  v1.4s,  v31.s[0]

        ld1             {v3.4s},  [x3], x7

        zip1            v5.4s,  v0.4s,  v1.4s
        zip2            v7.4s,  v0.4s,  v1.4s

        uzp1            v4.4s,  v20.4s, v21.4s  // z[-i-2, -i-1, +i, i+1].re
        uzp2            v6.4s,  v20.4s, v21.4s  // z[-i-2, -i-1, +i, i+1].im

        st1             {v5.4s},  [x10], x8
        st1             {v7.4s},  [x11], x7

        uzp1            v5.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].re
        uzp2            v7.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].im

        fmul            v1.4s,  v6.4s,  v5.4s
        fmul            v0.4s,  v6.4s,  v7.4s
        b.gt            2b

        // drain: finish the last group started above
        fmla            v1.4s,  v4.4s,  v7.4s
        fmls            v0.4s,  v4.4s,  v5.4s
        ext             v1.16b, v1.16b, v1.16b, #8
        fmul            v0.4s,  v0.4s,  v31.s[0]
        rev64           v1.4s,  v1.4s
        fmul            v1.4s,  v1.4s,  v31.s[0]
        zip1            v5.4s,  v0.4s,  v1.4s
        zip2            v7.4s,  v0.4s,  v1.4s
        st1             {v5.4s},  [x10], x8
        st1             {v7.4s},  [x11], x7

        ldp             x21, x30, [sp]
        add             sp,  sp,  #0x20
        ret
endfunc

// Twiddle factors for the 5-point transform, stored as re, im pairs.
// [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5)
// Loaded as one 4-float vector into v15 by fft_b15_calc_neon.
// The comma in front of align= keeps it a separate macro argument on
// assemblers that do not split macro arguments on whitespace.
const   fact5,          align=4
        .float           0.30901699437494745, 0.95105651629515353
        .float          -0.80901699437494734, 0.58778525229247325
endconst
