/*
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/*
 * bflies d0, d1, r0, r1
 *
 * Butterfly stage shared by transform01 and transform2.
 *
 *   d0, d1   in/out  data vectors; both are overwritten with the result
 *   r0, r1   scratch clobbered
 *
 * The 32-bit words of d1 are cross-combined (word swap, halving add and
 * subtract, rotate by one halfword, 32-bit transpose) into r0, and then
 *     d1 = (d0 - r0) >> 1
 *     d0 = (d0 + r0) >> 1
 * Every add/subtract is the signed halving form, so each arithmetic level
 * scales its result by 1/2.
 *
 * All expansions visible in this file pass Q registers for the four
 * operands.  The per-lane annotations below are the original ones.
 */
.macro  bflies          d0,  d1,  r0,  r1
        vrev64.32       \r0, \d1                @ t5, t6, t1, t2
        vhsub.s16       \r1, \d1, \r0           @ t1-t5, t2-t6, t5-t1, t6-t2
        vhadd.s16       \r0, \d1, \r0           @ t1+t5, t2+t6, t5+t1, t6+t2
        vext.16         \r1, \r1, \r1, #1       @ t2-t6, t5-t1, t6-t2, t1-t5
        vtrn.32         \r0, \r1                @ t1+t5, t2+t6, t2-t6, t5-t1
                                                @ t5,    t6,    t4,    t3
        vhsub.s16       \d1, \d0, \r0
        vhadd.s16       \d0, \d0, \r0
.endm

/*
 * transform01 q0, q1, d3, c0, c1, r0, w0, w1
 *
 * Multiply one D register by a coefficient pair, then run bflies on q0/q1.
 *
 *   q0, q1   in/out  Q registers handed to bflies
 *   d3       in/out  16-bit lanes, replaced by
 *                        (d3 * c0 + swap16(d3) * c1) >> 15
 *                    where swap16 exchanges the two halfwords of every
 *                    32-bit word (vrev32.16)
 *   c0, c1   in      per-lane signed 16-bit coefficients
 *   r0       scratch D register holding swap16(d3)
 *   w0, w1   scratch Q registers; w0 first holds the 32-bit products,
 *                    then both serve as the bflies scratch operands
 *
 * In every call in this file d3 is a D register that is one half of q1,
 * so the scaled product is what bflies then consumes.
 */
.macro  transform01     q0,  q1,  d3,  c0,  c1,  r0,  w0,  w1
        vrev32.16       \r0, \d3
        vmull.s16       \w0, \d3, \c0
        vmlal.s16       \w0, \r0, \c1
        vshrn.s32       \d3, \w0, #15
        bflies          \q0, \q1, \w0, \w1
.endm

/*
 * transform2 d0, d1, d2, d3, q0, q1, c0, c1, c2, c3, r0, r1, w0, w1
 *
 * Two-register variant of transform01: multiply d1 by (c0, c1) and d3 by
 * (c2, c3), then run bflies on q0/q1.
 *
 *   d1       in/out  replaced by (d1 * c0 + swap16(d1) * c1) >> 15
 *   d3       in/out  replaced by (d3 * c2 + swap16(d3) * c3) >> 15
 *   q0, q1   in/out  Q registers handed to bflies
 *   c0..c3   in      per-lane signed 16-bit coefficients
 *   r0, r1   scratch D registers holding the halfword-swapped inputs
 *   w0, w1   scratch Q registers; 32-bit products first, bflies scratch after
 *   d0, d2   accepted for call-site symmetry but never referenced in the body
 *
 * swap16 exchanges the two halfwords of every 32-bit word (vrev32.16).
 * In every call in this file d1 and d3 are the two halves of q1.
 */
.macro  transform2      d0,  d1,  d2,  d3,  q0,  q1,  c0,  c1,  c2,  c3, \
                        r0,  r1,  w0,  w1
        vrev32.16       \r0, \d1
        vrev32.16       \r1, \d3
        vmull.s16       \w0, \d1, \c0
        vmlal.s16       \w0, \r0, \c1
        vmull.s16       \w1, \d3, \c2
        vmlal.s16       \w1, \r1, \c3
        vshrn.s32       \d1, \w0, #15
        vshrn.s32       \d3, \w1, #15
        bflies          \q0, \q1, \w0, \w1
.endm

/*
 * fft4 d0, d1, r0, r1
 *
 * 4-point transform kernel operating entirely in registers.
 *
 *   d0, d1   in/out  input data; overwritten with the result in the lane
 *                    order given by the last two annotations below
 *   r0, r1   scratch clobbered
 *
 * d1 is temporarily reused as a bit mask (0x0000ffff00000000 in each
 * 64-bit lane) so that vbit can take one halfword per 64 bits from the
 * negated difference r1 instead of r0.  All adds and subtracts are the
 * signed halving forms.
 *
 * fft4_neon and fft8 expand this with D registers; fft16_neon expands it
 * with Q registers.
 */
.macro  fft4            d0,  d1,  r0,  r1
        vhsub.s16       \r0, \d0, \d1           @ t3, t4, t8, t7
        vhsub.s16       \r1, \d1, \d0
        vhadd.s16       \d0, \d0, \d1           @ t1, t2, t6, t5
        vmov.i64        \d1, #0xffff00000000
        vbit            \r0, \r1, \d1
        vrev64.16       \r1, \r0                @ t7, t8, t4, t3
        vtrn.32         \r0, \r1                @ t3, t4, t7, t8
        vtrn.32         \d0, \r0                @ t1, t2, t3, t4, t6, t5, t8, t7
        vhsub.s16       \d1, \d0, \r0           @ r2, i2, r3, i1
        vhadd.s16       \d0, \d0, \r0           @ r0, i0, r1, i3
.endm

/*
 * fft8 d0, d1, d2, d3, q0, q1, c0, c1, r0, r1, w0, w1
 *
 * 8-point transform kernel built from fft4 plus one transform01 step.
 *
 *   d0, d1   in/out  first half of the data, processed by fft4
 *   d2, d3   in/out  second half: replaced by their halving sum (d2) and
 *                    halving difference (d3) before transform01
 *   q0, q1   in/out  Q views handed to transform01; the call sites pass the
 *                    Q registers that alias d0:d1 and d2:d3
 *   c0, c1   in      coefficient vectors for transform01
 *   r0, r1   scratch D registers
 *   w0, w1   scratch Q registers
 */
.macro  fft8            d0,  d1,  d2,  d3,  q0,  q1,  c0,  c1,  r0,  r1, w0, w1
        fft4            \d0, \d1, \r0, \r1
        vtrn.32         \d0, \d1                @ z0, z2, z1, z3
        vhadd.s16       \r0, \d2, \d3           @ t1, t2, t3, t4
        vhsub.s16       \d3, \d2, \d3           @ z5, z7
        vmov            \d2, \r0
        transform01     \q0, \q1, \d3, \c0, \c1, \r0, \w0, \w1
.endm

/*
 * fft4_neon: in-place 4-point transform.
 *
 *   r0   in  pointer to 16 bytes of data (eight 16-bit values); no
 *            alignment hint is used on the load or the store
 *
 * Clobbers d0-d3.  Returns with bx lr; r0 is left unchanged.
 */
function fft4_neon
        vld1.16         {d0-d1},  [r0]
        fft4            d0,  d1,  d2,  d3
        vst1.16         {d0-d1},  [r0]
        bx              lr
endfunc

/*
 * fft8_neon: in-place 8-point transform.
 *
 *   r0   in  pointer to 32 bytes of data; accessed with a 128-bit
 *            alignment hint
 *
 * Clobbers r1, d0-d3, d16-d21 (q8, q9, d20, d21), d30 and d31.
 * Returns with bx lr; r0 is left unchanged.
 */
function fft8_neon
        vld1.16         {d0-d3},  [r0,:128]
        movrel          r1,  coefs
        vld1.16         {d30},    [r1,:64]      @ d30 = first row of coefs
        vdup.16         d31, d30[0]             @ d31 = lane 0 of that row in every lane
        fft8            d0,  d1,  d2,  d3,  q0,  q1,  d31, d30, d20, d21, q8, q9
        vtrn.32         d0,  d1
        vtrn.32         d2,  d3
        vst1.16         {d0-d3},  [r0,:128]
        bx              lr
endfunc

/*
 * fft16_neon: in-place 16-point transform.
 *
 *   r0   in  pointer to 64 bytes of data; accessed with a 128-bit
 *            alignment hint
 *
 * The first 32 bytes (d0-d3) go through fft8.  The last 32 bytes (q2, q3)
 * go through fft4 expanded on Q registers, with d5/d6 swapped around it.
 * The two parts are then combined with transform01 and transform2.
 *
 * Clobbers r1, d0-d7, d16-d21, d26-d31.  r0 ends 32 bytes past its entry
 * value because of the final post-incremented store.
 */
function fft16_neon
        vld1.16         {d0-d3},  [r0,:128]!    @ writeback advances r0 by 32
        vld1.16         {d4-d7},  [r0,:128]
        movrel          r1,  coefs
        sub             r0,  r0,  #32           @ back to the start of the data
        vld1.16         {d28-d31},[r1,:128]     @ all four rows of coefs
        vdup.16         d31, d28[0]             @ replaces row 3 with a broadcast of row 0 lane 0
        fft8            d0,  d1,  d2,  d3,  q0,  q1,  d31, d28, d20, d21, q8, q9
        vswp            d5,  d6
        fft4            q2,  q3,  q8,  q9
        vswp            d5,  d6
        vtrn.32         q0,  q1             @ z0, z4, z2, z6, z1, z5, z3, z7
        vtrn.32         q2,  q3             @ z8, z12,z10,z14,z9, z13,z11,z15
        vswp            d1,  d2
        vdup.16         d31, d28[0]
        transform01     q0,  q2,  d5,  d31, d28, d20, q8, q9
        vdup.16         d26, d29[0]             @ broadcast of row 1 lane 0
        vdup.16         d27, d30[0]             @ broadcast of row 2 lane 0
        transform2      d2,  d6,  d3,  d7,  q1,  q3,  d26, d30, d27, d29, \
                        d20, d21, q8,  q9
        vtrn.32         q0,  q1
        vtrn.32         q2,  q3
        vst1.16         {d0-d3},  [r0,:128]!
        vst1.16         {d4-d7},  [r0,:128]
        bx              lr
endfunc

/*
 * fft_pass_neon: combining pass used by every fftN_neon generated by def_fft.
 *
 *   r0   in  data pointer (128-bit aligned accesses)
 *   r1   in  pointer to a table of 16-bit coefficients, read forwards from
 *            r1 and backwards from r1 + 4*r2
 *   r2   in  count; def_fft passes n/8 for an n-point transform
 *
 * Register roles inside the function:
 *   r4        working copy of r0 for the strided loads and stores
 *   r12       8*r2, the byte stride between the four 16-byte groups that
 *             are loaded and stored per iteration
 *   r3, lr    backwards table pointer and its step of -8 bytes
 *   q15       the 1,-1,-1,1 row of coefs in both halves (sign pattern)
 *   d28, d29  coefficients fetched forwards / backwards for this iteration
 *   d24-d27   broadcast coefficients handed to the transform macros
 *
 * Each iteration advances r0 by 16 bytes and decrements r2 by 2; the loop
 * runs while the result is greater than zero.  The first iteration enters
 * at the prologue, uses transform01 for its first half, and joins the
 * shared second half at label 2.  Later iterations start at label 1 and
 * use transform2 for both halves.
 *
 * Only d0-d7 and d16-d31 are written.  r4 and lr are saved on entry and
 * the function returns by popping pc.
 */
function fft_pass_neon
        push            {r4,lr}
        movrel          lr,  coefs + 24
        vld1.16         {d30},    [lr,:64]      @ d30 = 1, -1, -1, 1
        lsl             r12, r2,  #3            @ r12 = byte stride between groups
        vmov            d31, d30                @ q15 = sign pattern in both halves
        add             r3,  r1,  r2,  lsl #2   @ r3 = r1 + 4*r2
        mov             lr,  #-8                @ step for the backwards pointer
        sub             r3,  r3,  #2            @ last halfword below r1 + 4*r2
        mov             r4,  r0
        vld1.16         {d27[]},  [r3,:16]      @ broadcast that halfword
        sub             r3,  r3,  #6            @ r3 = r1 + 4*r2 - 8
        vld1.16         {q0},     [r4,:128], r12
        vld1.16         {q1},     [r4,:128], r12
        vld1.16         {q2},     [r4,:128], r12
        vld1.16         {q3},     [r4,:128], r12
        vld1.16         {d28},    [r1,:64]!     @ forwards coefficients
        vld1.16         {d29},    [r3,:64], lr  @ backwards coefficients
        vswp            d1,  d2
        vswp            d5,  d6
        vtrn.32         d0,  d1
        vtrn.32         d4,  d5
        vdup.16         d25, d28[1]
        vmul.s16        d27, d27, d31           @ apply the sign pattern
        transform01     q0,  q2,  d5,  d25, d27, d20, q8,  q9
        b               2f
1:
        mov             r4,  r0
        vdup.16         d26, d29[0]             @ taken from d29 before it is reloaded below
        vld1.16         {q0},     [r4,:128], r12
        vld1.16         {q1},     [r4,:128], r12
        vld1.16         {q2},     [r4,:128], r12
        vld1.16         {q3},     [r4,:128], r12
        vld1.16         {d28},    [r1,:64]!
        vld1.16         {d29},    [r3,:64], lr
        vswp            d1,  d2
        vswp            d5,  d6
        vtrn.32         d0,  d1
        vtrn.32         d4,  d5
        vdup.16         d24, d28[0]
        vdup.16         d25, d28[1]
        vdup.16         d27, d29[3]
        vmul.s16        q13, q13, q15           @ sign pattern on d26 and d27
        transform2      d0,  d4,  d1,  d5,  q0,  q2,  d24, d26, d25, d27, \
                        d16, d17, q9,  q10
2:
        vtrn.32         d2,  d3
        vtrn.32         d6,  d7
        vdup.16         d24, d28[2]
        vdup.16         d26, d29[2]
        vdup.16         d25, d28[3]
        vdup.16         d27, d29[1]
        vmul.s16        q13, q13, q15           @ sign pattern on d26 and d27
        transform2      d2,  d6,  d3,  d7,  q1,  q3,  d24, d26, d25, d27, \
                        d16, d17, q9,  q10
        vtrn.32         d0,  d1                 @ undo the transposes and swaps
        vtrn.32         d2,  d3
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vswp            d1,  d2
        vswp            d5,  d6
        mov             r4,  r0
        vst1.16         {q0},     [r4,:128], r12
        vst1.16         {q1},     [r4,:128], r12
        vst1.16         {q2},     [r4,:128], r12
        vst1.16         {q3},     [r4,:128], r12
        add             r0,  r0,  #16           @ next 16 bytes of each group
        subs            r2,  r2,  #2
        bgt             1b
        pop             {r4,pc}
endfunc

/*
 * Fixed-point constants scaled by 2^15, matching the 15-bit narrowing
 * shift in transform01 and transform2:
 *   23170 ~ 32768 * sqrt(1/2)
 *   30274 ~ 32768 * cos(pi/8)
 *   12540 ~ 32768 * cos(3*pi/8)
 */
#define F_SQRT1_2   23170
#define F_COS_16_1  30274
#define F_COS_16_3  12540

/*
 * Coefficient rows of four 16-bit values each, all with the sign pattern
 * +, -, -, +.  fft8_neon loads the first row, fft16_neon loads all four,
 * and fft_pass_neon loads the last row from byte offset 24.
 */
const   coefs, align=4
        .short          F_SQRT1_2, -F_SQRT1_2, -F_SQRT1_2,  F_SQRT1_2
        .short          F_COS_16_1,-F_COS_16_1,-F_COS_16_1, F_COS_16_1
        .short          F_COS_16_3,-F_COS_16_3,-F_COS_16_3, F_COS_16_3
        .short          1,         -1,         -1,          1
endconst

/*
 * def_fft n, n2, n4
 *
 * Emit fft<n>_neon, the n-point transform built from smaller ones.
 *   n    size being defined
 *   n2   n/2, size of the transform run on the start of the buffer
 *   n4   n/4, size of the two transforms run on the rest
 *
 * Generated function, with r0 = data pointer on entry:
 *   1. fft<n2>_neon on r0
 *   2. fft<n4>_neon on r0 + n4*2*4 bytes
 *   3. fft<n4>_neon on r0 + n4*3*4 bytes
 *   4. tail-branch to fft_pass_neon with r0 = data, r1 = address of the
 *      external table ff_cos_<n>_fixed, r2 = n4/2
 *
 * r4 keeps the data pointer across the calls.  r4 and lr are restored
 * before the tail branch, so fft_pass_neon returns straight to the
 * caller of fft<n>_neon.
 */
.macro  def_fft n, n2, n4
function fft\n\()_neon
        push            {r4, lr}
        mov             r4,  r0
        bl              fft\n2\()_neon
        add             r0,  r4,  #\n4*2*4
        bl              fft\n4\()_neon
        add             r0,  r4,  #\n4*3*4
        bl              fft\n4\()_neon
        mov             r0,  r4
        pop             {r4, lr}
        movrelx         r1,  X(ff_cos_\n\()_fixed)
        mov             r2,  #\n4/2
        b               fft_pass_neon
endfunc
.endm

/*
 * Instantiate fft32_neon through fft65536_neon.  Each size calls the
 * half-size and quarter-size functions, which are either the hand-written
 * fft8_neon/fft16_neon or come from an earlier line of this list.
 */
        def_fft    32,    16,     8
        def_fft    64,    32,    16
        def_fft   128,    64,    32
        def_fft   256,   128,    64
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384

/*
 * ff_fft_fixed_calc_neon (exported): dispatch to the size-specific transform.
 *
 *   r0   in  pointer to a structure whose first 32-bit word selects the
 *            size (presumably log2 of the transform length in the FFT
 *            context; confirm against the C declaration)
 *   r1   in  data pointer, passed on to the selected function in r0
 *
 * The selector minus 2 indexes fft_fixed_tab_neon, whose first entry is
 * the 4-point function.  The selector is not range-checked here.  The
 * selected function is entered with bx, so it returns directly to the
 * caller.  Clobbers r2 and r3.
 */
function ff_fft_fixed_calc_neon, export=1
        ldr             r2,  [r0]
        sub             r2,  r2,  #2            @ table starts at selector 2
        movrel          r3,  fft_fixed_tab_neon
        ldr             r3,  [r3, r2, lsl #2]   @ 4-byte entries
        mov             r0,  r1
        bx              r3
endfunc

/*
 * Function-pointer table read by ff_fft_fixed_calc_neon.  Entry i is the
 * transform of 2^(i+2) points, from 4 up to 65536.
 */
const   fft_fixed_tab_neon
        .word fft4_neon
        .word fft8_neon
        .word fft16_neon
        .word fft32_neon
        .word fft64_neon
        .word fft128_neon
        .word fft256_neon
        .word fft512_neon
        .word fft1024_neon
        .word fft2048_neon
        .word fft4096_neon
        .word fft8192_neon
        .word fft16384_neon
        .word fft32768_neon
        .word fft65536_neon
endconst
