1/*
2 * Copyright (c) 2013 RISC OS Open Ltd
3 * Author: Ben Avison <bavison@riscosopen.org>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavutil/arm/asm.S"
23
24@ TODO: * FFTs wider than 16
25@       * dispatch code
26
@ fft4_vfp: in-place 4-point complex FFT.
@ In:   a1 = pointer to 4 complex values stored as interleaved re,im floats
@       (hence the #n*2*4 byte offsets).
@ Uses only the scalar bank s0-s15, so no FPSCR vector-mode setup and no
@ callee-saved registers are required.  The "@ stall" lines mark VFP
@ result-latency slots deliberately left empty in the schedule; they
@ assemble to nothing.
function fft4_vfp
        vldr    d0, [a1, #0*2*4]   @ s0,s1   = z[0]
        vldr    d4, [a1, #1*2*4]   @ s8,s9   = z[1]
        vldr    d1, [a1, #2*2*4]   @ s2,s3   = z[2]
        vldr    d5, [a1, #3*2*4]   @ s10,s11 = z[3]
        @ stall
        vadd.f  s12, s0, s8        @ i0 = z[0].re + z[1].re
        vadd.f  s13, s1, s9        @ i1 = z[0].im + z[1].im
        vadd.f  s14, s2, s10       @ i2 = z[2].re + z[3].re
        vadd.f  s15, s3, s11       @ i3 = z[2].im + z[3].im
        vsub.f  s8, s0, s8         @ i4 = z[0].re - z[1].re
        vsub.f  s9, s1, s9         @ i5 = z[0].im - z[1].im
        vsub.f  s10, s2, s10       @ i6 = z[2].re - z[3].re
        vsub.f  s11, s3, s11       @ i7 = z[2].im - z[3].im
        @ stall
        @ stall
        vadd.f  s0, s12, s14       @ z[0].re = i0 + i2
        vsub.f  s4, s12, s14       @ z[2].re = i0 - i2
        vadd.f  s1, s13, s15       @ z[0].im = i1 + i3
        vsub.f  s5, s13, s15       @ z[2].im = i1 - i3
        vadd.f  s7, s9, s10        @ z[3].im = i5 + i6
        vsub.f  s3, s9, s10        @ z[1].im = i5 - i6
        vadd.f  s2, s8, s11        @ z[1].re = i4 + i7
        vsub.f  s6, s8, s11        @ z[3].re = i4 - i7
        @ stall
        @ stall
        vstr    d0, [a1, #0*2*4]   @ z[0] = s0,s1
        vstr    d2, [a1, #2*2*4]   @ z[2] = s4,s5
        @ stall
        @ stall
        vstr    d1, [a1, #1*2*4]   @ z[1] = s2,s3
        vstr    d3, [a1, #3*2*4]   @ z[3] = s6,s7
        bx      lr
endfunc
62
@ macro_fft8_head: bulk of an in-place 8-point FFT on the complex array at a1.
@ Precondition: the caller has programmed FPSCR for short-vector mode with
@ vector length 4, stride 1 (see fft8_vfp / ff_fft16_vfp), so arithmetic
@ whose destination lies in s8-s31 operates on four consecutive registers
@ ("@ vector op" below) while s0-s7 stay scalar.
@ Indentation depth distinguishes logically independent instruction streams
@ that have been interleaved by hand to hide VFP latencies.
@ On exit, z[1] and z[3] are still live in s16-s19 (d8,d9); the matching
@ macro_fft8_tail stores them, allowing a caller to overlap other work
@ in between.
.macro macro_fft8_head
        @ FFT4 on z[0..3]
        vldr    d4, [a1, #0 * 2*4]
        vldr    d6, [a1, #1 * 2*4]
        vldr    d5, [a1, #2 * 2*4]
        vldr    d7, [a1, #3 * 2*4]
            @ BF: butterflies on z[4..7]
            vldr    d12, [a1, #4 * 2*4]
        vadd.f  s16, s8, s12    @ vector op
            vldr    d14, [a1, #5 * 2*4]
            vldr    d13, [a1, #6 * 2*4]
            vldr    d15, [a1, #7 * 2*4]
        vsub.f  s20, s8, s12    @ vector op
        vadd.f  s0, s16, s18
        vsub.f  s2, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s3, s17, s19
        vadd.f  s7, s21, s22
        vsub.f  s5, s21, s22
        vadd.f  s4, s20, s23
        vsub.f  s6, s20, s23
            vsub.f  s20, s24, s28   @ vector op
        vstr    d0, [a1, #0 * 2*4]  @ transfer s0-s7 to s24-s31 via memory
        vstr    d1, [a1, #1 * 2*4]
        vldr    s0, cos1pi4         @ scalar twiddle factor for the TRANSFORM
            vadd.f  s16, s24, s28   @ vector op
        vstr    d2, [a1, #2 * 2*4]
        vstr    d3, [a1, #3 * 2*4]
        vldr    d12, [a1, #0 * 2*4]
            @ TRANSFORM
            vmul.f  s20, s20, s0    @ vector x scalar op
        vldr    d13, [a1, #1 * 2*4]
        vldr    d14, [a1, #2 * 2*4]
        vldr    d15, [a1, #3 * 2*4]
        @ BUTTERFLIES
        vadd.f  s0, s18, s16
        vadd.f  s1, s17, s19
        vsub.f  s2, s17, s19
        vsub.f  s3, s18, s16
            vadd.f  s4, s21, s20
            vsub.f  s5, s21, s20
            vadd.f  s6, s22, s23
            vsub.f  s7, s22, s23
        vadd.f  s8, s0, s24         @ vector op
        vstr    d0, [a1, #0 * 2*4]  @ transfer s0-s3 to s12-s15 via memory
        vstr    d1, [a1, #1 * 2*4]
        vldr    d6, [a1, #0 * 2*4]
        vldr    d7, [a1, #1 * 2*4]
            vadd.f  s1, s5, s6
            vadd.f  s0, s7, s4
            vsub.f  s2, s5, s6
            vsub.f  s3, s7, s4
        vsub.f  s12, s24, s12       @ vector op
            vsub.f  s5, s29, s1
            vsub.f  s4, s28, s0
            vsub.f  s6, s30, s2
            vsub.f  s7, s31, s3
            vadd.f  s16, s0, s28    @ vector op - z[1],z[3] results, stored by macro_fft8_tail
        vstr    d6, [a1, #4 * 2*4]
        vstr    d7, [a1, #6 * 2*4]
        vstr    d4, [a1, #0 * 2*4]
        vstr    d5, [a1, #2 * 2*4]
             vstr    d2, [a1, #5 * 2*4]
             vstr    d3, [a1, #7 * 2*4]
.endm
128
@ macro_fft8_tail: store the final two results of macro_fft8_head.
@ d8,d9 (= s16-s19) hold z[1] and z[3] computed by the last vector add in
@ the head macro; the stores are split out so a caller (see ff_fft16_vfp)
@ can slot independent loads between head and tail to cover the latency.
.macro macro_fft8_tail
             vstr    d8, [a1, #1 * 2*4]
             vstr    d9, [a1, #3 * 2*4]
.endm
133
@ fft8_vfp: in-place 8-point complex FFT on the array addressed by a1.
@ Switches the VFP into RunFast short-vector mode (length 4, stride 1) for
@ the duration of the macros, then restores the caller's FPSCR.  s16-s31
@ are callee-saved under the AAPCS VFP convention, hence the vpush/vpop.
function fft8_vfp
        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
        fmrx    a2, FPSCR           @ save caller's FPSCR across the vector-mode section
        fmxr    FPSCR, a3
        vpush   {s16-s31}

        macro_fft8_head
        macro_fft8_tail

        vpop    {s16-s31}
        fmxr    FPSCR, a2           @ restore original FPSCR (leaves vector mode)
        bx      lr
endfunc
147
@ Twiddle-factor constants, 8-byte aligned so that the double-word vldr of
@ the cos1pi4/cos1pi8 pair in ff_fft16_vfp is aligned.
.align 3
cos1pi4:    @ cos(1*pi/4) = sqrt(2)/2
        .float  0.707106769084930419921875
cos1pi8:    @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
        .float  0.92387950420379638671875
cos3pi8:    @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
        .float  0.3826834261417388916015625
155
@ ff_fft16_vfp: in-place 16-point complex FFT.
@ In:   a1 = pointer to 16 complex values stored as interleaved re,im floats.
@ Structure: 8-point FFT on z[0..7] via macro_fft8_head/tail, two 4-point
@ FFTs on z[8..11] and z[12..15], then the combining TRANSFORM passes
@ labelled inline.  Independent streams are hand-interleaved (shown by
@ indentation depth) to hide VFP latencies; loads for the next stage are
@ slotted between head and tail of the 8-point macro.
@ Runs in VFP short-vector mode (length 4, stride 1): ops with destination
@ in s8-s31 act on four consecutive registers.  Caller's FPSCR and the
@ callee-saved s16-s31 are preserved per AAPCS.
function ff_fft16_vfp, export=1
        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
        fmrx    a2, FPSCR           @ save caller's FPSCR
        fmxr    FPSCR, a3
        vpush   {s16-s31}

        macro_fft8_head
        @ FFT4(z+8) -- loads interleaved before macro_fft8_tail's stores
        vldr    d10, [a1, #8 * 2*4]
        vldr    d12, [a1, #9 * 2*4]
        vldr    d11, [a1, #10 * 2*4]
        vldr    d13, [a1, #11 * 2*4]
        macro_fft8_tail
        vadd.f  s16, s20, s24   @ vector op
            @ FFT4(z+12)
            vldr    d4, [a1, #12 * 2*4]
            vldr    d6, [a1, #13 * 2*4]
            vldr    d5, [a1, #14 * 2*4]
        vsub.f  s20, s20, s24   @ vector op
            vldr    d7, [a1, #15 * 2*4]
        vadd.f  s0, s16, s18
        vsub.f  s4, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s5, s17, s19
        vadd.f  s7, s21, s22
        vsub.f  s3, s21, s22
        vadd.f  s2, s20, s23
        vsub.f  s6, s20, s23
            vadd.f  s16, s8, s12    @ vector op
        vstr    d0, [a1, #8 * 2*4]
        vstr    d2, [a1, #10 * 2*4]
        vstr    d1, [a1, #9 * 2*4]
            vsub.f  s20, s8, s12
        vstr    d3, [a1, #11 * 2*4]
        @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
        vldr    d12, [a1, #10 * 2*4]
            vadd.f  s0, s16, s18
            vadd.f  s1, s17, s19
            vsub.f  s6, s16, s18
            vsub.f  s7, s17, s19
            vsub.f  s3, s21, s22
            vadd.f  s2, s20, s23
            vadd.f  s5, s21, s22
            vsub.f  s4, s20, s23
            vstr    d0, [a1, #12 * 2*4]
        vmov    s0, s6
          @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
          vldr    d6, [a1, #9 * 2*4]
            vstr    d1, [a1, #13 * 2*4]
        vldr    d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8 (adjacent constants)
            vstr    d2, [a1, #15 * 2*4]
          vldr    d7, [a1, #13 * 2*4]
        vadd.f  s4, s25, s24
        vsub.f  s5, s25, s24
        vsub.f  s6, s0, s7
        vadd.f  s7, s0, s7
          vmul.f  s20, s12, s3  @ vector op
            @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
            vldr    d4, [a1, #11 * 2*4]
            vldr    d5, [a1, #15 * 2*4]
            vldr    s1, cos3pi8
        vmul.f  s24, s4, s2     @ vector * scalar op
          vmul.f  s28, s12, s1  @ vector * scalar op
            vmul.f  s12, s8, s1 @ vector * scalar op
          vadd.f  s4, s20, s29
          vsub.f  s5, s21, s28
          vsub.f  s6, s22, s31
          vadd.f  s7, s23, s30
            vmul.f  s8, s8, s3  @ vector * scalar op
          vldr    d8, [a1, #1 * 2*4]
          vldr    d9, [a1, #5 * 2*4]
            vldr    d10, [a1, #3 * 2*4]
            vldr    d11, [a1, #7 * 2*4]
        vldr    d14, [a1, #2 * 2*4]
          vadd.f  s0, s6, s4
          vadd.f  s1, s5, s7
          vsub.f  s2, s5, s7
          vsub.f  s3, s6, s4
            vadd.f  s4, s12, s9
            vsub.f  s5, s13, s8
            vsub.f  s6, s14, s11
            vadd.f  s7, s15, s10
          vadd.f  s12, s0, s16  @ vector op
          vstr    d0, [a1, #1 * 2*4]  @ transfer s0-s3 to s4-s7 bank via memory
          vstr    d1, [a1, #5 * 2*4]
          vldr    d4, [a1, #1 * 2*4]
          vldr    d5, [a1, #5 * 2*4]
            vadd.f  s0, s6, s4
            vadd.f  s1, s5, s7
            vsub.f  s2, s5, s7
            vsub.f  s3, s6, s4
          vsub.f  s8, s16, s8   @ vector op
          vstr    d6, [a1, #1 * 2*4]
          vstr    d7, [a1, #5 * 2*4]
        vldr    d15, [a1, #6 * 2*4]
            vsub.f  s4, s20, s0
            vsub.f  s5, s21, s1
            vsub.f  s6, s22, s2
            vsub.f  s7, s23, s3
            vadd.f  s20, s0, s20    @ vector op
          vstr    d4, [a1, #9 * 2*4]
              @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
              vldr    d6, [a1, #8 * 2*4]
          vstr    d5, [a1, #13 * 2*4]
              vldr    d7, [a1, #12 * 2*4]
          vstr    d2, [a1, #11 * 2*4]
              vldr    d8, [a1, #0 * 2*4]
          vstr    d3, [a1, #15 * 2*4]
              vldr    d9, [a1, #4 * 2*4]
        vadd.f  s0, s26, s24
        vadd.f  s1, s25, s27
        vsub.f  s2, s25, s27
        vsub.f  s3, s26, s24
              vadd.f  s4, s14, s12
              vadd.f  s5, s13, s15
              vsub.f  s6, s13, s15
              vsub.f  s7, s14, s12
        vadd.f  s8, s0, s28 @ vector op
        vstr    d0, [a1, #3 * 2*4]  @ transfer s0-s3 to s4-s7 bank via memory
        vstr    d1, [a1, #7 * 2*4]
        vldr    d6, [a1, #3 * 2*4]
        vldr    d7, [a1, #7 * 2*4]
              vsub.f  s0, s16, s4
              vsub.f  s1, s17, s5
              vsub.f  s2, s18, s6
              vsub.f  s3, s19, s7
        vsub.f  s12, s28, s12       @ vector op
              vadd.f  s16, s4, s16  @ vector op
            vstr    d10, [a1, #3 * 2*4]
            vstr    d11, [a1, #7 * 2*4]
        vstr    d4, [a1, #2 * 2*4]
        vstr    d5, [a1, #6 * 2*4]
              vstr    d0, [a1, #8 * 2*4]
              vstr    d1, [a1, #12 * 2*4]
        vstr    d6, [a1, #10 * 2*4]
        vstr    d7, [a1, #14 * 2*4]
              vstr    d8, [a1, #0 * 2*4]
              vstr    d9, [a1, #4 * 2*4]

        vpop    {s16-s31}
        fmxr    FPSCR, a2           @ restore original FPSCR
        bx      lr
endfunc
299