/*
 * ARM NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

#define ff_fft_calc_neon X(ff_fft_calc_neon)

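@ ff_imdct_half_neon: half-size inverse MDCT.
@ Register usage on entry (per the loads below): r0 = FFT/MDCT context
@ (mdct_bits at #20, revtab at #8, tcos at #24), r1 = output (n/2 floats),
@ r2 = input (n/2 floats).
@ Structure: pre-rotate the input by the interleaved cos/sin twiddles while
@ scattering to bit-reversed positions, run the complex FFT in place, then
@ post-rotate the result.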
function ff_imdct_half_neon, export=1
        push            {r4-r8,lr}

        mov             r12, #1
        ldr             lr,  [r0, #20]          @ mdct_bits
        ldr             r4,  [r0, #24]          @ tcos
        ldr             r3,  [r0, #8]           @ revtab
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #2            @ n4 = n >> 2
        add             r7,  r2,  r12,  lsl #1
        mov             r12, #-16
        sub             r7,  r7,  #16

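@ Pre-rotation: read the input from both ends (r2 ascending, r7 descending
@ from the top of the n/2-float buffer), multiply by the tcos/tsin twiddle
@ pairs, and scatter the rotated complex values to the bit-reversed
@ positions given by revtab, two values per iteration.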
        vld2.32         {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
        vld2.32         {d0-d1},  [r2,:128]!    @ d0 =m0,x d1 =m1,x
        vrev64.32       d17, d17
        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s1
        vmul.f32        d6,  d17, d2
        vmul.f32        d7,  d0,  d2
1:
        subs            lr,  lr,  #2
        ldr             r6,  [r3], #4
        vmul.f32        d4,  d0,  d3
        vmul.f32        d5,  d17, d3
        vsub.f32        d4,  d6,  d4
        vadd.f32        d5,  d5,  d7
        uxth            r8,  r6,  ror #16
        uxth            r6,  r6
        add             r8,  r1,  r8,  lsl #3
        add             r6,  r1,  r6,  lsl #3
        beq             1f
        vld2.32         {d16-d17},[r7,:128],r12
        vld2.32         {d0-d1},  [r2,:128]!
        vrev64.32       d17, d17
        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s1
        vmul.f32        d6,  d17, d2
        vmul.f32        d7,  d0,  d2
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]
        b               1b
1:
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]

        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

        mov             r12, #1
        ldr             lr,  [r4, #20]          @ mdct_bits
        ldr             r4,  [r4, #24]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #3
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #16
        sub             r3,  r6,  #16

        mov             r7,  #-16
        mov             r8,  r6
        mov             r0,  r3

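@ Post-rotation: multiply the FFT output by the tcos/tsin twiddles and
@ write the results symmetrically about the midpoint of the half-size
@ output (r0/r3 walk down from the middle, r8/r6 walk up), handling two
@ complex values from each half per iteration.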
        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=i2,r2 d21=i3,r3
        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18
        vld2.32         {d17,d19},[r4,:128]!    @ d17=c2,c3 d19=s2,s3
        vmul.f32        d4,  d1,  d18
        vmul.f32        d5,  d21, d19
        vmul.f32        d6,  d20, d19
        vmul.f32        d22, d1,  d16
        vmul.f32        d23, d21, d17
        vmul.f32        d24, d0,  d16
        vmul.f32        d25, d20, d17
        vadd.f32        d7,  d7,  d22
        vadd.f32        d6,  d6,  d23
        vsub.f32        d4,  d4,  d24
        vsub.f32        d5,  d5,  d25
        beq             1f
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r8,pc}
endfunc

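@ ff_imdct_calc_neon: full inverse MDCT built on the half transform above.
@ The half IMDCT is written into the middle half of the output buffer; the
@ first quarter is then filled with the reversed and negated second quarter,
@ and the last quarter with the reversed third quarter.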
function ff_imdct_calc_neon, export=1
        push            {r4-r6,lr}

        ldr             r3,  [r0, #20]          @ mdct_bits
        mov             r4,  #1
        mov             r5,  r1
        lsl             r4,  r4,  r3            @ n = 1 << mdct_bits
        add             r1,  r1,  r4            @ output + n/4 floats

        bl              X(ff_imdct_half_neon)

        add             r0,  r5,  r4,  lsl #2
        add             r1,  r5,  r4,  lsl #1
        sub             r0,  r0,  #8
        sub             r2,  r1,  #16
        mov             r3,  #-16
        mov             r6,  #-8
        vmov.i32        d30, #1<<31             @ sign-bit mask for negation
1:
        vld1.32         {d0-d1},  [r2,:128], r3
        pld             [r0, #-16]
        vrev64.32       q0,  q0
        vld1.32         {d2-d3},  [r1,:128]!
        veor            d4,  d1,  d30           @ negate (flip sign bits)
        pld             [r2, #-16]
        vrev64.32       q1,  q1
        veor            d5,  d0,  d30           @ negate (flip sign bits)
        vst1.32         {d2},     [r0,:64], r6
        vst1.32         {d3},     [r0,:64], r6
        vst1.32         {d4-d5},  [r5,:128]!
        subs            r4,  r4,  #16
        bgt             1b

        pop             {r4-r6,pc}
endfunc

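@ ff_mdct_calc_neon: forward MDCT. The n-sample input block is folded and
@ pre-rotated into n/4 complex values (r7 and r2 ascend, r9 and r8 descend,
@ folding the ends of the block together), scattered to bit-reversed
@ positions, run through the complex FFT, and finally post-rotated into the
@ n/2 output coefficients.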
function ff_mdct_calc_neon, export=1
        push            {r4-r10,lr}

        mov             r12, #1
        ldr             lr,  [r0, #20]          @ mdct_bits
        ldr             r4,  [r0, #24]          @ tcos
        ldr             r3,  [r0, #8]           @ revtab
        lsl             lr,  r12, lr            @ n  = 1 << nbits
        add             r7,  r2,  lr            @ in4u
        sub             r9,  r7,  #16           @ in4d
        add             r2,  r7,  lr,  lsl #1   @ in3u
        add             r8,  r9,  lr,  lsl #1   @ in3d
        add             r5,  r4,  lr,  lsl #1
        sub             r5,  r5,  #16
        sub             r3,  r3,  #4
        mov             r12, #-16

        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
        vsub.f32        d16, d16, d2            @ in0u-in2d      R
        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
1:
        vmul.f32        d7,  d0,  d21           @  I*s
A       ldr             r10, [r3, lr, lsr #1]
T       lsr             r10, lr,  #1
T       ldr             r10, [r3, r10]
        vmul.f32        d6,  d1,  d20           @ -R*c
        ldr             r6,  [r3, #4]!
        vmul.f32        d4,  d1,  d21           @ -R*s
        vmul.f32        d5,  d0,  d20           @  I*c
        vmul.f32        d24, d16, d30           @  R*c
        vmul.f32        d25, d17, d31           @ -I*s
        vmul.f32        d22, d16, d31           @  R*s
        vmul.f32        d23, d17, d30           @  I*c
        subs            lr,  lr,  #16
        vsub.f32        d6,  d6,  d7            @ -R*c-I*s
        vadd.f32        d7,  d4,  d5            @ -R*s+I*c
        vsub.f32        d24, d25, d24           @ I*s-R*c
        vadd.f32        d25, d22, d23           @ R*s-I*c
        beq             1f
        mov             r12, #-16
        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
        vneg.f32        d7,  d7                 @  R*s-I*c
        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
        vsub.f32        d16, d16, d2            @ in0u-in2d      R
        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
        uxth            r12, r6,  ror #16
        uxth            r6,  r6
        add             r12, r1,  r12, lsl #3
        add             r6,  r1,  r6,  lsl #3
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r12,:64]
        uxth            r6,  r10, ror #16
        uxth            r10, r10
        add             r6 , r1,  r6,  lsl #3
        add             r10, r1,  r10, lsl #3
        vst2.32         {d24[0],d25[0]},[r10,:64]
        vst2.32         {d24[1],d25[1]},[r6,:64]
        b               1b
1:
        vneg.f32        d7,  d7                 @  R*s-I*c
        uxth            r12, r6,  ror #16
        uxth            r6,  r6
        add             r12, r1,  r12, lsl #3
        add             r6,  r1,  r6,  lsl #3
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r12,:64]
        uxth            r6,  r10, ror #16
        uxth            r10, r10
        add             r6 , r1,  r6,  lsl #3
        add             r10, r1,  r10, lsl #3
        vst2.32         {d24[0],d25[0]},[r10,:64]
        vst2.32         {d24[1],d25[1]},[r6,:64]

        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

        mov             r12, #1
        ldr             lr,  [r4, #20]          @ mdct_bits
        ldr             r4,  [r4, #24]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #3
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #16
        sub             r3,  r6,  #16

        mov             r7,  #-16
        mov             r8,  r6
        mov             r0,  r3

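@ Post-rotation: same write pattern as in ff_imdct_half_neon above (r0/r3
@ walk down from the midpoint, r8/r6 walk up), but with the extra negation
@ (vneg) required by the forward transform.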
        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=r2,i2 d21=r3,i3
        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18           @ r1*s1,r0*s0
        vld2.32         {d17,d19},[r4,:128]!    @ c2,c3 s2,s3
        vmul.f32        d4,  d1,  d18           @ i1*s1,i0*s0
        vmul.f32        d5,  d21, d19           @ i2*s2,i3*s3
        vmul.f32        d6,  d20, d19           @ r2*s2,r3*s3
        vmul.f32        d24, d0,  d16           @ r1*c1,r0*c0
        vmul.f32        d25, d20, d17           @ r2*c2,r3*c3
        vmul.f32        d22, d21, d17           @ i2*c2,i3*c3
        vmul.f32        d23, d1,  d16           @ i1*c1,i0*c0
        vadd.f32        d4,  d4,  d24           @ i1*s1+r1*c1,i0*s0+r0*c0
        vadd.f32        d5,  d5,  d25           @ i2*s2+r2*c2,i3*s3+r3*c3
        vsub.f32        d6,  d22, d6            @ i2*c2-r2*s2,i3*c3-r3*s3
        vsub.f32        d7,  d23, d7            @ i1*c1-r1*s1,i0*c0-r0*s0
        vneg.f32        q2,  q2
        beq             1f
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r10,pc}
endfunc