/*
 * Copyright (c) 2012 Mans Rullgard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20
21#include "libavutil/arm/asm.S"
22
/*
 * ff_sbr_sum64x5_neon
 *
 * Folds five consecutive rows of 64 floats into the first row, in place:
 *
 *     z[i] = (((z[i] + z[i+64]) + z[i+128]) + z[i+192]) + z[i+256]
 *
 * for 0 <= i < 64, four floats per loop iteration (16 iterations).
 *
 * In:       r0 = z, base of at least 320 floats.  Every access carries a
 *                :128 alignment hint, so z has to be 16-byte aligned.
 * Out:      z[0..63] overwritten with the sums; nothing is returned.
 * Clobbers: r0-r3, r12, q0-q3, q8, flags.  lr is used as a fifth pointer
 *           and is saved/restored through the stack.
 */
function ff_sbr_sum64x5_neon, export=1
        push            {lr}                    @ lr doubles as the fifth row pointer
        add             r1,  r0,  # 64*4        @ r1 -> z + 64
        add             r2,  r0,  #128*4        @ r2 -> z + 128
        add             r3,  r0,  #192*4        @ r3 -> z + 192
        add             lr,  r0,  #256*4        @ lr -> z + 256
        mov             r12, #64                @ floats left to produce
1:
        vld1.32         {q0},     [r0,:128]     @ no writeback: r0 is stored to below
        vld1.32         {q1},     [r1,:128]!
        vadd.f32        q0,  q0,  q1
        vld1.32         {q2},     [r2,:128]!
        vadd.f32        q0,  q0,  q2
        vld1.32         {q3},     [r3,:128]!
        vadd.f32        q0,  q0,  q3
        vld1.32         {q8},     [lr,:128]!
        vadd.f32        q0,  q0,  q8
        vst1.32         {q0},     [r0,:128]!    @ write the sums back over row 0
        subs            r12, #4
        bgt             1b
        pop             {pc}                    @ return through the saved lr
endfunc
45
/*
 * ff_sbr_sum_square_neon
 *
 * Sum of the squares of a run of floats.
 *
 * In:       r0 = x, 16-byte aligned (:128 hint on the load).
 *           r1 = n.  Each iteration consumes four floats and subtracts 2
 *                from n, so n apparently counts pairs of floats
 *                (presumably complex samples - confirm against the C
 *                prototype).  The loop body runs before n is tested, so
 *                at least four floats are always read.
 * Out:      s0 = sum.  The NOVFP-tagged line additionally copies it to r0;
 *           NOVFP is defined outside this file (presumably it keeps the
 *           line only for the soft-float calling convention - confirm).
 * Clobbers: r0, r1, q0, q1, flags.
 */
function ff_sbr_sum_square_neon, export=1
        vmov.f32        q0,  #0.0               @ four parallel partial sums
1:
        vld1.32         {q1},     [r0,:128]!
        vmla.f32        q0,  q1,  q1            @ acc += x * x, lane by lane
        subs            r1,  r1,  #2
        bgt             1b
        vadd.f32        d0,  d0,  d1            @ fold four partial sums into two
        vpadd.f32       d0,  d0,  d0            @ ... and two into one (both lanes of d0)
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc
58
/*
 * ff_sbr_neg_odd_64_neon
 *
 * Flips the sign bit of every odd-indexed float in a 64-float buffer,
 * in place:  x[2k + 1] = -x[2k + 1]  for 0 <= k < 32.
 *
 * The work is done 8 floats at a time: vld2 splits them into even-index
 * (first register) and odd-index (second register) lanes, the odd lanes
 * are XORed with 0x80000000, and vst2 re-interleaves them.  Loads run one
 * step ahead of the stores (2 loads up front + 3 * 2 in the .rept body
 * = 8 loads = 64 floats).
 *
 * In:       r0 = x, 16-byte aligned (:128 hints), 64 floats.
 * Out:      x updated in place; nothing is returned.
 * Clobbers: r0, r1, q0-q3, q8.
 */
function ff_sbr_neg_odd_64_neon, export=1
        mov             r1,  r0                 @ r1 = store pointer, trails the load pointer r0
        vmov.i32        q8,  #1<<31             @ sign-bit mask in every lane
        vld2.32         {q0,q1},  [r0,:128]!
        veor            q1,  q1,  q8            @ negate the odd-index lanes
        vld2.32         {q2,q3},  [r0,:128]!
    .rept 3
        vst2.32         {q0,q1},  [r1,:128]!
        veor            q3,  q3,  q8
        vld2.32         {q0,q1},  [r0,:128]!
        vst2.32         {q2,q3},  [r1,:128]!
        veor            q1,  q1,  q8
        vld2.32         {q2,q3},  [r0,:128]!
    .endr
        veor            q3,  q3,  q8
        vst2.32         {q0,q1},  [r1,:128]!    @ drain the two groups still in flight
        vst2.32         {q2,q3},  [r1,:128]!
        bx              lr
endfunc
78
/*
 * ff_sbr_qmf_pre_shuffle_neon
 *
 * Builds z[64..127] from z[0..63], in place:
 *
 *     z[64]         =  z[0]
 *     z[65]         =  z[1]
 *     z[64 + 2k]    = -z[64 - k]
 *     z[64 + 2k+1]  =  z[k + 1]           for 1 <= k <= 31
 *
 * Three pointers are used: r0 walks upwards from z[2], r1 walks downwards
 * from z[60] in steps of four floats, r2 walks upwards through the output.
 * The descending groups are reversed (vrev64 + vswp), negated by XOR with
 * the sign bit, and interleaved with the ascending floats by vst2.
 * The loop emits 3 * 8 output pairs, the tail the remaining 4 + 2 + 1.
 *
 * In:       r0 = z, 16-byte aligned, 128 floats.
 * Out:      z[64..127] written; nothing is returned.
 * Clobbers: r0-r3, r12, q0-q3, q8-q10, flags.
 */
function ff_sbr_qmf_pre_shuffle_neon, export=1
        add             r1,  r0,  #60*4         @ r1 -> z[60], read downwards
        add             r2,  r0,  #64*4         @ r2 -> z[64], output
        vld1.32         {d0},     [r0,:64]!
        vst1.32         {d0},     [r2,:64]!     @ z[64..65] = z[0..1]
        mov             r3,  #-16               @ post-index step for r1: back four floats
        mov             r12, #24                @ pairs produced by the loop, 8 per pass
        vmov.i32        q8,  #1<<31             @ sign-bit mask
        vld1.32         {q0},     [r1,:128], r3 @ first descending group
        vld1.32         {d2},     [r0,:64]!     @ first two ascending floats
1:
        vld1.32         {d3,d4},  [r0,:128]!
        vrev64.32       q0,  q0
        vld1.32         {q9},     [r1,:128], r3
        veor            q0,  q0,  q8
        vld1.32         {d5,d6},  [r0,:128]!
        vswp            d0,  d1                 @ with vrev64: full 4-lane reversal
        vrev64.32       q9,  q9
        vst2.32         {q0,q1},  [r2,:64]!     @ 4 x (negated descending, ascending)
        vmov            q10, q2
        veor            q9,  q9,  q8
        vmov            d2,  d6                 @ carry the last two ascending floats over
        vswp            d18, d19
        vld1.32         {q0},     [r1,:128], r3 @ descending group for the next pass / tail
        vst2.32         {q9,q10}, [r2,:64]!
        subs            r12, r12, #8
        bgt             1b
        @ Tail: the last 7 pairs.
        vld1.32         {d3,d4},  [r0,:128]!
        vrev64.32       q0,  q0
        vld1.32         {q9},     [r1,:128], r3
        veor            q0,  q0,  q8
        vld1.32         {d5},     [r0,:64]!
        vswp            d0,  d1
        vrev64.32       q9,  q9
        vst2.32         {q0,q1},  [r2,:64]!     @ 4 pairs
        vswp            d4,  d5
        veor            q1,  q9,  q8
        vst2.32         {d3,d5},  [r2,:64]!     @ 2 pairs
        vst2.32         {d2[0],d4[0]}, [r2,:64]! @ final single pair
        bx              lr
endfunc
120
/*
 * ff_sbr_qmf_post_shuffle_neon
 *
 * Interleaves a negated, reversed copy of the upper half of z with the
 * lower half of z:
 *
 *     W[2k]     = -z[63 - k]
 *     W[2k + 1] =  z[k]                   for 0 <= k < 32
 *
 * (W is addressed here as a flat float array.)
 *
 * In:       r0 = W, output, 16-byte aligned, 64 floats written.
 *           r1 = z, input, 16-byte aligned, 64 floats.
 * Out:      W filled; nothing is returned.
 * Clobbers: r0-r3, r12, q0-q3, q8, flags.
 *
 * Each half of the loop body loads the operands for the following half,
 * so the final pass also loads z[28..31] and z[32..35] without using them.
 */
function ff_sbr_qmf_post_shuffle_neon, export=1
        add             r2,  r1,  #60*4         @ r2 -> z[60], read downwards
        mov             r3,  #-16               @ post-index step for r2
        mov             r12, #32                @ output pairs left, 8 per pass
        vmov.i32        q8,  #1<<31             @ sign-bit mask
        vld1.32         {q0},     [r2,:128], r3
        vld1.32         {q1},     [r1,:128]!
1:
        pld             [r2, #-32]
        vrev64.32       q0,  q0                 @ swap floats inside each half of q0
        vswp            d2,  d3                 @ swapped here, the stores below take d3 first
        veor            q0,  q0,  q8            @ negate the descending floats
        vld1.32         {q2},     [r2,:128], r3
        vld1.32         {q3},     [r1,:128]!
        vst2.32         {d1,d3},  [r0,:128]!    @ high half of q0 first: completes the reversal
        vst2.32         {d0,d2},  [r0,:128]!
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vswp            d6,  d7
        veor            q2,  q2,  q8
        vld1.32         {q0},     [r2,:128], r3
        vld1.32         {q1},     [r1,:128]!
        vst2.32         {d5,d7},  [r0,:128]!
        vst2.32         {d4,d6},  [r0,:128]!
        subs            r12, r12, #8
        bgt             1b
        bx              lr
endfunc
149
/*
 * ff_sbr_qmf_deint_neg_neon
 *
 * De-interleaves 64 input floats, read from the end towards the start,
 * into the two halves of a 64-float output:
 *
 *     v[i]      =  src[63 - 2i]
 *     v[63 - i] = -src[62 - 2i]           for 0 <= i < 32
 *
 * In:       r0 = v, output, 8-byte aligned (:64 hints), 64 floats.
 *           r1 = src, input, 16-byte aligned (:128 hint), 64 floats.
 * Out:      v filled; nothing is returned.
 * Clobbers: r0-r3, r12, d0-d2, flags.
 */
function ff_sbr_qmf_deint_neg_neon, export=1
        add             r1,  r1,  #60*4         @ r1 -> src[60], read downwards
        add             r2,  r0,  #62*4         @ r2 -> v[62], written downwards
        mov             r3,  #-16               @ post-index step for r1
        mov             r12, #32                @ values of i left, 2 per pass
        vmov.i32        d2,  #1<<31             @ sign-bit mask
1:
        vld2.32         {d0,d1},  [r1,:128], r3 @ d0 = even-index floats, d1 = odd-index floats
        veor            d0,  d0,  d2            @ negate the even-index ones
        vrev64.32       d1,  d1                 @ odd-index ones go out in reverse order
        vst1.32         {d0},     [r2,:64]
        vst1.32         {d1},     [r0,:64]!
        sub             r2,  r2,  #8
        subs            r12, r12, #2
        bgt             1b
        bx              lr
endfunc
167
/*
 * ff_sbr_qmf_deint_bfly_neon
 *
 * Butterfly of src0 against a reversed src1 into a 128-float output:
 *
 *     v[i]       = src0[i] - src1[63 - i]
 *     v[127 - i] = src0[i] + src1[63 - i]         for 0 <= i < 64
 *
 * In:       r0 = v, output, 16-byte aligned, 128 floats.
 *           r1 = src0, 16-byte aligned, 64 floats, read upwards.
 *           r2 = src1, 16-byte aligned, 64 floats, read downwards.
 * Out:      v filled; nothing is returned.
 * Clobbers: r0-r3, r12, q0-q3, flags.  lr holds the negative stride and
 *           is saved/restored through the stack.
 */
function ff_sbr_qmf_deint_bfly_neon, export=1
        push            {lr}
        add             r2,  r2,  #60*4         @ r2 -> src1[60]
        add             r3,  r0,  #124*4        @ r3 -> v[124], upper half written downwards
        mov             r12, #64                @ values of i left, 4 per pass
        mov             lr,  #-16               @ post-index step for r2 and r3
1:
        vld1.32         {q0},     [r1,:128]!    @ src0[i .. i+3]
        vld1.32         {q1},     [r2,:128], lr @ src1[60-i .. 63-i]
        vrev64.32       q2,  q0
        vrev64.32       q3,  q1
        vadd.f32        d3,  d4,  d3            @ sums, laid out for the descending store
        vadd.f32        d2,  d5,  d2
        vsub.f32        d0,  d0,  d7            @ differences, in ascending order
        vsub.f32        d1,  d1,  d6
        vst1.32         {q1},     [r3,:128], lr
        vst1.32         {q0},     [r0,:128]!
        subs            r12, r12, #4
        bgt             1b
        pop             {pc}
endfunc
189
/*
 * ff_sbr_hf_g_filt_neon
 *
 * Scales one pair of floats per row of a strided input by a per-row gain:
 *
 *     out[m] = in_row(m)[ixh] * gain[m]           for 0 <= m < count
 *
 * where both floats of the pair are multiplied by the same gain, rows are
 * 40*2*4 = 320 bytes apart and ixh selects the 8-byte pair inside a row.
 *
 * In:       r0   = output pairs, 8-byte aligned.
 *           r1   = input base, 8-byte aligned.
 *           r2   = gains, 8-byte aligned.
 *           r3   = count.
 *           [sp] = ixh (first stack argument).
 * Out:      count pairs written (but see the note); nothing is returned.
 * Clobbers: r0-r3, r12, q0, q1, q3, flags.
 *
 * NOTE(review): the loop body executes before the count is tested and
 * always loads the gains and input pair for the following pass.  It
 * therefore reads beyond the last row/gain, and for count < 2 it still
 * stores two pairs.  Presumably callers size their buffers for this -
 * confirm.
 */
function ff_sbr_hf_g_filt_neon, export=1
        ldr             r12, [sp]
        add             r1,  r1,  r12, lsl #3   @ r1 -> pair number ixh of row 0
        mov             r12, #40*2*4            @ row stride in bytes
        sub             r3,  r3,  #1            @ bias: 0 after the loop means one pair is left
        vld2.32         {d2[]},[r2,:64]!        @ placeholder-free: see next line
        vld1.32         {d0},     [r1,:64], r12 @ pair for row m
1:
        vld1.32         {d1},     [r1,:64], r12 @ pair for row m+1
        vmul.f32        q3,  q0,  q1
        vld2.32         {d2[],d3[]},[r2,:64]!   @ next two gains
        vld1.32         {d0},     [r1,:64], r12 @ next even row
        vst1.32         {q3},     [r0,:64]!
        subs            r3,  r3,  #2
        bgt             1b
        it              lt
        bxlt            lr                      @ even count: everything is done
        vmul.f32        d0,  d0,  d2            @ odd count: one pair left, already loaded
        vst1.32         {d0},     [r0,:64]!
        bx              lr
endfunc
211
/*
 * ff_sbr_hf_gen_neon
 *
 * Two-tap complex prediction filter over pairs of floats (re, im):
 *
 *     out[i] = in[i] + c0 * in[i - 1] + c1 * in[i - 2]    start <= i < end
 *
 * with complex products, c0 = bw * a0 and c1 = bw * bw * a1, where a0 is
 * the pair of floats at r2 and a1 the pair at r3.
 *
 * In:       r0 = out base, r1 = in base (both indexed by i, 8 bytes per
 *                element; the accesses carry :128 hints).
 *           r2 = a0 (two floats), r3 = a1 (two floats), 8-byte aligned.
 *           bw:  taken from s0 on the VFP-tagged line, or from the first
 *                stack word on the NOVFP-tagged line.  Both tags are
 *                macros defined outside this file (presumably selected by
 *                the float calling convention - confirm).
 *           start, end: two consecutive stack words at offset
 *                4*!HAVE_VFP_ARGS, i.e. right after bw when bw itself
 *                lives on the stack.
 * Out:      out[start .. end-1] written, two elements per pass.  The body
 *           runs before the count is tested, so at least two are written.
 * Clobbers: r0-r3, q0-q3, q8, d18, q10, flags.
 */
function ff_sbr_hf_gen_neon, export=1
NOVFP   vld1.32         {d1[]},   [sp,:32]
VFP     vdup.32         d1,  d0[0]
        vmul.f32        d0,  d1,  d1            @ d0 = bw*bw, d1 = bw (both lanes)
        vld1.32         {d3},     [r2,:64]
        vld1.32         {d2},     [r3,:64]
        vmul.f32        q0,  q0,  q1            @ d0 = c1 (re, im), d1 = c0 (re, im)
        ldrd            r2,  r3,  [sp, #4*!HAVE_VFP_ARGS]
        vtrn.32         d0,  d1                 @ d0 = (c1.re, c0.re), d1 = (c1.im, c0.im)
        vneg.f32        d18, d1
        vtrn.32         d18, d1                 @ d18 = (-c1.im, c1.im), d1 = (-c0.im, c0.im)
        add             r0,  r0,  r2,  lsl #3   @ r0 -> out[start]
        add             r1,  r1,  r2,  lsl #3
        sub             r1,  r1,  #2*8          @ r1 -> in[start - 2]
        sub             r3,  r3,  r2            @ r3 = number of elements
        vld1.32         {q1},     [r1,:128]!    @ q1 = in[i-2], in[i-1]
1:
        vld1.32         {q3},     [r1,:128]!    @ q3 = in[i], in[i+1]; also the accumulator
        vrev64.32       q2,  q1                 @ (im, re) views for the cross terms
        vmov            q8,  q3                 @ keep the unmodified inputs for the next pass
        vrev64.32       d20, d3
        vrev64.32       d21, d6
        vmla.f32        q3,  q1,  d0[0]         @ += c1.re * in[i-2], in[i-1]
        vmla.f32        d6,  d4,  d18           @ += (-c1.im, c1.im) * swapped in[i-2]
        vmla.f32        d7,  d20, d18
        vmla.f32        d6,  d3,  d0[1]         @ += c0.re * in[i-1], in[i]
        vmla.f32        d7,  d16, d0[1]
        vmla.f32        d6,  d5,  d1            @ += (-c0.im, c0.im) * swapped in[i-1]
        vmla.f32        d7,  d21, d1
        vmov            q1,  q8
        vst1.32         {q3},     [r0,:128]!
        subs            r3,  r3,  #2
        bgt             1b
        bx              lr
endfunc
247
/*
 * ff_sbr_autocorrelate_neon
 *
 * Accumulates products of a sequence of (re, im) float pairs with itself
 * and with the pairs one and two positions later (presumably the lag-0,
 * lag-1 and lag-2 autocorrelation sums the name suggests - confirm against
 * the C reference), then scatters the results into the output.
 *
 * In:       r0 = input, 16-byte aligned.  20 quad-word loads are issued
 *                (1 up front, 18 in the loop, 1 in the tail), i.e. 80
 *                floats = 40 pairs are read.
 *           r1 = output, 16-byte aligned.
 * Out:      With the output viewed as a flat float array, the function
 *           writes indices 0-3, 4, 6-7 and 10; indices 5, 8, 9 and 11 are
 *           left untouched.
 * Clobbers: r0, r1, r12, q0-q3, q8-q12, flags.
 *
 * Register roles inside the loop:
 *   q0      previous two pairs, q2 the two pairs just loaded,
 *   q12     q2 with re/im swapped inside each pair (for the cross terms),
 *   d2/d3   products between pairs one position apart,
 *   d6/d7   products between pairs two positions apart,
 *   q10     running sum of squares (d21 is seeded with the squares of
 *           the second input pair),
 *   q8/q11  copies of the first two pairs, used again after the loop.
 */
function ff_sbr_autocorrelate_neon, export=1
        vld1.32         {q0},     [r0,:128]!
        vmov.f32        q1,  #0.0
        vmov.f32        q3,  #0.0
        vmov.f32        d20, #0.0
        vmul.f32        d21, d1,  d1
        vmov            q8,  q0
        vmov            q11, q0
        mov             r12, #36                @ 18 passes, two pairs each
1:
        vld1.32         {q2},     [r0,:128]!
        vrev64.32       q12, q2
        vmla.f32        q10, q2,  q2
        vmla.f32        d2,  d1,  d4
        vmla.f32        d3,  d1,  d24
        vmla.f32        d6,  d0,  d4
        vmla.f32        d7,  d0,  d24
        vmla.f32        d2,  d4,  d5
        vmla.f32        d3,  d4,  d25
        vmla.f32        d6,  d1,  d5
        vmla.f32        d7,  d1,  d25
        vmov            q0,  q2
        subs            r12, r12, #2
        bgt             1b
        @ Tail: one more load, remaining neighbour products, final reduction.
        vld1.32         {q2},     [r0,:128]!
        vrev64.32       q12, q2
        vmla.f32        d2,  d1,  d4
        vmla.f32        d3,  d1,  d24
        vmla.f32        d6,  d0,  d4
        vmla.f32        d7,  d0,  d24
        vadd.f32        d20, d20, d21
        vrev64.32       d18, d17
        vmla.f32        d6,  d1,  d5
        vmla.f32        d7,  d1,  d25
        vmov            q0,  q1
        vmla.f32        d0,  d16, d17
        vmla.f32        d1,  d16, d18
        vmla.f32        d2,  d4,  d5
        vmla.f32        d3,  d4,  d25
        vneg.f32        s15, s15
        vmov            d21, d20
        vpadd.f32       d0,  d0,  d2
        vpadd.f32       d7,  d6,  d7
        vtrn.32         d1,  d3
        vsub.f32        d6,  d1,  d3
        vmla.f32        d20, d22, d22
        vmla.f32        d21, d4,  d4
        vtrn.32         d0,  d6
        vpadd.f32       d20, d20, d21
        vst1.32         {q3},     [r1,:128]!    @ output floats 0-3
        vst1.32         {d20[1]}, [r1,:32]      @ output float 4
        add             r1,  r1,  #2*4
        vst1.32         {d0},     [r1,:64]      @ output floats 6-7
        add             r1,  r1,  #4*4
        vst1.32         {d20[0]}, [r1,:32]      @ output float 10
        bx              lr
endfunc
305
/*
 * ff_sbr_hf_apply_noise_0_neon
 *
 * For each (re, im) pair Y[m], 0 <= m < m_max:
 *
 *     idx = (idx + 1) & 0x1ff
 *     if (s_m[m] == 0)  Y[m]    += q_filt[m] * ff_sbr_noise_table[idx]
 *     else              Y[m].re += s_m[m] ^ sign_mask
 *
 * sign_mask lives in d3 for the whole function: 0 here, so s_m is added
 * as is.  ff_sbr_hf_apply_noise_2_neon enters at .Lhf_apply_noise_0 with
 * d3 = 0x80000000 in both lanes, which subtracts s_m instead.
 *
 * In:       r0 = Y, 8-byte aligned, updated in place.
 *           r1 = s_m, r2 = q_filt (8-byte aligned, one float per element).
 *           r3 = starting noise index (incremented before first use).
 *           second stack word = m_max ([sp, #12] after the 8-byte push;
 *           the first stack word is not read by this variant).
 * Out:      Y updated; nothing is returned.
 * Clobbers: r0-r3, r12, q0-q3, d16, d18, flags; r4 and lr are saved.
 *
 * The main loop handles two elements per pass and always runs once; a
 * single leftover element (odd m_max) is handled by the tail.
 *
 * Fix: the tail used to load the broadcast q_filt value into d3, wiping
 * the sign mask just before the veor that applies it, so the last element
 * of an odd-length run got s_m XORed with the bits of q_filt.  The tail
 * now keeps q_filt in d18, as the main loop does.
 *
 * NOTE(review): the two-entry table load at idx = 511 reads entry 512;
 * presumably the table is padded - confirm.
 */
function ff_sbr_hf_apply_noise_0_neon, export=1
        vmov.i32        d3,  #0                 @ sign mask: leave s_m unchanged
.Lhf_apply_noise_0:
        push            {r4,lr}
        movrelx         r4,  X(ff_sbr_noise_table)
        ldr             r12, [sp, #12]          @ m_max
        add             r3,  r3,  #1
        bfc             r3,  #9,  #23           @ idx &= 0x1ff
        sub             r12, r12, #1            @ bias: 0 after the loop means one element left
1:
        add             lr,  r4,  r3,  lsl #3   @ lr -> noise table entry idx (8 bytes each)
        vld2.32         {q0},     [r0,:64]      @ d0 = two re values, d1 = two im values
        vld2.32         {q3},     [lr,:64]      @ d6 = noise re, d7 = noise im
        vld1.32         {d2},     [r1,:64]!     @ two s_m values
        vld1.32         {d18},    [r2,:64]!     @ two q_filt values
        vceq.f32        d16, d2,  #0            @ all-ones where s_m is zero
        veor            d2,  d2,  d3            @ apply the sign mask to s_m
        vmov            q2,  q0                 @ q2 = copy of Y for the s_m path
        vmla.f32        d0,  d6,  d18           @ noise path: Y += q_filt * noise
        vmla.f32        d1,  d7,  d18
        vadd.f32        d4,  d4,  d2            @ s_m path: only re changes
        add             r3,  r3,  #2
        bfc             r3,  #9,  #23
        vbif            d0,  d4,  d16           @ take the s_m path where s_m is non-zero
        vbif            d1,  d5,  d16
        vst2.32         {q0},     [r0,:64]!
        subs            r12, r12, #2
        bgt             1b
        blt             2f                      @ even m_max: done
        @ Tail: one element left.
        add             lr,  r4,  r3,  lsl #3
        vld1.32         {d0},     [r0,:64]      @ d0 = (re, im)
        vld1.32         {d6},     [lr,:64]      @ d6 = noise (re, im)
        vld1.32         {d2[]},   [r1,:32]!     @ s_m in both lanes
        vld1.32         {d18[]},  [r2,:32]!     @ q_filt in both lanes; d3 must stay intact
        vceq.f32        d4,  d2,  #0
        veor            d2,  d2,  d3            @ apply the sign mask to s_m
        vmov            d1,  d0                 @ d1 = copy of Y for the s_m path
        vmla.f32        d0,  d6,  d18           @ noise path
        vadd.f32        s2,  s2,  s4            @ s_m path: re += signed s_m
        vbif            d0,  d1,  d4
        vst1.32         {d0},     [r0,:64]!
2:
        pop             {r4,pc}
endfunc
350
/*
 * ff_sbr_hf_apply_noise_1_neon
 *
 * For each (re, im) pair Y[m], 0 <= m < m_max:
 *
 *     idx = (idx + 1) & 0x1ff
 *     if (s_m[m] == 0)  Y[m]    += q_filt[m] * ff_sbr_noise_table[idx]
 *     else              Y[m].im += s_m[m] ^ sign(m)
 *
 * sign(m) alternates with m and lives in d3 for the whole function:
 * lane 0 is used for even m, lane 1 for odd m.  Here lane 0 = kx << 31
 * and lane 1 is its complement in the sign bit.
 * ff_sbr_hf_apply_noise_3_neon enters at .Lhf_apply_noise_1 with the two
 * lanes exchanged and with r4/lr already pushed.
 *
 * In:       r0 = Y, 8-byte aligned, updated in place.
 *           r1 = s_m, r2 = q_filt (8-byte aligned, one float per element).
 *           r3 = starting noise index (incremented before first use).
 *           first stack word  = kx (only bit 0 survives the shift),
 *           second stack word = m_max ([sp, #12] after the 8-byte push).
 * Out:      Y updated; nothing is returned.
 * Clobbers: r0-r3, r12, q0-q3, d16, d18, flags; r4 and lr are saved.
 *
 * Fix: the tail only runs after an even number of elements, so its
 * element has an even index and needs the lane-0 sign, exactly like the
 * first element of every main-loop pass.  It used to add s5 (lane 1),
 * giving the opposite sign for the last element of an odd-length run;
 * it now adds s4 (lane 0).
 *
 * NOTE(review): the two-entry table load at idx = 511 reads entry 512;
 * presumably the table is padded - confirm.
 */
function ff_sbr_hf_apply_noise_1_neon, export=1
        ldr             r12, [sp]               @ kx, read before the push moves sp
        push            {r4,lr}
        lsl             r12, r12, #31           @ bit 0 of kx -> sign bit
        eor             lr,  r12, #1<<31        @ opposite sign
        vmov            d3,  r12, lr            @ lane 0: even m, lane 1: odd m
.Lhf_apply_noise_1:
        movrelx         r4,  X(ff_sbr_noise_table)
        ldr             r12, [sp, #12]          @ m_max
        add             r3,  r3,  #1
        bfc             r3,  #9,  #23           @ idx &= 0x1ff
        sub             r12, r12, #1            @ bias: 0 after the loop means one element left
1:
        add             lr,  r4,  r3,  lsl #3   @ lr -> noise table entry idx (8 bytes each)
        vld2.32         {q0},     [r0,:64]      @ d0 = two re values, d1 = two im values
        vld2.32         {q3},     [lr,:64]      @ d6 = noise re, d7 = noise im
        vld1.32         {d2},     [r1,:64]!     @ two s_m values
        vld1.32         {d18},    [r2,:64]!     @ two q_filt values
        vceq.f32        d16, d2,  #0            @ all-ones where s_m is zero
        veor            d2,  d2,  d3            @ apply the alternating sign to s_m
        vmov            q2,  q0                 @ q2 = copy of Y for the s_m path
        vmla.f32        d0,  d6,  d18           @ noise path: Y += q_filt * noise
        vmla.f32        d1,  d7,  d18
        vadd.f32        d5,  d5,  d2            @ s_m path: only im changes
        add             r3,  r3,  #2
        bfc             r3,  #9,  #23
        vbif            d0,  d4,  d16           @ take the s_m path where s_m is non-zero
        vbif            d1,  d5,  d16
        vst2.32         {q0},     [r0,:64]!
        subs            r12, r12, #2
        bgt             1b
        blt             2f                      @ even m_max: done
        @ Tail: one element left, at an even index.
        add             lr,  r4,  r3,  lsl #3
        vld1.32         {d0},     [r0,:64]      @ d0 = (re, im)
        vld1.32         {d6},     [lr,:64]      @ d6 = noise (re, im)
        vld1.32         {d2[]},   [r1,:32]!     @ s_m in both lanes
        vld1.32         {d18[]},  [r2,:32]!     @ q_filt in both lanes
        vceq.f32        d4,  d2,  #0
        veor            d2,  d2,  d3            @ s4 = s_m with even sign, s5 = with odd sign
        vmov            d1,  d0                 @ d1 = copy of Y for the s_m path
        vmla.f32        d0,  d6,  d18           @ noise path
        vadd.f32        s3,  s3,  s4            @ s_m path: im += s_m with the even-index sign
        vbif            d0,  d1,  d4
        vst1.32         {d0},     [r0,:64]!
2:
        pop             {r4,pc}
endfunc
398
/*
 * ff_sbr_hf_apply_noise_2_neon
 *
 * Same arguments as ff_sbr_hf_apply_noise_0_neon.  Sets the sign mask in
 * d3 to 0x80000000 (both lanes) and continues at .Lhf_apply_noise_0, the
 * label right after the mask setup of that function, so the shared body
 * adds s_m with its sign bit flipped.
 */
function ff_sbr_hf_apply_noise_2_neon, export=1
        vmov.i32        d3,  #1<<31             @ sign mask: negate s_m
        b               .Lhf_apply_noise_0      @ shared body does the push and the return
endfunc
403
/*
 * ff_sbr_hf_apply_noise_3_neon
 *
 * Same arguments as ff_sbr_hf_apply_noise_1_neon.  Builds the two-lane
 * sign mask in d3 with the lanes exchanged relative to that function
 * (lane 0 = complement of kx << 31, lane 1 = kx << 31) and continues at
 * .Lhf_apply_noise_1.  That label sits after the push in the shared body,
 * so r4 and lr are pushed here first.
 */
function ff_sbr_hf_apply_noise_3_neon, export=1
        ldr             r12, [sp]               @ kx, read before the push moves sp
        push            {r4,lr}
        lsl             r12, r12, #31           @ bit 0 of kx -> sign bit
        eor             lr,  r12, #1<<31        @ opposite sign
        vmov            d3,  lr, r12            @ lane 0 gets the flipped sign
        b               .Lhf_apply_noise_1
endfunc
412