1/*
2 * Copyright (c) 2013 RISC OS Open Ltd
3 * Author: Ben Avison <bavison@riscosopen.org>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavutil/arm/asm.S"
23
@ Register aliases for the LFE FIR routines below.
@ Bank assignments matter: in VFP short-vector mode, s0-s7 always behave as
@ scalars, while s8-s15, s16-s23 and s24-s31 are the three vector banks.
POUT          .req    a1   @ pointer to output samples (argument 1)
PIN           .req    a2   @ pointer to input samples (argument 2)
PCOEF         .req    a3   @ pointer to FIR coefficients (argument 3)
OLDFPSCR      .req    a4   @ caller's FPSCR, restored before returning
COUNTER       .req    ip   @ loop counter for both filter halves

IN0           .req    s4   @ input samples, kept in the scalar bank (s0-s7)
IN1           .req    s5
IN2           .req    s6
IN3           .req    s7
IN4           .req    s0   @ IN4-IN7 are only loaded/used when decifactor == 32
IN5           .req    s1
IN6           .req    s2
IN7           .req    s3
COEF0         .req    s8   @ coefficient elements
COEF1         .req    s9
COEF2         .req    s10
COEF3         .req    s11
COEF4         .req    s12
COEF5         .req    s13
COEF6         .req    s14
COEF7         .req    s15
ACCUM0        .req    s16  @ double-buffered multiply-accumulate results
ACCUM4        .req    s20
POST0         .req    s24  @ do long-latency post-multiply in this vector in parallel
POST1         .req    s25
POST2         .req    s26
POST3         .req    s27
52
53
@ One software-pipelined step of the LFE FIR: accumulates 4 output samples'
@ worth of taps, overlapping the finish of the previous step (tail) with the
@ start of the current one (head).
@  decifactor  FIR variant selector; when 32, eight input samples (IN0-IN7)
@              feed each accumulation instead of four
@  dir         "up" walks the 4xJMAX coefficient block forwards in memory,
@              "down" walks it backwards (second, mirrored filter half)
@  tail        if non-empty, sum ACCUM0+ACCUM4 from the previous head and
@              store the 4 finished outputs
@  head        if non-empty, load coefficients and start new multiplies
@ JMAX (4 or 8, the number of taps per column pass) must be .set by the
@ caller before use; coefficients are addressed as X + (row*JMAX + col) * Y.
.macro inner_loop  decifactor, dir, tail, head
 .ifc "\dir","up"
  .set X, 0                             @ start at the first coefficient
  .set Y, 4                             @ step forwards through memory
 .else
  .set X, 4*JMAX*4 - 4                  @ start at the last coefficient
  .set Y, -4                            @ step backwards through memory
 .endif
 .ifnc "\head",""
        @ load tap 0 of all 4 output rows into vector bank 1
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
 .endif
 .ifnc "\tail",""
        @ combine the two accumulator vectors from the previous iteration
        vadd.f  POST0, ACCUM0, ACCUM4   @ vector operation
 .endif
 .ifnc "\head",""
        vmul.f  ACCUM0, COEF0, IN0      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
 .endif
 .ifnc "\head",""
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
   .ifc "\tail",""
        vmul.f  ACCUM4, COEF4, IN1      @ vector operation
   .endif
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
   .ifnc "\tail",""
        @ same multiply as above, but scheduled two loads later when a tail
        @ is present, giving the vadd (which reads ACCUM4) time to issue
        @ before ACCUM4 is overwritten
        vmul.f  ACCUM4, COEF4, IN1      @ vector operation
   .endif
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
 .endif
 .ifnc "\tail",""
        vstmia  POUT!, {POST0-POST3}    @ write 4 finished output samples
 .endif
 .ifnc "\head",""
        vmla.f  ACCUM0, COEF0, IN2      @ vector += vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
        vmla.f  ACCUM4, COEF4, IN3      @ vector += vector * scalar
  .if \decifactor == 32
        @ 8-tap variant: fold in the remaining four input samples IN4-IN7
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
        vmla.f  ACCUM0, COEF0, IN4      @ vector += vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
        vmla.f  ACCUM4, COEF4, IN5      @ vector += vector * scalar
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
        vmla.f  ACCUM0, COEF0, IN6      @ vector += vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
        vmla.f  ACCUM4, COEF4, IN7      @ vector += vector * scalar
  .endif
 .endif
.endm
124
@ Instantiate void ff_dca_lfe_fir\decifactor\()_vfp(float *out, const float *in,
@                                                   const float *coefs)
@ Runs the FIR in two software-pipelined passes over the coefficient table,
@ forwards then backwards (the backwards pass reuses the same coefficients in
@ mirrored order).  Each pass emits \decifactor output samples in groups of 4,
@ so 2*\decifactor floats are written to out in total.  Input samples are read
@ at descending addresses from in.  Both variants traverse \decifactor * JMAX
@ = 256 coefficients per pass.
.macro dca_lfe_fir  decifactor
function ff_dca_lfe_fir\decifactor\()_vfp, export=1
        fmrx    OLDFPSCR, FPSCR         @ save caller's FPSCR before enabling vector mode
        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, ip
        vldr    IN0, [PIN, #-0*4]       @ inputs are fetched backwards from PIN
        vldr    IN1, [PIN, #-1*4]
        vldr    IN2, [PIN, #-2*4]
        vldr    IN3, [PIN, #-3*4]
 .if \decifactor == 32
  .set JMAX, 8                          @ 8 taps per inner_loop step; needs 8 inputs
        vpush   {s16-s31}               @ s16-s31 are callee-saved under AAPCS VFP rules
        vldr    IN4, [PIN, #-4*4]
        vldr    IN5, [PIN, #-5*4]
        vldr    IN6, [PIN, #-6*4]
        vldr    IN7, [PIN, #-7*4]
 .else
  .set JMAX, 4                          @ only s16-s27 are used, so save fewer registers
        vpush   {s16-s27}
 .endif

        @ first half: walk the coefficient blocks forwards
        mov     COUNTER, #\decifactor/4 - 1
        inner_loop  \decifactor, up,, head          @ prime the pipeline
1:      add     PCOEF, PCOEF, #4*JMAX*4             @ advance to the next 4xJMAX block
        subs    COUNTER, COUNTER, #1
        inner_loop  \decifactor, up, tail, head
        bne     1b
        inner_loop  \decifactor, up, tail           @ drain the pipeline

        @ second half: walk the same coefficient blocks backwards
        mov     COUNTER, #\decifactor/4 - 1
        inner_loop  \decifactor, down,, head
1:      sub     PCOEF, PCOEF, #4*JMAX*4
        subs    COUNTER, COUNTER, #1
        inner_loop  \decifactor, down, tail, head
        bne     1b
        inner_loop  \decifactor, down, tail

 .if \decifactor == 32
        vpop    {s16-s31}
 .else
        vpop    {s16-s27}
 .endif
        fmxr    FPSCR, OLDFPSCR         @ restore caller's vector length/stride settings
        bx      lr
endfunc
.endm
171
        dca_lfe_fir  64                 @ emit ff_dca_lfe_fir64_vfp
 .ltorg                                 @ flush the literal pool here so the
                                        @ =0x03030000 literal stays in vldr/ldr range
        dca_lfe_fir  32                 @ emit ff_dca_lfe_fir32_vfp

        @ Release the LFE FIR aliases so the names can be reused below.
        .unreq  POUT
        .unreq  PIN
        .unreq  PCOEF
        .unreq  OLDFPSCR
        .unreq  COUNTER

        .unreq  IN0
        .unreq  IN1
        .unreq  IN2
        .unreq  IN3
        .unreq  IN4
        .unreq  IN5
        .unreq  IN6
        .unreq  IN7
        .unreq  COEF0
        .unreq  COEF1
        .unreq  COEF2
        .unreq  COEF3
        .unreq  COEF4
        .unreq  COEF5
        .unreq  COEF6
        .unreq  COEF7
        .unreq  ACCUM0
        .unreq  ACCUM4
        .unreq  POST0
        .unreq  POST1
        .unreq  POST2
        .unreq  POST3
204
205
@ Register aliases for ff_dca_qmf_32_subbands_vfp (see prototype comment below).
IN      .req    a1 @ samples_in: 32 subbands x 8 samples
SBACT   .req    a2 @ sb_act: number of active subbands
OLDFPSCR .req   a3 @ caller's FPSCR, restored before calling out and at exit
IMDCT   .req    a4 @ imdct argument; live only on entry (saved to stack at once)
WINDOW  .req    v1 @ window argument, loaded from the caller's stack
OUT     .req    v2 @ samples_out, advanced by 32 floats per synth call
BUF     .req    v3 @ stack-allocated 8x32 transposed sample buffer
SCALEINT .req   v4 @ only used in softfp case
COUNT   .req    v5 @ dual-purpose counter (see packing comment in the function)

SCALE   .req    s0 @ scale argument (hardfp: arrives in s0)
217
218/* Stack layout differs in softfp and hardfp cases:
219 *
220 * hardfp
221 *      fp -> 6 arg words saved by caller
222 *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
223 *            s16-s23 on entry
224 *            align 16
225 *     buf -> 8*32*4 bytes buffer
226 *            s0 on entry
227 *      sp -> 3 arg words for callee
228 *
229 * softfp
230 *      fp -> 7 arg words saved by caller
231 *            a4,v1-v5,fp,lr on entry
232 *            s16-s23 on entry
233 *            align 16
234 *     buf -> 8*32*4 bytes buffer
235 *      sp -> 4 arg words for callee
236 */
237
238/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
239 *                                 SynthFilterContext *synth, FFTContext *imdct,
240 *                                 float (*synth_buf_ptr)[512],
241 *                                 int *synth_buf_offset, float (*synth_buf2)[32],
242 *                                 const float (*window)[512], float *samples_out,
243 *                                 float (*raXin)[32], float scale);
244 */
@ Transposes the active part of the 32x8 input matrix into an 8x32 stack
@ buffer (negating subbands 4n and 4n+3, zero-filling inactive subbands),
@ then calls ff_synth_filter_float_vfp once per 32-sample row.
@ Assembled twice: VFP-prefixed lines are for the hardfp ABI, NOVFP for softfp.
function ff_dca_qmf_32_subbands_vfp, export=1
VFP     push    {a3-a4,v1-v3,v5,fp,lr}  @ a3 pushed just to keep sp 8-byte aligned
NOVFP   push    {a4,v1-v5,fp,lr}
        add     fp, sp, #8*4            @ fp -> first stack argument (synth_buf_ptr)
        vpush   {s16-s23}               @ callee-saved VFP registers used below
        @ The buffer pointed at by raXin isn't big enough for us to do a
        @ complete matrix transposition as we want to, so allocate an
        @ alternative buffer from the stack. Align to 4 words for speed.
        sub     BUF, sp, #8*32*4
        bic     BUF, BUF, #15
        mov     sp, BUF
        ldr     lr, =0x03330000     @ RunFast mode, short vectors of length 4, stride 2
        fmrx    OLDFPSCR, FPSCR
        fmxr    FPSCR, lr
        @ COUNT is used to count down 2 things at once:
        @ bits 0-4 are the number of word pairs remaining in the output row
        @ bits 5-31 are the number of words to copy (with possible negation)
        @   from the source matrix before we start zeroing the remainder
        mov     COUNT, #(-4 << 5) + 16  @ 16 pairs per row; bias copy count by -4
        adds    COUNT, COUNT, SBACT, lsl #5
        bmi     2f                      @ fewer than 4 active subbands: skip main loop
1:
        @ Main loop: transpose 4 subbands (8 samples each) per iteration.
        @ Subbands 4n go into the even registers s8..s22, subbands 4n+1 into
        @ the odd registers s9..s23; each d register then holds one sample
        @ from two adjacent subbands, so the vstr block below performs the
        @ transposition into BUF[sample][subband].
        vldr    s8,  [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8                  @ stride-2 vector op: negates s8,s10,s12,s14
        vldr    s9,  [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16                @ negates s16,s18,s20,s22 (rest of subband 4n)
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        @ Second pair of subbands: 4n+3 (negated) in odd regs, 4n+2 in even.
        vldr    s9,  [IN, #(3*8+0)*4]
        vldr    s11, [IN, #(3*8+1)*4]
        vldr    s13, [IN, #(3*8+2)*4]
        vldr    s15, [IN, #(3*8+3)*4]
        vldr    s17, [IN, #(3*8+4)*4]
        vldr    s19, [IN, #(3*8+5)*4]
        vldr    s21, [IN, #(3*8+6)*4]
        vldr    s23, [IN, #(3*8+7)*4]
        vneg.f  s9, s9                  @ negates s9,s11,s13,s15
        vldr    s8,  [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vneg.f  s17, s17                @ negates s17,s19,s21,s23
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vstr    d4,  [BUF, #(0*32+2)*4]
        vstr    d5,  [BUF, #(1*32+2)*4]
        vstr    d6,  [BUF, #(2*32+2)*4]
        vstr    d7,  [BUF, #(3*32+2)*4]
        vstr    d8,  [BUF, #(4*32+2)*4]
        vstr    d10, [BUF, #(6*32+2)*4]
        vstr    d9,  [BUF, #(5*32+2)*4]
        vstr    d11, [BUF, #(7*32+2)*4]
        add     IN, IN, #4*8*4          @ advance 4 subbands in the source
        add     BUF, BUF, #4*4          @ advance 4 columns in the destination
        subs    COUNT, COUNT, #(4 << 5) + 2 @ 4 words copied, 2 row pairs consumed
        bpl     1b
2:      @ Now deal with trailing < 4 samples
        adds    COUNT, COUNT, #3 << 5   @ still negative <=> sb_act was 4n
        bmi     4f  @ sb_act was a multiple of 4
        bics    lr, COUNT, #0x1F        @ isolate the copy-count field
        bne     3f
        @ sb_act was n*4+1
        vldr    s8,  [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8                  @ subband 4n is negated, its pair is zero
        vldr    s9,  zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vneg.f  s16, s16
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1        @ one row pair consumed
        b       4f
3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
        vldr    s8,  [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9,  [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #(2 << 5) + 1
        bics    lr, COUNT, #0x1F        @ any copy words left? (n*4+3 case)
        bne     4f
        @ sb_act was n*4+3
        vldr    s8,  [IN, #(2*8+0)*4]   @ subband 4n+2: not negated, pair is zero
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vldr    s9,  zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1
4:      @ Now fill the remainder with 0
        vldr    s8, zero
        vldr    s9, zero
        ands    COUNT, COUNT, #0x1F     @ word pairs left to zero in each row
        beq     6f
5:      vstr    d4, [BUF, #(0*32+0)*4]  @ zero one column pair across all 8 rows
        vstr    d4, [BUF, #(1*32+0)*4]
        vstr    d4, [BUF, #(2*32+0)*4]
        vstr    d4, [BUF, #(3*32+0)*4]
        vstr    d4, [BUF, #(4*32+0)*4]
        vstr    d4, [BUF, #(5*32+0)*4]
        vstr    d4, [BUF, #(6*32+0)*4]
        vstr    d4, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        subs    COUNT, COUNT, #1
        bne     5b
6:
        fmxr    FPSCR, OLDFPSCR         @ leave vector mode before calling out
        ldr     WINDOW, [fp, #3*4]      @ 'window' stack argument
        ldr     OUT, [fp, #4*4]         @ 'samples_out' stack argument
        sub     BUF, BUF, #32*4         @ rewind: we advanced one row width (32 floats)
NOVFP   ldr     SCALEINT, [fp, #6*4]    @ softfp: 'scale' passed as an integer word
        mov     COUNT, #8               @ one synth_filter call per transposed row
VFP     vpush   {SCALE}                 @ keep scale across calls (s0 is caller-saved)
VFP     sub     sp, sp, #3*4            @ outgoing stack args: window, out, in
NOVFP   sub     sp, sp, #4*4            @ ... plus scale in the softfp case
7:
VFP     ldr     a1, [fp, #-7*4]     @ imdct
NOVFP   ldr     a1, [fp, #-8*4]
        ldmia   fp, {a2-a4}             @ synth_buf_ptr, synth_buf_offset, synth_buf2
VFP     stmia   sp, {WINDOW, OUT, BUF}
NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT}
VFP     vldr    SCALE, [sp, #3*4]       @ reload scale clobbered by the previous call
        bl      X(ff_synth_filter_float_vfp)
        add     OUT, OUT, #32*4         @ next 32 output samples
        add     BUF, BUF, #32*4         @ next transposed row
        subs    COUNT, COUNT, #1
        bne     7b

        @ sp -> saved {s16-s23} (8 words) just below the 8 pushed core registers.
        @ Thumb cannot encode "sub sp, fp, #imm", hence the two-step T variant.
A       sub     sp, fp, #(8+8)*4
T       sub     fp, fp, #(8+8)*4
T       mov     sp, fp
        vpop    {s16-s23}
VFP     pop     {a3-a4,v1-v3,v5,fp,pc}
NOVFP   pop     {a4,v1-v5,fp,pc}
endfunc
462
        @ Release the QMF aliases.
        .unreq  IN
        .unreq  SBACT
        .unreq  OLDFPSCR
        .unreq  IMDCT
        .unreq  WINDOW
        .unreq  OUT
        .unreq  BUF
        .unreq  SCALEINT
        .unreq  COUNT

        .unreq  SCALE

        .align 2
zero:   .word   0       @ +0.0f literal, vldr'd to zero-fill inactive subbands
477