/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

IMDCT         .req    r0
ORIG_P_SB     .req    r1
P_SB_OFF      .req    r2
I             .req    r0
P_SB2_UP      .req    r1
OLDFPSCR      .req    r2
P_SB2_DN      .req    r3
P_WIN_DN      .req    r4
P_OUT_DN      .req    r5
P_SB          .req    r6
J_WRAP        .req    r7
P_WIN_UP      .req    r12
P_OUT_UP      .req    r14

SCALE         .req    s0
SBUF_DAT_REV0 .req    s4
SBUF_DAT_REV1 .req    s5
SBUF_DAT_REV2 .req    s6
SBUF_DAT_REV3 .req    s7
VA0           .req    s8
VA3           .req    s11
VB0           .req    s12
VB3           .req    s15
VC0           .req    s8
VC3           .req    s11
VD0           .req    s12
VD3           .req    s15
SBUF_DAT0     .req    s16
SBUF_DAT1     .req    s17
SBUF_DAT2     .req    s18
SBUF_DAT3     .req    s19
SBUF_DAT_ALT0 .req    s20
SBUF_DAT_ALT1 .req    s21
SBUF_DAT_ALT2 .req    s22
SBUF_DAT_ALT3 .req    s23
WIN_DN_DAT0   .req    s24
WIN_UP_DAT0   .req    s28


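/*
 * One step of the 512-tap windowing loop, covering one 64-float stride.
 * \half selects the accumulator pair: "ab" updates VA/VB (the scaled
 * output samples), "cd" updates VC/VD (the history written back to
 * synth_buf2). \tail emits the down-window MAC left over from the
 * previous step; \head emits this step's loads and up-window MAC, and
 * advances the assembly-time counters OFFSET and J. Passing both lets
 * consecutive calls software-pipeline loads against arithmetic, with
 * SBUF_DAT/SBUF_DAT_ALT double-buffering the synth_buf data.
 */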
.macro inner_loop  half, tail, head
 .if (OFFSET & (64*4)) == 0                @ even numbered call
        SBUF_DAT_THIS0 .req SBUF_DAT0
        SBUF_DAT_THIS1 .req SBUF_DAT1
        SBUF_DAT_THIS2 .req SBUF_DAT2
        SBUF_DAT_THIS3 .req SBUF_DAT3
  .ifnc "\head",""
        vldr    d8, [P_SB, #OFFSET]        @ d8 = SBUF_DAT
        vldr    d9, [P_SB, #OFFSET+8]
  .endif
 .else
        SBUF_DAT_THIS0 .req SBUF_DAT_ALT0
        SBUF_DAT_THIS1 .req SBUF_DAT_ALT1
        SBUF_DAT_THIS2 .req SBUF_DAT_ALT2
        SBUF_DAT_THIS3 .req SBUF_DAT_ALT3
  .ifnc "\head",""
        vldr    d10, [P_SB, #OFFSET]       @ d10 = SBUF_DAT_ALT
        vldr    d11, [P_SB, #OFFSET+8]
  .endif
 .endif
 .ifnc "\tail",""
  .ifc "\half","ab"
        vmls.f  VA0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors
  .else
        vmla.f  VD0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors
  .endif
 .endif
 .ifnc "\head",""
        vldr    d14, [P_WIN_UP, #OFFSET]   @ d14 = WIN_UP_DAT
        vldr    d15, [P_WIN_UP, #OFFSET+8]
        vldr    d12, [P_WIN_DN, #OFFSET]   @ d12 = WIN_DN_DAT
        vldr    d13, [P_WIN_DN, #OFFSET+8]
        vmov    SBUF_DAT_REV3, SBUF_DAT_THIS0  @ reverse the 4 samples, ready for
        vmov    SBUF_DAT_REV2, SBUF_DAT_THIS1  @ the next tail's down-window MAC
        vmov    SBUF_DAT_REV1, SBUF_DAT_THIS2
        vmov    SBUF_DAT_REV0, SBUF_DAT_THIS3
  .ifc "\half","ab"
        vmla.f  VB0, SBUF_DAT_THIS0, WIN_UP_DAT0
  .else
        vmla.f  VC0, SBUF_DAT_THIS0, WIN_UP_DAT0
  .endif
        teq     J_WRAP, #J
        bne     2f             @ strongly predictable, so better than cond exec in this case
        sub     P_SB, P_SB, #512*4  @ wrap to the start of the 512-float ring buffer
2:
  .set J, J - 64
  .set OFFSET, OFFSET + 64*4
 .endif
        .unreq  SBUF_DAT_THIS0
        .unreq  SBUF_DAT_THIS1
        .unreq  SBUF_DAT_THIS2
        .unreq  SBUF_DAT_THIS3
.endm


/* void ff_synth_filter_float_vfp(FFTContext *imdct,
 *                                float *synth_buf_ptr, int *synth_buf_offset,
 *                                float synth_buf2[32], const float window[512],
 *                                float out[32], const float in[32], float scale)
 */
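/*
 * For orientation, the two vector passes below compute roughly the
 * following (a hedged scalar sketch, not the exact C reference in
 * libavcodec/synth_filter.c; the up/dn/rev index patterns are elided):
 *
 *     float *synth_buf = synth_buf_ptr + *synth_buf_offset;
 *     imdct_half(imdct, synth_buf, in);       // 32 fresh samples into ring
 *     for (i = 0; i < 16; i++) {              // pass 1: VA/VB -> out[]
 *         float a = synth_buf2[i], b = synth_buf2[i + 16];
 *         for (j = 0; j < 512; j += 64) {     // 8 taps of 512-tap window
 *             a -= window[dn_idx] * synth_buf[rev_idx];   // vmls, down half
 *             b += window[up_idx] * synth_buf[fwd_idx];   // vmla, up half
 *         }
 *         out[i] = a * scale;  out[i + 16] = b * scale;
 *     }
 *     for (i = 0; i < 16; i++) {              // pass 2: VC/VD -> synth_buf2
 *         float c = 0, d = 0;
 *         // same 8-tap pattern, accumulating with vmla only
 *         synth_buf2[i] = c;  synth_buf2[i + 16] = d;
 *     }
 *     *synth_buf_offset = (*synth_buf_offset - 32) & (512 - 32);
 */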
function ff_synth_filter_float_vfp, export=1
        push    {r3-r7,lr}
        vpush   {s16-s31}
        ldr     lr, [P_SB_OFF]
        add     a2, ORIG_P_SB, lr, LSL #2 @ calculate synth_buf to pass to imdct_half
        mov     P_SB, a2                  @ and keep a copy for ourselves
        bic     J_WRAP, lr, #63           @ mangled to make testing for wrap easier in inner loop
        sub     lr, lr, #32
        and     lr, lr, #512-32
        str     lr, [P_SB_OFF]            @ rotate offset, modulo buffer size, ready for next call
        ldr     a3, [sp, #(16+6+2)*4]     @ fetch in from stack, to pass to imdct_half
VFP     vmov    s16, SCALE                @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
        bl      X(ff_imdct_half_vfp)
VFP     vmov    SCALE, s16

        fmrx    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000           @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr
        ldr     P_SB2_DN, [sp, #16*4]
        ldr     P_WIN_DN, [sp, #(16+6+0)*4]
        ldr     P_OUT_DN, [sp, #(16+6+1)*4]
NOVFP   vldr    SCALE, [sp, #(16+6+3)*4]

#define IMM_OFF_SKEW 956                   /* also valid immediate constant when you add 16*4 */
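/* VFP vldr/vstr can only encode an 8-bit word offset (reach +/-1020), so
 * skewing the base pointers by 956 lets OFFSET run from -956 upwards in
 * 64*4 steps while every access stays within the encodable range. */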
        add     P_SB, P_SB, #IMM_OFF_SKEW  @ so we can use -ve offsets to use full immediate offset range
        add     P_SB2_UP, P_SB2_DN, #16*4
        add     P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW
        add     P_OUT_UP, P_OUT_DN, #16*4
        add     P_SB2_DN, P_SB2_DN, #16*4
        add     P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW
        add     P_OUT_DN, P_OUT_DN, #16*4
        mov     I, #4                      @ 4 iterations x 4-float vectors = 16 outputs each way
1:
        vldmia  P_SB2_UP!, {VB0-VB3}
        vldmdb  P_SB2_DN!, {VA0-VA3}
 .set J, 512 - 64
 .set OFFSET, -IMM_OFF_SKEW
        inner_loop  ab,, head
 .rept 7
        inner_loop  ab, tail, head
 .endr
        inner_loop  ab, tail
        add     P_WIN_UP, P_WIN_UP, #4*4
        sub     P_WIN_DN, P_WIN_DN, #4*4
        vmul.f  VB0, VB0, SCALE      @ SCALE treated as scalar
        add     P_SB, P_SB, #(512+4)*4
        subs    I, I, #1
        vmul.f  VA0, VA0, SCALE
        vstmia  P_OUT_UP!, {VB0-VB3}
        vstmdb  P_OUT_DN!, {VA0-VA3}
        bne     1b

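@ Second pass: accumulate the next call's history into VC/VD with the same
@ software-pipelined inner loop, and write it back to synth_buf2 unscaled.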
        add     P_SB2_DN, P_SB2_DN, #(16+28-12)*4
        sub     P_SB2_UP, P_SB2_UP, #(16+16)*4
        add     P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4
        mov     I, #4
1:
        vldr.d  d4, zero             @ d4 = VC0
        vldr.d  d5, zero
        vldr.d  d6, zero             @ d6 = VD0
        vldr.d  d7, zero
 .set J, 512 - 64
 .set OFFSET, -IMM_OFF_SKEW
        inner_loop  cd,, head
 .rept 7
        inner_loop  cd, tail, head
 .endr
        inner_loop  cd, tail
        add     P_WIN_UP, P_WIN_UP, #4*4
        sub     P_WIN_DN, P_WIN_DN, #4*4
        add     P_SB, P_SB, #(512+4)*4
        subs    I, I, #1
        vstmia  P_SB2_UP!, {VC0-VC3}
        vstmdb  P_SB2_DN!, {VD0-VD3}
        bne     1b

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s31}
        pop     {r3-r7,pc}
endfunc

        .unreq  IMDCT
        .unreq  ORIG_P_SB
        .unreq  P_SB_OFF
        .unreq  I
        .unreq  P_SB2_UP
        .unreq  OLDFPSCR
        .unreq  P_SB2_DN
        .unreq  P_WIN_DN
        .unreq  P_OUT_DN
        .unreq  P_SB
        .unreq  J_WRAP
        .unreq  P_WIN_UP
        .unreq  P_OUT_UP

        .unreq  SCALE
        .unreq  SBUF_DAT_REV0
        .unreq  SBUF_DAT_REV1
        .unreq  SBUF_DAT_REV2
        .unreq  SBUF_DAT_REV3
        .unreq  VA0
        .unreq  VA3
        .unreq  VB0
        .unreq  VB3
        .unreq  VC0
        .unreq  VC3
        .unreq  VD0
        .unreq  VD3
        .unreq  SBUF_DAT0
        .unreq  SBUF_DAT1
        .unreq  SBUF_DAT2
        .unreq  SBUF_DAT3
        .unreq  SBUF_DAT_ALT0
        .unreq  SBUF_DAT_ALT1
        .unreq  SBUF_DAT_ALT2
        .unreq  SBUF_DAT_ALT3
        .unreq  WIN_DN_DAT0
        .unreq  WIN_UP_DAT0

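@ A pair of zero words, loaded with vldr.d above to clear the VC/VD
@ accumulators at the top of the second-pass loop.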
        .align  3
zero:   .word   0, 0