/*
 * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/arm/asm.S"

/**
 * ARM VFP optimised int32 to float conversion, with each block of 8 samples
 * scaled by the corresponding entry of the mul array.
 * Assumes len is a multiple of 8 and that the destination buffer is at least
 * 4-byte aligned (16-byte alignment is best for BCM2835); little-endian.
 */
@ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, const int32_t *src, const float *mul, int len)
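@ For reference, the operation performed is roughly equivalent to the
@ following C (a sketch of the intended semantics, not part of the build):
@     for (i = 0; i < len; i += 8)
@         for (j = 0; j < 8; j++)
@             dst[i + j] = src[i + j] * mul[i >> 3];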
function ff_int32_to_float_fmul_array8_vfp, export=1
        push    {lr}
        ldr     a1, [sp, #4]
        subs    lr, a1, #3*8
        bcc     50f                        @ too short to pipeline
        @ Now need to find (len / 8) % 3. The approximation
        @ x / 24 = (x * 0xAB) >> 12
        @ is good for x < 4096, which is true for both AC3 and DCA.
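        @ (0xAB = 171 and 171/4096 = 1/24 + 1/12288; since x is a multiple of
        @ 8, the fractional part of x/24 is at most 2/3, so the error x/12288
        @ cannot push the result past the next integer until x reaches 4096.)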
        mov     a1, #0xAB
        ldr     ip, =0x03070000            @ RunFast mode, short vectors of length 8, stride 1
        mul     a1, lr, a1
        vpush   {s16-s31}
        mov     a1, a1, lsr #12
        add     a1, a1, a1, lsl #1
        rsb     a1, a1, lr, lsr #3
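        @ a1 now holds ((len - 24) / 8) % 3, which selects where to enter the
        @ software-pipelined loop below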
        cmp     a1, #1
        fmrx    a1, FPSCR
        fmxr    FPSCR, ip
        beq     11f
        blo     10f
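        @ The loop below is software-pipelined across three register banks
        @ (s8-s15, s16-s23, s24-s31): while one bank is converted and scaled,
        @ the previous one is stored and the next one is loaded. With LEN=8 in
        @ FPSCR, each vmul scales a whole bank by a scalar taken from the
        @ scalar bank s0-s7.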
        @ Array is (2 + multiple of 3) x 8 floats long
        @ drop through...
        vldmia          a3!, {s16-s23}
        vldmia          a4!, {s2,s3}
        vldmia          a3!, {s24-s31}
        vcvt.f32.s32    s16, s16
        vcvt.f32.s32    s17, s17
        vcvt.f32.s32    s18, s18
        vcvt.f32.s32    s19, s19
        vcvt.f32.s32    s20, s20
        vcvt.f32.s32    s21, s21
        vcvt.f32.s32    s22, s22
        vcvt.f32.s32    s23, s23
        vmul.f32        s16, s16, s2
        @ drop through...
3:
        vldmia          a3!, {s8-s15}
        vldmia          a4!, {s1}
        vcvt.f32.s32    s24, s24
        vcvt.f32.s32    s25, s25
        vcvt.f32.s32    s26, s26
        vcvt.f32.s32    s27, s27
        vcvt.f32.s32    s28, s28
        vcvt.f32.s32    s29, s29
        vcvt.f32.s32    s30, s30
        vcvt.f32.s32    s31, s31
        vmul.f32        s24, s24, s3
        vstmia          a2!, {s16-s19}
        vstmia          a2!, {s20-s23}
2:
        vldmia          a3!, {s16-s23}
        vldmia          a4!, {s2}
        vcvt.f32.s32    s8, s8
        vcvt.f32.s32    s9, s9
        vcvt.f32.s32    s10, s10
        vcvt.f32.s32    s11, s11
        vcvt.f32.s32    s12, s12
        vcvt.f32.s32    s13, s13
        vcvt.f32.s32    s14, s14
        vcvt.f32.s32    s15, s15
        vmul.f32        s8, s8, s1
        vstmia          a2!, {s24-s27}
        vstmia          a2!, {s28-s31}
1:
        vldmia          a3!, {s24-s31}
        vldmia          a4!, {s3}
        vcvt.f32.s32    s16, s16
        vcvt.f32.s32    s17, s17
        vcvt.f32.s32    s18, s18
        vcvt.f32.s32    s19, s19
        vcvt.f32.s32    s20, s20
        vcvt.f32.s32    s21, s21
        vcvt.f32.s32    s22, s22
        vcvt.f32.s32    s23, s23
        vmul.f32        s16, s16, s2
        vstmia          a2!, {s8-s11}
        vstmia          a2!, {s12-s15}

        subs            lr, lr, #8*3
        bpl             3b

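        @ Drain the pipeline: s16-s23 (already scaled) still needs storing,
        @ and s24-s31 still needs converting, scaling and storing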
        vcvt.f32.s32    s24, s24
        vcvt.f32.s32    s25, s25
        vcvt.f32.s32    s26, s26
        vcvt.f32.s32    s27, s27
        vcvt.f32.s32    s28, s28
        vcvt.f32.s32    s29, s29
        vcvt.f32.s32    s30, s30
        vcvt.f32.s32    s31, s31
        vmul.f32        s24, s24, s3
        vstmia          a2!, {s16-s19}
        vstmia          a2!, {s20-s23}
        vstmia          a2!, {s24-s27}
        vstmia          a2!, {s28-s31}

        fmxr    FPSCR, a1
        vpop    {s16-s31}
        pop     {pc}

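        @ Alternative preambles: depending on ((len - 24) / 8) % 3 computed
        @ above, prime one or two banks here and join the main loop at the
        @ matching stage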
10:     @ Array is (multiple of 3) x 8 floats long
        vldmia          a3!, {s8-s15}
        vldmia          a4!, {s1,s2}
        vldmia          a3!, {s16-s23}
        vcvt.f32.s32    s8, s8
        vcvt.f32.s32    s9, s9
        vcvt.f32.s32    s10, s10
        vcvt.f32.s32    s11, s11
        vcvt.f32.s32    s12, s12
        vcvt.f32.s32    s13, s13
        vcvt.f32.s32    s14, s14
        vcvt.f32.s32    s15, s15
        vmul.f32        s8, s8, s1
        b               1b

11:     @ Array is (1 + multiple of 3) x 8 floats long
        vldmia          a3!, {s24-s31}
        vldmia          a4!, {s3}
        vldmia          a3!, {s8-s15}
        vldmia          a4!, {s1}
        vcvt.f32.s32    s24, s24
        vcvt.f32.s32    s25, s25
        vcvt.f32.s32    s26, s26
        vcvt.f32.s32    s27, s27
        vcvt.f32.s32    s28, s28
        vcvt.f32.s32    s29, s29
        vcvt.f32.s32    s30, s30
        vcvt.f32.s32    s31, s31
        vmul.f32        s24, s24, s3
        b               2b

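        @ Short case (len < 24): no software pipelining, just a simple loop
        @ over 8 samples per iteration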
50:
        ldr     lr, =0x03070000         @ RunFast mode, short vectors of length 8, stride 1
        fmrx    ip, FPSCR
        fmxr    FPSCR, lr
51:
        vldmia          a3!, {s8-s15}
        vldmia          a4!, {s0}
        vcvt.f32.s32    s8, s8
        vcvt.f32.s32    s9, s9
        vcvt.f32.s32    s10, s10
        vcvt.f32.s32    s11, s11
        vcvt.f32.s32    s12, s12
        vcvt.f32.s32    s13, s13
        vcvt.f32.s32    s14, s14
        vcvt.f32.s32    s15, s15
        vmul.f32        s8, s8, s0
        subs            a1, a1, #8
        vstmia          a2!, {s8-s11}
        vstmia          a2!, {s12-s15}
        bne             51b

        fmxr    FPSCR, ip
        pop     {pc}
endfunc

/**
 * ARM VFP optimised int32 to float conversion, scaled by a single multiplier.
 * Assumes len is a multiple of 8 and that the destination buffer is at least
 * 4-byte aligned (16-byte alignment is best for BCM2835); little-endian.
 * TODO: could be further optimised by unrolling and interleaving, as above
 */
@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
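@ For reference, roughly equivalent C (a sketch of the intended semantics):
@     for (i = 0; i < len; i++)
@         dst[i] = src[i] * mul;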
function ff_int32_to_float_fmul_scalar_vfp, export=1
VFP     tmp     .req    a4
VFP     len     .req    a3
NOVFP   tmp     .req    a3
NOVFP   len     .req    a4
NOVFP   vmov    s0, a3
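        @ With hardfloat (VFP) calls, mul arrives in s0 and len in a3; with
        @ softfloat (NOVFP) calls, mul arrives in a3 and is moved to s0 above,
        @ with len in a4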
        ldr     tmp, =0x03070000           @ RunFast mode, short vectors of length 8, stride 1
        fmrx    ip, FPSCR
        fmxr    FPSCR, tmp
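        @ As above, LEN=8 in FPSCR lets the single vmul in the loop scale all
        @ of s8-s15 by the scalar s0 in one instruction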
1:
        vldmia          a2!, {s8-s15}
        vcvt.f32.s32    s8, s8
        vcvt.f32.s32    s9, s9
        vcvt.f32.s32    s10, s10
        vcvt.f32.s32    s11, s11
        vcvt.f32.s32    s12, s12
        vcvt.f32.s32    s13, s13
        vcvt.f32.s32    s14, s14
        vcvt.f32.s32    s15, s15
        vmul.f32        s8, s8, s0
        subs            len, len, #8
        vstmia          a1!, {s8-s11}
        vstmia          a1!, {s12-s15}
        bne             1b

        fmxr    FPSCR, ip
        bx      lr
endfunc
        .unreq  tmp
        .unreq  len