/*
 * ARM NEON optimised Float DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21
22#include "config.h"
23#include "asm.S"
24
/*
 * ff_vector_fmul_neon
 *
 * Element-wise product of two float vectors: dst[i] = a[i] * b[i].
 *
 * Register roles (as used by the code below):
 *   r0  dst, stored through with post-increment
 *   r1  a,   loaded with post-increment
 *   r2  b,   loaded with post-increment
 *   r3  number of 32-bit elements
 *   ip  elements left for the 16-wide main loop
 *
 * Every access carries the :128 qualifier, so all three pointers must be
 * 16-byte aligned.  Eight elements are loaded unconditionally and every
 * later step consumes 8 or 16 more, so the count is expected to be a
 * non-zero multiple of 8.
 * NOTE(review): looks like the NEON backend of the float DSP
 * vector_fmul(dst, src0, src1, len) hook; confirm against the C init code.
 *
 * The code is software pipelined: q8/q9 always hold eight products that
 * have been computed but not stored yet.
 *
 * Clobbers: r0-r3, ip, q0-q3, q8-q11, flags.
 */
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8            @ r3 = elements left after the first group
        vld1.32         {d0-d3},  [r1,:128]!
        vld1.32         {d4-d7},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f                      @ count was 8 (flags still from the subs)
        bics            ip,  r3,  #15           @ ip = r3 rounded down to a multiple of 16
        beq             2f                      @ less than 16 left: skip the main loop
        @ Main loop, 16 elements per pass: compute 8 new products into
        @ q10/q11, flush the pending q8/q9, refill q8/q9, flush q10/q11.
1:      subs            ip,  ip,  #16           @ sets the flags tested by the bne below
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.32         {d16-d19},[r0,:128]!
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.32         {d20-d23},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15           @ anything left after the 16-wide loop?
        beq             3f
        @ One more group of 8: each half of the pending result is stored
        @ just before its register is overwritten with a new product.
2:      vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vst1.32         {d16-d17},[r0,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vst1.32         {d18-d19},[r0,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.32         {d16-d19},[r0,:128]!    @ flush the last pending group
        bx              lr
endfunc
63
/*
 * ff_vector_fmac_scalar_neon
 *
 * Multiply-accumulate with a scalar: dst[i] += src[i] * mul.
 *
 * Register roles (as used by the code below):
 *   r0   dst, used as the store pointer
 *   acc  copy of the initial r0, used as the load pointer for dst; the
 *        loads run ahead of the stores
 *   r1   src
 *   len  number of 32-bit elements
 *   r12  elements handled by the 16-wide main loop (len with the low four
 *        bits cleared)
 *   q15  mul broadcast to all four lanes
 *
 * Lines tagged VFP / NOVFP are alternatives selected by macros defined
 * outside this file (presumably in asm.S, keyed on whether float arguments
 * arrive in VFP registers -- confirm there):
 *   VFP:   mul in d0[0], len in r2, r3 used for acc
 *   NOVFP: mul in r2,    len in r3, r2 reused for acc once mul is in q15
 *
 * dst and src must be 16-byte aligned (:128).  The tail loop works in
 * groups of 4 and runs at least once when entered, so len is expected to
 * be a non-zero multiple of 4.
 *
 * Clobbers: r0-r3, r12, q0-q3, q8-q11, q15, flags.
 */
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32         q15, d0[0]
NOVFP   vdup.32         q15, r2
        bics            r12, len, #15           @ r12 = elements for the 16-wide loop
        mov             acc, r0                 @ does not touch the flags from bics
        beq             3f                      @ fewer than 16: tail loop only
        @ Prime the pipeline with the first 8 elements of src and dst.
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        @ Main loop, 16 elements per pass.
1:      vmla.f32        q8,  q0,  q15
        vld1.32         {q2},     [r1,:128]!
        vld1.32         {q10},    [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},     [r1,:128]!
        vld1.32         {q11},    [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},     [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f                      @ main loop done: drain q10/q11
        @ Preload the next pass while storing the second half of this one.
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vst1.32         {q10},    [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        vst1.32         {q11},    [r0,:128]!
        b               1b
2:      vst1.32         {q10},    [r0,:128]!
        vst1.32         {q11},    [r0,:128]!
        ands            len, len, #15           @ elements left for the tail
        it              eq
        bxeq            lr
        @ Tail loop, 4 elements per pass.
3:      vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        @ Release both aliases so neither name leaks into later functions.
        .unreq          len
        .unreq          acc
endfunc
111
/*
 * ff_vector_fmul_scalar_neon
 *
 * Scale a float vector: dst[i] = src[i] * mul.
 *
 * Register roles (as used by the code below):
 *   r0   dst, stored through with post-increment
 *   r1   src, loaded with post-increment
 *   len  number of 32-bit elements
 *   r12  elements handled by the 16-wide main loop (len with the low four
 *        bits cleared)
 *   q8   mul broadcast to all four lanes
 *
 * Lines tagged VFP / NOVFP are alternatives selected by macros defined
 * outside this file (presumably in asm.S, keyed on whether float arguments
 * arrive in VFP registers -- confirm there):
 *   VFP:   mul in d0[0], len in r2
 *   NOVFP: mul in r2,    len in r3
 *
 * dst and src must be 16-byte aligned (:128).  The tail loop works in
 * groups of 4 and runs at least once when entered, so len is expected to
 * be a non-zero multiple of 4.
 *
 * Clobbers: r0, r1, len, r12, q0-q3, q8, flags.
 */
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15           @ r12 = elements for the 16-wide loop
        beq             3f                      @ fewer than 16: tail loop only
        @ Prime the pipeline with the first 8 source elements.
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
        @ Main loop, 16 elements per pass; products overwrite their inputs.
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f                      @ main loop done: drain q2/q3
        @ Preload the next pass while storing the second half of this one.
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15           @ elements left for the tail
        it              eq
        bxeq            lr
        @ Tail loop, 4 elements per pass.
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc
149
/*
 * ff_vector_fmul_window_neon
 *
 * Windowed combination of two blocks.  With n the count read from the
 * stack, 0 <= i < n, j = 2*n - 1 - i and k = n - 1 - i, the code computes
 *
 *     dst[i] = a[i] * win[j] - b[k] * win[i]
 *     dst[j] = a[i] * win[i] + b[k] * win[j]
 *
 * so a is n elements, b is n elements, win and dst are 2*n elements.
 *
 * Register roles (as used by the code below):
 *   r0  dst, front half, walking forwards
 *   ip  dst, back half, walking backwards
 *   r1  a, walking forwards
 *   r2  b, walking backwards (starts at the last 4 elements)
 *   r3  win, walking forwards
 *   r4  win, walking backwards (starts at the last 4 elements)
 *   r5  scratch for the start offsets, then the -16 byte backward stride
 *   lr  remaining count; the return address is recovered by the final pop
 *
 * n is the first stack word at entry, read at [sp, #12] after the 12-byte
 * push.  Each pass consumes 4 elements of a and b and the loop only exits
 * when the count hits exactly zero, so n is expected to be a non-zero
 * multiple of 4.  All vector accesses carry the :128 qualifier, so every
 * pointer must be 16-byte aligned.
 *
 * Clobbers: r0-r3, ip, q0-q3, q9-q12, flags (r4, r5 are saved/restored).
 */
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]          @ lr = n
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2    @ r2 = b   + 4*n - 16
        add             r4,  r3,  r5, lsl #3    @ r4 = win + 8*n - 16
        add             ip,  r0,  r5, lsl #3    @ ip = dst + 8*n - 16
        mov             r5,  #-16               @ post-index stride for backward walks
        vld1.32         {d0,d1},  [r1,:128]!    @ q0 = a, forwards
        vld1.32         {d2,d3},  [r2,:128], r5 @ q1 = b, from the back
        vld1.32         {d4,d5},  [r3,:128]!    @ q2 = win, forwards
        vld1.32         {d6,d7},  [r4,:128], r5 @ q3 = win, from the back
        @ vrev64 swaps the two lanes inside each d register; pairing the
        @ low half of one operand with the high half of the other (d0 with
        @ d7, d1 with d6, ...) completes the 4-element reversal.
1:      subs            lr,  lr,  #4            @ flags survive until the beq below
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f                      @ last group: finish without preloading
        vmla.f32        d22, d3,  d7
        vld1.32         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.32         {d18,d19},[r2,:128], r5 @ next b goes to q9: q1 is still in use
        vmls.f32        d20, d3,  d4
        vld1.32         {d24,d25},[r3,:128]!    @ next win goes to q12: q2 is still in use
        vmls.f32        d21, d2,  d5
        vld1.32         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11                @ reverse q11 for the back half of dst
        vmov            q2,  q12
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc
196
/*
 * ff_vector_fmul_add_neon
 *
 * Multiply two float vectors and add a third: dst[i] = a[i] * b[i] + c[i].
 *
 * Register roles (as used by the code below):
 *   r0   dst, stored through with post-increment
 *   r1   a
 *   r2   b
 *   r3   c
 *   r12  remaining element count, loaded from the first stack word
 *
 * All four pointers are accessed with the :128 qualifier and must be
 * 16-byte aligned.  Each pass handles 8 elements and the loop only exits
 * when the count hits exactly zero, so the count is expected to be a
 * non-zero multiple of 8.
 *
 * The loop is software pipelined: the sums in q12/q13 are stored one pass
 * after they are computed, or at label 2 for the final group.
 *
 * Clobbers: r0-r3, r12, q0-q3, q8-q13, flags.
 */
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]               @ r12 = element count
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]               @ prefetch hints for the three inputs
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f                      @ that was the last group
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc
223
/*
 * ff_vector_fmul_reverse_neon
 *
 * Multiply a vector by another one read back to front:
 *     dst[i] = a[i] * b[n - 1 - i]
 *
 * Register roles (as used by the code below):
 *   r0   dst, stored through with post-increment
 *   r1   a, walking forwards
 *   r2   b, walking backwards (starts at the last 8 elements)
 *   r3   n, the element count
 *   r12  the -32 byte backward stride
 *
 * All accesses carry the :128 qualifier, so the pointers must be 16-byte
 * aligned.  Each pass handles 8 elements and the loop only exits when the
 * count hits exactly zero, so n is expected to be a non-zero multiple of 8.
 *
 * Clobbers: r0-r3, r12, q0-q3, q8, q9, flags.
 */
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32           @ r2 = b + 4*n - 32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        @ vrev64 swaps the two lanes inside each d register; pairing d0
        @ with d7, d1 with d6, d2 with d5 and d3 with d4 then completes
        @ the 8-element reversal of the block loaded from b.
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f                      @ that was the last group
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc
247
/*
 * ff_butterflies_float_neon
 *
 * In-place butterfly on two float vectors.  For every element:
 *     t     = v1[i] - v2[i]
 *     v1[i] = v1[i] + v2[i]
 *     v2[i] = t
 *
 * Register roles (as used by the code below):
 *   r0  v1, receives the sums
 *   r1  v2, receives the differences
 *   r2  element count
 *
 * Both pointers are accessed with the :128 qualifier and must be 16-byte
 * aligned.  The loop handles 4 elements per pass and always runs at least
 * once, so the count is expected to be a positive multiple of 4.
 *
 * Clobbers: r0-r2, q0-q2, flags.
 */
function ff_butterflies_float_neon, export=1
        @ The loads leave the pointers alone; the stores advance them.
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc
259
/*
 * ff_scalarproduct_float_neon
 *
 * Dot product of two float vectors: the sum over i of a[i] * b[i].
 *
 * Register roles (as used by the code below):
 *   r0  a
 *   r1  b
 *   r2  element count
 *   q2  four running partial sums, one per lane
 *
 * Both pointers are accessed with the :128 qualifier and must be 16-byte
 * aligned.  The loop handles 4 elements per pass and always runs at least
 * once, so the count is expected to be a positive multiple of 4.
 *
 * The result is left in lane 0 of d0.  The line tagged NOVFP also copies
 * it to r0; that tag is a macro defined outside this file (presumably for
 * builds where float results are returned in core registers -- confirm in
 * asm.S).
 *
 * The four lanes are summed independently and only combined at the end,
 * so rounding can differ from a strictly sequential accumulation.
 *
 * Clobbers: r0-r2, q0-q2, flags.
 */
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5            @ fold 4 partial sums down to 2
        vpadd.f32       d0,  d0,  d0            @ then to 1 (in both lanes of d0)
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc
272