1/*
2 * Copyright (c) 2012 Mans Rullgard
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/arm/asm.S"
22
@ Accumulate squared magnitudes of complex pairs into dst:
@   dst[i] += src[i][0]^2 + src[i][1]^2
@ In:  r0 = dst (float *), r1 = src (interleaved re/im float pairs),
@      r2 = n (number of complex pairs)
@ NOTE(review): the pipelined loop structure assumes n is a multiple of 4
@ and n >= 8 — confirm against callers.
function ff_ps_add_squares_neon, export=1
        mov             r3,  r0                 @ r3 = dst store pointer (r0 stays the load pointer)
        sub             r2,  r2,  #4            @ final 4 pairs are folded after the loop
        vld1.32         {q0},     [r1,:128]!    @ prime the pipeline: pairs 0-1
        vmul.f32        q0,  q0,  q0            @ square re and im lanes
        vld1.32         {q2},     [r1,:128]!    @ pairs 2-3
        vmul.f32        q2,  q2,  q2
        vld1.32         {q1},     [r0,:128]!    @ 4 dst values to accumulate into
1:
        vpadd.f32       d6,  d0,  d1            @ re^2+im^2 for pairs 0,1
        vld1.32         {q0},     [r1,:128]!    @ overlap: load next iteration's pairs
        vpadd.f32       d7,  d4,  d5            @ re^2+im^2 for pairs 2,3
        vmul.f32        q0,  q0,  q0
        vld1.32         {q2},     [r1,:128]!
        vadd.f32        q3,  q1,  q3            @ dst += |src|^2
        vld1.32         {q1},     [r0,:128]!
        vmul.f32        q2,  q2,  q2
        vst1.32         {q3},     [r3,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        @ epilogue: fold the 4 pairs already loaded/squared by the last iteration
        vpadd.f32       d6,  d0,  d1
        vpadd.f32       d7,  d4,  d5
        vadd.f32        q1,  q1,  q3
        vst1.32         {q1},     [r3,:128]!
        bx              lr
endfunc
49
@ Multiply complex values by real scalars, 4 at a time:
@   dst[i] = src0[i] * src1[i]   (src0 complex, src1 real)
@ In:  r0 = dst (complex out), r1 = src0 (interleaved re/im pairs),
@      r2 = src1 (real floats), r3 = n (element count)
@ Path 2: is taken when src0 is only 8-byte aligned (tst r1, #8) and
@ rotates data through d0-d2 to keep the loads 128-bit aligned.
@ NOTE(review): loop shape assumes n is a multiple of 4 and n >= 8 — confirm
@ against callers.
function ff_ps_mul_pair_single_neon, export=1
        sub             r3,  r3,  #4            @ final 4 elements handled after the loop
        tst             r1,  #8                 @ src0 16-byte aligned?
        bne             2f                      @ no: use the misaligned path below
        vld1.32         {q0},     [r1,:128]!    @ prime: 2 complex from src0
1:
        vld1.32         {q3},     [r2,:128]!    @ 4 real scalars
        vmul.f32        d4,  d0,  d6[0]         @ dst[0] = src0[0] * src1[0]
        vmul.f32        d5,  d1,  d6[1]         @ dst[1] = src0[1] * src1[1]
        vld1.32         {q1},     [r1,:128]!    @ next 2 complex
        vmul.f32        d6,  d2,  d7[0]         @ dst[2] = src0[2] * src1[2]
        vmul.f32        d7,  d3,  d7[1]         @ dst[3] = src0[3] * src1[3]
        vld1.32         {q0},     [r1,:128]!    @ prefetch next iteration
        vst1.32         {q2,q3},  [r0,:128]!    @ store 4 complex results
        subs            r3,  r3,  #4
        bgt             1b
        @ epilogue: last 4 elements (q0 already loaded)
        vld1.32         {q3},     [r2,:128]!
        vmul.f32        d4,  d0,  d6[0]
        vmul.f32        d5,  d1,  d6[1]
        vld1.32         {q1},     [r1,:128]!
        vmul.f32        d6,  d2,  d7[0]
        vmul.f32        d7,  d3,  d7[1]
        vst1.32         {q2,q3},  [r0,:128]!
        bx              lr
2:
        @ src0 is 8- but not 16-byte aligned: read one pair to realign,
        @ then keep a 3-pair window in d0-d2 across iterations
        vld1.32         {d0},     [r1,:64]!     @ first complex pair
        vld1.32         {d1,d2},  [r1,:128]!    @ pairs 1-2 (now aligned)
1:
        vld1.32         {q3},     [r2,:128]!    @ 4 real scalars
        vmul.f32        d4,  d0,  d6[0]
        vmul.f32        d5,  d1,  d6[1]
        vld1.32         {d0,d1},  [r1,:128]!    @ pairs 3-4
        vmul.f32        d6,  d2,  d7[0]
        vmul.f32        d7,  d0,  d7[1]
        vmov            d0,  d1                 @ rotate window: pair 4 becomes next pair 0
        vld1.32         {d1,d2},  [r1,:128]!    @ refill pairs 1-2
        vst1.32         {q2,q3},  [r0,:128]!
        subs            r3,  r3,  #4
        bgt             1b
        @ epilogue: last 4 elements
        vld1.32         {q3},     [r2,:128]!
        vmul.f32        d4,  d0,  d6[0]
        vmul.f32        d5,  d1,  d6[1]
        vld1.32         {d0},     [r1,:64]!     @ final odd pair
        vmul.f32        d6,  d2,  d7[0]
        vmul.f32        d7,  d0,  d7[1]
        vst1.32         {q2,q3},  [r0,:128]!
        bx              lr
endfunc
98
@ De-interleave hybrid-synthesis data: split interleaved re/im pairs from
@ the input into two separate planes of the output.
@ In:  r0 = out (real plane; imag plane sits 38*64*4 bytes later),
@      r1 = in (interleaved re/im pairs, rows of 32 pairs = 256 bytes),
@      r2 = i (starting channel/column), r3 = n (rows per column)
@ Layout constants visible in the code: out column stride = 4 bytes,
@ out row stride = 64*4 bytes, imag-plane offset = 38*64*4 bytes,
@ in row stride = 32*2*4 bytes; loop bound is 64 - i columns.
@ NOTE(review): r3 is assumed even (rows processed 2 at a time) — confirm.
function ff_ps_hybrid_synthesis_deint_neon, export=1
        push            {r4-r8,lr}
        add             r0,  r0,  r2,  lsl #2       @ out += i columns (4 bytes each)
        add             r1,  r1,  r2,  lsl #5+1+2   @ in  += i * 32*2*4 bytes (i rows)
        rsb             r2,  r2,  #64               @ r2 = 64 - i = columns remaining
        mov             r5,  #64*4                  @ out row stride in bytes
        mov             lr,  r0                     @ lr = real-plane cursor
        add             r4,  r0,  #38*64*4          @ r4 = imag-plane cursor
        mov             r12, r3                     @ r12 = row counter
2:
        @ first column: scalar (1-wide) deinterleave, 2 rows per iteration
        vld1.32         {d0,d1},  [r1,:128]!
        vst1.32         {d0[0]},  [lr,:32], r5      @ re -> real plane
        vst1.32         {d0[1]},  [r4,:32], r5      @ im -> imag plane
        vst1.32         {d1[0]},  [lr,:32], r5
        vst1.32         {d1[1]},  [r4,:32], r5
        subs            r12, r12, #2
        bgt             2b
        add             r0,  r0,  #4                @ advance to next out column
        sub             r2,  r2,  #1
        tst             r2,  #2                     @ columns-remaining not a multiple of 4?
        bne             6f                          @ handle a 2-column residue first
1:
        @ main loop: 4 columns at a time via 4-way interleaved stores
        mov             lr,  r0
        add             r4,  r0,  #38*64*4
        add             r6,  r1,  #  32*2*4         @ r6/r7/r8 = rows +1/+2/+3
        add             r7,  r1,  #2*32*2*4
        add             r8,  r1,  #3*32*2*4
        mov             r12, r3
2:
        vld1.32         {d0,d1},  [r1,:128]!
        vld1.32         {d2,d3},  [r6,:128]!
        vld1.32         {d4,d5},  [r7,:128]!
        vld1.32         {d6,d7},  [r8,:128]!
        vst4.32         {d0[0],d2[0],d4[0],d6[0]}, [lr,:128], r5   @ 4 re, row k
        vst4.32         {d0[1],d2[1],d4[1],d6[1]}, [r4,:128], r5   @ 4 im, row k
        vst4.32         {d1[0],d3[0],d5[0],d7[0]}, [lr,:128], r5   @ 4 re, row k+1
        vst4.32         {d1[1],d3[1],d5[1],d7[1]}, [r4,:128], r5   @ 4 im, row k+1
        subs            r12, r12, #2
        bgt             2b
        add             r0,  r0,  #16               @ next 4 out columns
        add             r1,  r1,  #3*32*2*4         @ r1 already advanced 1 row; skip 3 more
        subs            r2,  r2,  #4
        bgt             1b
        pop             {r4-r8,pc}
6:
        @ residue: 2 columns via 2-way interleaved stores, then rejoin main loop
        mov             lr,  r0
        add             r4,  r0,  #38*64*4
        add             r6,  r1,  #32*2*4
        mov             r12, r3
2:
        vld1.32         {d0,d1},  [r1,:128]!
        vld1.32         {d2,d3},  [r6,:128]!
        vst2.32         {d0[0],d2[0]}, [lr,:64], r5
        vst2.32         {d0[1],d2[1]}, [r4,:64], r5
        vst2.32         {d1[0],d3[0]}, [lr,:64], r5
        vst2.32         {d1[1],d3[1]}, [r4,:64], r5
        subs            r12, r12, #2
        bgt             2b
        add             r0,  r0,  #8                @ next 2 out columns
        add             r1,  r1,  #32*2*4           @ skip the extra row consumed via r6
        sub             r2,  r2,  #2
        b               1b
endfunc
162
@ Hybrid analysis filter: for each output, compute a complex dot product of
@ 13 complex input samples with 13 complex filter coefficients, exploiting
@ the symmetry of the filter around its middle tap.
@ In:  r0 = out (complex, advanced by stride each output),
@      r1 = in (13 complex floats, loaded once via vldm),
@      r2 = filter coefficients (consumed 4 floats at a time),
@      r3 = stride (scaled to bytes by lsl #3, i.e. complex-float units),
@      [sp] = n (number of outputs / filters)
@ d19-d31 = in[0..12]; d25 is the middle sample in[6], multiplied by the
@ middle coefficient d4[0] separately. Symmetric pairs in[k] +/- in[12-k]
@ are pre-combined so each filter needs only half the multiplies.
@ NOTE(review): n is assumed >= 2 (one iteration is peeled before and after
@ the loop) — confirm against callers.
function ff_ps_hybrid_analysis_neon, export=1
        vldm            r1,  {d19-d31}          @ load all 13 complex inputs
        ldr             r12, [sp]               @ r12 = n
        lsl             r3,  r3,  #3            @ stride -> bytes (8 per complex float)
        @ combine symmetric input pairs: sums and differences around in[6]
        vadd.f32        d16, d19, d31           @ in[0] + in[12]
        vadd.f32        d17, d20, d30           @ in[1] + in[11]
        vsub.f32        d18, d19, d31           @ in[0] - in[12]
        vsub.f32        d19, d20, d30           @ in[1] - in[11]
        vsub.f32        d0,  d21, d29           @ in[2] - in[10]
        vsub.f32        d1,  d22, d28           @ in[3] - in[9]
        vadd.f32        d2,  d21, d29           @ in[2] + in[10]
        vadd.f32        d3,  d22, d28           @ in[3] + in[9]
        vadd.f32        d20, d23, d27           @ in[4] + in[8]
        vadd.f32        d21, d24, d26           @ in[5] + in[7]
        vsub.f32        d22, d23, d27           @ in[4] - in[8]
        vsub.f32        d23, d24, d26           @ in[5] - in[7]
        @ build q3 = sign mask flipping one lane per pair (for the
        @ conjugate-style cross terms after the vrev64 swaps)
        vmov.i32        d6,  #1<<31
        vmov.i32        d7,  #0
        vmov.f32        q14, #0.0               @ clear accumulators (real part)
        vmov.f32        q15, #0.0               @ clear accumulators (imag part)
        vtrn.32         d6,  d7
        vrev64.32       q9,  q9                 @ swap re/im of the difference terms
        vrev64.32       q0,  q0
        vrev64.32       q11, q11
        veor            q9,  q9,  q3            @ negate one lane -> (im, -re) etc.
        veor            q0,  q0,  q3
        veor            q11, q11, q3
        vld1.32         {q13},    [r2,:128]!    @ first 4 filter coefficients
        vtrn.32         q8,  q9                 @ interleave sum/diff terms per lane
        vtrn.32         q1,  q0
        vtrn.32         q10, q11
        sub             r12, r12, #1            @ one iteration peeled below
        vmla.f32        q14, q8,  q13           @ accumulate first coefficient group
        vld1.32         {q2},     [r2,:128]!
        vmla.f32        q15, q9,  q13
1:
        @ finish the previous filter while starting the next one
        vmla.f32        q14, q1,  q2
        vld1.32         {q13},    [r2,:128]!
        vmla.f32        q15, q0,  q2
        vmla.f32        q14, q10, q13
        vld1.32         {q2},     [r2,:128]!
        vmla.f32        q15, q11, q13
        vld1.32         {q13},    [r2,:128]!    @ next filter's first group
        vadd.f32        d6,  d28, d29           @ horizontal reduce real accumulator
        vadd.f32        d7,  d30, d31           @ horizontal reduce imag accumulator
        vmov.f32        q14, #0.0               @ restart accumulators for next filter
        vmov.f32        q15, #0.0
        vmla.f32        q14, q8,  q13
        vpadd.f32       d6,  d6,  d7            @ d6 = (re, im) result
        vmla.f32        q15, q9,  q13
        vmla.f32        d6,  d25, d4[0]         @ + middle tap: in[6] * center coeff
        vld1.32         {q2},     [r2,:128]!
        vst1.32         {d6},     [r0,:64], r3  @ store complex output, advance by stride
        subs            r12, r12, #1
        bgt             1b
        @ epilogue: finish the final filter
        vmla.f32        q14, q1,  q2
        vld1.32         {q13},    [r2,:128]!
        vmla.f32        q15, q0,  q2
        vmla.f32        q14, q10, q13
        vld1.32         {q2},     [r2,:128]!
        vmla.f32        q15, q11, q13
        vadd.f32        d6,  d28, d29
        vadd.f32        d7,  d30, d31
        vpadd.f32       d6,  d6,  d7
        vmla.f32        d6,  d25, d4[0]
        vst1.32         {d6},     [r0,:64], r3
        bx              lr
endfunc
231
@ Stereo interpolation: mix left/right complex samples in place with a 2x2
@ matrix whose 4 coefficients advance by a per-sample step:
@   l[i] = h0*l + h1*r ;  r[i] = h2*l + h3*r   (h += h_step each sample)
@ In:  r0 = l (complex, in/out), r1 = r (complex, in/out),
@      r2 = h (4 float coefficients), r3 = h_step (4 float increments),
@      [sp] = n (sample count)
@ Two samples are processed per loop iteration: q1 holds h after 1 step
@ (first sample), q0 holds h after 2 steps (second sample), and both
@ advance by q15 = 2*h_step per iteration. Label 2: handles the final
@ sample when n is odd.
function ff_ps_stereo_interpolate_neon, export=1
        vld1.32         {q0},     [r2]          @ q0 = h coefficients
        vld1.32         {q14},    [r3]          @ q14 = h_step
        vadd.f32        q15, q14, q14           @ q15 = 2 * h_step
        mov             r2,  r0                 @ r2/r3 = store cursors (r0/r1 load)
        mov             r3,  r1
        ldr             r12, [sp]               @ r12 = n
        vadd.f32        q1,  q0,  q14           @ q1 = coeffs for sample 0
        vadd.f32        q0,  q0,  q15           @ q0 = coeffs for sample 1
        vld1.32         {q2},     [r0,:64]!     @ q2 = l[0], l[1]
        vld1.32         {q3},     [r1,:64]!     @ q3 = r[0], r[1]
        subs            r12, r12, #1
        beq             2f                      @ n == 1: single-sample tail only
1:
        vmul.f32        d16, d4,  d2[0]         @ l'[0] = h0 * l[0]
        vmul.f32        d17, d5,  d0[0]         @ l'[1] = h0' * l[1]
        vmul.f32        d18, d4,  d2[1]         @ r'[0] = h2 * l[0]
        vmul.f32        d19, d5,  d0[1]         @ r'[1] = h2' * l[1]
        vmla.f32        d16, d6,  d3[0]         @ l'[0] += h1 * r[0]
        vmla.f32        d17, d7,  d1[0]         @ l'[1] += h1' * r[1]
        vmla.f32        d18, d6,  d3[1]         @ r'[0] += h3 * r[0]
        vmla.f32        d19, d7,  d1[1]         @ r'[1] += h3' * r[1]
        vadd.f32        q1,  q1,  q15           @ step both coefficient sets by 2*h_step
        vadd.f32        q0,  q0,  q15
        vld1.32         {q2},     [r0,:64]!     @ prefetch next 2 samples
        vld1.32         {q3},     [r1,:64]!
        vst1.32         {q8},     [r2,:64]!     @ store mixed left pair
        vst1.32         {q9},     [r3,:64]!     @ store mixed right pair
        subs            r12, r12, #2
        bgt             1b
        it              lt
        bxlt            lr                      @ count < 0: n was even, all done
2:
        @ odd tail: one last sample using the q1 coefficient set
        vmul.f32        d16, d4,  d2[0]
        vmul.f32        d18, d4,  d2[1]
        vmla.f32        d16, d6,  d3[0]
        vmla.f32        d18, d6,  d3[1]
        vst1.32         {d16},    [r2,:64]!
        vst1.32         {d18},    [r3,:64]!
        bx              lr
endfunc
273