/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8

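@ Zero one 8x8 block of 16-bit coefficients (128 bytes at r0).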
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

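@ Zero six consecutive 8x8 blocks of 16-bit coefficients starting at r0.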
function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

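@ put/avg pixel macros.  Common register usage: r0 = dst, r1 = src,
@ r2 = line size, r3 = height.  The rnd parameter selects rounding in
@ the interpolating variants; avg additionally averages the result with
@ the existing dst pixels.

@ 16xH copy (or average), four rows per loop iteration.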
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.64         {q0},     [r1], r2
        vld1.64         {q1},     [r1], r2
        vld1.64         {q2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {q3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.64         {q8},     [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {q9},     [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {q10},    [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {q11},    [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q1},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

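@ 16xH horizontal half-pel interpolation: each pixel is averaged with
@ its right neighbour, two rows per iteration.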
.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.64         {d0-d2},  [r1], r2
        vld1.64         {d4-d6},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2
  .endif
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

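@ 16xH vertical half-pel interpolation: each row is averaged with the
@ next one, two rows per iteration.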
.macro  pixels16_y2     rnd=1, avg=0
        vld1.64         {q0},     [r1], r2
        vld1.64         {q1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.64         {q0},     [r1], r2
        avg             q3,  q0,  q1
        vld1.64         {q1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

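@ 16xH diagonal half-pel interpolation: 2x2 box filter, (a+b+c+d+2)>>2
@ with rounding or (a+b+c+d+1)>>2 without.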
.macro  pixels16_xy2    rnd=1, avg=0
        vld1.64         {d0-d2},  [r1], r2
        vld1.64         {d4-d6},  [r1], r2
  .ifeq \rnd
        vmov.i16        q13, #1
  .endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
  .ifeq \rnd
        vadd.u16        q12, q12, q13
  .endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
  .ifeq \rnd
        vadd.u16        q1,  q1,  q13
  .endif
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},  [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.64         {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
  .ifeq \rnd
        vadd.u16        q12, q12, q13
  .endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
  .ifeq \rnd
        vadd.u16        q0,  q0,  q13
  .endif
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {q15},    [r0,:128], r2
        bgt             1b
        bx              lr
.endm

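@ 8xH copy (or average), four rows per loop iteration.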
.macro  pixels8         rnd=1, avg=0
1:      vld1.64         {d0},     [r1], r2
        vld1.64         {d1},     [r1], r2
        vld1.64         {d2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.64         {d4},     [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.64         {d5},     [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.64         {d6},     [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.64         {d7},     [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
  .endif
        subs            r3,  r3,  #4
        vst1.64         {d0},     [r0,:64], r2
        vst1.64         {d1},     [r0,:64], r2
        vst1.64         {d2},     [r0,:64], r2
        vst1.64         {d3},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

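@ 8xH horizontal half-pel interpolation, two rows per iteration.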
.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.64         {q0},     [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {q1},     [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.64         {d0},     [r0,:64], r2
        vst1.64         {d1},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

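@ 8xH vertical half-pel interpolation, two rows per iteration.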
.macro  pixels8_y2      rnd=1, avg=0
        vld1.64         {d0},     [r1], r2
        vld1.64         {d1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.64         {d0},     [r1], r2
        avg             d5,  d0,  d1
        vld1.64         {d1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.64         {d4},     [r0,:64], r2
        vst1.64         {d5},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

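@ 8xH diagonal half-pel interpolation (2x2 box filter), two rows per
@ iteration.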
.macro  pixels8_xy2     rnd=1, avg=0
        vld1.64         {q0},     [r1], r2
        vld1.64         {q1},     [r1], r2
  .ifeq \rnd
        vmov.i16        q11, #1
  .endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {q0},     [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
  .ifeq \rnd
        vadd.u16        q10, q10, q11
  .endif
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.64         {q1},     [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
  .ifeq \rnd
        vadd.u16        q10, q10, q11
  .endif
        vst1.64         {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7},     [r0,:64], r2
        bgt             1b
        bx              lr
.endm

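@ Instantiate one of the pixel macros above as ff_<pfx><name><suf>_neon,
@ binding the avg/shrn helpers to their rounding (vrhadd/vrshrn) or
@ truncating (vhadd/vshrn) forms according to rnd.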
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
.endm

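@ Emit both the rounding and the _no_rnd variant of a pixel function.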
.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

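@ The h264 qpel mc00 (full-pel) entry points set the block height in r3
@ and fall through into the pixels functions emitted directly below them.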
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc2        avg_, pixels8_x2,  avg=1
        pixfunc2        avg_, pixels8_y2,  avg=1
        pixfunc2        avg_, pixels8_xy2, avg=1

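@ Narrow 64 16-bit coefficients at r0 to unsigned bytes with saturation
@ and store them as an 8x8 block at r1 with line size r2.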
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.64         {d0},      [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.64         {d1},      [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.64         {d2},      [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.64         {d3},      [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.64         {d4},      [r1,:64], r2
        vst1.64         {d5},      [r1,:64], r2
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
endfunc

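@ As above, but narrow with signed saturation and add a bias of 128
@ before storing.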
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.64         {d0},      [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.64         {d1},      [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.64         {d2},      [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.64         {d3},      [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.64         {d4},      [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.64         {d5},      [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
endfunc

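@ Add 64 16-bit coefficients at r0 to the 8x8 block of pixels at r1
@ (line size r2), clamping the results to unsigned bytes.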
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.64         {d16},   [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.64         {d2},    [r3,:64], r2
        vld1.64         {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.64         {d4},    [r3,:64], r2
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.64         {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.64         {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.64         {d4},    [r3,:64], r2
        vst1.64         {d6},    [r3,:64], r2
        bx              lr
endfunc

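@ Elementwise single-precision multiply: dst (r0) = src0 (r1) * src1 (r2),
@ with the length in r3 (expected to be a multiple of 8).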
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8
        vld1.64         {d0-d3},  [r1,:128]!
        vld1.64         {d4-d7},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r3,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r0,:128]!
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vst1.64         {d16-d17},[r0,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vst1.64         {d18-d19},[r0,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r0,:128]!
        bx              lr
endfunc

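@ Windowed overlap of two float vectors: r0 = dst, r1 = src0, r2 = src1,
@ r3 = window, length on the stack.  src1 and the upper half of the
@ window are traversed in reverse.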
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
        add             r4,  r3,  r5, lsl #3
        add             ip,  r0,  r5, lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

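@ Vorbis inverse channel coupling, performed in place:
@ r0 = magnitude channel, r1 = angle channel, r2 = length.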
#if CONFIG_VORBIS_DECODER
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        it              lt
        bxlt            lr

3:      vld1.32         {d2-d3},  [r1,:128]
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
endfunc
#endif

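@ Multiply a float vector by a scalar: dst (r0) = src (r1) * scalar.
@ With a hard-float ABI (VFP) the scalar arrives in s0 and the length in
@ r2; with a soft-float ABI (NOVFP) they arrive in r2 and r3.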
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

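@ Multiply-accumulate with a scalar: dst (r0) += src (r1) * scalar.
@ Scalar and length registers again depend on the float ABI, as above.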
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32         q15, d0[0]
NOVFP   vdup.32         q15, r2
        bics            r12, len, #15
        mov             acc, r0
        beq             3f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
1:      vmla.f32        q8,  q0,  q15
        vld1.32         {q2},     [r1,:128]!
        vld1.32         {q10},    [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},     [r1,:128]!
        vld1.32         {q11},    [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},     [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vst1.32         {q10},    [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        vst1.32         {q11},    [r0,:128]!
        b               1b
2:      vst1.32         {q10},    [r0,:128]!
        vst1.32         {q11},    [r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

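@ In-place butterflies over two float vectors of length r2:
@ v0[i], v1[i] = v0[i] + v1[i], v0[i] - v1[i]  (v0 = r0, v1 = r1).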
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

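@ Dot product of two float vectors (r0, r1) of length r2.  The result is
@ returned in s0 with a hard-float ABI, or moved to r0 otherwise.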
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

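@ Multiply src0 by src1 read backwards: dst[i] = src0[i] * src1[len-1-i]
@ (r0 = dst, r1 = src0, r2 = src1, r3 = len).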
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc

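@ Multiply-add of three float vectors: dst = src0 * src1 + src2
@ (r0 = dst, r1 = src0, r2 = src1, r3 = src2, length on the stack).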
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

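@ Clamp a float vector to [min, max]: r0 = dst, r1 = src.  min, max and
@ the length come in s0, s1 and r2 with a hard-float ABI, or in r2, r3
@ and on the stack otherwise.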
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc

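@ Apply a symmetric 16-bit window using Q15 rounding multiplies
@ (vqrdmulh): r0 = dst, r1 = src, r2 = window, r3 = length.  The window
@ is read forwards and applied to the first half of the buffer directly
@ and to the second half mirrored.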
function ff_apply_window_int16_neon, export=1
        push            {r4,lr}
        add             r4,  r1,  r3,  lsl #1
        add             lr,  r0,  r3,  lsl #1
        sub             r4,  r4,  #16
        sub             lr,  lr,  #16
        mov             r12, #-16
1:
        vld1.16         {q0},     [r1,:128]!
        vld1.16         {q2},     [r2,:128]!
        vld1.16         {q1},     [r4,:128], r12
        vrev64.16       q3,  q2
        vqrdmulh.s16    q0,  q0,  q2
        vqrdmulh.s16    d2,  d2,  d7
        vqrdmulh.s16    d3,  d3,  d6
        vst1.16         {q0},     [r0,:128]!
        vst1.16         {q1},     [lr,:128], r12
        subs            r3,  r3,  #16
        bgt             1b

        pop             {r4,pc}
endfunc

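@ Clamp a 32-bit integer vector to [min, max]: r0 = dst, r1 = src,
@ r2 = min, r3 = max, length on the stack.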
function ff_vector_clip_int32_neon, export=1
        vdup.32         q0,  r2
        vdup.32         q1,  r3
        ldr             r2,  [sp]
1:
        vld1.32         {q2-q3},  [r1,:128]!
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {q2-q3},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
endfunc