/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8
        .fpu neon
        .text

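@ pixels16: copy a 16-pixel wide block, four rows per iteration; with
@ avg=1 the new rows are rounding-averaged (vrhadd.u8) with the existing
@ destination contents.
@ r0 = dst (block), r1 = src (pixels), r2 = line_size, r3 = h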
        .macro pixels16 avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

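@ pixels16_x2: as pixels16, but each output pixel is the average of a
@ source pixel and its right-hand neighbour; \vhadd selects rounding
@ (vrhadd.u8) or truncating (vhadd.u8) averaging.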
        .macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

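@ pixels16_y2: as pixels16, but each output row is the average of two
@ consecutive source rows.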
        .macro pixels16_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2
        lsl             lr,  r2,  #1
        vld1.64         {d0, d1},  [r1], lr
        vld1.64         {d2, d3},  [ip], lr
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1],      lr
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [ip],      lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        pop             {pc}
        .endm

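@ pixels16_xy2: half-pel interpolation in both directions; each output
@ pixel is the sum of a 2x2 source neighbourhood divided by four with
@ \vshrn.  The rounding variant uses vrshrn.u16 (effectively adding 2
@ before the shift); the no_rnd variant uses a plain vshrn.u16 with a
@ bias of 1 added first.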
        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1
        add             ip,  r1,  r2
        vld1.64         {d0-d2},   [r1], lr
        vld1.64         {d4-d6},   [ip], lr
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [ip]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], lr
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [ip], lr
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        pop             {pc}
        .endm

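@ 8-pixel wide variants of the macros above, working on D instead of
@ Q registers.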
        .macro pixels8
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2
        lsl             lr,  r2,  #1
        vld1.64         {d0},      [r1], lr
        vld1.64         {d1},      [ip], lr
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0},      [r1],     lr
        \vhadd          d5,  d0,  d1
        vld1.64         {d1},      [ip],     lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4},      [r0,:64], r2
        vst1.64         {d5},      [r0,:64], r2
        bne             1b
        pop             {pc}
        .endm

        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1
        add             ip,  r1,  r2
        vld1.64         {d0, d1},  [r1], lr
        vld1.64         {d2, d3},  [ip], lr
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [ip]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], lr
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [ip], lr
        vadd.u16        q10, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5},      [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7},      [r0,:64], r2
        bgt             1b
        pop             {pc}
        .endm

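@ pixfunc/pixfunc2: instantiate one of the macros above as an exported
@ ff_<pfx><name><suf>_neon function; pixfunc2 emits both the rounding
@ and the _no_rnd variant.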
        .macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
        .endfunc
        .endm

        .macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
        .endm

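@ The full-pel h264 qpel mc00 functions only set the block height in r3
@ and fall through into the matching pixels copy function emitted below.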
function ff_put_h264_qpel16_mc00_neon, export=1
        mov   r3, #16
        .endfunc

        pixfunc  put_ pixels16
        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov   r3, #16
        .endfunc

        pixfunc  avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov   r3, #8
        .endfunc

        pixfunc  put_ pixels8
        pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1

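@ float_to_int16: r0 = dst (int16_t *), r1 = src (const float *),
@ r2 = len (assumed to be a multiple of 8).  Floats are converted with a
@ fixed-point vcvt (16 fraction bits) and narrowed with vshrn.s32 #16,
@ 16 samples per iteration of the main loop.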
function ff_float_to_int16_neon, export=1
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vshrn.s32       d4,  q8,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vld1.64         {d16-d17},[r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bx              lr
3:      vshrn.s32       d4,  q8,  #16
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        bx              lr
        .endfunc

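@ float_to_int16_interleave: r0 = dst, r1 = array of per-channel source
@ pointers, r2 = len per channel, r3 = number of channels.  One channel
@ falls back to ff_float_to_int16_neon; two channels are interleaved
@ with vsri; for three or more channels the interleaved output stride is
@ ip = 2*channels bytes and the channels are handled four, then two,
@ then one at a time.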
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3, #2
        ldrlt           r1, [r1]
        blt             ff_float_to_int16_neon
        bne             4f

        ldr             r3, [r1]
        ldr             r1, [r1, #4]

        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17},[r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vsri.32         q10, q8,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
3:      vsri.32         q10, q8,  #16
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr

4:      push            {r4-r8,lr}
        cmp             r3,  #4
        lsl             ip,  r3,  #1
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}
        mov             lr,  r2
        mov             r8,  r0
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #8
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5},  [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22
        vld1.64         {d6-d7},  [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18},    [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22},    [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19},    [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23},    [r8], ip
        vzip.32         d3,  d7
        beq             7f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},     [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},     [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},     [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},     [r8], ip
        b               6b
7:      vst1.64         {d2},     [r8], ip
        vst1.64         {d6},     [r8], ip
        vst1.64         {d3},     [r8], ip
        vst1.64         {d7},     [r8], ip
        subs            r3,  r3,  #4
        popeq           {r4-r8,pc}
        cmp             r3,  #4
        add             r0,  r0,  #8
        bge             5b

        @ 2 channels
4:      cmp             r3,  #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr,  r2
        mov             r8,  r0
        tst             lr,  #8
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr,  lr,  #8
        beq             7f
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #16
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r5,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vsri.32         d2,  d0,  #16
        vst1.32         {d19[1]}, [r8], ip
        vsri.32         d3,  d1,  #16
        vst1.32         {d22[0]}, [r8], ip
        vsri.32         d6,  d4,  #16
        vst1.32         {d22[1]}, [r8], ip
        vsri.32         d7,  d5,  #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        beq             6f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
8:      subs            r3,  r3,  #2
        add             r0,  r0,  #4
        popeq           {r4-r8,pc}

        @ 1 channel
4:      ldr             r4,  [r1],#4
        tst             r2,  #8
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f
6:      subs            lr,  lr,  #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
7:      vst1.16         {d4[1]},  [r5,:16], ip
        vst1.16         {d4[3]},  [r5,:16], ip
        vst1.16         {d5[1]},  [r5,:16], ip
        vst1.16         {d5[3]},  [r5,:16], ip
        vst1.16         {d6[1]},  [r5,:16], ip
        vst1.16         {d6[3]},  [r5,:16], ip
        vst1.16         {d7[1]},  [r5,:16], ip
        vst1.16         {d7[3]},  [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr,  lr,  #8
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
        .endfunc

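@ vector_fmul: r0 = dst (also the first input), r1 = src, r2 = len;
@ computes dst[i] *= src[i] for len floats (len a multiple of 8).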
function ff_vector_fmul_neon, export=1
        mov             r3,  r0
        subs            r2,  r2,  #8
        vld1.64         {d0-d3},  [r0,:128]!
        vld1.64         {d4-d7},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r3,:128]!
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r3,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vst1.64         {d16-d17},[r3,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vst1.64         {d18-d19},[r3,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r3,:128]!
        bx              lr
        .endfunc

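@ vector_fmul_window: r0 = dst, r1 = src0, r2 = src1, r3 = win,
@ [sp] = add_bias (broadcast into q8 before the push), [sp+4] = len.
@ src0 is read forwards and src1 backwards, each multiplied by the
@ window from opposite ends; the biased results are written to dst from
@ both ends towards the middle, four samples per end per iteration.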
function ff_vector_fmul_window_neon, export=1
        vld1.32         {d16[],d17[]}, [sp,:32]
        push            {r4,r5,lr}
        ldr             lr,  [sp, #16]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
        add             r4,  r3,  r5, lsl #3
        add             ip,  r0,  r5, lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmov            q11, q8
        vmla.f32        d22, d0,  d4
        vmov            q10, q8
        vmla.f32        d23, d1,  d5
        vrev64.32       q3,  q3
        vmla.f32        d20, d0,  d7
        vrev64.32       q1,  q1
        vmla.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
        .endfunc