/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon

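@ 8x8 byte transpose kept entirely in registers: three rounds of vtrn
@ at 32-, 16- and 8-bit granularity swap progressively finer sub-blocks,
@ avoiding any round trip through memory.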
        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
        .endm

        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp            \r0, \r4
        vswp            \r1, \r5
        vswp            \r2, \r6
        vswp            \r3, \r7
        .endm

        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
        vtrn.32         \r4, \r6
        vtrn.32         \r5, \r7
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
        vtrn.16         \r4, \r5
        vtrn.16         \r6, \r7
        .endm

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
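@ H.264 chroma MC is a 2x2 bilinear filter.  With x in r4 and y in r5,
@ the muls/rsb/sub sequence below builds the four pel weights
@     A = (8-x)*(8-y),  B = x*(8-y),  C = (8-x)*y,  D = x*y
@ so that, in scalar form, each output pixel is
@     dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride]
@               + D*src[i+stride+1] + 32) >> 6
@ (the +32 rounding comes from vrshrn #6).  Label 2: handles the
@ degenerate x*y == 0 cases, dispatching to a vertical-only (3:) or
@ horizontal-only (4:) 1-D filter.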
        .macro  h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vrshrn.u16      d16, q8,  #6
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip

        beq             4f

        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4, d5}, [r1], r2
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endfunc
        .endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
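@ 4-pixel-wide variant of the same bilinear filter.  Two source rows are
@ packed into one d-register (vtrn.32) with the A/B and C/D weights
@ likewise split across register halves, so a single vmull/vmlal covers
@ both taps of both rows; vadd.i16 then folds the halves into the four
@ 16-bit sums before narrowing.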
        .macro  h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4},     [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6},     [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.64         {d4},     [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.64         {d6},     [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
        subs            r3,  r3,  #2
        pld             [r1]
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]},  [r1], r4
        vld1.32         {d4[1]},  [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]},  [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4},     [r1], r2
        vld1.64         {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.64         {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vld1.64         {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endfunc
        .endm

        .text
        .align

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg

        /* H.264 loop filter */

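@ Common prologue for the loop filter entry points: load the int8_t
@ *tc0 argument from the stack into d24[0], then return early if alpha
@ or beta is zero, or if all four tc0 values are negative; the
@ and/ands pair folds the four sign bits together so one bxlt decides
@ the latter.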
        .macro h264_loop_filter_start
        ldr             ip,  [sp]
        tst             r2,  r2
        ldr             ip,  [ip]
        tstne           r3,  r3
        vmov.32         d24[0], ip
        and             ip,  ip,  ip, lsl #16
        bxeq            lr
        ands            ip,  ip,  ip, lsl #8
        bxlt            lr
        .endm

        .macro align_push_regs
        and             ip,  sp,  #15
        add             ip,  ip,  #32
        sub             sp,  sp,  ip
        vst1.64         {d12-d15}, [sp,:128]
        sub             sp,  sp,  #32
        vst1.64         {d8-d11},  [sp,:128]
        .endm

        .macro align_pop_regs
        vld1.64         {d8-d11},  [sp,:128]!
        vld1.64         {d12-d15}, [sp,:128], ip
        .endm

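@ Luma deblocking of 16 pixels per invocation, following the standard
@ filter: pixels are touched only where
@     |p0-q0| < alpha  &&  |p1-p0| < beta  &&  |q1-q0| < beta
@ and rows whose tc0 is negative are masked off (vclt.s8/vbic).  The
@ core update is delta = clip((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc)
@ where tc = tc0 plus one for each of |p2-p0| < beta and |q2-q0| < beta;
@ those two conditions also gate the p1/q1 updates (vbsl).  On entry
@ q8/q9/q10 = p0/p1/p2 and q0/q1/q2 = q0/q1/q2; filtered p1,p0,q0,q1
@ come back in q4, q8, q0, q5.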
        .macro h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6
        vand            q5,  q5,  q6
        vand            q12, q12, q6
        vrhadd.u8       q14, q8,  q0
        vsub.i8         q6,  q12, q4
        vqadd.u8        q7,  q9,  q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6,  q6,  q5
        vhadd.u8        q14, q2,  q14
        vmin.u8         q7,  q7,  q10
        vqsub.u8        q11, q9,  q12
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11
        vmovl.u8        q10, d1
        vsubw.u8        q2,  q2,  d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9
        vbsl            q5,  q14, q1
        vneg.s8         q7,  q6
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11
        vqmovun.s16     d1,  q12
        .endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.64         {d0, d1},  [r0,:128], r1
        vld1.64         {d2, d3},  [r0,:128], r1
        vld1.64         {d4, d5},  [r0,:128], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d20,d21}, [r0,:128], r1
        vld1.64         {d18,d19}, [r0,:128], r1
        vld1.64         {d16,d17}, [r0,:128], r1

        align_push_regs

        h264_loop_filter_luma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d8, d9},  [r0,:128], r1
        vst1.64         {d16,d17}, [r0,:128], r1
        vst1.64         {d0, d1},  [r0,:128], r1
        vst1.64         {d10,d11}, [r0,:128]

        align_pop_regs
        bx              lr
        .endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #4
        vld1.64         {d6},  [r0], r1
        vld1.64         {d20}, [r0], r1
        vld1.64         {d18}, [r0], r1
        vld1.64         {d16}, [r0], r1
        vld1.64         {d0},  [r0], r1
        vld1.64         {d2},  [r0], r1
        vld1.64         {d4},  [r0], r1
        vld1.64         {d26}, [r0], r1
        vld1.64         {d7},  [r0], r1
        vld1.64         {d21}, [r0], r1
        vld1.64         {d19}, [r0], r1
        vld1.64         {d17}, [r0], r1
        vld1.64         {d1},  [r0], r1
        vld1.64         {d3},  [r0], r1
        vld1.64         {d5},  [r0], r1
        vld1.64         {d27}, [r0], r1

        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        align_push_regs
        sub             sp,  sp,  #16
        vst1.64         {d4, d5},  [sp,:128]
        sub             sp,  sp,  #16
        vst1.64         {d20,d21}, [sp,:128]

        h264_loop_filter_luma

        vld1.64         {d20,d21}, [sp,:128]!
        vld1.64         {d4, d5},  [sp,:128]!

        transpose_8x8   q3, q10, q4, q8, q0, q5, q2, q13

        sub             r0,  r0,  r1, lsl #4
        vst1.64         {d6},  [r0], r1
        vst1.64         {d20}, [r0], r1
        vst1.64         {d8},  [r0], r1
        vst1.64         {d16}, [r0], r1
        vst1.64         {d0},  [r0], r1
        vst1.64         {d10}, [r0], r1
        vst1.64         {d4},  [r0], r1
        vst1.64         {d26}, [r0], r1
        vst1.64         {d7},  [r0], r1
        vst1.64         {d21}, [r0], r1
        vst1.64         {d9},  [r0], r1
        vst1.64         {d17}, [r0], r1
        vst1.64         {d1},  [r0], r1
        vst1.64         {d11}, [r0], r1
        vst1.64         {d5},  [r0], r1
        vst1.64         {d27}, [r0], r1

        align_pop_regs
        bx              lr
        .endfunc

        .macro h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16
        vsli.16         d24, d24, #8
        vshl.i16        q2,  q2,  #2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2
        vdup.8          d22, r3         @ beta
        vclt.s8         d25, d24, #0
        vrshrn.i16      d4,  q2,  #3
        vclt.u8         d28, d28, d22   @ < beta
        vbic            d26, d26, d25
        vclt.u8         d30, d30, d22   @ < beta
        vand            d26, d26, d28
        vneg.s8         d25, d24
        vand            d26, d26, d30
        vmin.s8         d4,  d4,  d24
        vmovl.u8        q14, d16
        vand            d4,  d4,  d26
        vmax.s8         d4,  d4,  d25
        vmovl.u8        q11, d0
        vaddw.s8        q14, q14, d4
        vsubw.s8        q11, q11, d4
        vqmovun.s16     d16, q14
        vqmovun.s16     d0,  q11
        .endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d18}, [r0,:64], r1
        vld1.64         {d16}, [r0,:64], r1
        vld1.64         {d0},  [r0,:64], r1
        vld1.64         {d2},  [r0,:64]

        h264_loop_filter_chroma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d16}, [r0,:64], r1
        vst1.64         {d0},  [r0,:64], r1

        bx              lr
        .endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #2
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0,  r0,  r1, lsl #3
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
        .endfunc

        /* H.264 qpel MC */

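@ Quarter-pel luma interpolation uses the 6-tap filter
@ (1, -5, 20, 20, -5, 1).  lowpass_const loads 5 and 20 into d6 so the
@ taps can be applied with scalar vmla.i16/vmls.i16 (d6[1] = 20,
@ d6[0] = 5); 1-D passes round and narrow with (x+16)>>5 (vqrshrun #5),
@ while the 2-D hv path keeps full 16-bit intermediates and rounds once
@ at the end with >>10 (lowpass_8.16).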
        .macro  lowpass_const r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
        .endm

        .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
        t0 .req q0
        t1 .req q8
.else
        t0 .req \d0
        t1 .req \d1
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
.endif
        .unreq  t0
        .unreq  t1
        .endm

        .macro  lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
        t0 .req q0
.else
        t0 .req \d0
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
.endif
        .unreq  t0
        .endm

        .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7
        vaddl.s16       q0,  \h0, \h1
        vaddl.s16       q8,  \l0, \l1

        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15

        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
        .endm

function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             ip,  #16
        mov             r3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4
        add             r1,  r1,  #8
        mov             ip,  #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon
        .endfunc

function put_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             ip,  #16
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r1,  r1,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        mov             ip,  #16
        pop             {lr}
        .endfunc

function put_h264_qpel8_h_lowpass_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
        vst1.64         {d0},     [r0,:64], r3
        vst1.64         {d16},    [r0,:64], r3
        bne             1b
        bx              lr
        .endfunc

function put_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             ip,  #16
        bl              put_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             ip,  #16
        pop             {lr}
        .endfunc

function put_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        vld1.64         {d28},     [r3], r2
        vld1.64         {d29},     [r3], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
        .endfunc

function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon
        .endfunc

function put_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        .endfunc

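@ Vertical filtering reuses the horizontal lowpass: 13 rows are loaded
@ (8 outputs + 5 extra for the 6-tap support), transposed so columns
@ become rows, run through lowpass_8, and transposed back before the
@ stores.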
function put_h264_qpel8_v_lowpass_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d22}, [r0,:64], r2
        vst1.64         {d24}, [r0,:64], r2
        vst1.64         {d26}, [r0,:64], r2
        vst1.64         {d28}, [r0,:64], r2

        bx              lr
        .endfunc

function put_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4
        sub             ip,  ip,  r2, lsl #4
        add             r0,  r0,  #8
        add             ip,  ip,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        .endfunc

function put_h264_qpel8_v_lowpass_l2_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        vld1.64         {d0},  [ip], r2
        vld1.64         {d1},  [ip], r2
        vld1.64         {d2},  [ip], r2
        vld1.64         {d3},  [ip], r2
        vld1.64         {d4},  [ip], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.64         {d5},  [ip], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.64         {d11}, [ip], r2

        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d1},  [r0,:64], r3
        vrhadd.u8       q5,  q5,  q13
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3

        bx              lr
        .endfunc

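@ 2-D (hv) filter core: a horizontal pass over 13 rows writes
@ un-narrowed 16-bit results to the scratch buffer in r4; the buffer is
@ then read back bottom-up (negative stride in ip), transposed with
@ swap4/transpose16_4x4, and fed through the 16-bit vertical pass
@ lowpass_8.16, leaving the 8x8 result in d8-d15.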
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip
        mov             ip,  #12
1:      vld1.64         {d0, d1},  [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!
        bne             1b

        vld1.64         {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        mov             ip,  #-16
        add             r4,  r4,  ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9},  [r4,:128], ip
        vld1.64         {d6, d7},  [r4,:128], ip
        vld1.64         {d4, d5},  [r4,:128], ip
        vld1.64         {d2, d3},  [r4,:128], ip
        vld1.64         {d0, d1},  [r4,:128]

        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        vst1.64         {d30,d31}, [r4,:128]!
        vst1.64         {d6, d7},  [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5},  [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3},  [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
        vst1.64         {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
        .endfunc

function put_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
        vst1.64         {d12},     [r0,:64], r2
        vst1.64         {d13},     [r0,:64], r2
        vst1.64         {d14},     [r0,:64], r2
        vst1.64         {d15},     [r0,:64], r2
        vst1.64         {d8},      [r0,:64], r2
        vst1.64         {d9},      [r0,:64], r2
        vst1.64         {d10},     [r0,:64], r2
        vst1.64         {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
        .endfunc

function put_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        vld1.64         {d0, d1},  [r2,:128]!
        vld1.64         {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.64         {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.64         {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4

        vst1.64         {d0},      [r0,:64], r3
        vrhadd.u8       q3,  q3,  q5
        vst1.64         {d1},      [r0,:64], r3
        vst1.64         {d2},      [r0,:64], r3
        vst1.64         {d3},      [r0,:64], r3
        vst1.64         {d4},      [r0,:64], r3
        vst1.64         {d5},      [r0,:64], r3
        vst1.64         {d6},      [r0,:64], r3
        vst1.64         {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
        .endfunc

function put_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               put_h264_qpel8_hv_lowpass_neon
        .endfunc

function put_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        sub             r2,  r4,  #256
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r3, lsl #4
        add             r0,  r0,  #8
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               put_h264_qpel8_hv_lowpass_l2_neon
        .endfunc

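@ Public entry points.  ff_put_h264_qpel8_mcXY computes the (X, Y)
@ quarter-pel position: pure half-pel positions run a single lowpass
@ pass, while quarter-pel positions average the lowpass output with the
@ neighbouring integer or half-pel plane through the *_l2 helpers
@ (vrhadd).  The mcX1/mcX3 and mc1Y/mc3Y variants mostly re-enter the
@ shared mc11/mc12/mc21 bodies with an adjusted source pointer.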
function ff_put_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1
        sub             r1,  r1,  #2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_neon
        .endfunc

function ff_put_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             ip,  r1
put_h264_qpel8_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc

function ff_put_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r2, lr}
put_h264_qpel8_mc11:
        lowpass_const   r3
        sub             sp,  sp,  #64
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  [sp, #128]
        mov             r3,  r2
        add             ip,  sp,  #64
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  sp,  #76
        pop             {pc}
        .endfunc

function ff_put_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r0,  sp
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             r2,  r4,  #64
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r2, lr}
        sub             r1,  r1,  #1
        b               put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc

function ff_put_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        mov             r2,  #8
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        sub             r2,  r4,  #64
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  #1
        b               put_h264_qpel8_mc12
        .endfunc

function ff_put_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             ip,  r1,  r2
        b               put_h264_qpel8_mc01
        .endfunc

function ff_put_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r2, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel8_mc21
        .endfunc

function ff_put_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r2, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1
        sub             r1,  r1,  #2
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        b               put_h264_qpel16_h_lowpass_neon
        .endfunc

function ff_put_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             ip,  r1
put_h264_qpel16_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, lr}
put_h264_qpel16_mc11:
        lowpass_const   r3
        sub             sp,  sp,  #256
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #16
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        add             r0,  sp,  #256
        ldrd            r0,  [r0, #64]
        mov             r3,  r2
        add             ip,  sp,  #64
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #16
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  sp,  #(256+8)
        pop             {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4-r5, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, lr}
        sub             r1,  r1,  #1
        b               put_h264_qpel16_mc11
        .endfunc

function ff_put_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4-r5, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1
        b               put_h264_qpel16_mc12
        .endfunc

function ff_put_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             ip,  r1,  r2
        b               put_h264_qpel16_mc01
        .endfunc

function ff_put_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel16_mc11
        .endfunc

function ff_put_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel16_mc21
        .endfunc

function ff_put_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               put_h264_qpel16_mc11
        .endfunc

@ Biweighted prediction

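@ Implements the bidirectional weighting, matching the C reference:
@     offset = ((offset + 1) | 1) << log2_denom
@     dst[i] = clip((src0[i]*w0 + src1[i]*w1 + offset) >> (log2_denom + 1))
@ r3 carries log2_denom; q8 holds the pre-shifted offset term (which
@ also provides the rounding bias) and q9 the negated shift count for
@ vshl.s16.  ip counts rows and r6 is the store pointer.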
        .macro  biweight_16 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q2,  q8
        vmov            q3,  q8
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r2
        \macd           q2,  d0,  d20
        pld             [r0]
        \macd           q3,  d0,  d21
        vld1.8          {d22-d23},[r1,:128], r2
        \macs           q2,  d1,  d22
        pld             [r1]
        \macs           q3,  d1,  d23
        vmov            q12, q8
        vld1.8          {d28-d29},[r0,:128], r2
        vmov            q13, q8
        \macd           q12, d0,  d28
        pld             [r0]
        \macd           q13, d0,  d29
        vld1.8          {d30-d31},[r1,:128], r2
        \macs           q12, d1,  d30
        pld             [r1]
        \macs           q13, d1,  d31
        vshl.s16        q2,  q2,  q9
        vshl.s16        q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3,  q8
        vst1.8          {d4- d5}, [r6,:128], r2
        vmov            q2,  q8
        vst1.8          {d24-d25},[r6,:128], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_8 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.8          {d5},[r1,:64], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        vld1.8          {d6},[r0,:64], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.8          {d7},[r1,:64], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.8          {d2},[r6,:64], r2
        vmov            q1,  q8
        vst1.8          {d4},[r6,:64], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_4 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r2
        vld1.32         {d4[1]},[r0,:32], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.32         {d5[0]},[r1,:32], r2
        vld1.32         {d5[1]},[r1,:32], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r2
        vld1.32         {d6[1]},[r0,:32], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.32         {d7[0]},[r1,:32], r2
        vld1.32         {d7[1]},[r1,:32], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        vmov            q1,  q8
        vst1.32         {d4[0]},[r6,:32], r2
        vst1.32         {d4[1]},[r6,:32], r2
        bne             1b
        pop             {r4-r6, pc}
2:      vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        pop             {r4-r6, pc}
        .endm

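@ vmlal/vmlsl cannot take a negative scalar weight directly, so
@ biweight_func instantiates the loop four times (labels 10:-40:), one
@ per sign combination, negating the weights as needed and picking
@ vmlal or vmlsl for each operand; lr is derived from the two sign bits
@ to select the variant.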
        .macro  biweight_func w
function biweight_h264_pixels_\w\()_neon
        push            {r4-r6, lr}
        add             r4,  sp,  #16
        ldm             r4,  {r4-r6}
        lsr             lr,  r4,  #31
        add             r6,  r6,  #1
        eors            lr,  lr,  r5,  lsr #30
        orr             r6,  r6,  #1
        vdup.16         q9,  r3
        lsl             r6,  r6,  r3
        vmvn            q9,  q9
        vdup.16         q8,  r6
        mov             r6,  r0
        beq             10f
        subs            lr,  lr,  #1
        beq             20f
        subs            lr,  lr,  #1
        beq             30f
        b               40f
10:     biweight_\w     vmlal.u8, vmlal.u8
20:     rsb             r4,  r4,  #0
        biweight_\w     vmlal.u8, vmlsl.u8
30:     rsb             r4,  r4,  #0
        rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlsl.u8
40:     rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlal.u8
        .endfunc
        .endm

        .macro  biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               biweight_h264_pixels_\w\()_neon
.endif
        .endfunc
        .endm

        biweight_entry  16, 8
        biweight_entry  16, 16, b=0
        biweight_func   16

        biweight_entry  8,  16
        biweight_entry  8,  4
        biweight_entry  8,  8,  b=0
        biweight_func   8

        biweight_entry  4,  8
        biweight_entry  4,  2
        biweight_entry  4,  4,  b=0
        biweight_func   4

@ Weighted prediction

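@ Unidirectional weighting, roughly
@     dst[i] = clip(((src[i]*w + 2^(log2_denom-1)) >> log2_denom) + offset)
@ q8 holds offset << log2_denom so the offset folds into the sum before
@ one rounding shift (vrshl.s16 by a negative count in q9).  For
@ log2_denom > 1 the halving vhadd/vhsub is used when combining, with
@ the shift shortened by one, keeping the 16-bit sums from overflowing;
@ negative weights negate w and switch to the subtracting variant.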
        .macro  weight_16 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r1
        vmull.u8        q2,  d0,  d20
        pld             [r0]
        vmull.u8        q3,  d0,  d21
        vld1.8          {d28-d29},[r0,:128], r1
        vmull.u8        q12, d0,  d28
        pld             [r0]
        vmull.u8        q13, d0,  d29
        \add            q2,  q8,  q2
        vrshl.s16       q2,  q2,  q9
        \add            q3,  q8,  q3
        vrshl.s16       q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        \add            q12, q8,  q12
        vrshl.s16       q12, q12, q9
        \add            q13, q8,  q13
        vrshl.s16       q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vst1.8          {d4- d5}, [r4,:128], r1
        vst1.8          {d24-d25},[r4,:128], r1
        bne             1b
        pop             {r4, pc}
        .endm

        .macro  weight_8 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        vld1.8          {d6},[r0,:64], r1
        vmull.u8        q10, d0,  d6
        \add            q1,  q8,  q1
        pld             [r0]
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.8          {d2},[r4,:64], r1
        vst1.8          {d4},[r4,:64], r1
        bne             1b
        pop             {r4, pc}
        .endm

        .macro  weight_4 add
        vdup.8          d0,  r3
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r1
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r1
        vld1.32         {d6[1]},[r0,:32], r1
        vmull.u8        q10, d0,  d6
        pld             [r0]
        \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        vmov            q1,  q8
        vst1.32         {d4[0]},[r4,:32], r1
        vst1.32         {d4[1]},[r4,:32], r1
        bne             1b
        pop             {r4, pc}
2:      \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
        .endm

        .macro  weight_func w
function weight_h264_pixels_\w\()_neon
        push            {r4, lr}
        ldr             r4,  [sp, #8]
        cmp             r2,  #1
        lsl             r4,  r4,  r2
        vdup.16         q8,  r4
        mov             r4,  r0
        ble             20f
        rsb             lr,  r2,  #1
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vhadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vhsub.s16
20:     rsb             lr,  r2,  #0
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vsub.s16
        .endfunc
        .endm

        .macro  weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               weight_h264_pixels_\w\()_neon
.endif
        .endfunc
        .endm

        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16

        weight_entry    8,  16
        weight_entry    8,  4
        weight_entry    8,  8,  b=0
        weight_func     8

        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4