/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"
#include "neon.S"

        /* H.264 loop filter */

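@ Common prologue for the deblocking functions below: r0 = pixel pointer,
@ r1 = stride, r2 = alpha, r3 = beta, and the stack argument points to the
@ four tc0 values for this edge.  The tc0 bytes are loaded into d24[0];
@ the function returns early if alpha or beta is zero, or if all four tc0
@ values are negative (filtering disabled for the whole edge).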
.macro  h264_loop_filter_start
        ldr             r12, [sp]
        tst             r2,  r2
        ldr             r12, [r12]
        it              ne
        tstne           r3,  r3
        vmov.32         d24[0], r12
        and             r12, r12, r12, lsl #16
        it              eq
        bxeq            lr
        ands            r12, r12, r12, lsl #8
        it              lt
        bxlt            lr
.endm

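@ Filters one edge of 16 pixels with p2/p1/p0 in q10/q9/q8 and q0/q1/q2 in
@ q0/q1/q2.  This follows the usual luma deblock structure: the filter mask
@ is built from |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta, p1/q1
@ are conditionally smoothed when |p2-p0| resp. |q2-q0| is below beta, and
@ the p0/q0 delta ((q0-p0)*4 + p1 - q1 + 4) >> 3 is clamped to the
@ per-pixel tc before being applied with saturation.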
.macro  h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6
        vand            q5,  q5,  q6
        vand            q12, q12, q6
        vrhadd.u8       q14, q8,  q0
        vsub.i8         q6,  q12, q4
        vqadd.u8        q7,  q9,  q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6,  q6,  q5
        vhadd.u8        q14, q2,  q14
        vmin.u8         q7,  q7,  q10
        vqsub.u8        q11, q9,  q12
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11
        vmovl.u8        q10, d1
        vsubw.u8        q2,  q2,  d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9
        vbsl            q5,  q14, q1
        vneg.s8         q7,  q6
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11
        vqmovun.s16     d1,  q12
.endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.8          {d0, d1},  [r0,:128], r1
        vld1.8          {d2, d3},  [r0,:128], r1
        vld1.8          {d4, d5},  [r0,:128], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  r1, lsl #1
        vld1.8          {d20,d21}, [r0,:128], r1
        vld1.8          {d18,d19}, [r0,:128], r1
        vld1.8          {d16,d17}, [r0,:128], r1

        vpush           {d8-d15}

        h264_loop_filter_luma

        sub             r0,  r0,  r1, lsl #1
        vst1.8          {d8, d9},  [r0,:128], r1
        vst1.8          {d16,d17}, [r0,:128], r1
        vst1.8          {d0, d1},  [r0,:128], r1
        vst1.8          {d10,d11}, [r0,:128]

        vpop            {d8-d15}
        bx              lr
endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #4
        vld1.8          {d6},  [r0], r1
        vld1.8          {d20}, [r0], r1
        vld1.8          {d18}, [r0], r1
        vld1.8          {d16}, [r0], r1
        vld1.8          {d0},  [r0], r1
        vld1.8          {d2},  [r0], r1
        vld1.8          {d4},  [r0], r1
        vld1.8          {d26}, [r0], r1
        vld1.8          {d7},  [r0], r1
        vld1.8          {d21}, [r0], r1
        vld1.8          {d19}, [r0], r1
        vld1.8          {d17}, [r0], r1
        vld1.8          {d1},  [r0], r1
        vld1.8          {d3},  [r0], r1
        vld1.8          {d5},  [r0], r1
        vld1.8          {d27}, [r0], r1

        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        vpush           {d8-d15}

        h264_loop_filter_luma

        transpose_4x4   q4, q8, q0, q5

        sub             r0,  r0,  r1, lsl #4
        add             r0,  r0,  #2
        vst1.32         {d8[0]},  [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d10[0]}, [r0], r1
        vst1.32         {d8[1]},  [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d10[1]}, [r0], r1
        vst1.32         {d9[0]},  [r0], r1
        vst1.32         {d17[0]}, [r0], r1
        vst1.32         {d1[0]},  [r0], r1
        vst1.32         {d11[0]}, [r0], r1
        vst1.32         {d9[1]},  [r0], r1
        vst1.32         {d17[1]}, [r0], r1
        vst1.32         {d1[1]},  [r0], r1
        vst1.32         {d11[1]}, [r0], r1

        vpop            {d8-d15}
        bx              lr
endfunc

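@ Chroma variant of the filter above, working on 8 pixels with p1/p0 in
@ d18/d16 and q0/q1 in d0/d2.  Only p0 and q0 are changed; the same
@ ((q0-p0)*4 + p1 - q1 + 4) >> 3 delta is computed and clamped per pixel
@ before being added to p0 and subtracted from q0 with saturation.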
.macro  h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16
        vsli.16         d24, d24, #8
        vshl.i16        q2,  q2,  #2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2
        vdup.8          d22, r3         @ beta
        vrshrn.i16      d4,  q2,  #3
        vclt.u8         d28, d28, d22   @ < beta
        vclt.u8         d30, d30, d22   @ < beta
        vmin.s8         d4,  d4,  d24
        vneg.s8         d25, d24
        vand            d26, d26, d28
        vmax.s8         d4,  d4,  d25
        vand            d26, d26, d30
        vmovl.u8        q11, d0
        vand            d4,  d4,  d26
        vmovl.u8        q14, d16
        vaddw.s8        q14, q14, d4
        vsubw.s8        q11, q11, d4
        vqmovun.s16     d16, q14
        vqmovun.s16     d0,  q11
.endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  r1, lsl #1
        vld1.8          {d18}, [r0,:64], r1
        vld1.8          {d16}, [r0,:64], r1
        vld1.8          {d0},  [r0,:64], r1
        vld1.8          {d2},  [r0,:64]

        h264_loop_filter_chroma

        sub             r0,  r0,  r1, lsl #1
        vst1.8          {d16}, [r0,:64], r1
        vst1.8          {d0},  [r0,:64], r1

        bx              lr
endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #2
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0,  r0,  r1, lsl #3
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
endfunc

        /* H.264 qpel MC */

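@ The quarter-pel functions are built around the 6-tap half-pel filter
@ (1, -5, 20, 20, -5, 1).  lowpass_const places the constants 5 and 20 in
@ the 16-bit lanes d6[0] and d6[1] so the macros below can use them as
@ vmla/vmls scalar operands; lowpass_8 filters two rows of 8 pixels and,
@ with narrow=1, rounds and narrows the result back to bytes ((x+16) >> 5).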
.macro  lowpass_const   r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
.endm

.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
  .if \narrow
        t0 .req q0
        t1 .req q8
  .else
        t0 .req \d0
        t1 .req \d1
  .endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
  .if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
  .endif
        .unreq  t0
        .unreq  t1
.endm

.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
  .if \narrow
        t0 .req q0
  .else
        t0 .req \d0
  .endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
  .if \narrow
        vqrshrun.s16    \d0, t0,  #5
  .endif
        .unreq  t0
.endm

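@ Second (vertical) pass of the 2D hv filter: the inputs are 16-bit
@ intermediates from the horizontal pass, so the taps are applied with
@ shifts and adds in 32 bits (x*20 = x*16 + x*4, x*5 = x*4 + x) and the
@ final rounding shift is by 10 before narrowing back to bytes.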
.macro  lowpass_8.16    r0,  r1,  l0,  h0,  l1,  h1,  d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7
        vaddl.s16       q0,  \h0, \h1
        vaddl.s16       q8,  \l0, \l1

        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15

        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
.endm

function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             r12, #16
        mov             r3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4
        add             r1,  r1,  #8
        mov             r12, #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon
endfunc

.macro  h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r1,  r1,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        mov             r12, #16
        pop             {lr}
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.8          {d0, d1},  [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        subs            r12, r12, #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
  .ifc \type,avg
        vld1.8          {d2},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       d16, d16, d3
        sub             r0,  r0,  r3
  .endif
        vst1.8          {d0},     [r0,:64], r3
        vst1.8          {d16},    [r0,:64], r3
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

.macro  h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             r12, #16
        pop             {lr}
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.8          {d0, d1},  [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        vld1.8          {d28},     [r3], r2
        vld1.8          {d29},     [r3], r2
        subs            r12, r12, #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14
  .ifc \type,avg
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},      [r0,:64]
        vrhadd.u8       d1,  d1,  d3
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d0},      [r0,:64], r2
        vst1.8          {d1},      [r0,:64], r2
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg

function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon
endfunc

.macro  h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        vld1.8          {d8},  [r1], r3
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        vld1.8          {d9},  [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

  .ifc \type,avg
        vld1.8          {d9},  [r0,:64], r2
        vrhadd.u8       d8,  d8,  d9
        vld1.8          {d11}, [r0,:64], r2
        vrhadd.u8       d10, d10, d11
        vld1.8          {d13}, [r0,:64], r2
        vrhadd.u8       d12, d12, d13
        vld1.8          {d15}, [r0,:64], r2
        vrhadd.u8       d14, d14, d15
        vld1.8          {d23}, [r0,:64], r2
        vrhadd.u8       d22, d22, d23
        vld1.8          {d25}, [r0,:64], r2
        vrhadd.u8       d24, d24, d25
        vld1.8          {d27}, [r0,:64], r2
        vrhadd.u8       d26, d26, d27
        vld1.8          {d29}, [r0,:64], r2
        vrhadd.u8       d28, d28, d29
        sub             r0,  r0,  r2,  lsl #3
  .endif

        vst1.8          {d8},  [r0,:64], r2
        vst1.8          {d10}, [r0,:64], r2
        vst1.8          {d12}, [r0,:64], r2
        vst1.8          {d14}, [r0,:64], r2
        vst1.8          {d22}, [r0,:64], r2
        vst1.8          {d24}, [r0,:64], r2
        vst1.8          {d26}, [r0,:64], r2
        vst1.8          {d28}, [r0,:64], r2

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

.macro  h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r12, r12, r2, lsl #4
        add             r0,  r0,  #8
        add             r12, r12, #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.8          {d8},  [r1], r3
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        vld1.8          {d9},  [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        vld1.8          {d0},  [r12], r2
        vld1.8          {d1},  [r12], r2
        vld1.8          {d2},  [r12], r2
        vld1.8          {d3},  [r12], r2
        vld1.8          {d4},  [r12], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.8          {d5},  [r12], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.8          {d10}, [r12], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.8          {d11}, [r12], r2
        vrhadd.u8       q5,  q5,  q13

  .ifc \type,avg
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d2,  d2,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d3,  d3,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d4,  d4,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d5,  d5,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d10, d10, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d11, d11, d17
        sub             r0,  r0,  r3,  lsl #3
  .endif

        vst1.8          {d0},  [r0,:64], r3
        vst1.8          {d1},  [r0,:64], r3
        vst1.8          {d2},  [r0,:64], r3
        vst1.8          {d3},  [r0,:64], r3
        vst1.8          {d4},  [r0,:64], r3
        vst1.8          {d5},  [r0,:64], r3
        vst1.8          {d10}, [r0,:64], r3
        vst1.8          {d11}, [r0,:64], r3

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg

function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   r12
        mov             r12, #12
1:      vld1.8          {d0, d1},  [r1], r3
        vld1.8          {d16,d17}, [r1], r3
        subs            r12, r12, #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.8          {d22-d25}, [r4,:128]!
        bne             1b

        vld1.8          {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        mov             r12, #-16
        add             r4,  r4,  r12
        vld1.8          {d30,d31}, [r4,:128], r12
        vld1.8          {d20,d21}, [r4,:128], r12
        vld1.8          {d18,d19}, [r4,:128], r12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d14,d15}, [r4,:128], r12
        vld1.8          {d12,d13}, [r4,:128], r12
        vld1.8          {d10,d11}, [r4,:128], r12
        vld1.8          {d8, d9},  [r4,:128], r12
        vld1.8          {d6, d7},  [r4,:128], r12
        vld1.8          {d4, d5},  [r4,:128], r12
        vld1.8          {d2, d3},  [r4,:128], r12
        vld1.8          {d0, d1},  [r4,:128]

        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        vst1.8          {d30,d31}, [r4,:128]!
        vst1.8          {d6, d7},  [r4,:128]!
        vst1.8          {d20,d21}, [r4,:128]!
        vst1.8          {d4, d5},  [r4,:128]!
        vst1.8          {d18,d19}, [r4,:128]!
        vst1.8          {d2, d3},  [r4,:128]!
        vst1.8          {d16,d17}, [r4,:128]!
        vst1.8          {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
endfunc

.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        vld1.8          {d0},      [r0,:64], r2
        vrhadd.u8       d12, d12, d0
        vld1.8          {d1},      [r0,:64], r2
        vrhadd.u8       d13, d13, d1
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d14, d14, d2
        vld1.8          {d3},      [r0,:64], r2
        vrhadd.u8       d15, d15, d3
        vld1.8          {d4},      [r0,:64], r2
        vrhadd.u8       d8,  d8,  d4
        vld1.8          {d5},      [r0,:64], r2
        vrhadd.u8       d9,  d9,  d5
        vld1.8          {d6},      [r0,:64], r2
        vrhadd.u8       d10, d10, d6
        vld1.8          {d7},      [r0,:64], r2
        vrhadd.u8       d11, d11, d7
        sub             r0,  r0,  r2,  lsl #3
  .endif

        vst1.8          {d12},     [r0,:64], r2
        vst1.8          {d13},     [r0,:64], r2
        vst1.8          {d14},     [r0,:64], r2
        vst1.8          {d15},     [r0,:64], r2
        vst1.8          {d8},      [r0,:64], r2
        vst1.8          {d9},      [r0,:64], r2
        vst1.8          {d10},     [r0,:64], r2
        vst1.8          {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        vld1.8          {d0, d1},  [r2,:128]!
        vld1.8          {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.8          {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.8          {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4
        vrhadd.u8       q3,  q3,  q5
  .ifc \type,avg
        vld1.8          {d16},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17},     [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d18},     [r0,:64], r3
        vrhadd.u8       d2,  d2,  d18
        vld1.8          {d19},     [r0,:64], r3
        vrhadd.u8       d3,  d3,  d19
        vld1.8          {d20},     [r0,:64], r3
        vrhadd.u8       d4,  d4,  d20
        vld1.8          {d21},     [r0,:64], r3
        vrhadd.u8       d5,  d5,  d21
        vld1.8          {d22},     [r0,:64], r3
        vrhadd.u8       d6,  d6,  d22
        vld1.8          {d23},     [r0,:64], r3
        vrhadd.u8       d7,  d7,  d23
        sub             r0,  r0,  r3,  lsl #3
  .endif
        vst1.8          {d0},      [r0,:64], r3
        vst1.8          {d1},      [r0,:64], r3
        vst1.8          {d2},      [r0,:64], r3
        vst1.8          {d3},      [r0,:64], r3
        vst1.8          {d4},      [r0,:64], r3
        vst1.8          {d5},      [r0,:64], r3
        vst1.8          {d6},      [r0,:64], r3
        vst1.8          {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

.macro  h264_qpel16_hv  type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        sub             r2,  r4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r3, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg

.macro  h264_qpel8      type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1
        sub             r1,  r1,  #2
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             r12, r1
\type\()_h264_qpel8_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

function ff_\type\()_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r11, lr}
\type\()_h264_qpel8_mc11:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #64
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r12, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  [r11], #8
        mov             r3,  r2
        add             r12, sp,  #64
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc21:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r0,  sp
        mov             r12, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4,  r0
        ldrd            r0,  [r11], #8
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             r2,  r4,  #64
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

function ff_\type\()_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc12:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        mov             r2,  #8
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4,  r0
        ldrd            r0,  [r11], #8
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        sub             r2,  r4,  #64
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r4,  r11, #15
T       mov             sp,  r4
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

function ff_\type\()_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             r12, r1,  r2
        b               \type\()_h264_qpel8_mc01
endfunc

function ff_\type\()_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel8_mc21
endfunc

function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg

.macro  h264_qpel16     type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1
        sub             r1,  r1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             r12, r1
\type\()_h264_qpel16_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, r11, lr}
\type\()_h264_qpel16_mc11:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #256
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #16
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        ldrd            r0,  [r11], #8
        mov             r3,  r2
        add             r12, sp,  #64
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #16
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11], #8
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11], #8
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r4,  r11, #15
T       mov             sp,  r4
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc12
endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             r12, r1,  r2
        b               \type\()_h264_qpel16_mc01
endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc21
endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg

@ Biweighted prediction

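@ Bi-directional weighted prediction, ff_biweight_h264_pixels_{16,8,4}_neon.
@ Each output pixel is computed as (src0*w0 + src1*w1 + offset) shifted
@ right by log2_denom + 1, with the rounding/offset term precomputed into
@ q8 and the (negated) shift count for vshl.s16 held in q9.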
.macro  biweight_16     macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q2,  q8
        vmov            q3,  q8
1:      subs            r3,  r3,  #2
        vld1.8          {d20-d21},[r0,:128], r2
        \macd           q2,  d0,  d20
        pld             [r0]
        \macd           q3,  d0,  d21
        vld1.8          {d22-d23},[r1,:128], r2
        \macs           q2,  d1,  d22
        pld             [r1]
        \macs           q3,  d1,  d23
        vmov            q12, q8
        vld1.8          {d28-d29},[r0,:128], r2
        vmov            q13, q8
        \macd           q12, d0,  d28
        pld             [r0]
        \macd           q13, d0,  d29
        vld1.8          {d30-d31},[r1,:128], r2
        \macs           q12, d1,  d30
        pld             [r1]
        \macs           q13, d1,  d31
        vshl.s16        q2,  q2,  q9
        vshl.s16        q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3,  q8
        vst1.8          {d4- d5}, [r6,:128], r2
        vmov            q2,  q8
        vst1.8          {d24-d25},[r6,:128], r2
        bne             1b
        pop             {r4-r6, pc}
.endm

.macro  biweight_8      macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            r3,  r3,  #2
        vld1.8          {d4},[r0,:64], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.8          {d5},[r1,:64], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        vld1.8          {d6},[r0,:64], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.8          {d7},[r1,:64], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.8          {d2},[r6,:64], r2
        vmov            q1,  q8
        vst1.8          {d4},[r6,:64], r2
        bne             1b
        pop             {r4-r6, pc}
.endm

.macro  biweight_4      macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            r3,  r3,  #4
        vld1.32         {d4[0]},[r0,:32], r2
        vld1.32         {d4[1]},[r0,:32], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.32         {d5[0]},[r1,:32], r2
        vld1.32         {d5[1]},[r1,:32], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r2
        vld1.32         {d6[1]},[r0,:32], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.32         {d7[0]},[r1,:32], r2
        vld1.32         {d7[1]},[r1,:32], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        vmov            q1,  q8
        vst1.32         {d4[0]},[r6,:32], r2
        vst1.32         {d4[1]},[r6,:32], r2
        bne             1b
        pop             {r4-r6, pc}
2:      vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        pop             {r4-r6, pc}
.endm

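@ Entry code shared by the biweight functions.  The source pixels are
@ unsigned bytes, so the multiplies use vmlal.u8/vmlsl.u8: the code below
@ branches on the signs of the two weights, negates them as needed, and
@ selects the matching accumulate/subtract combination (labels 10: to 40:).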
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        push            {r4-r6, lr}
        ldr             r12, [sp, #16]
        add             r4,  sp,  #20
        ldm             r4,  {r4-r6}
        lsr             lr,  r4,  #31
        add             r6,  r6,  #1
        eors            lr,  lr,  r5,  lsr #30
        orr             r6,  r6,  #1
        vdup.16         q9,  r12
        lsl             r6,  r6,  r12
        vmvn            q9,  q9
        vdup.16         q8,  r6
        mov             r6,  r0
        beq             10f
        subs            lr,  lr,  #1
        beq             20f
        subs            lr,  lr,  #1
        beq             30f
        b               40f
10:     biweight_\w     vmlal.u8, vmlal.u8
20:     rsb             r4,  r4,  #0
        biweight_\w     vmlal.u8, vmlsl.u8
30:     rsb             r4,  r4,  #0
        rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlsl.u8
40:     rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlal.u8
endfunc
.endm

        biweight_func   16
        biweight_func   8
        biweight_func   4

@ Weighted prediction

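@ Uni-directional weighted prediction, ff_weight_h264_pixels_{16,8,4}_neon:
@ out = (src*w + (offset << log2_denom) + rounding) >> log2_denom.  The
@ scaled offset sits in q8 and the shift count (negated, for vrshl) in q9;
@ for larger log2_denom a halving vhadd/vhsub absorbs one bit of the shift,
@ which also keeps the 16-bit intermediates in range, and negative weights
@ take the vhsub/vsub paths.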
.macro  weight_16       add
        vdup.8          d0,  r12
1:      subs            r2,  r2,  #2
        vld1.8          {d20-d21},[r0,:128], r1
        vmull.u8        q2,  d0,  d20
        pld             [r0]
        vmull.u8        q3,  d0,  d21
        vld1.8          {d28-d29},[r0,:128], r1
        vmull.u8        q12, d0,  d28
        pld             [r0]
        vmull.u8        q13, d0,  d29
        \add            q2,  q8,  q2
        vrshl.s16       q2,  q2,  q9
        \add            q3,  q8,  q3
        vrshl.s16       q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        \add            q12, q8,  q12
        vrshl.s16       q12, q12, q9
        \add            q13, q8,  q13
        vrshl.s16       q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vst1.8          {d4- d5}, [r4,:128], r1
        vst1.8          {d24-d25},[r4,:128], r1
        bne             1b
        pop             {r4, pc}
.endm

.macro  weight_8        add
        vdup.8          d0,  r12
1:      subs            r2,  r2,  #2
        vld1.8          {d4},[r0,:64], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        vld1.8          {d6},[r0,:64], r1
        vmull.u8        q10, d0,  d6
        \add            q1,  q8,  q1
        pld             [r0]
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.8          {d2},[r4,:64], r1
        vst1.8          {d4},[r4,:64], r1
        bne             1b
        pop             {r4, pc}
.endm

.macro  weight_4        add
        vdup.8          d0,  r12
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            r2,  r2,  #4
        vld1.32         {d4[0]},[r0,:32], r1
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r1
        vld1.32         {d6[1]},[r0,:32], r1
        vmull.u8        q10, d0,  d6
        pld             [r0]
        \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        vmov            q1,  q8
        vst1.32         {d4[0]},[r4,:32], r1
        vst1.32         {d4[1]},[r4,:32], r1
        bne             1b
        pop             {r4, pc}
2:      \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
.endm

.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        push            {r4, lr}
        ldr             r12, [sp, #8]
        ldr             r4,  [sp, #12]
        cmp             r3,  #1
        lsl             r4,  r4,  r3
        vdup.16         q8,  r4
        mov             r4,  r0
        ble             20f
        rsb             lr,  r3,  #1
        vdup.16         q9,  lr
        cmp             r12, #0
        blt             10f
        weight_\w       vhadd.s16
10:     rsb             r12, r12, #0
        weight_\w       vhsub.s16
20:     rsb             lr,  r3,  #0
        vdup.16         q9,  lr
        cmp             r12, #0
        blt             10f
        weight_\w       vadd.s16
10:     rsb             r12, r12, #0
        weight_\w       vsub.s16
endfunc
.endm

        weight_func     16
        weight_func     8
        weight_func     4