1/*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavutil/aarch64/asm.S"
23#include "neon.S"
24
25        /* H.264 qpel MC */
26
// Load the two filter multipliers used by the lowpass macros into v6:
// v6.h[0] = 5 and v6.h[1] = 20.  \r is a 32-bit scratch register (wN) that is
// left holding 0x00140005; the rest of v6 beyond its low 32 bits is not
// written here.
.macro  lowpass_const   r
        movz            \r, #5
        movk            \r, #20, lsl #16
        mov             v6.s[0], \r
.endm
32
// Six-tap horizontal lowpass over two rows of eight pixels.
//
//   \r0, \r1   first row:  \r0 = bytes 0-7, \r1 = bytes 8-15 (bytes 0-12 are read)
//   \r2, \r3   second row, same layout
//   \d0, \d1   result for the first / second row
//   narrow     1 (default): round, shift right by 5 and saturate to unsigned
//              8 bit; 0: leave the 16-bit sums in \d0/\d1
//
// Each output lane i is
//   (p[i] + p[i+5]) + 20 * (p[i+2] + p[i+3]) - 5 * (p[i+1] + p[i+4])
// with the multipliers read from v6.h[1] (20) and v6.h[0] (5), so v6 must have
// been set up with lowpass_const beforehand.
//
// \d0 is first written after the last read of \r0/\r1, and \d1 after the last
// read of all four inputs.  \d0 may therefore be the same register as \r0 and
// \d1 the same as \r1 or \r2, which the callers in this file make use of;
// \d0 must not be \r2 or \r3.  The instructions of the two rows are
// interleaved (presumably for scheduling); keep that order when editing.
//
// trashes v0-v5
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
        ext             v2.8b,      \r0\().8b, \r1\().8b, #2
        ext             v3.8b,      \r0\().8b, \r1\().8b, #3
        uaddl           v2.8h,      v2.8b,     v3.8b
        ext             v4.8b,      \r0\().8b, \r1\().8b, #1
        ext             v5.8b,      \r0\().8b, \r1\().8b, #4
        uaddl           v4.8h,      v4.8b,     v5.8b
        ext             v1.8b,      \r0\().8b, \r1\().8b, #5
        uaddl           \d0\().8h,  \r0\().8b, v1.8b
        ext             v0.8b,      \r2\().8b, \r3\().8b, #2
        mla             \d0\().8h,  v2.8h,     v6.h[1]
        ext             v1.8b,      \r2\().8b, \r3\().8b, #3
        uaddl           v0.8h,      v0.8b,     v1.8b
        ext             v1.8b,      \r2\().8b, \r3\().8b, #1
        mls             \d0\().8h,  v4.8h,     v6.h[0]
        ext             v3.8b,      \r2\().8b, \r3\().8b, #4
        uaddl           v1.8h,      v1.8b,     v3.8b
        ext             v2.8b,      \r2\().8b, \r3\().8b, #5
        uaddl           \d1\().8h,  \r2\().8b, v2.8b
        mla             \d1\().8h,  v0.8h,     v6.h[1]
        mls             \d1\().8h,  v1.8h,     v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h, #5
        sqrshrun        \d1\().8b,  \d1\().8h, #5
  .endif
.endm
60
// Six-tap horizontal lowpass over two rows, keeping the 16-bit sums.
//
// \r0 and \r1 each hold one row of 16 bytes on entry (bytes 0-12 are read)
// and are overwritten in place with eight 16-bit lanes
//   (p[i] + p[i+5]) + 20 * (p[i+2] + p[i+3]) - 5 * (p[i+1] + p[i+4])
// No rounding, shifting or narrowing is done here.  The multipliers come from
// v6.h[1] (20) and v6.h[0] (5) as set up by lowpass_const.
//
// trashes v0-v5, v7, v30-v31
.macro  lowpass_8H      r0,  r1
        ext             v0.16b,     \r0\().16b, \r0\().16b, #2
        ext             v1.16b,     \r0\().16b, \r0\().16b, #3
        uaddl           v0.8h,      v0.8b,      v1.8b
        ext             v2.16b,     \r0\().16b, \r0\().16b, #1
        ext             v3.16b,     \r0\().16b, \r0\().16b, #4
        uaddl           v2.8h,      v2.8b,      v3.8b
        ext             v30.16b,    \r0\().16b, \r0\().16b, #5
        uaddl           \r0\().8h,  \r0\().8b,  v30.8b
        ext             v4.16b,     \r1\().16b, \r1\().16b, #2
        mla             \r0\().8h,  v0.8h,      v6.h[1]
        ext             v5.16b,     \r1\().16b, \r1\().16b, #3
        uaddl           v4.8h,      v4.8b,      v5.8b
        ext             v7.16b,     \r1\().16b, \r1\().16b, #1
        mls             \r0\().8h,  v2.8h,      v6.h[0]
        ext             v0.16b,     \r1\().16b, \r1\().16b, #4
        uaddl           v7.8h,      v7.8b,      v0.8b
        ext             v31.16b,    \r1\().16b, \r1\().16b, #5
        uaddl           \r1\().8h,  \r1\().8b,  v31.8b
        mla             \r1\().8h,  v4.8h,      v6.h[1]
        mls             \r1\().8h,  v7.8h,      v6.h[0]
.endm
84
// Single-row variant of lowpass_8.
//
//   \r0, \r1   one row: \r0 = bytes 0-7, \r1 = bytes 8-15 (bytes 0-12 are read)
//   \d0        result; may be the same register as \r0
//   narrow     1 (default): round, shift right by 5 and saturate to unsigned
//              8 bit; 0: leave the 16-bit sums in \d0
//
// Uses the multipliers in v6.h[1] (20) and v6.h[0] (5) from lowpass_const.
//
// trashes v2-v5, v30
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
        ext             v2.8b,     \r0\().8b, \r1\().8b, #2
        ext             v3.8b,     \r0\().8b, \r1\().8b, #3
        uaddl           v2.8h,     v2.8b,     v3.8b
        ext             v4.8b,     \r0\().8b, \r1\().8b, #1
        ext             v5.8b,     \r0\().8b, \r1\().8b, #4
        uaddl           v4.8h,     v4.8b,     v5.8b
        ext             v30.8b,    \r0\().8b, \r1\().8b, #5
        uaddl           \d0\().8h, \r0\().8b, v30.8b
        mla             \d0\().8h, v2.8h,     v6.h[1]
        mls             \d0\().8h, v4.8h,     v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b, \d0\().8h, #5
  .endif
.endm
101
// Six-tap lowpass over signed 16-bit input, producing eight 8-bit results.
//
//   \r0   elements 0-7  (16 bit each)
//   \r1   elements 8-15 (elements 8-12 are read); OVERWRITTEN by this macro
//   \r2   receives the eight result bytes; may be the same register as \r0
//
// For each lane i the macro widens to 32 bit and computes
//   (e[i] + e[i+5]) + 20 * (e[i+2] + e[i+3]) - 5 * (e[i+1] + e[i+4])
// with the multiplications done as shifts and adds (x*16 + x*4, x + x*4),
// then rounds and shifts right by 10 and saturates to unsigned 8 bit.
//
// v6 is used as scratch, so the constants placed there by lowpass_const are
// lost once this macro has run.
//
// trashes v0-v7 and \r1
.macro  lowpass_8.16    r0,  r1,  r2
        ext             v1.16b,     \r0\().16b, \r1\().16b, #4
        ext             v0.16b,     \r0\().16b, \r1\().16b, #6
        saddl           v5.4s,      v1.4h,      v0.4h
        ext             v2.16b,     \r0\().16b, \r1\().16b, #2
        saddl2          v1.4s,      v1.8h,      v0.8h
        ext             v3.16b,     \r0\().16b, \r1\().16b, #8
        saddl           v6.4s,      v2.4h,      v3.4h
        ext             \r1\().16b, \r0\().16b, \r1\().16b, #10
        saddl2          v2.4s,      v2.8h,      v3.8h
        saddl           v0.4s,      \r0\().4h,  \r1\().4h
        saddl2          v4.4s,      \r0\().8h,  \r1\().8h

        // low half: v5 = 20 * (e2 + e3), v6 = 5 * (e1 + e4)
        shl             v3.4s,  v5.4s,  #4
        shl             v5.4s,  v5.4s,  #2
        shl             v7.4s,  v6.4s,  #2
        add             v5.4s,  v5.4s,  v3.4s
        add             v6.4s,  v6.4s,  v7.4s

        // high half: v1 = 20 * (e2 + e3), v2 = 5 * (e1 + e4)
        shl             v3.4s,  v1.4s,  #4
        shl             v1.4s,  v1.4s,  #2
        shl             v7.4s,  v2.4s,  #2
        add             v1.4s,  v1.4s,  v3.4s
        add             v2.4s,  v2.4s,  v7.4s

        add             v5.4s,  v5.4s,  v0.4s
        sub             v5.4s,  v5.4s,  v6.4s

        add             v1.4s,  v1.4s,  v4.4s
        sub             v1.4s,  v1.4s,  v2.4s

        rshrn           v5.4h,  v5.4s,  #10
        rshrn2          v5.8h,  v1.4s,  #10

        sqxtun          \r2\().8b,  v5.8h
.endm
139
// 16x16 horizontal lowpass into a packed buffer (put only).
//
//   x0  destination buffer; rows are written 8 bytes apart (x3 is set to 8)
//   x1  source pointer in the form put_h264_qpel8_h_lowpass_neon expects
//   x2  source stride
//
// The left 8 columns (16 rows) are produced first.  x0 is not rewound, so the
// 16 rows of the right 8 columns follow directly after them in the buffer.
// The second half is a tail call, so the 8-wide routine returns to our caller.
// Clobbers x3, x4 (saved return address), x12 and whatever the 8-wide routine
// clobbers.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x30                // bl below overwrites x30
        mov             x3,  #8                 // packed destination stride
        mov             x12, #16                // rows, left half
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x30, x4
        mov             x12, #16                // rows, right half
        add             x1,  x1,  #8            // right 8 columns ...
        sub             x1,  x1,  x2, lsl #4    // ... 16 rows back up
        b               put_h264_qpel8_h_lowpass_neon
endfunc
151
// Horizontal lowpass, 8 and 16 pixels wide, instantiated for put and avg.
//
// Register contract of both functions:
//   x0  destination, x3 = destination stride
//   x1  source (the filter reads 13 bytes starting at x1), x2 = source stride
//   v6  multipliers from lowpass_const
// The 8-wide function additionally takes the row count in x12; it handles two
// rows per iteration and stops when x12 reaches zero.  For avg the result is
// averaged (rounding up) with what is already stored at the destination.
.macro  h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        mov             x13, x30                // bl below overwrites x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        mov             x30, x13
        mov             x12, #16
        // step both pointers 16 rows back and 8 columns to the right
        add             x0,  x0,  #8
        add             x1,  x1,  #8
        sub             x0,  x0,  x3, lsl #4
        sub             x1,  x1,  x2, lsl #4
        // no branch: execution continues in the 8-wide function that follows
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      ld1             {v28.8b, v29.8b}, [x1], x2
        ld1             {v16.8b, v17.8b}, [x1], x2
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
        ld1             {v2.8b},    [x0], x3
        ld1             {v3.8b},    [x0]
        sub             x0,  x0,  x3            // back to the first of the two rows
        urhadd          v28.8b, v28.8b,  v2.8b
        urhadd          v16.8b, v16.8b, v3.8b
  .endif
        st1             {v28.8b},    [x0], x3
        st1             {v16.8b},    [x0], x3
        subs            x12, x12, #2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg
186
// Horizontal lowpass averaged with a second source ("l2"), 8 and 16 pixels
// wide, instantiated for put and avg.
//
// Register contract of both functions:
//   x0  destination
//   x1  source for the filter (13 bytes are read starting at x1)
//   x3  second source, 8 bytes per row
//   x2  stride, used for x0, x1 and x3 alike
//   v6  multipliers from lowpass_const
// The 8-wide function additionally takes the row count in x12 (two rows per
// iteration, stops at zero).  The filter output is averaged with the second
// source and, for avg, once more with the existing destination contents.
.macro  h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        mov             x13, x30                // bl below overwrites x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        mov             x30, x13
        mov             x12, #16
        // step all three pointers 16 rows back and 8 columns to the right
        add             x0,  x0,  #8
        sub             x0,  x0,  x2, lsl #4
        add             x1,  x1,  #8
        sub             x1,  x1,  x2, lsl #4
        add             x3,  x3,  #8
        sub             x3,  x3,  x2, lsl #4
        // no branch: execution continues in the 8-wide function that follows
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      ld1             {v26.8b, v27.8b}, [x1], x2
        ld1             {v16.8b, v17.8b}, [x1], x2
        ld1             {v28.8b},     [x3], x2
        ld1             {v29.8b},     [x3], x2
        lowpass_8       v26, v27, v16, v17, v26, v27
        urhadd          v26.8b, v26.8b, v28.8b
        urhadd          v27.8b, v27.8b, v29.8b
  .ifc \type,avg
        ld1             {v2.8b},      [x0], x2
        ld1             {v3.8b},      [x0]
        sub             x0,  x0,  x2            // back to the first of the two rows
        urhadd          v26.8b, v26.8b, v2.8b
        urhadd          v27.8b, v27.8b, v3.8b
  .endif
        st1             {v26.8b},     [x0], x2
        st1             {v27.8b},     [x0], x2
        subs            x12, x12, #2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
227
// 16x16 vertical lowpass into a packed buffer (put only).
//
//   x0  destination buffer; rows are written 8 bytes apart (x2 is set to 8)
//   x1  source pointer in the form put_h264_qpel8_v_lowpass_neon expects
//   x3  source stride
//
// Four 8x8 blocks are produced in the order top-left, bottom-left, top-right,
// bottom-right.  x0 is never rewound, so the blocks are stored back to back.
// Each 8x8 call leaves x1 twelve rows further down; stepping four rows back
// gives the start of the block below.  The last block is a tail call.
// Clobbers x2, x4 (saved return address) and whatever the 8x8 routine
// clobbers.
function put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x30                // bl below overwrites x30
        mov             x2,  #8                 // packed destination stride
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        // x1 is 20 rows below its entry value: go back up, 8 columns right
        add             x1,  x1,  #8
        sub             x1,  x1,  x3, lsl #2
        sub             x1,  x1,  x3, lsl #4
        bl              put_h264_qpel8_v_lowpass_neon
        mov             x30, x4
        sub             x1,  x1,  x3, lsl #2
        b               put_h264_qpel8_v_lowpass_neon
endfunc
242
// Vertical lowpass, 8x8 and 16x16, instantiated for put and avg.
//
// Register contract of both functions:
//   x0  destination, x2 = destination stride
//   x1  first of the 13 source rows needed for 8 output rows, x3 = source
//       stride
//   v6  multipliers from lowpass_const
//
// The 8x8 function loads 13 rows of 8 bytes, transposes them with
// transpose_8x8B (from neon.S) so that the horizontal lowpass_8 macro can be
// applied, transposes the result back and stores 8 rows.  On return x0 has
// advanced by 8 rows and x1 by 12 rows.  For avg the result is averaged
// (rounding up) with the existing destination contents.
//
// The 16x16 function runs the 8x8 function on the top-left, bottom-left and
// top-right blocks and then continues straight into it for the bottom-right
// block.  It keeps the return address in x4.
.macro  h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov             x4,  x30                // bl below overwrites x30
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2    // 12 rows down -> 8 rows down
        bl              \type\()_h264_qpel8_v_lowpass_neon
        // both pointers back to the top, 8 columns to the right
        add             x0,  x0,  #8
        sub             x0,  x0,  x2, lsl #4
        add             x1,  x1,  #8
        sub             x1,  x1,  x3, lsl #2
        sub             x1,  x1,  x3, lsl #4
        bl              \type\()_h264_qpel8_v_lowpass_neon
        mov             x30, x4
        sub             x1,  x1,  x3, lsl #2
        // no branch: execution continues in the 8x8 function that follows
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        // rows 0-7 go to the even registers, rows 8-12 to the odd ones
        ld1             {v16.8b}, [x1], x3
        ld1             {v18.8b}, [x1], x3
        ld1             {v20.8b}, [x1], x3
        ld1             {v22.8b}, [x1], x3
        ld1             {v24.8b}, [x1], x3
        ld1             {v26.8b}, [x1], x3
        ld1             {v28.8b}, [x1], x3
        ld1             {v30.8b}, [x1], x3
        ld1             {v17.8b}, [x1], x3
        ld1             {v19.8b}, [x1], x3
        ld1             {v21.8b}, [x1], x3
        ld1             {v23.8b}, [x1], x3
        ld1             {v25.8b}, [x1]

        // v27, v29 and v31 are not loaded above; lowpass_8 reads only the
        // first five bytes of its second row register.
        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
        lowpass_8       v16, v17, v18, v19, v16, v17
        lowpass_8       v20, v21, v22, v23, v18, v19
        lowpass_8       v24, v25, v26, v27, v20, v21
        lowpass_8       v28, v29, v30, v31, v22, v23
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

  .ifc \type,avg
        ld1             {v24.8b}, [x0], x2
        ld1             {v25.8b}, [x0], x2
        ld1             {v26.8b}, [x0], x2
        ld1             {v27.8b}, [x0], x2
        ld1             {v28.8b}, [x0], x2
        ld1             {v29.8b}, [x0], x2
        ld1             {v30.8b}, [x0], x2
        ld1             {v31.8b}, [x0], x2
        sub             x0,  x0,  x2,  lsl #3   // back to the first row
        urhadd          v16.8b, v16.8b, v24.8b
        urhadd          v17.8b, v17.8b, v25.8b
        urhadd          v18.8b, v18.8b, v26.8b
        urhadd          v19.8b, v19.8b, v27.8b
        urhadd          v20.8b, v20.8b, v28.8b
        urhadd          v21.8b, v21.8b, v29.8b
        urhadd          v22.8b, v22.8b, v30.8b
        urhadd          v23.8b, v23.8b, v31.8b
  .endif

        st1             {v16.8b}, [x0], x2
        st1             {v17.8b}, [x0], x2
        st1             {v18.8b}, [x0], x2
        st1             {v19.8b}, [x0], x2
        st1             {v20.8b}, [x0], x2
        st1             {v21.8b}, [x0], x2
        st1             {v22.8b}, [x0], x2
        st1             {v23.8b}, [x0], x2

        ret
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg
317
// Vertical lowpass averaged with a second source ("l2"), 8x8 and 16x16,
// instantiated for put and avg.
//
// Register contract of both functions:
//   x0   destination, x3 = destination stride
//   x1   first of the 13 source rows needed for 8 output rows, stride x3
//   x12  second source, 8 bytes per row, stride x2
//   v6   multipliers from lowpass_const
//
// The 8x8 function works like the plain vertical lowpass (load 13 rows,
// transpose_8x8B from neon.S, lowpass_8, transpose back), then averages the
// result with eight rows from x12 and, for avg, once more with the existing
// destination contents.  On return x0 and x12 have advanced by 8 rows and x1
// by 12 rows.
//
// The 16x16 function runs the 8x8 function on the top-left, bottom-left and
// top-right blocks and then continues straight into it for the bottom-right
// block.  It keeps the return address in x4.
.macro  h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             x4,  x30                // bl below overwrites x30
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2    // 12 rows down -> 8 rows down
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        // all three pointers back to the top, 8 columns to the right
        add             x0,  x0,  #8
        sub             x0,  x0,  x3, lsl #4
        add             x12, x12, #8
        sub             x12, x12, x2, lsl #4
        add             x1,  x1,  #8
        sub             x1,  x1,  x3, lsl #2
        sub             x1,  x1,  x3, lsl #4
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        mov             x30, x4
        sub             x1,  x1,  x3, lsl #2
        // no branch: execution continues in the 8x8 function that follows
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        // rows 0-7 go to the even registers, rows 8-12 to the odd ones
        ld1             {v16.8b}, [x1], x3
        ld1             {v18.8b}, [x1], x3
        ld1             {v20.8b}, [x1], x3
        ld1             {v22.8b}, [x1], x3
        ld1             {v24.8b}, [x1], x3
        ld1             {v26.8b}, [x1], x3
        ld1             {v28.8b}, [x1], x3
        ld1             {v30.8b}, [x1], x3
        ld1             {v17.8b}, [x1], x3
        ld1             {v19.8b}, [x1], x3
        ld1             {v21.8b}, [x1], x3
        ld1             {v23.8b}, [x1], x3
        ld1             {v25.8b}, [x1]

        // v27, v29 and v31 are not loaded above; lowpass_8 reads only the
        // first five bytes of its second row register.
        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
        lowpass_8       v16, v17, v18, v19, v16, v17
        lowpass_8       v20, v21, v22, v23, v18, v19
        lowpass_8       v24, v25, v26, v27, v20, v21
        lowpass_8       v28, v29, v30, v31, v22, v23
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

        // average with the second source
        ld1             {v24.8b},  [x12], x2
        ld1             {v25.8b},  [x12], x2
        ld1             {v26.8b},  [x12], x2
        ld1             {v27.8b},  [x12], x2
        ld1             {v28.8b},  [x12], x2
        ld1             {v29.8b},  [x12], x2
        ld1             {v30.8b},  [x12], x2
        ld1             {v31.8b},  [x12], x2
        urhadd          v16.8b, v24.8b, v16.8b
        urhadd          v17.8b, v25.8b, v17.8b
        urhadd          v18.8b, v26.8b, v18.8b
        urhadd          v19.8b, v27.8b, v19.8b
        urhadd          v20.8b, v28.8b, v20.8b
        urhadd          v21.8b, v29.8b, v21.8b
        urhadd          v22.8b, v30.8b, v22.8b
        urhadd          v23.8b, v31.8b, v23.8b

  .ifc \type,avg
        ld1             {v24.8b}, [x0], x3
        ld1             {v25.8b}, [x0], x3
        ld1             {v26.8b}, [x0], x3
        ld1             {v27.8b}, [x0], x3
        ld1             {v28.8b}, [x0], x3
        ld1             {v29.8b}, [x0], x3
        ld1             {v30.8b}, [x0], x3
        ld1             {v31.8b}, [x0], x3
        sub             x0,  x0,  x3,  lsl #3   // back to the first row
        urhadd          v16.8b, v16.8b, v24.8b
        urhadd          v17.8b, v17.8b, v25.8b
        urhadd          v18.8b, v18.8b, v26.8b
        urhadd          v19.8b, v19.8b, v27.8b
        urhadd          v20.8b, v20.8b, v28.8b
        urhadd          v21.8b, v21.8b, v29.8b
        urhadd          v22.8b, v22.8b, v30.8b
        urhadd          v23.8b, v23.8b, v31.8b
  .endif

        st1             {v16.8b}, [x0], x3
        st1             {v17.8b}, [x0], x3
        st1             {v18.8b}, [x0], x3
        st1             {v19.8b}, [x0], x3
        st1             {v20.8b}, [x0], x3
        st1             {v21.8b}, [x0], x3
        st1             {v22.8b}, [x0], x3
        st1             {v23.8b}, [x0], x3

        ret
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
411
// Shared core of the 8x8 horizontal+vertical lowpass.
//
//   x1  first of 13 source rows (16 bytes are loaded per row), x3 = stride
//
// Sets up v6 itself via lowpass_const (using w12 as scratch), so callers do
// not have to.  Each row is filtered horizontally at 16-bit precision with
// lowpass_8H, the intermediate is transposed with transpose_8x8H (neon.S),
// filtered again with lowpass_8.16 and transposed back with transpose_8x8B.
// The eight result rows of 8 bytes are returned in v16-v23; nothing is stored.
// On return x1 has advanced by 12 rows.
// Clobbers x12, v0-v7 and v16-v31.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
        ld1             {v16.8h}, [x1], x3
        ld1             {v17.8h}, [x1], x3
        ld1             {v18.8h}, [x1], x3
        ld1             {v19.8h}, [x1], x3
        ld1             {v20.8h}, [x1], x3
        ld1             {v21.8h}, [x1], x3
        ld1             {v22.8h}, [x1], x3
        ld1             {v23.8h}, [x1], x3
        ld1             {v24.8h}, [x1], x3
        ld1             {v25.8h}, [x1], x3
        ld1             {v26.8h}, [x1], x3
        ld1             {v27.8h}, [x1], x3
        ld1             {v28.8h}, [x1]
        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
        lowpass_8H      v22, v23
        lowpass_8H      v24, v25
        lowpass_8H      v26, v27
        // v29 is not loaded above; it is filtered only because lowpass_8H
        // works on register pairs
        lowpass_8H      v28, v29

        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0,  v1

        lowpass_8.16    v16, v24, v16
        lowpass_8.16    v17, v25, v17

        lowpass_8.16    v18, v26, v18
        lowpass_8.16    v19, v27, v19

        lowpass_8.16    v20, v28, v20
        lowpass_8.16    v21, v29, v21

        lowpass_8.16    v22, v30, v22
        lowpass_8.16    v23, v31, v23

        transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

        ret
endfunc
454
// 8x8 horizontal+vertical lowpass, instantiated for put and avg.
//
//   x0  destination, x2 = destination stride
//   x1  source as expected by put_h264_qpel8_hv_lowpass_neon_top, x3 = stride
//
// Takes the eight result rows that the _top helper leaves in v16-v23 and
// stores them; for avg they are first averaged (rounding up) with the
// existing destination contents.  The return address is kept in x10 across
// the call.  On return x0 has advanced by 8 rows.
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30                // bl below overwrites x30
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        ld1             {v0.8b},      [x0], x2
        ld1             {v1.8b},      [x0], x2
        ld1             {v2.8b},      [x0], x2
        ld1             {v3.8b},      [x0], x2
        ld1             {v4.8b},      [x0], x2
        ld1             {v5.8b},      [x0], x2
        ld1             {v6.8b},      [x0], x2
        ld1             {v7.8b},      [x0], x2
        sub             x0,  x0,  x2,  lsl #3   // back to the first row
        urhadd          v16.8b, v16.8b, v0.8b
        urhadd          v17.8b, v17.8b, v1.8b
        urhadd          v18.8b, v18.8b, v2.8b
        urhadd          v19.8b, v19.8b, v3.8b
        urhadd          v20.8b, v20.8b, v4.8b
        urhadd          v21.8b, v21.8b, v5.8b
        urhadd          v22.8b, v22.8b, v6.8b
        urhadd          v23.8b, v23.8b, v7.8b
  .endif

        st1             {v16.8b},     [x0], x2
        st1             {v17.8b},     [x0], x2
        st1             {v18.8b},     [x0], x2
        st1             {v19.8b},     [x0], x2
        st1             {v20.8b},     [x0], x2
        st1             {v21.8b},     [x0], x2
        st1             {v22.8b},     [x0], x2
        st1             {v23.8b},     [x0], x2

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg
494
// 8x8 horizontal+vertical lowpass averaged with a second source ("l2"),
// instantiated for put and avg.
//
//   x0  destination, x3 = destination stride
//   x1  source as expected by put_h264_qpel8_hv_lowpass_neon_top, stride x3
//   x2  second source: 64 contiguous bytes (eight rows of 8 bytes)
//
// The rows returned by the _top helper in v16-v23 are averaged with the second
// source and, for avg, once more with the existing destination contents.  The
// return address is kept in x10 across the call.  On return x0 has advanced by
// 8 rows and x2 by 64 bytes.
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30                // bl below overwrites x30
        bl              put_h264_qpel8_hv_lowpass_neon_top

        ld1             {v0.8b, v1.8b},  [x2], #16
        ld1             {v2.8b, v3.8b},  [x2], #16
        ld1             {v4.8b, v5.8b},  [x2], #16
        ld1             {v6.8b, v7.8b},  [x2], #16
        urhadd          v0.8b,  v0.8b,  v16.8b
        urhadd          v1.8b,  v1.8b,  v17.8b
        urhadd          v2.8b,  v2.8b,  v18.8b
        urhadd          v3.8b,  v3.8b,  v19.8b
        urhadd          v4.8b,  v4.8b,  v20.8b
        urhadd          v5.8b,  v5.8b,  v21.8b
        urhadd          v6.8b,  v6.8b,  v22.8b
        urhadd          v7.8b,  v7.8b,  v23.8b
  .ifc \type,avg
        ld1             {v16.8b},     [x0], x3
        ld1             {v17.8b},     [x0], x3
        ld1             {v18.8b},     [x0], x3
        ld1             {v19.8b},     [x0], x3
        ld1             {v20.8b},     [x0], x3
        ld1             {v21.8b},     [x0], x3
        ld1             {v22.8b},     [x0], x3
        ld1             {v23.8b},     [x0], x3
        sub             x0,  x0,  x3,  lsl #3   // back to the first row
        urhadd          v0.8b,  v0.8b,  v16.8b
        urhadd          v1.8b,  v1.8b,  v17.8b
        urhadd          v2.8b,  v2.8b,  v18.8b
        urhadd          v3.8b,  v3.8b,  v19.8b
        urhadd          v4.8b,  v4.8b,  v20.8b
        urhadd          v5.8b,  v5.8b,  v21.8b
        urhadd          v6.8b,  v6.8b,  v22.8b
        urhadd          v7.8b,  v7.8b,  v23.8b
  .endif
        st1             {v0.8b},      [x0], x3
        st1             {v1.8b},      [x0], x3
        st1             {v2.8b},      [x0], x3
        st1             {v3.8b},      [x0], x3
        st1             {v4.8b},      [x0], x3
        st1             {v5.8b},      [x0], x3
        st1             {v6.8b},      [x0], x3
        st1             {v7.8b},      [x0], x3

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg
546
// 16x16 horizontal+vertical lowpass, plain and "l2", instantiated for put and
// avg.  Both functions run the matching 8x8 routine on the top-left,
// bottom-left and top-right blocks and tail-call it for the bottom-right one.
// The return address is kept in x13 (the 8x8 routines use x10 themselves).
//
// Plain variant:  x0 = destination (stride x2), x1 = source (stride x3).
// l2 variant:     x0 = destination (stride x3), x1 = source (stride x3),
//                 x4 = address just past a 256-byte packed second source; x2
//                 is set to x4 - 256 and then consumed 64 bytes per 8x8 block
//                 in the block order given above.
//
// Each 8x8 call leaves x1 twelve rows further down and x0 eight rows further
// down; the pointer arithmetic between the calls accounts for that.
.macro  h264_qpel16_hv  type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             x13, x30                // bl below overwrites x30
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        // both pointers back to the top, 8 columns to the right
        add             x0,  x0,  #8
        sub             x0,  x0,  x2, lsl #4
        add             x1,  x1,  #8
        sub             x1,  x1,  x3, lsl #2
        sub             x1,  x1,  x3, lsl #4
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        mov             x30, x13
        sub             x1,  x1,  x3, lsl #2
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             x13, x30                // bl below overwrites x30
        sub             x2,  x4,  #256          // start of the packed second source
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        // both pointers back to the top, 8 columns to the right
        add             x0,  x0,  #8
        sub             x0,  x0,  x3, lsl #4
        add             x1,  x1,  #8
        sub             x1,  x1,  x3, lsl #2
        sub             x1,  x1,  x3, lsl #4
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x30, x13
        sub             x1,  x1,  x3, lsl #2
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg
584
// Exported 8x8 quarter-pel entry points ff_<type>_h264_qpel8_mcXY_neon,
// instantiated for put and avg.  Judging by which filter passes each one
// calls, X is the horizontal and Y the vertical quarter-sample position.
//
// On entry (all entry points):  x0 = destination, x1 = source, x2 = stride
// (used for both source and destination).
//
// Conventions used inside this macro:
//   x14      return address of the exported function
//   x8, x9   destination and source saved across the first filter pass
//   x11      stack pointer saved before a temporary buffer is carved out
// The labels <type>_h264_qpel8_mc01/mc11/mc21/mc12 are shared tails that the
// other entry points branch into after setting up x14, x8, x9 and x1 (or x12
// for mc01).
.macro  h264_qpel8      type
// mc10: horizontal filter averaged with the pixels at src
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1                 // second source = src
        sub             x1,  x1,  #2            // filter window starts 2 pixels left
        mov             x12, #8                 // rows
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// mc20: horizontal filter only
function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2            // filter window starts 2 pixels left
        mov             x3,  x2                 // destination stride
        mov             x12, #8                 // rows
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

// mc30: horizontal filter averaged with the pixels at src + 1
function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1            // second source = src + 1
        sub             x1,  x1,  #2            // filter window starts 2 pixels left
        mov             x12, #8                 // rows
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// mc01: vertical filter averaged with the pixels at src
function ff_\type\()_h264_qpel8_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1                 // second source = src
        // shared tail, also entered from mc03 with x12 = src + stride
\type\()_h264_qpel8_mc01:
        lowpass_const   w3
        mov             x3,  x2                 // source/destination stride
        sub             x1,  x1,  x2, lsl #1    // filter window starts 2 rows up
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        ret             x14
endfunc

// mc11: horizontal filter into a 64-byte stack buffer, then vertical filter
// averaged with that buffer
function ff_\type\()_h264_qpel8_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        // shared tail, also entered from mc31, mc13 and mc33:
        // x1 = source for the horizontal pass, x9 = source for the vertical pass
\type\()_h264_qpel8_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #64           // 8 rows of 8 bytes
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #8                 // buffer stride
        mov             x12, #8                 // rows
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        mov             x12, sp                 // second source = horizontal result
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #8                 // stride of the second source
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc21: horizontal filter into a stack buffer, then horizontal+vertical filter
// averaged with that buffer
function ff_\type\()_h264_qpel8_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        // shared tail, also entered from mc23:
        // x1 = source for the horizontal pass, x9 = source for the hv pass
\type\()_h264_qpel8_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  #2
        mov             x3,  #8                 // buffer stride
        mov             x0,  sp
        mov             x12, #8                 // rows
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x4,  x0                 // x0 now points 64 bytes past sp
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        sub             x2,  x4,  #64           // second source = horizontal result
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc31: as mc11, with the vertical pass taken one pixel to the right
function ff_\type\()_h264_qpel8_mc31_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x9,  x1,  #1            // x1 itself stays at src
        b               \type\()_h264_qpel8_mc11
endfunc

// mc02: vertical filter only
function ff_\type\()_h264_qpel8_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1    // filter window starts 2 rows up
        mov             x3,  x2                 // source stride
        bl              \type\()_h264_qpel8_v_lowpass_neon
        ret             x14
endfunc

// mc12: vertical filter into a stack buffer, then horizontal+vertical filter
// averaged with that buffer
function ff_\type\()_h264_qpel8_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        // shared tail, also entered from mc32:
        // x1 = source for the vertical pass, x9 = source for the hv pass
\type\()_h264_qpel8_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        mov             x2,  #8                 // buffer stride
        mov             x0,  sp
        bl              put_h264_qpel8_v_lowpass_neon
        mov             x4,  x0                 // x0 now points 64 bytes past sp
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        sub             x2,  x4,  #64           // second source = vertical result
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc22: horizontal+vertical filter only.  No stack buffer is used here and
// the callee does not touch sp, so sp is neither saved nor restored.
function ff_\type\()_h264_qpel8_mc22_neon, export=1
        mov             x14, x30
        sub             x1,  x1,  x2, lsl #1    // 2 rows up ...
        sub             x1,  x1,  #2            // ... and 2 pixels left
        mov             x3,  x2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        ret             x14
endfunc

// mc32: as mc12, with the vertical pass taken one pixel to the right
function ff_\type\()_h264_qpel8_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

// mc03: vertical filter averaged with the pixels one row below src
function ff_\type\()_h264_qpel8_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2            // second source = src + stride
        b               \type\()_h264_qpel8_mc01
endfunc

// mc13: as mc11, with the horizontal pass taken one row further down
function ff_\type\()_h264_qpel8_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc11
endfunc

// mc23: as mc21, with the horizontal pass taken one row further down
function ff_\type\()_h264_qpel8_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc21
endfunc

// mc33: as mc11, with the vertical pass one pixel to the right and the
// horizontal pass one row further down
function ff_\type\()_h264_qpel8_mc33_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x9,  x1,  #1            // vertical pass at src + 1
        add             x1,  x1,  x2            // horizontal pass at src + stride
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg
763
// h264_qpel16 type
//
// Emits 15 exported entry points named ff_<type>_h264_qpel16_mcXY_neon
// (mc10 ... mc33, there is no mc00 here). Each one is a thin wrapper: it
// prepares registers and then branches to, or calls, one of the
// <type>_h264_qpel16_*_lowpass* helpers that are defined outside this macro.
//
// Register usage visible in this macro:
//   x0, x1, x2  incoming arguments; presumably dst, src and stride
//               (NOTE(review): confirm against the C prototype)
//   w3          scratch for lowpass_const, which places the constants
//               5 and 20 in v6.S[0]; x3 is reloaded afterwards
//   x8, x9      copies of x0 / x1 kept for the second helper call
//   x11         copy of sp taken before the stack is adjusted
//   x12         extra source pointer handed to the *_l2 vertical helper
//   x14         copy of x30, used as the return address by ret x14
//
// The internal labels <type>_h264_qpel16_mc01, _mc11, _mc21 and _mc12 are
// shared tails: other wrappers set x14 (and x8/x9 or x12) themselves and
// branch there with an adjusted x1.
.macro  h264_qpel16     type
// mc10: tail-branch to the horizontal l2 helper with x3 = unmodified x1.
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   w3
        // x3 = original x1; presumably the second input of the l2 helper
        mov             x3,  x1
        // start two bytes before the original x1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// mc20: tail-branch to the plain horizontal helper with x3 = x2.
function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        mov             x3,  x2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

// mc30: same as mc10 except that x3 = original x1 + 1.
function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// mc01: call the vertical l2 helper with x12 = original x1.
function ff_\type\()_h264_qpel16_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
// Shared tail, also reached from mc03 (which sets x12 = x1 + x2).
// Expects x14 = return address and x12 already set.
\type\()_h264_qpel16_mc01:
        lowpass_const   w3
        mov             x3,  x2
        // x1 = x1 - 2 * x2
        sub             x1,  x1,  x2, lsl #1
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        ret             x14
endfunc

// mc11: horizontal pass into a stack buffer, then vertical l2 pass that
// receives that buffer in x12.
function ff_\type\()_h264_qpel16_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
// Shared tail, also reached from mc31, mc13 and mc33.
// Expects x14 = return address, x8 = original x0, x9 = pointer used for the
// vertical pass, x1 = pointer used for the horizontal pass.
\type\()_h264_qpel16_mc11:
        lowpass_const   w3
        mov             x11, sp
        // reserve 256 bytes of stack as the temporary buffer
        sub             sp,  sp,  #256
        mov             x0,  sp
        sub             x1,  x1,  #2
        // x3 = 16; presumably the row pitch of the temporary buffer
        mov             x3,  #16
        bl              put_h264_qpel16_h_lowpass_neon
        // x2, x8, x9, x11 and x14 are read below, so the helper above has
        // to leave them intact
        mov             x0,  x8
        mov             x3,  x2
        // x12 = temporary buffer written by the call above
        mov             x12, sp
        // x1 = x9 - 2 * x2
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #16
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc21: packed horizontal pass into a stack buffer, then the hv l2 helper.
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
// Shared tail, also reached from mc23 (x1 = x9 + x2 on entry).
\type\()_h264_qpel16_mc21:
        lowpass_const   w3
        mov             x11, sp
        // reserve 16*16 + 16*12 = 448 bytes of stack
        sub             sp,  sp,  #(16*16+16*12)
        sub             x1,  x1,  #2
        mov             x0,  sp
        bl              put_h264_qpel16_h_lowpass_neon_packed
        // x4 = whatever the packed helper left in x0
        // NOTE(review): presumably a pointer into the stack buffer; confirm
        mov             x4,  x0
        mov             x0,  x8
        // x1 = x9 - 2 * x2 - 2
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc31: reuse the mc11 tail with x9 = original x1 + 1 and x1 = original x1.
function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        // undo the +1 so that only x9 carries the offset
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc

// mc02: call the plain vertical helper with x1 = x1 - 2 * x2 and x3 = x2.
function ff_\type\()_h264_qpel16_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel16_v_lowpass_neon
        ret             x14
endfunc

// mc12: packed vertical pass into a stack buffer, then the hv l2 helper.
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
// Shared tail, also reached from mc32 (x1 = x9 + 1 on entry).
\type\()_h264_qpel16_mc12:
        lowpass_const   w3
        mov             x11, sp
        // reserve 16*16 + 16*12 = 448 bytes of stack
        sub             sp,  sp,  #(16*16+16*12)
        // x1 = x1 - 2 * x2
        sub             x1,  x1,  x2, lsl #1
        mov             x0,  sp
        mov             x3,  x2
        bl              put_h264_qpel16_v_lowpass_neon_packed
        // x4 = whatever the packed helper left in x0
        // NOTE(review): presumably a pointer into the stack buffer; confirm
        mov             x4,  x0
        mov             x0,  x8
        // x3 (copy of the incoming x2) is read here, so the helper above has
        // to leave it intact; x1 = x9 - 2 * x3 - 2
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        mov             x2,  x3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc22: call the plain hv helper with x1 = x1 - 2 * x2 - 2 and x3 = x2.
// sp is saved in x11 beforehand and restored after the call.
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        mov             x11, sp
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        mov             sp,  x11 // restore stack
        ret             x14
endfunc

// mc32: reuse the mc12 tail with x9 = original x1 and x1 = original x1 + 1.
function ff_\type\()_h264_qpel16_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc12
endfunc

// mc03: reuse the mc01 tail with x12 = x1 + x2 instead of x1.
function ff_\type\()_h264_qpel16_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2
        b               \type\()_h264_qpel16_mc01
endfunc

// mc13: reuse the mc11 tail with x9 = original x1 and x1 = original x1 + x2.
function ff_\type\()_h264_qpel16_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc11
endfunc

// mc23: reuse the mc21 tail with x9 = original x1 and x1 = original x1 + x2.
function ff_\type\()_h264_qpel16_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc21
endfunc

// mc33: reuse the mc11 tail with x9 = original x1 + 1 and
// x1 = original x1 + x2.
function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        // undo the +1 so that only x9 carries the one-byte offset
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc
.endm
932
        // Expand the h264_qpel16 macro once per type argument, producing the
        // ff_put_h264_qpel16_mcXY_neon and ff_avg_h264_qpel16_mcXY_neon sets.
        h264_qpel16 put
        h264_qpel16 avg
935