1/*
2 * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
3 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
4 *
5 * This file is part of Libav.
6 *
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "asm.S"
23#include "neon.S"
24
/* RV40 6-tap lowpass filter applied to one row of 8 pixels.
 * \r0:\r1 hold 16 consecutive source bytes, \r0[0] being src[-2].
 * Computes, per pixel:
 *   clip8((src[-2] + src[3] - 5*(src[-1] + src[2])
 *          + \rc1*src[0] + \rc2*src[1] + rounding) >> \shift)
 * and writes the 8 results back into \r0.
 * Clobbers q8, q9, q12 and d25-d28.
 */
.macro  qpel_lowpass    r0,  r1,  rc1, rc2, shift
        vext.8          d25, \r0, \r1, #1       @ src[-1]
        vext.8          d26, \r0, \r1, #4       @ src[ 2]
        vext.8          d24, \r0, \r1, #5       @ src[ 3]
        vaddl.u8        q9,  d25, d26           @ src[-1] + src[ 2]
        vaddl.u8        q8,  \r0, d24           @ src[-2] + src[ 3]
        vext.8          d27, \r0, \r1, #2       @ src[ 0]
        vshl.s16        q12, q9,  #2            @ 4 * (src[-1] + src[2])
        vsub.s16        q8,  q8,  q9            @ subtract once ...
        vext.8          d28, \r0, \r1, #3       @ src[ 1]
        vsub.s16        q8,  q8,  q12           @ ... and 4x more: -5 * (src[-1] + src[2])
        vmlal.u8        q8,  d27, \rc1          @ + rc1 * src[0]
        vmlal.u8        q8,  d28, \rc2          @ + rc2 * src[1]
        vqrshrun.s16    \r0, q8,  #\shift       @ round, shift, saturate to u8
.endm
40
/* Same 6-tap lowpass as qpel_lowpass, applied to two independent rows
 * with the two filter chains interleaved to hide latency.
 * Row A is \r0:\r1 (result in \r0), row B is \r2:\r3 (result in \r2).
 * \r1 and \r0 are reused as scratch for row B before its result is
 * written, so rows A and B must not alias.
 * Clobbers q8-q14 and the \r0/\r1 inputs.
 */
.macro  qpel_lowpass_x2 r0,  r1,  r2,  r3,  rc1, rc2, shift
        vext.8          d25, \r0, \r1, #1       @ A: src[-1]
        vext.8          d26, \r0, \r1, #4       @ A: src[ 2]
        vext.8          d24, \r0, \r1, #5       @ A: src[ 3]
        vaddl.u8        q9,  d25, d26           @ A: src[-1] + src[ 2]
        vaddl.u8        q8,  \r0, d24           @ A: src[-2] + src[ 3]
        vext.8          d29, \r0, \r1, #2       @ A: src[ 0]
        vext.8          d28, \r0, \r1, #3       @ A: src[ 1]
        vshl.s16        q10, q9,  #2
        vext.8          \r1, \r2, \r3, #1       @ B: src[-1]
        vsub.s16        q8,  q8,  q9
        vext.8          d22, \r2, \r3, #4       @ B: src[ 2]
        vext.8          \r0, \r2, \r3, #5       @ B: src[ 3]
        vaddl.u8        q13, \r1, d22           @ B: src[-1] + src[ 2]
        vaddl.u8        q12, \r2, \r0           @ B: src[-2] + src[ 3]
        vsub.s16        q8,  q8,  q10           @ A: -5 * (src[-1] + src[2]) done
        vshl.s16        q9,  q13, #2
        vsub.s16        q12, q12, q13
        vmlal.u8        q8,  d29, \rc1          @ A: + rc1 * src[0]
        vmlal.u8        q8,  d28, \rc2          @ A: + rc2 * src[1]
        vsub.s16        q12, q12, q9            @ B: -5 * (src[-1] + src[2]) done
        vext.8          d26, \r2, \r3, #2       @ B: src[ 0]
        vext.8          d27, \r2, \r3, #3       @ B: src[ 1]
        vmlal.u8        q12, d26, \rc1          @ B: + rc1 * src[0]
        vmlal.u8        q12, d27, \rc2          @ B: + rc2 * src[1]
        vqrshrun.s16    \r0, q8,  #\shift       @ A result
        vqrshrun.s16    \r2, q12, #\shift       @ B result
.endm
69
/* Emit put_rv40_qpel8_h_lp_packed_s\shift\()_neon:
 * 8-wide horizontal lowpass over r3 + 1 rows, used as the first pass of
 * a 2D filter.  Reads 16-byte rows from r1 (stride r2, r3 assumed even)
 * and stores the 8 filtered bytes of each row contiguously ("packed")
 * at r12 for the later vertical pass; r12 is advanced past the data.
 * One extra row beyond r3 is filtered so callers get height + 5 rows.
 * d0/d1 hold the centre-tap coefficients.
 */
.macro  rv40_qpel8_h    shift
function put_rv40_qpel8_h_lp_packed_s\shift\()_neon
1:
        vld1.8          {q2},     [r1], r2
        vld1.8          {q3},     [r1], r2
        qpel_lowpass_x2 d4,  d5,  d6,  d7,  d0,  d1,  \shift
        vst1.8          {d4},     [r12,:64]!
        vst1.8          {d6},     [r12,:64]!
        subs            r3,  r3,  #2
        bgt             1b
        vld1.8          {q2},     [r1]            @ final odd row, no writeback
        qpel_lowpass    d4,  d5,  d0,  d1,  \shift
        vst1.8          {d4},     [r12,:64]!
        bx              lr
endfunc
.endm
86
/* Emit \type\()_rv40_qpel8_v_lp_packed_s\shift\()_neon:
 * 8x8 vertical lowpass over packed intermediate data produced by the
 * horizontal pass.  Loads 13 consecutive 8-byte rows from r1 (r1 is
 * advanced past them), transposes so the column data lies along rows,
 * filters with qpel_lowpass_x2, transposes back and stores 8 output
 * rows to r0 with stride r2.  For \type == avg the result is averaged
 * (rounding) with the existing destination first.
 * Note: only 13 rows are loaded, so d15/d30/d31 in the second transpose
 * carry don't-care data; the filter lanes that would read them are
 * never stored.
 */
.macro  rv40_qpel8_v    shift, type
function \type\()_rv40_qpel8_v_lp_packed_s\shift\()_neon
        vld1.64         {d2},     [r1,:64]!
        vld1.64         {d3},     [r1,:64]!
        vld1.64         {d4},     [r1,:64]!
        vld1.64         {d5},     [r1,:64]!
        vld1.64         {d6},     [r1,:64]!
        vld1.64         {d7},     [r1,:64]!
        vld1.64         {d8},     [r1,:64]!
        vld1.64         {d9},     [r1,:64]!
        vld1.64         {d10},    [r1,:64]!
        vld1.64         {d11},    [r1,:64]!
        vld1.64         {d12},    [r1,:64]!
        vld1.64         {d13},    [r1,:64]!
        vld1.64         {d14},    [r1,:64]!
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d30, d31
        qpel_lowpass_x2 d2,  d10, d3,  d11, d0,  d1,  \shift
        qpel_lowpass_x2 d4,  d12, d5,  d13, d0,  d1,  \shift
        qpel_lowpass_x2 d6,  d14, d7,  d15, d0,  d1,  \shift
        qpel_lowpass_x2 d8,  d30, d9,  d31, d0,  d1,  \shift
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
  .ifc \type,avg
        vld1.64         d12,      [r0,:64], r2
        vld1.64         d13,      [r0,:64], r2
        vld1.64         d14,      [r0,:64], r2
        vld1.64         d15,      [r0,:64], r2
        vld1.64         d16,      [r0,:64], r2
        vld1.64         d17,      [r0,:64], r2
        vld1.64         d18,      [r0,:64], r2
        vld1.64         d19,      [r0,:64], r2
        sub             r0,  r0,  r2,  lsl #3   @ rewind dst 8 rows
        vrhadd.u8       q1,  q1,  q6            @ rounding average with dst
        vrhadd.u8       q2,  q2,  q7
        vrhadd.u8       q3,  q3,  q8
        vrhadd.u8       q4,  q4,  q9
  .endif
        vst1.64         d2,       [r0,:64], r2
        vst1.64         d3,       [r0,:64], r2
        vst1.64         d4,       [r0,:64], r2
        vst1.64         d5,       [r0,:64], r2
        vst1.64         d6,       [r0,:64], r2
        vst1.64         d7,       [r0,:64], r2
        vst1.64         d8,       [r0,:64], r2
        vst1.64         d9,       [r0,:64], r2
        bx              lr
endfunc
.endm
135
        @ Instantiate the packed horizontal helpers for both shift variants.
        rv40_qpel8_h    5
        rv40_qpel8_h    6
138
/* Build the full set of RV40 8x8 and 16x16 quarter-pel motion
 * compensation functions for one destination mode (\type = put or avg).
 * Common argument layout: r0 = dst, r1 = src, r2 = stride.
 * d0/d1 hold the two centre-tap coefficients of the 6-tap filter:
 * 52/20 or 20/52 for the quarter-pel positions, 20/20 (with shift 5,
 * i.e. the /2-scaled filter) for the half-pel position.
 * mcXY = fractional position (X horizontal, Y vertical) in quarters.
 */
.macro  rv40_qpel       type
/* 8-wide horizontal-only lowpass straight to dst, r3 rows two at a
 * time (r3 assumed even), averaging with dst for \type == avg. */
function \type\()_rv40_qpel8_h_lowpass_neon
  .ifc \type,avg
        mov             r12, r0                 @ second dst pointer for the avg loads
  .endif
1:
        vld1.8          {q2},     [r1], r2
        vld1.8          {q3},     [r1], r2
        qpel_lowpass_x2 d4,  d5,  d6,  d7,  d0,  d1,  6
  .ifc \type,avg
        vld1.8          {d3},     [r12,:64], r2
        vld1.8          {d16},    [r12,:64], r2
        vrhadd.u8       d4,  d4,  d3            @ rounding average with dst
        vrhadd.u8       d6,  d6,  d16
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d6},     [r0,:64], r2
        subs            r3,  r3,  #2
        bgt             1b
        bx              lr
endfunc

/* 8-wide vertical-only lowpass straight from src: loads the 13 needed
 * rows (last load without writeback, so r1 ends 12 rows in), transposes
 * so the filter runs down the columns, filters, transposes back and
 * stores 8 rows.  Same don't-care-lane note as rv40_qpel8_v applies
 * to d15/d30/d31. */
function \type\()_rv40_qpel8_v_lowpass_neon
        vld1.64         {d2},     [r1], r2
        vld1.64         {d3},     [r1], r2
        vld1.64         {d4},     [r1], r2
        vld1.64         {d5},     [r1], r2
        vld1.64         {d6},     [r1], r2
        vld1.64         {d7},     [r1], r2
        vld1.64         {d8},     [r1], r2
        vld1.64         {d9},     [r1], r2
        vld1.64         {d10},    [r1], r2
        vld1.64         {d11},    [r1], r2
        vld1.64         {d12},    [r1], r2
        vld1.64         {d13},    [r1], r2
        vld1.64         {d14},    [r1]            @ 13th row, no writeback
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d30, d31
        qpel_lowpass_x2 d2,  d10, d3,  d11, d0,  d1,  6
        qpel_lowpass_x2 d4,  d12, d5,  d13, d0,  d1,  6
        qpel_lowpass_x2 d6,  d14, d7,  d15, d0,  d1,  6
        qpel_lowpass_x2 d8,  d30, d9,  d31, d0,  d1,  6
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
  .ifc \type,avg
        vld1.64         d12,      [r0,:64], r2
        vld1.64         d13,      [r0,:64], r2
        vld1.64         d14,      [r0,:64], r2
        vld1.64         d15,      [r0,:64], r2
        vld1.64         d16,      [r0,:64], r2
        vld1.64         d17,      [r0,:64], r2
        vld1.64         d18,      [r0,:64], r2
        vld1.64         d19,      [r0,:64], r2
        sub             r0,  r0,  r2,  lsl #3   @ rewind dst 8 rows
        vrhadd.u8       q1,  q1,  q6            @ rounding average with dst
        vrhadd.u8       q2,  q2,  q7
        vrhadd.u8       q3,  q3,  q8
        vrhadd.u8       q4,  q4,  q9
  .endif
        vst1.64         d2,       [r0,:64], r2
        vst1.64         d3,       [r0,:64], r2
        vst1.64         d4,       [r0,:64], r2
        vst1.64         d5,       [r0,:64], r2
        vst1.64         d6,       [r0,:64], r2
        vst1.64         d7,       [r0,:64], r2
        vst1.64         d8,       [r0,:64], r2
        vst1.64         d9,       [r0,:64], r2
        bx              lr
endfunc

        @ Instantiate the packed vertical helpers for this \type.
        rv40_qpel8_v    5, \type
        rv40_qpel8_v    6, \type

/* (1/4, 0): horizontal filter only, coefficients 52/20. */
function ff_\type\()_rv40_qpel8_mc10_neon, export=1
        sub             r1,  r1,  #2            @ back up to src[-2]
        mov             r3,  #8
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        b               \type\()_rv40_qpel8_h_lowpass_neon
endfunc

/* (3/4, 0): horizontal filter only, coefficients 20/52. */
function ff_\type\()_rv40_qpel8_mc30_neon, export=1
        sub             r1,  r1,  #2
        mov             r3,  #8
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        b               \type\()_rv40_qpel8_h_lowpass_neon
endfunc

/* (0, 1/4): vertical filter only, coefficients 52/20. */
function ff_\type\()_rv40_qpel8_mc01_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}                @ v_lowpass clobbers callee-saved d8-d15
        sub             r1,  r1,  r2,  lsl #1   @ back up to src[-2 * stride]
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

/* (1/4, 1/4): horizontal pass (52/20, shift 6) into an aligned stack
 * temp of 13 packed rows, then vertical pass (52/20, shift 6). */
function ff_\type\()_rv40_qpel8_mc11_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8         @ 13 packed rows + alignment slack
        add             r12, sp,  #7
        bic             r12, r12, #7            @ r12 = 8-byte aligned temp
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2            @ src - 2*stride - 2
        mov             r3,  #12                @ 12 + 1 extra = 13 rows
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7            @ re-derive aligned temp base
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

/* (1/2, 1/4): horizontal half-pel pass (20/20, shift 5), then vertical
 * quarter-pel pass; only d0 needs changing to 52 for the second pass. */
function ff_\type\()_rv40_qpel8_mc21_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d0,  #52                @ vertical coeffs become 52/20
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

/* (3/4, 1/4): horizontal 20/52, then swap to 52/20 for the vertical. */
function ff_\type\()_rv40_qpel8_mc31_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vswp            d0,  d1                 @ 20/52 -> 52/20 for the vertical pass
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

/* (1/4, 1/2): horizontal 52/20 shift 6, vertical half-pel 20/20 shift 5. */
function ff_\type\()_rv40_qpel8_mc12_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d0,  #20                @ vertical coeffs become 20/20
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

/* (1/2, 1/2): half-pel both ways, 20/20 with shift 5 in both passes. */
function ff_\type\()_rv40_qpel8_mc22_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

/* (3/4, 1/2): horizontal 20/52 shift 6, vertical 20/20 shift 5. */
function ff_\type\()_rv40_qpel8_mc32_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d1,  #20                @ vertical coeffs become 20/20
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

/* (0, 3/4): vertical filter only, coefficients 20/52. */
function ff_\type\()_rv40_qpel8_mc03_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             r1,  r1,  r2,  lsl #1
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

/* (3/4, 3/4): RV40 uses a plain xy-half-pel average here. */
function ff_\type\()_rv40_qpel8_mc33_neon, export=1
        mov             r3,  #8
        b               X(ff_\type\()_pixels8_xy2_neon)
endfunc

/* (1/4, 3/4): horizontal 52/20, then swap to 20/52 for the vertical. */
function ff_\type\()_rv40_qpel8_mc13_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vswp            d0,  d1                 @ 52/20 -> 20/52 for the vertical pass
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

/* (1/2, 3/4): horizontal 20/20 shift 5, vertical 20/52 shift 6. */
function ff_\type\()_rv40_qpel8_mc23_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d1,  #52                @ vertical coeffs become 20/52
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

/* 16x16 horizontal-only: two 8-wide strips via qpel8_h_lowpass.
 * The second strip restarts from the saved src at column +8 (the
 * "add r1, #6" below is +8 for the column minus the -2 filter lead-in). */
function ff_\type\()_rv40_qpel16_mc10_neon, export=1
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
.L\type\()_rv40_qpel16_h:
        push            {r1, lr}                @ save original src for strip 2
        sub             r1,  r1,  #2
        mov             r3,  #16
        bl              \type\()_rv40_qpel8_h_lowpass_neon
        pop             {r1, lr}
        sub             r0,  r0,  r2,  lsl #4   @ rewind dst 16 rows
        add             r0,  r0,  #8            @ right half of dst
        add             r1,  r1,  #6            @ src + 8 - 2
        mov             r3,  #16
        b               \type\()_rv40_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_rv40_qpel16_mc30_neon, export=1
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        b               .L\type\()_rv40_qpel16_h
endfunc

/* 16x16 vertical-only: four 8x8 calls (two per column strip).
 * After each 13-row v_lowpass r1 has advanced 12 rows, so rewinding by
 * 4*stride positions it 8 rows below the previous block's start. */
function ff_\type\()_rv40_qpel16_mc01_neon, export=1
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
.L\type\()_rv40_qpel16_v:
        sub             r1,  r1,  r2,  lsl #1   @ src - 2*stride
        push            {r1, lr}                @ save adjusted src for strip 2
        vpush           {d8-d15}
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        sub             r1,  r1,  r2,  lsl #2   @ 12 rows in - 4 = start of lower block
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        ldr             r1,  [sp, #64]          @ reload saved src (above the 64-byte vpush)
        sub             r0,  r0,  r2,  lsl #4   @ rewind dst 16 rows
        add             r0,  r0,  #8            @ right half
        add             r1,  r1,  #8
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        sub             r1,  r1,  r2,  lsl #2
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r1, pc}
endfunc

/* 16x16 2D cases: two horizontal strip passes of 21 packed rows each
 * into a 44*8-byte stack temp, then four packed vertical calls.
 * Each packed v call consumes 13 rows (104 bytes); "sub r1, #40"
 * rewinds 5 rows so the next block starts 8 rows (64 bytes) further on.
 * The saved src sits at [sp, #416] = 44*8 temp + 64 bytes of vpush. */
function ff_\type\()_rv40_qpel16_mc11_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2            @ src - 2*stride - 2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8         @ 2 strips x 21 packed rows + slack
        add             r12, sp,  #7
        bic             r12, r12, #7            @ aligned temp; advances across both strips
        mov             r3,  #20                @ 20 + 1 extra = 21 rows
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]         @ reload saved (adjusted) src
        add             r1,  r1,  #8            @ right strip
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
.L\type\()_rv40_qpel16_v_s6:
        add             r1,  sp,  #7
        bic             r1,  r1,  #7            @ aligned temp base
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        sub             r1,  r1,  #40           @ back 5 rows: lower-left block
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        sub             r0,  r0,  r2,  lsl #4   @ rewind dst 16 rows
        add             r0,  r0,  #8            @ right half
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #44*8
        vpop            {d8-d15}
        pop             {r1, pc}
endfunc

function ff_\type\()_rv40_qpel16_mc21_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        vmov.i8         d0,  #52                @ vertical coeffs become 52/20
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

function ff_\type\()_rv40_qpel16_mc31_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vswp            d0,  d1                 @ 20/52 -> 52/20 for the vertical pass
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

function ff_\type\()_rv40_qpel16_mc12_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vmov.i8         d0,  #20                @ vertical coeffs become 20/20
.L\type\()_rv40_qpel16_v_s5:
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        sub             r1,  r1,  #40           @ back 5 rows: next 8-row block
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #44*8
        vpop            {d8-d15}
        pop             {r1, pc}
endfunc

function ff_\type\()_rv40_qpel16_mc22_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        b               .L\type\()_rv40_qpel16_v_s5
endfunc

function ff_\type\()_rv40_qpel16_mc32_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vmov.i8         d1,  #20                @ vertical coeffs become 20/20
        b               .L\type\()_rv40_qpel16_v_s5
endfunc

function ff_\type\()_rv40_qpel16_mc03_neon, export=1
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        b               .L\type\()_rv40_qpel16_v
endfunc

function ff_\type\()_rv40_qpel16_mc13_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vswp            d0,  d1                 @ 52/20 -> 20/52 for the vertical pass
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

function ff_\type\()_rv40_qpel16_mc23_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        vmov.i8         d1,  #52                @ vertical coeffs become 20/52
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

/* (3/4, 3/4): plain xy-half-pel average, as for the 8x8 case. */
function ff_\type\()_rv40_qpel16_mc33_neon, export=1
        mov             r3,  #16
        b               X(ff_\type\()_pixels16_xy2_neon)
endfunc
.endm
658
        @ Emit the whole qpel function set for both destination modes.
        rv40_qpel       put
        rv40_qpel       avg
661
/* RV40 bidirectional weighting on 16 pixels:
 *   q1 = ((w2 * q1) >> 9 + (w1 * q2) >> 9 + 16) >> 5
 * where q1 holds src1 pixels, q2 holds src2 pixels, and
 * d0[0] = w1, d0[2] = w2 (packed by the callers below).
 * Clobbers q2, q3, q8-q15.
 */
.macro  rv40_weight
        vmovl.u8        q8,  d2                 @ widen src1 to u16
        vmovl.u8        q9,  d3
        vmovl.u8        q10, d4                 @ widen src2 to u16
        vmovl.u8        q11, d5
        vmull.u16       q2,  d16, d0[2]         @ src1 * w2
        vmull.u16       q3,  d17, d0[2]
        vmull.u16       q8,  d18, d0[2]
        vmull.u16       q9,  d19, d0[2]
        vmull.u16       q12, d20, d0[0]         @ src2 * w1
        vmull.u16       q13, d21, d0[0]
        vmull.u16       q14, d22, d0[0]
        vmull.u16       q15, d23, d0[0]
        vshrn.i32       d4,  q2,  #9            @ (src1 * w2) >> 9
        vshrn.i32       d5,  q3,  #9
        vshrn.i32       d6,  q8,  #9
        vshrn.i32       d7,  q9,  #9
        vshrn.i32       d16, q12, #9            @ (src2 * w1) >> 9
        vshrn.i32       d17, q13, #9
        vshrn.i32       d18, q14, #9
        vshrn.i32       d19, q15, #9
        vadd.u16        q2,  q2,  q8
        vadd.u16        q3,  q3,  q9
        vrshrn.i16      d2,  q2,  #5            @ round (+16) and narrow to u8
        vrshrn.i16      d3,  q3,  #5
.endm
688
/* void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                    int w1, int w2, int stride) */
function ff_rv40_weight_func_16_neon, export=1
        ldr             r12, [sp]               @ w2 (first stack argument)
        vmov            d0,  r3,  r12           @ d0[0] = w1, d0[2] = w2
        ldr             r12, [sp, #4]           @ r12 = stride
        mov             r3,  #16                @ 16 rows, one per iteration
1:
        vld1.8          {q1},     [r1,:128], r12
        vld1.8          {q2},     [r2,:128], r12
        rv40_weight
        vst1.8          {q1},     [r0,:128], r12
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
705
/* void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                   int w1, int w2, int stride) */
function ff_rv40_weight_func_8_neon, export=1
        ldr             r12, [sp]               @ w2 (first stack argument)
        vmov            d0,  r3,  r12           @ d0[0] = w1, d0[2] = w2
        ldr             r12, [sp, #4]           @ r12 = stride
        mov             r3,  #8                 @ 8 rows, two per iteration
1:
        vld1.8          {d2},     [r1,:64], r12
        vld1.8          {d3},     [r1,:64], r12
        vld1.8          {d4},     [r2,:64], r12
        vld1.8          {d5},     [r2,:64], r12
        rv40_weight
        vst1.8          {d2},     [r0,:64], r12
        vst1.8          {d3},     [r0,:64], r12
        subs            r3,  r3,  #2
        bne             1b
        bx              lr
endfunc
725
/* RV40 loop-filter strength decision for a horizontal edge.
 * r0 = src (at the edge), r1 = stride, r2 = beta, r3 = beta2;
 * stack args appear to be: [sp] = edge flag, [sp+4]/[sp+8] = int *p1 /
 * int *q1 result pointers -- NOTE(review): argument semantics inferred
 * from usage here, confirm against the C prototype.
 * Returns a non-zero value in r0 when the strong-filter condition
 * holds (only evaluated when the edge flag is set).
 */
function ff_rv40_h_loop_filter_strength_neon
        pkhbt           r2,  r3,  r2,  lsl #18  @ pack thresholds: low16 = beta2, high16 = beta << 2

        @ fast path: if the 4 pixels at src match the row above, no
        @ filtering is needed -- store zero strengths and return
        ldr             r3,  [r0]
        ldr_dpre        r12, r0,  r1
        teq             r3,  r12
        beq             1f

        sub             r0,  r0,  r1,  lsl #1   @ rewind to row -3

        vld1.32         {d4[]},   [r0,:32], r1  @ -3
        vld1.32         {d0[]},   [r0,:32], r1  @ -2
        vld1.32         {d4[1]},  [r0,:32], r1  @ -1
        vld1.32         {d5[]},   [r0,:32], r1  @  0
        vld1.32         {d1[]},   [r0,:32], r1  @  1
        vld1.32         {d5[0]},  [r0,:32], r1  @  2

        vpaddl.u8       q8,  q0                 @ -2, -2, -2, -2,  1,  1,  1,  1
        vpaddl.u8       q9,  q2                 @ -3, -3, -1, -1,  2,  2,  0,  0
        vdup.32         d30, r2                 @ beta2, beta << 2
        vpadd.u16       d16, d16, d17           @ -2, -2,  1,  1
        vpadd.u16       d18, d18, d19           @ -3, -1,  2,  0
        vabd.u16        d16, d18, d16           @ |row sums| differences across the edge
        vclt.u16        d16, d16, d30           @ compare against packed thresholds

        ldrd            r2,  r3,  [sp, #4]      @ result pointers
        vmovl.u16       q12, d16
        vtrn.16         d16, d17
        vshr.u32        q12, q12, #15           @ reduce masks to 0/1 per 32-bit lane
        ldr             r0,  [sp]               @ edge flag
        vst1.32         {d24[1]}, [r2,:32]
        vst1.32         {d25[1]}, [r3,:32]

        cmp             r0,  #0
        it              eq
        bxeq            lr                      @ not an edge: skip the strong-filter check

        vand            d18, d16, d17           @ AND all per-pixel conditions together
        vtrn.32         d18, d19
        vand            d18, d18, d19
        vmov.u16        r0,  d18[0]             @ return combined strong-filter flag
        bx              lr
1:
        @ identical rows: zero both strength results
        ldrd            r2,  r3,  [sp, #4]
        mov             r0,  #0
        str             r0,  [r2]
        str             r0,  [r3]
        bx              lr
endfunc
775
/* RV40 loop-filter strength decision for a vertical edge; same
 * argument layout and return convention as the horizontal variant
 * above, but reading 4 rows of 8 pixels starting 3 columns left of
 * the edge instead of 6 rows of 4 pixels. */
function ff_rv40_v_loop_filter_strength_neon
        sub             r0,  r0,  #3            @ start at column -3
        pkhbt           r2,  r3,  r2,  lsl #18  @ pack thresholds: low16 = beta2, high16 = beta << 2

        vld1.8          {d0},     [r0], r1
        vld1.8          {d1},     [r0], r1
        vld1.8          {d2},     [r0], r1
        vld1.8          {d3},     [r0], r1

        vaddl.u8        q0,  d0,  d1            @ sum the 4 rows per column
        vaddl.u8        q1,  d2,  d3
        vdup.32         q15, r2
        vadd.u16        q0,  q0,  q1            @ -3, -2, -1,  0,  1,  2
        vext.16         q1,  q0,  q0,  #1       @ -2, -1,  0,  1,  2
        vabd.u16        q0,  q1,  q0            @ |column sum| differences across the edge
        vclt.u16        q0,  q0,  q15           @ compare against packed thresholds

        ldrd            r2,  r3,  [sp, #4]      @ result pointers
        vmovl.u16       q1,  d0
        vext.16         d1,  d0,  d1,  #3
        vshr.u32        q1,  q1,  #15           @ reduce masks to 0/1 per 32-bit lane
        ldr             r0,  [sp]               @ edge flag
        vst1.32         {d2[1]},  [r2,:32]
        vst1.32         {d3[1]},  [r3,:32]

        cmp             r0,  #0
        it              eq
        bxeq            lr                      @ not an edge: skip the strong-filter check

        vand            d0,  d0,  d1            @ AND all per-column conditions together
        vtrn.16         d0,  d1
        vand            d0,  d0,  d1
        vmov.u16        r0,  d0[0]              @ return combined strong-filter flag
        bx              lr
endfunc
811
@ RV40 "weak" loop-filter core, shared by the horizontal and vertical
@ entry points below.
@ Expects, as set up by the callers:
@   r2, r3      = filter_p1, filter_q1 (0/1 flags — assumed, confirm in caller)
@   [sp]        = alpha, [sp, #4] = beta
@   [sp, #8]    = lim_p0q0
@   [sp, #12]   = lim_q1, [sp, #16] = lim_p1 (per the vdup comments below)
@   d4 = {src[-3], src[-1]}, d5 = {src[2], src[0]}  (4 pixels per half),
@   d0 = src[-2] in both halves, d1 = src[1] in both halves
@ Result: filtered pixels in d4 = {-1, 0} and d5 = {-2, 1}.
@ Clobbers r2, r3, r12 and q0-q3, q8-q15.
.macro  rv40_weak_loop_filter
        vdup.16         d30, r2                 @ filter_p1
        vdup.16         d31, r3                 @ filter_q1
        ldrd            r2,  r3,  [sp]
        vdup.16         d28, r2                 @ alpha
        vdup.16         d29, r3                 @ beta
        ldr             r12, [sp, #8]
        vdup.16         d25, r12                @ lim_p0q0
        ldrd            r2,  r3,  [sp, #12]
        vsubl.u8        q9,  d5,  d4            @ x, t
        vabdl.u8        q8,  d5,  d4            @ x, abs(t)
        vneg.s16        q15, q15                @ 0/1 flags -> 0/-1 (all-ones) lane masks
        vceq.i16        d16, d19, #0            @ !t
        vshl.s16        d19, d19, #2            @ t << 2
        vmul.u16        d18, d17, d28           @ alpha * abs(t)
        vand            d24, d30, d31           @ filter_p1 & filter_q1
        vsubl.u8        q1,  d0,  d4            @ p1p2, p1p0
        vsubl.u8        q3,  d1,  d5            @ q1q2, q1q0
        vmov.i16        d22, #3
        vshr.u16        d18, d18, #7            @ u = (alpha * abs(t)) >> 7
        vadd.i16        d22, d22, d24           @ 3 - (filter_p1 & filter_q1)
        vsubl.u8        q10, d0,  d1            @ src[-2] - src[1]
        vcle.u16        d18, d18, d22           @ u <= 3 - (fp1 & fq1)
        vand            d20, d20, d24           @ keep only when both sides filter
        vneg.s16        d23, d25                @ -lim_p0q0
        vadd.s16        d19, d19, d20
        vbic            d16, d18, d16           @ t && u <= 3 - (fp1 & fq1)
        vtrn.32         d4,  d5                 @ -3,  2, -1,  0
        vrshr.s16       d19, d19, #3            @ rounded: (t*4 + masked term) >> 3
        vmov            d28, d29                @ beta
        vswp            d3,  d6                 @ q1q2, p1p0
        vmin.s16        d19, d19, d25           @ clamp to +lim_p0q0 ...
        vand            d30, d30, d16
        vand            d31, d31, d16
        vadd.s16        q10, q1,  q3            @ p1p2 + p1p0, q1q2 + q1q0
        vmax.s16        d19, d19, d23           @ ... and -lim_p0q0 -> diff
        vabs.s16        q1,  q1                 @ abs(p1p2), abs(q1q2)
        vand            d18, d19, d16           @ diff
        vcle.u16        q1,  q1,  q14           @ within beta?
        vneg.s16        d19, d18                @ -diff
        vdup.16         d26, r3                 @ lim_p1
        vaddw.u8        q2,  q9,  d5            @ src[-1]+diff, src[0]-diff
        vhsub.s16       q11, q10, q9            @ halving sub: p1/q1 correction terms
        vand            q1,  q1,  q15           @ gate by the per-side filter masks
        vqmovun.s16     d4,  q2                 @ -1,  0
        vand            q9,  q11, q1
        vdup.16         d27, r2                 @ lim_q1
        vneg.s16        q9,  q9
        vneg.s16        q14, q13                @ {-lim_p1, -lim_q1}
        vmin.s16        q9,  q9,  q13           @ clamp correction to the p1/q1 limits
        vtrn.32         d0,  d1                 @ -2,  1,  -2,  1
        vmax.s16        q9,  q9,  q14
        vaddw.u8        q3,  q9,  d0            @ apply correction to src[-2], src[1]
        vqmovun.s16     d5,  q3                 @ -2,  1
.endm
867
@ RV40 weak loop filter across a horizontal edge (4 pixels wide).
@ In: r0 = src (first row below the edge), r1 = stride; remaining
@ parameters are consumed from r2/r3 and the stack by the macro above.
function ff_rv40_h_weak_loop_filter_neon, export=1
        sub             r0,  r0,  r1,  lsl #1   @ r0 = src - 3*stride
        sub             r0,  r0,  r1

        @ Interleave the six rows around the edge into the register layout
        @ rv40_weak_loop_filter expects:
        @ d4 = {row -3, row -1}, d5 = {row 2, row 0},
        @ d0 = row -2 (both halves), d1 = row 1 (both halves).
        vld1.32         {d4[]},   [r0,:32], r1
        vld1.32         {d0[]},   [r0,:32], r1
        vld1.32         {d4[1]},  [r0,:32], r1
        vld1.32         {d5[]},   [r0,:32], r1
        vld1.32         {d1[]},   [r0,:32], r1
        vld1.32         {d5[0]},  [r0,:32]

        sub             r0,  r0,  r1,  lsl #2   @ rewind from row 2 to row -2 for the stores

        rv40_weak_loop_filter

        @ Macro output: d4 = filtered {-1, 0}, d5 = filtered {-2, 1};
        @ write back rows -2, -1, 0, 1 in order.
        vst1.32         {d5[0]},  [r0,:32], r1
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        vst1.32         {d5[1]},  [r0,:32], r1

        bx              lr
endfunc
890
@ RV40 weak loop filter across a vertical edge (4 rows high).
@ In: r0 = src (pixel just right of the edge), r1 = stride; remaining
@ parameters are consumed from r2/r3 and the stack by the macro above.
function ff_rv40_v_weak_loop_filter_neon, export=1
        sub             r12, r0,  #3            @ read pointer: columns -3..4 of each row
        sub             r0,  r0,  #2            @ write pointer: columns -2..1

        vld1.8          {d4},     [r12], r1     @ four rows of 8 pixels around the edge
        vld1.8          {d5},     [r12], r1
        vld1.8          {d2},     [r12], r1
        vld1.8          {d3},     [r12], r1

        @ Transpose the 4x8 block so each register holds per-column data in
        @ the same arrangement the h variant's loads produce for the macro.
        vtrn.16         q2,  q1
        vtrn.8          d4,  d5
        vtrn.8          d2,  d3

        vrev64.32       d5,  d5
        vtrn.32         q2,  q1
        vdup.32         d0,  d3[0]              @ column -2 replicated into both halves
        vdup.32         d1,  d2[0]              @ column 1 replicated into both halves

        rv40_weak_loop_filter

        @ Regroup the filtered column groups across d4-d7 so vst4.8 can
        @ emit columns -2..1 as 4 consecutive bytes per row.
        vtrn.32         q2,  q3
        vswp            d4,  d5

        vst4.8          {d4[0],d5[0],d6[0],d7[0]}, [r0], r1
        vst4.8          {d4[1],d5[1],d6[1],d7[1]}, [r0], r1
        vst4.8          {d4[2],d5[2],d6[2],d7[2]}, [r0], r1
        vst4.8          {d4[3],d5[3],d6[3],d7[3]}, [r0], r1

        bx              lr
endfunc
921