/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

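@ Inverse 4x4 Walsh-Hadamard transform of the luma DC coefficients:
@   r0 -> output blocks: the DC slot of each of the 16 luma blocks,
@         32 bytes (one 4x4 int16 block) apart
@   r1 -> input 4x4 DC block, cleared to zero on return
@ Roughly the following scalar butterfly per pass (cf. vp8_luma_dc_wht_c
@ in libavcodec/vp8dsp.c):
@   t0 = in[0] + in[3]          out[0] = t0 + t1
@   t1 = in[1] + in[2]          out[1] = t3 + t2
@   t2 = in[1] - in[2]          out[2] = t0 - t1
@   t3 = in[0] - in[3]          out[3] = t3 - t2
@ The second pass adds 3 to in[0] first (biasing both t0 and t3) and
@ shifts each output right by 3.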
function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        vmov.i16        q15, #0

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vst1.16         {q15},    [r1,:128]!
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vst1.16         {q15},    [r1,:128]
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vmov.i16        q8, #3

        vtrn.32         d0,  d2
        vtrn.32         d1,  d3
        vtrn.16         d0,  d1
        vtrn.16         d2,  d3

        vadd.i16        d0,  d0,  d16

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #3
        vshr.s16        q1,  q1,  #3

        mov             r3,  #32
        vst1.16         {d0[0]},  [r0,:16], r3
        vst1.16         {d1[0]},  [r0,:16], r3
        vst1.16         {d2[0]},  [r0,:16], r3
        vst1.16         {d3[0]},  [r0,:16], r3
        vst1.16         {d0[1]},  [r0,:16], r3
        vst1.16         {d1[1]},  [r0,:16], r3
        vst1.16         {d2[1]},  [r0,:16], r3
        vst1.16         {d3[1]},  [r0,:16], r3
        vst1.16         {d0[2]},  [r0,:16], r3
        vst1.16         {d1[2]},  [r0,:16], r3
        vst1.16         {d2[2]},  [r0,:16], r3
        vst1.16         {d3[2]},  [r0,:16], r3
        vst1.16         {d0[3]},  [r0,:16], r3
        vst1.16         {d1[3]},  [r0,:16], r3
        vst1.16         {d2[3]},  [r0,:16], r3
        vst1.16         {d3[3]},  [r0,:16], r3

        bx              lr
endfunc

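@ 4x4 IDCT of the block at r1 (cleared on the way), added to the pixels
@ at r0 with stride r2.  The two fixed multipliers live in d4:
@   20091/65536 ~= sqrt(2)*cos(pi/8) - 1, applied as x + ((x*20091) >> 16)
@   35468/65536 ~= sqrt(2)*sin(pi/8), applied with vqdmulh, which computes
@   (2*a*b) >> 16, hence the constant is loaded pre-halved (#35468/2)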
function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        movw            r3,  #20091
        movt            r3,  #35468/2
        vdup.32         d4,  r3

        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0,  d2
        vsub.s16        d17, d0,  d2
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vadd.s16        q0,  q8,  q9
        vsub.s16        q1,  q8,  q9

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vmov.i16        q15, #0
        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0,  d3
        vsub.i16        d17, d0,  d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2
        vadd.s16        q0,  q8,  q9
        vld1.32         {d23[]},  [r0,:32], r2
        vsub.s16        q1,  q8,  q9
        vld1.32         {d21[]},  [r0,:32], r2
        vrshr.s16       q0,  q0,  #3
        vtrn.32         q10, q11
        vrshr.s16       q1,  q1,  #3

        sub             r0,  r0,  r2,  lsl #2

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vaddw.u8        q0,  q0,  d20
        vaddw.u8        q1,  q1,  d21
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc

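@ DC-only special case: the single coefficient at r1 is rounded to
@ (dc + 4) >> 3 and added to all 16 pixels of the block at r0.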
function ff_vp8_idct_dc_add_neon, export=1
        mov             r3,  #0
        ldrsh           r12, [r1]
        strh            r3,  [r1]
        vdup.16         q1,  r12
        vrshr.s16       q1,  q1,  #3
        vld1.32         {d0[]},   [r0,:32], r2
        vld1.32         {d1[]},   [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0
        vaddw.u8        q3,  q1,  d1
        sub             r0,  r0,  r2, lsl #2
        vqmovun.s16     d0,  q2
        vqmovun.s16     d1,  q3
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc

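@ DC-only IDCT of four chroma blocks arranged 2x2 in an 8x8 pixel area.
@ d16..d19 collect the four DC values (32 bytes apart in the coefficient
@ buffer); q8 covers the top pair of blocks, q9 the bottom pair.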
function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        mov             r3,  r0
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},     [r0,:64], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {d1},     [r0,:64], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {d2},     [r0,:64], r2
        vaddw.u8        q0,  q8,  d1
        vld1.8          {d3},     [r0,:64], r2
        vaddw.u8        q11, q8,  d2
        vld1.8          {d4},     [r0,:64], r2
        vaddw.u8        q1,  q8,  d3
        vld1.8          {d5},     [r0,:64], r2
        vaddw.u8        q12, q9,  d4
        vld1.8          {d6},     [r0,:64], r2
        vaddw.u8        q2,  q9,  d5
        vld1.8          {d7},     [r0,:64], r2
        vaddw.u8        q13, q9,  d6
        vqmovun.s16     d20, q10
        vaddw.u8        q3,  q9,  d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20},    [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21},    [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22},    [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23},    [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24},    [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25},    [r3,:64], r2
        vst1.8          {d26},    [r3,:64], r2
        vst1.8          {d27},    [r3,:64], r2

        bx              lr
endfunc

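@ As above, for four luma blocks lying side by side in a 16-pixel row:
@ q8 holds the DCs of blocks 0-1, q9 those of blocks 2-3.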
function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},     [r0,:128], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {q1},     [r0,:128], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {q2},     [r0,:128], r2
        vaddw.u8        q0,  q9,  d1
        vld1.8          {q3},     [r0,:128], r2
        vaddw.u8        q11, q8,  d2
        vaddw.u8        q1,  q9,  d3
        vaddw.u8        q12, q8,  d4
        vaddw.u8        q2,  q9,  d5
        vaddw.u8        q13, q8,  d6
        vaddw.u8        q3,  q9,  d7
        sub             r0,  r0,  r2,  lsl #2
        vqmovun.s16     d20, q10
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10},    [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11},    [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12},    [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13},    [r0,:128], r2

        bx              lr
endfunc

@ Register layout:
@   P3..Q3 -> q0..q7
@   flim_E -> q14
@   flim_I -> q15
@   hev_thresh -> r12
@
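@ On exit, the filtered pixels are back in unsigned form in q2..q5
@ (P1, P0, Q0, Q1), plus q1/q6 (P2/Q2) for the full filter; q0 (P3)
@ and q7 (Q3) are preserved.
@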
.macro  vp8_loop_filter, inner=0, simple=0
    .if \simple
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8,  q8,  q9
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
        vand            q8,  q8,  q11
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
        vand            q8,  q8,  q10
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vand            q8,  q8,  q10
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vand            q8,  q8,  q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8,  q8,  q11
        vmov.i8         q13, #0x80
        vorr            q9,  q12, q14
    .endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
        vsubl.s8        q11, d9,  d7            @   (widened to 16bit)
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
    .if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
    .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
    .if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
    .endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);

    .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp(w+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp(w+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9,  #63
        vshll.s8        q14, d20, #3
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q8,  q9,  q14
        vadd.s16        q9,  q9,  q15           @  9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
    .endif
.endm

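@ Filter a horizontal edge, 16 pixels wide:
@   r0 -> dst (first row below the edge), r1 = stride,
@   r2 = flim_E, r3 = flim_I, [sp] = hev_thresh
@ The _simple variant takes only the first three arguments and touches
@ just two rows on each side of the edge.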
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r1,  lsl #1+!\simple

        @ Load pixels:
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
        vld1.8          {q0},     [r0,:128], r1 @ P3
        vld1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vld1.8          {q2},     [r0,:128], r1 @ P1
        vld1.8          {q3},     [r0,:128], r1 @ P0
        vld1.8          {q4},     [r0,:128], r1 @ Q0
        vld1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vld1.8          {q6},     [r0,:128], r1 @ Q2
        vld1.8          {q7},     [r0,:128]     @ Q3
        vdup.8          q15, r3                 @ flim_I
    .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0,  r0,  r1,  lsl #2
    .if !\simple
        sub             r0,  r0,  r1,  lsl #1

        @ Store pixels:
        vst1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vst1.8          {q2},     [r0,:128], r1 @ P1
        vst1.8          {q3},     [r0,:128], r1 @ P0
        vst1.8          {q4},     [r0,:128], r1 @ Q0
        vst1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vst1.8          {q6},     [r0,:128]     @ Q2
    .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

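@ Same edge filter applied to both chroma planes at once:
@   r0 -> dst U, r1 -> dst V, r2 = stride, r3 = flim_E,
@   [sp] = flim_I, [sp+4] = hev_thresh
@ Each q register carries a U row in its low half and the corresponding
@ V row in its high half, so the 16-wide filter core is reused as is.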
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0,:64], r2  @ P3
        vld1.8          {d1},     [r1,:64], r2  @ P3
        vld1.8          {d2},     [r0,:64], r2  @ P2
        vld1.8          {d3},     [r1,:64], r2  @ P2
        vld1.8          {d4},     [r0,:64], r2  @ P1
        vld1.8          {d5},     [r1,:64], r2  @ P1
        vld1.8          {d6},     [r0,:64], r2  @ P0
        vld1.8          {d7},     [r1,:64], r2  @ P0
        vld1.8          {d8},     [r0,:64], r2  @ Q0
        vld1.8          {d9},     [r1,:64], r2  @ Q0
        vld1.8          {d10},    [r0,:64], r2  @ Q1
        vld1.8          {d11},    [r1,:64], r2  @ Q1
        vld1.8          {d12},    [r0,:64], r2  @ Q2
        vld1.8          {d13},    [r1,:64], r2  @ Q2
        vld1.8          {d14},    [r0,:64]      @ Q3
        vld1.8          {d15},    [r1,:64]      @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        @ Store pixels:
        vst1.8          {d2},     [r0,:64], r2  @ P2
        vst1.8          {d3},     [r1,:64], r2  @ P2
        vst1.8          {d4},     [r0,:64], r2  @ P1
        vst1.8          {d5},     [r1,:64], r2  @ P1
        vst1.8          {d6},     [r0,:64], r2  @ P0
        vst1.8          {d7},     [r1,:64], r2  @ P0
        vst1.8          {d8},     [r0,:64], r2  @ Q0
        vst1.8          {d9},     [r1,:64], r2  @ Q0
        vst1.8          {d10},    [r0,:64], r2  @ Q1
        vst1.8          {d11},    [r1,:64], r2  @ Q1
        vst1.8          {d12},    [r0,:64]      @ Q2
        vst1.8          {d13},    [r1,:64]      @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

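@ Filter a vertical edge, 16 rows high: 16 rows of 8 pixels (dst-4 to
@ dst+3) are loaded, transposed so that q0..q7 hold the pixel columns
@ P3..Q3, run through the common filter core, then transposed back and
@ stored.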
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
    .endif

        @ Load pixels:
        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
        vld1.8          {d2},     [r0], r1
        vld1.8          {d4},     [r0], r1
        vld1.8          {d6},     [r0], r1
        vld1.8          {d8},     [r0], r1
        vld1.8          {d10},    [r0], r1
        vld1.8          {d12},    [r0], r1
        vld1.8          {d14},    [r0], r1
        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
        vld1.8          {d3},     [r0], r1
        vld1.8          {d5},     [r0], r1
        vld1.8          {d7},     [r0], r1
        vld1.8          {d9},     [r0], r1
        vld1.8          {d11},    [r0], r1
        vld1.8          {d13},    [r0], r1
        vld1.8          {d15},    [r0], r1

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        vdup.8          q14, r2                 @ flim_E
    .if !\simple
        vdup.8          q15, r3                 @ flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1, lsl #4    @ backup 16 rows

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        @ Store pixels:
        vst1.8          {d0},     [r0],     r1
        vst1.8          {d2},     [r0],     r1
        vst1.8          {d4},     [r0],     r1
        vst1.8          {d6},     [r0],     r1
        vst1.8          {d8},     [r0],     r1
        vst1.8          {d10},    [r0],     r1
        vst1.8          {d12},    [r0],     r1
        vst1.8          {d14},    [r0],     r1
        vst1.8          {d1},     [r0],     r1
        vst1.8          {d3},     [r0],     r1
        vst1.8          {d5},     [r0],     r1
        vst1.8          {d7},     [r0],     r1
        vst1.8          {d9},     [r0],     r1
        vst1.8          {d11},    [r0],     r1
        vst1.8          {d13},    [r0],     r1
        vst1.8          {d15},    [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

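@ Vertical-edge filter for the chroma planes: same transposed layout as
@ above, with the eight U rows in d0,d2,..,d14 and the eight V rows in
@ d1,d3,..,d15.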
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
        sub             r1,  r1,  #4
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0], r2      @ load u
        vld1.8          {d1},     [r1], r2      @ load v
        vld1.8          {d2},     [r0], r2
        vld1.8          {d3},     [r1], r2
        vld1.8          {d4},     [r0], r2
        vld1.8          {d5},     [r1], r2
        vld1.8          {d6},     [r0], r2
        vld1.8          {d7},     [r1], r2
        vld1.8          {d8},     [r0], r2
        vld1.8          {d9},     [r1], r2
        vld1.8          {d10},    [r0], r2
        vld1.8          {d11},    [r1], r2
        vld1.8          {d12},    [r0], r2
        vld1.8          {d13},    [r1], r2
        vld1.8          {d14},    [r0], r2
        vld1.8          {d15},    [r1], r2

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2, lsl #3    @ backup u 8 rows
        sub             r1,  r1,  r2, lsl #3    @ backup v 8 rows

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        @ Store pixels:
        vst1.8          {d0},     [r0], r2
        vst1.8          {d1},     [r1], r2
        vst1.8          {d2},     [r0], r2
        vst1.8          {d3},     [r1], r2
        vst1.8          {d4},     [r0], r2
        vst1.8          {d5},     [r1], r2
        vst1.8          {d6},     [r0], r2
        vst1.8          {d7},     [r1], r2
        vst1.8          {d8},     [r0], r2
        vst1.8          {d9},     [r1], r2
        vst1.8          {d10},    [r0], r2
        vst1.8          {d11},    [r1], r2
        vst1.8          {d12},    [r0], r2
        vst1.8          {d13},    [r1], r2
        vst1.8          {d14},    [r0]
        vst1.8          {d15},    [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1

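@ Full-pel copy: r0 = dst, r1 = dststride, r2 = src, r3 = srcstride,
@ [sp] = h, which is assumed to be a multiple of 4 here.  src may be
@ unaligned; dst is 16-byte aligned (8-byte for the 8-wide version).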
function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0},     [r2], r3
        vld1.8          {q1},     [r2], r3
        vld1.8          {q2},     [r2], r3
        vld1.8          {q3},     [r2], r3
        vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q1},     [r0,:128], r1
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b
        bx              lr
endfunc

function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0},     [r2], r3
        vld1.8          {d1},     [r2], r3
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r0,:64], r1
        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        bgt             1b
        bx              lr
endfunc

/* 4/6-tap 8th-pel MC */

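@ All put_vp8_epel* functions below take (dst, dststride, src, srcstride)
@ in r0-r3 and (h, mx, my) on the stack.  mx/my (1..7) index the
@ subpel_filters table of six 16-bit taps per phase, 16 bytes per row
@ (hence the subpel_filters-16 base and the "lsl #4" scaling).  Taps 1
@ and 4 are stored as magnitudes and subtracted (vmls); the others are
@ added, so each output pixel is, in effect,
@   clamp((f0*s[-2] - f1*s[-1] + f2*s[0] + f3*s[1] - f4*s[2] + f5*s[3]
@          + 64) >> 7)
@ with the two partial sums combined by a saturating add and the final
@ rounding/shift/clamp done by vqrshrun.s16 #7.  The 4-tap variants use
@ taps 1..4 of the same rows, with the source offset by -1 instead of -2.
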
.macro  vp8_epel8_h6    d,   a,   b
        vext.8          d27, \a,  \b,  #1
        vmovl.u8        q8,  \a
        vext.8          d28, \a,  \b,  #2
        vmovl.u8        q9,  d27
        vext.8          d29, \a,  \b,  #3
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #4
        vmovl.u8        q11, d29
        vext.8          d31, \a,  \b,  #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm

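@ 16-pixel-wide variant of vp8_epel8_h6, producing one row as two
@ 8-pixel halves (\d0/\d1) from the sources in \q0-\q1.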
.macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2,  q0,  q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3,  \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8,  \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3,  d7
        vext.8          q2,  \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2,  d5
        vmovl.u8        q9,  d16
        vmovl.u8        q8,  d17
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmovl.u8        q12, \s0
        vmovl.u8        q1,  \s1
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_v6    d0,  s0,  s1,  s2,  s3,  s4,  s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d0, q11, #7
.endm

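@ As vp8_epel8_v6, but computing two output rows per invocation:
@ \d0 from source rows s0..s5, \d1 from s1..s6.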
.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8,  d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8,  d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9,  d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_h4    d,   a,   b
        vext.8          d28, \a,  \b,  #1
        vmovl.u8        q9,  \a
        vext.8          d29, \a,  \b,  #2
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm

.macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4
        vmovl.u8        q9,  \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8,  q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8,  q9,  d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8,  q8,  q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7
        vqrshrun.s16    \d1, q11, #7
.endm

function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d4},  [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

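@ Two-pass 6-tap MC: the first pass filters h+5 rows horizontally into a
@ 16-byte-aligned scratch buffer on the stack (21 rows * 16 bytes = 336
@ for the 16-wide case, plus 16 for alignment), the second pass filters
@ that buffer vertically into dst.  The narrower and 4-tap variants below
@ follow the same scheme with smaller buffers (a 4-tap second pass only
@ needs h+3 rows).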
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4,  [sp, #28]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #336+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3}, [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #336+16+32]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d9},  [lr,:128]!
        vld1.8          {d28-d31},[lr,:128]
        sub             lr,  lr,  #48

        vp8_epel8_v6    d2, d2, d4, d6, d8, d28, d30
        vp8_epel8_v6    d3, d3, d5, d7, d9, d29, d31

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp,  sp,  #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},  [r2], r3
        vld1.8          {d3},  [r2], r3
        vld1.8          {d4},  [r2], r3
        vld1.8          {d5},  [r2], r3
        vld1.8          {d6},  [r2], r3
        vld1.8          {d7},  [r2], r3
        vld1.8          {d28}, [r2]

        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2,  r2,  r3
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        vld1.8          {d6},     [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

.ltorg

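@ The 4-pixel-wide versions pack two rows into each d register (lanes [0]
@ and [1]) so that the 8-wide filter macros again produce two rows at a
@ time; the two-pass variants reload the 4-byte scratch rows in staggered
@ pairs and vtrn them back into that layout.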
1254function ff_put_vp8_epel4_v6_neon, export=1
1255        sub             r2,  r2,  r3,  lsl #1
1256        push            {r4,lr}
1257
1258        ldr             r4,  [sp, #16]          @ my
1259        movrel          lr,  subpel_filters-16
1260        ldr             r12, [sp, #8]           @ h
1261        add             r4,  lr,  r4, lsl #4
1262        vld1.16         {q0},     [r4,:128]
12631:
1264        vld1.32         {d2[]},   [r2], r3
1265        vld1.32         {d3[]},   [r2], r3
1266        vld1.32         {d4[]},   [r2], r3
1267        vld1.32         {d5[]},   [r2], r3
1268        vld1.32         {d6[]},   [r2], r3
1269        vld1.32         {d7[]},   [r2], r3
1270        vld1.32         {d28[]},  [r2]
1271        sub             r2,  r2,  r3,  lsl #2
1272        vld1.32         {d2[1]},  [r2], r3
1273        vld1.32         {d3[1]},  [r2], r3
1274        vld1.32         {d4[1]},  [r2], r3
1275        vld1.32         {d5[1]},  [r2], r3
1276        vld1.32         {d6[1]},  [r2], r3
1277        vld1.32         {d7[1]},  [r2], r3
1278        vld1.32         {d28[1]}, [r2]
1279        sub             r2,  r2,  r3,  lsl #2
1280
1281        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28
1282
1283        vst1.32         {d2[0]},  [r0,:32], r1
1284        vst1.32         {d3[0]},  [r0,:32], r1
1285        vst1.32         {d2[1]},  [r0,:32], r1
1286        vst1.32         {d3[1]},  [r0,:32], r1
1287        subs            r12, r12, #4
1288        bne             1b
1289
1290        pop             {r4,pc}
1291endfunc
1292
1293function ff_put_vp8_epel4_h6_neon, export=1
1294        sub             r2,  r2,  #2
1295        push            {r4,lr}
1296
1297        ldr             r4,  [sp, #12]          @ mx
1298        movrel          lr,  subpel_filters-16
1299        ldr             r12, [sp, #8]           @ h
1300        add             r4,  lr,  r4, lsl #4
1301        vld1.16         {q0},     [r4,:128]
13021:
1303        vld1.8          {q1},     [r2], r3
1304        vp8_epel8_h6    d2,  d2,  d3
1305        vst1.32         {d2[0]},  [r0,:32], r1
1306        subs            r12, r12, #1
1307        bne             1b
1308
1309        pop             {r4,pc}
1310endfunc
1311
1312function ff_put_vp8_epel4_h6v6_neon, export=1
1313        sub             r2,  r2,  r3,  lsl #1
1314        sub             r2,  r2,  #2
1315        push            {r4,lr}
1316
1317        ldr             r4,  [sp, #12]          @ mx
1318        movrel          lr,  subpel_filters-16
1319        ldr             r12, [sp, #8]           @ h
1320        add             r4,  lr,  r4, lsl #4
1321        sub             sp,  sp,  #52+16
1322        vld1.16         {q0},     [r4,:128]
1323        add             lr,  sp,  #15
1324        add             r12, r12, #5
1325        bic             lr,  lr,  #15
13261:
1327        vld1.8          {q1},     [r2], r3
1328        vp8_epel8_h6    d2,  d2,  d3
1329        vst1.32         {d2[0]},  [lr,:32]!
1330        subs            r12, r12, #1
1331        bne             1b
1332
1333        ldr             r4,  [sp, #52+16+16]    @ my
1334        movrel          lr,  subpel_filters-16
1335        ldr             r12, [sp, #52+16+8]     @ h
1336        add             r4,  lr,  r4, lsl #4
1337        add             lr,  sp,  #15
1338        vld1.16         {q0},     [r4,:128]
1339        bic             lr,  lr,  #15
13402:
1341        vld1.8          {d2-d3},  [lr,:128]!
1342        vld1.8          {d6},     [lr,:64]!
1343        vld1.32         {d28[]},  [lr,:32]
1344        sub             lr,  lr,  #16
1345        vld1.8          {d4-d5},  [lr]!
1346        vld1.8          {d7},     [lr,:64]!
1347        vld1.32         {d28[1]}, [lr,:32]
1348        sub             lr,  lr,  #16
1349        vtrn.32         q1,  q2
1350        vtrn.32         d6,  d7
1351        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
1352        vst1.32         {d2[0]},  [r0,:32], r1
1353        vst1.32         {d3[0]},  [r0,:32], r1
1354        vst1.32         {d2[1]},  [r0,:32], r1
1355        vst1.32         {d3[1]},  [r0,:32], r1
1356        subs            r12, r12, #4
1357        bne             2b
1358
1359        add             sp,  sp,  #52+16
1360        pop             {r4,pc}
1361endfunc
1362
1363function ff_put_vp8_epel4_h4v6_neon, export=1
1364        sub             r2,  r2,  r3,  lsl #1
1365        sub             r2,  r2,  #1
1366        push            {r4,lr}
1367
1368        ldr             r4,  [sp, #12]          @ mx
1369        movrel          lr,  subpel_filters-16
1370        ldr             r12, [sp, #8]           @ h
1371        add             r4,  lr,  r4, lsl #4
1372        sub             sp,  sp,  #52+16
1373        vld1.16         {q0},     [r4,:128]
1374        add             lr,  sp,  #15
1375        add             r12, r12, #5
1376        bic             lr,  lr,  #15
13771:
1378        vld1.8          {d2},     [r2], r3
1379        vp8_epel8_h4    d2,  d2,  d2
1380        vst1.32         {d2[0]},  [lr,:32]!
1381        subs            r12, r12, #1
1382        bne             1b
1383
1384        ldr             r4,  [sp, #52+16+16]    @ my
1385        movrel          lr,  subpel_filters-16
1386        ldr             r12, [sp, #52+16+8]     @ h
1387        add             r4,  lr,  r4, lsl #4
1388        add             lr,  sp,  #15
1389        vld1.16         {q0},     [r4,:128]
1390        bic             lr,  lr,  #15
13912:
1392        vld1.8          {d2-d3},  [lr,:128]!
1393        vld1.8          {d6},     [lr,:64]!
1394        vld1.32         {d28[]},  [lr,:32]
1395        sub             lr,  lr,  #16
1396        vld1.8          {d4-d5},  [lr]!
1397        vld1.8          {d7},     [lr,:64]!
1398        vld1.32         {d28[1]}, [lr,:32]
1399        sub             lr,  lr,  #16
1400        vtrn.32         q1,  q2
1401        vtrn.32         d6,  d7
1402        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
1403        vst1.32         {d2[0]},  [r0,:32], r1
1404        vst1.32         {d3[0]},  [r0,:32], r1
1405        vst1.32         {d2[1]},  [r0,:32], r1
1406        vst1.32         {d3[1]},  [r0,:32], r1
1407        subs            r12, r12, #4
1408        bne             2b
1409
1410        add             sp,  sp,  #52+16
1411        pop             {r4,pc}
1412endfunc
1413
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

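@ The vertical-only 4-pixel filter packs two 4-pixel rows per d register
@ (lane 1 holding the row two source lines below lane 0), so a single
@ vp8_epel8_v4_y2 invocation yields four output rows per loop iteration.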
function ff_put_vp8_epel4_v4_neon, export=1
        sub             r2,  r2,  r3
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2]
        sub             r2,  r2,  r3,  lsl #1
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
@ arithmetic can be used to apply filters
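@ (each filter's signed taps sum to 128, and 128 * 255 = 32640 = 0x7f80)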
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst

/* Bilinear MC */
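/* Each bilinear output pixel is a two-tap blend with 3-bit fractional
 * weights: (a * (8 - f) + b * f + 4) >> 3.  The functions below load the
 * weight pair with vdup/rsb, accumulate with vmull.u8/vmlal.u8 and
 * narrow with a rounding shift (vrshrn #3, which supplies the +4).  A
 * scalar C sketch of the horizontal case, for one row:
 *
 *     static void bilin_h_row(uint8_t *dst, const uint8_t *src,
 *                             int width, int mx)    // 0 <= mx < 8
 *     {
 *         for (int x = 0; x < width; x++)
 *             dst[x] = (src[x] * (8 - mx) + src[x + 1] * mx + 4) >> 3;
 *     }
 *
 * bilin_h_row() is an illustrative name, not an FFmpeg function; the
 * vertical case blends src[x] with the pixel one stride below instead.
 */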

function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2-d4},  [r2], r3
        vext.8          q2,  q1,  q2,  #1
        vmull.u8        q8,  d2,  d1
        vmlal.u8        q8,  d4,  d0
        vld1.8          {d18-d20},[r2], r3
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d5,  d0
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q3,  #3
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {q1},     [r2], r3
1:
        subs            r12, r12, #2
        vld1.8          {q2},     [r2], r3
        vmull.u8        q3,  d2,  d1
        vmlal.u8        q3,  d4,  d0
        vmull.u8        q8,  d3,  d1
        vmlal.u8        q8,  d5,  d0
        vld1.8          {q1},     [r2], r3
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d2,  d0
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

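@ For the hv case the first row is filtered horizontally before the loop
@ and kept live (q2 here) so that each iteration only has to filter two
@ new rows horizontally before blending pairs of rows vertically.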
function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2,  r12
        rsb             r12, r12, #8
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4-d6},  [r2], r3
        vext.8          q3,  q2,  q3,  #1
        vmull.u8        q8,  d4,  d1
        vmlal.u8        q8,  d6,  d0
        vmull.u8        q9,  d5,  d1
        vmlal.u8        q9,  d7,  d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d18-d20},[r2], r3
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r3
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vmull.u8        q12, d4,  d3
        vmlal.u8        q12, d6,  d2
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
        vmull.u8        q10, d6,  d3
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12},    [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10},    [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {q1},     [r2], r3
        vext.8          d3,  d2,  d3,  #1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {q3},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d16},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {d2},     [r2], r3
1:
        subs            r12, r12, #2
        vld1.8          {d3},     [r2], r3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {d2},     [r2], r3
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d2,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d6},     [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2,  r12
        rsb             r12, r12, #8
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {q2},     [r2], r3
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {q3},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vld1.8          {q2},     [r2], r3
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d22, d3
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9,  #3
        vmull.u8        q12, d16, d3
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20},    [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

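@ The 4-pixel wide bilinear functions operate on two rows at a time,
@ packed into single d registers, so each vmull.u8/vmlal.u8 pair works
@ on both rows at once.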
function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2},     [r2], r3
        vext.8          d3,  d2,  d3,  #1
        vld1.8          {d6},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         q1,  q3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.32         {d2[]},   [r2], r3
1:
        vld1.32         {d3[]},   [r2]
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vtrn.32         d3,  d2
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        subs            r12, r12, #2
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2,  r12
        rsb             r12, r12, #8
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4},     [r2], r3
        vext.8          d5,  d4,  d4,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d6},     [r2], r3
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4},     [r2], r3
        vext.8          d5,  d4,  d4,  #1
        vtrn.32         q3,  q2
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d16, d2
        vtrn.32         d22, d16
        vmlal.u8        q10, d22, d3
        vrev64.32       d22, d16
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc
