/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
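/*
 * Bilinear chroma interpolation: with subpel offsets x, y in [0, 8), each
 * output pixel is a weighted sum of a 2x2 source neighbourhood,
 *     dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + bias) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y.  For H.264 the
 * bias is 32 (rounding shift); RV40 takes a per-position bias from the
 * rv40bias table and VC-1 uses a constant 28, both applied with a
 * truncating shift.
 */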
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

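        /* On entry r4 = x, r5 = y.  Compute the weights:
         * r4 = A = (8-x)*(8-y), r12 = B = x*(8-y),
         * r6 = C = (8-x)*y,     r7  = D = x*y. */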
A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4, d5}, [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7
        vext.8          d5,  d4,  d5,  #1

1:      vld1.8          {d6, d7}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vext.8          d7,  d6,  d7,  #1
        vld1.8          {d4, d5}, [r1], r2
        vmlal.u8        q8,  d6,  d2
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

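        /* D == 0, i.e. x == 0 or y == 0, so only a 2-tap filter (or a
         * plain copy) is needed: 3: x == 0 (vertical), 4: y == 0
         * (horizontal), 5: x == y == 0 (copy). */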
2:      adds            r12, r12, r6
        vdup.8          d0,  r4
        beq             5f
        tst             r6,  r6
        vdup.8          d1,  r12

        beq             4f

        vld1.8          {d4}, [r1], r2

3:      vld1.8          {d6}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.8          {d4}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
        pld             [r1, r2]
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        subs            r3,  r3,  #2
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             4b

        pop             {r4-r7, pc}

5:      vld1.8          {d4}, [r1], r2
        vld1.8          {d5}, [r1], r2
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d5,  d0
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
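/*
 * Same filter for 4-pixel-wide blocks.  Two output rows are produced per
 * iteration; each source row and its 1-pixel-shifted copy are packed into
 * one d register with vext/vtrn.32, so a single vmull/vmlal pair covers all
 * four taps, and the two 16-bit halves are folded with vadd.i16.
 */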
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4},     [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      vld1.8          {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.8          {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        pld             [r1]
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      adds            r12, r12, r6
        vdup.8          d0,  r4
        beq             5f
        tst             r6,  r6
        vdup.8          d1,  r12
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        vld1.32         {d4[0]},  [r1], r2

3:      vld1.32         {d4[1]},  [r1], r2
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r2
        vmull.u8        q9,  d4,  d1
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4},     [r1], r2
        vld1.8          {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             4b

        pop             {r4-r7, pc}

5:      vld1.32         {d4[0]},  [r1], r2
        vld1.32         {d4[1]},  [r1], r2
        vmull.u8        q8,  d4,  d0
        subs            r3,  r3,  #2
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

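/* chroma_mc2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
 * H.264 only.  Both output rows of a pair are accumulated in q8; when
 * x == y == 0 the block degenerates to a straight copy (put) or a
 * rounding average with the destination (avg). */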
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f

        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1
1:
        vld1.32         {d4[0]},  [r1], r2
        vld1.32         {d4[1]},  [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]},  [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b
        pop             {r4-r6, pc}
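        /* x == 0 && y == 0: plain copy (put) or rounding average with
         * the destination (avg), two rows per iteration. */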
2:
  .ifc \type,put
        ldrh_post       r5,  r1,  r2
        strh_post       r5,  r0,  r2
        ldrh_post       r6,  r1,  r2
        strh_post       r6,  r0,  r2
  .else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
  .endif
        subs            r3,  r3,  #2
        bgt             2b
        pop             {r4-r6, pc}
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
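/* RV40 rounding bias, 16-bit entries indexed by ((y >> 1) << 2) | (x >> 1). */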
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1_DECODER
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif
