/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
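/*
 * For orientation, a rough C sketch of what the 2-D path of this macro
 * computes (hypothetical helper, not part of the codebase; assumes
 * <stdint.h>): each output pixel is a bilinear blend of four neighbouring
 * source pixels with eighth-pel weights derived from x and y.  "bias" is
 * the rounding term: 32 with a rounding narrow (vrshrn) for H.264, or an
 * entry of the rv40bias table for RV40.
 *
 *     static void chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
 *                                int stride, int h, int x, int y, int bias)
 *     {
 *         const int A = (8 - x) * (8 - y), B = x * (8 - y),
 *                   C = (8 - x) * y,       D = x * y;
 *         for (int i = 0; i < h; i++) {
 *             for (int j = 0; j < 8; j++)
 *                 dst[j] = (A * src[j]          + B * src[j + 1] +
 *                           C * src[j + stride] + D * src[j + stride + 1] +
 *                           bias) >> 6;
 *             dst += stride;
 *             src += stride;
 *         }
 *     }
 */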
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif

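        /* Bilinear weights derived from x (r4) and y (r5):
         *   r7  = x * y              (D)
         *   r6  = (8 - x) * y        (C)
         *   r12 = x * (8 - y)        (B)
         *   r4  = (8 - x) * (8 - y)  (A)
         * If the product x * y is zero, at least one direction needs no
         * interpolation and the 1-D paths after label 2: are used. */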
A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  r12
        vld1.8          {d4, d5}, [r1], r4
        vdup.8          d2,  r6
        vld1.8          {d6, d7}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

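        /* x != 0 && y != 0: full 2-D filter, two rows per iteration. */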
1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.8          {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vld1.8          {d6, d7}, [r5], r4
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vext.8          d7,  d6,  d7,  #1
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

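        /* x * y == 0: a 1-D filter is enough.  Fold B and C into a single
         * second tap weight in d1 (at least one of them is zero). */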
2:      tst             r6,  r6
        add             r12, r12, r6
        vdup.8          d0,  r4
        vdup.8          d1,  r12

        beq             4f

        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.8          {d4}, [r1], r4
        vld1.8          {d6}, [r5], r4

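        /* Vertical-only filter (x == 0): blend each row with the one
         * below it, two rows per iteration. */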
3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.8          {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.8          {d6}, [r5], r4
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

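        /* Horizontal-only filter (y == 0); this also covers the plain
         * copy case x == y == 0, where the weights are 64 and 0. */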
4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.8          {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
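/*
 * Same weighting scheme as chroma_mc8 above, but rows are only four pixels
 * wide: each row and its one-pixel-shifted copy are packed into a single
 * d register (vtrn.32), with the A/B and C/D weights interleaved to match,
 * so one widening multiply covers two taps; the two halves of the result
 * are added together before the final shift.
 */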
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif

A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  r12
        vld1.8          {d4},     [r1], r4
        vdup.8          d2,  r6
        vld1.8          {d6},     [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.8          {d4},     [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.8          {d6},     [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
        subs            r3,  r3,  #2
        pld             [r1]
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             r12, r12, r6
        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]},  [r1], r4
        vld1.32         {d4[1]},  [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]},  [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4},     [r1], r2
        vld1.8          {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.8          {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vld1.8          {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

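/* chroma_mc2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */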
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f

        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1
1:
        vld1.32         {d4[0]},  [r1], r2
        vld1.32         {d4[1]},  [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]},  [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b
        pop             {r4-r6, pc}
2:
  .ifc \type,put
        ldrh_post       r5,  r1,  r2
        strh_post       r5,  r0,  r2
        ldrh_post       r6,  r1,  r2
        strh_post       r6,  r0,  r2
  .else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
  .endif
        subs            r3,  r3,  #2
        bgt             2b
        pop             {r4-r6, pc}
endfunc
.endm

#if CONFIG_H264_DECODER
        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg
#endif

#if CONFIG_RV40_DECODER
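/* RV40 rounding bias, indexed as rv40bias[y >> 1][x >> 1]: the lookup in
 * the macros above adds (y >> 1) * 8 + (x >> 1) * 2 bytes to the table
 * address, i.e. four rows of four 16-bit entries. */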
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif
