1@
2@ ARMv4 optimized DSP utils
3@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
4@
5@ This file is part of Libav.
6@
7@ Libav is free software; you can redistribute it and/or
8@ modify it under the terms of the GNU Lesser General Public
9@ License as published by the Free Software Foundation; either
10@ version 2.1 of the License, or (at your option) any later version.
11@
12@ Libav is distributed in the hope that it will be useful,
13@ but WITHOUT ANY WARRANTY; without even the implied warranty of
14@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15@ Lesser General Public License for more details.
16@
17@ You should have received a copy of the GNU Lesser General Public
18@ License along with Libav; if not, write to the Free Software
19@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20@
21
22#include "config.h"
23#include "asm.S"
24
25        preserve8
26
27#if HAVE_ARMV5TE
28function ff_prefetch_arm, export=1
        @ Issues one cache preload hint per iteration over a strided region.
        @ r0 = address to preload, r1 = byte step added to r0 after each hint,
        @ r2 = iteration count.
        @ NOTE(review): the C prototype is not visible in this file; confirm
        @ the argument meaning against the declaration.
        @ The loop is do-while shaped: r2 is decremented before it is tested,
        @ so r2 must be at least 1 (r2 == 0 wraps around to 0xffffffff).
29        subs            r2,  r2,  #1
30        pld             [r0]
31        add             r0,  r0,  r1
        @ The two instructions above leave the flags alone, so this still tests
        @ the subs result; the loop branches back to the function entry label.
32        bne             ff_prefetch_arm
33        bx              lr
34endfunc
35#else
        @ Without ARMv5TE the preload mnemonic is redefined to the comment
        @ character, which turns every preload line in this file into a comment.
36#define pld @
37#endif
38
39.macro  ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
        @ Combines the five source words Rn0..Rn4 into four result words:
        @   Rd[i] = (Rn[i] >> (shift * 8)) | (Rn[i + 1] << (32 - shift * 8))
        @ i.e. the 16 bytes that start shift bytes into the 20 bytes loaded,
        @ when the lowest byte of each word is the earliest in memory.
        @ The callers in this file pass shift = 1, 2 or 3.
        @ The Rn registers are left unchanged.
        @ All four mov instructions run before the first orr, so the Rd
        @ registers must not overlap any Rn register that is read later.
40        mov             \Rd0, \Rn0, lsr #(\shift * 8)
41        mov             \Rd1, \Rn1, lsr #(\shift * 8)
42        mov             \Rd2, \Rn2, lsr #(\shift * 8)
43        mov             \Rd3, \Rn3, lsr #(\shift * 8)
44        orr             \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
45        orr             \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
46        orr             \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
47        orr             \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
48.endm
49.macro  ALIGN_DWORD shift, R0, R1, R2
        @ In-place variant working on three words:
        @   R0 = (R0 >> (shift * 8)) | (R1 << (32 - shift * 8))
        @   R1 = (R1 >> (shift * 8)) | (R2 << (32 - shift * 8))
        @ R0 and R1 are overwritten with the result, R2 is only read.
        @ The callers in this file pass shift = 1, 2 or 3.
        @ R0 is completed before R1 is shifted, because the first orr still
        @ needs the original value of R1.
50        mov             \R0, \R0, lsr #(\shift * 8)
51        orr             \R0, \R0, \R1, lsl #(32 - \shift * 8)
52        mov             \R1, \R1, lsr #(\shift * 8)
53        orr             \R1, \R1, \R2, lsl #(32 - \shift * 8)
54.endm
55.macro  ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
        @ Same computation as ALIGN_DWORD but with separate destinations:
        @   Rdst0 = (Rsrc0 >> (shift * 8)) | (Rsrc1 << (32 - shift * 8))
        @   Rdst1 = (Rsrc1 >> (shift * 8)) | (Rsrc2 << (32 - shift * 8))
        @ Rsrc0 is read only by the first mov, so Rdst1 may be the same
        @ register as Rsrc0 (RND_XY2_IT relies on this for align == 3).
        @ Rdst0 and Rdst1 must not overlap Rsrc1 or Rsrc2, which are read
        @ after both destinations have been written.
56        mov             \Rdst0, \Rsrc0, lsr #(\shift * 8)
57        mov             \Rdst1, \Rsrc1, lsr #(\shift * 8)
58        orr             \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
59        orr             \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
60.endm
61
62.macro  RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ Byte-wise average with rounding up of two pairs of words, four
        @ bytes per word, without carries crossing byte boundaries.
63        @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
64        @ Rmask = 0xFEFEFEFE
65        @ Rn = destroy
        @ Rn0/Rn1 end up holding Rn | Rm; Rm0/Rm1 and Rmask are only read.
        @ Rd0/Rd1 are written first, so they must differ from all four inputs.
        @ None of the instructions updates the flags.
66        eor             \Rd0, \Rn0, \Rm0
67        eor             \Rd1, \Rn1, \Rm1
68        orr             \Rn0, \Rn0, \Rm0
69        orr             \Rn1, \Rn1, \Rm1
        @ dropping bit 0 of every byte keeps the shift below from moving a bit
        @ into the neighbouring byte
70        and             \Rd0, \Rd0, \Rmask
71        and             \Rd1, \Rd1, \Rmask
72        sub             \Rd0, \Rn0, \Rd0, lsr #1
73        sub             \Rd1, \Rn1, \Rd1, lsr #1
74.endm
75
76.macro  NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ Byte-wise average with rounding down of two pairs of words, four
        @ bytes per word, without carries crossing byte boundaries.
77        @ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1)
78        @ Rmask = 0xFEFEFEFE
79        @ Rn = destroy
        @ Rn0/Rn1 end up holding Rn & Rm; Rm0/Rm1 and Rmask are only read.
        @ Rd0/Rd1 are written first, so they must differ from all four inputs.
        @ None of the instructions updates the flags.
80        eor             \Rd0, \Rn0, \Rm0
81        eor             \Rd1, \Rn1, \Rm1
82        and             \Rn0, \Rn0, \Rm0
83        and             \Rn1, \Rn1, \Rm1
        @ dropping bit 0 of every byte keeps the shift below from moving a bit
        @ into the neighbouring byte
84        and             \Rd0, \Rd0, \Rmask
85        and             \Rd1, \Rd1, \Rmask
86        add             \Rd0, \Rn0, \Rd0, lsr #1
87        add             \Rd1, \Rn1, \Rd1, lsr #1
88.endm
89
90.macro  JMP_ALIGN tmp, reg
        @ Four-way dispatch on the low two bits of reg.
        @ reg is rounded down to a multiple of 4 and control continues at the
        @ next local label 1, 2, 3 or 4 for a remainder of 0, 1, 2 or 3.
        @ The user of the macro must define those four labels after it.
        @ Clobbers tmp and the flags.
91        ands            \tmp, \reg, #3
        @ bic does not update the flags, so the beq below still tests the ands
92        bic             \reg, \reg, #3
93        beq             1f
94        subs            \tmp, \tmp, #1
95        beq             2f
96        subs            \tmp, \tmp, #1
97        beq             3f
98        b    4f
99.endm
100
101@ ----------------------------------------------------------------
102        .align 5
103function ff_put_pixels16_arm, export=1
104        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
105        @ block = word aligned, pixels = unaligned
        @ Copies h rows of 16 bytes from pixels (r1) to block (r0); both
        @ pointers advance by line_size (r2) per row and r3 counts the rows.
        @ Every loop is do-while shaped, so h must be at least 1.
        @ JMP_ALIGN rounds r1 down to a word boundary and selects one of the
        @ four loops below from the low two bits of the original pointer
        @ (r5 is only its scratch register).
        @ The misaligned loops load five words from the rounded-down address,
        @ so they read a few bytes on either side of the 16 that are stored.
        @ r12 is used as scratch without being saved.
106        pld             [r1]
107        push            {r4-r11, lr}
108        JMP_ALIGN       r5,  r1
        @ pixels & 3 == 0: plain four-word copy
1091:
110        ldm             r1,  {r4-r7}
111        add             r1,  r1,  r2
112        stm             r0,  {r4-r7}
113        pld             [r1]
114        subs            r3,  r3,  #1
115        add             r0,  r0,  r2
116        bne             1b
117        pop             {r4-r11, pc}
118        .align 5
        @ pixels & 3 == 1: realign by one byte into r9-r12
1192:
120        ldm             r1,  {r4-r8}
121        add             r1,  r1,  r2
122        ALIGN_QWORD_D   1,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
123        pld             [r1]
124        subs            r3,  r3,  #1
125        stm             r0,  {r9-r12}
126        add             r0,  r0,  r2
127        bne             2b
128        pop             {r4-r11, pc}
129        .align 5
        @ pixels & 3 == 2: realign by two bytes into r9-r12
1303:
131        ldm             r1,  {r4-r8}
132        add             r1,  r1,  r2
133        ALIGN_QWORD_D   2,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
134        pld             [r1]
135        subs            r3,  r3,  #1
136        stm             r0,  {r9-r12}
137        add             r0,  r0,  r2
138        bne             3b
139        pop             {r4-r11, pc}
140        .align 5
        @ pixels & 3 == 3: realign by three bytes into r9-r12
1414:
142        ldm             r1,  {r4-r8}
143        add             r1,  r1,  r2
144        ALIGN_QWORD_D   3,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
145        pld             [r1]
146        subs            r3,  r3,  #1
147        stm             r0,  {r9-r12}
148        add             r0,  r0,  r2
149        bne             4b
150        pop             {r4-r11,pc}
151endfunc
152
153@ ----------------------------------------------------------------
154        .align 5
155function ff_put_pixels8_arm, export=1
156        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
157        @ block = word aligned, pixels = unaligned
        @ Copies h rows of 8 bytes from pixels (r1) to block (r0); both
        @ pointers advance by line_size (r2) per row and r3 counts the rows.
        @ Every loop is do-while shaped, so h must be at least 1.
        @ JMP_ALIGN rounds r1 down to a word boundary and selects one of the
        @ four loops below from the low two bits of the original pointer.
        @ r5 is free to serve as its scratch because each loop reloads it.
        @ The misaligned loops load three words (r4, r5, r12) from the
        @ rounded-down address and shift them in place with ALIGN_DWORD.
        @ r12 is used as scratch without being saved.
158        pld             [r1]
159        push            {r4-r5,lr}
160        JMP_ALIGN       r5,  r1
        @ pixels & 3 == 0: plain two-word copy
1611:
162        ldm             r1,  {r4-r5}
163        add             r1,  r1,  r2
164        subs            r3,  r3,  #1
165        pld             [r1]
166        stm             r0,  {r4-r5}
167        add             r0,  r0,  r2
168        bne             1b
169        pop             {r4-r5,pc}
170        .align 5
        @ pixels & 3 == 1
1712:
172        ldm             r1,  {r4-r5, r12}
173        add             r1,  r1,  r2
174        ALIGN_DWORD     1,   r4,  r5,  r12
175        pld             [r1]
176        subs            r3,  r3,  #1
177        stm             r0,  {r4-r5}
178        add             r0,  r0,  r2
179        bne             2b
180        pop             {r4-r5,pc}
181        .align 5
        @ pixels & 3 == 2
1823:
183        ldm             r1,  {r4-r5, r12}
184        add             r1,  r1,  r2
185        ALIGN_DWORD     2,   r4,  r5,  r12
186        pld             [r1]
187        subs            r3,  r3,  #1
188        stm             r0,  {r4-r5}
189        add             r0,  r0,  r2
190        bne             3b
191        pop             {r4-r5,pc}
192        .align 5
        @ pixels & 3 == 3
1934:
194        ldm             r1,  {r4-r5, r12}
195        add             r1,  r1,  r2
196        ALIGN_DWORD     3,   r4,  r5,  r12
197        pld             [r1]
198        subs            r3,  r3,  #1
199        stm             r0,  {r4-r5}
200        add             r0,  r0,  r2
201        bne             4b
202        pop             {r4-r5,pc}
203endfunc
204
205@ ----------------------------------------------------------------
206        .align 5
207function ff_put_pixels8_x2_arm, export=1
208        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
209        @ block = word aligned, pixels = unaligned
        @ Horizontal half-pel interpolation with rounding up: for each of h
        @ rows, stores 8 bytes where every byte is the RND_AVG32 average of a
        @ source byte and the byte one position after it in the row.
        @ r0 = block, r1 = pixels, r2 = line_size, r3 = row counter.
        @ Every loop is do-while shaped, so h must be at least 1.
        @ Each row loads three words from the word-aligned address, because
        @ nine consecutive source bytes are needed.
        @ r12 holds the 0xfefefefe mask expected by RND_AVG32 and is not saved.
210        pld             [r1]
211        push            {r4-r10,lr}
212        ldr             r12, =0xfefefefe
213        JMP_ALIGN       r5,  r1
        @ pixels & 3 == 0: r4,r5 = row, r6,r7 = row shifted by one byte
2141:
215        ldm             r1,  {r4-r5, r10}
216        add             r1,  r1,  r2
217        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
218        pld             [r1]
219        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
        @ RND_AVG32 does not touch the flags, so subs may sit before the store
220        subs            r3,  r3,  #1
221        stm             r0,  {r8-r9}
222        add             r0,  r0,  r2
223        bne             1b
224        pop             {r4-r10,pc}
225        .align 5
        @ pixels & 3 == 1: r6,r7 = row (shift 1), r8,r9 = row + 1 (shift 2)
2262:
227        ldm             r1,  {r4-r5, r10}
228        add             r1,  r1,  r2
229        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
230        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
231        pld             [r1]
232        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
233        subs            r3,  r3,  #1
234        stm             r0,  {r4-r5}
235        add             r0,  r0,  r2
236        bne             2b
237        pop             {r4-r10,pc}
238        .align 5
        @ pixels & 3 == 2: r6,r7 = row (shift 2), r8,r9 = row + 1 (shift 3)
2393:
240        ldm             r1,  {r4-r5, r10}
241        add             r1,  r1,  r2
242        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
243        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
244        pld             [r1]
245        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
246        subs            r3,  r3,  #1
247        stm             r0,  {r4-r5}
248        add             r0,  r0,  r2
249        bne             3b
250        pop             {r4-r10,pc}
251        .align 5
        @ pixels & 3 == 3: r6,r7 = row (shift 3); row + 1 starts exactly at the
        @ second loaded word, so r5,r10 are used as they are
2524:
253        ldm             r1,  {r4-r5, r10}
254        add             r1,  r1,  r2
255        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
256        pld             [r1]
257        RND_AVG32       r8,  r9,  r6,  r7,  r5,  r10, r12
258        subs            r3,  r3,  #1
259        stm             r0,  {r8-r9}
260        add             r0,  r0,  r2
261        bne             4b
262        pop             {r4-r10,pc}
263endfunc
264
265        .align 5
266function ff_put_no_rnd_pixels8_x2_arm, export=1
267        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
268        @ block = word aligned, pixels = unaligned
        @ Horizontal half-pel interpolation with rounding down: for each of h
        @ rows, stores 8 bytes where every byte is the NO_RND_AVG32 average of
        @ a source byte and the byte one position after it in the row.
        @ r0 = block, r1 = pixels, r2 = line_size, r3 = row counter.
        @ Every loop is do-while shaped, so h must be at least 1.
        @ Each row loads three words from the word-aligned address, because
        @ nine consecutive source bytes are needed.
        @ r12 holds the 0xfefefefe mask expected by NO_RND_AVG32 and is not
        @ saved.
269        pld             [r1]
270        push            {r4-r10,lr}
271        ldr             r12, =0xfefefefe
272        JMP_ALIGN       r5,  r1
        @ pixels & 3 == 0: r4,r5 = row, r6,r7 = row shifted by one byte
2731:
274        ldm             r1,  {r4-r5, r10}
275        add             r1,  r1,  r2
276        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
277        pld             [r1]
278        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12
        @ NO_RND_AVG32 does not touch the flags, so subs may sit before the store
279        subs            r3,  r3,  #1
280        stm             r0,  {r8-r9}
281        add             r0,  r0,  r2
282        bne             1b
283        pop             {r4-r10,pc}
284        .align 5
        @ pixels & 3 == 1: r6,r7 = row (shift 1), r8,r9 = row + 1 (shift 2)
2852:
286        ldm             r1,  {r4-r5, r10}
287        add             r1,  r1,  r2
288        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
289        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
290        pld             [r1]
291        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
292        subs            r3,  r3,  #1
293        stm             r0,  {r4-r5}
294        add             r0,  r0,  r2
295        bne             2b
296        pop             {r4-r10,pc}
297        .align 5
        @ pixels & 3 == 2: r6,r7 = row (shift 2), r8,r9 = row + 1 (shift 3)
2983:
299        ldm             r1,  {r4-r5, r10}
300        add             r1,  r1,  r2
301        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
302        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
303        pld             [r1]
304        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
305        subs            r3,  r3,  #1
306        stm             r0,  {r4-r5}
307        add             r0,  r0,  r2
308        bne             3b
309        pop             {r4-r10,pc}
310        .align 5
        @ pixels & 3 == 3: r6,r7 = row (shift 3); row + 1 starts exactly at the
        @ second loaded word, so r5,r10 are used as they are
3114:
312        ldm             r1,  {r4-r5, r10}
313        add             r1,  r1,  r2
314        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
315        pld             [r1]
316        NO_RND_AVG32    r8,  r9,  r6,  r7,  r5,  r10, r12
317        subs            r3,  r3,  #1
318        stm             r0,  {r8-r9}
319        add             r0,  r0,  r2
320        bne             4b
321        pop             {r4-r10,pc}
322endfunc
323
324
325@ ----------------------------------------------------------------
326        .align 5
327function ff_put_pixels8_y2_arm, export=1
328        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
329        @ block = word aligned, pixels = unaligned
        @ Vertical half-pel interpolation with rounding up: every output row
        @ of 8 bytes is the RND_AVG32 average of a source row and the source
        @ row below it, so h + 1 source rows are read.
        @ r0 = block, r1 = pixels, r2 = line_size.
        @ Each loop iteration (local label 6) produces two output rows and
        @ keeps the most recently loaded row in registers for the next pass.
        @ r3 is therefore halved up front: h must be even and at least 2.
        @ r12 holds the 0xfefefefe mask expected by RND_AVG32 and is not saved.
330        pld             [r1]
331        push            {r4-r11,lr}
332        mov             r3,  r3,  lsr #1
333        ldr             r12, =0xfefefefe
334        JMP_ALIGN       r5,  r1
        @ pixels & 3 == 0: rows alternate between r4,r5 and r6,r7
3351:
336        ldm             r1,  {r4-r5}
337        add             r1,  r1,  r2
3386:      ldm             r1,  {r6-r7}
339        add             r1,  r1,  r2
340        pld             [r1]
        @ destroys r4,r5 (the older row) and keeps r6,r7 for the next average
341        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
342        ldm             r1,  {r4-r5}
343        add             r1,  r1,  r2
344        stm             r0,  {r8-r9}
345        add             r0,  r0,  r2
346        pld             [r1]
        @ destroys r6,r7 and keeps r4,r5 as the top row of the next iteration
347        RND_AVG32       r8,  r9,  r6,  r7,  r4,  r5,  r12
348        subs            r3,  r3,  #1
349        stm             r0,  {r8-r9}
350        add             r0,  r0,  r2
351        bne             6b
352        pop             {r4-r11,pc}
353        .align 5
        @ pixels & 3 == 1: rows are realigned in place into r4,r5 and r7,r8
3542:
355        ldm             r1,  {r4-r6}
356        add             r1,  r1,  r2
357        pld             [r1]
358        ALIGN_DWORD     1,   r4,  r5,  r6
3596:      ldm             r1,  {r7-r9}
360        add             r1,  r1,  r2
361        pld             [r1]
362        ALIGN_DWORD     1,   r7,  r8,  r9
363        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
364        stm             r0,  {r10-r11}
365        add             r0,  r0,  r2
366        ldm             r1,  {r4-r6}
367        add             r1,  r1,  r2
368        pld             [r1]
369        ALIGN_DWORD     1,   r4,  r5,  r6
        @ nothing between this subs and the bne below updates the flags
370        subs            r3,  r3,  #1
371        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
372        stm             r0,  {r10-r11}
373        add             r0,  r0,  r2
374        bne             6b
375        pop             {r4-r11,pc}
376        .align 5
        @ pixels & 3 == 2
3773:
378        ldm             r1,  {r4-r6}
379        add             r1,  r1,  r2
380        pld             [r1]
381        ALIGN_DWORD     2,   r4,  r5,  r6
3826:      ldm             r1,  {r7-r9}
383        add             r1,  r1,  r2
384        pld             [r1]
385        ALIGN_DWORD     2,   r7,  r8,  r9
386        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
387        stm             r0,  {r10-r11}
388        add             r0,  r0,  r2
389        ldm             r1,  {r4-r6}
390        add             r1,  r1,  r2
391        pld             [r1]
392        ALIGN_DWORD     2,   r4,  r5,  r6
393        subs            r3,  r3,  #1
394        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
395        stm             r0,  {r10-r11}
396        add             r0,  r0,  r2
397        bne             6b
398        pop             {r4-r11,pc}
399        .align 5
        @ pixels & 3 == 3
4004:
401        ldm             r1,  {r4-r6}
402        add             r1,  r1,  r2
403        pld             [r1]
404        ALIGN_DWORD     3,   r4,  r5,  r6
4056:      ldm             r1,  {r7-r9}
406        add             r1,  r1,  r2
407        pld             [r1]
408        ALIGN_DWORD     3,   r7,  r8,  r9
409        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
410        stm             r0,  {r10-r11}
411        add             r0,  r0,  r2
412        ldm             r1,  {r4-r6}
413        add             r1,  r1,  r2
414        pld             [r1]
415        ALIGN_DWORD     3,   r4,  r5,  r6
416        subs            r3,  r3,  #1
417        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
418        stm             r0,  {r10-r11}
419        add             r0,  r0,  r2
420        bne             6b
421        pop             {r4-r11,pc}
422endfunc
423
424        .align 5
425function ff_put_no_rnd_pixels8_y2_arm, export=1
426        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
427        @ block = word aligned, pixels = unaligned
        @ Vertical half-pel interpolation with rounding down: every output row
        @ of 8 bytes is the NO_RND_AVG32 average of a source row and the
        @ source row below it, so h + 1 source rows are read.
        @ r0 = block, r1 = pixels, r2 = line_size.
        @ Each loop iteration (local label 6) produces two output rows and
        @ keeps the most recently loaded row in registers for the next pass.
        @ r3 is therefore halved up front: h must be even and at least 2.
        @ r12 holds the 0xfefefefe mask expected by NO_RND_AVG32 and is not
        @ saved.
428        pld             [r1]
429        push            {r4-r11,lr}
430        mov             r3,  r3,  lsr #1
431        ldr             r12, =0xfefefefe
432        JMP_ALIGN       r5,  r1
        @ pixels & 3 == 0: rows alternate between r4,r5 and r6,r7
4331:
434        ldm             r1,  {r4-r5}
435        add             r1,  r1,  r2
4366:      ldm             r1,  {r6-r7}
437        add             r1,  r1,  r2
438        pld             [r1]
        @ destroys r4,r5 (the older row) and keeps r6,r7 for the next average
439        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12
440        ldm             r1,  {r4-r5}
441        add             r1,  r1,  r2
442        stm             r0,  {r8-r9}
443        add             r0,  r0,  r2
444        pld             [r1]
        @ destroys r6,r7 and keeps r4,r5 as the top row of the next iteration
445        NO_RND_AVG32    r8,  r9,  r6,  r7,  r4,  r5,  r12
446        subs            r3,  r3,  #1
447        stm             r0,  {r8-r9}
448        add             r0,  r0,  r2
449        bne             6b
450        pop             {r4-r11,pc}
451        .align 5
        @ pixels & 3 == 1: rows are realigned in place into r4,r5 and r7,r8
4522:
453        ldm             r1,  {r4-r6}
454        add             r1,  r1,  r2
455        pld             [r1]
456        ALIGN_DWORD     1,   r4,  r5,  r6
4576:      ldm             r1,  {r7-r9}
458        add             r1,  r1,  r2
459        pld             [r1]
460        ALIGN_DWORD     1,   r7,  r8,  r9
461        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
462        stm             r0,  {r10-r11}
463        add             r0,  r0,  r2
464        ldm             r1,  {r4-r6}
465        add             r1,  r1,  r2
466        pld             [r1]
467        ALIGN_DWORD     1,   r4,  r5,  r6
        @ nothing between this subs and the bne below updates the flags
468        subs            r3,  r3,  #1
469        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
470        stm             r0,  {r10-r11}
471        add             r0,  r0,  r2
472        bne             6b
473        pop             {r4-r11,pc}
474        .align 5
        @ pixels & 3 == 2
4753:
476        ldm             r1,  {r4-r6}
477        add             r1,  r1,  r2
478        pld             [r1]
479        ALIGN_DWORD     2,   r4,  r5,  r6
4806:      ldm             r1,  {r7-r9}
481        add             r1,  r1,  r2
482        pld             [r1]
483        ALIGN_DWORD     2,   r7,  r8,  r9
484        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
485        stm             r0,  {r10-r11}
486        add             r0,  r0,  r2
487        ldm             r1,  {r4-r6}
488        add             r1,  r1,  r2
489        pld             [r1]
490        ALIGN_DWORD     2,   r4,  r5,  r6
491        subs            r3,  r3,  #1
492        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
493        stm             r0,  {r10-r11}
494        add             r0,  r0,  r2
495        bne             6b
496        pop             {r4-r11,pc}
497        .align 5
        @ pixels & 3 == 3
4984:
499        ldm             r1,  {r4-r6}
500        add             r1,  r1,  r2
501        pld             [r1]
502        ALIGN_DWORD     3,   r4,  r5,  r6
5036:      ldm             r1,  {r7-r9}
504        add             r1,  r1,  r2
505        pld             [r1]
506        ALIGN_DWORD     3,   r7,  r8,  r9
507        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
508        stm             r0,  {r10-r11}
509        add             r0,  r0,  r2
510        ldm             r1,  {r4-r6}
511        add             r1,  r1,  r2
512        pld             [r1]
513        ALIGN_DWORD     3,   r4,  r5,  r6
514        subs            r3,  r3,  #1
515        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
516        stm             r0,  {r10-r11}
517        add             r0,  r0,  r2
518        bne             6b
519        pop             {r4-r11,pc}
520endfunc
521
522        .ltorg
523
524@ ----------------------------------------------------------------
525.macro  RND_XY2_IT align, rnd
        @ Processes one source row for the xy2 (two-dimensional half-pel)
        @ filters. With a and b being the 8-byte row at two byte offsets one
        @ apart, it leaves
        @   r8,  r9  = l1, the sums of the low two bits of every byte
        @   r10, r11 = h1, the sums of the upper six bits of every byte
        @ align is the byte offset (0..3) of the row inside the words at r1.
        @ rnd is a shift mnemonic (the callers pass lsl or lsr) that selects
        @ the rounding constant, see below.
        @ Side effects: r1 advances by r2, r3 is decremented and the flags
        @ reflect that decrement on exit; r4-r12 and r14 are clobbered.
526        @ l1=  (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
527        @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
528.if \align == 0
529        ldm             r1,  {r6-r8}
530.elseif \align == 3
531        ldm             r1,  {r5-r7}
532.else
533        ldm             r1,  {r8-r10}
534.endif
535        add             r1,  r1,  r2
536        pld             [r1]
        @ After this section r4,r5 and r6,r7 hold the two shifted copies of the
        @ row. For align 0 and 3 one copy needs no shifting and is simply the
        @ pair of words that ldm already placed in r6,r7.
537.if \align == 0
538        ALIGN_DWORD_D   1,   r4,  r5,  r6,  r7,  r8
539.elseif \align == 1
540        ALIGN_DWORD_D   1,   r4,  r5,  r8,  r9,  r10
541        ALIGN_DWORD_D   2,   r6,  r7,  r8,  r9,  r10
542.elseif \align == 2
543        ALIGN_DWORD_D   2,   r4,  r5,  r8,  r9,  r10
544        ALIGN_DWORD_D   3,   r6,  r7,  r8,  r9,  r10
545.elseif \align == 3
546        ALIGN_DWORD_D   3,   r4,  r5,  r5,  r6,  r7
547.endif
548        ldr             r14, =0x03030303
        @ Z is set when r3 is even; the conditional instructions below apply
        @ the rounding constant only then, i.e. on every second row
549        tst             r3,  #1
550        and             r8,  r4,  r14
551        and             r9,  r5,  r14
552        and             r10, r6,  r14
553        and             r11, r7,  r14
        @ rnd = lsl turns r14 into 0x02020202, rnd = lsr into 0x01010101
554        it              eq
555        andeq           r14, r14, r14, \rnd #1
556        add             r8,  r8,  r10
557        add             r9,  r9,  r11
        @ mask 0x3f3f3f3f, applied after the shift right by two
558        ldr             r12, =0xfcfcfcfc >> 2
559        itt             eq
560        addeq           r8,  r8,  r14
561        addeq           r9,  r9,  r14
562        and             r4,  r12, r4,  lsr #2
563        and             r5,  r12, r5,  lsr #2
564        and             r6,  r12, r6,  lsr #2
565        and             r7,  r12, r7,  lsr #2
566        add             r10, r4,  r6
567        add             r11, r5,  r7
        @ row counter; the flags set here are consumed by RND_XY2_EXPAND
568        subs            r3,  r3,  #1
569.endm
570
571.macro RND_XY2_EXPAND align, rnd
        @ Complete row loop of the xy2 filters for one source alignment.
        @ The first RND_XY2_IT primes r8-r11 with the partial sums of the top
        @ row. Each pass of loop 6 then computes the next row, combines it with
        @ the saved sums of the previous row and stores 8 output bytes at r0.
        @ The sums of the newest row stay in r8-r11 for the following pass.
        @ The rounding constant is added inside RND_XY2_IT on every second
        @ row, so each pair of adjacent rows receives it exactly once.
        @ The macro ends with the function return and expects the caller to
        @ have pushed {r4-r11,lr} beforehand.
572        RND_XY2_IT      \align, \rnd
        @ park the previous row sums on the stack, RND_XY2_IT clobbers r4-r11
5736:      push            {r8-r11}
574        RND_XY2_IT      \align, \rnd
        @ previous row: low sums to r4,r5 and high sums to r6,r7
575        pop             {r4-r7}
576        add             r4,  r4,  r8
577        add             r5,  r5,  r9
578        ldr             r14, =0x0f0f0f0f
579        add             r6,  r6,  r10
580        add             r7,  r7,  r11
        @ result = high sums + (low sums >> 2), computed per byte
581        and             r4,  r14, r4,  lsr #2
582        and             r5,  r14, r5,  lsr #2
583        add             r4,  r4,  r6
584        add             r5,  r5,  r7
585        stm             r0,  {r4-r5}
586        add             r0,  r0,  r2
        @ tests the subs that ends RND_XY2_IT; nothing in between updates the
        @ flags. The loop runs until r3 turns negative.
587        bge             6b
588        pop             {r4-r11,pc}
589.endm
590
591        .align 5
592function ff_put_pixels8_xy2_arm, export=1
593        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
594        @ block = word aligned, pixels = unaligned
        @ Two-dimensional half-pel interpolation: every output byte combines
        @ four source bytes, two horizontal neighbours from each of two
        @ adjacent rows, as (sum + 2) >> 2.
        @ r0 = block, r1 = pixels, r2 = line_size, r3 = h.
        @ JMP_ALIGN picks the RND_XY2_EXPAND instance matching pixels & 3.
        @ Each instance contains its own row loop and the function return.
        @ The lsl argument selects the rounding constant 0x02020202.
        @ r12 and r14 are used as scratch by the macros.
595        pld             [r1]
596        push            {r4-r11,lr} @ R14 is also called LR
597        JMP_ALIGN       r5,  r1
5981:      RND_XY2_EXPAND  0, lsl
599        .align 5
6002:      RND_XY2_EXPAND  1, lsl
601        .align 5
6023:      RND_XY2_EXPAND  2, lsl
603        .align 5
6044:      RND_XY2_EXPAND  3, lsl
605endfunc
606
607        .align 5
608function ff_put_no_rnd_pixels8_xy2_arm, export=1
609        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
610        @ block = word aligned, pixels = unaligned
        @ Two-dimensional half-pel interpolation, no-rounding variant: every
        @ output byte combines four source bytes, two horizontal neighbours
        @ from each of two adjacent rows, as (sum + 1) >> 2.
        @ r0 = block, r1 = pixels, r2 = line_size, r3 = h.
        @ JMP_ALIGN picks the RND_XY2_EXPAND instance matching pixels & 3.
        @ Each instance contains its own row loop and the function return.
        @ The lsr argument selects the rounding constant 0x01010101.
        @ r12 and r14 (the link register, saved by the push below) are used
        @ as scratch by the macros.
611        pld             [r1]
612        push            {r4-r11,lr}
613        JMP_ALIGN       r5,  r1
6141:      RND_XY2_EXPAND  0, lsr
615        .align 5
6162:      RND_XY2_EXPAND  1, lsr
617        .align 5
6183:      RND_XY2_EXPAND  2, lsr
619        .align 5
6204:      RND_XY2_EXPAND  3, lsr
621endfunc
622
623        .align 5
624@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
625function ff_add_pixels_clamped_arm, export=1
        /* Adds 8 rows of 8 signed 16-bit coefficients to 8-bit pixels in
         * place and clamps each result to 0..255.
         * r0 = block, advanced by 16 bytes per row
         * r1 = dest, advanced by r2 = stride per row
         * r10 = row counter, starts at 8
         * Pixels are read and written as two 32-bit words per row.
         * NOTE(review): presumably dest must be 4-byte aligned - confirm.
         *
         * Clamping: sum = coefficient + pixel. When bit 8 of the sum is set
         * the byte is replaced by (~coefficient) >> 24 (logical shift), which
         * is 255 for a non-negative coefficient and 0 for a negative one.
         * NOTE(review): only bit 8 is tested, so a sum outside -256..511 is
         * not detected and its upper bits would be merged into neighbouring
         * bytes - confirm that callers guarantee this coefficient range.
         *
         * The bracketed tags [A]..[F] mark where a load or update logically
         * belongs; the instruction itself was moved earlier for scheduling. */
626        push            {r4-r10}
627        mov             r10, #8
6281:
629        ldr             r4,  [r1]               /* load dest */
630        /* block[0] and block[1]*/
631        ldrsh           r5,  [r0]
632        ldrsh           r7,  [r0, #2]
633        and             r6,  r4,  #0xFF
634        and             r8,  r4,  #0xFF00
635        add             r6,  r5,  r6
636        add             r8,  r7,  r8,  lsr #8
        /* inverted coefficients supply the saturation value, see header */
637        mvn             r5,  r5
638        mvn             r7,  r7
639        tst             r6,  #0x100
640        it              ne
641        movne           r6,  r5,  lsr #24
642        tst             r8,  #0x100
643        it              ne
644        movne           r8,  r7,  lsr #24
        /* r9 collects the four result bytes of the current word */
645        mov             r9,  r6
646        ldrsh           r5,  [r0, #4]           /* moved from [A] */
647        orr             r9,  r9,  r8,  lsl #8
648        /* block[2] and block[3] */
649        /* [A] */
650        ldrsh           r7,  [r0, #6]
651        and             r6,  r4,  #0xFF0000
652        and             r8,  r4,  #0xFF000000
653        add             r6,  r5,  r6,  lsr #16
654        add             r8,  r7,  r8,  lsr #24
655        mvn             r5,  r5
656        mvn             r7,  r7
657        tst             r6,  #0x100
658        it              ne
659        movne           r6,  r5,  lsr #24
660        tst             r8,  #0x100
661        it              ne
662        movne           r8,  r7,  lsr #24
663        orr             r9,  r9,  r6,  lsl #16
664        ldr             r4,  [r1, #4]           /* moved from [B] */
665        orr             r9,  r9,  r8,  lsl #24
666        /* store dest */
667        ldrsh           r5,  [r0, #8]           /* moved from [C] */
668        str             r9,  [r1]
669
670        /* load dest */
671        /* [B] */
672        /* block[4] and block[5] */
673        /* [C] */
674        ldrsh           r7,  [r0, #10]
675        and             r6,  r4,  #0xFF
676        and             r8,  r4,  #0xFF00
677        add             r6,  r5,  r6
678        add             r8,  r7,  r8,  lsr #8
679        mvn             r5,  r5
680        mvn             r7,  r7
681        tst             r6,  #0x100
682        it              ne
683        movne           r6,  r5,  lsr #24
684        tst             r8,  #0x100
685        it              ne
686        movne           r8,  r7,  lsr #24
687        mov             r9,  r6
688        ldrsh           r5,  [r0, #12]          /* moved from [D] */
689        orr             r9,  r9,  r8,  lsl #8
690        /* block[6] and block[7] */
691        /* [D] */
692        ldrsh           r7,  [r0, #14]
693        and             r6,  r4,  #0xFF0000
694        and             r8,  r4,  #0xFF000000
695        add             r6,  r5,  r6,  lsr #16
696        add             r8,  r7,  r8,  lsr #24
697        mvn             r5,  r5
698        mvn             r7,  r7
699        tst             r6,  #0x100
700        it              ne
701        movne           r6,  r5,  lsr #24
702        tst             r8,  #0x100
703        it              ne
704        movne           r8,  r7,  lsr #24
705        orr             r9,  r9,  r6,  lsl #16
706        add             r0,  r0,  #16           /* moved from [E] */
707        orr             r9,  r9,  r8,  lsl #24
        /* str and add below leave the flags alone, so bne tests this subs */
708        subs            r10, r10, #1            /* moved from [F] */
709        /* store dest */
710        str             r9,  [r1, #4]
711
712        /* [E] */
713        /* [F] */
714        add             r1,  r1,  r2
715        bne             1b
716
717        pop             {r4-r10}
718        bx              lr
719endfunc
720