/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

#define W1  22725   /* cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W2  21407   /* cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W3  19266   /* cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W4  16383   /* cos(4*M_PI/16)*sqrt(2)*(1<<14) - 1 = 16384 - 1 */
#define W5  12873   /* cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W6  8867    /* cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W7  4520    /* cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define ROW_SHIFT 11
#define COL_SHIFT 20
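
/*
 * Scaling note: each W constant carries a 2^14 fixed-point factor and each of
 * the two 1-D passes multiplies by one W, so the data gains 2^28 overall.
 * Shifting right by ROW_SHIFT + COL_SHIFT = 31 bits leaves a net factor of
 * 2^-3, which is exactly the 1/8 normalization of the 8x8 inverse DCT.
 */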

#define W13 (W1 | (W3 << 16))
#define W26 (W2 | (W6 << 16))
#define W57 (W5 | (W7 << 16))
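
/*
 * The packed words above let the ARMv5TE 16x16 multiplies (smulbb, smulbt,
 * smultb, smultt) pick either cosine factor out of a single register load.
 * As a hedged C sketch of the instruction semantics (the helper name is
 * illustrative, not an FFmpeg API), "smultb rd, rn, rm" computes:
 *
 *     #include <stdint.h>
 *
 *     static inline int32_t smultb(uint32_t rn, uint32_t rm)
 *     {
 *         int16_t top = (int16_t)(rn >> 16);     // e.g. W6 out of W26
 *         int16_t bot = (int16_t)(rm & 0xffff);  // e.g. row[2] out of row[3:2]
 *         return (int32_t)top * bot;             // full 32-bit product
 *     }
 *
 * so with ip = W26 and a4 = row[3:2], "smultb a2, ip, a4" yields W6*row[2].
 */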

function idct_row_armv5te
        str    lr, [sp, #-4]!

        ldrd   v1, v2, [a1, #8]
        ldrd   a3, a4, [a1]          /* a3 = row[1:0], a4 = row[3:2] */
        orrs   v1, v1, v2
        itt    eq
        cmpeq  v1, a4
        cmpeq  v1, a3, lsr #16
        beq    row_dc_only

        mov    v1, #(1<<(ROW_SHIFT-1))
        mov    ip, #16384
        sub    ip, ip, #1            /* ip = W4 */
        smlabb v1, ip, a3, v1        /* v1 = W4*row[0]+(1<<(RS-1)) */
        ldr    ip, =W26              /* ip = W2 | (W6 << 16) */
        smultb a2, ip, a4            /* a2 = W6*row[2] */
        smulbb lr, ip, a4            /* lr = W2*row[2] */
        add    v2, v1, a2
        sub    v3, v1, a2
        sub    v4, v1, lr
        add    v1, v1, lr

        /* odd part: v5 = b0, v7 = b2, fp = b3, and v6 = -b1 */
        ldr    ip, =W13              /* ip = W1 | (W3 << 16) */
        ldr    lr, =W57              /* lr = W5 | (W7 << 16) */
        smulbt v5, ip, a3
        smultt v6, lr, a4
        smlatt v5, ip, a4, v5
        smultt a2, ip, a3
        smulbt v7, lr, a3
        sub    v6, v6, a2
        smulbt a2, ip, a4
        smultt fp, lr, a3
        sub    v7, v7, a2
        smulbt a2, lr, a4
        ldrd   a3, a4, [a1, #8]      /* a3 = row[5:4], a4 = row[7:6] */
        sub    fp, fp, a2

        orrs   a2, a3, a4
        beq    1f

        smlabt v5, lr, a3, v5
        smlabt v6, ip, a3, v6
        smlatt v5, lr, a4, v5
        smlabt v6, lr, a4, v6
        smlatt v7, lr, a3, v7
        smlatt fp, ip, a3, fp
        smulbt a2, ip, a4
        smlatt v7, ip, a4, v7
        sub    fp, fp, a2

        ldr    ip, =W26              /* ip = W2 | (W6 << 16) */
        mov    a2, #16384
        sub    a2, a2, #1            /* a2 =  W4 */
        smulbb a2, a2, a3            /* a2 =  W4*row[4] */
        smultb lr, ip, a4            /* lr =  W6*row[6] */
        add    v1, v1, a2            /* v1 += W4*row[4] */
        add    v1, v1, lr            /* v1 += W6*row[6] */
        add    v4, v4, a2            /* v4 += W4*row[4] */
        sub    v4, v4, lr            /* v4 -= W6*row[6] */
        smulbb lr, ip, a4            /* lr =  W2*row[6] */
        sub    v2, v2, a2            /* v2 -= W4*row[4] */
        sub    v2, v2, lr            /* v2 -= W2*row[6] */
        sub    v3, v3, a2            /* v3 -= W4*row[4] */
        add    v3, v3, lr            /* v3 += W2*row[6] */

1:      add    a2, v1, v5
        mov    a3, a2, lsr #11
        bic    a3, a3, #0x1f0000
        sub    a2, v2, v6
        mov    a2, a2, lsr #11
        add    a3, a3, a2, lsl #16
        add    a2, v3, v7
        mov    a4, a2, lsr #11
        bic    a4, a4, #0x1f0000
        add    a2, v4, fp
        mov    a2, a2, lsr #11
        add    a4, a4, a2, lsl #16
        strd   a3, a4, [a1]

        sub    a2, v4, fp
        mov    a3, a2, lsr #11
        bic    a3, a3, #0x1f0000
        sub    a2, v3, v7
        mov    a2, a2, lsr #11
        add    a3, a3, a2, lsl #16
        add    a2, v2, v6
        mov    a4, a2, lsr #11
        bic    a4, a4, #0x1f0000
        sub    a2, v1, v5
        mov    a2, a2, lsr #11
        add    a4, a4, a2, lsl #16
        strd   a3, a4, [a1, #8]

        ldr    pc, [sp], #4

row_dc_only:
        /* all AC terms are zero: every output is row[0] << 3; the bic keeps
         * the low halfword's shift from spilling into the high halfword */
        orr    a3, a3, a3, lsl #16
        bic    a3, a3, #0xe000
        mov    a3, a3, lsl #3
        mov    a4, a3
        strd   a3, a4, [a1]
        strd   a3, a4, [a1, #8]

        ldr    pc, [sp], #4
endfunc
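
/*
 * For reference, the row pass above computes the following, here as a hedged
 * C sketch (names are illustrative; the real C counterpart lives in FFmpeg's
 * simple_idct template, and the asm additionally packs two 16-bit results
 * per store):
 *
 *     static void idct_row_ref(int16_t row[8])
 *     {
 *         int a0, a1, a2, a3, b0, b1, b2, b3;
 *         a0 = a1 = a2 = a3 = W4 * row[0] + (1 << (ROW_SHIFT - 1));
 *         a0 +=  W2 * row[2] + W4 * row[4] + W6 * row[6];
 *         a1 +=  W6 * row[2] - W4 * row[4] - W2 * row[6];
 *         a2 += -W6 * row[2] - W4 * row[4] + W2 * row[6];
 *         a3 += -W2 * row[2] + W4 * row[4] - W6 * row[6];
 *         b0 =   W1 * row[1] + W3 * row[3] + W5 * row[5] + W7 * row[7];
 *         b1 =   W3 * row[1] - W7 * row[3] - W1 * row[5] - W5 * row[7];
 *         b2 =   W5 * row[1] - W1 * row[3] + W7 * row[5] + W3 * row[7];
 *         b3 =   W7 * row[1] - W5 * row[3] + W3 * row[5] - W1 * row[7];
 *         row[0] = (a0 + b0) >> ROW_SHIFT;  row[7] = (a0 - b0) >> ROW_SHIFT;
 *         row[1] = (a1 + b1) >> ROW_SHIFT;  row[6] = (a1 - b1) >> ROW_SHIFT;
 *         row[2] = (a2 + b2) >> ROW_SHIFT;  row[5] = (a2 - b2) >> ROW_SHIFT;
 *         row[3] = (a3 + b3) >> ROW_SHIFT;  row[4] = (a3 - b3) >> ROW_SHIFT;
 *     }
 */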

        .macro idct_col
        ldr    a4, [a1]              /* a4 = col[1:0] */
        mov    ip, #16384
        sub    ip, ip, #1            /* ip = W4 */
#if 0
        mov    v1, #(1<<(COL_SHIFT-1))
        smlabt v2, ip, a4, v1        /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */
        smlabb v1, ip, a4, v1        /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */
        ldr    a4, [a1, #(16*4)]
#else
        mov    v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
        add    v2, v1, a4, asr #16
        rsb    v2, v2, v2, lsl #14
        mov    a4, a4, lsl #16
        add    v1, v1, a4, asr #16
        ldr    a4, [a1, #(16*4)]
        rsb    v1, v1, v1, lsl #14
#endif
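
        /* The #else path above folds the rounding bias into the multiplicand:
         * it seeds the sum with (1 << (COL_SHIFT - 1)) / W4, adds the input,
         * and then uses "rsb vN, vN, vN, lsl #14" to form
         * vN * ((1 << 14) - 1) = vN * W4 without a multiply instruction. */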

        smulbb lr, ip, a4
        smulbt a3, ip, a4
        sub    v3, v1, lr
        sub    v5, v1, lr
        add    v7, v1, lr
        add    v1, v1, lr
        sub    v4, v2, a3
        sub    v6, v2, a3
        add    fp, v2, a3
        ldr    ip, =W26
        ldr    a4, [a1, #(16*2)]
        add    v2, v2, a3

        smulbb lr, ip, a4
        smultb a3, ip, a4
        add    v1, v1, lr
        sub    v7, v7, lr
        add    v3, v3, a3
        sub    v5, v5, a3
        smulbt lr, ip, a4
        smultt a3, ip, a4
        add    v2, v2, lr
        sub    fp, fp, lr
        add    v4, v4, a3
        ldr    a4, [a1, #(16*6)]
        sub    v6, v6, a3

        smultb lr, ip, a4
        smulbb a3, ip, a4
        add    v1, v1, lr
        sub    v7, v7, lr
        sub    v3, v3, a3
        add    v5, v5, a3
        smultt lr, ip, a4
        smulbt a3, ip, a4
        add    v2, v2, lr
        sub    fp, fp, lr
        sub    v4, v4, a3
        add    v6, v6, a3

        /* stack the even-part sums; the odd part reuses these registers */
        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp}

        ldr    ip, =W13
        ldr    a4, [a1, #(16*1)]
        ldr    lr, =W57
        smulbb v1, ip, a4
        smultb v3, ip, a4
        smulbb v5, lr, a4
        smultb v7, lr, a4
        smulbt v2, ip, a4
        smultt v4, ip, a4
        smulbt v6, lr, a4
        smultt fp, lr, a4
        rsb    v4, v4, #0
        ldr    a4, [a1, #(16*3)]
        rsb    v3, v3, #0

        smlatb v1, ip, a4, v1
        smlatb v3, lr, a4, v3
        smulbb a3, ip, a4
        smulbb a2, lr, a4
        sub    v5, v5, a3
        sub    v7, v7, a2
        smlatt v2, ip, a4, v2
        smlatt v4, lr, a4, v4
        smulbt a3, ip, a4
        smulbt a2, lr, a4
        sub    v6, v6, a3
        ldr    a4, [a1, #(16*5)]
        sub    fp, fp, a2

        smlabb v1, lr, a4, v1
        smlabb v3, ip, a4, v3
        smlatb v5, lr, a4, v5
        smlatb v7, ip, a4, v7
        smlabt v2, lr, a4, v2
        smlabt v4, ip, a4, v4
        smlatt v6, lr, a4, v6
        ldr    a3, [a1, #(16*7)]
        smlatt fp, ip, a4, fp

        smlatb v1, lr, a3, v1
        smlabb v3, lr, a3, v3
        smlatb v5, ip, a3, v5
        smulbb a4, ip, a3
        smlatt v2, lr, a3, v2
        sub    v7, v7, a4
        smlabt v4, lr, a3, v4
        smulbt a4, ip, a3
        smlatt v6, ip, a3, v6
        sub    fp, fp, a4
        .endm

function idct_col_armv5te
        str    lr, [sp, #-4]!

        idct_col

        ldmfd  sp!, {a3, a4}
        adds   a2, a3, v1
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    ip, a4, v2
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1]
        subs   a3, a3, v1
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    a4, a4, v2
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        ldmfd  sp!, {a3, a4}
        str    a2, [a1, #(16*7)]

        subs   a2, a3, v3
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    ip, a4, v4
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1, #(16*1)]
        adds   a3, a3, v3
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    a4, a4, v4
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        ldmfd  sp!, {a3, a4}
        str    a2, [a1, #(16*6)]

        adds   a2, a3, v5
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    ip, a4, v6
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1, #(16*2)]
        subs   a3, a3, v5
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    a4, a4, v6
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        ldmfd  sp!, {a3, a4}
        str    a2, [a1, #(16*5)]

        adds   a2, a3, v7
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    ip, a4, fp
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1, #(16*3)]
        subs   a3, a3, v7
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    a4, a4, fp
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        str    a2, [a1, #(16*4)]

        ldr    pc, [sp], #4
endfunc
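
/*
 * Note that one idct_col invocation transforms TWO adjacent columns at once:
 * each "ldr a4, [a1, #(16*i)]" fetches a pair of neighbouring 16-bit
 * coefficients, the bottom halfword feeding the v1/v3/v5/v7 sums and the top
 * halfword the v2/v4/v6/fp sums, and the results are stored as packed pairs.
 * This is why the entry points below advance the pointer by 4 bytes and call
 * a column routine only four times.
 */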

.macro  clip   dst, src:vararg
        movs   \dst, \src
        it     mi
        movmi  \dst, #0
        cmp    \dst, #255
        it     gt
        movgt  \dst, #255
.endm

.macro  aclip  dst, src:vararg
        adds   \dst, \src
        it     mi
        movmi  \dst, #0
        cmp    \dst, #255
        it     gt
        movgt  \dst, #255
.endm
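
/*
 * C equivalents of the two macros above, as an illustrative sketch
 * (FFmpeg's av_clip_uint8() performs the same clamp):
 *
 *     static inline int clip(int src)          // movs/movmi/cmp/movgt
 *     {
 *         if (src < 0)   return 0;
 *         if (src > 255) return 255;
 *         return src;
 *     }
 *
 *     static inline int aclip(int a, int b)    // adds, then the same clamp
 *     {
 *         return clip(a + b);
 *     }
 */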

function idct_col_put_armv5te
        str    lr, [sp, #-4]!

        idct_col

        ldmfd  sp!, {a3, a4}
        ldr    lr, [sp, #32]
        add    a2, a3, v1
        clip   a2, a2, asr #20
        add    ip, a4, v2
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        sub    a3, a3, v1
        clip   a3, a3, asr #20
        sub    a4, a4, v2
        clip   a4, a4, asr #20
        ldr    v1, [sp, #28]
        strh   a2, [v1]
        add    a2, v1, #2
        str    a2, [sp, #28]
        orr    a2, a3, a4, lsl #8
        rsb    v2, lr, lr, lsl #3
        ldmfd  sp!, {a3, a4}
        strh_pre a2, v2, v1

        sub    a2, a3, v3
        clip   a2, a2, asr #20
        sub    ip, a4, v4
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        strh_pre a2, v1, lr
        add    a3, a3, v3
        clip   a2, a3, asr #20
        add    a4, a4, v4
        clip   a4, a4, asr #20
        orr    a2, a2, a4, lsl #8
        ldmfd  sp!, {a3, a4}
        strh_dpre a2, v2, lr

        add    a2, a3, v5
        clip   a2, a2, asr #20
        add    ip, a4, v6
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        strh_pre a2, v1, lr
        sub    a3, a3, v5
        clip   a2, a3, asr #20
        sub    a4, a4, v6
        clip   a4, a4, asr #20
        orr    a2, a2, a4, lsl #8
        ldmfd  sp!, {a3, a4}
        strh_dpre a2, v2, lr

        add    a2, a3, v7
        clip   a2, a2, asr #20
        add    ip, a4, fp
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        strh   a2, [v1, lr]
        sub    a3, a3, v7
        clip   a2, a3, asr #20
        sub    a4, a4, fp
        clip   a4, a4, asr #20
        orr    a2, a2, a4, lsl #8
        strh_dpre a2, v2, lr

        ldr    pc, [sp], #4
endfunc
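
/*
 * Per column pair, idct_col_put amounts to this hedged C sketch (the names
 * are illustrative: dest and line_size stand for the pointer and stride the
 * wrappers save on the stack, sum_lo/sum_hi for the packed column results):
 *
 *     static void put_col_pair(uint8_t *dest, int line_size,
 *                              const int *sum_lo, const int *sum_hi)
 *     {
 *         for (int i = 0; i < 8; i++) {
 *             dest[i * line_size + 0] = clip(sum_lo[i] >> COL_SHIFT);
 *             dest[i * line_size + 1] = clip(sum_hi[i] >> COL_SHIFT);
 *         }
 *     }
 *
 * The asm visits the rows in the order 0, 7, 1, 6, 2, 5, 3, 4 so each popped
 * a3/a4 pair serves both its "+b" and "-b" outputs, and it bumps the saved
 * dest by 2 bytes so the next call lands on the next column pair.
 */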

function idct_col_add_armv5te
        str    lr, [sp, #-4]!

        idct_col

        ldr    lr, [sp, #36]

        ldmfd  sp!, {a3, a4}
        ldrh   ip, [lr]
        add    a2, a3, v1
        sub    a3, a3, v1
        and    v1, ip, #255
        aclip  a2, v1, a2, asr #20
        add    v1, a4, v2
        mov    v1, v1, asr #20
        aclip  v1, v1, ip, lsr #8
        orr    a2, a2, v1, lsl #8
        ldr    v1, [sp, #32]
        sub    a4, a4, v2
        rsb    v2, v1, v1, lsl #3
        ldrh_pre ip, v2, lr
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        add    a2, lr, #2
        str    a2, [sp, #28]
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldmfd  sp!, {a3, a4}
        ldrh_pre ip, lr, v1
        sub    a2, a3, v3
        add    a3, a3, v3
        and    v3, ip, #255
        aclip  a2, v3, a2, asr #20
        sub    v3, a4, v4
        mov    v3, v3, asr #20
        aclip  v3, v3, ip, lsr #8
        orr    a2, a2, v3, lsl #8
        add    a4, a4, v4
        ldrh_dpre ip, v2, v1
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldmfd  sp!, {a3, a4}
        ldrh_pre ip, lr, v1
        add    a2, a3, v5
        sub    a3, a3, v5
        and    v3, ip, #255
        aclip  a2, v3, a2, asr #20
        add    v3, a4, v6
        mov    v3, v3, asr #20
        aclip  v3, v3, ip, lsr #8
        orr    a2, a2, v3, lsl #8
        sub    a4, a4, v6
        ldrh_dpre ip, v2, v1
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldmfd  sp!, {a3, a4}
        ldrh_pre ip, lr, v1
        add    a2, a3, v7
        sub    a3, a3, v7
        and    v3, ip, #255
        aclip  a2, v3, a2, asr #20
        add    v3, a4, fp
        mov    v3, v3, asr #20
        aclip  v3, v3, ip, lsr #8
        orr    a2, a2, v3, lsl #8
        sub    a4, a4, fp
        ldrh_dpre ip, v2, v1
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldr    pc, [sp], #4
endfunc
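
/*
 * idct_col_add follows the same store pattern, but each destination halfword
 * is loaded first and its two pixels are folded into the sums before
 * clamping; per pixel it is roughly:
 *
 *     dest[x] = clip(dest[x] + (sum >> COL_SHIFT));
 *
 * which is what the aclip macro implements, with the loaded u16 split into
 * its low byte (the #255 mask) and high byte (lsr #8).
 */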

function ff_simple_idct_armv5te, export=1
        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}

        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te

        sub    a1, a1, #(16*7)

        bl     idct_col_armv5te
        add    a1, a1, #4
        bl     idct_col_armv5te
        add    a1, a1, #4
        bl     idct_col_armv5te
        add    a1, a1, #4
        bl     idct_col_armv5te

        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc
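
/*
 * Call shape of the three entry points, as a hedged C sketch: the plain IDCT
 * takes one int16_t block[64] in row-major order, while the put/add variants
 * take (dest, line_size, block) and route the column stores through the
 * clipping writers above. The helper names here are hypothetical stand-ins
 * for the internal row/column routines, not real symbols:
 *
 *     void idct_row(int16_t *row);
 *     void idct_col(int16_t *col);
 *
 *     void simple_idct_sketch(int16_t *block)
 *     {
 *         int i;
 *         for (i = 0; i < 8; i++)        // rows are 16 bytes apart
 *             idct_row(block + 8 * i);
 *         for (i = 0; i < 8; i += 2)     // four calls, two columns each
 *             idct_col(block + i);
 *     }
 */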

function ff_simple_idct_add_armv5te, export=1
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}

        mov    a1, a3

        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te

        sub    a1, a1, #(16*7)

        bl     idct_col_add_armv5te
        add    a1, a1, #4
        bl     idct_col_add_armv5te
        add    a1, a1, #4
        bl     idct_col_add_armv5te
        add    a1, a1, #4
        bl     idct_col_add_armv5te

        add    sp, sp, #8
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc

function ff_simple_idct_put_armv5te, export=1
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}

        mov    a1, a3

        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te

        sub    a1, a1, #(16*7)

        bl     idct_col_put_armv5te
        add    a1, a1, #4
        bl     idct_col_put_armv5te
        add    a1, a1, #4
        bl     idct_col_put_armv5te
        add    a1, a1, #4
        bl     idct_col_put_armv5te

        add    sp, sp, #8
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc
621