/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

#define W1  22725   /* cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W2  21407   /* cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W3  19266   /* cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W4  16383   /* cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5, minus 1, as in the C simple_idct */
#define W5  12873   /* cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W6  8867    /* cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W7  4520    /* cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define ROW_SHIFT 11
#define COL_SHIFT 20

#define W13 (W1 | (W3 << 16))
#define W26 (W2 | (W6 << 16))
#define W57 (W5 | (W7 << 16))
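
/*
 * Each packed constant pairs two coefficients so that one 32-bit load can
 * feed the ARMv5TE dual-16-bit multiplies.  For example, with ip holding
 * W13 and a3 holding row[1:0], "smulbt v5, ip, a3" computes W1 * row[1]
 * (bottom halfword of ip times top halfword of a3).
 */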

        .text
        .align
w13:    .long W13
w26:    .long W26
w57:    .long W57

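/*
 * idct_row_armv5te: one in-place 1-D IDCT pass over the eight coefficients
 * of the row at a1.  A rough C sketch, after the generic simple_idct
 * (a0..b3 are C locals here, not the ARM registers;
 * rnd = 1 << (ROW_SHIFT - 1)):
 *
 *     a0 = W4*row[0] + W2*row[2] + W4*row[4] + W6*row[6] + rnd;
 *     a1 = W4*row[0] + W6*row[2] - W4*row[4] - W2*row[6] + rnd;
 *     a2 = W4*row[0] - W6*row[2] - W4*row[4] + W2*row[6] + rnd;
 *     a3 = W4*row[0] - W2*row[2] + W4*row[4] - W6*row[6] + rnd;
 *     b0 = W1*row[1] + W3*row[3] + W5*row[5] + W7*row[7];
 *     b1 = W3*row[1] - W7*row[3] - W1*row[5] - W5*row[7];
 *     b2 = W5*row[1] - W1*row[3] + W7*row[5] + W3*row[7];
 *     b3 = W7*row[1] - W5*row[3] + W3*row[5] - W1*row[7];
 *     row[0] = (a0 + b0) >> ROW_SHIFT;   row[7] = (a0 - b0) >> ROW_SHIFT;
 *     row[1] = (a1 + b1) >> ROW_SHIFT;   row[6] = (a1 - b1) >> ROW_SHIFT;
 *     row[2] = (a2 + b2) >> ROW_SHIFT;   row[5] = (a2 - b2) >> ROW_SHIFT;
 *     row[3] = (a3 + b3) >> ROW_SHIFT;   row[4] = (a3 - b3) >> ROW_SHIFT;
 */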
function idct_row_armv5te
        str    lr, [sp, #-4]!

        ldrd   v1, [a1, #8]
        ldrd   a3, [a1]              /* a3 = row[1:0], a4 = row[3:2] */
        orrs   v1, v1, v2
        itt    eq
        cmpeq  v1, a4
        cmpeq  v1, a3, lsr #16
        beq    row_dc_only

        mov    v1, #(1<<(ROW_SHIFT-1))
        mov    ip, #16384
        sub    ip, ip, #1            /* ip = W4 */
        smlabb v1, ip, a3, v1        /* v1 = W4*row[0]+(1<<(RS-1)) */
        ldr    ip, w26               /* ip = W2 | (W6 << 16) */
        smultb a2, ip, a4
        smulbb lr, ip, a4
        add    v2, v1, a2
        sub    v3, v1, a2
        sub    v4, v1, lr
        add    v1, v1, lr

        ldr    ip, w13               /* ip = W1 | (W3 << 16) */
        ldr    lr, w57               /* lr = W5 | (W7 << 16) */
        smulbt v5, ip, a3            /* v5 = W1*row[1] (b0) */
        smultt v6, lr, a4
        smlatt v5, ip, a4, v5
        smultt a2, ip, a3
        smulbt v7, lr, a3
        sub    v6, v6, a2            /* v6 = W7*row[3]-W3*row[1] = -b1 */
        smulbt a2, ip, a4
        smultt fp, lr, a3
        sub    v7, v7, a2            /* v7 = W5*row[1]-W1*row[3] (b2) */
        smulbt a2, lr, a4
        ldrd   a3, [a1, #8]          /* a3=row[5:4] a4=row[7:6] */
        sub    fp, fp, a2            /* fp = W7*row[1]-W5*row[3] (b3) */

        orrs   a2, a3, a4            /* rows 4-7 all zero? */
        beq    1f

        smlabt v5, lr, a3, v5
        smlabt v6, ip, a3, v6
        smlatt v5, lr, a4, v5
        smlabt v6, lr, a4, v6
        smlatt v7, lr, a3, v7
        smlatt fp, ip, a3, fp
        smulbt a2, ip, a4
        smlatt v7, ip, a4, v7
        sub    fp, fp, a2

        ldr    ip, w26               /* ip = W2 | (W6 << 16) */
        mov    a2, #16384
        sub    a2, a2, #1            /* a2 =  W4 */
        smulbb a2, a2, a3            /* a2 =  W4*row[4] */
        smultb lr, ip, a4            /* lr =  W6*row[6] */
        add    v1, v1, a2            /* v1 += W4*row[4] */
        add    v1, v1, lr            /* v1 += W6*row[6] */
        add    v4, v4, a2            /* v4 += W4*row[4] */
        sub    v4, v4, lr            /* v4 -= W6*row[6] */
        smulbb lr, ip, a4            /* lr =  W2*row[6] */
        sub    v2, v2, a2            /* v2 -= W4*row[4] */
        sub    v2, v2, lr            /* v2 -= W2*row[6] */
        sub    v3, v3, a2            /* v3 -= W4*row[4] */
        add    v3, v3, lr            /* v3 += W2*row[6] */

1:      add    a2, v1, v5
        mov    a3, a2, lsr #11
        bic    a3, a3, #0x1f0000
        sub    a2, v2, v6
        mov    a2, a2, lsr #11
        add    a3, a3, a2, lsl #16
        add    a2, v3, v7
        mov    a4, a2, lsr #11
        bic    a4, a4, #0x1f0000
        add    a2, v4, fp
        mov    a2, a2, lsr #11
        add    a4, a4, a2, lsl #16
        strd   a3, [a1]

        sub    a2, v4, fp
        mov    a3, a2, lsr #11
        bic    a3, a3, #0x1f0000
        sub    a2, v3, v7
        mov    a2, a2, lsr #11
        add    a3, a3, a2, lsl #16
        add    a2, v2, v6
        mov    a4, a2, lsr #11
        bic    a4, a4, #0x1f0000
        sub    a2, v1, v5
        mov    a2, a2, lsr #11
        add    a4, a4, a2, lsl #16
        strd   a3, [a1, #8]

        ldr    pc, [sp], #4

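/*
 * DC-only shortcut: with row[1..7] all zero, every output of the row pass
 * collapses to the same value.  Roughly:
 *
 *     dc = row[0] << 3;
 *     row[0..7] = dc;
 *
 * The bic below clears the bits that the shift would otherwise carry into
 * the upper packed halfword.
 */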
row_dc_only:
        orr    a3, a3, a3, lsl #16
        bic    a3, a3, #0xe000
        mov    a3, a3, lsl #3
        mov    a4, a3
        strd   a3, [a1]
        strd   a3, [a1, #8]

        ldr    pc, [sp], #4
endfunc

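/*
 * idct_col: 1-D IDCT down two adjacent columns at once.  Each load at
 * [a1, #(16*i)] fetches element i of both columns in a single word, and
 * the dual-16-bit multiplies split the bottom/top halfwords per column.
 * The even-part sums (the a0..a3 of both columns) are computed first and
 * pushed on the stack; the registers are then reused for the odd part,
 * with v1/v3/v5/v7 holding b0..b3 of the first column and v2/v4/v6/fp
 * those of the second (b1 is accumulated negated, as in the row pass).
 */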
        .macro idct_col
        ldr    a4, [a1]              /* a4 = col[1:0] */
        mov    ip, #16384
        sub    ip, ip, #1            /* ip = W4 */
#if 0
        mov    v1, #(1<<(COL_SHIFT-1))
        smlabt v2, ip, a4, v1        /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */
        smlabb v1, ip, a4, v1        /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */
        ldr    a4, [a1, #(16*4)]
#else
        mov    v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
        add    v2, v1, a4, asr #16
        rsb    v2, v2, v2, lsl #14
        mov    a4, a4, lsl #16
        add    v1, v1, a4, asr #16
        ldr    a4, [a1, #(16*4)]
        rsb    v1, v1, v1, lsl #14
#endif

        smulbb lr, ip, a4
        smulbt a3, ip, a4
        sub    v3, v1, lr
        sub    v5, v1, lr
        add    v7, v1, lr
        add    v1, v1, lr
        sub    v4, v2, a3
        sub    v6, v2, a3
        add    fp, v2, a3
        ldr    ip, w26
        ldr    a4, [a1, #(16*2)]
        add    v2, v2, a3

        smulbb lr, ip, a4
        smultb a3, ip, a4
        add    v1, v1, lr
        sub    v7, v7, lr
        add    v3, v3, a3
        sub    v5, v5, a3
        smulbt lr, ip, a4
        smultt a3, ip, a4
        add    v2, v2, lr
        sub    fp, fp, lr
        add    v4, v4, a3
        ldr    a4, [a1, #(16*6)]
        sub    v6, v6, a3

        smultb lr, ip, a4
        smulbb a3, ip, a4
        add    v1, v1, lr
        sub    v7, v7, lr
        sub    v3, v3, a3
        add    v5, v5, a3
        smultt lr, ip, a4
        smulbt a3, ip, a4
        add    v2, v2, lr
        sub    fp, fp, lr
        sub    v4, v4, a3
        add    v6, v6, a3

        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp}

        ldr    ip, w13
        ldr    a4, [a1, #(16*1)]
        ldr    lr, w57
        smulbb v1, ip, a4
        smultb v3, ip, a4
        smulbb v5, lr, a4
        smultb v7, lr, a4
        smulbt v2, ip, a4
        smultt v4, ip, a4
        smulbt v6, lr, a4
        smultt fp, lr, a4
        rsb    v4, v4, #0
        ldr    a4, [a1, #(16*3)]
        rsb    v3, v3, #0

        smlatb v1, ip, a4, v1
        smlatb v3, lr, a4, v3
        smulbb a3, ip, a4
        smulbb a2, lr, a4
        sub    v5, v5, a3
        sub    v7, v7, a2
        smlatt v2, ip, a4, v2
        smlatt v4, lr, a4, v4
        smulbt a3, ip, a4
        smulbt a2, lr, a4
        sub    v6, v6, a3
        ldr    a4, [a1, #(16*5)]
        sub    fp, fp, a2

        smlabb v1, lr, a4, v1
        smlabb v3, ip, a4, v3
        smlatb v5, lr, a4, v5
        smlatb v7, ip, a4, v7
        smlabt v2, lr, a4, v2
        smlabt v4, ip, a4, v4
        smlatt v6, lr, a4, v6
        ldr    a3, [a1, #(16*7)]
        smlatt fp, ip, a4, fp

        smlatb v1, lr, a3, v1
        smlabb v3, lr, a3, v3
        smlatb v5, ip, a3, v5
        smulbb a4, ip, a3
        smlatt v2, lr, a3, v2
        sub    v7, v7, a4
        smlabt v4, lr, a3, v4
        smulbt a4, ip, a3
        smlatt v6, ip, a3, v6
        sub    fp, fp, a4
        .endm

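/*
 * idct_col_armv5te: column pass writing back into the coefficient block.
 * Outputs are descaled and packed two at a time, roughly (per column)
 *
 *     col[8*0] = (a0 + b0) >> COL_SHIFT;
 *     col[8*7] = (a0 - b0) >> COL_SHIFT;
 *
 * and so on for the other row pairs.  The lsr #20 / orrmi #0xf000 pairs
 * rebuild the sign bits of a negative 16-bit result that the logical
 * shift discarded.
 */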
function idct_col_armv5te
        str    lr, [sp, #-4]!

        idct_col

        ldmfd  sp!, {a3, a4}
        adds   a2, a3, v1
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    ip, a4, v2
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1]
        subs   a3, a3, v1
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    a4, a4, v2
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        ldmfd  sp!, {a3, a4}
        str    a2, [a1, #(16*7)]

        subs   a2, a3, v3
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    ip, a4, v4
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1, #(16*1)]
        adds   a3, a3, v3
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    a4, a4, v4
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        ldmfd  sp!, {a3, a4}
        str    a2, [a1, #(16*6)]

        adds   a2, a3, v5
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    ip, a4, v6
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1, #(16*2)]
        subs   a3, a3, v5
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    a4, a4, v6
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        ldmfd  sp!, {a3, a4}
        str    a2, [a1, #(16*5)]

        adds   a2, a3, v7
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    ip, a4, fp
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1, #(16*3)]
        subs   a3, a3, v7
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    a4, a4, fp
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        str    a2, [a1, #(16*4)]

        ldr    pc, [sp], #4
endfunc

/* clamp to [0, 255]: \dst = clip_uint8(\src), where \src may carry a shift */
.macro  clip   dst, src:vararg
        movs   \dst, \src
        it     mi
        movmi  \dst, #0
        cmp    \dst, #255
        it     gt
        movgt  \dst, #255
.endm

/* add and clamp: "aclip dst, a, b" gives dst = clip_uint8(a + b) */
.macro  aclip  dst, src:vararg
        adds   \dst, \src
        it     mi
        movmi  \dst, #0
        cmp    \dst, #255
        it     gt
        movgt  \dst, #255
.endm

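/*
 * idct_col_put_armv5te: column pass that converts the results to 8-bit
 * pixels and stores them to the destination.  After the first two spill
 * words are popped, [sp, #28] and [sp, #32] hold the dest pointer and
 * line size that ff_simple_idct_put_armv5te pushed on entry; dest is
 * advanced by 2 for the next column pair, and v2 = dest + 7*line_size so
 * the mirrored outputs (rows 7, 6, 5, 4) can be stored walking upwards
 * with strh_dpre.
 */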
function idct_col_put_armv5te
        str    lr, [sp, #-4]!

        idct_col

        ldmfd  sp!, {a3, a4}
        ldr    lr, [sp, #32]
        add    a2, a3, v1
        clip   a2, a2, asr #20
        add    ip, a4, v2
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        sub    a3, a3, v1
        clip   a3, a3, asr #20
        sub    a4, a4, v2
        clip   a4, a4, asr #20
        ldr    v1, [sp, #28]
        strh   a2, [v1]
        add    a2, v1, #2
        str    a2, [sp, #28]
        orr    a2, a3, a4, lsl #8
        rsb    v2, lr, lr, lsl #3
        ldmfd  sp!, {a3, a4}
        strh_pre a2, v2, v1

        sub    a2, a3, v3
        clip   a2, a2, asr #20
        sub    ip, a4, v4
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        strh_pre a2, v1, lr
        add    a3, a3, v3
        clip   a2, a3, asr #20
        add    a4, a4, v4
        clip   a4, a4, asr #20
        orr    a2, a2, a4, lsl #8
        ldmfd  sp!, {a3, a4}
        strh_dpre a2, v2, lr

        add    a2, a3, v5
        clip   a2, a2, asr #20
        add    ip, a4, v6
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        strh_pre a2, v1, lr
        sub    a3, a3, v5
        clip   a2, a3, asr #20
        sub    a4, a4, v6
        clip   a4, a4, asr #20
        orr    a2, a2, a4, lsl #8
        ldmfd  sp!, {a3, a4}
        strh_dpre a2, v2, lr

        add    a2, a3, v7
        clip   a2, a2, asr #20
        add    ip, a4, fp
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        strh   a2, [v1, lr]
        sub    a3, a3, v7
        clip   a2, a3, asr #20
        sub    a4, a4, fp
        clip   a4, a4, asr #20
        orr    a2, a2, a4, lsl #8
        strh_dpre a2, v2, lr

        ldr    pc, [sp], #4
endfunc

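/*
 * idct_col_add_armv5te: like the put variant, but each destination pixel
 * pair is loaded first, split into its two bytes, and the IDCT output is
 * added with saturation (aclip) before the packed pair is stored back.
 */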
function idct_col_add_armv5te
        str    lr, [sp, #-4]!

        idct_col

        ldr    lr, [sp, #36]

        ldmfd  sp!, {a3, a4}
        ldrh   ip, [lr]
        add    a2, a3, v1
        sub    a3, a3, v1
        and    v1, ip, #255
        aclip  a2, v1, a2, asr #20
        add    v1, a4, v2
        mov    v1, v1, asr #20
        aclip  v1, v1, ip, lsr #8
        orr    a2, a2, v1, lsl #8
        ldr    v1, [sp, #32]
        sub    a4, a4, v2
        rsb    v2, v1, v1, lsl #3
        ldrh_pre ip, v2, lr
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        add    a2, lr, #2
        str    a2, [sp, #28]
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldmfd  sp!, {a3, a4}
        ldrh_pre ip, lr, v1
        sub    a2, a3, v3
        add    a3, a3, v3
        and    v3, ip, #255
        aclip  a2, v3, a2, asr #20
        sub    v3, a4, v4
        mov    v3, v3, asr #20
        aclip  v3, v3, ip, lsr #8
        orr    a2, a2, v3, lsl #8
        add    a4, a4, v4
        ldrh_dpre ip, v2, v1
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldmfd  sp!, {a3, a4}
        ldrh_pre ip, lr, v1
        add    a2, a3, v5
        sub    a3, a3, v5
        and    v3, ip, #255
        aclip  a2, v3, a2, asr #20
        add    v3, a4, v6
        mov    v3, v3, asr #20
        aclip  v3, v3, ip, lsr #8
        orr    a2, a2, v3, lsl #8
        sub    a4, a4, v6
        ldrh_dpre ip, v2, v1
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldmfd  sp!, {a3, a4}
        ldrh_pre ip, lr, v1
        add    a2, a3, v7
        sub    a3, a3, v7
        and    v3, ip, #255
        aclip  a2, v3, a2, asr #20
        add    v3, a4, fp
        mov    v3, v3, asr #20
        aclip  v3, v3, ip, lsr #8
        orr    a2, a2, v3, lsl #8
        sub    a4, a4, fp
        ldrh_dpre ip, v2, v1
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldr    pc, [sp], #4
endfunc

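/*
 * Exported entry points: eight row passes, then four column passes (each
 * column call covers two columns).  A hedged view of the C-level
 * signatures these are meant to match (as in the simple_idct wrappers):
 *
 *     void ff_simple_idct_armv5te(int16_t *data);
 *     void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, int16_t *data);
 *     void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, int16_t *data);
 */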
function ff_simple_idct_armv5te, export=1
        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}

        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te

        sub    a1, a1, #(16*7)

        bl     idct_col_armv5te
        add    a1, a1, #4
        bl     idct_col_armv5te
        add    a1, a1, #4
        bl     idct_col_armv5te
        add    a1, a1, #4
        bl     idct_col_armv5te

        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc

function ff_simple_idct_add_armv5te, export=1
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}

        mov    a1, a3

        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te

        sub    a1, a1, #(16*7)

        bl     idct_col_add_armv5te
        add    a1, a1, #4
        bl     idct_col_add_armv5te
        add    a1, a1, #4
        bl     idct_col_add_armv5te
        add    a1, a1, #4
        bl     idct_col_add_armv5te

        add    sp, sp, #8
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc

function ff_simple_idct_put_armv5te, export=1
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}

        mov    a1, a3

        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te

        sub    a1, a1, #(16*7)

        bl     idct_col_put_armv5te
        add    a1, a1, #4
        bl     idct_col_put_armv5te
        add    a1, a1, #4
        bl     idct_col_put_armv5te
        add    a1, a1, #4
        bl     idct_col_put_armv5te

        add    sp, sp, #8
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc