/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

/*
 * Fixed-point IDCT coefficients:
 *   Wi = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer.
 */
#define W1  22725   /* i = 1 */
#define W2  21407   /* i = 2 */
#define W3  19266   /* i = 3 */
#define W4  16383   /* i = 4: the formula gives 16384, one less is used here.
                       NOTE(review): presumably deliberate - confirm against
                       the C reference implementation */
#define W5  12873   /* i = 5 */
#define W6  8867    /* i = 6 */
#define W7  4520    /* i = 7 */
#define ROW_SHIFT 11    /* right shift applied to the row pass results */
#define COL_SHIFT 20    /* right shift applied to the column pass results */

/*
 * Coefficient pairs packed as two halfwords, low | (high << 16), in the
 * layout consumed by the dual 16-bit multiply instructions used below.
 * W42n holds the negated pair; its low half is masked so that the sign
 * extension of -W4 does not spill into the high half.
 */
#define W13  (W1 | (W3 << 16))
#define W26  (W2 | (W6 << 16))
#define W42  (W4 | (W2 << 16))
#define W42n (((-W4) & 0xffff) | ((-W2) << 16))
#define W46  (W4 | (W6 << 16))
#define W57  (W5 | (W7 << 16))

        .text
        .align
/*
 * Literal words for the packed pairs.  They live in .text so the code can
 * fetch them with label-relative "ldr rX, wNN".  w26 is not referenced by
 * the code in this file.
 */
w13:    .long W13
w26:    .long W26
w42:    .long W42
w42n:   .long W42n
w46:    .long W46
w57:    .long W57

/*
  Compute partial IDCT of single row: the even part A0..A3 and the odd
  part B0..B3, before the final butterfly.

  Register pairs are written [high halfword, low halfword].

  shift = right-shift amount the caller applies afterwards; it is used here
          only to seed A0..A3 with the rounding bias 1 << (shift - 1)
  r0 = source address; row[6,4] is read from [r0, #4] and
       row[7,5] from [r0, #12]
  r2 = row[2,0] <= 2 cycles
  r3 = row[3,1]
  ip = w42      <= 2 cycles

  Output in registers r4--r11:
    r4 = A0   r5 = A1   r6  = A2   r7  = A3
    r8 = B0   r9 = -B1  r10 = B2   r11 = B3
  Clobbers r1, r2, r3, ip, lr.  r0 is not modified.
*/
        .macro idct_row shift
        ldr    lr, w46               /* lr  = W4 | (W6 << 16) */
        mov    r1, #(1<<(\shift-1))  /* r1  = rounding bias */
        smlad  r4, r2, ip, r1        /* A0  = bias + W4*row[0] + W2*row[2] */
        smlsd  r7, r2, ip, r1        /* A3  = bias + W4*row[0] - W2*row[2] */
        ldr    ip, w13               /* ip  = W1 | (W3 << 16) */
        ldr    r10,w57               /* r10 = W5 | (W7 << 16) */
        smlad  r5, r2, lr, r1        /* A1  = bias + W4*row[0] + W6*row[2] */
        smlsd  r6, r2, lr, r1        /* A2  = bias + W4*row[0] - W6*row[2] */

        smuad  r8, r3, ip            /* r8  =  B0 = W1*row[1] + W3*row[3] */
        smusdx r11,r3, r10           /* r11 =  B3 = W7*row[1] - W5*row[3] */
        ldr    lr, [r0, #12]         /* lr  =  row[7,5] */
        pkhtb  r2, ip, r10,asr #16   /* r2  =  W7 | (W3 << 16) */
        pkhbt  r1, ip, r10,lsl #16   /* r1  =  W1 | (W5 << 16) */
        smusdx r9, r2, r3            /* r9  = -B1 = W7*row[3] - W3*row[1] */
        smlad  r8, lr, r10,r8        /* B0  +=      W5*row[5] + W7*row[7] */
        smusdx r10,r3, r1            /* r10 =  B2 = W5*row[1] - W1*row[3] */

        ldr    r3, w42n              /* r3 =  -W4 | (-W2 << 16) */
        smlad  r10,lr, r2, r10       /* B2 +=  W7*row[5] + W3*row[7] */
        ldr    r2, [r0, #4]          /* r2 =   row[6,4] */
        smlsdx r11,lr, ip, r11       /* B3 +=  W3*row[5] - W1*row[7] */
        ldr    ip, w46               /* ip =   W4 | (W6 << 16) */
        smlad  r9, lr, r1, r9        /* B1 -=  W1*row[5] + W5*row[7] */

        smlad  r5, r2, r3, r5        /* A1 += -W4*row[4] - W2*row[6] */
        smlsd  r6, r2, r3, r6        /* A2 += -W4*row[4] + W2*row[6] */
        smlad  r4, r2, ip, r4        /* A0 +=  W4*row[4] + W6*row[6] */
        smlsd  r7, r2, ip, r7        /* A3 +=  W4*row[4] - W6*row[6] */
        .endm

/*
  Compute partial IDCT of half row: same as idct_row but with every term
  involving row[4]..row[7] left out.  No memory is read through r0.

  Register pairs are written [high halfword, low halfword].

  shift = right-shift amount the caller applies afterwards; it is used here
          only to seed A0..A3 with the rounding bias 1 << (shift - 1)
  r2 = row[2,0]
  r3 = row[3,1]
  ip = w42

  Output in registers r4--r11:
    r4 = A0   r5 = A1   r6  = A2   r7  = A3
    r8 = B0   r9 = -B1  r10 = B2   r11 = B3
  Clobbers r1, r2, ip, lr.
*/
        .macro idct_row4 shift
        ldr    lr, w46               /* lr =  W4 | (W6 << 16) */
        ldr    r10,w57               /* r10 = W5 | (W7 << 16) */
        mov    r1, #(1<<(\shift-1))  /* r1  = rounding bias */
        smlad  r4, r2, ip, r1        /* A0  = bias + W4*row[0] + W2*row[2] */
        smlsd  r7, r2, ip, r1        /* A3  = bias + W4*row[0] - W2*row[2] */
        ldr    ip, w13               /* ip =  W1 | (W3 << 16) */
        smlad  r5, r2, lr, r1        /* A1  = bias + W4*row[0] + W6*row[2] */
        smlsd  r6, r2, lr, r1        /* A2  = bias + W4*row[0] - W6*row[2] */
        smusdx r11,r3, r10           /* r11 =  B3 = W7*row[1] - W5*row[3] */
        smuad  r8, r3, ip            /* r8  =  B0 = W1*row[1] + W3*row[3] */
        pkhtb  r2, ip, r10,asr #16   /* r2  =  W7 | (W3 << 16) */
        pkhbt  r1, ip, r10,lsl #16   /* r1  =  W1 | (W5 << 16) */
        smusdx r9, r2, r3            /* r9  = -B1 = W7*row[3] - W3*row[1] */
        smusdx r10,r3, r1            /* r10 =  B2 = W5*row[1] - W1*row[3] */
        .endm

/*
  Compute final part of IDCT single row without shift.
  Input in registers r4--r11 as produced by idct_row / idct_row4
  (r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3).
  Output in registers ip, r4--r6, lr, r8--r10:
    ip = A0 + B0   r4 = A1 + B1   r5 = A2 + B2   r6  = A3 + B3
    lr = A0 - B0   r8 = A1 - B1   r9 = A2 - B2   r10 = A3 - B3
  r7 and r11 are left unchanged.  The statement order matters: each
  register is overwritten only after its last use as an input.
*/
        .macro idct_finish
        add    ip, r4, r8            /* ip  = A0 + B0 */
        sub    lr, r4, r8            /* lr  = A0 - B0 */
        sub    r4, r5, r9            /* r4  = A1 + B1  (r9 holds -B1) */
        add    r8, r5, r9            /* r8  = A1 - B1  (r9 holds -B1) */
        add    r5, r6, r10           /* r5  = A2 + B2 */
        sub    r9, r6, r10           /* r9  = A2 - B2 */
        add    r6, r7, r11           /* r6  = A3 + B3 */
        sub    r10,r7, r11           /* r10 = A3 - B3 */
        .endm

/*
  Compute final part of IDCT single row.
  shift = right-shift amount (arithmetic) applied to every result
  Input in registers r4--r11 as produced by idct_row / idct_row4
  (r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3).
  Output in registers r4--r11:
    r4 = (A0 + B0) >> shift      r8  = (A0 - B0) >> shift
    r5 = (A1 + B1) >> shift      r9  = (A1 - B1) >> shift
    r6 = (A2 + B2) >> shift      r10 = (A2 - B2) >> shift
    r7 = (A3 + B3) >> shift      r11 = (A3 - B3) >> shift
  Clobbers r2, r3.
*/
        .macro idct_finish_shift shift
        add    r3, r4, r8            /* r3 = A0 + B0 */
        sub    r2, r4, r8            /* r2 = A0 - B0 */
        mov    r4, r3, asr #\shift
        mov    r8, r2, asr #\shift

        sub    r3, r5, r9            /* r3 = A1 + B1  (r9 holds -B1) */
        add    r2, r5, r9            /* r2 = A1 - B1  (r9 holds -B1) */
        mov    r5, r3, asr #\shift
        mov    r9, r2, asr #\shift

        add    r3, r6, r10           /* r3 = A2 + B2 */
        sub    r2, r6, r10           /* r2 = A2 - B2 */
        mov    r6, r3, asr #\shift
        mov    r10,r2, asr #\shift

        add    r3, r7, r11           /* r3 = A3 + B3 */
        sub    r2, r7, r11           /* r2 = A3 - B3 */
        mov    r7, r3, asr #\shift
        mov    r11,r2, asr #\shift
        .endm

/*
  Compute final part of IDCT single row, saturating results at 8 bits.
  shift = right-shift amount (arithmetic) applied before saturation
  Input in registers r4--r11 as produced by idct_row / idct_row4
  (r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3).
  Output in registers r4--r11, each saturated to the unsigned range 0..255:
    r4 = (A0 + B0) >> shift      r8  = (A0 - B0) >> shift
    r5 = (A1 + B1) >> shift      r9  = (A1 - B1) >> shift
    r6 = (A2 + B2) >> shift      r10 = (A2 - B2) >> shift
    r7 = (A3 + B3) >> shift      r11 = (A3 - B3) >> shift
  Clobbers r3, ip.  r2 is left untouched; the caller in this file keeps
  the line size there.
*/
        .macro idct_finish_shift_sat shift
        add    r3, r4, r8            /* r3 = A0 + B0 */
        sub    ip, r4, r8            /* ip = A0 - B0 */
        usat   r4, #8, r3, asr #\shift
        usat   r8, #8, ip, asr #\shift

        sub    r3, r5, r9            /* r3 = A1 + B1  (r9 holds -B1) */
        add    ip, r5, r9            /* ip = A1 - B1  (r9 holds -B1) */
        usat   r5, #8, r3, asr #\shift
        usat   r9, #8, ip, asr #\shift

        add    r3, r6, r10           /* r3 = A2 + B2 */
        sub    ip, r6, r10           /* ip = A2 - B2 */
        usat   r6, #8, r3, asr #\shift
        usat   r10,#8, ip, asr #\shift

        add    r3, r7, r11           /* r3 = A3 + B3 */
        sub    ip, r7, r11           /* ip = A3 - B3 */
        usat   r7, #8, r3, asr #\shift
        usat   r11,#8, ip, asr #\shift
        .endm

/*
  Compute IDCT of single row, storing as column.
  r0 = source (8 halfwords, read as the word pairs row[2,0], row[6,4],
       row[3,1], row[7,5] at offsets 0, 4, 8, 12)
  r1 = dest (results are stored as halfwords 16 bytes apart)

  Three paths are taken depending on which inputs are zero:
    - only row[0] may be non-zero: every output is row[0] << 3   (label 1)
    - row[4]..row[7] are all zero: idct_row4                     (label 2)
    - otherwise:                   idct_row
  r0 and r1 are preserved.  r2, r3, r4--r11 and ip are clobbered, so the
  caller has to save r4--r11.
*/
function idct_row_armv6
        push   {lr}

        ldr    lr, [r0, #12]         /* lr = row[7,5] */
        ldr    ip, [r0, #4]          /* ip = row[6,4] */
        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
        ldr    r2, [r0]              /* r2 = row[2,0] */
        orrs   lr, lr, ip            /* Z set if row[4]..row[7] are all 0 */
        cmpeq  lr, r3                /* ... and row[1], row[3] are 0 */
        cmpeq  lr, r2, lsr #16       /* ... and row[2] is 0 */
        beq    1f                    /* only the DC term is left */
        push   {r1}                  /* the row macros use r1 as scratch */
        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
        cmp    lr, #0                /* row[4]..row[7] all zero? */
        beq    2f

        idct_row   ROW_SHIFT
        b      3f

2:      idct_row4  ROW_SHIFT

3:      pop    {r1}
        idct_finish_shift ROW_SHIFT

        /* r4..r7 and r11..r8 hold outputs 0..3 and 4..7; they are stored
           to the interleaved slots 0, 2, 4, 6 and 1, 3, 5, 7. */
        strh   r4, [r1]
        strh   r5, [r1, #(16*2)]
        strh   r6, [r1, #(16*4)]
        strh   r7, [r1, #(16*6)]
        strh   r11,[r1, #(16*1)]
        strh   r10,[r1, #(16*3)]
        strh   r9, [r1, #(16*5)]
        strh   r8, [r1, #(16*7)]

        pop    {pc}

        /* DC-only shortcut: r2 = row[2,0] with row[2] == 0, so the low
           halfword of r2 << 3 is row[0] << 3; write it to all 8 slots. */
1:      mov    r2, r2, lsl #3
        strh   r2, [r1]
        strh   r2, [r1, #(16*2)]
        strh   r2, [r1, #(16*4)]
        strh   r2, [r1, #(16*6)]
        strh   r2, [r1, #(16*1)]
        strh   r2, [r1, #(16*3)]
        strh   r2, [r1, #(16*5)]
        strh   r2, [r1, #(16*7)]
        pop    {pc}
        .endfunc

/*
  Compute IDCT of single column, read as row.
  r0 = source (8 halfwords, same word layout as for idct_row)
  r1 = dest (the 8 results are stored in order as halfwords 16 bytes apart)

  The full idct_row is always used; there is no zero-input shortcut here.
  r0 and r1 are preserved.  r2, r3, r4--r11 and ip are clobbered, so the
  caller has to save r4--r11.
*/
function idct_col_armv6
        push   {r1, lr}              /* idct_row uses r1 and lr as scratch */

        ldr    r2, [r0]              /* r2 = row[2,0] */
        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
        idct_row COL_SHIFT
        pop    {r1}
        idct_finish_shift COL_SHIFT

        /* outputs 0..3 are in r4..r7, outputs 4..7 in r11..r8 */
        strh   r4, [r1]
        strh   r5, [r1, #(16*1)]
        strh   r6, [r1, #(16*2)]
        strh   r7, [r1, #(16*3)]
        strh   r11,[r1, #(16*4)]
        strh   r10,[r1, #(16*5)]
        strh   r9, [r1, #(16*6)]
        strh   r8, [r1, #(16*7)]

        pop    {pc}
        .endfunc

/*
  Compute IDCT of single column, read as row, store saturated 8-bit.
  r0 = source (8 halfwords, same word layout as for idct_row)
  r1 = dest (one byte is written on each of 8 consecutive lines)
  r2 = line size (byte distance between two lines of dest)

  r0, r1 and r2 are preserved.  r3, r4--r11 and ip are clobbered, so the
  caller has to save r4--r11.
*/
function idct_col_put_armv6
        push   {r1, r2, lr}          /* idct_row uses r1, r2, lr as scratch */

        ldr    r2, [r0]              /* r2 = row[2,0] */
        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
        idct_row COL_SHIFT
        pop    {r1, r2}
        idct_finish_shift_sat COL_SHIFT

        /* outputs 0..3 are in r4..r7, outputs 4..7 in r11..r8;
           each store steps r1 to the next line */
        strb   r4, [r1], r2
        strb   r5, [r1], r2
        strb   r6, [r1], r2
        strb   r7, [r1], r2
        strb   r11,[r1], r2
        strb   r10,[r1], r2
        strb   r9, [r1], r2
        strb   r8, [r1], r2

        sub    r1, r1, r2, lsl #3    /* undo the 8 line steps */

        pop    {pc}
        .endfunc

/*
  Compute IDCT of single column, read as row, add/store saturated 8-bit.
  r0 = source (8 halfwords, same word layout as for idct_row)
  r1 = dest (one byte is read, updated and written on each of 8 lines)
  r2 = line size (byte distance between two lines of dest)

  Each result is shifted right by COL_SHIFT, added to the existing byte
  and saturated to 0..255.  After idct_finish the results for lines 0..7
  are in ip, r4, r5, r6, r10, r9, r8, lr.

  r0, r1 and r2 are preserved.  r3, r4--r11 and ip are clobbered, so the
  caller has to save r4--r11.
*/
function idct_col_add_armv6
        push   {r1, r2, lr}          /* idct_row uses r1, r2, lr as scratch */

        ldr    r2, [r0]              /* r2 = row[2,0] */
        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
        idct_row COL_SHIFT
        pop    {r1, r2}
        idct_finish

        /* Loads of the old pixels are interleaved with the stores; the
           line each load refers to depends on how far r1 has advanced. */
        ldrb   r3, [r1]              /* r3  = pixel of line 0 */
        ldrb   r7, [r1, r2]          /* r7  = pixel of line 1 */
        add    ip, r3, ip, asr #COL_SHIFT
        usat   ip, #8, ip
        add    r4, r7, r4, asr #COL_SHIFT
        strb   ip, [r1], r2          /* line 0 done, r1 -> line 1 */
        ldrb   ip, [r1, r2]          /* ip  = pixel of line 2 */
        usat   r4, #8, r4
        ldrb   r11,[r1, r2, lsl #2]  /* r11 = pixel of line 5 */
        add    r5, ip, r5, asr #COL_SHIFT
        usat   r5, #8, r5
        strb   r4, [r1], r2          /* line 1 done, r1 -> line 2 */
        ldrb   r3, [r1, r2]          /* r3  = pixel of line 3 */
        ldrb   ip, [r1, r2, lsl #2]  /* ip  = pixel of line 6 */
        strb   r5, [r1], r2          /* line 2 done, r1 -> line 3 */
        ldrb   r7, [r1, r2]          /* r7  = pixel of line 4 */
        ldrb   r4, [r1, r2, lsl #2]  /* r4  = pixel of line 7 */
        add    r6, r3, r6, asr #COL_SHIFT
        usat   r6, #8, r6
        add    r10,r7, r10,asr #COL_SHIFT
        usat   r10,#8, r10
        add    r9, r11,r9, asr #COL_SHIFT
        usat   r9, #8, r9
        add    r8, ip, r8, asr #COL_SHIFT
        usat   r8, #8, r8
        add    lr, r4, lr, asr #COL_SHIFT
        usat   lr, #8, lr
        strb   r6, [r1], r2          /* line 3 */
        strb   r10,[r1], r2          /* line 4 */
        strb   r9, [r1], r2          /* line 5 */
        strb   r8, [r1], r2          /* line 6 */
        strb   lr, [r1], r2          /* line 7 */

        sub    r1, r1, r2, lsl #3    /* undo the 8 line steps */

        pop    {pc}
        .endfunc

/*
  Compute 8 IDCT row transforms.
  func = IDCT row->col function, called with r0 = source, r1 = dest
  width = width of columns in bytes (step added to r1 between calls)

  r0 advances by 32 bytes three times, steps back by 80 and advances by
  32 three more times, so the 16-byte source rows are visited in the
  order 0, 2, 4, 6, 1, 3, 5, 7.
  On exit r0 is back at its initial value and r1 has advanced by
  7 * width.  lr is overwritten by the bl instructions.
*/
        .macro idct_rows func width
        bl     \func                 /* source row 0 */
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func                 /* source row 2 */
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func                 /* source row 4 */
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func                 /* source row 6 */
        sub    r0, r0, #(16*5)
        add    r1, r1, #\width
        bl     \func                 /* source row 1 */
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func                 /* source row 3 */
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func                 /* source row 5 */
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func                 /* source row 7 */

        sub    r0, r0, #(16*7)       /* restore r0 */
        .endm

/*
  void ff_simple_idct_armv6(DCTELEM *data);

  In-place IDCT of the 128-byte coefficient block at r0.
  The row pass writes its results to a 128-byte buffer on the stack; the
  column pass reads that buffer and writes back to data.
*/
function ff_simple_idct_armv6, export=1
        push   {r4-r11, lr}
        sub    sp, sp, #128          /* intermediate buffer */

        mov    r1, sp                /* row pass: data -> stack buffer */
        idct_rows idct_row_armv6, 2
        mov    r1, r0                /* column pass: stack buffer -> data */
        mov    r0, sp
        idct_rows idct_col_armv6, 2

        add    sp, sp, #128
        pop    {r4-r11, pc}
        .endfunc

/*
  void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);

  IDCT of the coefficient block at data (r2); the result is added to the
  bytes at dest (r0), whose lines are line_size (r1) bytes apart, with
  saturation to 8 bits.
  The row pass writes to a 128-byte stack buffer; dest and line_size are
  kept in the two stack words just above that buffer.
*/
function ff_simple_idct_add_armv6, export=1
        push   {r0, r1, r4-r11, lr}  /* r0, r1 are reloaded further down */
        sub    sp, sp, #128          /* intermediate buffer */

        mov    r0, r2                /* row pass: data -> stack buffer */
        mov    r1, sp
        idct_rows idct_row_armv6, 2
        mov    r0, sp                /* column pass: stack buffer -> dest */
        ldr    r1, [sp, #128]        /* r1 = dest */
        ldr    r2, [sp, #(128+4)]    /* r2 = line_size */
        idct_rows idct_col_add_armv6, 1

        add    sp, sp, #(128+8)      /* drop the buffer and saved r0, r1 */
        pop    {r4-r11, pc}
        .endfunc

/*
  void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);

  IDCT of the coefficient block at data (r2); the result is saturated to
  8 bits and stored to the bytes at dest (r0), whose lines are line_size
  (r1) bytes apart.
  The row pass writes to a 128-byte stack buffer; dest and line_size are
  kept in the two stack words just above that buffer.
*/
function ff_simple_idct_put_armv6, export=1
        push   {r0, r1, r4-r11, lr}  /* r0, r1 are reloaded further down */
        sub    sp, sp, #128          /* intermediate buffer */

        mov    r0, r2                /* row pass: data -> stack buffer */
        mov    r1, sp
        idct_rows idct_row_armv6, 2
        mov    r0, sp                /* column pass: stack buffer -> dest */
        ldr    r1, [sp, #128]        /* r1 = dest */
        ldr    r2, [sp, #(128+4)]    /* r2 = line_size */
        idct_rows idct_col_put_armv6, 1

        add    sp, sp, #(128+8)      /* drop the buffer and saved r0, r1 */
        pop    {r4-r11, pc}
        .endfunc
