1/*
2 * Simple IDCT
3 *
4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23/**
24 * @file libavcodec/simple_idct.c
25 * simpleidct in C.
26 */
27
28/*
29  based upon some outcommented c code from mpeg2dec (idct_mmx.c
30  written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
31 */
32#include "avcodec.h"
33#include "dsputil.h"
34#include "mathops.h"
35#include "simple_idct.h"
36
37#if 0
38#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
39#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
40#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
41#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
42#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
43#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
44#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
45#define ROW_SHIFT 8
46#define COL_SHIFT 17
47#else
48#define W1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
49#define W2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
50#define W3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
51#define W4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
52#define W5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
53#define W6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
54#define W7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
55#define ROW_SHIFT 11
56#define COL_SHIFT 20 // 6
57#endif
58
59static inline void idctRowCondDC (DCTELEM * row)
60{
61        int a0, a1, a2, a3, b0, b1, b2, b3;
62#if HAVE_FAST_64BIT
63        uint64_t temp;
64#else
65        uint32_t temp;
66#endif
67
68#if HAVE_FAST_64BIT
69#ifdef WORDS_BIGENDIAN
70#define ROW0_MASK 0xffff000000000000LL
71#else
72#define ROW0_MASK 0xffffLL
73#endif
74        if(sizeof(DCTELEM)==2){
75            if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) |
76                  ((uint64_t *)row)[1]) == 0) {
77                temp = (row[0] << 3) & 0xffff;
78                temp += temp << 16;
79                temp += temp << 32;
80                ((uint64_t *)row)[0] = temp;
81                ((uint64_t *)row)[1] = temp;
82                return;
83            }
84        }else{
85            if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) {
86                row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3;
87                return;
88            }
89        }
90#else
91        if(sizeof(DCTELEM)==2){
92            if (!(((uint32_t*)row)[1] |
93                  ((uint32_t*)row)[2] |
94                  ((uint32_t*)row)[3] |
95                  row[1])) {
96                temp = (row[0] << 3) & 0xffff;
97                temp += temp << 16;
98                ((uint32_t*)row)[0]=((uint32_t*)row)[1] =
99                ((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp;
100                return;
101            }
102        }else{
103            if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) {
104                row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3;
105                return;
106            }
107        }
108#endif
109
110        a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
111        a1 = a0;
112        a2 = a0;
113        a3 = a0;
114
115        /* no need to optimize : gcc does it */
116        a0 += W2 * row[2];
117        a1 += W6 * row[2];
118        a2 -= W6 * row[2];
119        a3 -= W2 * row[2];
120
121        b0 = MUL16(W1, row[1]);
122        MAC16(b0, W3, row[3]);
123        b1 = MUL16(W3, row[1]);
124        MAC16(b1, -W7, row[3]);
125        b2 = MUL16(W5, row[1]);
126        MAC16(b2, -W1, row[3]);
127        b3 = MUL16(W7, row[1]);
128        MAC16(b3, -W5, row[3]);
129
130#if HAVE_FAST_64BIT
131        temp = ((uint64_t*)row)[1];
132#else
133        temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
134#endif
135        if (temp != 0) {
136            a0 += W4*row[4] + W6*row[6];
137            a1 += - W4*row[4] - W2*row[6];
138            a2 += - W4*row[4] + W2*row[6];
139            a3 += W4*row[4] - W6*row[6];
140
141            MAC16(b0, W5, row[5]);
142            MAC16(b0, W7, row[7]);
143
144            MAC16(b1, -W1, row[5]);
145            MAC16(b1, -W5, row[7]);
146
147            MAC16(b2, W7, row[5]);
148            MAC16(b2, W3, row[7]);
149
150            MAC16(b3, W3, row[5]);
151            MAC16(b3, -W1, row[7]);
152        }
153
154        row[0] = (a0 + b0) >> ROW_SHIFT;
155        row[7] = (a0 - b0) >> ROW_SHIFT;
156        row[1] = (a1 + b1) >> ROW_SHIFT;
157        row[6] = (a1 - b1) >> ROW_SHIFT;
158        row[2] = (a2 + b2) >> ROW_SHIFT;
159        row[5] = (a2 - b2) >> ROW_SHIFT;
160        row[3] = (a3 + b3) >> ROW_SHIFT;
161        row[4] = (a3 - b3) >> ROW_SHIFT;
162}
163
164static inline void idctSparseColPut (uint8_t *dest, int line_size,
165                                     DCTELEM * col)
166{
167        int a0, a1, a2, a3, b0, b1, b2, b3;
168        uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
169
170        /* XXX: I did that only to give same values as previous code */
171        a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
172        a1 = a0;
173        a2 = a0;
174        a3 = a0;
175
176        a0 +=  + W2*col[8*2];
177        a1 +=  + W6*col[8*2];
178        a2 +=  - W6*col[8*2];
179        a3 +=  - W2*col[8*2];
180
181        b0 = MUL16(W1, col[8*1]);
182        b1 = MUL16(W3, col[8*1]);
183        b2 = MUL16(W5, col[8*1]);
184        b3 = MUL16(W7, col[8*1]);
185
186        MAC16(b0, + W3, col[8*3]);
187        MAC16(b1, - W7, col[8*3]);
188        MAC16(b2, - W1, col[8*3]);
189        MAC16(b3, - W5, col[8*3]);
190
191        if(col[8*4]){
192            a0 += + W4*col[8*4];
193            a1 += - W4*col[8*4];
194            a2 += - W4*col[8*4];
195            a3 += + W4*col[8*4];
196        }
197
198        if (col[8*5]) {
199            MAC16(b0, + W5, col[8*5]);
200            MAC16(b1, - W1, col[8*5]);
201            MAC16(b2, + W7, col[8*5]);
202            MAC16(b3, + W3, col[8*5]);
203        }
204
205        if(col[8*6]){
206            a0 += + W6*col[8*6];
207            a1 += - W2*col[8*6];
208            a2 += + W2*col[8*6];
209            a3 += - W6*col[8*6];
210        }
211
212        if (col[8*7]) {
213            MAC16(b0, + W7, col[8*7]);
214            MAC16(b1, - W5, col[8*7]);
215            MAC16(b2, + W3, col[8*7]);
216            MAC16(b3, - W1, col[8*7]);
217        }
218
219        dest[0] = cm[(a0 + b0) >> COL_SHIFT];
220        dest += line_size;
221        dest[0] = cm[(a1 + b1) >> COL_SHIFT];
222        dest += line_size;
223        dest[0] = cm[(a2 + b2) >> COL_SHIFT];
224        dest += line_size;
225        dest[0] = cm[(a3 + b3) >> COL_SHIFT];
226        dest += line_size;
227        dest[0] = cm[(a3 - b3) >> COL_SHIFT];
228        dest += line_size;
229        dest[0] = cm[(a2 - b2) >> COL_SHIFT];
230        dest += line_size;
231        dest[0] = cm[(a1 - b1) >> COL_SHIFT];
232        dest += line_size;
233        dest[0] = cm[(a0 - b0) >> COL_SHIFT];
234}
235
236static inline void idctSparseColAdd (uint8_t *dest, int line_size,
237                                     DCTELEM * col)
238{
239        int a0, a1, a2, a3, b0, b1, b2, b3;
240        uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
241
242        /* XXX: I did that only to give same values as previous code */
243        a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
244        a1 = a0;
245        a2 = a0;
246        a3 = a0;
247
248        a0 +=  + W2*col[8*2];
249        a1 +=  + W6*col[8*2];
250        a2 +=  - W6*col[8*2];
251        a3 +=  - W2*col[8*2];
252
253        b0 = MUL16(W1, col[8*1]);
254        b1 = MUL16(W3, col[8*1]);
255        b2 = MUL16(W5, col[8*1]);
256        b3 = MUL16(W7, col[8*1]);
257
258        MAC16(b0, + W3, col[8*3]);
259        MAC16(b1, - W7, col[8*3]);
260        MAC16(b2, - W1, col[8*3]);
261        MAC16(b3, - W5, col[8*3]);
262
263        if(col[8*4]){
264            a0 += + W4*col[8*4];
265            a1 += - W4*col[8*4];
266            a2 += - W4*col[8*4];
267            a3 += + W4*col[8*4];
268        }
269
270        if (col[8*5]) {
271            MAC16(b0, + W5, col[8*5]);
272            MAC16(b1, - W1, col[8*5]);
273            MAC16(b2, + W7, col[8*5]);
274            MAC16(b3, + W3, col[8*5]);
275        }
276
277        if(col[8*6]){
278            a0 += + W6*col[8*6];
279            a1 += - W2*col[8*6];
280            a2 += + W2*col[8*6];
281            a3 += - W6*col[8*6];
282        }
283
284        if (col[8*7]) {
285            MAC16(b0, + W7, col[8*7]);
286            MAC16(b1, - W5, col[8*7]);
287            MAC16(b2, + W3, col[8*7]);
288            MAC16(b3, - W1, col[8*7]);
289        }
290
291        dest[0] = cm[dest[0] + ((a0 + b0) >> COL_SHIFT)];
292        dest += line_size;
293        dest[0] = cm[dest[0] + ((a1 + b1) >> COL_SHIFT)];
294        dest += line_size;
295        dest[0] = cm[dest[0] + ((a2 + b2) >> COL_SHIFT)];
296        dest += line_size;
297        dest[0] = cm[dest[0] + ((a3 + b3) >> COL_SHIFT)];
298        dest += line_size;
299        dest[0] = cm[dest[0] + ((a3 - b3) >> COL_SHIFT)];
300        dest += line_size;
301        dest[0] = cm[dest[0] + ((a2 - b2) >> COL_SHIFT)];
302        dest += line_size;
303        dest[0] = cm[dest[0] + ((a1 - b1) >> COL_SHIFT)];
304        dest += line_size;
305        dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)];
306}
307
308static inline void idctSparseCol (DCTELEM * col)
309{
310        int a0, a1, a2, a3, b0, b1, b2, b3;
311
312        /* XXX: I did that only to give same values as previous code */
313        a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
314        a1 = a0;
315        a2 = a0;
316        a3 = a0;
317
318        a0 +=  + W2*col[8*2];
319        a1 +=  + W6*col[8*2];
320        a2 +=  - W6*col[8*2];
321        a3 +=  - W2*col[8*2];
322
323        b0 = MUL16(W1, col[8*1]);
324        b1 = MUL16(W3, col[8*1]);
325        b2 = MUL16(W5, col[8*1]);
326        b3 = MUL16(W7, col[8*1]);
327
328        MAC16(b0, + W3, col[8*3]);
329        MAC16(b1, - W7, col[8*3]);
330        MAC16(b2, - W1, col[8*3]);
331        MAC16(b3, - W5, col[8*3]);
332
333        if(col[8*4]){
334            a0 += + W4*col[8*4];
335            a1 += - W4*col[8*4];
336            a2 += - W4*col[8*4];
337            a3 += + W4*col[8*4];
338        }
339
340        if (col[8*5]) {
341            MAC16(b0, + W5, col[8*5]);
342            MAC16(b1, - W1, col[8*5]);
343            MAC16(b2, + W7, col[8*5]);
344            MAC16(b3, + W3, col[8*5]);
345        }
346
347        if(col[8*6]){
348            a0 += + W6*col[8*6];
349            a1 += - W2*col[8*6];
350            a2 += + W2*col[8*6];
351            a3 += - W6*col[8*6];
352        }
353
354        if (col[8*7]) {
355            MAC16(b0, + W7, col[8*7]);
356            MAC16(b1, - W5, col[8*7]);
357            MAC16(b2, + W3, col[8*7]);
358            MAC16(b3, - W1, col[8*7]);
359        }
360
361        col[0 ] = ((a0 + b0) >> COL_SHIFT);
362        col[8 ] = ((a1 + b1) >> COL_SHIFT);
363        col[16] = ((a2 + b2) >> COL_SHIFT);
364        col[24] = ((a3 + b3) >> COL_SHIFT);
365        col[32] = ((a3 - b3) >> COL_SHIFT);
366        col[40] = ((a2 - b2) >> COL_SHIFT);
367        col[48] = ((a1 - b1) >> COL_SHIFT);
368        col[56] = ((a0 - b0) >> COL_SHIFT);
369}
370
371void ff_simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
372{
373    int i;
374    for(i=0; i<8; i++)
375        idctRowCondDC(block + i*8);
376
377    for(i=0; i<8; i++)
378        idctSparseColPut(dest + i, line_size, block + i);
379}
380
381void ff_simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
382{
383    int i;
384    for(i=0; i<8; i++)
385        idctRowCondDC(block + i*8);
386
387    for(i=0; i<8; i++)
388        idctSparseColAdd(dest + i, line_size, block + i);
389}
390
391void ff_simple_idct(DCTELEM *block)
392{
393    int i;
394    for(i=0; i<8; i++)
395        idctRowCondDC(block + i*8);
396
397    for(i=0; i<8; i++)
398        idctSparseCol(block + i);
399}
400
401/* 2x4x8 idct */
402
403#define CN_SHIFT 12
404#define C_FIX(x) ((int)((x) * (1 << CN_SHIFT) + 0.5))
405#define C1 C_FIX(0.6532814824)
406#define C2 C_FIX(0.2705980501)
407
408/* row idct is multiple by 16 * sqrt(2.0), col idct4 is normalized,
409   and the butterfly must be multiplied by 0.5 * sqrt(2.0) */
410#define C_SHIFT (4+1+12)
411
412static inline void idct4col_put(uint8_t *dest, int line_size, const DCTELEM *col)
413{
414    int c0, c1, c2, c3, a0, a1, a2, a3;
415    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
416
417    a0 = col[8*0];
418    a1 = col[8*2];
419    a2 = col[8*4];
420    a3 = col[8*6];
421    c0 = ((a0 + a2) << (CN_SHIFT - 1)) + (1 << (C_SHIFT - 1));
422    c2 = ((a0 - a2) << (CN_SHIFT - 1)) + (1 << (C_SHIFT - 1));
423    c1 = a1 * C1 + a3 * C2;
424    c3 = a1 * C2 - a3 * C1;
425    dest[0] = cm[(c0 + c1) >> C_SHIFT];
426    dest += line_size;
427    dest[0] = cm[(c2 + c3) >> C_SHIFT];
428    dest += line_size;
429    dest[0] = cm[(c2 - c3) >> C_SHIFT];
430    dest += line_size;
431    dest[0] = cm[(c0 - c1) >> C_SHIFT];
432}
433
434#define BF(k) \
435{\
436    int a0, a1;\
437    a0 = ptr[k];\
438    a1 = ptr[8 + k];\
439    ptr[k] = a0 + a1;\
440    ptr[8 + k] = a0 - a1;\
441}
442
443/* only used by DV codec. The input must be interlaced. 128 is added
444   to the pixels before clamping to avoid systematic error
445   (1024*sqrt(2)) offset would be needed otherwise. */
446/* XXX: I think a 1.0/sqrt(2) normalization should be needed to
447   compensate the extra butterfly stage - I don't have the full DV
448   specification */
449void ff_simple_idct248_put(uint8_t *dest, int line_size, DCTELEM *block)
450{
451    int i;
452    DCTELEM *ptr;
453
454    /* butterfly */
455    ptr = block;
456    for(i=0;i<4;i++) {
457        BF(0);
458        BF(1);
459        BF(2);
460        BF(3);
461        BF(4);
462        BF(5);
463        BF(6);
464        BF(7);
465        ptr += 2 * 8;
466    }
467
468    /* IDCT8 on each line */
469    for(i=0; i<8; i++) {
470        idctRowCondDC(block + i*8);
471    }
472
473    /* IDCT4 and store */
474    for(i=0;i<8;i++) {
475        idct4col_put(dest + i, 2 * line_size, block + i);
476        idct4col_put(dest + line_size + i, 2 * line_size, block + 8 + i);
477    }
478}
479
480/* 8x4 & 4x8 WMV2 IDCT */
481#undef CN_SHIFT
482#undef C_SHIFT
483#undef C_FIX
484#undef C1
485#undef C2
486#define CN_SHIFT 12
487#define C_FIX(x) ((int)((x) * 1.414213562 * (1 << CN_SHIFT) + 0.5))
488#define C1 C_FIX(0.6532814824)
489#define C2 C_FIX(0.2705980501)
490#define C3 C_FIX(0.5)
491#define C_SHIFT (4+1+12)
492static inline void idct4col_add(uint8_t *dest, int line_size, const DCTELEM *col)
493{
494    int c0, c1, c2, c3, a0, a1, a2, a3;
495    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
496
497    a0 = col[8*0];
498    a1 = col[8*1];
499    a2 = col[8*2];
500    a3 = col[8*3];
501    c0 = (a0 + a2)*C3 + (1 << (C_SHIFT - 1));
502    c2 = (a0 - a2)*C3 + (1 << (C_SHIFT - 1));
503    c1 = a1 * C1 + a3 * C2;
504    c3 = a1 * C2 - a3 * C1;
505    dest[0] = cm[dest[0] + ((c0 + c1) >> C_SHIFT)];
506    dest += line_size;
507    dest[0] = cm[dest[0] + ((c2 + c3) >> C_SHIFT)];
508    dest += line_size;
509    dest[0] = cm[dest[0] + ((c2 - c3) >> C_SHIFT)];
510    dest += line_size;
511    dest[0] = cm[dest[0] + ((c0 - c1) >> C_SHIFT)];
512}
513
514#define RN_SHIFT 15
515#define R_FIX(x) ((int)((x) * 1.414213562 * (1 << RN_SHIFT) + 0.5))
516#define R1 R_FIX(0.6532814824)
517#define R2 R_FIX(0.2705980501)
518#define R3 R_FIX(0.5)
519#define R_SHIFT 11
520static inline void idct4row(DCTELEM *row)
521{
522    int c0, c1, c2, c3, a0, a1, a2, a3;
523    //const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
524
525    a0 = row[0];
526    a1 = row[1];
527    a2 = row[2];
528    a3 = row[3];
529    c0 = (a0 + a2)*R3 + (1 << (R_SHIFT - 1));
530    c2 = (a0 - a2)*R3 + (1 << (R_SHIFT - 1));
531    c1 = a1 * R1 + a3 * R2;
532    c3 = a1 * R2 - a3 * R1;
533    row[0]= (c0 + c1) >> R_SHIFT;
534    row[1]= (c2 + c3) >> R_SHIFT;
535    row[2]= (c2 - c3) >> R_SHIFT;
536    row[3]= (c0 - c1) >> R_SHIFT;
537}
538
539void ff_simple_idct84_add(uint8_t *dest, int line_size, DCTELEM *block)
540{
541    int i;
542
543    /* IDCT8 on each line */
544    for(i=0; i<4; i++) {
545        idctRowCondDC(block + i*8);
546    }
547
548    /* IDCT4 and store */
549    for(i=0;i<8;i++) {
550        idct4col_add(dest + i, line_size, block + i);
551    }
552}
553
554void ff_simple_idct48_add(uint8_t *dest, int line_size, DCTELEM *block)
555{
556    int i;
557
558    /* IDCT4 on each line */
559    for(i=0; i<8; i++) {
560        idct4row(block + i*8);
561    }
562
563    /* IDCT8 and store */
564    for(i=0; i<4; i++){
565        idctSparseColAdd(dest + i, line_size, block + i);
566    }
567}
568
569void ff_simple_idct44_add(uint8_t *dest, int line_size, DCTELEM *block)
570{
571    int i;
572
573    /* IDCT4 on each line */
574    for(i=0; i<4; i++) {
575        idct4row(block + i*8);
576    }
577
578    /* IDCT4 and store */
579    for(i=0; i<4; i++){
580        idct4col_add(dest + i, line_size, block + i);
581    }
582}
583