1/*
2 * (c) 2001 Fabrice Bellard
3 *     2007 Marc Hoffman <marc.hoffman@analog.com>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22/**
23 * @file
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
26 */
27
28#include <stdlib.h>
29#include <stdio.h>
30#include <string.h>
31#include <sys/time.h>
32#include <unistd.h>
33#include <math.h>
34
35#include "libavutil/common.h"
36#include "libavutil/lfg.h"
37
38#include "simple_idct.h"
39#include "aandcttab.h"
40#include "faandct.h"
41#include "faanidct.h"
42#include "x86/idct_xvid.h"
43#include "dctref.h"
44
45#undef printf
46
47void ff_mmx_idct(DCTELEM *data);
48void ff_mmxext_idct(DCTELEM *data);
49
50void odivx_idct_c(short *block);
51
52// BFIN
53void ff_bfin_idct(DCTELEM *block);
54void ff_bfin_fdct(DCTELEM *block);
55
56// ALTIVEC
57void fdct_altivec(DCTELEM *block);
58//void idct_altivec(DCTELEM *block);?? no routine
59
60// ARM
61void ff_j_rev_dct_arm(DCTELEM *data);
62void ff_simple_idct_arm(DCTELEM *data);
63void ff_simple_idct_armv5te(DCTELEM *data);
64void ff_simple_idct_armv6(DCTELEM *data);
65void ff_simple_idct_neon(DCTELEM *data);
66
67void ff_simple_idct_axp(DCTELEM *data);
68
69struct algo {
70  const char *name;
71  enum { FDCT, IDCT } is_idct;
72  void (* func) (DCTELEM *block);
73  void (* ref)  (DCTELEM *block);
74  enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format;
75  int  mm_support;
76};
77
78#ifndef FAAN_POSTSCALE
79#define FAAN_SCALE SCALE_PERM
80#else
81#define FAAN_SCALE NO_PERM
82#endif
83
84static int cpu_flags;
85
86struct algo algos[] = {
87  {"REF-DBL",         0, ff_ref_fdct,        ff_ref_fdct, NO_PERM},
88  {"FAAN",            0, ff_faandct,         ff_ref_fdct, FAAN_SCALE},
89  {"FAANI",           1, ff_faanidct,        ff_ref_idct, NO_PERM},
90  {"IJG-AAN-INT",     0, fdct_ifast,         ff_ref_fdct, SCALE_PERM},
91  {"IJG-LLM-INT",     0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
92  {"REF-DBL",         1, ff_ref_idct,        ff_ref_idct, NO_PERM},
93  {"INT",             1, j_rev_dct,          ff_ref_idct, MMX_PERM},
94  {"SIMPLE-C",        1, ff_simple_idct,     ff_ref_idct, NO_PERM},
95
96#if HAVE_MMX
97  {"MMX",             0, ff_fdct_mmx,        ff_ref_fdct, NO_PERM, FF_MM_MMX},
98#if HAVE_MMX2
99  {"MMX2",            0, ff_fdct_mmx2,       ff_ref_fdct, NO_PERM, FF_MM_MMX2},
100  {"SSE2",            0, ff_fdct_sse2,       ff_ref_fdct, NO_PERM, FF_MM_SSE2},
101#endif
102
103#if CONFIG_GPL
104  {"LIBMPEG2-MMX",    1, ff_mmx_idct,        ff_ref_idct, MMX_PERM, FF_MM_MMX},
105  {"LIBMPEG2-MMX2",   1, ff_mmxext_idct,     ff_ref_idct, MMX_PERM, FF_MM_MMX2},
106#endif
107  {"SIMPLE-MMX",      1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, FF_MM_MMX},
108  {"XVID-MMX",        1, ff_idct_xvid_mmx,   ff_ref_idct, NO_PERM, FF_MM_MMX},
109  {"XVID-MMX2",       1, ff_idct_xvid_mmx2,  ff_ref_idct, NO_PERM, FF_MM_MMX2},
110  {"XVID-SSE2",       1, ff_idct_xvid_sse2,  ff_ref_idct, SSE2_PERM, FF_MM_SSE2},
111#endif
112
113#if HAVE_ALTIVEC
114  {"altivecfdct",     0, fdct_altivec,       ff_ref_fdct, NO_PERM, FF_MM_ALTIVEC},
115#endif
116
117#if ARCH_BFIN
118  {"BFINfdct",        0, ff_bfin_fdct,       ff_ref_fdct, NO_PERM},
119  {"BFINidct",        1, ff_bfin_idct,       ff_ref_idct, NO_PERM},
120#endif
121
122#if ARCH_ARM
123  {"SIMPLE-ARM",      1, ff_simple_idct_arm, ff_ref_idct, NO_PERM },
124  {"INT-ARM",         1, ff_j_rev_dct_arm,   ff_ref_idct, MMX_PERM },
125#if HAVE_ARMV5TE
126  {"SIMPLE-ARMV5TE",  1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM },
127#endif
128#if HAVE_ARMV6
129  {"SIMPLE-ARMV6",    1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
130#endif
131#if HAVE_NEON
132  {"SIMPLE-NEON",     1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
133#endif
134#endif /* ARCH_ARM */
135
136#if ARCH_ALPHA
137  {"SIMPLE-ALPHA",    1, ff_simple_idct_axp,  ff_ref_idct, NO_PERM },
138#endif
139
140  { 0 }
141};
142
143#define AANSCALE_BITS 12
144
145uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
146
147static int64_t gettime(void)
148{
149    struct timeval tv;
150    gettimeofday(&tv,NULL);
151    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
152}
153
154#define NB_ITS 20000
155#define NB_ITS_SPEED 50000
156
157static short idct_mmx_perm[64];
158
159static short idct_simple_mmx_perm[64]={
160        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
161        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
162        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
163        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
164        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
165        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
166        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
167        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
168};
169
170static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
171
172static void idct_mmx_init(void)
173{
174    int i;
175
176    /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
177    for (i = 0; i < 64; i++) {
178        idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
179//        idct_simple_mmx_perm[i] = simple_block_permute_op(i);
180    }
181}
182
183DECLARE_ALIGNED(16, static DCTELEM, block)[64];
184DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
185DECLARE_ALIGNED(8, static DCTELEM, block_org)[64];
186
187static inline void mmx_emms(void)
188{
189#if HAVE_MMX
190    if (cpu_flags & FF_MM_MMX)
191        __asm__ volatile ("emms\n\t");
192#endif
193}
194
195static void dct_error(const char *name, int is_idct,
196               void (*fdct_func)(DCTELEM *block),
197               void (*fdct_ref)(DCTELEM *block), int form, int test)
198{
199    int it, i, scale;
200    int err_inf, v;
201    int64_t err2, ti, ti1, it1;
202    int64_t sysErr[64], sysErrMax=0;
203    int maxout=0;
204    int blockSumErrMax=0, blockSumErr;
205    AVLFG prng;
206
207    av_lfg_init(&prng, 1);
208
209    err_inf = 0;
210    err2 = 0;
211    for(i=0; i<64; i++) sysErr[i]=0;
212    for(it=0;it<NB_ITS;it++) {
213        for(i=0;i<64;i++)
214            block1[i] = 0;
215        switch(test){
216        case 0:
217            for(i=0;i<64;i++)
218                block1[i] = (av_lfg_get(&prng) % 512) -256;
219            if (is_idct){
220                ff_ref_fdct(block1);
221
222                for(i=0;i<64;i++)
223                    block1[i]>>=3;
224            }
225        break;
226        case 1:{
227            int num = av_lfg_get(&prng) % 10 + 1;
228            for(i=0;i<num;i++)
229                block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % 512 -256;
230        }break;
231        case 2:
232            block1[0] = av_lfg_get(&prng) % 4096 - 2048;
233            block1[63]= (block1[0]&1)^1;
234        break;
235        }
236
237#if 0 // simulate mismatch control
238{ int sum=0;
239        for(i=0;i<64;i++)
240           sum+=block1[i];
241
242        if((sum&1)==0) block1[63]^=1;
243}
244#endif
245
246        for(i=0; i<64; i++)
247            block_org[i]= block1[i];
248
249        if (form == MMX_PERM) {
250            for(i=0;i<64;i++)
251                block[idct_mmx_perm[i]] = block1[i];
252            } else if (form == MMX_SIMPLE_PERM) {
253            for(i=0;i<64;i++)
254                block[idct_simple_mmx_perm[i]] = block1[i];
255
256        } else if (form == SSE2_PERM) {
257            for(i=0; i<64; i++)
258                block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
259        } else if (form == PARTTRANS_PERM) {
260            for(i=0; i<64; i++)
261                block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
262        } else {
263            for(i=0; i<64; i++)
264                block[i]= block1[i];
265        }
266#if 0 // simulate mismatch control for tested IDCT but not the ref
267{ int sum=0;
268        for(i=0;i<64;i++)
269           sum+=block[i];
270
271        if((sum&1)==0) block[63]^=1;
272}
273#endif
274
275        fdct_func(block);
276        mmx_emms();
277
278        if (form == SCALE_PERM) {
279            for(i=0; i<64; i++) {
280                scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
281                block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS;
282            }
283        }
284
285        fdct_ref(block1);
286
287        blockSumErr=0;
288        for(i=0;i<64;i++) {
289            v = abs(block[i] - block1[i]);
290            if (v > err_inf)
291                err_inf = v;
292            err2 += v * v;
293            sysErr[i] += block[i] - block1[i];
294            blockSumErr += v;
295            if( abs(block[i])>maxout) maxout=abs(block[i]);
296        }
297        if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr;
298#if 0 // print different matrix pairs
299        if(blockSumErr){
300            printf("\n");
301            for(i=0; i<64; i++){
302                if((i&7)==0) printf("\n");
303                printf("%4d ", block_org[i]);
304            }
305            for(i=0; i<64; i++){
306                if((i&7)==0) printf("\n");
307                printf("%4d ", block[i] - block1[i]);
308            }
309        }
310#endif
311    }
312    for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
313
314#if 1 // dump systematic errors
315    for(i=0; i<64; i++){
316        if(i%8==0) printf("\n");
317        printf("%7d ", (int)sysErr[i]);
318    }
319    printf("\n");
320#endif
321
322    printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
323           is_idct ? "IDCT" : "DCT",
324           name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
325#if 1 //Speed test
326    /* speed test */
327    for(i=0;i<64;i++)
328        block1[i] = 0;
329    switch(test){
330    case 0:
331        for(i=0;i<64;i++)
332            block1[i] = av_lfg_get(&prng) % 512 -256;
333        if (is_idct){
334            ff_ref_fdct(block1);
335
336            for(i=0;i<64;i++)
337                block1[i]>>=3;
338        }
339    break;
340    case 1:{
341    case 2:
342        block1[0] = av_lfg_get(&prng) % 512 -256;
343        block1[1] = av_lfg_get(&prng) % 512 -256;
344        block1[2] = av_lfg_get(&prng) % 512 -256;
345        block1[3] = av_lfg_get(&prng) % 512 -256;
346    }break;
347    }
348
349    if (form == MMX_PERM) {
350        for(i=0;i<64;i++)
351            block[idct_mmx_perm[i]] = block1[i];
352    } else if(form == MMX_SIMPLE_PERM) {
353        for(i=0;i<64;i++)
354            block[idct_simple_mmx_perm[i]] = block1[i];
355    } else {
356        for(i=0; i<64; i++)
357            block[i]= block1[i];
358    }
359
360    ti = gettime();
361    it1 = 0;
362    do {
363        for(it=0;it<NB_ITS_SPEED;it++) {
364            for(i=0; i<64; i++)
365                block[i]= block1[i];
366//            memcpy(block, block1, sizeof(DCTELEM) * 64);
367// do not memcpy especially not fastmemcpy because it does movntq !!!
368            fdct_func(block);
369        }
370        it1 += NB_ITS_SPEED;
371        ti1 = gettime() - ti;
372    } while (ti1 < 1000000);
373    mmx_emms();
374
375    printf("%s %s: %0.1f kdct/s\n",
376           is_idct ? "IDCT" : "DCT",
377           name, (double)it1 * 1000.0 / (double)ti1);
378#endif
379}
380
381DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
382DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
383
384static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
385{
386    static int init;
387    static double c8[8][8];
388    static double c4[4][4];
389    double block1[64], block2[64], block3[64];
390    double s, sum, v;
391    int i, j, k;
392
393    if (!init) {
394        init = 1;
395
396        for(i=0;i<8;i++) {
397            sum = 0;
398            for(j=0;j<8;j++) {
399                s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
400                c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
401                sum += c8[i][j] * c8[i][j];
402            }
403        }
404
405        for(i=0;i<4;i++) {
406            sum = 0;
407            for(j=0;j<4;j++) {
408                s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
409                c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
410                sum += c4[i][j] * c4[i][j];
411            }
412        }
413    }
414
415    /* butterfly */
416    s = 0.5 * sqrt(2.0);
417    for(i=0;i<4;i++) {
418        for(j=0;j<8;j++) {
419            block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s;
420            block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s;
421        }
422    }
423
424    /* idct8 on lines */
425    for(i=0;i<8;i++) {
426        for(j=0;j<8;j++) {
427            sum = 0;
428            for(k=0;k<8;k++)
429                sum += c8[k][j] * block1[8*i+k];
430            block2[8*i+j] = sum;
431        }
432    }
433
434    /* idct4 */
435    for(i=0;i<8;i++) {
436        for(j=0;j<4;j++) {
437            /* top */
438            sum = 0;
439            for(k=0;k<4;k++)
440                sum += c4[k][j] * block2[8*(2*k)+i];
441            block3[8*(2*j)+i] = sum;
442
443            /* bottom */
444            sum = 0;
445            for(k=0;k<4;k++)
446                sum += c4[k][j] * block2[8*(2*k+1)+i];
447            block3[8*(2*j+1)+i] = sum;
448        }
449    }
450
451    /* clamp and store the result */
452    for(i=0;i<8;i++) {
453        for(j=0;j<8;j++) {
454            v = block3[8*i+j];
455            if (v < 0)
456                v = 0;
457            else if (v > 255)
458                v = 255;
459            dest[i * linesize + j] = (int)rint(v);
460        }
461    }
462}
463
464static void idct248_error(const char *name,
465                    void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block))
466{
467    int it, i, it1, ti, ti1, err_max, v;
468
469    AVLFG prng;
470
471    av_lfg_init(&prng, 1);
472
473    /* just one test to see if code is correct (precision is less
474       important here) */
475    err_max = 0;
476    for(it=0;it<NB_ITS;it++) {
477
478        /* XXX: use forward transform to generate values */
479        for(i=0;i<64;i++)
480            block1[i] = av_lfg_get(&prng) % 256 - 128;
481        block1[0] += 1024;
482
483        for(i=0; i<64; i++)
484            block[i]= block1[i];
485        idct248_ref(img_dest1, 8, block);
486
487        for(i=0; i<64; i++)
488            block[i]= block1[i];
489        idct248_put(img_dest, 8, block);
490
491        for(i=0;i<64;i++) {
492            v = abs((int)img_dest[i] - (int)img_dest1[i]);
493            if (v == 255)
494                printf("%d %d\n", img_dest[i], img_dest1[i]);
495            if (v > err_max)
496                err_max = v;
497        }
498#if 0
499        printf("ref=\n");
500        for(i=0;i<8;i++) {
501            int j;
502            for(j=0;j<8;j++) {
503                printf(" %3d", img_dest1[i*8+j]);
504            }
505            printf("\n");
506        }
507
508        printf("out=\n");
509        for(i=0;i<8;i++) {
510            int j;
511            for(j=0;j<8;j++) {
512                printf(" %3d", img_dest[i*8+j]);
513            }
514            printf("\n");
515        }
516#endif
517    }
518    printf("%s %s: err_inf=%d\n",
519           1 ? "IDCT248" : "DCT248",
520           name, err_max);
521
522    ti = gettime();
523    it1 = 0;
524    do {
525        for(it=0;it<NB_ITS_SPEED;it++) {
526            for(i=0; i<64; i++)
527                block[i]= block1[i];
528//            memcpy(block, block1, sizeof(DCTELEM) * 64);
529// do not memcpy especially not fastmemcpy because it does movntq !!!
530            idct248_put(img_dest, 8, block);
531        }
532        it1 += NB_ITS_SPEED;
533        ti1 = gettime() - ti;
534    } while (ti1 < 1000000);
535    mmx_emms();
536
537    printf("%s %s: %0.1f kdct/s\n",
538           1 ? "IDCT248" : "DCT248",
539           name, (double)it1 * 1000.0 / (double)ti1);
540}
541
542static void help(void)
543{
544    printf("dct-test [-i] [<test-number>]\n"
545           "test-number 0 -> test with random matrixes\n"
546           "            1 -> test with random sparse matrixes\n"
547           "            2 -> do 3. test from mpeg4 std\n"
548           "-i          test IDCT implementations\n"
549           "-4          test IDCT248 implementations\n");
550}
551
552int main(int argc, char **argv)
553{
554    int test_idct = 0, test_248_dct = 0;
555    int c,i;
556    int test=1;
557    cpu_flags = mm_support();
558
559    ff_ref_dct_init();
560    idct_mmx_init();
561
562    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
563    for(i=0;i<MAX_NEG_CROP;i++) {
564        cropTbl[i] = 0;
565        cropTbl[i + MAX_NEG_CROP + 256] = 255;
566    }
567
568    for(;;) {
569        c = getopt(argc, argv, "ih4");
570        if (c == -1)
571            break;
572        switch(c) {
573        case 'i':
574            test_idct = 1;
575            break;
576        case '4':
577            test_248_dct = 1;
578            break;
579        default :
580        case 'h':
581            help();
582            return 0;
583        }
584    }
585
586    if(optind <argc) test= atoi(argv[optind]);
587
588    printf("ffmpeg DCT/IDCT test\n");
589
590    if (test_248_dct) {
591        idct248_error("SIMPLE-C", ff_simple_idct248_put);
592    } else {
593      for (i=0;algos[i].name;i++)
594        if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) {
595          dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test);
596        }
597    }
598    return 0;
599}
600