1/*
2 * (c) 2001 Fabrice Bellard
3 *     2007 Marc Hoffman <marc.hoffman@analog.com>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22/**
23 * @file libavcodec/dct-test.c
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
26 */
27
28#include <stdlib.h>
29#include <stdio.h>
30#include <string.h>
31#include <sys/time.h>
32#include <unistd.h>
33#include <math.h>
34
35#include "libavutil/common.h"
36
37#include "simple_idct.h"
38#include "aandcttab.h"
39#include "faandct.h"
40#include "faanidct.h"
41#include "x86/idct_xvid.h"
42
43#undef printf
44#undef random
45
46void *fast_memcpy(void *a, const void *b, size_t c){return memcpy(a,b,c);};
47
48/* reference fdct/idct */
49void ff_ref_fdct(DCTELEM *block);
50void ff_ref_idct(DCTELEM *block);
51void ff_ref_dct_init(void);
52
53void ff_mmx_idct(DCTELEM *data);
54void ff_mmxext_idct(DCTELEM *data);
55
56void odivx_idct_c(short *block);
57
58// BFIN
59void ff_bfin_idct(DCTELEM *block);
60void ff_bfin_fdct(DCTELEM *block);
61
62// ALTIVEC
63void fdct_altivec(DCTELEM *block);
64//void idct_altivec(DCTELEM *block);?? no routine
65
66// ARM
67void j_rev_dct_ARM(DCTELEM *data);
68void simple_idct_ARM(DCTELEM *data);
69void simple_idct_armv5te(DCTELEM *data);
70void ff_simple_idct_armv6(DCTELEM *data);
71void ff_simple_idct_neon(DCTELEM *data);
72
73void ff_simple_idct_axp(DCTELEM *data);
74
75struct algo {
76  const char *name;
77  enum { FDCT, IDCT } is_idct;
78  void (* func) (DCTELEM *block);
79  void (* ref)  (DCTELEM *block);
80  enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format;
81  int  mm_support;
82};
83
84#ifndef FAAN_POSTSCALE
85#define FAAN_SCALE SCALE_PERM
86#else
87#define FAAN_SCALE NO_PERM
88#endif
89
90static int cpu_flags;
91
92struct algo algos[] = {
93  {"REF-DBL",         0, ff_ref_fdct,        ff_ref_fdct, NO_PERM},
94  {"FAAN",            0, ff_faandct,         ff_ref_fdct, FAAN_SCALE},
95  {"FAANI",           1, ff_faanidct,        ff_ref_idct, NO_PERM},
96  {"IJG-AAN-INT",     0, fdct_ifast,         ff_ref_fdct, SCALE_PERM},
97  {"IJG-LLM-INT",     0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
98  {"REF-DBL",         1, ff_ref_idct,        ff_ref_idct, NO_PERM},
99  {"INT",             1, j_rev_dct,          ff_ref_idct, MMX_PERM},
100  {"SIMPLE-C",        1, ff_simple_idct,     ff_ref_idct, NO_PERM},
101
102#if HAVE_MMX
103  {"MMX",             0, ff_fdct_mmx,        ff_ref_fdct, NO_PERM, FF_MM_MMX},
104#if HAVE_MMX2
105  {"MMX2",            0, ff_fdct_mmx2,       ff_ref_fdct, NO_PERM, FF_MM_MMXEXT},
106  {"SSE2",            0, ff_fdct_sse2,       ff_ref_fdct, NO_PERM, FF_MM_SSE2},
107#endif
108
109#if CONFIG_GPL
110  {"LIBMPEG2-MMX",    1, ff_mmx_idct,        ff_ref_idct, MMX_PERM, FF_MM_MMX},
111  {"LIBMPEG2-MMXEXT", 1, ff_mmxext_idct,     ff_ref_idct, MMX_PERM, FF_MM_MMXEXT},
112#endif
113  {"SIMPLE-MMX",      1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, FF_MM_MMX},
114  {"XVID-MMX",        1, ff_idct_xvid_mmx,   ff_ref_idct, NO_PERM, FF_MM_MMX},
115  {"XVID-MMX2",       1, ff_idct_xvid_mmx2,  ff_ref_idct, NO_PERM, FF_MM_MMXEXT},
116  {"XVID-SSE2",       1, ff_idct_xvid_sse2,  ff_ref_idct, SSE2_PERM, FF_MM_SSE2},
117#endif
118
119#if HAVE_ALTIVEC
120  {"altivecfdct",     0, fdct_altivec,       ff_ref_fdct, NO_PERM, FF_MM_ALTIVEC},
121#endif
122
123#if ARCH_BFIN
124  {"BFINfdct",        0, ff_bfin_fdct,       ff_ref_fdct, NO_PERM},
125  {"BFINidct",        1, ff_bfin_idct,       ff_ref_idct, NO_PERM},
126#endif
127
128#if ARCH_ARM
129  {"SIMPLE-ARM",      1, simple_idct_ARM,    ff_ref_idct, NO_PERM },
130  {"INT-ARM",         1, j_rev_dct_ARM,      ff_ref_idct, MMX_PERM },
131#if HAVE_ARMV5TE
132  {"SIMPLE-ARMV5TE",  1, simple_idct_armv5te, ff_ref_idct, NO_PERM },
133#endif
134#if HAVE_ARMV6
135  {"SIMPLE-ARMV6",    1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
136#endif
137#if HAVE_NEON
138  {"SIMPLE-NEON",     1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
139#endif
140#endif /* ARCH_ARM */
141
142#if ARCH_ALPHA
143  {"SIMPLE-ALPHA",    1, ff_simple_idct_axp,  ff_ref_idct, NO_PERM },
144#endif
145
146  { 0 }
147};
148
149#define AANSCALE_BITS 12
150
151uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
152
153int64_t gettime(void)
154{
155    struct timeval tv;
156    gettimeofday(&tv,NULL);
157    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
158}
159
160#define NB_ITS 20000
161#define NB_ITS_SPEED 50000
162
163static short idct_mmx_perm[64];
164
165static short idct_simple_mmx_perm[64]={
166        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
167        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
168        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
169        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
170        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
171        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
172        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
173        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
174};
175
176static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
177
178void idct_mmx_init(void)
179{
180    int i;
181
182    /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
183    for (i = 0; i < 64; i++) {
184        idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
185//        idct_simple_mmx_perm[i] = simple_block_permute_op(i);
186    }
187}
188
189static DCTELEM block[64] __attribute__ ((aligned (16)));
190static DCTELEM block1[64] __attribute__ ((aligned (8)));
191static DCTELEM block_org[64] __attribute__ ((aligned (8)));
192
193static inline void mmx_emms(void)
194{
195#if HAVE_MMX
196    if (cpu_flags & FF_MM_MMX)
197        __asm__ volatile ("emms\n\t");
198#endif
199}
200
201void dct_error(const char *name, int is_idct,
202               void (*fdct_func)(DCTELEM *block),
203               void (*fdct_ref)(DCTELEM *block), int form, int test)
204{
205    int it, i, scale;
206    int err_inf, v;
207    int64_t err2, ti, ti1, it1;
208    int64_t sysErr[64], sysErrMax=0;
209    int maxout=0;
210    int blockSumErrMax=0, blockSumErr;
211
212    srandom(0);
213
214    err_inf = 0;
215    err2 = 0;
216    for(i=0; i<64; i++) sysErr[i]=0;
217    for(it=0;it<NB_ITS;it++) {
218        for(i=0;i<64;i++)
219            block1[i] = 0;
220        switch(test){
221        case 0:
222            for(i=0;i<64;i++)
223                block1[i] = (random() % 512) -256;
224            if (is_idct){
225                ff_ref_fdct(block1);
226
227                for(i=0;i<64;i++)
228                    block1[i]>>=3;
229            }
230        break;
231        case 1:{
232            int num= (random()%10)+1;
233            for(i=0;i<num;i++)
234                block1[random()%64] = (random() % 512) -256;
235        }break;
236        case 2:
237            block1[0]= (random()%4096)-2048;
238            block1[63]= (block1[0]&1)^1;
239        break;
240        }
241
242#if 0 // simulate mismatch control
243{ int sum=0;
244        for(i=0;i<64;i++)
245           sum+=block1[i];
246
247        if((sum&1)==0) block1[63]^=1;
248}
249#endif
250
251        for(i=0; i<64; i++)
252            block_org[i]= block1[i];
253
254        if (form == MMX_PERM) {
255            for(i=0;i<64;i++)
256                block[idct_mmx_perm[i]] = block1[i];
257            } else if (form == MMX_SIMPLE_PERM) {
258            for(i=0;i<64;i++)
259                block[idct_simple_mmx_perm[i]] = block1[i];
260
261        } else if (form == SSE2_PERM) {
262            for(i=0; i<64; i++)
263                block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
264        } else if (form == PARTTRANS_PERM) {
265            for(i=0; i<64; i++)
266                block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
267        } else {
268            for(i=0; i<64; i++)
269                block[i]= block1[i];
270        }
271#if 0 // simulate mismatch control for tested IDCT but not the ref
272{ int sum=0;
273        for(i=0;i<64;i++)
274           sum+=block[i];
275
276        if((sum&1)==0) block[63]^=1;
277}
278#endif
279
280        fdct_func(block);
281        mmx_emms();
282
283        if (form == SCALE_PERM) {
284            for(i=0; i<64; i++) {
285                scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
286                block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS;
287            }
288        }
289
290        fdct_ref(block1);
291
292        blockSumErr=0;
293        for(i=0;i<64;i++) {
294            v = abs(block[i] - block1[i]);
295            if (v > err_inf)
296                err_inf = v;
297            err2 += v * v;
298            sysErr[i] += block[i] - block1[i];
299            blockSumErr += v;
300            if( abs(block[i])>maxout) maxout=abs(block[i]);
301        }
302        if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr;
303#if 0 // print different matrix pairs
304        if(blockSumErr){
305            printf("\n");
306            for(i=0; i<64; i++){
307                if((i&7)==0) printf("\n");
308                printf("%4d ", block_org[i]);
309            }
310            for(i=0; i<64; i++){
311                if((i&7)==0) printf("\n");
312                printf("%4d ", block[i] - block1[i]);
313            }
314        }
315#endif
316    }
317    for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
318
319#if 1 // dump systematic errors
320    for(i=0; i<64; i++){
321        if(i%8==0) printf("\n");
322        printf("%5d ", (int)sysErr[i]);
323    }
324    printf("\n");
325#endif
326
327    printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
328           is_idct ? "IDCT" : "DCT",
329           name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
330#if 1 //Speed test
331    /* speed test */
332    for(i=0;i<64;i++)
333        block1[i] = 0;
334    switch(test){
335    case 0:
336        for(i=0;i<64;i++)
337            block1[i] = (random() % 512) -256;
338        if (is_idct){
339            ff_ref_fdct(block1);
340
341            for(i=0;i<64;i++)
342                block1[i]>>=3;
343        }
344    break;
345    case 1:{
346    case 2:
347        block1[0] = (random() % 512) -256;
348        block1[1] = (random() % 512) -256;
349        block1[2] = (random() % 512) -256;
350        block1[3] = (random() % 512) -256;
351    }break;
352    }
353
354    if (form == MMX_PERM) {
355        for(i=0;i<64;i++)
356            block[idct_mmx_perm[i]] = block1[i];
357    } else if(form == MMX_SIMPLE_PERM) {
358        for(i=0;i<64;i++)
359            block[idct_simple_mmx_perm[i]] = block1[i];
360    } else {
361        for(i=0; i<64; i++)
362            block[i]= block1[i];
363    }
364
365    ti = gettime();
366    it1 = 0;
367    do {
368        for(it=0;it<NB_ITS_SPEED;it++) {
369            for(i=0; i<64; i++)
370                block[i]= block1[i];
371//            memcpy(block, block1, sizeof(DCTELEM) * 64);
372// do not memcpy especially not fastmemcpy because it does movntq !!!
373            fdct_func(block);
374        }
375        it1 += NB_ITS_SPEED;
376        ti1 = gettime() - ti;
377    } while (ti1 < 1000000);
378    mmx_emms();
379
380    printf("%s %s: %0.1f kdct/s\n",
381           is_idct ? "IDCT" : "DCT",
382           name, (double)it1 * 1000.0 / (double)ti1);
383#endif
384}
385
386static uint8_t img_dest[64] __attribute__ ((aligned (8)));
387static uint8_t img_dest1[64] __attribute__ ((aligned (8)));
388
389void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
390{
391    static int init;
392    static double c8[8][8];
393    static double c4[4][4];
394    double block1[64], block2[64], block3[64];
395    double s, sum, v;
396    int i, j, k;
397
398    if (!init) {
399        init = 1;
400
401        for(i=0;i<8;i++) {
402            sum = 0;
403            for(j=0;j<8;j++) {
404                s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
405                c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
406                sum += c8[i][j] * c8[i][j];
407            }
408        }
409
410        for(i=0;i<4;i++) {
411            sum = 0;
412            for(j=0;j<4;j++) {
413                s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
414                c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
415                sum += c4[i][j] * c4[i][j];
416            }
417        }
418    }
419
420    /* butterfly */
421    s = 0.5 * sqrt(2.0);
422    for(i=0;i<4;i++) {
423        for(j=0;j<8;j++) {
424            block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s;
425            block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s;
426        }
427    }
428
429    /* idct8 on lines */
430    for(i=0;i<8;i++) {
431        for(j=0;j<8;j++) {
432            sum = 0;
433            for(k=0;k<8;k++)
434                sum += c8[k][j] * block1[8*i+k];
435            block2[8*i+j] = sum;
436        }
437    }
438
439    /* idct4 */
440    for(i=0;i<8;i++) {
441        for(j=0;j<4;j++) {
442            /* top */
443            sum = 0;
444            for(k=0;k<4;k++)
445                sum += c4[k][j] * block2[8*(2*k)+i];
446            block3[8*(2*j)+i] = sum;
447
448            /* bottom */
449            sum = 0;
450            for(k=0;k<4;k++)
451                sum += c4[k][j] * block2[8*(2*k+1)+i];
452            block3[8*(2*j+1)+i] = sum;
453        }
454    }
455
456    /* clamp and store the result */
457    for(i=0;i<8;i++) {
458        for(j=0;j<8;j++) {
459            v = block3[8*i+j];
460            if (v < 0)
461                v = 0;
462            else if (v > 255)
463                v = 255;
464            dest[i * linesize + j] = (int)rint(v);
465        }
466    }
467}
468
469void idct248_error(const char *name,
470                    void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block))
471{
472    int it, i, it1, ti, ti1, err_max, v;
473
474    srandom(0);
475
476    /* just one test to see if code is correct (precision is less
477       important here) */
478    err_max = 0;
479    for(it=0;it<NB_ITS;it++) {
480
481        /* XXX: use forward transform to generate values */
482        for(i=0;i<64;i++)
483            block1[i] = (random() % 256) - 128;
484        block1[0] += 1024;
485
486        for(i=0; i<64; i++)
487            block[i]= block1[i];
488        idct248_ref(img_dest1, 8, block);
489
490        for(i=0; i<64; i++)
491            block[i]= block1[i];
492        idct248_put(img_dest, 8, block);
493
494        for(i=0;i<64;i++) {
495            v = abs((int)img_dest[i] - (int)img_dest1[i]);
496            if (v == 255)
497                printf("%d %d\n", img_dest[i], img_dest1[i]);
498            if (v > err_max)
499                err_max = v;
500        }
501#if 0
502        printf("ref=\n");
503        for(i=0;i<8;i++) {
504            int j;
505            for(j=0;j<8;j++) {
506                printf(" %3d", img_dest1[i*8+j]);
507            }
508            printf("\n");
509        }
510
511        printf("out=\n");
512        for(i=0;i<8;i++) {
513            int j;
514            for(j=0;j<8;j++) {
515                printf(" %3d", img_dest[i*8+j]);
516            }
517            printf("\n");
518        }
519#endif
520    }
521    printf("%s %s: err_inf=%d\n",
522           1 ? "IDCT248" : "DCT248",
523           name, err_max);
524
525    ti = gettime();
526    it1 = 0;
527    do {
528        for(it=0;it<NB_ITS_SPEED;it++) {
529            for(i=0; i<64; i++)
530                block[i]= block1[i];
531//            memcpy(block, block1, sizeof(DCTELEM) * 64);
532// do not memcpy especially not fastmemcpy because it does movntq !!!
533            idct248_put(img_dest, 8, block);
534        }
535        it1 += NB_ITS_SPEED;
536        ti1 = gettime() - ti;
537    } while (ti1 < 1000000);
538    mmx_emms();
539
540    printf("%s %s: %0.1f kdct/s\n",
541           1 ? "IDCT248" : "DCT248",
542           name, (double)it1 * 1000.0 / (double)ti1);
543}
544
545void help(void)
546{
547    printf("dct-test [-i] [<test-number>]\n"
548           "test-number 0 -> test with random matrixes\n"
549           "            1 -> test with random sparse matrixes\n"
550           "            2 -> do 3. test from mpeg4 std\n"
551           "-i          test IDCT implementations\n"
552           "-4          test IDCT248 implementations\n");
553}
554
555int main(int argc, char **argv)
556{
557    int test_idct = 0, test_248_dct = 0;
558    int c,i;
559    int test=1;
560    cpu_flags = mm_support();
561
562    ff_ref_dct_init();
563    idct_mmx_init();
564
565    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
566    for(i=0;i<MAX_NEG_CROP;i++) {
567        cropTbl[i] = 0;
568        cropTbl[i + MAX_NEG_CROP + 256] = 255;
569    }
570
571    for(;;) {
572        c = getopt(argc, argv, "ih4");
573        if (c == -1)
574            break;
575        switch(c) {
576        case 'i':
577            test_idct = 1;
578            break;
579        case '4':
580            test_248_dct = 1;
581            break;
582        default :
583        case 'h':
584            help();
585            return 0;
586        }
587    }
588
589    if(optind <argc) test= atoi(argv[optind]);
590
591    printf("ffmpeg DCT/IDCT test\n");
592
593    if (test_248_dct) {
594        idct248_error("SIMPLE-C", ff_simple_idct248_put);
595    } else {
596      for (i=0;algos[i].name;i++)
597        if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) {
598          dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test);
599        }
600    }
601    return 0;
602}
603