1/*
2 * (c) 2001 Fabrice Bellard
3 *     2007 Marc Hoffman <marc.hoffman@analog.com>
4 *
5 * This file is part of Libav.
6 *
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22/**
23 * @file
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
26 */
27
28#include <stdlib.h>
29#include <stdio.h>
30#include <string.h>
31#include <sys/time.h>
32#include <unistd.h>
33#include <math.h>
34
35#include "libavutil/cpu.h"
36#include "libavutil/common.h"
37#include "libavutil/lfg.h"
38
39#include "simple_idct.h"
40#include "aandcttab.h"
41#include "faandct.h"
42#include "faanidct.h"
43#include "x86/idct_xvid.h"
44#include "dctref.h"
45
46#undef printf
47
48void ff_mmx_idct(DCTELEM *data);
49void ff_mmxext_idct(DCTELEM *data);
50
51void odivx_idct_c(short *block);
52
53// BFIN
54void ff_bfin_idct(DCTELEM *block);
55void ff_bfin_fdct(DCTELEM *block);
56
57// ALTIVEC
58void fdct_altivec(DCTELEM *block);
59//void idct_altivec(DCTELEM *block);?? no routine
60
61// ARM
62void ff_j_rev_dct_arm(DCTELEM *data);
63void ff_simple_idct_arm(DCTELEM *data);
64void ff_simple_idct_armv5te(DCTELEM *data);
65void ff_simple_idct_armv6(DCTELEM *data);
66void ff_simple_idct_neon(DCTELEM *data);
67
68void ff_simple_idct_axp(DCTELEM *data);
69
70struct algo {
71    const char *name;
72    void (*func)(DCTELEM *block);
73    enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
74                     SSE2_PERM, PARTTRANS_PERM } format;
75    int mm_support;
76    int nonspec;
77};
78
79#ifndef FAAN_POSTSCALE
80#define FAAN_SCALE SCALE_PERM
81#else
82#define FAAN_SCALE NO_PERM
83#endif
84
85static int cpu_flags;
86
87static const struct algo fdct_tab[] = {
88    { "REF-DBL",        ff_ref_fdct,           NO_PERM    },
89    { "FAAN",           ff_faandct,            FAAN_SCALE },
90    { "IJG-AAN-INT",    fdct_ifast,            SCALE_PERM },
91    { "IJG-LLM-INT",    ff_jpeg_fdct_islow_8,  NO_PERM    },
92
93#if HAVE_MMX
94    { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
95    { "MMX2",           ff_fdct_mmx2,          NO_PERM,   AV_CPU_FLAG_MMX2    },
96    { "SSE2",           ff_fdct_sse2,          NO_PERM,   AV_CPU_FLAG_SSE2    },
97#endif
98
99#if HAVE_ALTIVEC
100    { "altivecfdct",    fdct_altivec,          NO_PERM,   AV_CPU_FLAG_ALTIVEC },
101#endif
102
103#if ARCH_BFIN
104    { "BFINfdct",       ff_bfin_fdct,          NO_PERM  },
105#endif
106
107    { 0 }
108};
109
110static const struct algo idct_tab[] = {
111    { "FAANI",          ff_faanidct,           NO_PERM  },
112    { "REF-DBL",        ff_ref_idct,           NO_PERM  },
113    { "INT",            j_rev_dct,             MMX_PERM },
114    { "SIMPLE-C",       ff_simple_idct_8,      NO_PERM  },
115
116#if HAVE_MMX
117#if CONFIG_GPL
118    { "LIBMPEG2-MMX",   ff_mmx_idct,           MMX_PERM,  AV_CPU_FLAG_MMX,  1 },
119    { "LIBMPEG2-MMX2",  ff_mmxext_idct,        MMX_PERM,  AV_CPU_FLAG_MMX2, 1 },
120#endif
121    { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
122    { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
123    { "XVID-MMX2",      ff_idct_xvid_mmx2,     NO_PERM,   AV_CPU_FLAG_MMX2, 1 },
124    { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
125#endif
126
127#if ARCH_BFIN
128    { "BFINidct",       ff_bfin_idct,          NO_PERM  },
129#endif
130
131#if ARCH_ARM
132    { "SIMPLE-ARM",     ff_simple_idct_arm,    NO_PERM  },
133    { "INT-ARM",        ff_j_rev_dct_arm,      MMX_PERM },
134#endif
135#if HAVE_ARMV5TE
136    { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM  },
137#endif
138#if HAVE_ARMV6
139    { "SIMPLE-ARMV6",   ff_simple_idct_armv6,  MMX_PERM },
140#endif
141#if HAVE_NEON
142    { "SIMPLE-NEON",    ff_simple_idct_neon,   PARTTRANS_PERM },
143#endif
144
145#if ARCH_ALPHA
146    { "SIMPLE-ALPHA",   ff_simple_idct_axp,    NO_PERM },
147#endif
148
149    { 0 }
150};
151
152#define AANSCALE_BITS 12
153
154static int64_t gettime(void)
155{
156    struct timeval tv;
157    gettimeofday(&tv, NULL);
158    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
159}
160
161#define NB_ITS 20000
162#define NB_ITS_SPEED 50000
163
164static short idct_mmx_perm[64];
165
166static short idct_simple_mmx_perm[64] = {
167    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
168    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
169    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
170    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
171    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
172    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
173    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
174    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
175};
176
177static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
178
179static void idct_mmx_init(void)
180{
181    int i;
182
183    /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
184    for (i = 0; i < 64; i++) {
185        idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
186    }
187}
188
189DECLARE_ALIGNED(16, static DCTELEM, block)[64];
190DECLARE_ALIGNED(8,  static DCTELEM, block1)[64];
191
192static inline void mmx_emms(void)
193{
194#if HAVE_MMX
195    if (cpu_flags & AV_CPU_FLAG_MMX)
196        __asm__ volatile ("emms\n\t");
197#endif
198}
199
200static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng)
201{
202    int i, j;
203
204    memset(block, 0, 64 * sizeof(*block));
205
206    switch (test) {
207    case 0:
208        for (i = 0; i < 64; i++)
209            block[i] = (av_lfg_get(prng) % 512) - 256;
210        if (is_idct) {
211            ff_ref_fdct(block);
212            for (i = 0; i < 64; i++)
213                block[i] >>= 3;
214        }
215        break;
216    case 1:
217        j = av_lfg_get(prng) % 10 + 1;
218        for (i = 0; i < j; i++)
219            block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
220        break;
221    case 2:
222        block[ 0] = av_lfg_get(prng) % 4096 - 2048;
223        block[63] = (block[0] & 1) ^ 1;
224        break;
225    }
226}
227
228static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
229{
230    int i;
231
232    if (perm == MMX_PERM) {
233        for (i = 0; i < 64; i++)
234            dst[idct_mmx_perm[i]] = src[i];
235    } else if (perm == MMX_SIMPLE_PERM) {
236        for (i = 0; i < 64; i++)
237            dst[idct_simple_mmx_perm[i]] = src[i];
238    } else if (perm == SSE2_PERM) {
239        for (i = 0; i < 64; i++)
240            dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
241    } else if (perm == PARTTRANS_PERM) {
242        for (i = 0; i < 64; i++)
243            dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
244    } else {
245        for (i = 0; i < 64; i++)
246            dst[i] = src[i];
247    }
248}
249
250static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
251{
252    void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
253    int it, i, scale;
254    int err_inf, v;
255    int64_t err2, ti, ti1, it1, err_sum = 0;
256    int64_t sysErr[64], sysErrMax = 0;
257    int maxout = 0;
258    int blockSumErrMax = 0, blockSumErr;
259    AVLFG prng;
260    double omse, ome;
261    int spec_err;
262
263    av_lfg_init(&prng, 1);
264
265    err_inf = 0;
266    err2 = 0;
267    for (i = 0; i < 64; i++)
268        sysErr[i] = 0;
269    for (it = 0; it < NB_ITS; it++) {
270        init_block(block1, test, is_idct, &prng);
271        permute(block, block1, dct->format);
272
273        dct->func(block);
274        mmx_emms();
275
276        if (dct->format == SCALE_PERM) {
277            for (i = 0; i < 64; i++) {
278                scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
279                block[i] = (block[i] * scale) >> AANSCALE_BITS;
280            }
281        }
282
283        ref(block1);
284
285        blockSumErr = 0;
286        for (i = 0; i < 64; i++) {
287            int err = block[i] - block1[i];
288            err_sum += err;
289            v = abs(err);
290            if (v > err_inf)
291                err_inf = v;
292            err2 += v * v;
293            sysErr[i] += block[i] - block1[i];
294            blockSumErr += v;
295            if (abs(block[i]) > maxout)
296                maxout = abs(block[i]);
297        }
298        if (blockSumErrMax < blockSumErr)
299            blockSumErrMax = blockSumErr;
300    }
301    for (i = 0; i < 64; i++)
302        sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
303
304    for (i = 0; i < 64; i++) {
305        if (i % 8 == 0)
306            printf("\n");
307        printf("%7d ", (int) sysErr[i]);
308    }
309    printf("\n");
310
311    omse = (double) err2 / NB_ITS / 64;
312    ome  = (double) err_sum / NB_ITS / 64;
313
314    spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
315
316    printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
317           is_idct ? "IDCT" : "DCT", dct->name, err_inf,
318           omse, ome, (double) sysErrMax / NB_ITS,
319           maxout, blockSumErrMax);
320
321    if (spec_err && !dct->nonspec)
322        return 1;
323
324    if (!speed)
325        return 0;
326
327    /* speed test */
328    init_block(block, test, is_idct, &prng);
329    permute(block1, block, dct->format);
330
331    ti = gettime();
332    it1 = 0;
333    do {
334        for (it = 0; it < NB_ITS_SPEED; it++) {
335            memcpy(block, block1, sizeof(block));
336            dct->func(block);
337        }
338        it1 += NB_ITS_SPEED;
339        ti1 = gettime() - ti;
340    } while (ti1 < 1000000);
341    mmx_emms();
342
343    printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
344           (double) it1 * 1000.0 / (double) ti1);
345
346    return 0;
347}
348
349DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
350DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
351
352static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
353{
354    static int init;
355    static double c8[8][8];
356    static double c4[4][4];
357    double block1[64], block2[64], block3[64];
358    double s, sum, v;
359    int i, j, k;
360
361    if (!init) {
362        init = 1;
363
364        for (i = 0; i < 8; i++) {
365            sum = 0;
366            for (j = 0; j < 8; j++) {
367                s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
368                c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
369                sum += c8[i][j] * c8[i][j];
370            }
371        }
372
373        for (i = 0; i < 4; i++) {
374            sum = 0;
375            for (j = 0; j < 4; j++) {
376                s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
377                c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
378                sum += c4[i][j] * c4[i][j];
379            }
380        }
381    }
382
383    /* butterfly */
384    s = 0.5 * sqrt(2.0);
385    for (i = 0; i < 4; i++) {
386        for (j = 0; j < 8; j++) {
387            block1[8 * (2 * i) + j] =
388                (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
389            block1[8 * (2 * i + 1) + j] =
390                (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
391        }
392    }
393
394    /* idct8 on lines */
395    for (i = 0; i < 8; i++) {
396        for (j = 0; j < 8; j++) {
397            sum = 0;
398            for (k = 0; k < 8; k++)
399                sum += c8[k][j] * block1[8 * i + k];
400            block2[8 * i + j] = sum;
401        }
402    }
403
404    /* idct4 */
405    for (i = 0; i < 8; i++) {
406        for (j = 0; j < 4; j++) {
407            /* top */
408            sum = 0;
409            for (k = 0; k < 4; k++)
410                sum += c4[k][j] * block2[8 * (2 * k) + i];
411            block3[8 * (2 * j) + i] = sum;
412
413            /* bottom */
414            sum = 0;
415            for (k = 0; k < 4; k++)
416                sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
417            block3[8 * (2 * j + 1) + i] = sum;
418        }
419    }
420
421    /* clamp and store the result */
422    for (i = 0; i < 8; i++) {
423        for (j = 0; j < 8; j++) {
424            v = block3[8 * i + j];
425            if      (v < 0)   v = 0;
426            else if (v > 255) v = 255;
427            dest[i * linesize + j] = (int) rint(v);
428        }
429    }
430}
431
432static void idct248_error(const char *name,
433                          void (*idct248_put)(uint8_t *dest, int line_size,
434                                              int16_t *block),
435                          int speed)
436{
437    int it, i, it1, ti, ti1, err_max, v;
438    AVLFG prng;
439
440    av_lfg_init(&prng, 1);
441
442    /* just one test to see if code is correct (precision is less
443       important here) */
444    err_max = 0;
445    for (it = 0; it < NB_ITS; it++) {
446        /* XXX: use forward transform to generate values */
447        for (i = 0; i < 64; i++)
448            block1[i] = av_lfg_get(&prng) % 256 - 128;
449        block1[0] += 1024;
450
451        for (i = 0; i < 64; i++)
452            block[i] = block1[i];
453        idct248_ref(img_dest1, 8, block);
454
455        for (i = 0; i < 64; i++)
456            block[i] = block1[i];
457        idct248_put(img_dest, 8, block);
458
459        for (i = 0; i < 64; i++) {
460            v = abs((int) img_dest[i] - (int) img_dest1[i]);
461            if (v == 255)
462                printf("%d %d\n", img_dest[i], img_dest1[i]);
463            if (v > err_max)
464                err_max = v;
465        }
466    }
467    printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
468
469    if (!speed)
470        return;
471
472    ti = gettime();
473    it1 = 0;
474    do {
475        for (it = 0; it < NB_ITS_SPEED; it++) {
476            for (i = 0; i < 64; i++)
477                block[i] = block1[i];
478            idct248_put(img_dest, 8, block);
479        }
480        it1 += NB_ITS_SPEED;
481        ti1 = gettime() - ti;
482    } while (ti1 < 1000000);
483    mmx_emms();
484
485    printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
486           (double) it1 * 1000.0 / (double) ti1);
487}
488
489static void help(void)
490{
491    printf("dct-test [-i] [<test-number>]\n"
492           "test-number 0 -> test with random matrixes\n"
493           "            1 -> test with random sparse matrixes\n"
494           "            2 -> do 3. test from mpeg4 std\n"
495           "-i          test IDCT implementations\n"
496           "-4          test IDCT248 implementations\n"
497           "-t          speed test\n");
498}
499
500int main(int argc, char **argv)
501{
502    int test_idct = 0, test_248_dct = 0;
503    int c, i;
504    int test = 1;
505    int speed = 0;
506    int err = 0;
507
508    cpu_flags = av_get_cpu_flags();
509
510    ff_ref_dct_init();
511    idct_mmx_init();
512
513    for (;;) {
514        c = getopt(argc, argv, "ih4t");
515        if (c == -1)
516            break;
517        switch (c) {
518        case 'i':
519            test_idct = 1;
520            break;
521        case '4':
522            test_248_dct = 1;
523            break;
524        case 't':
525            speed = 1;
526            break;
527        default:
528        case 'h':
529            help();
530            return 0;
531        }
532    }
533
534    if (optind < argc)
535        test = atoi(argv[optind]);
536
537    printf("Libav DCT/IDCT test\n");
538
539    if (test_248_dct) {
540        idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
541    } else {
542        const struct algo *algos = test_idct ? idct_tab : fdct_tab;
543        for (i = 0; algos[i].name; i++)
544            if (!(~cpu_flags & algos[i].mm_support)) {
545                err |= dct_error(&algos[i], test, test_idct, speed);
546            }
547    }
548
549    return err;
550}
551