1/*
2 * (c) 2001 Fabrice Bellard
3 *     2007 Marc Hoffman <marc.hoffman@analog.com>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22/**
23 * @file
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
26 */
27
28#include "config.h"
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
32#if HAVE_UNISTD_H
33#include <unistd.h>
34#endif
35#include <math.h>
36
37#include "libavutil/cpu.h"
38#include "libavutil/common.h"
39#include "libavutil/lfg.h"
40#include "libavutil/time.h"
41
42#include "dct.h"
43#include "simple_idct.h"
44#include "aandcttab.h"
45#include "faandct.h"
46#include "faanidct.h"
47#include "x86/idct_xvid.h"
48#include "dctref.h"
49
50// ALTIVEC
51void ff_fdct_altivec(int16_t *block);
52
53// ARM
54void ff_j_rev_dct_arm(int16_t *data);
55void ff_simple_idct_arm(int16_t *data);
56void ff_simple_idct_armv5te(int16_t *data);
57void ff_simple_idct_armv6(int16_t *data);
58void ff_simple_idct_neon(int16_t *data);
59
60struct algo {
61    const char *name;
62    void (*func)(int16_t *block);
63    enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
64                     SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
65    int mm_support;
66    int nonspec;
67};
68
69static int cpu_flags;
70
71static const struct algo fdct_tab[] = {
72    { "REF-DBL",        ff_ref_fdct,           NO_PERM    },
73    { "FAAN",           ff_faandct,            NO_PERM    },
74    { "IJG-AAN-INT",    ff_fdct_ifast,         SCALE_PERM },
75    { "IJG-LLM-INT",    ff_jpeg_fdct_islow_8,  NO_PERM    },
76
77#if HAVE_MMX_INLINE
78    { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
79#endif
80#if HAVE_MMXEXT_INLINE
81    { "MMXEXT",         ff_fdct_mmxext,        NO_PERM,   AV_CPU_FLAG_MMXEXT  },
82#endif
83#if HAVE_SSE2_INLINE
84    { "SSE2",           ff_fdct_sse2,          NO_PERM,   AV_CPU_FLAG_SSE2    },
85#endif
86
87#if HAVE_ALTIVEC
88    { "altivecfdct",    ff_fdct_altivec,       NO_PERM,   AV_CPU_FLAG_ALTIVEC },
89#endif
90
91    { 0 }
92};
93
94static void ff_prores_idct_wrap(int16_t *dst){
95    DECLARE_ALIGNED(16, static int16_t, qmat)[64];
96    int i;
97
98    for(i=0; i<64; i++){
99        qmat[i]=4;
100    }
101    ff_prores_idct(dst, qmat);
102    for(i=0; i<64; i++) {
103         dst[i] -= 512;
104    }
105}
106#if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
107void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
108                                int16_t *block, int16_t *qmat);
109
110static void ff_prores_idct_put_10_sse2_wrap(int16_t *dst){
111    DECLARE_ALIGNED(16, static int16_t, qmat)[64];
112    DECLARE_ALIGNED(16, static int16_t, tmp)[64];
113    int i;
114
115    for(i=0; i<64; i++){
116        qmat[i]=4;
117        tmp[i]= dst[i];
118    }
119    ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
120
121    for(i=0; i<64; i++) {
122         dst[i] -= 512;
123    }
124}
125#endif
126
127static const struct algo idct_tab[] = {
128    { "FAANI",          ff_faanidct,           NO_PERM  },
129    { "REF-DBL",        ff_ref_idct,           NO_PERM  },
130    { "INT",            ff_j_rev_dct,          MMX_PERM },
131    { "SIMPLE-C",       ff_simple_idct_8,      NO_PERM  },
132    { "PR-C",           ff_prores_idct_wrap,   NO_PERM, 0, 1 },
133
134#if HAVE_MMX_INLINE
135    { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
136    { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
137#endif
138#if HAVE_MMXEXT_INLINE
139    { "XVID-MMXEXT",    ff_idct_xvid_mmxext,   NO_PERM,   AV_CPU_FLAG_MMXEXT, 1 },
140#endif
141#if HAVE_SSE2_INLINE
142    { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
143#if ARCH_X86_64 && HAVE_YASM
144    { "PR-SSE2",        ff_prores_idct_put_10_sse2_wrap,     TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
145#endif
146#endif
147
148#if ARCH_ARM
149    { "SIMPLE-ARM",     ff_simple_idct_arm,    NO_PERM  },
150    { "INT-ARM",        ff_j_rev_dct_arm,      MMX_PERM },
151#endif
152#if HAVE_ARMV5TE
153    { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM,   AV_CPU_FLAG_ARMV5TE },
154#endif
155#if HAVE_ARMV6
156    { "SIMPLE-ARMV6",   ff_simple_idct_armv6,  MMX_PERM,  AV_CPU_FLAG_ARMV6   },
157#endif
158#if HAVE_NEON && ARCH_ARM
159    { "SIMPLE-NEON",    ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
160#endif
161
162    { 0 }
163};
164
165#define AANSCALE_BITS 12
166
167#define NB_ITS 20000
168#define NB_ITS_SPEED 50000
169
170static short idct_mmx_perm[64];
171
172static short idct_simple_mmx_perm[64] = {
173    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
174    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
175    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
176    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
177    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
178    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
179    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
180    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
181};
182
183static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
184
185static void idct_mmx_init(void)
186{
187    int i;
188
189    /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
190    for (i = 0; i < 64; i++) {
191        idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
192    }
193}
194
195DECLARE_ALIGNED(16, static int16_t, block)[64];
196DECLARE_ALIGNED(8,  static int16_t, block1)[64];
197
198static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
199{
200    int i, j;
201
202    memset(block, 0, 64 * sizeof(*block));
203
204    switch (test) {
205    case 0:
206        for (i = 0; i < 64; i++)
207            block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
208        if (is_idct) {
209            ff_ref_fdct(block);
210            for (i = 0; i < 64; i++)
211                block[i] >>= 3;
212        }
213        break;
214    case 1:
215        j = av_lfg_get(prng) % 10 + 1;
216        for (i = 0; i < j; i++) {
217            int idx = av_lfg_get(prng) % 64;
218            block[idx] = av_lfg_get(prng) % (2*vals) -vals;
219        }
220        break;
221    case 2:
222        block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
223        block[63] = (block[0] & 1) ^ 1;
224        break;
225    }
226}
227
228static void permute(int16_t dst[64], const int16_t src[64], int perm)
229{
230    int i;
231
232    if (perm == MMX_PERM) {
233        for (i = 0; i < 64; i++)
234            dst[idct_mmx_perm[i]] = src[i];
235    } else if (perm == MMX_SIMPLE_PERM) {
236        for (i = 0; i < 64; i++)
237            dst[idct_simple_mmx_perm[i]] = src[i];
238    } else if (perm == SSE2_PERM) {
239        for (i = 0; i < 64; i++)
240            dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
241    } else if (perm == PARTTRANS_PERM) {
242        for (i = 0; i < 64; i++)
243            dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
244    } else if (perm == TRANSPOSE_PERM) {
245        for (i = 0; i < 64; i++)
246            dst[(i>>3) | ((i<<3)&0x38)] = src[i];
247    } else {
248        for (i = 0; i < 64; i++)
249            dst[i] = src[i];
250    }
251}
252
253static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
254{
255    void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
256    int it, i, scale;
257    int err_inf, v;
258    int64_t err2, ti, ti1, it1, err_sum = 0;
259    int64_t sysErr[64], sysErrMax = 0;
260    int maxout = 0;
261    int blockSumErrMax = 0, blockSumErr;
262    AVLFG prng;
263    const int vals=1<<bits;
264    double omse, ome;
265    int spec_err;
266
267    av_lfg_init(&prng, 1);
268
269    err_inf = 0;
270    err2 = 0;
271    for (i = 0; i < 64; i++)
272        sysErr[i] = 0;
273    for (it = 0; it < NB_ITS; it++) {
274        init_block(block1, test, is_idct, &prng, vals);
275        permute(block, block1, dct->format);
276
277        dct->func(block);
278        emms_c();
279
280        if (dct->format == SCALE_PERM) {
281            for (i = 0; i < 64; i++) {
282                scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
283                block[i] = (block[i] * scale) >> AANSCALE_BITS;
284            }
285        }
286
287        ref(block1);
288        if (!strcmp(dct->name, "PR-SSE2"))
289            for (i = 0; i < 64; i++)
290                block1[i] = av_clip(block1[i], 4-512, 1019-512);
291
292        blockSumErr = 0;
293        for (i = 0; i < 64; i++) {
294            int err = block[i] - block1[i];
295            err_sum += err;
296            v = abs(err);
297            if (v > err_inf)
298                err_inf = v;
299            err2 += v * v;
300            sysErr[i] += block[i] - block1[i];
301            blockSumErr += v;
302            if (abs(block[i]) > maxout)
303                maxout = abs(block[i]);
304        }
305        if (blockSumErrMax < blockSumErr)
306            blockSumErrMax = blockSumErr;
307    }
308    for (i = 0; i < 64; i++)
309        sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
310
311    for (i = 0; i < 64; i++) {
312        if (i % 8 == 0)
313            printf("\n");
314        printf("%7d ", (int) sysErr[i]);
315    }
316    printf("\n");
317
318    omse = (double) err2 / NB_ITS / 64;
319    ome  = (double) err_sum / NB_ITS / 64;
320
321    spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
322
323    printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
324           is_idct ? "IDCT" : "DCT", dct->name, err_inf,
325           omse, ome, (double) sysErrMax / NB_ITS,
326           maxout, blockSumErrMax);
327
328    if (spec_err && !dct->nonspec)
329        return 1;
330
331    if (!speed)
332        return 0;
333
334    /* speed test */
335
336    init_block(block, test, is_idct, &prng, vals);
337    permute(block1, block, dct->format);
338
339    ti = av_gettime_relative();
340    it1 = 0;
341    do {
342        for (it = 0; it < NB_ITS_SPEED; it++) {
343            memcpy(block, block1, sizeof(block));
344            dct->func(block);
345        }
346        emms_c();
347        it1 += NB_ITS_SPEED;
348        ti1 = av_gettime_relative() - ti;
349    } while (ti1 < 1000000);
350
351    printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
352           (double) it1 * 1000.0 / (double) ti1);
353
354    return 0;
355}
356
357DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
358DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
359
360static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
361{
362    static int init;
363    static double c8[8][8];
364    static double c4[4][4];
365    double block1[64], block2[64], block3[64];
366    double s, sum, v;
367    int i, j, k;
368
369    if (!init) {
370        init = 1;
371
372        for (i = 0; i < 8; i++) {
373            sum = 0;
374            for (j = 0; j < 8; j++) {
375                s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
376                c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
377                sum += c8[i][j] * c8[i][j];
378            }
379        }
380
381        for (i = 0; i < 4; i++) {
382            sum = 0;
383            for (j = 0; j < 4; j++) {
384                s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
385                c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
386                sum += c4[i][j] * c4[i][j];
387            }
388        }
389    }
390
391    /* butterfly */
392    s = 0.5 * sqrt(2.0);
393    for (i = 0; i < 4; i++) {
394        for (j = 0; j < 8; j++) {
395            block1[8 * (2 * i) + j] =
396                (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
397            block1[8 * (2 * i + 1) + j] =
398                (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
399        }
400    }
401
402    /* idct8 on lines */
403    for (i = 0; i < 8; i++) {
404        for (j = 0; j < 8; j++) {
405            sum = 0;
406            for (k = 0; k < 8; k++)
407                sum += c8[k][j] * block1[8 * i + k];
408            block2[8 * i + j] = sum;
409        }
410    }
411
412    /* idct4 */
413    for (i = 0; i < 8; i++) {
414        for (j = 0; j < 4; j++) {
415            /* top */
416            sum = 0;
417            for (k = 0; k < 4; k++)
418                sum += c4[k][j] * block2[8 * (2 * k) + i];
419            block3[8 * (2 * j) + i] = sum;
420
421            /* bottom */
422            sum = 0;
423            for (k = 0; k < 4; k++)
424                sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
425            block3[8 * (2 * j + 1) + i] = sum;
426        }
427    }
428
429    /* clamp and store the result */
430    for (i = 0; i < 8; i++) {
431        for (j = 0; j < 8; j++) {
432            v = block3[8 * i + j];
433            if      (v < 0)   v = 0;
434            else if (v > 255) v = 255;
435            dest[i * linesize + j] = (int) rint(v);
436        }
437    }
438}
439
440static void idct248_error(const char *name,
441                          void (*idct248_put)(uint8_t *dest, int line_size,
442                                              int16_t *block),
443                          int speed)
444{
445    int it, i, it1, ti, ti1, err_max, v;
446    AVLFG prng;
447
448    av_lfg_init(&prng, 1);
449
450    /* just one test to see if code is correct (precision is less
451       important here) */
452    err_max = 0;
453    for (it = 0; it < NB_ITS; it++) {
454        /* XXX: use forward transform to generate values */
455        for (i = 0; i < 64; i++)
456            block1[i] = av_lfg_get(&prng) % 256 - 128;
457        block1[0] += 1024;
458
459        for (i = 0; i < 64; i++)
460            block[i] = block1[i];
461        idct248_ref(img_dest1, 8, block);
462
463        for (i = 0; i < 64; i++)
464            block[i] = block1[i];
465        idct248_put(img_dest, 8, block);
466
467        for (i = 0; i < 64; i++) {
468            v = abs((int) img_dest[i] - (int) img_dest1[i]);
469            if (v == 255)
470                printf("%d %d\n", img_dest[i], img_dest1[i]);
471            if (v > err_max)
472                err_max = v;
473        }
474#if 0
475        printf("ref=\n");
476        for(i=0;i<8;i++) {
477            int j;
478            for(j=0;j<8;j++) {
479                printf(" %3d", img_dest1[i*8+j]);
480            }
481            printf("\n");
482        }
483
484        printf("out=\n");
485        for(i=0;i<8;i++) {
486            int j;
487            for(j=0;j<8;j++) {
488                printf(" %3d", img_dest[i*8+j]);
489            }
490            printf("\n");
491        }
492#endif
493    }
494    printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
495
496    if (!speed)
497        return;
498
499    ti = av_gettime_relative();
500    it1 = 0;
501    do {
502        for (it = 0; it < NB_ITS_SPEED; it++) {
503            for (i = 0; i < 64; i++)
504                block[i] = block1[i];
505            idct248_put(img_dest, 8, block);
506        }
507        emms_c();
508        it1 += NB_ITS_SPEED;
509        ti1 = av_gettime_relative() - ti;
510    } while (ti1 < 1000000);
511
512    printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
513           (double) it1 * 1000.0 / (double) ti1);
514}
515
516static void help(void)
517{
518    printf("dct-test [-i] [<test-number>] [<bits>]\n"
519           "test-number 0 -> test with random matrixes\n"
520           "            1 -> test with random sparse matrixes\n"
521           "            2 -> do 3. test from mpeg4 std\n"
522           "bits        Number of time domain bits to use, 8 is default\n"
523           "-i          test IDCT implementations\n"
524           "-4          test IDCT248 implementations\n"
525           "-t          speed test\n");
526}
527
528#if !HAVE_GETOPT
529#include "compat/getopt.c"
530#endif
531
532int main(int argc, char **argv)
533{
534    int test_idct = 0, test_248_dct = 0;
535    int c, i;
536    int test = 1;
537    int speed = 0;
538    int err = 0;
539    int bits=8;
540
541    cpu_flags = av_get_cpu_flags();
542
543    ff_ref_dct_init();
544    idct_mmx_init();
545
546    for (;;) {
547        c = getopt(argc, argv, "ih4t");
548        if (c == -1)
549            break;
550        switch (c) {
551        case 'i':
552            test_idct = 1;
553            break;
554        case '4':
555            test_248_dct = 1;
556            break;
557        case 't':
558            speed = 1;
559            break;
560        default:
561        case 'h':
562            help();
563            return 0;
564        }
565    }
566
567    if (optind < argc)
568        test = atoi(argv[optind]);
569    if(optind+1 < argc) bits= atoi(argv[optind+1]);
570
571    printf("ffmpeg DCT/IDCT test\n");
572
573    if (test_248_dct) {
574        idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
575    } else {
576        const struct algo *algos = test_idct ? idct_tab : fdct_tab;
577        for (i = 0; algos[i].name; i++)
578            if (!(~cpu_flags & algos[i].mm_support)) {
579                err |= dct_error(&algos[i], test, test_idct, speed, bits);
580            }
581    }
582
583    if (err)
584        printf("Error: %d.\n", err);
585
586    return !!err;
587}
588