1/* 2 * (c) 2001 Fabrice Bellard 3 * 2007 Marc Hoffman <marc.hoffman@analog.com> 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22/** 23 * @file 24 * DCT test (c) 2001 Fabrice Bellard 25 * Started from sample code by Juan J. Sierralta P. 26 */ 27 28#include "config.h" 29#include <stdlib.h> 30#include <stdio.h> 31#include <string.h> 32#if HAVE_UNISTD_H 33#include <unistd.h> 34#endif 35#include <math.h> 36 37#include "libavutil/cpu.h" 38#include "libavutil/common.h" 39#include "libavutil/lfg.h" 40#include "libavutil/time.h" 41 42#include "dct.h" 43#include "simple_idct.h" 44#include "aandcttab.h" 45#include "faandct.h" 46#include "faanidct.h" 47#include "x86/idct_xvid.h" 48#include "dctref.h" 49 50// ALTIVEC 51void ff_fdct_altivec(int16_t *block); 52 53// ARM 54void ff_j_rev_dct_arm(int16_t *data); 55void ff_simple_idct_arm(int16_t *data); 56void ff_simple_idct_armv5te(int16_t *data); 57void ff_simple_idct_armv6(int16_t *data); 58void ff_simple_idct_neon(int16_t *data); 59 60struct algo { 61 const char *name; 62 void (*func)(int16_t *block); 63 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, 64 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format; 65 int mm_support; 66 int nonspec; 67}; 68 69static int cpu_flags; 70 71static const struct algo fdct_tab[] = { 72 { "REF-DBL", ff_ref_fdct, NO_PERM }, 73 { "FAAN", ff_faandct, NO_PERM }, 74 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM }, 75 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM }, 76 77#if HAVE_MMX_INLINE 78 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX }, 79#endif 80#if HAVE_MMXEXT_INLINE 81 { "MMXEXT", ff_fdct_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT }, 82#endif 83#if HAVE_SSE2_INLINE 84 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 }, 85#endif 86 87#if HAVE_ALTIVEC 88 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC }, 89#endif 90 91 { 0 } 92}; 93 94static void ff_prores_idct_wrap(int16_t *dst){ 95 DECLARE_ALIGNED(16, static int16_t, qmat)[64]; 96 int i; 97 98 for(i=0; i<64; i++){ 99 qmat[i]=4; 100 } 101 ff_prores_idct(dst, qmat); 102 for(i=0; i<64; i++) { 103 dst[i] -= 512; 104 } 105} 106#if ARCH_X86_64 && HAVE_MMX && HAVE_YASM 107void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize, 108 int16_t *block, int16_t *qmat); 109 110static void ff_prores_idct_put_10_sse2_wrap(int16_t *dst){ 111 DECLARE_ALIGNED(16, static int16_t, qmat)[64]; 112 DECLARE_ALIGNED(16, static int16_t, tmp)[64]; 113 int i; 114 115 for(i=0; i<64; i++){ 116 qmat[i]=4; 117 tmp[i]= dst[i]; 118 } 119 ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat); 120 121 for(i=0; i<64; i++) { 122 dst[i] -= 512; 123 } 124} 125#endif 126 127static const struct algo idct_tab[] = { 128 { "FAANI", ff_faanidct, NO_PERM }, 129 { "REF-DBL", ff_ref_idct, NO_PERM }, 130 { "INT", ff_j_rev_dct, MMX_PERM }, 131 { "SIMPLE-C", ff_simple_idct_8, NO_PERM }, 132 { "PR-C", ff_prores_idct_wrap, NO_PERM, 0, 1 }, 133 134#if HAVE_MMX_INLINE 135 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX }, 136 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 }, 137#endif 138#if HAVE_MMXEXT_INLINE 139 { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 }, 140#endif 141#if HAVE_SSE2_INLINE 142 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 }, 143#if ARCH_X86_64 && HAVE_YASM 144 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 }, 145#endif 146#endif 147 148#if ARCH_ARM 149 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM }, 150 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM }, 151#endif 152#if HAVE_ARMV5TE 153 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE }, 154#endif 155#if HAVE_ARMV6 156 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 }, 157#endif 158#if HAVE_NEON && ARCH_ARM 159 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON }, 160#endif 161 162 { 0 } 163}; 164 165#define AANSCALE_BITS 12 166 167#define NB_ITS 20000 168#define NB_ITS_SPEED 50000 169 170static short idct_mmx_perm[64]; 171 172static short idct_simple_mmx_perm[64] = { 173 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 174 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 175 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 176 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 177 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 178 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 179 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 180 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, 181}; 182 183static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; 184 185static void idct_mmx_init(void) 186{ 187 int i; 188 189 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */ 190 for (i = 0; i < 64; i++) { 191 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); 192 } 193} 194 195DECLARE_ALIGNED(16, static int16_t, block)[64]; 196DECLARE_ALIGNED(8, static int16_t, block1)[64]; 197 198static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals) 199{ 200 int i, j; 201 202 memset(block, 0, 64 * sizeof(*block)); 203 204 switch (test) { 205 case 0: 206 for (i = 0; i < 64; i++) 207 block[i] = (av_lfg_get(prng) % (2*vals)) -vals; 208 if (is_idct) { 209 ff_ref_fdct(block); 210 for (i = 0; i < 64; i++) 211 block[i] >>= 3; 212 } 213 break; 214 case 1: 215 j = av_lfg_get(prng) % 10 + 1; 216 for (i = 0; i < j; i++) { 217 int idx = av_lfg_get(prng) % 64; 218 block[idx] = av_lfg_get(prng) % (2*vals) -vals; 219 } 220 break; 221 case 2: 222 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals); 223 block[63] = (block[0] & 1) ^ 1; 224 break; 225 } 226} 227 228static void permute(int16_t dst[64], const int16_t src[64], int perm) 229{ 230 int i; 231 232 if (perm == MMX_PERM) { 233 for (i = 0; i < 64; i++) 234 dst[idct_mmx_perm[i]] = src[i]; 235 } else if (perm == MMX_SIMPLE_PERM) { 236 for (i = 0; i < 64; i++) 237 dst[idct_simple_mmx_perm[i]] = src[i]; 238 } else if (perm == SSE2_PERM) { 239 for (i = 0; i < 64; i++) 240 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i]; 241 } else if (perm == PARTTRANS_PERM) { 242 for (i = 0; i < 64; i++) 243 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i]; 244 } else if (perm == TRANSPOSE_PERM) { 245 for (i = 0; i < 64; i++) 246 dst[(i>>3) | ((i<<3)&0x38)] = src[i]; 247 } else { 248 for (i = 0; i < 64; i++) 249 dst[i] = src[i]; 250 } 251} 252 253static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits) 254{ 255 void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct; 256 int it, i, scale; 257 int err_inf, v; 258 int64_t err2, ti, ti1, it1, err_sum = 0; 259 int64_t sysErr[64], sysErrMax = 0; 260 int maxout = 0; 261 int blockSumErrMax = 0, blockSumErr; 262 AVLFG prng; 263 const int vals=1<<bits; 264 double omse, ome; 265 int spec_err; 266 267 av_lfg_init(&prng, 1); 268 269 err_inf = 0; 270 err2 = 0; 271 for (i = 0; i < 64; i++) 272 sysErr[i] = 0; 273 for (it = 0; it < NB_ITS; it++) { 274 init_block(block1, test, is_idct, &prng, vals); 275 permute(block, block1, dct->format); 276 277 dct->func(block); 278 emms_c(); 279 280 if (dct->format == SCALE_PERM) { 281 for (i = 0; i < 64; i++) { 282 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i]; 283 block[i] = (block[i] * scale) >> AANSCALE_BITS; 284 } 285 } 286 287 ref(block1); 288 if (!strcmp(dct->name, "PR-SSE2")) 289 for (i = 0; i < 64; i++) 290 block1[i] = av_clip(block1[i], 4-512, 1019-512); 291 292 blockSumErr = 0; 293 for (i = 0; i < 64; i++) { 294 int err = block[i] - block1[i]; 295 err_sum += err; 296 v = abs(err); 297 if (v > err_inf) 298 err_inf = v; 299 err2 += v * v; 300 sysErr[i] += block[i] - block1[i]; 301 blockSumErr += v; 302 if (abs(block[i]) > maxout) 303 maxout = abs(block[i]); 304 } 305 if (blockSumErrMax < blockSumErr) 306 blockSumErrMax = blockSumErr; 307 } 308 for (i = 0; i < 64; i++) 309 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i])); 310 311 for (i = 0; i < 64; i++) { 312 if (i % 8 == 0) 313 printf("\n"); 314 printf("%7d ", (int) sysErr[i]); 315 } 316 printf("\n"); 317 318 omse = (double) err2 / NB_ITS / 64; 319 ome = (double) err_sum / NB_ITS / 64; 320 321 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015); 322 323 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n", 324 is_idct ? "IDCT" : "DCT", dct->name, err_inf, 325 omse, ome, (double) sysErrMax / NB_ITS, 326 maxout, blockSumErrMax); 327 328 if (spec_err && !dct->nonspec) 329 return 1; 330 331 if (!speed) 332 return 0; 333 334 /* speed test */ 335 336 init_block(block, test, is_idct, &prng, vals); 337 permute(block1, block, dct->format); 338 339 ti = av_gettime_relative(); 340 it1 = 0; 341 do { 342 for (it = 0; it < NB_ITS_SPEED; it++) { 343 memcpy(block, block1, sizeof(block)); 344 dct->func(block); 345 } 346 emms_c(); 347 it1 += NB_ITS_SPEED; 348 ti1 = av_gettime_relative() - ti; 349 } while (ti1 < 1000000); 350 351 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name, 352 (double) it1 * 1000.0 / (double) ti1); 353 354 return 0; 355} 356 357DECLARE_ALIGNED(8, static uint8_t, img_dest)[64]; 358DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64]; 359 360static void idct248_ref(uint8_t *dest, int linesize, int16_t *block) 361{ 362 static int init; 363 static double c8[8][8]; 364 static double c4[4][4]; 365 double block1[64], block2[64], block3[64]; 366 double s, sum, v; 367 int i, j, k; 368 369 if (!init) { 370 init = 1; 371 372 for (i = 0; i < 8; i++) { 373 sum = 0; 374 for (j = 0; j < 8; j++) { 375 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0); 376 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0); 377 sum += c8[i][j] * c8[i][j]; 378 } 379 } 380 381 for (i = 0; i < 4; i++) { 382 sum = 0; 383 for (j = 0; j < 4; j++) { 384 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0); 385 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0); 386 sum += c4[i][j] * c4[i][j]; 387 } 388 } 389 } 390 391 /* butterfly */ 392 s = 0.5 * sqrt(2.0); 393 for (i = 0; i < 4; i++) { 394 for (j = 0; j < 8; j++) { 395 block1[8 * (2 * i) + j] = 396 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s; 397 block1[8 * (2 * i + 1) + j] = 398 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s; 399 } 400 } 401 402 /* idct8 on lines */ 403 for (i = 0; i < 8; i++) { 404 for (j = 0; j < 8; j++) { 405 sum = 0; 406 for (k = 0; k < 8; k++) 407 sum += c8[k][j] * block1[8 * i + k]; 408 block2[8 * i + j] = sum; 409 } 410 } 411 412 /* idct4 */ 413 for (i = 0; i < 8; i++) { 414 for (j = 0; j < 4; j++) { 415 /* top */ 416 sum = 0; 417 for (k = 0; k < 4; k++) 418 sum += c4[k][j] * block2[8 * (2 * k) + i]; 419 block3[8 * (2 * j) + i] = sum; 420 421 /* bottom */ 422 sum = 0; 423 for (k = 0; k < 4; k++) 424 sum += c4[k][j] * block2[8 * (2 * k + 1) + i]; 425 block3[8 * (2 * j + 1) + i] = sum; 426 } 427 } 428 429 /* clamp and store the result */ 430 for (i = 0; i < 8; i++) { 431 for (j = 0; j < 8; j++) { 432 v = block3[8 * i + j]; 433 if (v < 0) v = 0; 434 else if (v > 255) v = 255; 435 dest[i * linesize + j] = (int) rint(v); 436 } 437 } 438} 439 440static void idct248_error(const char *name, 441 void (*idct248_put)(uint8_t *dest, int line_size, 442 int16_t *block), 443 int speed) 444{ 445 int it, i, it1, ti, ti1, err_max, v; 446 AVLFG prng; 447 448 av_lfg_init(&prng, 1); 449 450 /* just one test to see if code is correct (precision is less 451 important here) */ 452 err_max = 0; 453 for (it = 0; it < NB_ITS; it++) { 454 /* XXX: use forward transform to generate values */ 455 for (i = 0; i < 64; i++) 456 block1[i] = av_lfg_get(&prng) % 256 - 128; 457 block1[0] += 1024; 458 459 for (i = 0; i < 64; i++) 460 block[i] = block1[i]; 461 idct248_ref(img_dest1, 8, block); 462 463 for (i = 0; i < 64; i++) 464 block[i] = block1[i]; 465 idct248_put(img_dest, 8, block); 466 467 for (i = 0; i < 64; i++) { 468 v = abs((int) img_dest[i] - (int) img_dest1[i]); 469 if (v == 255) 470 printf("%d %d\n", img_dest[i], img_dest1[i]); 471 if (v > err_max) 472 err_max = v; 473 } 474#if 0 475 printf("ref=\n"); 476 for(i=0;i<8;i++) { 477 int j; 478 for(j=0;j<8;j++) { 479 printf(" %3d", img_dest1[i*8+j]); 480 } 481 printf("\n"); 482 } 483 484 printf("out=\n"); 485 for(i=0;i<8;i++) { 486 int j; 487 for(j=0;j<8;j++) { 488 printf(" %3d", img_dest[i*8+j]); 489 } 490 printf("\n"); 491 } 492#endif 493 } 494 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max); 495 496 if (!speed) 497 return; 498 499 ti = av_gettime_relative(); 500 it1 = 0; 501 do { 502 for (it = 0; it < NB_ITS_SPEED; it++) { 503 for (i = 0; i < 64; i++) 504 block[i] = block1[i]; 505 idct248_put(img_dest, 8, block); 506 } 507 emms_c(); 508 it1 += NB_ITS_SPEED; 509 ti1 = av_gettime_relative() - ti; 510 } while (ti1 < 1000000); 511 512 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name, 513 (double) it1 * 1000.0 / (double) ti1); 514} 515 516static void help(void) 517{ 518 printf("dct-test [-i] [<test-number>] [<bits>]\n" 519 "test-number 0 -> test with random matrixes\n" 520 " 1 -> test with random sparse matrixes\n" 521 " 2 -> do 3. test from mpeg4 std\n" 522 "bits Number of time domain bits to use, 8 is default\n" 523 "-i test IDCT implementations\n" 524 "-4 test IDCT248 implementations\n" 525 "-t speed test\n"); 526} 527 528#if !HAVE_GETOPT 529#include "compat/getopt.c" 530#endif 531 532int main(int argc, char **argv) 533{ 534 int test_idct = 0, test_248_dct = 0; 535 int c, i; 536 int test = 1; 537 int speed = 0; 538 int err = 0; 539 int bits=8; 540 541 cpu_flags = av_get_cpu_flags(); 542 543 ff_ref_dct_init(); 544 idct_mmx_init(); 545 546 for (;;) { 547 c = getopt(argc, argv, "ih4t"); 548 if (c == -1) 549 break; 550 switch (c) { 551 case 'i': 552 test_idct = 1; 553 break; 554 case '4': 555 test_248_dct = 1; 556 break; 557 case 't': 558 speed = 1; 559 break; 560 default: 561 case 'h': 562 help(); 563 return 0; 564 } 565 } 566 567 if (optind < argc) 568 test = atoi(argv[optind]); 569 if(optind+1 < argc) bits= atoi(argv[optind+1]); 570 571 printf("ffmpeg DCT/IDCT test\n"); 572 573 if (test_248_dct) { 574 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed); 575 } else { 576 const struct algo *algos = test_idct ? idct_tab : fdct_tab; 577 for (i = 0; algos[i].name; i++) 578 if (!(~cpu_flags & algos[i].mm_support)) { 579 err |= dct_error(&algos[i], test, test_idct, speed, bits); 580 } 581 } 582 583 if (err) 584 printf("Error: %d.\n", err); 585 586 return !!err; 587} 588