1/* 2 * (c) 2001 Fabrice Bellard 3 * 2007 Marc Hoffman <marc.hoffman@analog.com> 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22/** 23 * @file 24 * DCT test (c) 2001 Fabrice Bellard 25 * Started from sample code by Juan J. Sierralta P. 26 */ 27 28#include <stdlib.h> 29#include <stdio.h> 30#include <string.h> 31#include <sys/time.h> 32#include <unistd.h> 33#include <math.h> 34 35#include "libavutil/common.h" 36#include "libavutil/lfg.h" 37 38#include "simple_idct.h" 39#include "aandcttab.h" 40#include "faandct.h" 41#include "faanidct.h" 42#include "x86/idct_xvid.h" 43#include "dctref.h" 44 45#undef printf 46 47void ff_mmx_idct(DCTELEM *data); 48void ff_mmxext_idct(DCTELEM *data); 49 50void odivx_idct_c(short *block); 51 52// BFIN 53void ff_bfin_idct(DCTELEM *block); 54void ff_bfin_fdct(DCTELEM *block); 55 56// ALTIVEC 57void fdct_altivec(DCTELEM *block); 58//void idct_altivec(DCTELEM *block);?? no routine 59 60// ARM 61void ff_j_rev_dct_arm(DCTELEM *data); 62void ff_simple_idct_arm(DCTELEM *data); 63void ff_simple_idct_armv5te(DCTELEM *data); 64void ff_simple_idct_armv6(DCTELEM *data); 65void ff_simple_idct_neon(DCTELEM *data); 66 67void ff_simple_idct_axp(DCTELEM *data); 68 69struct algo { 70 const char *name; 71 enum { FDCT, IDCT } is_idct; 72 void (* func) (DCTELEM *block); 73 void (* ref) (DCTELEM *block); 74 enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format; 75 int mm_support; 76}; 77 78#ifndef FAAN_POSTSCALE 79#define FAAN_SCALE SCALE_PERM 80#else 81#define FAAN_SCALE NO_PERM 82#endif 83 84static int cpu_flags; 85 86struct algo algos[] = { 87 {"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM}, 88 {"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE}, 89 {"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM}, 90 {"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM}, 91 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM}, 92 {"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM}, 93 {"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM}, 94 {"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM}, 95 96#if HAVE_MMX 97 {"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, FF_MM_MMX}, 98#if HAVE_MMX2 99 {"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, FF_MM_MMX2}, 100 {"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, FF_MM_SSE2}, 101#endif 102 103#if CONFIG_GPL 104 {"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, FF_MM_MMX}, 105 {"LIBMPEG2-MMX2", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, FF_MM_MMX2}, 106#endif 107 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, FF_MM_MMX}, 108 {"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, FF_MM_MMX}, 109 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, FF_MM_MMX2}, 110 {"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, FF_MM_SSE2}, 111#endif 112 113#if HAVE_ALTIVEC 114 {"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, FF_MM_ALTIVEC}, 115#endif 116 117#if ARCH_BFIN 118 {"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM}, 119 {"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM}, 120#endif 121 122#if ARCH_ARM 123 {"SIMPLE-ARM", 1, ff_simple_idct_arm, ff_ref_idct, NO_PERM }, 124 {"INT-ARM", 1, ff_j_rev_dct_arm, ff_ref_idct, MMX_PERM }, 125#if HAVE_ARMV5TE 126 {"SIMPLE-ARMV5TE", 1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM }, 127#endif 128#if HAVE_ARMV6 129 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM }, 130#endif 131#if HAVE_NEON 132 {"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM }, 133#endif 134#endif /* ARCH_ARM */ 135 136#if ARCH_ALPHA 137 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM }, 138#endif 139 140 { 0 } 141}; 142 143#define AANSCALE_BITS 12 144 145uint8_t cropTbl[256 + 2 * MAX_NEG_CROP]; 146 147static int64_t gettime(void) 148{ 149 struct timeval tv; 150 gettimeofday(&tv,NULL); 151 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; 152} 153 154#define NB_ITS 20000 155#define NB_ITS_SPEED 50000 156 157static short idct_mmx_perm[64]; 158 159static short idct_simple_mmx_perm[64]={ 160 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 161 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 162 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 163 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 164 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 165 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 166 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 167 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, 168}; 169 170static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7}; 171 172static void idct_mmx_init(void) 173{ 174 int i; 175 176 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */ 177 for (i = 0; i < 64; i++) { 178 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); 179// idct_simple_mmx_perm[i] = simple_block_permute_op(i); 180 } 181} 182 183DECLARE_ALIGNED(16, static DCTELEM, block)[64]; 184DECLARE_ALIGNED(8, static DCTELEM, block1)[64]; 185DECLARE_ALIGNED(8, static DCTELEM, block_org)[64]; 186 187static inline void mmx_emms(void) 188{ 189#if HAVE_MMX 190 if (cpu_flags & FF_MM_MMX) 191 __asm__ volatile ("emms\n\t"); 192#endif 193} 194 195static void dct_error(const char *name, int is_idct, 196 void (*fdct_func)(DCTELEM *block), 197 void (*fdct_ref)(DCTELEM *block), int form, int test) 198{ 199 int it, i, scale; 200 int err_inf, v; 201 int64_t err2, ti, ti1, it1; 202 int64_t sysErr[64], sysErrMax=0; 203 int maxout=0; 204 int blockSumErrMax=0, blockSumErr; 205 AVLFG prng; 206 207 av_lfg_init(&prng, 1); 208 209 err_inf = 0; 210 err2 = 0; 211 for(i=0; i<64; i++) sysErr[i]=0; 212 for(it=0;it<NB_ITS;it++) { 213 for(i=0;i<64;i++) 214 block1[i] = 0; 215 switch(test){ 216 case 0: 217 for(i=0;i<64;i++) 218 block1[i] = (av_lfg_get(&prng) % 512) -256; 219 if (is_idct){ 220 ff_ref_fdct(block1); 221 222 for(i=0;i<64;i++) 223 block1[i]>>=3; 224 } 225 break; 226 case 1:{ 227 int num = av_lfg_get(&prng) % 10 + 1; 228 for(i=0;i<num;i++) 229 block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % 512 -256; 230 }break; 231 case 2: 232 block1[0] = av_lfg_get(&prng) % 4096 - 2048; 233 block1[63]= (block1[0]&1)^1; 234 break; 235 } 236 237#if 0 // simulate mismatch control 238{ int sum=0; 239 for(i=0;i<64;i++) 240 sum+=block1[i]; 241 242 if((sum&1)==0) block1[63]^=1; 243} 244#endif 245 246 for(i=0; i<64; i++) 247 block_org[i]= block1[i]; 248 249 if (form == MMX_PERM) { 250 for(i=0;i<64;i++) 251 block[idct_mmx_perm[i]] = block1[i]; 252 } else if (form == MMX_SIMPLE_PERM) { 253 for(i=0;i<64;i++) 254 block[idct_simple_mmx_perm[i]] = block1[i]; 255 256 } else if (form == SSE2_PERM) { 257 for(i=0; i<64; i++) 258 block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i]; 259 } else if (form == PARTTRANS_PERM) { 260 for(i=0; i<64; i++) 261 block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i]; 262 } else { 263 for(i=0; i<64; i++) 264 block[i]= block1[i]; 265 } 266#if 0 // simulate mismatch control for tested IDCT but not the ref 267{ int sum=0; 268 for(i=0;i<64;i++) 269 sum+=block[i]; 270 271 if((sum&1)==0) block[63]^=1; 272} 273#endif 274 275 fdct_func(block); 276 mmx_emms(); 277 278 if (form == SCALE_PERM) { 279 for(i=0; i<64; i++) { 280 scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i]; 281 block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS; 282 } 283 } 284 285 fdct_ref(block1); 286 287 blockSumErr=0; 288 for(i=0;i<64;i++) { 289 v = abs(block[i] - block1[i]); 290 if (v > err_inf) 291 err_inf = v; 292 err2 += v * v; 293 sysErr[i] += block[i] - block1[i]; 294 blockSumErr += v; 295 if( abs(block[i])>maxout) maxout=abs(block[i]); 296 } 297 if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr; 298#if 0 // print different matrix pairs 299 if(blockSumErr){ 300 printf("\n"); 301 for(i=0; i<64; i++){ 302 if((i&7)==0) printf("\n"); 303 printf("%4d ", block_org[i]); 304 } 305 for(i=0; i<64; i++){ 306 if((i&7)==0) printf("\n"); 307 printf("%4d ", block[i] - block1[i]); 308 } 309 } 310#endif 311 } 312 for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i])); 313 314#if 1 // dump systematic errors 315 for(i=0; i<64; i++){ 316 if(i%8==0) printf("\n"); 317 printf("%7d ", (int)sysErr[i]); 318 } 319 printf("\n"); 320#endif 321 322 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n", 323 is_idct ? "IDCT" : "DCT", 324 name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax); 325#if 1 //Speed test 326 /* speed test */ 327 for(i=0;i<64;i++) 328 block1[i] = 0; 329 switch(test){ 330 case 0: 331 for(i=0;i<64;i++) 332 block1[i] = av_lfg_get(&prng) % 512 -256; 333 if (is_idct){ 334 ff_ref_fdct(block1); 335 336 for(i=0;i<64;i++) 337 block1[i]>>=3; 338 } 339 break; 340 case 1:{ 341 case 2: 342 block1[0] = av_lfg_get(&prng) % 512 -256; 343 block1[1] = av_lfg_get(&prng) % 512 -256; 344 block1[2] = av_lfg_get(&prng) % 512 -256; 345 block1[3] = av_lfg_get(&prng) % 512 -256; 346 }break; 347 } 348 349 if (form == MMX_PERM) { 350 for(i=0;i<64;i++) 351 block[idct_mmx_perm[i]] = block1[i]; 352 } else if(form == MMX_SIMPLE_PERM) { 353 for(i=0;i<64;i++) 354 block[idct_simple_mmx_perm[i]] = block1[i]; 355 } else { 356 for(i=0; i<64; i++) 357 block[i]= block1[i]; 358 } 359 360 ti = gettime(); 361 it1 = 0; 362 do { 363 for(it=0;it<NB_ITS_SPEED;it++) { 364 for(i=0; i<64; i++) 365 block[i]= block1[i]; 366// memcpy(block, block1, sizeof(DCTELEM) * 64); 367// do not memcpy especially not fastmemcpy because it does movntq !!! 368 fdct_func(block); 369 } 370 it1 += NB_ITS_SPEED; 371 ti1 = gettime() - ti; 372 } while (ti1 < 1000000); 373 mmx_emms(); 374 375 printf("%s %s: %0.1f kdct/s\n", 376 is_idct ? "IDCT" : "DCT", 377 name, (double)it1 * 1000.0 / (double)ti1); 378#endif 379} 380 381DECLARE_ALIGNED(8, static uint8_t, img_dest)[64]; 382DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64]; 383 384static void idct248_ref(uint8_t *dest, int linesize, int16_t *block) 385{ 386 static int init; 387 static double c8[8][8]; 388 static double c4[4][4]; 389 double block1[64], block2[64], block3[64]; 390 double s, sum, v; 391 int i, j, k; 392 393 if (!init) { 394 init = 1; 395 396 for(i=0;i<8;i++) { 397 sum = 0; 398 for(j=0;j<8;j++) { 399 s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0); 400 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0); 401 sum += c8[i][j] * c8[i][j]; 402 } 403 } 404 405 for(i=0;i<4;i++) { 406 sum = 0; 407 for(j=0;j<4;j++) { 408 s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0); 409 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0); 410 sum += c4[i][j] * c4[i][j]; 411 } 412 } 413 } 414 415 /* butterfly */ 416 s = 0.5 * sqrt(2.0); 417 for(i=0;i<4;i++) { 418 for(j=0;j<8;j++) { 419 block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s; 420 block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s; 421 } 422 } 423 424 /* idct8 on lines */ 425 for(i=0;i<8;i++) { 426 for(j=0;j<8;j++) { 427 sum = 0; 428 for(k=0;k<8;k++) 429 sum += c8[k][j] * block1[8*i+k]; 430 block2[8*i+j] = sum; 431 } 432 } 433 434 /* idct4 */ 435 for(i=0;i<8;i++) { 436 for(j=0;j<4;j++) { 437 /* top */ 438 sum = 0; 439 for(k=0;k<4;k++) 440 sum += c4[k][j] * block2[8*(2*k)+i]; 441 block3[8*(2*j)+i] = sum; 442 443 /* bottom */ 444 sum = 0; 445 for(k=0;k<4;k++) 446 sum += c4[k][j] * block2[8*(2*k+1)+i]; 447 block3[8*(2*j+1)+i] = sum; 448 } 449 } 450 451 /* clamp and store the result */ 452 for(i=0;i<8;i++) { 453 for(j=0;j<8;j++) { 454 v = block3[8*i+j]; 455 if (v < 0) 456 v = 0; 457 else if (v > 255) 458 v = 255; 459 dest[i * linesize + j] = (int)rint(v); 460 } 461 } 462} 463 464static void idct248_error(const char *name, 465 void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block)) 466{ 467 int it, i, it1, ti, ti1, err_max, v; 468 469 AVLFG prng; 470 471 av_lfg_init(&prng, 1); 472 473 /* just one test to see if code is correct (precision is less 474 important here) */ 475 err_max = 0; 476 for(it=0;it<NB_ITS;it++) { 477 478 /* XXX: use forward transform to generate values */ 479 for(i=0;i<64;i++) 480 block1[i] = av_lfg_get(&prng) % 256 - 128; 481 block1[0] += 1024; 482 483 for(i=0; i<64; i++) 484 block[i]= block1[i]; 485 idct248_ref(img_dest1, 8, block); 486 487 for(i=0; i<64; i++) 488 block[i]= block1[i]; 489 idct248_put(img_dest, 8, block); 490 491 for(i=0;i<64;i++) { 492 v = abs((int)img_dest[i] - (int)img_dest1[i]); 493 if (v == 255) 494 printf("%d %d\n", img_dest[i], img_dest1[i]); 495 if (v > err_max) 496 err_max = v; 497 } 498#if 0 499 printf("ref=\n"); 500 for(i=0;i<8;i++) { 501 int j; 502 for(j=0;j<8;j++) { 503 printf(" %3d", img_dest1[i*8+j]); 504 } 505 printf("\n"); 506 } 507 508 printf("out=\n"); 509 for(i=0;i<8;i++) { 510 int j; 511 for(j=0;j<8;j++) { 512 printf(" %3d", img_dest[i*8+j]); 513 } 514 printf("\n"); 515 } 516#endif 517 } 518 printf("%s %s: err_inf=%d\n", 519 1 ? "IDCT248" : "DCT248", 520 name, err_max); 521 522 ti = gettime(); 523 it1 = 0; 524 do { 525 for(it=0;it<NB_ITS_SPEED;it++) { 526 for(i=0; i<64; i++) 527 block[i]= block1[i]; 528// memcpy(block, block1, sizeof(DCTELEM) * 64); 529// do not memcpy especially not fastmemcpy because it does movntq !!! 530 idct248_put(img_dest, 8, block); 531 } 532 it1 += NB_ITS_SPEED; 533 ti1 = gettime() - ti; 534 } while (ti1 < 1000000); 535 mmx_emms(); 536 537 printf("%s %s: %0.1f kdct/s\n", 538 1 ? "IDCT248" : "DCT248", 539 name, (double)it1 * 1000.0 / (double)ti1); 540} 541 542static void help(void) 543{ 544 printf("dct-test [-i] [<test-number>]\n" 545 "test-number 0 -> test with random matrixes\n" 546 " 1 -> test with random sparse matrixes\n" 547 " 2 -> do 3. test from mpeg4 std\n" 548 "-i test IDCT implementations\n" 549 "-4 test IDCT248 implementations\n"); 550} 551 552int main(int argc, char **argv) 553{ 554 int test_idct = 0, test_248_dct = 0; 555 int c,i; 556 int test=1; 557 cpu_flags = mm_support(); 558 559 ff_ref_dct_init(); 560 idct_mmx_init(); 561 562 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i; 563 for(i=0;i<MAX_NEG_CROP;i++) { 564 cropTbl[i] = 0; 565 cropTbl[i + MAX_NEG_CROP + 256] = 255; 566 } 567 568 for(;;) { 569 c = getopt(argc, argv, "ih4"); 570 if (c == -1) 571 break; 572 switch(c) { 573 case 'i': 574 test_idct = 1; 575 break; 576 case '4': 577 test_248_dct = 1; 578 break; 579 default : 580 case 'h': 581 help(); 582 return 0; 583 } 584 } 585 586 if(optind <argc) test= atoi(argv[optind]); 587 588 printf("ffmpeg DCT/IDCT test\n"); 589 590 if (test_248_dct) { 591 idct248_error("SIMPLE-C", ff_simple_idct248_put); 592 } else { 593 for (i=0;algos[i].name;i++) 594 if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) { 595 dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test); 596 } 597 } 598 return 0; 599} 600