1/* 2 * (c) 2001 Fabrice Bellard 3 * 2007 Marc Hoffman <marc.hoffman@analog.com> 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22/** 23 * @file libavcodec/dct-test.c 24 * DCT test (c) 2001 Fabrice Bellard 25 * Started from sample code by Juan J. Sierralta P. 26 */ 27 28#include <stdlib.h> 29#include <stdio.h> 30#include <string.h> 31#include <sys/time.h> 32#include <unistd.h> 33#include <math.h> 34 35#include "libavutil/common.h" 36 37#include "simple_idct.h" 38#include "aandcttab.h" 39#include "faandct.h" 40#include "faanidct.h" 41#include "x86/idct_xvid.h" 42 43#undef printf 44#undef random 45 46void *fast_memcpy(void *a, const void *b, size_t c){return memcpy(a,b,c);}; 47 48/* reference fdct/idct */ 49void ff_ref_fdct(DCTELEM *block); 50void ff_ref_idct(DCTELEM *block); 51void ff_ref_dct_init(void); 52 53void ff_mmx_idct(DCTELEM *data); 54void ff_mmxext_idct(DCTELEM *data); 55 56void odivx_idct_c(short *block); 57 58// BFIN 59void ff_bfin_idct(DCTELEM *block); 60void ff_bfin_fdct(DCTELEM *block); 61 62// ALTIVEC 63void fdct_altivec(DCTELEM *block); 64//void idct_altivec(DCTELEM *block);?? no routine 65 66// ARM 67void j_rev_dct_ARM(DCTELEM *data); 68void simple_idct_ARM(DCTELEM *data); 69void simple_idct_armv5te(DCTELEM *data); 70void ff_simple_idct_armv6(DCTELEM *data); 71void ff_simple_idct_neon(DCTELEM *data); 72 73void ff_simple_idct_axp(DCTELEM *data); 74 75struct algo { 76 const char *name; 77 enum { FDCT, IDCT } is_idct; 78 void (* func) (DCTELEM *block); 79 void (* ref) (DCTELEM *block); 80 enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format; 81 int mm_support; 82}; 83 84#ifndef FAAN_POSTSCALE 85#define FAAN_SCALE SCALE_PERM 86#else 87#define FAAN_SCALE NO_PERM 88#endif 89 90static int cpu_flags; 91 92struct algo algos[] = { 93 {"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM}, 94 {"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE}, 95 {"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM}, 96 {"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM}, 97 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM}, 98 {"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM}, 99 {"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM}, 100 {"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM}, 101 102#if HAVE_MMX 103 {"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, FF_MM_MMX}, 104#if HAVE_MMX2 105 {"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, FF_MM_MMXEXT}, 106 {"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, FF_MM_SSE2}, 107#endif 108 109#if CONFIG_GPL 110 {"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, FF_MM_MMX}, 111 {"LIBMPEG2-MMXEXT", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, FF_MM_MMXEXT}, 112#endif 113 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, FF_MM_MMX}, 114 {"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, FF_MM_MMX}, 115 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, FF_MM_MMXEXT}, 116 {"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, FF_MM_SSE2}, 117#endif 118 119#if HAVE_ALTIVEC 120 {"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, FF_MM_ALTIVEC}, 121#endif 122 123#if ARCH_BFIN 124 {"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM}, 125 {"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM}, 126#endif 127 128#if ARCH_ARM 129 {"SIMPLE-ARM", 1, simple_idct_ARM, ff_ref_idct, NO_PERM }, 130 {"INT-ARM", 1, j_rev_dct_ARM, ff_ref_idct, MMX_PERM }, 131#if HAVE_ARMV5TE 132 {"SIMPLE-ARMV5TE", 1, simple_idct_armv5te, ff_ref_idct, NO_PERM }, 133#endif 134#if HAVE_ARMV6 135 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM }, 136#endif 137#if HAVE_NEON 138 {"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM }, 139#endif 140#endif /* ARCH_ARM */ 141 142#if ARCH_ALPHA 143 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM }, 144#endif 145 146 { 0 } 147}; 148 149#define AANSCALE_BITS 12 150 151uint8_t cropTbl[256 + 2 * MAX_NEG_CROP]; 152 153int64_t gettime(void) 154{ 155 struct timeval tv; 156 gettimeofday(&tv,NULL); 157 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; 158} 159 160#define NB_ITS 20000 161#define NB_ITS_SPEED 50000 162 163static short idct_mmx_perm[64]; 164 165static short idct_simple_mmx_perm[64]={ 166 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 167 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 168 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 169 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 170 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 171 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 172 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 173 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, 174}; 175 176static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7}; 177 178void idct_mmx_init(void) 179{ 180 int i; 181 182 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */ 183 for (i = 0; i < 64; i++) { 184 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); 185// idct_simple_mmx_perm[i] = simple_block_permute_op(i); 186 } 187} 188 189static DCTELEM block[64] __attribute__ ((aligned (16))); 190static DCTELEM block1[64] __attribute__ ((aligned (8))); 191static DCTELEM block_org[64] __attribute__ ((aligned (8))); 192 193static inline void mmx_emms(void) 194{ 195#if HAVE_MMX 196 if (cpu_flags & FF_MM_MMX) 197 __asm__ volatile ("emms\n\t"); 198#endif 199} 200 201void dct_error(const char *name, int is_idct, 202 void (*fdct_func)(DCTELEM *block), 203 void (*fdct_ref)(DCTELEM *block), int form, int test) 204{ 205 int it, i, scale; 206 int err_inf, v; 207 int64_t err2, ti, ti1, it1; 208 int64_t sysErr[64], sysErrMax=0; 209 int maxout=0; 210 int blockSumErrMax=0, blockSumErr; 211 212 srandom(0); 213 214 err_inf = 0; 215 err2 = 0; 216 for(i=0; i<64; i++) sysErr[i]=0; 217 for(it=0;it<NB_ITS;it++) { 218 for(i=0;i<64;i++) 219 block1[i] = 0; 220 switch(test){ 221 case 0: 222 for(i=0;i<64;i++) 223 block1[i] = (random() % 512) -256; 224 if (is_idct){ 225 ff_ref_fdct(block1); 226 227 for(i=0;i<64;i++) 228 block1[i]>>=3; 229 } 230 break; 231 case 1:{ 232 int num= (random()%10)+1; 233 for(i=0;i<num;i++) 234 block1[random()%64] = (random() % 512) -256; 235 }break; 236 case 2: 237 block1[0]= (random()%4096)-2048; 238 block1[63]= (block1[0]&1)^1; 239 break; 240 } 241 242#if 0 // simulate mismatch control 243{ int sum=0; 244 for(i=0;i<64;i++) 245 sum+=block1[i]; 246 247 if((sum&1)==0) block1[63]^=1; 248} 249#endif 250 251 for(i=0; i<64; i++) 252 block_org[i]= block1[i]; 253 254 if (form == MMX_PERM) { 255 for(i=0;i<64;i++) 256 block[idct_mmx_perm[i]] = block1[i]; 257 } else if (form == MMX_SIMPLE_PERM) { 258 for(i=0;i<64;i++) 259 block[idct_simple_mmx_perm[i]] = block1[i]; 260 261 } else if (form == SSE2_PERM) { 262 for(i=0; i<64; i++) 263 block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i]; 264 } else if (form == PARTTRANS_PERM) { 265 for(i=0; i<64; i++) 266 block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i]; 267 } else { 268 for(i=0; i<64; i++) 269 block[i]= block1[i]; 270 } 271#if 0 // simulate mismatch control for tested IDCT but not the ref 272{ int sum=0; 273 for(i=0;i<64;i++) 274 sum+=block[i]; 275 276 if((sum&1)==0) block[63]^=1; 277} 278#endif 279 280 fdct_func(block); 281 mmx_emms(); 282 283 if (form == SCALE_PERM) { 284 for(i=0; i<64; i++) { 285 scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i]; 286 block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS; 287 } 288 } 289 290 fdct_ref(block1); 291 292 blockSumErr=0; 293 for(i=0;i<64;i++) { 294 v = abs(block[i] - block1[i]); 295 if (v > err_inf) 296 err_inf = v; 297 err2 += v * v; 298 sysErr[i] += block[i] - block1[i]; 299 blockSumErr += v; 300 if( abs(block[i])>maxout) maxout=abs(block[i]); 301 } 302 if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr; 303#if 0 // print different matrix pairs 304 if(blockSumErr){ 305 printf("\n"); 306 for(i=0; i<64; i++){ 307 if((i&7)==0) printf("\n"); 308 printf("%4d ", block_org[i]); 309 } 310 for(i=0; i<64; i++){ 311 if((i&7)==0) printf("\n"); 312 printf("%4d ", block[i] - block1[i]); 313 } 314 } 315#endif 316 } 317 for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i])); 318 319#if 1 // dump systematic errors 320 for(i=0; i<64; i++){ 321 if(i%8==0) printf("\n"); 322 printf("%5d ", (int)sysErr[i]); 323 } 324 printf("\n"); 325#endif 326 327 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n", 328 is_idct ? "IDCT" : "DCT", 329 name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax); 330#if 1 //Speed test 331 /* speed test */ 332 for(i=0;i<64;i++) 333 block1[i] = 0; 334 switch(test){ 335 case 0: 336 for(i=0;i<64;i++) 337 block1[i] = (random() % 512) -256; 338 if (is_idct){ 339 ff_ref_fdct(block1); 340 341 for(i=0;i<64;i++) 342 block1[i]>>=3; 343 } 344 break; 345 case 1:{ 346 case 2: 347 block1[0] = (random() % 512) -256; 348 block1[1] = (random() % 512) -256; 349 block1[2] = (random() % 512) -256; 350 block1[3] = (random() % 512) -256; 351 }break; 352 } 353 354 if (form == MMX_PERM) { 355 for(i=0;i<64;i++) 356 block[idct_mmx_perm[i]] = block1[i]; 357 } else if(form == MMX_SIMPLE_PERM) { 358 for(i=0;i<64;i++) 359 block[idct_simple_mmx_perm[i]] = block1[i]; 360 } else { 361 for(i=0; i<64; i++) 362 block[i]= block1[i]; 363 } 364 365 ti = gettime(); 366 it1 = 0; 367 do { 368 for(it=0;it<NB_ITS_SPEED;it++) { 369 for(i=0; i<64; i++) 370 block[i]= block1[i]; 371// memcpy(block, block1, sizeof(DCTELEM) * 64); 372// do not memcpy especially not fastmemcpy because it does movntq !!! 373 fdct_func(block); 374 } 375 it1 += NB_ITS_SPEED; 376 ti1 = gettime() - ti; 377 } while (ti1 < 1000000); 378 mmx_emms(); 379 380 printf("%s %s: %0.1f kdct/s\n", 381 is_idct ? "IDCT" : "DCT", 382 name, (double)it1 * 1000.0 / (double)ti1); 383#endif 384} 385 386static uint8_t img_dest[64] __attribute__ ((aligned (8))); 387static uint8_t img_dest1[64] __attribute__ ((aligned (8))); 388 389void idct248_ref(uint8_t *dest, int linesize, int16_t *block) 390{ 391 static int init; 392 static double c8[8][8]; 393 static double c4[4][4]; 394 double block1[64], block2[64], block3[64]; 395 double s, sum, v; 396 int i, j, k; 397 398 if (!init) { 399 init = 1; 400 401 for(i=0;i<8;i++) { 402 sum = 0; 403 for(j=0;j<8;j++) { 404 s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0); 405 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0); 406 sum += c8[i][j] * c8[i][j]; 407 } 408 } 409 410 for(i=0;i<4;i++) { 411 sum = 0; 412 for(j=0;j<4;j++) { 413 s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0); 414 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0); 415 sum += c4[i][j] * c4[i][j]; 416 } 417 } 418 } 419 420 /* butterfly */ 421 s = 0.5 * sqrt(2.0); 422 for(i=0;i<4;i++) { 423 for(j=0;j<8;j++) { 424 block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s; 425 block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s; 426 } 427 } 428 429 /* idct8 on lines */ 430 for(i=0;i<8;i++) { 431 for(j=0;j<8;j++) { 432 sum = 0; 433 for(k=0;k<8;k++) 434 sum += c8[k][j] * block1[8*i+k]; 435 block2[8*i+j] = sum; 436 } 437 } 438 439 /* idct4 */ 440 for(i=0;i<8;i++) { 441 for(j=0;j<4;j++) { 442 /* top */ 443 sum = 0; 444 for(k=0;k<4;k++) 445 sum += c4[k][j] * block2[8*(2*k)+i]; 446 block3[8*(2*j)+i] = sum; 447 448 /* bottom */ 449 sum = 0; 450 for(k=0;k<4;k++) 451 sum += c4[k][j] * block2[8*(2*k+1)+i]; 452 block3[8*(2*j+1)+i] = sum; 453 } 454 } 455 456 /* clamp and store the result */ 457 for(i=0;i<8;i++) { 458 for(j=0;j<8;j++) { 459 v = block3[8*i+j]; 460 if (v < 0) 461 v = 0; 462 else if (v > 255) 463 v = 255; 464 dest[i * linesize + j] = (int)rint(v); 465 } 466 } 467} 468 469void idct248_error(const char *name, 470 void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block)) 471{ 472 int it, i, it1, ti, ti1, err_max, v; 473 474 srandom(0); 475 476 /* just one test to see if code is correct (precision is less 477 important here) */ 478 err_max = 0; 479 for(it=0;it<NB_ITS;it++) { 480 481 /* XXX: use forward transform to generate values */ 482 for(i=0;i<64;i++) 483 block1[i] = (random() % 256) - 128; 484 block1[0] += 1024; 485 486 for(i=0; i<64; i++) 487 block[i]= block1[i]; 488 idct248_ref(img_dest1, 8, block); 489 490 for(i=0; i<64; i++) 491 block[i]= block1[i]; 492 idct248_put(img_dest, 8, block); 493 494 for(i=0;i<64;i++) { 495 v = abs((int)img_dest[i] - (int)img_dest1[i]); 496 if (v == 255) 497 printf("%d %d\n", img_dest[i], img_dest1[i]); 498 if (v > err_max) 499 err_max = v; 500 } 501#if 0 502 printf("ref=\n"); 503 for(i=0;i<8;i++) { 504 int j; 505 for(j=0;j<8;j++) { 506 printf(" %3d", img_dest1[i*8+j]); 507 } 508 printf("\n"); 509 } 510 511 printf("out=\n"); 512 for(i=0;i<8;i++) { 513 int j; 514 for(j=0;j<8;j++) { 515 printf(" %3d", img_dest[i*8+j]); 516 } 517 printf("\n"); 518 } 519#endif 520 } 521 printf("%s %s: err_inf=%d\n", 522 1 ? "IDCT248" : "DCT248", 523 name, err_max); 524 525 ti = gettime(); 526 it1 = 0; 527 do { 528 for(it=0;it<NB_ITS_SPEED;it++) { 529 for(i=0; i<64; i++) 530 block[i]= block1[i]; 531// memcpy(block, block1, sizeof(DCTELEM) * 64); 532// do not memcpy especially not fastmemcpy because it does movntq !!! 533 idct248_put(img_dest, 8, block); 534 } 535 it1 += NB_ITS_SPEED; 536 ti1 = gettime() - ti; 537 } while (ti1 < 1000000); 538 mmx_emms(); 539 540 printf("%s %s: %0.1f kdct/s\n", 541 1 ? "IDCT248" : "DCT248", 542 name, (double)it1 * 1000.0 / (double)ti1); 543} 544 545void help(void) 546{ 547 printf("dct-test [-i] [<test-number>]\n" 548 "test-number 0 -> test with random matrixes\n" 549 " 1 -> test with random sparse matrixes\n" 550 " 2 -> do 3. test from mpeg4 std\n" 551 "-i test IDCT implementations\n" 552 "-4 test IDCT248 implementations\n"); 553} 554 555int main(int argc, char **argv) 556{ 557 int test_idct = 0, test_248_dct = 0; 558 int c,i; 559 int test=1; 560 cpu_flags = mm_support(); 561 562 ff_ref_dct_init(); 563 idct_mmx_init(); 564 565 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i; 566 for(i=0;i<MAX_NEG_CROP;i++) { 567 cropTbl[i] = 0; 568 cropTbl[i + MAX_NEG_CROP + 256] = 255; 569 } 570 571 for(;;) { 572 c = getopt(argc, argv, "ih4"); 573 if (c == -1) 574 break; 575 switch(c) { 576 case 'i': 577 test_idct = 1; 578 break; 579 case '4': 580 test_248_dct = 1; 581 break; 582 default : 583 case 'h': 584 help(); 585 return 0; 586 } 587 } 588 589 if(optind <argc) test= atoi(argv[optind]); 590 591 printf("ffmpeg DCT/IDCT test\n"); 592 593 if (test_248_dct) { 594 idct248_error("SIMPLE-C", ff_simple_idct248_put); 595 } else { 596 for (i=0;algos[i].name;i++) 597 if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) { 598 dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test); 599 } 600 } 601 return 0; 602} 603