1/* 2 * (c) 2001 Fabrice Bellard 3 * 2007 Marc Hoffman <marc.hoffman@analog.com> 4 * 5 * This file is part of Libav. 6 * 7 * Libav is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * Libav is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with Libav; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22/** 23 * @file 24 * DCT test (c) 2001 Fabrice Bellard 25 * Started from sample code by Juan J. Sierralta P. 26 */ 27 28#include <stdlib.h> 29#include <stdio.h> 30#include <string.h> 31#include <sys/time.h> 32#include <unistd.h> 33#include <math.h> 34 35#include "libavutil/cpu.h" 36#include "libavutil/common.h" 37#include "libavutil/lfg.h" 38 39#include "simple_idct.h" 40#include "aandcttab.h" 41#include "faandct.h" 42#include "faanidct.h" 43#include "x86/idct_xvid.h" 44#include "dctref.h" 45 46#undef printf 47 48void ff_mmx_idct(DCTELEM *data); 49void ff_mmxext_idct(DCTELEM *data); 50 51void odivx_idct_c(short *block); 52 53// BFIN 54void ff_bfin_idct(DCTELEM *block); 55void ff_bfin_fdct(DCTELEM *block); 56 57// ALTIVEC 58void fdct_altivec(DCTELEM *block); 59//void idct_altivec(DCTELEM *block);?? no routine 60 61// ARM 62void ff_j_rev_dct_arm(DCTELEM *data); 63void ff_simple_idct_arm(DCTELEM *data); 64void ff_simple_idct_armv5te(DCTELEM *data); 65void ff_simple_idct_armv6(DCTELEM *data); 66void ff_simple_idct_neon(DCTELEM *data); 67 68void ff_simple_idct_axp(DCTELEM *data); 69 70struct algo { 71 const char *name; 72 void (*func)(DCTELEM *block); 73 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, 74 SSE2_PERM, PARTTRANS_PERM } format; 75 int mm_support; 76 int nonspec; 77}; 78 79#ifndef FAAN_POSTSCALE 80#define FAAN_SCALE SCALE_PERM 81#else 82#define FAAN_SCALE NO_PERM 83#endif 84 85static int cpu_flags; 86 87static const struct algo fdct_tab[] = { 88 { "REF-DBL", ff_ref_fdct, NO_PERM }, 89 { "FAAN", ff_faandct, FAAN_SCALE }, 90 { "IJG-AAN-INT", fdct_ifast, SCALE_PERM }, 91 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM }, 92 93#if HAVE_MMX 94 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX }, 95 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 }, 96 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 }, 97#endif 98 99#if HAVE_ALTIVEC 100 { "altivecfdct", fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC }, 101#endif 102 103#if ARCH_BFIN 104 { "BFINfdct", ff_bfin_fdct, NO_PERM }, 105#endif 106 107 { 0 } 108}; 109 110static const struct algo idct_tab[] = { 111 { "FAANI", ff_faanidct, NO_PERM }, 112 { "REF-DBL", ff_ref_idct, NO_PERM }, 113 { "INT", j_rev_dct, MMX_PERM }, 114 { "SIMPLE-C", ff_simple_idct_8, NO_PERM }, 115 116#if HAVE_MMX 117#if CONFIG_GPL 118 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 }, 119 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 }, 120#endif 121 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX }, 122 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 }, 123 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 }, 124 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 }, 125#endif 126 127#if ARCH_BFIN 128 { "BFINidct", ff_bfin_idct, NO_PERM }, 129#endif 130 131#if ARCH_ARM 132 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM }, 133 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM }, 134#endif 135#if HAVE_ARMV5TE 136 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM }, 137#endif 138#if HAVE_ARMV6 139 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM }, 140#endif 141#if HAVE_NEON 142 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM }, 143#endif 144 145#if ARCH_ALPHA 146 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM }, 147#endif 148 149 { 0 } 150}; 151 152#define AANSCALE_BITS 12 153 154static int64_t gettime(void) 155{ 156 struct timeval tv; 157 gettimeofday(&tv, NULL); 158 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; 159} 160 161#define NB_ITS 20000 162#define NB_ITS_SPEED 50000 163 164static short idct_mmx_perm[64]; 165 166static short idct_simple_mmx_perm[64] = { 167 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 168 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 169 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 170 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 171 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 172 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 173 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 174 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, 175}; 176 177static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; 178 179static void idct_mmx_init(void) 180{ 181 int i; 182 183 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */ 184 for (i = 0; i < 64; i++) { 185 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); 186 } 187} 188 189DECLARE_ALIGNED(16, static DCTELEM, block)[64]; 190DECLARE_ALIGNED(8, static DCTELEM, block1)[64]; 191 192static inline void mmx_emms(void) 193{ 194#if HAVE_MMX 195 if (cpu_flags & AV_CPU_FLAG_MMX) 196 __asm__ volatile ("emms\n\t"); 197#endif 198} 199 200static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng) 201{ 202 int i, j; 203 204 memset(block, 0, 64 * sizeof(*block)); 205 206 switch (test) { 207 case 0: 208 for (i = 0; i < 64; i++) 209 block[i] = (av_lfg_get(prng) % 512) - 256; 210 if (is_idct) { 211 ff_ref_fdct(block); 212 for (i = 0; i < 64; i++) 213 block[i] >>= 3; 214 } 215 break; 216 case 1: 217 j = av_lfg_get(prng) % 10 + 1; 218 for (i = 0; i < j; i++) 219 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256; 220 break; 221 case 2: 222 block[ 0] = av_lfg_get(prng) % 4096 - 2048; 223 block[63] = (block[0] & 1) ^ 1; 224 break; 225 } 226} 227 228static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm) 229{ 230 int i; 231 232 if (perm == MMX_PERM) { 233 for (i = 0; i < 64; i++) 234 dst[idct_mmx_perm[i]] = src[i]; 235 } else if (perm == MMX_SIMPLE_PERM) { 236 for (i = 0; i < 64; i++) 237 dst[idct_simple_mmx_perm[i]] = src[i]; 238 } else if (perm == SSE2_PERM) { 239 for (i = 0; i < 64; i++) 240 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i]; 241 } else if (perm == PARTTRANS_PERM) { 242 for (i = 0; i < 64; i++) 243 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i]; 244 } else { 245 for (i = 0; i < 64; i++) 246 dst[i] = src[i]; 247 } 248} 249 250static int dct_error(const struct algo *dct, int test, int is_idct, int speed) 251{ 252 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct; 253 int it, i, scale; 254 int err_inf, v; 255 int64_t err2, ti, ti1, it1, err_sum = 0; 256 int64_t sysErr[64], sysErrMax = 0; 257 int maxout = 0; 258 int blockSumErrMax = 0, blockSumErr; 259 AVLFG prng; 260 double omse, ome; 261 int spec_err; 262 263 av_lfg_init(&prng, 1); 264 265 err_inf = 0; 266 err2 = 0; 267 for (i = 0; i < 64; i++) 268 sysErr[i] = 0; 269 for (it = 0; it < NB_ITS; it++) { 270 init_block(block1, test, is_idct, &prng); 271 permute(block, block1, dct->format); 272 273 dct->func(block); 274 mmx_emms(); 275 276 if (dct->format == SCALE_PERM) { 277 for (i = 0; i < 64; i++) { 278 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i]; 279 block[i] = (block[i] * scale) >> AANSCALE_BITS; 280 } 281 } 282 283 ref(block1); 284 285 blockSumErr = 0; 286 for (i = 0; i < 64; i++) { 287 int err = block[i] - block1[i]; 288 err_sum += err; 289 v = abs(err); 290 if (v > err_inf) 291 err_inf = v; 292 err2 += v * v; 293 sysErr[i] += block[i] - block1[i]; 294 blockSumErr += v; 295 if (abs(block[i]) > maxout) 296 maxout = abs(block[i]); 297 } 298 if (blockSumErrMax < blockSumErr) 299 blockSumErrMax = blockSumErr; 300 } 301 for (i = 0; i < 64; i++) 302 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i])); 303 304 for (i = 0; i < 64; i++) { 305 if (i % 8 == 0) 306 printf("\n"); 307 printf("%7d ", (int) sysErr[i]); 308 } 309 printf("\n"); 310 311 omse = (double) err2 / NB_ITS / 64; 312 ome = (double) err_sum / NB_ITS / 64; 313 314 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015); 315 316 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n", 317 is_idct ? "IDCT" : "DCT", dct->name, err_inf, 318 omse, ome, (double) sysErrMax / NB_ITS, 319 maxout, blockSumErrMax); 320 321 if (spec_err && !dct->nonspec) 322 return 1; 323 324 if (!speed) 325 return 0; 326 327 /* speed test */ 328 init_block(block, test, is_idct, &prng); 329 permute(block1, block, dct->format); 330 331 ti = gettime(); 332 it1 = 0; 333 do { 334 for (it = 0; it < NB_ITS_SPEED; it++) { 335 memcpy(block, block1, sizeof(block)); 336 dct->func(block); 337 } 338 it1 += NB_ITS_SPEED; 339 ti1 = gettime() - ti; 340 } while (ti1 < 1000000); 341 mmx_emms(); 342 343 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name, 344 (double) it1 * 1000.0 / (double) ti1); 345 346 return 0; 347} 348 349DECLARE_ALIGNED(8, static uint8_t, img_dest)[64]; 350DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64]; 351 352static void idct248_ref(uint8_t *dest, int linesize, int16_t *block) 353{ 354 static int init; 355 static double c8[8][8]; 356 static double c4[4][4]; 357 double block1[64], block2[64], block3[64]; 358 double s, sum, v; 359 int i, j, k; 360 361 if (!init) { 362 init = 1; 363 364 for (i = 0; i < 8; i++) { 365 sum = 0; 366 for (j = 0; j < 8; j++) { 367 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0); 368 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0); 369 sum += c8[i][j] * c8[i][j]; 370 } 371 } 372 373 for (i = 0; i < 4; i++) { 374 sum = 0; 375 for (j = 0; j < 4; j++) { 376 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0); 377 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0); 378 sum += c4[i][j] * c4[i][j]; 379 } 380 } 381 } 382 383 /* butterfly */ 384 s = 0.5 * sqrt(2.0); 385 for (i = 0; i < 4; i++) { 386 for (j = 0; j < 8; j++) { 387 block1[8 * (2 * i) + j] = 388 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s; 389 block1[8 * (2 * i + 1) + j] = 390 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s; 391 } 392 } 393 394 /* idct8 on lines */ 395 for (i = 0; i < 8; i++) { 396 for (j = 0; j < 8; j++) { 397 sum = 0; 398 for (k = 0; k < 8; k++) 399 sum += c8[k][j] * block1[8 * i + k]; 400 block2[8 * i + j] = sum; 401 } 402 } 403 404 /* idct4 */ 405 for (i = 0; i < 8; i++) { 406 for (j = 0; j < 4; j++) { 407 /* top */ 408 sum = 0; 409 for (k = 0; k < 4; k++) 410 sum += c4[k][j] * block2[8 * (2 * k) + i]; 411 block3[8 * (2 * j) + i] = sum; 412 413 /* bottom */ 414 sum = 0; 415 for (k = 0; k < 4; k++) 416 sum += c4[k][j] * block2[8 * (2 * k + 1) + i]; 417 block3[8 * (2 * j + 1) + i] = sum; 418 } 419 } 420 421 /* clamp and store the result */ 422 for (i = 0; i < 8; i++) { 423 for (j = 0; j < 8; j++) { 424 v = block3[8 * i + j]; 425 if (v < 0) v = 0; 426 else if (v > 255) v = 255; 427 dest[i * linesize + j] = (int) rint(v); 428 } 429 } 430} 431 432static void idct248_error(const char *name, 433 void (*idct248_put)(uint8_t *dest, int line_size, 434 int16_t *block), 435 int speed) 436{ 437 int it, i, it1, ti, ti1, err_max, v; 438 AVLFG prng; 439 440 av_lfg_init(&prng, 1); 441 442 /* just one test to see if code is correct (precision is less 443 important here) */ 444 err_max = 0; 445 for (it = 0; it < NB_ITS; it++) { 446 /* XXX: use forward transform to generate values */ 447 for (i = 0; i < 64; i++) 448 block1[i] = av_lfg_get(&prng) % 256 - 128; 449 block1[0] += 1024; 450 451 for (i = 0; i < 64; i++) 452 block[i] = block1[i]; 453 idct248_ref(img_dest1, 8, block); 454 455 for (i = 0; i < 64; i++) 456 block[i] = block1[i]; 457 idct248_put(img_dest, 8, block); 458 459 for (i = 0; i < 64; i++) { 460 v = abs((int) img_dest[i] - (int) img_dest1[i]); 461 if (v == 255) 462 printf("%d %d\n", img_dest[i], img_dest1[i]); 463 if (v > err_max) 464 err_max = v; 465 } 466 } 467 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max); 468 469 if (!speed) 470 return; 471 472 ti = gettime(); 473 it1 = 0; 474 do { 475 for (it = 0; it < NB_ITS_SPEED; it++) { 476 for (i = 0; i < 64; i++) 477 block[i] = block1[i]; 478 idct248_put(img_dest, 8, block); 479 } 480 it1 += NB_ITS_SPEED; 481 ti1 = gettime() - ti; 482 } while (ti1 < 1000000); 483 mmx_emms(); 484 485 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name, 486 (double) it1 * 1000.0 / (double) ti1); 487} 488 489static void help(void) 490{ 491 printf("dct-test [-i] [<test-number>]\n" 492 "test-number 0 -> test with random matrixes\n" 493 " 1 -> test with random sparse matrixes\n" 494 " 2 -> do 3. test from mpeg4 std\n" 495 "-i test IDCT implementations\n" 496 "-4 test IDCT248 implementations\n" 497 "-t speed test\n"); 498} 499 500int main(int argc, char **argv) 501{ 502 int test_idct = 0, test_248_dct = 0; 503 int c, i; 504 int test = 1; 505 int speed = 0; 506 int err = 0; 507 508 cpu_flags = av_get_cpu_flags(); 509 510 ff_ref_dct_init(); 511 idct_mmx_init(); 512 513 for (;;) { 514 c = getopt(argc, argv, "ih4t"); 515 if (c == -1) 516 break; 517 switch (c) { 518 case 'i': 519 test_idct = 1; 520 break; 521 case '4': 522 test_248_dct = 1; 523 break; 524 case 't': 525 speed = 1; 526 break; 527 default: 528 case 'h': 529 help(); 530 return 0; 531 } 532 } 533 534 if (optind < argc) 535 test = atoi(argv[optind]); 536 537 printf("Libav DCT/IDCT test\n"); 538 539 if (test_248_dct) { 540 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed); 541 } else { 542 const struct algo *algos = test_idct ? idct_tab : fdct_tab; 543 for (i = 0; algos[i].name; i++) 544 if (!(~cpu_flags & algos[i].mm_support)) { 545 err |= dct_error(&algos[i], test, test_idct, speed); 546 } 547 } 548 549 return err; 550} 551