1/* 2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at) 3 * 4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org> 5 * 6 * This file is part of FFmpeg. 7 * 8 * FFmpeg is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2 of the License, or 11 * (at your option) any later version. 12 * 13 * FFmpeg is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 * 18 * You should have received a copy of the GNU General Public License 19 * along with FFmpeg; if not, write to the Free Software 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21 */ 22 23/** 24 * @file libpostproc/postprocess.c 25 * postprocessing. 26 */ 27 28/* 29 C MMX MMX2 3DNow AltiVec 30isVertDC Ec Ec Ec 31isVertMinMaxOk Ec Ec Ec 32doVertLowPass E e e Ec 33doVertDefFilter Ec Ec e e Ec 34isHorizDC Ec Ec Ec 35isHorizMinMaxOk a E Ec 36doHorizLowPass E e e Ec 37doHorizDefFilter Ec Ec e e Ec 38do_a_deblock Ec E Ec E 39deRing E e e* Ecp 40Vertical RKAlgo1 E a a 41Horizontal RKAlgo1 a a 42Vertical X1# a E E 43Horizontal X1# a E E 44LinIpolDeinterlace e E E* 45CubicIpolDeinterlace a e e* 46LinBlendDeinterlace e E E* 47MedianDeinterlace# E Ec Ec 48TempDeNoiser# E e e Ec 49 50* I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work 51# more or less selfinvented filters so the exactness is not too meaningful 52E = Exact implementation 53e = almost exact implementation (slightly different rounding,...) 54a = alternative / approximate impl 55c = checked against the other implementations (-vo md5) 56p = partially optimized, still some work to do 57*/ 58 59/* 60TODO: 61reduce the time wasted on the mem transfer 62unroll stuff if instructions depend too much on the prior one 63move YScale thing to the end instead of fixing QP 64write a faster and higher quality deblocking filter :) 65make the mainloop more flexible (variable number of blocks at once 66 (the if/else stuff per block is slowing things down) 67compare the quality & speed of all filters 68split this huge file 69optimize c versions 70try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks 71... 72*/ 73 74//Changelog: use the Subversion log 75 76#include "config.h" 77#include "libavutil/avutil.h" 78#include <inttypes.h> 79#include <stdio.h> 80#include <stdlib.h> 81#include <string.h> 82//#undef HAVE_MMX2 83//#define HAVE_AMD3DNOW 84//#undef HAVE_MMX 85//#undef ARCH_X86 86//#define DEBUG_BRIGHTNESS 87#include "postprocess.h" 88#include "postprocess_internal.h" 89 90unsigned postproc_version(void) 91{ 92 return LIBPOSTPROC_VERSION_INT; 93} 94 95#if HAVE_ALTIVEC_H 96#include <altivec.h> 97#endif 98 99#define GET_MODE_BUFFER_SIZE 500 100#define OPTIONS_ARRAY_SIZE 10 101#define BLOCK_SIZE 8 102#define TEMP_STRIDE 8 103//#define NUM_BLOCKS_AT_ONCE 16 //not used yet 104 105#if ARCH_X86 106DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL; 107DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL; 108DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL; 109DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL; 110DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL; 111DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL; 112DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL; 113DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL; 114#endif 115 116DECLARE_ASM_CONST(8, int, deringThreshold)= 20; 117 118 119static struct PPFilter filters[]= 120{ 121 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK}, 122 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK}, 123/* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER}, 124 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/ 125 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER}, 126 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER}, 127 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK}, 128 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK}, 129 {"dr", "dering", 1, 5, 6, DERING}, 130 {"al", "autolevels", 0, 1, 2, LEVEL_FIX}, 131 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER}, 132 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER}, 133 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER}, 134 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER}, 135 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER}, 136 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER}, 137 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER}, 138 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT}, 139 {NULL, NULL,0,0,0,0} //End Marker 140}; 141 142static const char *replaceTable[]= 143{ 144 "default", "hb:a,vb:a,dr:a", 145 "de", "hb:a,vb:a,dr:a", 146 "fast", "h1:a,v1:a,dr:a", 147 "fa", "h1:a,v1:a,dr:a", 148 "ac", "ha:a:128:7,va:a,dr:a", 149 NULL //End Marker 150}; 151 152 153#if ARCH_X86 154static inline void prefetchnta(void *p) 155{ 156 __asm__ volatile( "prefetchnta (%0)\n\t" 157 : : "r" (p) 158 ); 159} 160 161static inline void prefetcht0(void *p) 162{ 163 __asm__ volatile( "prefetcht0 (%0)\n\t" 164 : : "r" (p) 165 ); 166} 167 168static inline void prefetcht1(void *p) 169{ 170 __asm__ volatile( "prefetcht1 (%0)\n\t" 171 : : "r" (p) 172 ); 173} 174 175static inline void prefetcht2(void *p) 176{ 177 __asm__ volatile( "prefetcht2 (%0)\n\t" 178 : : "r" (p) 179 ); 180} 181#endif 182 183/* The horizontal functions exist only in C because the MMX 184 * code is faster with vertical filters and transposing. */ 185 186/** 187 * Check if the given 8x8 Block is mostly "flat" 188 */ 189static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c) 190{ 191 int numEq= 0; 192 int y; 193 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 194 const int dcThreshold= dcOffset*2 + 1; 195 196 for(y=0; y<BLOCK_SIZE; y++){ 197 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++; 198 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++; 199 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++; 200 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++; 201 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++; 202 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++; 203 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++; 204 src+= stride; 205 } 206 return numEq > c->ppMode.flatnessThreshold; 207} 208 209/** 210 * Check if the middle 8x8 Block in the given 8x16 block is flat 211 */ 212static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c) 213{ 214 int numEq= 0; 215 int y; 216 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 217 const int dcThreshold= dcOffset*2 + 1; 218 219 src+= stride*4; // src points to begin of the 8x8 Block 220 for(y=0; y<BLOCK_SIZE-1; y++){ 221 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++; 222 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++; 223 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++; 224 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++; 225 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++; 226 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++; 227 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++; 228 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++; 229 src+= stride; 230 } 231 return numEq > c->ppMode.flatnessThreshold; 232} 233 234static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP) 235{ 236 int i; 237#if 1 238 for(i=0; i<2; i++){ 239 if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0; 240 src += stride; 241 if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0; 242 src += stride; 243 if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0; 244 src += stride; 245 if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0; 246 src += stride; 247 } 248#else 249 for(i=0; i<8; i++){ 250 if((unsigned)(src[0] - src[7] + 2*QP) > 4*QP) return 0; 251 src += stride; 252 } 253#endif 254 return 1; 255} 256 257static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP) 258{ 259#if 1 260#if 1 261 int x; 262 src+= stride*4; 263 for(x=0; x<BLOCK_SIZE; x+=4){ 264 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0; 265 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0; 266 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0; 267 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0; 268 } 269#else 270 int x; 271 src+= stride*3; 272 for(x=0; x<BLOCK_SIZE; x++){ 273 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0; 274 } 275#endif 276 return 1; 277#else 278 int x; 279 src+= stride*4; 280 for(x=0; x<BLOCK_SIZE; x++){ 281 int min=255; 282 int max=0; 283 int y; 284 for(y=0; y<8; y++){ 285 int v= src[x + y*stride]; 286 if(v>max) max=v; 287 if(v<min) min=v; 288 } 289 if(max-min > 2*QP) return 0; 290 } 291 return 1; 292#endif 293} 294 295static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c) 296{ 297 if( isHorizDC_C(src, stride, c) ){ 298 if( isHorizMinMaxOk_C(src, stride, c->QP) ) 299 return 1; 300 else 301 return 0; 302 }else{ 303 return 2; 304 } 305} 306 307static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c) 308{ 309 if( isVertDC_C(src, stride, c) ){ 310 if( isVertMinMaxOk_C(src, stride, c->QP) ) 311 return 1; 312 else 313 return 0; 314 }else{ 315 return 2; 316 } 317} 318 319static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c) 320{ 321 int y; 322 for(y=0; y<BLOCK_SIZE; y++){ 323 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]); 324 325 if(FFABS(middleEnergy) < 8*c->QP){ 326 const int q=(dst[3] - dst[4])/2; 327 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); 328 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); 329 330 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 331 d= FFMAX(d, 0); 332 333 d= (5*d + 32) >> 6; 334 d*= FFSIGN(-middleEnergy); 335 336 if(q>0) 337 { 338 d= d<0 ? 0 : d; 339 d= d>q ? q : d; 340 } 341 else 342 { 343 d= d>0 ? 0 : d; 344 d= d<q ? q : d; 345 } 346 347 dst[3]-= d; 348 dst[4]+= d; 349 } 350 dst+= stride; 351 } 352} 353 354/** 355 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block) 356 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) 357 */ 358static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c) 359{ 360 int y; 361 for(y=0; y<BLOCK_SIZE; y++){ 362 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0]; 363 const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7]; 364 365 int sums[10]; 366 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4; 367 sums[1] = sums[0] - first + dst[3]; 368 sums[2] = sums[1] - first + dst[4]; 369 sums[3] = sums[2] - first + dst[5]; 370 sums[4] = sums[3] - first + dst[6]; 371 sums[5] = sums[4] - dst[0] + dst[7]; 372 sums[6] = sums[5] - dst[1] + last; 373 sums[7] = sums[6] - dst[2] + last; 374 sums[8] = sums[7] - dst[3] + last; 375 sums[9] = sums[8] - dst[4] + last; 376 377 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4; 378 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4; 379 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4; 380 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4; 381 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4; 382 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4; 383 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4; 384 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4; 385 386 dst+= stride; 387 } 388} 389 390/** 391 * Experimental Filter 1 (Horizontal) 392 * will not damage linear gradients 393 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter 394 * can only smooth blocks at the expected locations (it cannot smooth them if they did move) 395 * MMX2 version does correct clipping C version does not 396 * not identical with the vertical one 397 */ 398static inline void horizX1Filter(uint8_t *src, int stride, int QP) 399{ 400 int y; 401 static uint64_t *lut= NULL; 402 if(lut==NULL) 403 { 404 int i; 405 lut = av_malloc(256*8); 406 for(i=0; i<256; i++) 407 { 408 int v= i < 128 ? 2*i : 2*(i-256); 409/* 410//Simulate 112242211 9-Tap filter 411 uint64_t a= (v/16) & 0xFF; 412 uint64_t b= (v/8) & 0xFF; 413 uint64_t c= (v/4) & 0xFF; 414 uint64_t d= (3*v/8) & 0xFF; 415*/ 416//Simulate piecewise linear interpolation 417 uint64_t a= (v/16) & 0xFF; 418 uint64_t b= (v*3/16) & 0xFF; 419 uint64_t c= (v*5/16) & 0xFF; 420 uint64_t d= (7*v/16) & 0xFF; 421 uint64_t A= (0x100 - a)&0xFF; 422 uint64_t B= (0x100 - b)&0xFF; 423 uint64_t C= (0x100 - c)&0xFF; 424 uint64_t D= (0x100 - c)&0xFF; 425 426 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | 427 (D<<24) | (C<<16) | (B<<8) | (A); 428 //lut[i] = (v<<32) | (v<<24); 429 } 430 } 431 432 for(y=0; y<BLOCK_SIZE; y++){ 433 int a= src[1] - src[2]; 434 int b= src[3] - src[4]; 435 int c= src[5] - src[6]; 436 437 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0); 438 439 if(d < QP){ 440 int v = d * FFSIGN(-b); 441 442 src[1] +=v/8; 443 src[2] +=v/4; 444 src[3] +=3*v/8; 445 src[4] -=3*v/8; 446 src[5] -=v/4; 447 src[6] -=v/8; 448 } 449 src+=stride; 450 } 451} 452 453/** 454 * accurate deblock filter 455 */ 456static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){ 457 int y; 458 const int QP= c->QP; 459 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 460 const int dcThreshold= dcOffset*2 + 1; 461//START_TIMER 462 src+= step*4; // src points to begin of the 8x8 Block 463 for(y=0; y<8; y++){ 464 int numEq= 0; 465 466 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++; 467 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++; 468 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++; 469 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++; 470 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++; 471 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++; 472 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++; 473 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++; 474 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++; 475 if(numEq > c->ppMode.flatnessThreshold){ 476 int min, max, x; 477 478 if(src[0] > src[step]){ 479 max= src[0]; 480 min= src[step]; 481 }else{ 482 max= src[step]; 483 min= src[0]; 484 } 485 for(x=2; x<8; x+=2){ 486 if(src[x*step] > src[(x+1)*step]){ 487 if(src[x *step] > max) max= src[ x *step]; 488 if(src[(x+1)*step] < min) min= src[(x+1)*step]; 489 }else{ 490 if(src[(x+1)*step] > max) max= src[(x+1)*step]; 491 if(src[ x *step] < min) min= src[ x *step]; 492 } 493 } 494 if(max-min < 2*QP){ 495 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0]; 496 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step]; 497 498 int sums[10]; 499 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4; 500 sums[1] = sums[0] - first + src[3*step]; 501 sums[2] = sums[1] - first + src[4*step]; 502 sums[3] = sums[2] - first + src[5*step]; 503 sums[4] = sums[3] - first + src[6*step]; 504 sums[5] = sums[4] - src[0*step] + src[7*step]; 505 sums[6] = sums[5] - src[1*step] + last; 506 sums[7] = sums[6] - src[2*step] + last; 507 sums[8] = sums[7] - src[3*step] + last; 508 sums[9] = sums[8] - src[4*step] + last; 509 510 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4; 511 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4; 512 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4; 513 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4; 514 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4; 515 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4; 516 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4; 517 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4; 518 } 519 }else{ 520 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]); 521 522 if(FFABS(middleEnergy) < 8*QP){ 523 const int q=(src[3*step] - src[4*step])/2; 524 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]); 525 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]); 526 527 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 528 d= FFMAX(d, 0); 529 530 d= (5*d + 32) >> 6; 531 d*= FFSIGN(-middleEnergy); 532 533 if(q>0){ 534 d= d<0 ? 0 : d; 535 d= d>q ? q : d; 536 }else{ 537 d= d>0 ? 0 : d; 538 d= d<q ? q : d; 539 } 540 541 src[3*step]-= d; 542 src[4*step]+= d; 543 } 544 } 545 546 src += stride; 547 } 548/*if(step==16){ 549 STOP_TIMER("step16") 550}else{ 551 STOP_TIMER("stepX") 552}*/ 553} 554 555//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one 556//Plain C versions 557#if !(HAVE_MMX || HAVE_ALTIVEC) || CONFIG_RUNTIME_CPUDETECT 558#define COMPILE_C 559#endif 560 561#if HAVE_ALTIVEC 562#define COMPILE_ALTIVEC 563#endif //HAVE_ALTIVEC 564 565#if ARCH_X86 566 567#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT 568#define COMPILE_MMX 569#endif 570 571#if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT 572#define COMPILE_MMX2 573#endif 574 575#if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT 576#define COMPILE_3DNOW 577#endif 578#endif /* ARCH_X86 */ 579 580#undef HAVE_MMX 581#define HAVE_MMX 0 582#undef HAVE_MMX2 583#define HAVE_MMX2 0 584#undef HAVE_AMD3DNOW 585#define HAVE_AMD3DNOW 0 586#undef HAVE_ALTIVEC 587#define HAVE_ALTIVEC 0 588 589#ifdef COMPILE_C 590#define RENAME(a) a ## _C 591#include "postprocess_template.c" 592#endif 593 594#ifdef COMPILE_ALTIVEC 595#undef RENAME 596#undef HAVE_ALTIVEC 597#define HAVE_ALTIVEC 1 598#define RENAME(a) a ## _altivec 599#include "postprocess_altivec_template.c" 600#include "postprocess_template.c" 601#endif 602 603//MMX versions 604#ifdef COMPILE_MMX 605#undef RENAME 606#undef HAVE_MMX 607#define HAVE_MMX 1 608#define RENAME(a) a ## _MMX 609#include "postprocess_template.c" 610#endif 611 612//MMX2 versions 613#ifdef COMPILE_MMX2 614#undef RENAME 615#undef HAVE_MMX 616#undef HAVE_MMX2 617#define HAVE_MMX 1 618#define HAVE_MMX2 1 619#define RENAME(a) a ## _MMX2 620#include "postprocess_template.c" 621#endif 622 623//3DNOW versions 624#ifdef COMPILE_3DNOW 625#undef RENAME 626#undef HAVE_MMX 627#undef HAVE_MMX2 628#undef HAVE_AMD3DNOW 629#define HAVE_MMX 1 630#define HAVE_MMX2 0 631#define HAVE_AMD3DNOW 1 632#define RENAME(a) a ## _3DNow 633#include "postprocess_template.c" 634#endif 635 636// minor note: the HAVE_xyz is messed up after that line so do not use it. 637 638static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 639 const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc) 640{ 641 PPContext *c= (PPContext *)vc; 642 PPMode *ppMode= (PPMode *)vm; 643 c->ppMode= *ppMode; //FIXME 644 645 // Using ifs here as they are faster than function pointers although the 646 // difference would not be measurable here but it is much better because 647 // someone might exchange the CPU whithout restarting MPlayer ;) 648#if CONFIG_RUNTIME_CPUDETECT 649#if ARCH_X86 650 // ordered per speed fastest first 651 if(c->cpuCaps & PP_CPU_CAPS_MMX2) 652 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 653 else if(c->cpuCaps & PP_CPU_CAPS_3DNOW) 654 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 655 else if(c->cpuCaps & PP_CPU_CAPS_MMX) 656 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 657 else 658 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 659#else 660#if HAVE_ALTIVEC 661 if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC) 662 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 663 else 664#endif 665 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 666#endif 667#else //CONFIG_RUNTIME_CPUDETECT 668#if HAVE_MMX2 669 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 670#elif HAVE_AMD3DNOW 671 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 672#elif HAVE_MMX 673 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 674#elif HAVE_ALTIVEC 675 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 676#else 677 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 678#endif 679#endif //!CONFIG_RUNTIME_CPUDETECT 680} 681 682//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 683// QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode); 684 685/* -pp Command line Help 686*/ 687#if LIBPOSTPROC_VERSION_INT < (52<<16) 688const char *const pp_help= 689#else 690const char pp_help[] = 691#endif 692"Available postprocessing filters:\n" 693"Filters Options\n" 694"short long name short long option Description\n" 695"* * a autoq CPU power dependent enabler\n" 696" c chrom chrominance filtering enabled\n" 697" y nochrom chrominance filtering disabled\n" 698" n noluma luma filtering disabled\n" 699"hb hdeblock (2 threshold) horizontal deblocking filter\n" 700" 1. difference factor: default=32, higher -> more deblocking\n" 701" 2. flatness threshold: default=39, lower -> more deblocking\n" 702" the h & v deblocking filters share these\n" 703" so you can't set different thresholds for h / v\n" 704"vb vdeblock (2 threshold) vertical deblocking filter\n" 705"ha hadeblock (2 threshold) horizontal deblocking filter\n" 706"va vadeblock (2 threshold) vertical deblocking filter\n" 707"h1 x1hdeblock experimental h deblock filter 1\n" 708"v1 x1vdeblock experimental v deblock filter 1\n" 709"dr dering deringing filter\n" 710"al autolevels automatic brightness / contrast\n" 711" f fullyrange stretch luminance to (0..255)\n" 712"lb linblenddeint linear blend deinterlacer\n" 713"li linipoldeint linear interpolating deinterlace\n" 714"ci cubicipoldeint cubic interpolating deinterlacer\n" 715"md mediandeint median deinterlacer\n" 716"fd ffmpegdeint ffmpeg deinterlacer\n" 717"l5 lowpass5 FIR lowpass deinterlacer\n" 718"de default hb:a,vb:a,dr:a\n" 719"fa fast h1:a,v1:a,dr:a\n" 720"ac ha:a:128:7,va:a,dr:a\n" 721"tn tmpnoise (3 threshold) temporal noise reducer\n" 722" 1. <= 2. <= 3. larger -> stronger filtering\n" 723"fq forceQuant <quantizer> force quantizer\n" 724"Usage:\n" 725"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n" 726"long form example:\n" 727"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n" 728"short form example:\n" 729"vb:a/hb:a/lb de,-vb\n" 730"more examples:\n" 731"tn:64:128:256\n" 732"\n" 733; 734 735pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality) 736{ 737 char temp[GET_MODE_BUFFER_SIZE]; 738 char *p= temp; 739 static const char filterDelimiters[] = ",/"; 740 static const char optionDelimiters[] = ":"; 741 struct PPMode *ppMode; 742 char *filterToken; 743 744 ppMode= av_malloc(sizeof(PPMode)); 745 746 ppMode->lumMode= 0; 747 ppMode->chromMode= 0; 748 ppMode->maxTmpNoise[0]= 700; 749 ppMode->maxTmpNoise[1]= 1500; 750 ppMode->maxTmpNoise[2]= 3000; 751 ppMode->maxAllowedY= 234; 752 ppMode->minAllowedY= 16; 753 ppMode->baseDcDiff= 256/8; 754 ppMode->flatnessThreshold= 56-16-1; 755 ppMode->maxClippedThreshold= 0.01; 756 ppMode->error=0; 757 758 strncpy(temp, name, GET_MODE_BUFFER_SIZE); 759 760 av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name); 761 762 for(;;){ 763 char *filterName; 764 int q= 1000000; //PP_QUALITY_MAX; 765 int chrom=-1; 766 int luma=-1; 767 char *option; 768 char *options[OPTIONS_ARRAY_SIZE]; 769 int i; 770 int filterNameOk=0; 771 int numOfUnknownOptions=0; 772 int enable=1; //does the user want us to enabled or disabled the filter 773 774 filterToken= strtok(p, filterDelimiters); 775 if(filterToken == NULL) break; 776 p+= strlen(filterToken) + 1; // p points to next filterToken 777 filterName= strtok(filterToken, optionDelimiters); 778 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName); 779 780 if(*filterName == '-'){ 781 enable=0; 782 filterName++; 783 } 784 785 for(;;){ //for all options 786 option= strtok(NULL, optionDelimiters); 787 if(option == NULL) break; 788 789 av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option); 790 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality; 791 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0; 792 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1; 793 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0; 794 else{ 795 options[numOfUnknownOptions] = option; 796 numOfUnknownOptions++; 797 } 798 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break; 799 } 800 options[numOfUnknownOptions] = NULL; 801 802 /* replace stuff from the replace Table */ 803 for(i=0; replaceTable[2*i]!=NULL; i++){ 804 if(!strcmp(replaceTable[2*i], filterName)){ 805 int newlen= strlen(replaceTable[2*i + 1]); 806 int plen; 807 int spaceLeft; 808 809 if(p==NULL) p= temp, *p=0; //last filter 810 else p--, *p=','; //not last filter 811 812 plen= strlen(p); 813 spaceLeft= p - temp + plen; 814 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE){ 815 ppMode->error++; 816 break; 817 } 818 memmove(p + newlen, p, plen+1); 819 memcpy(p, replaceTable[2*i + 1], newlen); 820 filterNameOk=1; 821 } 822 } 823 824 for(i=0; filters[i].shortName!=NULL; i++){ 825 if( !strcmp(filters[i].longName, filterName) 826 || !strcmp(filters[i].shortName, filterName)){ 827 ppMode->lumMode &= ~filters[i].mask; 828 ppMode->chromMode &= ~filters[i].mask; 829 830 filterNameOk=1; 831 if(!enable) break; // user wants to disable it 832 833 if(q >= filters[i].minLumQuality && luma) 834 ppMode->lumMode|= filters[i].mask; 835 if(chrom==1 || (chrom==-1 && filters[i].chromDefault)) 836 if(q >= filters[i].minChromQuality) 837 ppMode->chromMode|= filters[i].mask; 838 839 if(filters[i].mask == LEVEL_FIX){ 840 int o; 841 ppMode->minAllowedY= 16; 842 ppMode->maxAllowedY= 234; 843 for(o=0; options[o]!=NULL; o++){ 844 if( !strcmp(options[o],"fullyrange") 845 ||!strcmp(options[o],"f")){ 846 ppMode->minAllowedY= 0; 847 ppMode->maxAllowedY= 255; 848 numOfUnknownOptions--; 849 } 850 } 851 } 852 else if(filters[i].mask == TEMP_NOISE_FILTER) 853 { 854 int o; 855 int numOfNoises=0; 856 857 for(o=0; options[o]!=NULL; o++){ 858 char *tail; 859 ppMode->maxTmpNoise[numOfNoises]= 860 strtol(options[o], &tail, 0); 861 if(tail!=options[o]){ 862 numOfNoises++; 863 numOfUnknownOptions--; 864 if(numOfNoises >= 3) break; 865 } 866 } 867 } 868 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK 869 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){ 870 int o; 871 872 for(o=0; options[o]!=NULL && o<2; o++){ 873 char *tail; 874 int val= strtol(options[o], &tail, 0); 875 if(tail==options[o]) break; 876 877 numOfUnknownOptions--; 878 if(o==0) ppMode->baseDcDiff= val; 879 else ppMode->flatnessThreshold= val; 880 } 881 } 882 else if(filters[i].mask == FORCE_QUANT){ 883 int o; 884 ppMode->forcedQuant= 15; 885 886 for(o=0; options[o]!=NULL && o<1; o++){ 887 char *tail; 888 int val= strtol(options[o], &tail, 0); 889 if(tail==options[o]) break; 890 891 numOfUnknownOptions--; 892 ppMode->forcedQuant= val; 893 } 894 } 895 } 896 } 897 if(!filterNameOk) ppMode->error++; 898 ppMode->error += numOfUnknownOptions; 899 } 900 901 av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode); 902 if(ppMode->error){ 903 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name); 904 av_free(ppMode); 905 return NULL; 906 } 907 return ppMode; 908} 909 910void pp_free_mode(pp_mode *mode){ 911 av_free(mode); 912} 913 914static void reallocAlign(void **p, int alignment, int size){ 915 av_free(*p); 916 *p= av_mallocz(size); 917} 918 919static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){ 920 int mbWidth = (width+15)>>4; 921 int mbHeight= (height+15)>>4; 922 int i; 923 924 c->stride= stride; 925 c->qpStride= qpStride; 926 927 reallocAlign((void **)&c->tempDst, 8, stride*24); 928 reallocAlign((void **)&c->tempSrc, 8, stride*24); 929 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8); 930 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t)); 931 for(i=0; i<256; i++) 932 c->yHistogram[i]= width*height/64*15/256; 933 934 for(i=0; i<3; i++){ 935 //Note: The +17*1024 is just there so i do not have to worry about r/w over the end. 936 reallocAlign((void **)&c->tempBlurred[i], 8, stride*mbHeight*16 + 17*1024); 937 reallocAlign((void **)&c->tempBlurredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size 938 } 939 940 reallocAlign((void **)&c->deintTemp, 8, 2*width+32); 941 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T)); 942 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T)); 943 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T)); 944} 945 946static const char * context_to_name(void * ptr) { 947 return "postproc"; 948} 949 950static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL }; 951 952pp_context *pp_get_context(int width, int height, int cpuCaps){ 953 PPContext *c= av_malloc(sizeof(PPContext)); 954 int stride= (width+15)&(~15); //assumed / will realloc if needed 955 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed 956 957 memset(c, 0, sizeof(PPContext)); 958 c->av_class = &av_codec_context_class; 959 c->cpuCaps= cpuCaps; 960 if(cpuCaps&PP_FORMAT){ 961 c->hChromaSubSample= cpuCaps&0x3; 962 c->vChromaSubSample= (cpuCaps>>4)&0x3; 963 }else{ 964 c->hChromaSubSample= 1; 965 c->vChromaSubSample= 1; 966 } 967 968 reallocBuffers(c, width, height, stride, qpStride); 969 970 c->frameNum=-1; 971 972 return c; 973} 974 975void pp_free_context(void *vc){ 976 PPContext *c = (PPContext*)vc; 977 int i; 978 979 for(i=0; i<3; i++) av_free(c->tempBlurred[i]); 980 for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]); 981 982 av_free(c->tempBlocks); 983 av_free(c->yHistogram); 984 av_free(c->tempDst); 985 av_free(c->tempSrc); 986 av_free(c->deintTemp); 987 av_free(c->stdQPTable); 988 av_free(c->nonBQPTable); 989 av_free(c->forcedQPTable); 990 991 memset(c, 0, sizeof(PPContext)); 992 993 av_free(c); 994} 995 996void pp_postprocess(const uint8_t * src[3], const int srcStride[3], 997 uint8_t * dst[3], const int dstStride[3], 998 int width, int height, 999 const QP_STORE_T *QP_store, int QPStride, 1000 pp_mode *vm, void *vc, int pict_type) 1001{ 1002 int mbWidth = (width+15)>>4; 1003 int mbHeight= (height+15)>>4; 1004 PPMode *mode = (PPMode*)vm; 1005 PPContext *c = (PPContext*)vc; 1006 int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0])); 1007 int absQPStride = FFABS(QPStride); 1008 1009 // c->stride and c->QPStride are always positive 1010 if(c->stride < minStride || c->qpStride < absQPStride) 1011 reallocBuffers(c, width, height, 1012 FFMAX(minStride, c->stride), 1013 FFMAX(c->qpStride, absQPStride)); 1014 1015 if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)){ 1016 int i; 1017 QP_store= c->forcedQPTable; 1018 absQPStride = QPStride = 0; 1019 if(mode->lumMode & FORCE_QUANT) 1020 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant; 1021 else 1022 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1; 1023 } 1024 1025 if(pict_type & PP_PICT_TYPE_QP2){ 1026 int i; 1027 const int count= mbHeight * absQPStride; 1028 for(i=0; i<(count>>2); i++){ 1029 ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F; 1030 } 1031 for(i<<=2; i<count; i++){ 1032 c->stdQPTable[i] = QP_store[i]>>1; 1033 } 1034 QP_store= c->stdQPTable; 1035 QPStride= absQPStride; 1036 } 1037 1038 if(0){ 1039 int x,y; 1040 for(y=0; y<mbHeight; y++){ 1041 for(x=0; x<mbWidth; x++){ 1042 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]); 1043 } 1044 av_log(c, AV_LOG_INFO, "\n"); 1045 } 1046 av_log(c, AV_LOG_INFO, "\n"); 1047 } 1048 1049 if((pict_type&7)!=3){ 1050 if (QPStride >= 0){ 1051 int i; 1052 const int count= mbHeight * QPStride; 1053 for(i=0; i<(count>>2); i++){ 1054 ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F; 1055 } 1056 for(i<<=2; i<count; i++){ 1057 c->nonBQPTable[i] = QP_store[i] & 0x3F; 1058 } 1059 } else { 1060 int i,j; 1061 for(i=0; i<mbHeight; i++) { 1062 for(j=0; j<absQPStride; j++) { 1063 c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F; 1064 } 1065 } 1066 } 1067 } 1068 1069 av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n", 1070 mode->lumMode, mode->chromMode); 1071 1072 postProcess(src[0], srcStride[0], dst[0], dstStride[0], 1073 width, height, QP_store, QPStride, 0, mode, c); 1074 1075 width = (width )>>c->hChromaSubSample; 1076 height = (height)>>c->vChromaSubSample; 1077 1078 if(mode->chromMode){ 1079 postProcess(src[1], srcStride[1], dst[1], dstStride[1], 1080 width, height, QP_store, QPStride, 1, mode, c); 1081 postProcess(src[2], srcStride[2], dst[2], dstStride[2], 1082 width, height, QP_store, QPStride, 2, mode, c); 1083 } 1084 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){ 1085 linecpy(dst[1], src[1], height, srcStride[1]); 1086 linecpy(dst[2], src[2], height, srcStride[2]); 1087 }else{ 1088 int y; 1089 for(y=0; y<height; y++){ 1090 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width); 1091 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width); 1092 } 1093 } 1094} 1095 1096