1/* 2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at) 3 * 4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org> 5 * 6 * This file is part of Libav. 7 * 8 * Libav is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2 of the License, or 11 * (at your option) any later version. 12 * 13 * Libav is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 * 18 * You should have received a copy of the GNU General Public License 19 * along with Libav; if not, write to the Free Software 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21 */ 22 23/** 24 * @file 25 * postprocessing. 26 */ 27 28/* 29 C MMX MMX2 3DNow AltiVec 30isVertDC Ec Ec Ec 31isVertMinMaxOk Ec Ec Ec 32doVertLowPass E e e Ec 33doVertDefFilter Ec Ec e e Ec 34isHorizDC Ec Ec Ec 35isHorizMinMaxOk a E Ec 36doHorizLowPass E e e Ec 37doHorizDefFilter Ec Ec e e Ec 38do_a_deblock Ec E Ec E 39deRing E e e* Ecp 40Vertical RKAlgo1 E a a 41Horizontal RKAlgo1 a a 42Vertical X1# a E E 43Horizontal X1# a E E 44LinIpolDeinterlace e E E* 45CubicIpolDeinterlace a e e* 46LinBlendDeinterlace e E E* 47MedianDeinterlace# E Ec Ec 48TempDeNoiser# E e e Ec 49 50* I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work 51# more or less selfinvented filters so the exactness is not too meaningful 52E = Exact implementation 53e = almost exact implementation (slightly different rounding,...) 54a = alternative / approximate impl 55c = checked against the other implementations (-vo md5) 56p = partially optimized, still some work to do 57*/ 58 59/* 60TODO: 61reduce the time wasted on the mem transfer 62unroll stuff if instructions depend too much on the prior one 63move YScale thing to the end instead of fixing QP 64write a faster and higher quality deblocking filter :) 65make the mainloop more flexible (variable number of blocks at once 66 (the if/else stuff per block is slowing things down) 67compare the quality & speed of all filters 68split this huge file 69optimize c versions 70try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks 71... 72*/ 73 74//Changelog: use git log 75 76#include "config.h" 77#include "libavutil/avutil.h" 78#include <inttypes.h> 79#include <stdio.h> 80#include <stdlib.h> 81#include <string.h> 82//#undef HAVE_MMX2 83//#define HAVE_AMD3DNOW 84//#undef HAVE_MMX 85//#undef ARCH_X86 86//#define DEBUG_BRIGHTNESS 87#include "postprocess.h" 88#include "postprocess_internal.h" 89#include "libavutil/avstring.h" 90 91unsigned postproc_version(void) 92{ 93 return LIBPOSTPROC_VERSION_INT; 94} 95 96const char *postproc_configuration(void) 97{ 98 return LIBAV_CONFIGURATION; 99} 100 101const char *postproc_license(void) 102{ 103#define LICENSE_PREFIX "libpostproc license: " 104 return LICENSE_PREFIX LIBAV_LICENSE + sizeof(LICENSE_PREFIX) - 1; 105} 106 107#if HAVE_ALTIVEC_H 108#include <altivec.h> 109#endif 110 111#define GET_MODE_BUFFER_SIZE 500 112#define OPTIONS_ARRAY_SIZE 10 113#define BLOCK_SIZE 8 114#define TEMP_STRIDE 8 115//#define NUM_BLOCKS_AT_ONCE 16 //not used yet 116 117#if ARCH_X86 118DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL; 119DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL; 120DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL; 121DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL; 122DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL; 123DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL; 124DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL; 125DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL; 126#endif 127 128DECLARE_ASM_CONST(8, int, deringThreshold)= 20; 129 130 131static struct PPFilter filters[]= 132{ 133 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK}, 134 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK}, 135/* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER}, 136 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/ 137 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER}, 138 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER}, 139 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK}, 140 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK}, 141 {"dr", "dering", 1, 5, 6, DERING}, 142 {"al", "autolevels", 0, 1, 2, LEVEL_FIX}, 143 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER}, 144 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER}, 145 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER}, 146 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER}, 147 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER}, 148 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER}, 149 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER}, 150 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT}, 151 {NULL, NULL,0,0,0,0} //End Marker 152}; 153 154static const char *replaceTable[]= 155{ 156 "default", "hb:a,vb:a,dr:a", 157 "de", "hb:a,vb:a,dr:a", 158 "fast", "h1:a,v1:a,dr:a", 159 "fa", "h1:a,v1:a,dr:a", 160 "ac", "ha:a:128:7,va:a,dr:a", 161 NULL //End Marker 162}; 163 164 165#if ARCH_X86 166static inline void prefetchnta(void *p) 167{ 168 __asm__ volatile( "prefetchnta (%0)\n\t" 169 : : "r" (p) 170 ); 171} 172 173static inline void prefetcht0(void *p) 174{ 175 __asm__ volatile( "prefetcht0 (%0)\n\t" 176 : : "r" (p) 177 ); 178} 179 180static inline void prefetcht1(void *p) 181{ 182 __asm__ volatile( "prefetcht1 (%0)\n\t" 183 : : "r" (p) 184 ); 185} 186 187static inline void prefetcht2(void *p) 188{ 189 __asm__ volatile( "prefetcht2 (%0)\n\t" 190 : : "r" (p) 191 ); 192} 193#endif 194 195/* The horizontal functions exist only in C because the MMX 196 * code is faster with vertical filters and transposing. */ 197 198/** 199 * Check if the given 8x8 Block is mostly "flat" 200 */ 201static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c) 202{ 203 int numEq= 0; 204 int y; 205 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 206 const int dcThreshold= dcOffset*2 + 1; 207 208 for(y=0; y<BLOCK_SIZE; y++){ 209 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++; 210 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++; 211 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++; 212 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++; 213 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++; 214 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++; 215 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++; 216 src+= stride; 217 } 218 return numEq > c->ppMode.flatnessThreshold; 219} 220 221/** 222 * Check if the middle 8x8 Block in the given 8x16 block is flat 223 */ 224static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c) 225{ 226 int numEq= 0; 227 int y; 228 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 229 const int dcThreshold= dcOffset*2 + 1; 230 231 src+= stride*4; // src points to begin of the 8x8 Block 232 for(y=0; y<BLOCK_SIZE-1; y++){ 233 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++; 234 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++; 235 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++; 236 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++; 237 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++; 238 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++; 239 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++; 240 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++; 241 src+= stride; 242 } 243 return numEq > c->ppMode.flatnessThreshold; 244} 245 246static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP) 247{ 248 int i; 249 for(i=0; i<2; i++){ 250 if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0; 251 src += stride; 252 if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0; 253 src += stride; 254 if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0; 255 src += stride; 256 if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0; 257 src += stride; 258 } 259 return 1; 260} 261 262static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP) 263{ 264 int x; 265 src+= stride*4; 266 for(x=0; x<BLOCK_SIZE; x+=4){ 267 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0; 268 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0; 269 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0; 270 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0; 271 } 272 return 1; 273} 274 275static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c) 276{ 277 if( isHorizDC_C(src, stride, c) ){ 278 if( isHorizMinMaxOk_C(src, stride, c->QP) ) 279 return 1; 280 else 281 return 0; 282 }else{ 283 return 2; 284 } 285} 286 287static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c) 288{ 289 if( isVertDC_C(src, stride, c) ){ 290 if( isVertMinMaxOk_C(src, stride, c->QP) ) 291 return 1; 292 else 293 return 0; 294 }else{ 295 return 2; 296 } 297} 298 299static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c) 300{ 301 int y; 302 for(y=0; y<BLOCK_SIZE; y++){ 303 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]); 304 305 if(FFABS(middleEnergy) < 8*c->QP){ 306 const int q=(dst[3] - dst[4])/2; 307 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); 308 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); 309 310 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 311 d= FFMAX(d, 0); 312 313 d= (5*d + 32) >> 6; 314 d*= FFSIGN(-middleEnergy); 315 316 if(q>0) 317 { 318 d= d<0 ? 0 : d; 319 d= d>q ? q : d; 320 } 321 else 322 { 323 d= d>0 ? 0 : d; 324 d= d<q ? q : d; 325 } 326 327 dst[3]-= d; 328 dst[4]+= d; 329 } 330 dst+= stride; 331 } 332} 333 334/** 335 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block) 336 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) 337 */ 338static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c) 339{ 340 int y; 341 for(y=0; y<BLOCK_SIZE; y++){ 342 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0]; 343 const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7]; 344 345 int sums[10]; 346 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4; 347 sums[1] = sums[0] - first + dst[3]; 348 sums[2] = sums[1] - first + dst[4]; 349 sums[3] = sums[2] - first + dst[5]; 350 sums[4] = sums[3] - first + dst[6]; 351 sums[5] = sums[4] - dst[0] + dst[7]; 352 sums[6] = sums[5] - dst[1] + last; 353 sums[7] = sums[6] - dst[2] + last; 354 sums[8] = sums[7] - dst[3] + last; 355 sums[9] = sums[8] - dst[4] + last; 356 357 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4; 358 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4; 359 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4; 360 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4; 361 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4; 362 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4; 363 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4; 364 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4; 365 366 dst+= stride; 367 } 368} 369 370/** 371 * Experimental Filter 1 (Horizontal) 372 * will not damage linear gradients 373 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter 374 * can only smooth blocks at the expected locations (it cannot smooth them if they did move) 375 * MMX2 version does correct clipping C version does not 376 * not identical with the vertical one 377 */ 378static inline void horizX1Filter(uint8_t *src, int stride, int QP) 379{ 380 int y; 381 static uint64_t *lut= NULL; 382 if(lut==NULL) 383 { 384 int i; 385 lut = av_malloc(256*8); 386 for(i=0; i<256; i++) 387 { 388 int v= i < 128 ? 2*i : 2*(i-256); 389/* 390//Simulate 112242211 9-Tap filter 391 uint64_t a= (v/16) & 0xFF; 392 uint64_t b= (v/8) & 0xFF; 393 uint64_t c= (v/4) & 0xFF; 394 uint64_t d= (3*v/8) & 0xFF; 395*/ 396//Simulate piecewise linear interpolation 397 uint64_t a= (v/16) & 0xFF; 398 uint64_t b= (v*3/16) & 0xFF; 399 uint64_t c= (v*5/16) & 0xFF; 400 uint64_t d= (7*v/16) & 0xFF; 401 uint64_t A= (0x100 - a)&0xFF; 402 uint64_t B= (0x100 - b)&0xFF; 403 uint64_t C= (0x100 - c)&0xFF; 404 uint64_t D= (0x100 - c)&0xFF; 405 406 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | 407 (D<<24) | (C<<16) | (B<<8) | (A); 408 //lut[i] = (v<<32) | (v<<24); 409 } 410 } 411 412 for(y=0; y<BLOCK_SIZE; y++){ 413 int a= src[1] - src[2]; 414 int b= src[3] - src[4]; 415 int c= src[5] - src[6]; 416 417 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0); 418 419 if(d < QP){ 420 int v = d * FFSIGN(-b); 421 422 src[1] +=v/8; 423 src[2] +=v/4; 424 src[3] +=3*v/8; 425 src[4] -=3*v/8; 426 src[5] -=v/4; 427 src[6] -=v/8; 428 } 429 src+=stride; 430 } 431} 432 433/** 434 * accurate deblock filter 435 */ 436static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){ 437 int y; 438 const int QP= c->QP; 439 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; 440 const int dcThreshold= dcOffset*2 + 1; 441//START_TIMER 442 src+= step*4; // src points to begin of the 8x8 Block 443 for(y=0; y<8; y++){ 444 int numEq= 0; 445 446 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++; 447 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++; 448 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++; 449 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++; 450 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++; 451 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++; 452 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++; 453 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++; 454 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++; 455 if(numEq > c->ppMode.flatnessThreshold){ 456 int min, max, x; 457 458 if(src[0] > src[step]){ 459 max= src[0]; 460 min= src[step]; 461 }else{ 462 max= src[step]; 463 min= src[0]; 464 } 465 for(x=2; x<8; x+=2){ 466 if(src[x*step] > src[(x+1)*step]){ 467 if(src[x *step] > max) max= src[ x *step]; 468 if(src[(x+1)*step] < min) min= src[(x+1)*step]; 469 }else{ 470 if(src[(x+1)*step] > max) max= src[(x+1)*step]; 471 if(src[ x *step] < min) min= src[ x *step]; 472 } 473 } 474 if(max-min < 2*QP){ 475 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0]; 476 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step]; 477 478 int sums[10]; 479 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4; 480 sums[1] = sums[0] - first + src[3*step]; 481 sums[2] = sums[1] - first + src[4*step]; 482 sums[3] = sums[2] - first + src[5*step]; 483 sums[4] = sums[3] - first + src[6*step]; 484 sums[5] = sums[4] - src[0*step] + src[7*step]; 485 sums[6] = sums[5] - src[1*step] + last; 486 sums[7] = sums[6] - src[2*step] + last; 487 sums[8] = sums[7] - src[3*step] + last; 488 sums[9] = sums[8] - src[4*step] + last; 489 490 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4; 491 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4; 492 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4; 493 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4; 494 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4; 495 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4; 496 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4; 497 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4; 498 } 499 }else{ 500 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]); 501 502 if(FFABS(middleEnergy) < 8*QP){ 503 const int q=(src[3*step] - src[4*step])/2; 504 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]); 505 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]); 506 507 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 508 d= FFMAX(d, 0); 509 510 d= (5*d + 32) >> 6; 511 d*= FFSIGN(-middleEnergy); 512 513 if(q>0){ 514 d= d<0 ? 0 : d; 515 d= d>q ? q : d; 516 }else{ 517 d= d>0 ? 0 : d; 518 d= d<q ? q : d; 519 } 520 521 src[3*step]-= d; 522 src[4*step]+= d; 523 } 524 } 525 526 src += stride; 527 } 528/*if(step==16){ 529 STOP_TIMER("step16") 530}else{ 531 STOP_TIMER("stepX") 532}*/ 533} 534 535//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one 536//Plain C versions 537#if !(HAVE_MMX || HAVE_ALTIVEC) || CONFIG_RUNTIME_CPUDETECT 538#define COMPILE_C 539#endif 540 541#if HAVE_ALTIVEC 542#define COMPILE_ALTIVEC 543#endif //HAVE_ALTIVEC 544 545#if ARCH_X86 546 547#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT 548#define COMPILE_MMX 549#endif 550 551#if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT 552#define COMPILE_MMX2 553#endif 554 555#if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT 556#define COMPILE_3DNOW 557#endif 558#endif /* ARCH_X86 */ 559 560#undef HAVE_MMX 561#define HAVE_MMX 0 562#undef HAVE_MMX2 563#define HAVE_MMX2 0 564#undef HAVE_AMD3DNOW 565#define HAVE_AMD3DNOW 0 566#undef HAVE_ALTIVEC 567#define HAVE_ALTIVEC 0 568 569#ifdef COMPILE_C 570#define RENAME(a) a ## _C 571#include "postprocess_template.c" 572#endif 573 574#ifdef COMPILE_ALTIVEC 575#undef RENAME 576#undef HAVE_ALTIVEC 577#define HAVE_ALTIVEC 1 578#define RENAME(a) a ## _altivec 579#include "postprocess_altivec_template.c" 580#include "postprocess_template.c" 581#endif 582 583//MMX versions 584#ifdef COMPILE_MMX 585#undef RENAME 586#undef HAVE_MMX 587#define HAVE_MMX 1 588#define RENAME(a) a ## _MMX 589#include "postprocess_template.c" 590#endif 591 592//MMX2 versions 593#ifdef COMPILE_MMX2 594#undef RENAME 595#undef HAVE_MMX 596#undef HAVE_MMX2 597#define HAVE_MMX 1 598#define HAVE_MMX2 1 599#define RENAME(a) a ## _MMX2 600#include "postprocess_template.c" 601#endif 602 603//3DNOW versions 604#ifdef COMPILE_3DNOW 605#undef RENAME 606#undef HAVE_MMX 607#undef HAVE_MMX2 608#undef HAVE_AMD3DNOW 609#define HAVE_MMX 1 610#define HAVE_MMX2 0 611#define HAVE_AMD3DNOW 1 612#define RENAME(a) a ## _3DNow 613#include "postprocess_template.c" 614#endif 615 616// minor note: the HAVE_xyz is messed up after that line so do not use it. 617 618static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 619 const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc) 620{ 621 PPContext *c= (PPContext *)vc; 622 PPMode *ppMode= (PPMode *)vm; 623 c->ppMode= *ppMode; //FIXME 624 625 // Using ifs here as they are faster than function pointers although the 626 // difference would not be measurable here but it is much better because 627 // someone might exchange the CPU whithout restarting MPlayer ;) 628#if CONFIG_RUNTIME_CPUDETECT 629#if ARCH_X86 630 // ordered per speed fastest first 631 if(c->cpuCaps & PP_CPU_CAPS_MMX2) 632 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 633 else if(c->cpuCaps & PP_CPU_CAPS_3DNOW) 634 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 635 else if(c->cpuCaps & PP_CPU_CAPS_MMX) 636 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 637 else 638 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 639#else 640#if HAVE_ALTIVEC 641 if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC) 642 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 643 else 644#endif 645 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 646#endif 647#else /* CONFIG_RUNTIME_CPUDETECT */ 648#if HAVE_MMX2 649 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 650#elif HAVE_AMD3DNOW 651 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 652#elif HAVE_MMX 653 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 654#elif HAVE_ALTIVEC 655 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 656#else 657 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); 658#endif 659#endif /* !CONFIG_RUNTIME_CPUDETECT */ 660} 661 662//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 663// QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode); 664 665/* -pp Command line Help 666*/ 667const char pp_help[] = 668"Available postprocessing filters:\n" 669"Filters Options\n" 670"short long name short long option Description\n" 671"* * a autoq CPU power dependent enabler\n" 672" c chrom chrominance filtering enabled\n" 673" y nochrom chrominance filtering disabled\n" 674" n noluma luma filtering disabled\n" 675"hb hdeblock (2 threshold) horizontal deblocking filter\n" 676" 1. difference factor: default=32, higher -> more deblocking\n" 677" 2. flatness threshold: default=39, lower -> more deblocking\n" 678" the h & v deblocking filters share these\n" 679" so you can't set different thresholds for h / v\n" 680"vb vdeblock (2 threshold) vertical deblocking filter\n" 681"ha hadeblock (2 threshold) horizontal deblocking filter\n" 682"va vadeblock (2 threshold) vertical deblocking filter\n" 683"h1 x1hdeblock experimental h deblock filter 1\n" 684"v1 x1vdeblock experimental v deblock filter 1\n" 685"dr dering deringing filter\n" 686"al autolevels automatic brightness / contrast\n" 687" f fullyrange stretch luminance to (0..255)\n" 688"lb linblenddeint linear blend deinterlacer\n" 689"li linipoldeint linear interpolating deinterlace\n" 690"ci cubicipoldeint cubic interpolating deinterlacer\n" 691"md mediandeint median deinterlacer\n" 692"fd ffmpegdeint ffmpeg deinterlacer\n" 693"l5 lowpass5 FIR lowpass deinterlacer\n" 694"de default hb:a,vb:a,dr:a\n" 695"fa fast h1:a,v1:a,dr:a\n" 696"ac ha:a:128:7,va:a,dr:a\n" 697"tn tmpnoise (3 threshold) temporal noise reducer\n" 698" 1. <= 2. <= 3. larger -> stronger filtering\n" 699"fq forceQuant <quantizer> force quantizer\n" 700"Usage:\n" 701"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n" 702"long form example:\n" 703"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n" 704"short form example:\n" 705"vb:a/hb:a/lb de,-vb\n" 706"more examples:\n" 707"tn:64:128:256\n" 708"\n" 709; 710 711pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality) 712{ 713 char temp[GET_MODE_BUFFER_SIZE]; 714 char *p= temp; 715 static const char filterDelimiters[] = ",/"; 716 static const char optionDelimiters[] = ":"; 717 struct PPMode *ppMode; 718 char *filterToken; 719 720 ppMode= av_malloc(sizeof(PPMode)); 721 722 ppMode->lumMode= 0; 723 ppMode->chromMode= 0; 724 ppMode->maxTmpNoise[0]= 700; 725 ppMode->maxTmpNoise[1]= 1500; 726 ppMode->maxTmpNoise[2]= 3000; 727 ppMode->maxAllowedY= 234; 728 ppMode->minAllowedY= 16; 729 ppMode->baseDcDiff= 256/8; 730 ppMode->flatnessThreshold= 56-16-1; 731 ppMode->maxClippedThreshold= 0.01; 732 ppMode->error=0; 733 734 memset(temp, 0, GET_MODE_BUFFER_SIZE); 735 av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1); 736 737 av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name); 738 739 for(;;){ 740 char *filterName; 741 int q= 1000000; //PP_QUALITY_MAX; 742 int chrom=-1; 743 int luma=-1; 744 char *option; 745 char *options[OPTIONS_ARRAY_SIZE]; 746 int i; 747 int filterNameOk=0; 748 int numOfUnknownOptions=0; 749 int enable=1; //does the user want us to enabled or disabled the filter 750 751 filterToken= strtok(p, filterDelimiters); 752 if(filterToken == NULL) break; 753 p+= strlen(filterToken) + 1; // p points to next filterToken 754 filterName= strtok(filterToken, optionDelimiters); 755 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName); 756 757 if(*filterName == '-'){ 758 enable=0; 759 filterName++; 760 } 761 762 for(;;){ //for all options 763 option= strtok(NULL, optionDelimiters); 764 if(option == NULL) break; 765 766 av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option); 767 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality; 768 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0; 769 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1; 770 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0; 771 else{ 772 options[numOfUnknownOptions] = option; 773 numOfUnknownOptions++; 774 } 775 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break; 776 } 777 options[numOfUnknownOptions] = NULL; 778 779 /* replace stuff from the replace Table */ 780 for(i=0; replaceTable[2*i]!=NULL; i++){ 781 if(!strcmp(replaceTable[2*i], filterName)){ 782 int newlen= strlen(replaceTable[2*i + 1]); 783 int plen; 784 int spaceLeft; 785 786 if(p==NULL) p= temp, *p=0; //last filter 787 else p--, *p=','; //not last filter 788 789 plen= strlen(p); 790 spaceLeft= p - temp + plen; 791 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE - 1){ 792 ppMode->error++; 793 break; 794 } 795 memmove(p + newlen, p, plen+1); 796 memcpy(p, replaceTable[2*i + 1], newlen); 797 filterNameOk=1; 798 } 799 } 800 801 for(i=0; filters[i].shortName!=NULL; i++){ 802 if( !strcmp(filters[i].longName, filterName) 803 || !strcmp(filters[i].shortName, filterName)){ 804 ppMode->lumMode &= ~filters[i].mask; 805 ppMode->chromMode &= ~filters[i].mask; 806 807 filterNameOk=1; 808 if(!enable) break; // user wants to disable it 809 810 if(q >= filters[i].minLumQuality && luma) 811 ppMode->lumMode|= filters[i].mask; 812 if(chrom==1 || (chrom==-1 && filters[i].chromDefault)) 813 if(q >= filters[i].minChromQuality) 814 ppMode->chromMode|= filters[i].mask; 815 816 if(filters[i].mask == LEVEL_FIX){ 817 int o; 818 ppMode->minAllowedY= 16; 819 ppMode->maxAllowedY= 234; 820 for(o=0; options[o]!=NULL; o++){ 821 if( !strcmp(options[o],"fullyrange") 822 ||!strcmp(options[o],"f")){ 823 ppMode->minAllowedY= 0; 824 ppMode->maxAllowedY= 255; 825 numOfUnknownOptions--; 826 } 827 } 828 } 829 else if(filters[i].mask == TEMP_NOISE_FILTER) 830 { 831 int o; 832 int numOfNoises=0; 833 834 for(o=0; options[o]!=NULL; o++){ 835 char *tail; 836 ppMode->maxTmpNoise[numOfNoises]= 837 strtol(options[o], &tail, 0); 838 if(tail!=options[o]){ 839 numOfNoises++; 840 numOfUnknownOptions--; 841 if(numOfNoises >= 3) break; 842 } 843 } 844 } 845 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK 846 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){ 847 int o; 848 849 for(o=0; options[o]!=NULL && o<2; o++){ 850 char *tail; 851 int val= strtol(options[o], &tail, 0); 852 if(tail==options[o]) break; 853 854 numOfUnknownOptions--; 855 if(o==0) ppMode->baseDcDiff= val; 856 else ppMode->flatnessThreshold= val; 857 } 858 } 859 else if(filters[i].mask == FORCE_QUANT){ 860 int o; 861 ppMode->forcedQuant= 15; 862 863 for(o=0; options[o]!=NULL && o<1; o++){ 864 char *tail; 865 int val= strtol(options[o], &tail, 0); 866 if(tail==options[o]) break; 867 868 numOfUnknownOptions--; 869 ppMode->forcedQuant= val; 870 } 871 } 872 } 873 } 874 if(!filterNameOk) ppMode->error++; 875 ppMode->error += numOfUnknownOptions; 876 } 877 878 av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode); 879 if(ppMode->error){ 880 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name); 881 av_free(ppMode); 882 return NULL; 883 } 884 return ppMode; 885} 886 887void pp_free_mode(pp_mode *mode){ 888 av_free(mode); 889} 890 891static void reallocAlign(void **p, int alignment, int size){ 892 av_free(*p); 893 *p= av_mallocz(size); 894} 895 896static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){ 897 int mbWidth = (width+15)>>4; 898 int mbHeight= (height+15)>>4; 899 int i; 900 901 c->stride= stride; 902 c->qpStride= qpStride; 903 904 reallocAlign((void **)&c->tempDst, 8, stride*24); 905 reallocAlign((void **)&c->tempSrc, 8, stride*24); 906 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8); 907 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t)); 908 for(i=0; i<256; i++) 909 c->yHistogram[i]= width*height/64*15/256; 910 911 for(i=0; i<3; i++){ 912 //Note: The +17*1024 is just there so I do not have to worry about r/w over the end. 913 reallocAlign((void **)&c->tempBlurred[i], 8, stride*mbHeight*16 + 17*1024); 914 reallocAlign((void **)&c->tempBlurredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size 915 } 916 917 reallocAlign((void **)&c->deintTemp, 8, 2*width+32); 918 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T)); 919 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T)); 920 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T)); 921} 922 923static const char * context_to_name(void * ptr) { 924 return "postproc"; 925} 926 927static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL }; 928 929pp_context *pp_get_context(int width, int height, int cpuCaps){ 930 PPContext *c= av_malloc(sizeof(PPContext)); 931 int stride= FFALIGN(width, 16); //assumed / will realloc if needed 932 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed 933 934 memset(c, 0, sizeof(PPContext)); 935 c->av_class = &av_codec_context_class; 936 c->cpuCaps= cpuCaps; 937 if(cpuCaps&PP_FORMAT){ 938 c->hChromaSubSample= cpuCaps&0x3; 939 c->vChromaSubSample= (cpuCaps>>4)&0x3; 940 }else{ 941 c->hChromaSubSample= 1; 942 c->vChromaSubSample= 1; 943 } 944 945 reallocBuffers(c, width, height, stride, qpStride); 946 947 c->frameNum=-1; 948 949 return c; 950} 951 952void pp_free_context(void *vc){ 953 PPContext *c = (PPContext*)vc; 954 int i; 955 956 for(i=0; i<3; i++) av_free(c->tempBlurred[i]); 957 for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]); 958 959 av_free(c->tempBlocks); 960 av_free(c->yHistogram); 961 av_free(c->tempDst); 962 av_free(c->tempSrc); 963 av_free(c->deintTemp); 964 av_free(c->stdQPTable); 965 av_free(c->nonBQPTable); 966 av_free(c->forcedQPTable); 967 968 memset(c, 0, sizeof(PPContext)); 969 970 av_free(c); 971} 972 973void pp_postprocess(const uint8_t * src[3], const int srcStride[3], 974 uint8_t * dst[3], const int dstStride[3], 975 int width, int height, 976 const QP_STORE_T *QP_store, int QPStride, 977 pp_mode *vm, void *vc, int pict_type) 978{ 979 int mbWidth = (width+15)>>4; 980 int mbHeight= (height+15)>>4; 981 PPMode *mode = (PPMode*)vm; 982 PPContext *c = (PPContext*)vc; 983 int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0])); 984 int absQPStride = FFABS(QPStride); 985 986 // c->stride and c->QPStride are always positive 987 if(c->stride < minStride || c->qpStride < absQPStride) 988 reallocBuffers(c, width, height, 989 FFMAX(minStride, c->stride), 990 FFMAX(c->qpStride, absQPStride)); 991 992 if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)){ 993 int i; 994 QP_store= c->forcedQPTable; 995 absQPStride = QPStride = 0; 996 if(mode->lumMode & FORCE_QUANT) 997 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant; 998 else 999 for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1; 1000 } 1001 1002 if(pict_type & PP_PICT_TYPE_QP2){ 1003 int i; 1004 const int count= mbHeight * absQPStride; 1005 for(i=0; i<(count>>2); i++){ 1006 ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F; 1007 } 1008 for(i<<=2; i<count; i++){ 1009 c->stdQPTable[i] = QP_store[i]>>1; 1010 } 1011 QP_store= c->stdQPTable; 1012 QPStride= absQPStride; 1013 } 1014 1015 if(0){ 1016 int x,y; 1017 for(y=0; y<mbHeight; y++){ 1018 for(x=0; x<mbWidth; x++){ 1019 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]); 1020 } 1021 av_log(c, AV_LOG_INFO, "\n"); 1022 } 1023 av_log(c, AV_LOG_INFO, "\n"); 1024 } 1025 1026 if((pict_type&7)!=3){ 1027 if (QPStride >= 0){ 1028 int i; 1029 const int count= mbHeight * QPStride; 1030 for(i=0; i<(count>>2); i++){ 1031 ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F; 1032 } 1033 for(i<<=2; i<count; i++){ 1034 c->nonBQPTable[i] = QP_store[i] & 0x3F; 1035 } 1036 } else { 1037 int i,j; 1038 for(i=0; i<mbHeight; i++) { 1039 for(j=0; j<absQPStride; j++) { 1040 c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F; 1041 } 1042 } 1043 } 1044 } 1045 1046 av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n", 1047 mode->lumMode, mode->chromMode); 1048 1049 postProcess(src[0], srcStride[0], dst[0], dstStride[0], 1050 width, height, QP_store, QPStride, 0, mode, c); 1051 1052 width = (width )>>c->hChromaSubSample; 1053 height = (height)>>c->vChromaSubSample; 1054 1055 if(mode->chromMode){ 1056 postProcess(src[1], srcStride[1], dst[1], dstStride[1], 1057 width, height, QP_store, QPStride, 1, mode, c); 1058 postProcess(src[2], srcStride[2], dst[2], dstStride[2], 1059 width, height, QP_store, QPStride, 2, mode, c); 1060 } 1061 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){ 1062 linecpy(dst[1], src[1], height, srcStride[1]); 1063 linecpy(dst[2], src[2], height, srcStride[2]); 1064 }else{ 1065 int y; 1066 for(y=0; y<height; y++){ 1067 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width); 1068 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width); 1069 } 1070 } 1071} 1072