/*
 * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * mmx/mmx2/3dnow postprocess code.
 */

#include "libavutil/x86/asm.h"

/* Exactly one TEMPLATE_PP_* macro should be defined (to 1) by the file that
 * includes this template. The blocks below define the dependencies of that
 * variant to 1 as well (e.g. MMX2 implies MMX) and define every other
 * TEMPLATE_PP_* to 0, so that plain "#if TEMPLATE_PP_x" tests work.
 * Every TEMPLATE_PP_* needs to be undefined again at the end of the template.
 *
 * RENAME(a) appends the variant suffix to a function name so the same
 * template can be instantiated once per instruction set.
 */

#ifdef TEMPLATE_PP_C
# define RENAME(a) a ## _C
#else
# define TEMPLATE_PP_C 0
#endif

#ifdef TEMPLATE_PP_ALTIVEC
# define RENAME(a) a ## _altivec
#else
# define TEMPLATE_PP_ALTIVEC 0
#endif

#ifdef TEMPLATE_PP_MMX
# define RENAME(a) a ## _MMX
#else
# define TEMPLATE_PP_MMX 0
#endif

/* MMXEXT (MMX2) implies MMX. */
#ifdef TEMPLATE_PP_MMXEXT
# undef TEMPLATE_PP_MMX
# define TEMPLATE_PP_MMX 1
# define RENAME(a) a ## _MMX2
#else
# define TEMPLATE_PP_MMXEXT 0
#endif

/* 3DNow! implies MMX. */
#ifdef TEMPLATE_PP_3DNOW
# undef TEMPLATE_PP_MMX
# define TEMPLATE_PP_MMX 1
# define RENAME(a) a ## _3DNow
#else
# define TEMPLATE_PP_3DNOW 0
#endif

/* SSE2 implies both MMX and MMXEXT. */
#ifdef TEMPLATE_PP_SSE2
# undef TEMPLATE_PP_MMX
# define TEMPLATE_PP_MMX 1
# undef TEMPLATE_PP_MMXEXT
# define TEMPLATE_PP_MMXEXT 1
# define RENAME(a) a ## _SSE2
#else
# define TEMPLATE_PP_SSE2 0
#endif

/* Drop the helper macros left over from a previous inclusion of this
 * template before redefining them for the current variant. */
#undef REAL_PAVGB
#undef PAVGB
#undef PMINUB
#undef PMAXUB

/* PAVGB(a,b): emit the packed unsigned byte average instruction, result in b.
 * REAL_PAVGB is only defined for MMXEXT and 3DNow!; PAVGB must therefore not
 * be used in code compiled for other variants.
 * The extra macro level is presumably there so that the arguments are
 * macro-expanded before '#' stringifies them. */
#if TEMPLATE_PP_MMXEXT
#define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif TEMPLATE_PP_3DNOW
#define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
#define PAVGB(a,b) REAL_PAVGB(a,b)

/* PMINUB(x, y, t): y = min(x, y) on unsigned bytes; t is a scratch register
 * that is only touched by the plain MMX fallback.
 * Note that the fallback names its parameters (b,a,t), i.e. the *second*
 * argument is still the destination in both variants.
 * Fallback: t = saturate(a - b); a -= t  ->  a = min(a, b). */
#if TEMPLATE_PP_MMXEXT
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif TEMPLATE_PP_MMX
#define PMINUB(b,a,t) \
    "movq " #a ", " #t " \n\t"\
    "psubusb " #b ", " #t " \n\t"\
    "psubb " #t ", " #a " \n\t"
#endif

/* PMAXUB(a, b): b = max(a, b) on unsigned bytes.
 * Fallback: b = saturate(b - a) + a. */
#if TEMPLATE_PP_MMXEXT
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif TEMPLATE_PP_MMX
#define PMAXUB(a,b) \
    "psubusb " #a ", " #b " \n\t"\
    "paddb " #a ", " #b " \n\t"
#endif

//FIXME? |255-0| = 1 (should not be a problem ...)
107#if TEMPLATE_PP_MMX 108/** 109 * Check if the middle 8x8 Block in the given 8x16 block is flat 110 */ 111static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){ 112 int numEq= 0, dcOk; 113 src+= stride*4; // src points to begin of the 8x8 Block 114 __asm__ volatile( 115 "movq %0, %%mm7 \n\t" 116 "movq %1, %%mm6 \n\t" 117 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) 118 ); 119 120 __asm__ volatile( 121 "lea (%2, %3), %%"REG_a" \n\t" 122// 0 1 2 3 4 5 6 7 8 9 123// %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 124 125 "movq (%2), %%mm0 \n\t" 126 "movq (%%"REG_a"), %%mm1 \n\t" 127 "movq %%mm0, %%mm3 \n\t" 128 "movq %%mm0, %%mm4 \n\t" 129 PMAXUB(%%mm1, %%mm4) 130 PMINUB(%%mm1, %%mm3, %%mm5) 131 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference 132 "paddb %%mm7, %%mm0 \n\t" 133 "pcmpgtb %%mm6, %%mm0 \n\t" 134 135 "movq (%%"REG_a",%3), %%mm2 \n\t" 136 PMAXUB(%%mm2, %%mm4) 137 PMINUB(%%mm2, %%mm3, %%mm5) 138 "psubb %%mm2, %%mm1 \n\t" 139 "paddb %%mm7, %%mm1 \n\t" 140 "pcmpgtb %%mm6, %%mm1 \n\t" 141 "paddb %%mm1, %%mm0 \n\t" 142 143 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 144 PMAXUB(%%mm1, %%mm4) 145 PMINUB(%%mm1, %%mm3, %%mm5) 146 "psubb %%mm1, %%mm2 \n\t" 147 "paddb %%mm7, %%mm2 \n\t" 148 "pcmpgtb %%mm6, %%mm2 \n\t" 149 "paddb %%mm2, %%mm0 \n\t" 150 151 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" 152 153 "movq (%2, %3, 4), %%mm2 \n\t" 154 PMAXUB(%%mm2, %%mm4) 155 PMINUB(%%mm2, %%mm3, %%mm5) 156 "psubb %%mm2, %%mm1 \n\t" 157 "paddb %%mm7, %%mm1 \n\t" 158 "pcmpgtb %%mm6, %%mm1 \n\t" 159 "paddb %%mm1, %%mm0 \n\t" 160 161 "movq (%%"REG_a"), %%mm1 \n\t" 162 PMAXUB(%%mm1, %%mm4) 163 PMINUB(%%mm1, %%mm3, %%mm5) 164 "psubb %%mm1, %%mm2 \n\t" 165 "paddb %%mm7, %%mm2 \n\t" 166 "pcmpgtb %%mm6, %%mm2 \n\t" 167 "paddb %%mm2, %%mm0 \n\t" 168 169 "movq (%%"REG_a", %3), %%mm2 \n\t" 170 PMAXUB(%%mm2, %%mm4) 171 PMINUB(%%mm2, %%mm3, %%mm5) 172 "psubb %%mm2, %%mm1 \n\t" 173 "paddb %%mm7, %%mm1 \n\t" 174 "pcmpgtb %%mm6, %%mm1 
\n\t" 175 "paddb %%mm1, %%mm0 \n\t" 176 177 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 178 PMAXUB(%%mm1, %%mm4) 179 PMINUB(%%mm1, %%mm3, %%mm5) 180 "psubb %%mm1, %%mm2 \n\t" 181 "paddb %%mm7, %%mm2 \n\t" 182 "pcmpgtb %%mm6, %%mm2 \n\t" 183 "paddb %%mm2, %%mm0 \n\t" 184 "psubusb %%mm3, %%mm4 \n\t" 185 186 " \n\t" 187#if TEMPLATE_PP_MMXEXT 188 "pxor %%mm7, %%mm7 \n\t" 189 "psadbw %%mm7, %%mm0 \n\t" 190#else 191 "movq %%mm0, %%mm1 \n\t" 192 "psrlw $8, %%mm0 \n\t" 193 "paddb %%mm1, %%mm0 \n\t" 194 "movq %%mm0, %%mm1 \n\t" 195 "psrlq $16, %%mm0 \n\t" 196 "paddb %%mm1, %%mm0 \n\t" 197 "movq %%mm0, %%mm1 \n\t" 198 "psrlq $32, %%mm0 \n\t" 199 "paddb %%mm1, %%mm0 \n\t" 200#endif 201 "movq %4, %%mm7 \n\t" // QP,..., QP 202 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP 203 "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0 204 "packssdw %%mm4, %%mm4 \n\t" 205 "movd %%mm0, %0 \n\t" 206 "movd %%mm4, %1 \n\t" 207 208 : "=r" (numEq), "=r" (dcOk) 209 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) 210 : "%"REG_a 211 ); 212 213 numEq= (-numEq) &0xFF; 214 if(numEq > c->ppMode.flatnessThreshold){ 215 if(dcOk) return 0; 216 else return 1; 217 }else{ 218 return 2; 219 } 220} 221#endif //TEMPLATE_PP_MMX 222 223/** 224 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) 225 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 226 */ 227#if !TEMPLATE_PP_ALTIVEC 228static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) 229{ 230#if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW 231 src+= stride*3; 232 __asm__ volatile( //"movv %0 %1 %2\n\t" 233 "movq %2, %%mm0 \n\t" // QP,..., QP 234 "pxor %%mm4, %%mm4 \n\t" 235 236 "movq (%0), %%mm6 \n\t" 237 "movq (%0, %1), %%mm5 \n\t" 238 "movq %%mm5, %%mm1 \n\t" 239 "movq %%mm6, %%mm2 \n\t" 240 "psubusb %%mm6, %%mm5 \n\t" 241 "psubusb %%mm1, %%mm2 \n\t" 242 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 243 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 244 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF 245 
246 "pand %%mm2, %%mm6 \n\t" 247 "pandn %%mm1, %%mm2 \n\t" 248 "por %%mm2, %%mm6 \n\t"// First Line to Filter 249 250 "movq (%0, %1, 8), %%mm5 \n\t" 251 "lea (%0, %1, 4), %%"REG_a" \n\t" 252 "lea (%0, %1, 8), %%"REG_c" \n\t" 253 "sub %1, %%"REG_c" \n\t" 254 "add %1, %0 \n\t" // %0 points to line 1 not 0 255 "movq (%0, %1, 8), %%mm7 \n\t" 256 "movq %%mm5, %%mm1 \n\t" 257 "movq %%mm7, %%mm2 \n\t" 258 "psubusb %%mm7, %%mm5 \n\t" 259 "psubusb %%mm1, %%mm2 \n\t" 260 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 261 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 262 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF 263 264 "pand %%mm2, %%mm7 \n\t" 265 "pandn %%mm1, %%mm2 \n\t" 266 "por %%mm2, %%mm7 \n\t" // First Line to Filter 267 268 269 // 1 2 3 4 5 6 7 8 270 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 271 // 6 4 2 2 1 1 272 // 6 4 4 2 273 // 6 8 2 274 275 "movq (%0, %1), %%mm0 \n\t" // 1 276 "movq %%mm0, %%mm1 \n\t" // 1 277 PAVGB(%%mm6, %%mm0) //1 1 /2 278 PAVGB(%%mm6, %%mm0) //3 1 /4 279 280 "movq (%0, %1, 4), %%mm2 \n\t" // 1 281 "movq %%mm2, %%mm5 \n\t" // 1 282 PAVGB((%%REGa), %%mm2) // 11 /2 283 PAVGB((%0, %1, 2), %%mm2) // 211 /4 284 "movq %%mm2, %%mm3 \n\t" // 211 /4 285 "movq (%0), %%mm4 \n\t" // 1 286 PAVGB(%%mm4, %%mm3) // 4 211 /8 287 PAVGB(%%mm0, %%mm3) //642211 /16 288 "movq %%mm3, (%0) \n\t" // X 289 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 290 "movq %%mm1, %%mm0 \n\t" // 1 291 PAVGB(%%mm6, %%mm0) //1 1 /2 292 "movq %%mm4, %%mm3 \n\t" // 1 293 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 294 PAVGB((%%REGa,%1,2), %%mm5) // 11 /2 295 PAVGB((%%REGa), %%mm5) // 211 /4 296 PAVGB(%%mm5, %%mm3) // 2 2211 /8 297 PAVGB(%%mm0, %%mm3) //4242211 /16 298 "movq %%mm3, (%0,%1) \n\t" // X 299 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 300 PAVGB(%%mm4, %%mm6) //11 /2 301 "movq (%%"REG_c"), %%mm0 \n\t" // 1 302 PAVGB((%%REGa, %1, 2), %%mm0) // 11/2 303 "movq %%mm0, %%mm3 \n\t" // 11/2 304 PAVGB(%%mm1, %%mm0) // 2 11/4 305 PAVGB(%%mm6, %%mm0) //222 11/8 306 PAVGB(%%mm2, 
%%mm0) //22242211/16 307 "movq (%0, %1, 2), %%mm2 \n\t" // 1 308 "movq %%mm0, (%0, %1, 2) \n\t" // X 309 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 310 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 311 PAVGB((%%REGc), %%mm0) // 11 /2 312 PAVGB(%%mm0, %%mm6) //11 11 /4 313 PAVGB(%%mm1, %%mm4) // 11 /2 314 PAVGB(%%mm2, %%mm1) // 11 /2 315 PAVGB(%%mm1, %%mm6) //1122 11 /8 316 PAVGB(%%mm5, %%mm6) //112242211 /16 317 "movq (%%"REG_a"), %%mm5 \n\t" // 1 318 "movq %%mm6, (%%"REG_a") \n\t" // X 319 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 320 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1 321 PAVGB(%%mm7, %%mm6) // 11 /2 322 PAVGB(%%mm4, %%mm6) // 11 11 /4 323 PAVGB(%%mm3, %%mm6) // 11 2211 /8 324 PAVGB(%%mm5, %%mm2) // 11 /2 325 "movq (%0, %1, 4), %%mm4 \n\t" // 1 326 PAVGB(%%mm4, %%mm2) // 112 /4 327 PAVGB(%%mm2, %%mm6) // 112242211 /16 328 "movq %%mm6, (%0, %1, 4) \n\t" // X 329 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 330 PAVGB(%%mm7, %%mm1) // 11 2 /4 331 PAVGB(%%mm4, %%mm5) // 11 /2 332 PAVGB(%%mm5, %%mm0) // 11 11 /4 333 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1 334 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 335 PAVGB(%%mm0, %%mm1) // 11224222 /16 336 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X 337 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 338 PAVGB((%%REGc), %%mm2) // 112 4 /8 339 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 340 PAVGB(%%mm0, %%mm6) // 1 1 /2 341 PAVGB(%%mm7, %%mm6) // 1 12 /4 342 PAVGB(%%mm2, %%mm6) // 1122424 /4 343 "movq %%mm6, (%%"REG_c") \n\t" // X 344 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 345 PAVGB(%%mm7, %%mm5) // 11 2 /4 346 PAVGB(%%mm7, %%mm5) // 11 6 /8 347 348 PAVGB(%%mm3, %%mm0) // 112 /4 349 PAVGB(%%mm0, %%mm5) // 112246 /16 350 "movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X 351 "sub %1, %0 \n\t" 352 353 : 354 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) 355 : "%"REG_a, "%"REG_c 356 ); 357#else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW 358 const int l1= stride; 359 const int l2= stride 
+ l1; 360 const int l3= stride + l2; 361 const int l4= stride + l3; 362 const int l5= stride + l4; 363 const int l6= stride + l5; 364 const int l7= stride + l6; 365 const int l8= stride + l7; 366 const int l9= stride + l8; 367 int x; 368 src+= stride*3; 369 for(x=0; x<BLOCK_SIZE; x++){ 370 const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; 371 const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; 372 373 int sums[10]; 374 sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4; 375 sums[1] = sums[0] - first + src[l4]; 376 sums[2] = sums[1] - first + src[l5]; 377 sums[3] = sums[2] - first + src[l6]; 378 sums[4] = sums[3] - first + src[l7]; 379 sums[5] = sums[4] - src[l1] + src[l8]; 380 sums[6] = sums[5] - src[l2] + last; 381 sums[7] = sums[6] - src[l3] + last; 382 sums[8] = sums[7] - src[l4] + last; 383 sums[9] = sums[8] - src[l5] + last; 384 385 src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4; 386 src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4; 387 src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4; 388 src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4; 389 src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4; 390 src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4; 391 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; 392 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; 393 394 src++; 395 } 396#endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW 397} 398#endif //TEMPLATE_PP_ALTIVEC 399 400/** 401 * Experimental Filter 1 402 * will not damage linear gradients 403 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter 404 * can only smooth blocks at the expected locations (it cannot smooth them if they did move) 405 * MMX2 version does correct clipping C version does not 406 */ 407static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) 408{ 409#if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW 410 src+= stride*3; 411 412 __asm__ volatile( 413 "pxor %%mm7, %%mm7 \n\t" // 0 414 "lea (%0, %1), %%"REG_a" \n\t" 415 "lea (%%"REG_a", %1, 
4), %%"REG_c" \n\t" 416// 0 1 2 3 4 5 6 7 8 9 417// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 418 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 419 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 420 "movq %%mm1, %%mm2 \n\t" // line 4 421 "psubusb %%mm0, %%mm1 \n\t" 422 "psubusb %%mm2, %%mm0 \n\t" 423 "por %%mm1, %%mm0 \n\t" // |l2 - l3| 424 "movq (%%"REG_c"), %%mm3 \n\t" // line 5 425 "movq (%%"REG_c", %1), %%mm4 \n\t" // line 6 426 "movq %%mm3, %%mm5 \n\t" // line 5 427 "psubusb %%mm4, %%mm3 \n\t" 428 "psubusb %%mm5, %%mm4 \n\t" 429 "por %%mm4, %%mm3 \n\t" // |l5 - l6| 430 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 431 "movq %%mm2, %%mm1 \n\t" // line 4 432 "psubusb %%mm5, %%mm2 \n\t" 433 "movq %%mm2, %%mm4 \n\t" 434 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 435 "psubusb %%mm1, %%mm5 \n\t" 436 "por %%mm5, %%mm4 \n\t" // |l4 - l5| 437 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) 438 "movq %%mm4, %%mm3 \n\t" // d 439 "movq %2, %%mm0 \n\t" 440 "paddusb %%mm0, %%mm0 \n\t" 441 "psubusb %%mm0, %%mm4 \n\t" 442 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 443 "psubusb "MANGLE(b01)", %%mm3 \n\t" 444 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 445 446 PAVGB(%%mm7, %%mm3) // d/2 447 "movq %%mm3, %%mm1 \n\t" // d/2 448 PAVGB(%%mm7, %%mm3) // d/4 449 PAVGB(%%mm1, %%mm3) // 3*d/8 450 451 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 452 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 453 "psubusb %%mm3, %%mm0 \n\t" 454 "pxor %%mm2, %%mm0 \n\t" 455 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 456 457 "movq (%%"REG_c"), %%mm0 \n\t" // line 5 458 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 459 "paddusb %%mm3, %%mm0 \n\t" 460 "pxor %%mm2, %%mm0 \n\t" 461 "movq %%mm0, (%%"REG_c") \n\t" // line 5 462 463 PAVGB(%%mm7, %%mm1) // d/4 464 465 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 466 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? 
-l4-1 : l4 467 "psubusb %%mm1, %%mm0 \n\t" 468 "pxor %%mm2, %%mm0 \n\t" 469 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3 470 471 "movq (%%"REG_c", %1), %%mm0 \n\t" // line 6 472 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 473 "paddusb %%mm1, %%mm0 \n\t" 474 "pxor %%mm2, %%mm0 \n\t" 475 "movq %%mm0, (%%"REG_c", %1) \n\t" // line 6 476 477 PAVGB(%%mm7, %%mm1) // d/8 478 479 "movq (%%"REG_a", %1), %%mm0 \n\t" // line 2 480 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 481 "psubusb %%mm1, %%mm0 \n\t" 482 "pxor %%mm2, %%mm0 \n\t" 483 "movq %%mm0, (%%"REG_a", %1) \n\t" // line 2 484 485 "movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7 486 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 487 "paddusb %%mm1, %%mm0 \n\t" 488 "pxor %%mm2, %%mm0 \n\t" 489 "movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7 490 491 : 492 : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb) 493 NAMED_CONSTRAINTS_ADD(b01) 494 : "%"REG_a, "%"REG_c 495 ); 496#else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW 497 498 const int l1= stride; 499 const int l2= stride + l1; 500 const int l3= stride + l2; 501 const int l4= stride + l3; 502 const int l5= stride + l4; 503 const int l6= stride + l5; 504 const int l7= stride + l6; 505// const int l8= stride + l7; 506// const int l9= stride + l8; 507 int x; 508 509 src+= stride*3; 510 for(x=0; x<BLOCK_SIZE; x++){ 511 int a= src[l3] - src[l4]; 512 int b= src[l4] - src[l5]; 513 int c= src[l5] - src[l6]; 514 515 int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1); 516 d= FFMAX(d, 0); 517 518 if(d < co->QP*2){ 519 int v = d * FFSIGN(-b); 520 521 src[l2] +=v>>3; 522 src[l3] +=v>>2; 523 src[l4] +=(3*v)>>3; 524 src[l5] -=(3*v)>>3; 525 src[l6] -=v>>2; 526 src[l7] -=v>>3; 527 } 528 src++; 529 } 530#endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW 531} 532 533#if !TEMPLATE_PP_ALTIVEC 534static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) 535{ 536#if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW 537/* 538 uint8_t tmp[16]; 539 const 
int l1= stride; 540 const int l2= stride + l1; 541 const int l3= stride + l2; 542 const int l4= (int)tmp - (int)src - stride*3; 543 const int l5= (int)tmp - (int)src - stride*3 + 8; 544 const int l6= stride*3 + l3; 545 const int l7= stride + l6; 546 const int l8= stride + l7; 547 548 memcpy(tmp, src+stride*7, 8); 549 memcpy(tmp+8, src+stride*8, 8); 550*/ 551 src+= stride*4; 552 __asm__ volatile( 553 554#if 0 //slightly more accurate and slightly slower 555 "pxor %%mm7, %%mm7 \n\t" // 0 556 "lea (%0, %1), %%"REG_a" \n\t" 557 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 558// 0 1 2 3 4 5 6 7 559// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 560// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 561 562 563 "movq (%0, %1, 2), %%mm0 \n\t" // l2 564 "movq (%0), %%mm1 \n\t" // l0 565 "movq %%mm0, %%mm2 \n\t" // l2 566 PAVGB(%%mm7, %%mm0) // ~l2/2 567 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 568 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 569 570 "movq (%%"REG_a"), %%mm1 \n\t" // l1 571 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3 572 "movq %%mm1, %%mm4 \n\t" // l1 573 PAVGB(%%mm7, %%mm1) // ~l1/2 574 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 575 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 576 577 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 578 "psubusb %%mm1, %%mm0 \n\t" 579 "psubusb %%mm4, %%mm1 \n\t" 580 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 581// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 582 583 "movq (%0, %1, 4), %%mm0 \n\t" // l4 584 "movq %%mm0, %%mm4 \n\t" // l4 585 PAVGB(%%mm7, %%mm0) // ~l4/2 586 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 587 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 588 589 "movq (%%"REG_c"), %%mm2 \n\t" // l5 590 "movq %%mm3, %%mm5 \n\t" // l3 591 PAVGB(%%mm7, %%mm3) // ~l3/2 592 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 593 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 594 595 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 596 "psubusb %%mm3, %%mm0 \n\t" 597 "psubusb %%mm6, %%mm3 \n\t" 598 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 599 "pcmpeqb %%mm7, %%mm0 
\n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) 600// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 601 602 "movq (%%"REG_c", %1), %%mm6 \n\t" // l6 603 "movq %%mm6, %%mm5 \n\t" // l6 604 PAVGB(%%mm7, %%mm6) // ~l6/2 605 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 606 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 607 608 "movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7 609 "movq %%mm2, %%mm4 \n\t" // l5 610 PAVGB(%%mm7, %%mm2) // ~l5/2 611 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 612 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 613 614 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 615 "psubusb %%mm2, %%mm6 \n\t" 616 "psubusb %%mm4, %%mm2 \n\t" 617 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 618// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 619 620 621 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 622 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? 623 "paddusb "MANGLE(b01)", %%mm4 \n\t" 624 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP 625 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 626 "pand %%mm4, %%mm3 \n\t" 627 628 "movq %%mm3, %%mm1 \n\t" 629// "psubusb "MANGLE(b01)", %%mm3 \n\t" 630 PAVGB(%%mm7, %%mm3) 631 PAVGB(%%mm7, %%mm3) 632 "paddusb %%mm1, %%mm3 \n\t" 633// "paddusb "MANGLE(b01)", %%mm3 \n\t" 634 635 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3 636 "movq (%0, %1, 4), %%mm5 \n\t" //l4 637 "movq (%0, %1, 4), %%mm4 \n\t" //l4 638 "psubusb %%mm6, %%mm5 \n\t" 639 "psubusb %%mm4, %%mm6 \n\t" 640 "por %%mm6, %%mm5 \n\t" // |l3-l4| 641 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) 642 "pxor %%mm6, %%mm0 \n\t" 643 "pand %%mm0, %%mm3 \n\t" 644 PMINUB(%%mm5, %%mm3, %%mm0) 645 646 "psubusb "MANGLE(b01)", %%mm3 \n\t" 647 PAVGB(%%mm7, %%mm3) 648 649 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 650 "movq (%0, %1, 4), %%mm2 \n\t" 651 "pxor %%mm6, %%mm0 \n\t" 652 "pxor %%mm6, %%mm2 \n\t" 653 "psubb %%mm3, %%mm0 \n\t" 654 "paddb %%mm3, %%mm2 \n\t" 655 "pxor %%mm6, %%mm0 \n\t" 656 "pxor %%mm6, %%mm2 \n\t" 657 
"movq %%mm0, (%%"REG_a", %1, 2) \n\t" 658 "movq %%mm2, (%0, %1, 4) \n\t" 659#endif //0 660 661 "lea (%0, %1), %%"REG_a" \n\t" 662 "pcmpeqb %%mm6, %%mm6 \n\t" // -1 663// 0 1 2 3 4 5 6 7 664// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 665// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 666 667 668 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3 669 "movq (%0, %1, 4), %%mm0 \n\t" // l4 670 "pxor %%mm6, %%mm1 \n\t" // -l3-1 671 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 672// mm1=-l3-1, mm0=128-q 673 674 "movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5 675 "movq (%%"REG_a", %1), %%mm3 \n\t" // l2 676 "pxor %%mm6, %%mm2 \n\t" // -l5-1 677 "movq %%mm2, %%mm5 \n\t" // -l5-1 678 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 679 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 680 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 681 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 682 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 683 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 684// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 685 686 "movq (%%"REG_a"), %%mm2 \n\t" // l1 687 "pxor %%mm6, %%mm2 \n\t" // -l1-1 688 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 689 PAVGB((%0), %%mm1) // (l0-l3+256)/2 690 "movq "MANGLE(b80)", %%mm3 \n\t" // 128 691 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 692 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 693 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 694// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 695 696 PAVGB((%%REGc, %1), %%mm5) // (l6-l5+256)/2 697 "movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7 698 "pxor %%mm6, %%mm1 \n\t" // -l7-1 699 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 700 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 701 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 702 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 703 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 704// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 705 706 "movq "MANGLE(b00)", %%mm1 \n\t" // 0 707 
"movq "MANGLE(b00)", %%mm5 \n\t" // 0 708 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 709 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 710 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| 711 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| 712 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 713 714// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 715 716 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 717 "movq %2, %%mm2 \n\t" // QP 718 PAVGB(%%mm6, %%mm2) // 128 + QP/2 719 "psubb %%mm6, %%mm2 \n\t" 720 721 "movq %%mm4, %%mm1 \n\t" 722 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) 723 "pxor %%mm1, %%mm4 \n\t" 724 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 725 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 726 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 727// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 728 729 "movq %%mm4, %%mm3 \n\t" // d 730 "psubusb "MANGLE(b01)", %%mm4 \n\t" 731 PAVGB(%%mm7, %%mm4) // d/32 732 PAVGB(%%mm7, %%mm4) // (d + 32)/64 733 "paddb %%mm3, %%mm4 \n\t" // 5d/64 734 "pand %%mm2, %%mm4 \n\t" 735 736 "movq "MANGLE(b80)", %%mm5 \n\t" // 128 737 "psubb %%mm0, %%mm5 \n\t" // q 738 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding 739 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) 740 "pxor %%mm7, %%mm5 \n\t" 741 742 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) 743 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) 744 745 "pand %%mm7, %%mm4 \n\t" 746 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 747 "movq (%0, %1, 4), %%mm2 \n\t" 748 "pxor %%mm1, %%mm0 \n\t" 749 "pxor %%mm1, %%mm2 \n\t" 750 "paddb %%mm4, %%mm0 \n\t" 751 "psubb %%mm4, %%mm2 \n\t" 752 "pxor %%mm1, %%mm0 \n\t" 753 "pxor %%mm1, %%mm2 \n\t" 754 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" 755 "movq %%mm2, (%0, %1, 4) \n\t" 756 757 : 758 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) 759 NAMED_CONSTRAINTS_ADD(b80,b00,b01) 760 : "%"REG_a, "%"REG_c 761 ); 762 763/* 764 { 765 int x; 766 src-= stride; 767 for(x=0; x<BLOCK_SIZE; x++){ 768 
const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); 769 if(FFABS(middleEnergy)< 8*QP){ 770 const int q=(src[l4] - src[l5])/2; 771 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); 772 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); 773 774 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 775 d= FFMAX(d, 0); 776 777 d= (5*d + 32) >> 6; 778 d*= FFSIGN(-middleEnergy); 779 780 if(q>0){ 781 d= d<0 ? 0 : d; 782 d= d>q ? q : d; 783 }else{ 784 d= d>0 ? 0 : d; 785 d= d<q ? q : d; 786 } 787 788 src[l4]-= d; 789 src[l5]+= d; 790 } 791 src++; 792 } 793 src-=8; 794 for(x=0; x<8; x++){ 795 int y; 796 for(y=4; y<6; y++){ 797 int d= src[x+y*stride] - tmp[x+(y-4)*8]; 798 int ad= FFABS(d); 799 static int max=0; 800 static int sum=0; 801 static int num=0; 802 static int bias=0; 803 804 if(max<ad) max=ad; 805 sum+= ad>3 ? 1 : 0; 806 if(ad>3){ 807 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; 808 } 809 if(y==4) bias+=d; 810 num++; 811 if(num%1000000 == 0){ 812 av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias); 813 } 814 } 815 } 816} 817*/ 818#elif TEMPLATE_PP_MMX 819 DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars 820 src+= stride*4; 821 __asm__ volatile( 822 "pxor %%mm7, %%mm7 \n\t" 823// 0 1 2 3 4 5 6 7 824// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 825// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 826 827 "movq (%0), %%mm0 \n\t" 828 "movq %%mm0, %%mm1 \n\t" 829 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 830 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 831 832 "movq (%0, %1), %%mm2 \n\t" 833 "lea (%0, %1, 2), %%"REG_a" \n\t" 834 "movq %%mm2, %%mm3 \n\t" 835 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 836 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 837 838 "movq (%%"REG_a"), %%mm4 \n\t" 839 "movq %%mm4, %%mm5 \n\t" 840 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 841 "punpckhbw %%mm7, %%mm5 \n\t" 
// high part of line 2 842 843 "paddw %%mm0, %%mm0 \n\t" // 2L0 844 "paddw %%mm1, %%mm1 \n\t" // 2H0 845 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 846 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 847 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 848 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 849 850 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 851 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 852 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 853 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 854 855 "movq (%%"REG_a", %1), %%mm2 \n\t" 856 "movq %%mm2, %%mm3 \n\t" 857 "punpcklbw %%mm7, %%mm2 \n\t" // L3 858 "punpckhbw %%mm7, %%mm3 \n\t" // H3 859 860 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 861 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 862 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 863 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 864 "movq %%mm0, (%3) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 865 "movq %%mm1, 8(%3) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 866 867 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 868 "movq %%mm0, %%mm1 \n\t" 869 "punpcklbw %%mm7, %%mm0 \n\t" // L4 870 "punpckhbw %%mm7, %%mm1 \n\t" // H4 871 872 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 873 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 874 "movq %%mm2, 16(%3) \n\t" // L3 - L4 875 "movq %%mm3, 24(%3) \n\t" // H3 - H4 876 "paddw %%mm4, %%mm4 \n\t" // 2L2 877 "paddw %%mm5, %%mm5 \n\t" // 2H2 878 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 879 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 880 881 "lea (%%"REG_a", %1), %0 \n\t" 882 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 883 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 884 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 885 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 886//50 opcodes so far 887 "movq (%0, %1, 2), %%mm2 \n\t" 888 "movq %%mm2, %%mm3 \n\t" 889 "punpcklbw %%mm7, %%mm2 \n\t" // L5 890 "punpckhbw %%mm7, %%mm3 \n\t" // H5 891 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 892 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 893 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 894 "psubw %%mm3, 
%%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 895 896 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 897 "punpcklbw %%mm7, %%mm6 \n\t" // L6 898 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 899 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 900 "punpckhbw %%mm7, %%mm6 \n\t" // H6 901 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 902 903 "paddw %%mm0, %%mm0 \n\t" // 2L4 904 "paddw %%mm1, %%mm1 \n\t" // 2H4 905 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 906 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 907 908 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 909 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 910 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 911 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 912 913 "movq (%0, %1, 4), %%mm2 \n\t" 914 "movq %%mm2, %%mm3 \n\t" 915 "punpcklbw %%mm7, %%mm2 \n\t" // L7 916 "punpckhbw %%mm7, %%mm3 \n\t" // H7 917 918 "paddw %%mm2, %%mm2 \n\t" // 2L7 919 "paddw %%mm3, %%mm3 \n\t" // 2H7 920 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 921 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 922 923 "movq (%3), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 924 "movq 8(%3), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 925 926#if TEMPLATE_PP_MMXEXT 927 "movq %%mm7, %%mm6 \n\t" // 0 928 "psubw %%mm0, %%mm6 \n\t" 929 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 930 "movq %%mm7, %%mm6 \n\t" // 0 931 "psubw %%mm1, %%mm6 \n\t" 932 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 933 "movq %%mm7, %%mm6 \n\t" // 0 934 "psubw %%mm2, %%mm6 \n\t" 935 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 936 "movq %%mm7, %%mm6 \n\t" // 0 937 "psubw %%mm3, %%mm6 \n\t" 938 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 939#else 940 "movq %%mm7, %%mm6 \n\t" // 0 941 "pcmpgtw %%mm0, %%mm6 \n\t" 942 "pxor %%mm6, %%mm0 \n\t" 943 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 944 "movq %%mm7, %%mm6 \n\t" // 0 945 "pcmpgtw %%mm1, %%mm6 \n\t" 946 "pxor %%mm6, %%mm1 \n\t" 947 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 948 "movq %%mm7, %%mm6 \n\t" // 0 949 "pcmpgtw %%mm2, %%mm6 \n\t" 950 "pxor 
%%mm6, %%mm2 \n\t" 951 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 952 "movq %%mm7, %%mm6 \n\t" // 0 953 "pcmpgtw %%mm3, %%mm6 \n\t" 954 "pxor %%mm6, %%mm3 \n\t" 955 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 956#endif 957 958#if TEMPLATE_PP_MMXEXT 959 "pminsw %%mm2, %%mm0 \n\t" 960 "pminsw %%mm3, %%mm1 \n\t" 961#else 962 "movq %%mm0, %%mm6 \n\t" 963 "psubusw %%mm2, %%mm6 \n\t" 964 "psubw %%mm6, %%mm0 \n\t" 965 "movq %%mm1, %%mm6 \n\t" 966 "psubusw %%mm3, %%mm6 \n\t" 967 "psubw %%mm6, %%mm1 \n\t" 968#endif 969 970 "movd %2, %%mm2 \n\t" // QP 971 "punpcklbw %%mm7, %%mm2 \n\t" 972 973 "movq %%mm7, %%mm6 \n\t" // 0 974 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) 975 "pxor %%mm6, %%mm4 \n\t" 976 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| 977 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) 978 "pxor %%mm7, %%mm5 \n\t" 979 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| 980// 100 opcodes 981 "psllw $3, %%mm2 \n\t" // 8QP 982 "movq %%mm2, %%mm3 \n\t" // 8QP 983 "pcmpgtw %%mm4, %%mm2 \n\t" 984 "pcmpgtw %%mm5, %%mm3 \n\t" 985 "pand %%mm2, %%mm4 \n\t" 986 "pand %%mm3, %%mm5 \n\t" 987 988 989 "psubusw %%mm0, %%mm4 \n\t" // hd 990 "psubusw %%mm1, %%mm5 \n\t" // ld 991 992 993 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 994 "pmullw %%mm2, %%mm4 \n\t" 995 "pmullw %%mm2, %%mm5 \n\t" 996 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 997 "paddw %%mm2, %%mm4 \n\t" 998 "paddw %%mm2, %%mm5 \n\t" 999 "psrlw $6, %%mm4 \n\t" 1000 "psrlw $6, %%mm5 \n\t" 1001 1002 "movq 16(%3), %%mm0 \n\t" // L3 - L4 1003 "movq 24(%3), %%mm1 \n\t" // H3 - H4 1004 1005 "pxor %%mm2, %%mm2 \n\t" 1006 "pxor %%mm3, %%mm3 \n\t" 1007 1008 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) 1009 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) 1010 "pxor %%mm2, %%mm0 \n\t" 1011 "pxor %%mm3, %%mm1 \n\t" 1012 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| 1013 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| 1014 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 1015 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 1016 1017 
"pxor %%mm6, %%mm2 \n\t" 1018 "pxor %%mm7, %%mm3 \n\t" 1019 "pand %%mm2, %%mm4 \n\t" 1020 "pand %%mm3, %%mm5 \n\t" 1021 1022#if TEMPLATE_PP_MMXEXT 1023 "pminsw %%mm0, %%mm4 \n\t" 1024 "pminsw %%mm1, %%mm5 \n\t" 1025#else 1026 "movq %%mm4, %%mm2 \n\t" 1027 "psubusw %%mm0, %%mm2 \n\t" 1028 "psubw %%mm2, %%mm4 \n\t" 1029 "movq %%mm5, %%mm2 \n\t" 1030 "psubusw %%mm1, %%mm2 \n\t" 1031 "psubw %%mm2, %%mm5 \n\t" 1032#endif 1033 "pxor %%mm6, %%mm4 \n\t" 1034 "pxor %%mm7, %%mm5 \n\t" 1035 "psubw %%mm6, %%mm4 \n\t" 1036 "psubw %%mm7, %%mm5 \n\t" 1037 "packsswb %%mm5, %%mm4 \n\t" 1038 "movq (%0), %%mm0 \n\t" 1039 "paddb %%mm4, %%mm0 \n\t" 1040 "movq %%mm0, (%0) \n\t" 1041 "movq (%0, %1), %%mm0 \n\t" 1042 "psubb %%mm4, %%mm0 \n\t" 1043 "movq %%mm0, (%0, %1) \n\t" 1044 1045 : "+r" (src) 1046 : "r" ((x86_reg)stride), "m" (c->pQPb), "r"(tmp) 1047 NAMED_CONSTRAINTS_ADD(w05,w20) 1048 : "%"REG_a 1049 ); 1050#else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW 1051 const int l1= stride; 1052 const int l2= stride + l1; 1053 const int l3= stride + l2; 1054 const int l4= stride + l3; 1055 const int l5= stride + l4; 1056 const int l6= stride + l5; 1057 const int l7= stride + l6; 1058 const int l8= stride + l7; 1059// const int l9= stride + l8; 1060 int x; 1061 src+= stride*3; 1062 for(x=0; x<BLOCK_SIZE; x++){ 1063 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); 1064 if(FFABS(middleEnergy) < 8*c->QP){ 1065 const int q=(src[l4] - src[l5])/2; 1066 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); 1067 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); 1068 1069 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 1070 d= FFMAX(d, 0); 1071 1072 d= (5*d + 32) >> 6; 1073 d*= FFSIGN(-middleEnergy); 1074 1075 if(q>0){ 1076 d= d<0 ? 0 : d; 1077 d= d>q ? q : d; 1078 }else{ 1079 d= d>0 ? 0 : d; 1080 d= d<q ? 
q : d;
            }

            src[l4]-= d;
            src[l5]+= d;
        }
        src++;
    }
#endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
}
#endif //TEMPLATE_PP_ALTIVEC

#if !TEMPLATE_PP_ALTIVEC
/**
 * Deringing filter for one block.
 *
 * C fallback: the 8x8 area at rows 1..8, columns 0..7 relative to src is
 * scanned for its minimum and maximum. If max - min is below deringThreshold
 * nothing is changed. Otherwise every pixel of that area whose complete 3x3
 * neighbourhood lies on one side of avg = (min + max + 1) >> 1 (all above, or
 * none above) is replaced by a (1 2 1 / 2 4 2 / 1 2 1)/16 smoothing of the
 * neighbourhood, limited to +-(c->QP/2 + 1) around the original value.
 * Filtering happens in place, so already filtered neighbours feed into the
 * pixels processed after them. Rows 0..9 and columns -1..8 are read.
 *
 * The MMXEXT/3DNow path is hand-written assembly.
 * NOTE(review): presumably it implements the same filter - confirm before
 * relying on identical output of both paths.
 *
 * @param src    pointer to the row directly above the filtered 8x8 area
 * @param stride offset between vertically adjacent pixels
 * @param c      context; QP (C path) and pQPb / pQPb2 (asm path) are used
 */
static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
{
#if HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW)
    DECLARE_ALIGNED(8, uint64_t, tmp)[3];
    /* NOTE(review): operand %3 (c->pQPb2) is listed as an input but is stored
     * to below, and the buffer behind %4 as well as the picture are written
     * without a "memory" clobber - confirm this is intentional. */
    __asm__ volatile(
        "pxor %%mm6, %%mm6 \n\t"
        "pcmpeqb %%mm7, %%mm7 \n\t"
        "movq %2, %%mm0 \n\t"
        "punpcklbw %%mm6, %%mm0 \n\t"
        "psrlw $1, %%mm0 \n\t"
        "psubw %%mm7, %%mm0 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "movq %%mm0, %3 \n\t"

        "lea (%0, %1), %%"REG_a" \n\t"
        "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"

//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1

#undef REAL_FIND_MIN_MAX
#undef FIND_MIN_MAX
#if TEMPLATE_PP_MMXEXT
#define REAL_FIND_MIN_MAX(addr)\
        "movq " #addr ", %%mm0 \n\t"\
        "pminub %%mm0, %%mm7 \n\t"\
        "pmaxub %%mm0, %%mm6 \n\t"
#else
#define REAL_FIND_MIN_MAX(addr)\
        "movq " #addr ", %%mm0 \n\t"\
        "movq %%mm7, %%mm1 \n\t"\
        "psubusb %%mm0, %%mm6 \n\t"\
        "paddb %%mm0, %%mm6 \n\t"\
        "psubusb %%mm0, %%mm1 \n\t"\
        "psubb %%mm1, %%mm7 \n\t"
#endif
#define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)

FIND_MIN_MAX((%%REGa))
FIND_MIN_MAX((%%REGa, %1))
FIND_MIN_MAX((%%REGa, %1, 2))
FIND_MIN_MAX((%0, %1, 4))
FIND_MIN_MAX((%%REGd))
FIND_MIN_MAX((%%REGd, %1))
FIND_MIN_MAX((%%REGd, %1, 2))
FIND_MIN_MAX((%0, %1, 8))

        "movq %%mm7, %%mm4 \n\t"
        "psrlq $8, %%mm7 \n\t"
#if TEMPLATE_PP_MMXEXT
        "pminub %%mm4, %%mm7 \n\t" // min of pixels
        "pshufw $0xF9, %%mm7, %%mm4 \n\t"
        "pminub %%mm4, %%mm7 \n\t" // min of pixels
        "pshufw $0xFE, %%mm7, %%mm4 \n\t"
        "pminub %%mm4, %%mm7 \n\t"
#else
        "movq %%mm7, %%mm1 \n\t"
        "psubusb %%mm4, %%mm1 \n\t"
        "psubb %%mm1, %%mm7 \n\t"
        "movq %%mm7, %%mm4 \n\t"
        "psrlq $16, %%mm7 \n\t"
        "movq %%mm7, %%mm1 \n\t"
        "psubusb %%mm4, %%mm1 \n\t"
        "psubb %%mm1, %%mm7 \n\t"
        "movq %%mm7, %%mm4 \n\t"
        "psrlq $32, %%mm7 \n\t"
        "movq %%mm7, %%mm1 \n\t"
        "psubusb %%mm4, %%mm1 \n\t"
        "psubb %%mm1, %%mm7 \n\t"
#endif


        "movq %%mm6, %%mm4 \n\t"
        "psrlq $8, %%mm6 \n\t"
#if TEMPLATE_PP_MMXEXT
        "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
        "pshufw $0xF9, %%mm6, %%mm4 \n\t"
        "pmaxub %%mm4, %%mm6 \n\t"
        "pshufw $0xFE, %%mm6, %%mm4 \n\t"
        "pmaxub %%mm4, %%mm6 \n\t"
#else
        "psubusb %%mm4, %%mm6 \n\t"
        "paddb %%mm4, %%mm6 \n\t"
        "movq %%mm6, %%mm4 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "psubusb %%mm4, %%mm6 \n\t"
        "paddb %%mm4, %%mm6 \n\t"
        "movq %%mm6, %%mm4 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "psubusb %%mm4, %%mm6 \n\t"
        "paddb %%mm4, %%mm6 \n\t"
#endif
        "movq %%mm6, %%mm0 \n\t" // max
        "psubb %%mm7, %%mm6 \n\t" // max - min
        "push %4 \n\t"
        "movd %%mm6, %k4 \n\t"
        "cmpb "MANGLE(deringThreshold)", %b4 \n\t"
        "pop %4 \n\t"
        " jb 1f \n\t"
        PAVGB(%%mm0, %%mm7) // a=(max + min)/2
        "punpcklbw %%mm7, %%mm7 \n\t"
        "punpcklbw %%mm7, %%mm7 \n\t"
        "punpcklbw %%mm7, %%mm7 \n\t"
        "movq %%mm7, (%4) \n\t"

        "movq (%0), %%mm0 \n\t" // L10
        "movq %%mm0, %%mm1 \n\t" // L10
        "movq %%mm0, %%mm2 \n\t" // L10
        "psllq $8, %%mm1 \n\t"
        "psrlq $8, %%mm2 \n\t"
        "movd -4(%0), %%mm3 \n\t"
        "movd 8(%0), %%mm4 \n\t"
        "psrlq $24, %%mm3 \n\t"
        "psllq $56, %%mm4 \n\t"
        "por %%mm3, %%mm1 \n\t" // L00
        "por %%mm4, %%mm2 \n\t" // L20
        "movq %%mm1, %%mm3 \n\t" // L00
        PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
        PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
        "psubusb %%mm7, %%mm0 \n\t"
        "psubusb %%mm7, %%mm2 \n\t"
        "psubusb %%mm7, %%mm3 \n\t"
        "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
        "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
        "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
        "paddb %%mm2, %%mm0 \n\t"
        "paddb %%mm3, %%mm0 \n\t"

        "movq (%%"REG_a"), %%mm2 \n\t" // L11
        "movq %%mm2, %%mm3 \n\t" // L11
        "movq %%mm2, %%mm4 \n\t" // L11
        "psllq $8, %%mm3 \n\t"
        "psrlq $8, %%mm4 \n\t"
        "movd -4(%%"REG_a"), %%mm5 \n\t"
        "movd 8(%%"REG_a"), %%mm6 \n\t"
        "psrlq $24, %%mm5 \n\t"
        "psllq $56, %%mm6 \n\t"
        "por %%mm5, %%mm3 \n\t" // L01
        "por %%mm6, %%mm4 \n\t" // L21
        "movq %%mm3, %%mm5 \n\t" // L01
        PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
        PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
        "psubusb %%mm7, %%mm2 \n\t"
        "psubusb %%mm7, %%mm4 \n\t"
        "psubusb %%mm7, %%mm5 \n\t"
        "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
        "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
        "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
        "paddb %%mm4, %%mm2 \n\t"
        "paddb %%mm5, %%mm2 \n\t"
// 0, 2, 3, 1
#define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
        "movq " #src ", " #sx " \n\t" /* src[0] */\
        "movq " #sx ", " #lx " \n\t" /* src[0] */\
        "movq " #sx ", " #t0 " \n\t" /* src[0] */\
        "psllq $8, " #lx " \n\t"\
        "psrlq $8, " #t0 " \n\t"\
        "movd -4" #src ", " #t1 " \n\t"\
        "psrlq $24, " #t1 " \n\t"\
        "por " #t1 ", " #lx " \n\t" /* src[-1] */\
        "movd 8" #src ", " #t1 " \n\t"\
        "psllq $56, " #t1 " \n\t"\
        "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
        "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
        PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
        PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
        PAVGB(lx, pplx) \
        "movq " #lx ", 8(%4) \n\t"\
        "movq (%4), " #lx " \n\t"\
        "psubusb " #lx ", " #t1 " \n\t"\
        "psubusb " #lx ", " #t0 " \n\t"\
        "psubusb " #lx ", " #sx " \n\t"\
        "movq "MANGLE(b00)", " #lx " \n\t"\
        "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
        "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
        "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
        "paddb " #t1 ", " #t0 " \n\t"\
        "paddb " #t0 ", " #sx " \n\t"\
\
        PAVGB(plx, pplx) /* filtered */\
        "movq " #dst ", " #t0 " \n\t" /* dst */\
        "movq " #t0 ", " #t1 " \n\t" /* dst */\
        "psubusb %3, " #t0 " \n\t"\
        "paddusb %3, " #t1 " \n\t"\
        PMAXUB(t0, pplx)\
        PMINUB(t1, pplx, t0)\
        "paddb " #sx ", " #ppsx " \n\t"\
        "paddb " #psx ", " #ppsx " \n\t"\
        "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
        "pand "MANGLE(b08)", " #ppsx " \n\t"\
        "pcmpeqb " #lx ", " #ppsx " \n\t"\
        "pand " #ppsx ", " #pplx " \n\t"\
        "pandn " #dst ", " #ppsx " \n\t"\
        "por " #pplx ", " #ppsx " \n\t"\
        "movq " #ppsx ", " #dst " \n\t"\
        "movq 8(%4), " #lx " \n\t"

#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
   REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
/*
0000000
1111111

1111110
1111101
1111100
1111011
1111010
1111001

1111000
1110111

*/
//DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
DERING_CORE((%%REGa) ,(%%REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%REGa, %1) ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%0, %1, 4) ,(%%REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%REGd) ,(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%REGd, %1) ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)

        "1: \n\t"
        : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2), "q"(tmp)
          NAMED_CONSTRAINTS_ADD(deringThreshold,b00,b02,b08)
        : "%"REG_a, "%"REG_d, "%"REG_SP
    );
#else // HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW)
    int y;
    int min=255;
    int max=0;
    int avg;
    uint8_t *p;
    int s[10];
    const int QP2= c->QP/2 + 1;

    /* step one column left so that columns 1..8 below are the block itself */
    src --;
    /* extreme values of the 8x8 area */
    for(y=1; y<9; y++){
        int x;
        p= src + stride*y;
        for(x=1; x<9; x++){
            p++;
            if(*p > max) max= *p;
            if(*p < min) min= *p;
        }
    }
    avg= (min + max + 1)>>1;

    /* too little contrast: leave the block alone */
    if(max - min <deringThreshold) return;

    /* Per row of the 10x10 neighbourhood: bit k of t marks column k as
     * "above avg", bit k+16 marks it as "not above avg". */
    for(y=0; y<10; y++){
        int t = 0;

        if(src[stride*y + 0] > avg) t+= 1;
        if(src[stride*y + 1] > avg) t+= 2;
        if(src[stride*y + 2] > avg) t+= 4;
        if(src[stride*y + 3] > avg) t+= 8;
        if(src[stride*y + 4] > avg) t+= 16;
        if(src[stride*y + 5] > avg) t+= 32;
        if(src[stride*y + 6] > avg) t+= 64;
        if(src[stride*y + 7] > avg) t+= 128;
        if(src[stride*y + 8] > avg) t+= 256;
        if(src[stride*y + 9] > avg) t+= 512;

        /* NOTE(review): ~t is negative here, so this left shift (and the
         * shifts on the next line) operate on a negative int. */
        t |= (~t)<<16;
        /* keep a bit only if its left and right neighbours carry it too */
        t &= (t<<1) & (t>>1);
        s[y] = t;
    }

    /* Combine three adjacent rows: a bit survives only if the whole 3x3
     * neighbourhood agrees; both halves are then merged into the low bits.
     * s[y-1] is not read again by later iterations, so this is safe in place. */
    for(y=1; y<9; y++){
        int t = s[y-1] & s[y] & s[y+1];
        t|= t>>16;
        s[y-1]= t;
    }

    for(y=1; y<9; y++){
        int x;
        int t = s[y-1];

        p= src + stride*y;
        for(x=1; x<9; x++){
            p++;
            if(t & (1<<x)){
                /* 3x3 smoothing, weights 1 2 1 / 2 4 2 / 1 2 1, sum 16 */
                int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
                      +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
                      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
                f= (f + 8)>>4;

#ifdef DEBUG_DERING_THRESHOLD
                    /* NOTE(review): statistics-only debug code. QP is not
                     * declared in the visible part of this function (only
                     * c->QP is used above), and the continue below also skips
                     * the pixel update for border pixels - confirm before
                     * enabling. */
                    __asm__ volatile("emms\n\t":);
                    {
                    static long long numPixels=0;
                    if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
//                    if((max-min)<20 || (max-min)*QP<200)
//                    if((max-min)*QP < 500)
//                    if(max-min<QP/2)
                    if(max-min < 20){
                        static int numSkipped=0;
                        static int errorSum=0;
                        static int worstQP=0;
                        static int worstRange=0;
                        static int worstDiff=0;
                        int diff= (f - *p);
                        int absDiff= FFABS(diff);
                        int error= diff*diff;

                        if(x==1 || x==8 || y==1 || y==8) continue;

                        numSkipped++;
                        if(absDiff > worstDiff){
                            worstDiff= absDiff;
                            worstQP= QP;
                            worstRange= max-min;
                        }
                        errorSum+= error;

                        if(1024LL*1024LL*1024LL % numSkipped == 0){
                            av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
                                   "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
                                   (float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
                                   worstDiff, (float)numSkipped/numPixels);
                        }
                    }
                    }
#endif
                /* move towards the smoothed value by at most QP2 */
                if (*p + QP2 < f) *p= *p + QP2;
                else if(*p - QP2 > f) *p= *p - QP2;
                else *p=f;
            }
        }
    }
#ifdef DEBUG_DERING_THRESHOLD
    /* debug aid: brighten blocks whose range is below 20 */
    if(max-min < 20){
        for(y=1; y<9; y++){
            int x;
            int t = 0;
            p= src + stride*y;
            for(x=1; x<9; x++){
                p++;
                *p = FFMIN(*p + 20, 255);
            }
        }
//        src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
    }
#endif
#endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
}
#endif //TEMPLATE_PP_ALTIVEC

/**
 * Deinterlace the given block by linearly interpolating every second line.
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1448 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1449 */ 1450static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) 1451{ 1452#if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW 1453 src+= 4*stride; 1454 __asm__ volatile( 1455 "lea (%0, %1), %%"REG_a" \n\t" 1456 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 1457// 0 1 2 3 4 5 6 7 8 9 1458// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 1459 1460 "movq (%0), %%mm0 \n\t" 1461 "movq (%%"REG_a", %1), %%mm1 \n\t" 1462 PAVGB(%%mm1, %%mm0) 1463 "movq %%mm0, (%%"REG_a") \n\t" 1464 "movq (%0, %1, 4), %%mm0 \n\t" 1465 PAVGB(%%mm0, %%mm1) 1466 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" 1467 "movq (%%"REG_c", %1), %%mm1 \n\t" 1468 PAVGB(%%mm1, %%mm0) 1469 "movq %%mm0, (%%"REG_c") \n\t" 1470 "movq (%0, %1, 8), %%mm0 \n\t" 1471 PAVGB(%%mm0, %%mm1) 1472 "movq %%mm1, (%%"REG_c", %1, 2) \n\t" 1473 1474 : : "r" (src), "r" ((x86_reg)stride) 1475 : "%"REG_a, "%"REG_c 1476 ); 1477#else 1478 int a, b, x; 1479 src+= 4*stride; 1480 1481 for(x=0; x<2; x++){ 1482 a= *(uint32_t*)&src[stride*0]; 1483 b= *(uint32_t*)&src[stride*2]; 1484 *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1485 a= *(uint32_t*)&src[stride*4]; 1486 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1487 b= *(uint32_t*)&src[stride*6]; 1488 *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1489 a= *(uint32_t*)&src[stride*8]; 1490 *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1491 src += 4; 1492 } 1493#endif 1494} 1495 1496/** 1497 * Deinterlace the given block by cubic interpolating every second line. 1498 * will be called for every 8x8 block and can read & write from line 4-15 1499 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. 
* lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * this filter will read lines 3-15 and write lines 6, 8, 10 and 12
 *
 * Each written line is (-a + 9b + 9d - e)/16 of the lines 3 above (a),
 * 1 above (b), 1 below (d) and 3 below (e) it, saturated to 0..255.
 *
 * @param src    first line (line 0) of the area described above, 8 pixels wide
 * @param stride offset between vertically adjacent pixels
 */
static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
{
#if TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
    src+= stride*3;
    /* NOTE(review): the asm stores to the picture but lists neither an output
     * operand nor a "memory" clobber - confirm this is intentional. */
    __asm__ volatile(
        "lea (%0, %1), %%"REG_a" \n\t"
        "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
        "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t"
        "add %1, %%"REG_c" \n\t"
#if TEMPLATE_PP_SSE2
        "pxor %%xmm7, %%xmm7 \n\t"
#define REAL_DEINT_CUBIC(a,b,c,d,e)\
        "movq " #a ", %%xmm0 \n\t"\
        "movq " #b ", %%xmm1 \n\t"\
        "movq " #d ", %%xmm2 \n\t"\
        "movq " #e ", %%xmm3 \n\t"\
        "pavgb %%xmm2, %%xmm1 \n\t"\
        "pavgb %%xmm3, %%xmm0 \n\t"\
        "punpcklbw %%xmm7, %%xmm0 \n\t"\
        "punpcklbw %%xmm7, %%xmm1 \n\t"\
        "psubw %%xmm1, %%xmm0 \n\t"\
        "psraw $3, %%xmm0 \n\t"\
        "psubw %%xmm0, %%xmm1 \n\t"\
        "packuswb %%xmm1, %%xmm1 \n\t"\
        "movlps %%xmm1, " #c " \n\t"
#else //TEMPLATE_PP_SSE2
        "pxor %%mm7, %%mm7 \n\t"
//      0       1       2       3       4       5       6       7       8       9       10
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx

#define REAL_DEINT_CUBIC(a,b,c,d,e)\
        "movq " #a ", %%mm0 \n\t"\
        "movq " #b ", %%mm1 \n\t"\
        "movq " #d ", %%mm2 \n\t"\
        "movq " #e ", %%mm3 \n\t"\
        PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
        PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
        "movq %%mm0, %%mm2 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm2 \n\t"\
        "movq %%mm1, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm1 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
        "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
        "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
        "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
        "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
        "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
        "packuswb %%mm3, %%mm1 \n\t"\
        "movq %%mm1, " #c " \n\t"
#endif //TEMPLATE_PP_SSE2
#define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)

/* arguments: a, b = lines above, c = destination line, d, e = lines below */
DEINT_CUBIC((%0) , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1))
DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%0, %1, 8))
DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc))
DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2))

        : : "r" (src), "r" ((x86_reg)stride)
        :
#if TEMPLATE_PP_SSE2
        XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm7",)
#endif
        "%"REG_a, "%"REG_d, "%"REG_c
    );
#undef REAL_DEINT_CUBIC
#else //TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
    int x;
    src+= stride*3;
    /* one column per iteration; rows are relative to src + 3*stride */
    for(x=0; x<8; x++){
        src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
        src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
        src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
        src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
        src++;
    }
#endif //TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
}

/**
 * Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter.
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1587 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1588 * this filter will read lines 4-13 and write 5-11 1589 */ 1590static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) 1591{ 1592#if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW 1593 src+= stride*4; 1594 __asm__ volatile( 1595 "lea (%0, %1), %%"REG_a" \n\t" 1596 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1597 "pxor %%mm7, %%mm7 \n\t" 1598 "movq (%2), %%mm0 \n\t" 1599// 0 1 2 3 4 5 6 7 8 9 10 1600// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx 1601 1602#define REAL_DEINT_FF(a,b,c,d)\ 1603 "movq " #a ", %%mm1 \n\t"\ 1604 "movq " #b ", %%mm2 \n\t"\ 1605 "movq " #c ", %%mm3 \n\t"\ 1606 "movq " #d ", %%mm4 \n\t"\ 1607 PAVGB(%%mm3, %%mm1) \ 1608 PAVGB(%%mm4, %%mm0) \ 1609 "movq %%mm0, %%mm3 \n\t"\ 1610 "punpcklbw %%mm7, %%mm0 \n\t"\ 1611 "punpckhbw %%mm7, %%mm3 \n\t"\ 1612 "movq %%mm1, %%mm4 \n\t"\ 1613 "punpcklbw %%mm7, %%mm1 \n\t"\ 1614 "punpckhbw %%mm7, %%mm4 \n\t"\ 1615 "psllw $2, %%mm1 \n\t"\ 1616 "psllw $2, %%mm4 \n\t"\ 1617 "psubw %%mm0, %%mm1 \n\t"\ 1618 "psubw %%mm3, %%mm4 \n\t"\ 1619 "movq %%mm2, %%mm5 \n\t"\ 1620 "movq %%mm2, %%mm0 \n\t"\ 1621 "punpcklbw %%mm7, %%mm2 \n\t"\ 1622 "punpckhbw %%mm7, %%mm5 \n\t"\ 1623 "paddw %%mm2, %%mm1 \n\t"\ 1624 "paddw %%mm5, %%mm4 \n\t"\ 1625 "psraw $2, %%mm1 \n\t"\ 1626 "psraw $2, %%mm4 \n\t"\ 1627 "packuswb %%mm4, %%mm1 \n\t"\ 1628 "movq %%mm1, " #b " \n\t"\ 1629 1630#define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d) 1631 1632DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2)) 1633DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) ) 1634DEINT_FF((%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2)) 1635DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4)) 1636 1637 "movq %%mm0, (%2) \n\t" 1638 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp) 1639 : "%"REG_a, "%"REG_d 1640 ); 1641#else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW 1642 int x; 1643 src+= stride*4; 1644 
for(x=0; x<8; x++){ 1645 int t1= tmp[x]; 1646 int t2= src[stride*1]; 1647 1648 src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3); 1649 t1= src[stride*4]; 1650 src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3); 1651 t2= src[stride*6]; 1652 src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3); 1653 t1= src[stride*8]; 1654 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); 1655 tmp[x]= t1; 1656 1657 src++; 1658 } 1659#endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW 1660} 1661 1662/** 1663 * Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter. 1664 * will be called for every 8x8 block and can read & write from line 4-15 1665 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. 1666 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1667 * this filter will read lines 4-13 and write 4-11 1668 */ 1669static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) 1670{ 1671#if (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS 1672 src+= stride*4; 1673 __asm__ volatile( 1674 "lea (%0, %1), %%"REG_a" \n\t" 1675 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1676 "pxor %%mm7, %%mm7 \n\t" 1677 "movq (%2), %%mm0 \n\t" 1678 "movq (%3), %%mm1 \n\t" 1679// 0 1 2 3 4 5 6 7 8 9 10 1680// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx 1681 1682#define REAL_DEINT_L5(t1,t2,a,b,c)\ 1683 "movq " #a ", %%mm2 \n\t"\ 1684 "movq " #b ", %%mm3 \n\t"\ 1685 "movq " #c ", %%mm4 \n\t"\ 1686 PAVGB(t2, %%mm3) \ 1687 PAVGB(t1, %%mm4) \ 1688 "movq %%mm2, %%mm5 \n\t"\ 1689 "movq %%mm2, " #t1 " \n\t"\ 1690 "punpcklbw %%mm7, %%mm2 \n\t"\ 1691 "punpckhbw %%mm7, %%mm5 \n\t"\ 1692 "movq %%mm2, %%mm6 \n\t"\ 1693 "paddw %%mm2, %%mm2 \n\t"\ 1694 "paddw %%mm6, %%mm2 \n\t"\ 1695 "movq %%mm5, 
%%mm6 \n\t"\ 1696 "paddw %%mm5, %%mm5 \n\t"\ 1697 "paddw %%mm6, %%mm5 \n\t"\ 1698 "movq %%mm3, %%mm6 \n\t"\ 1699 "punpcklbw %%mm7, %%mm3 \n\t"\ 1700 "punpckhbw %%mm7, %%mm6 \n\t"\ 1701 "paddw %%mm3, %%mm3 \n\t"\ 1702 "paddw %%mm6, %%mm6 \n\t"\ 1703 "paddw %%mm3, %%mm2 \n\t"\ 1704 "paddw %%mm6, %%mm5 \n\t"\ 1705 "movq %%mm4, %%mm6 \n\t"\ 1706 "punpcklbw %%mm7, %%mm4 \n\t"\ 1707 "punpckhbw %%mm7, %%mm6 \n\t"\ 1708 "psubw %%mm4, %%mm2 \n\t"\ 1709 "psubw %%mm6, %%mm5 \n\t"\ 1710 "psraw $2, %%mm2 \n\t"\ 1711 "psraw $2, %%mm5 \n\t"\ 1712 "packuswb %%mm5, %%mm2 \n\t"\ 1713 "movq %%mm2, " #a " \n\t"\ 1714 1715#define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c) 1716 1717DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) ) 1718DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2)) 1719DEINT_L5(%%mm0, %%mm1, (%%REGa, %1) , (%%REGa, %1, 2), (%0, %1, 4) ) 1720DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) ) 1721DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) ) 1722DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2)) 1723DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) ) 1724DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4)) 1725 1726 "movq %%mm0, (%2) \n\t" 1727 "movq %%mm1, (%3) \n\t" 1728 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2) 1729 : "%"REG_a, "%"REG_d 1730 ); 1731#else //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS 1732 int x; 1733 src+= stride*4; 1734 for(x=0; x<8; x++){ 1735 int t1= tmp[x]; 1736 int t2= tmp2[x]; 1737 int t3= src[0]; 1738 1739 src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3); 1740 t1= src[stride*1]; 1741 src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3); 1742 t2= src[stride*2]; 1743 src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3); 1744 t3= src[stride*3]; 1745 src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 
6*t3 + 4)>>3); 1746 t1= src[stride*4]; 1747 src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3); 1748 t2= src[stride*5]; 1749 src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3); 1750 t3= src[stride*6]; 1751 src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3); 1752 t1= src[stride*7]; 1753 src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3); 1754 1755 tmp[x]= t3; 1756 tmp2[x]= t1; 1757 1758 src++; 1759 } 1760#endif //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS 1761} 1762 1763/** 1764 * Deinterlace the given block by filtering all lines with a (1 2 1) filter. 1765 * will be called for every 8x8 block and can read & write from line 4-15 1766 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. 1767 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1768 * this filter will read lines 4-13 and write 4-11 1769 */ 1770static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp) 1771{ 1772#if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW 1773 src+= 4*stride; 1774 __asm__ volatile( 1775 "lea (%0, %1), %%"REG_a" \n\t" 1776 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1777// 0 1 2 3 4 5 6 7 8 9 1778// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1779 1780 "movq (%2), %%mm0 \n\t" // L0 1781 "movq (%%"REG_a"), %%mm1 \n\t" // L2 1782 PAVGB(%%mm1, %%mm0) // L0+L2 1783 "movq (%0), %%mm2 \n\t" // L1 1784 PAVGB(%%mm2, %%mm0) 1785 "movq %%mm0, (%0) \n\t" 1786 "movq (%%"REG_a", %1), %%mm0 \n\t" // L3 1787 PAVGB(%%mm0, %%mm2) // L1+L3 1788 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 1789 "movq %%mm2, (%%"REG_a") \n\t" 1790 "movq (%%"REG_a", %1, 2), %%mm2 \n\t" // L4 1791 PAVGB(%%mm2, %%mm1) // L2+L4 1792 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 1793 "movq %%mm1, (%%"REG_a", %1) \n\t" 1794 "movq (%0, %1, 4), %%mm1 \n\t" // L5 1795 PAVGB(%%mm1, %%mm0) 
// L3+L5 1796 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 1797 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" 1798 "movq (%%"REG_d"), %%mm0 \n\t" // L6 1799 PAVGB(%%mm0, %%mm2) // L4+L6 1800 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 1801 "movq %%mm2, (%0, %1, 4) \n\t" 1802 "movq (%%"REG_d", %1), %%mm2 \n\t" // L7 1803 PAVGB(%%mm2, %%mm1) // L5+L7 1804 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 1805 "movq %%mm1, (%%"REG_d") \n\t" 1806 "movq (%%"REG_d", %1, 2), %%mm1 \n\t" // L8 1807 PAVGB(%%mm1, %%mm0) // L6+L8 1808 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 1809 "movq %%mm0, (%%"REG_d", %1) \n\t" 1810 "movq (%0, %1, 8), %%mm0 \n\t" // L9 1811 PAVGB(%%mm0, %%mm2) // L7+L9 1812 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 1813 "movq %%mm2, (%%"REG_d", %1, 2) \n\t" 1814 "movq %%mm1, (%2) \n\t" 1815 1816 : : "r" (src), "r" ((x86_reg)stride), "r" (tmp) 1817 : "%"REG_a, "%"REG_d 1818 ); 1819#else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW 1820 int a, b, c, x; 1821 src+= 4*stride; 1822 1823 for(x=0; x<2; x++){ 1824 a= *(uint32_t*)&tmp[stride*0]; 1825 b= *(uint32_t*)&src[stride*0]; 1826 c= *(uint32_t*)&src[stride*1]; 1827 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); 1828 *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1829 1830 a= *(uint32_t*)&src[stride*2]; 1831 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); 1832 *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); 1833 1834 b= *(uint32_t*)&src[stride*3]; 1835 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); 1836 *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); 1837 1838 c= *(uint32_t*)&src[stride*4]; 1839 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); 1840 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1841 1842 a= *(uint32_t*)&src[stride*5]; 1843 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); 1844 *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); 1845 1846 b= *(uint32_t*)&src[stride*6]; 1847 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); 1848 *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); 1849 1850 c= *(uint32_t*)&src[stride*7]; 
1851 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); 1852 *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1853 1854 a= *(uint32_t*)&src[stride*8]; 1855 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); 1856 *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); 1857 1858 *(uint32_t*)&tmp[stride*0]= c; 1859 src += 4; 1860 tmp += 4; 1861 } 1862#endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW 1863} 1864 1865/** 1866 * Deinterlace the given block by applying a median filter to every second line. 1867 * will be called for every 8x8 block and can read & write from line 4-15, 1868 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. 1869 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1870 */ 1871static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) 1872{ 1873#if TEMPLATE_PP_MMX 1874 src+= 4*stride; 1875#if TEMPLATE_PP_MMXEXT 1876 __asm__ volatile( 1877 "lea (%0, %1), %%"REG_a" \n\t" 1878 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1879// 0 1 2 3 4 5 6 7 8 9 1880// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1881 1882 "movq (%0), %%mm0 \n\t" 1883 "movq (%%"REG_a", %1), %%mm2 \n\t" 1884 "movq (%%"REG_a"), %%mm1 \n\t" 1885 "movq %%mm0, %%mm3 \n\t" 1886 "pmaxub %%mm1, %%mm0 \n\t" 1887 "pminub %%mm3, %%mm1 \n\t" 1888 "pmaxub %%mm2, %%mm1 \n\t" 1889 "pminub %%mm1, %%mm0 \n\t" 1890 "movq %%mm0, (%%"REG_a") \n\t" 1891 1892 "movq (%0, %1, 4), %%mm0 \n\t" 1893 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" 1894 "movq %%mm2, %%mm3 \n\t" 1895 "pmaxub %%mm1, %%mm2 \n\t" 1896 "pminub %%mm3, %%mm1 \n\t" 1897 "pmaxub %%mm0, %%mm1 \n\t" 1898 "pminub %%mm1, %%mm2 \n\t" 1899 "movq %%mm2, (%%"REG_a", %1, 2) \n\t" 1900 1901 "movq (%%"REG_d"), %%mm2 \n\t" 1902 "movq (%%"REG_d", %1), %%mm1 \n\t" 1903 "movq %%mm2, %%mm3 \n\t" 1904 "pmaxub %%mm0, %%mm2 \n\t" 1905 "pminub %%mm3, %%mm0 \n\t" 1906 "pmaxub %%mm1, %%mm0 \n\t" 1907 "pminub %%mm0, %%mm2 \n\t" 1908 "movq %%mm2, (%%"REG_d") \n\t" 1909 
1910 "movq (%%"REG_d", %1, 2), %%mm2 \n\t" 1911 "movq (%0, %1, 8), %%mm0 \n\t" 1912 "movq %%mm2, %%mm3 \n\t" 1913 "pmaxub %%mm0, %%mm2 \n\t" 1914 "pminub %%mm3, %%mm0 \n\t" 1915 "pmaxub %%mm1, %%mm0 \n\t" 1916 "pminub %%mm0, %%mm2 \n\t" 1917 "movq %%mm2, (%%"REG_d", %1, 2) \n\t" 1918 1919 1920 : : "r" (src), "r" ((x86_reg)stride) 1921 : "%"REG_a, "%"REG_d 1922 ); 1923 1924#else // MMX without MMX2 1925 __asm__ volatile( 1926 "lea (%0, %1), %%"REG_a" \n\t" 1927 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1928// 0 1 2 3 4 5 6 7 8 9 1929// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1930 "pxor %%mm7, %%mm7 \n\t" 1931 1932#define REAL_MEDIAN(a,b,c)\ 1933 "movq " #a ", %%mm0 \n\t"\ 1934 "movq " #b ", %%mm2 \n\t"\ 1935 "movq " #c ", %%mm1 \n\t"\ 1936 "movq %%mm0, %%mm3 \n\t"\ 1937 "movq %%mm1, %%mm4 \n\t"\ 1938 "movq %%mm2, %%mm5 \n\t"\ 1939 "psubusb %%mm1, %%mm3 \n\t"\ 1940 "psubusb %%mm2, %%mm4 \n\t"\ 1941 "psubusb %%mm0, %%mm5 \n\t"\ 1942 "pcmpeqb %%mm7, %%mm3 \n\t"\ 1943 "pcmpeqb %%mm7, %%mm4 \n\t"\ 1944 "pcmpeqb %%mm7, %%mm5 \n\t"\ 1945 "movq %%mm3, %%mm6 \n\t"\ 1946 "pxor %%mm4, %%mm3 \n\t"\ 1947 "pxor %%mm5, %%mm4 \n\t"\ 1948 "pxor %%mm6, %%mm5 \n\t"\ 1949 "por %%mm3, %%mm1 \n\t"\ 1950 "por %%mm4, %%mm2 \n\t"\ 1951 "por %%mm5, %%mm0 \n\t"\ 1952 "pand %%mm2, %%mm0 \n\t"\ 1953 "pand %%mm1, %%mm0 \n\t"\ 1954 "movq %%mm0, " #b " \n\t" 1955#define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c) 1956 1957MEDIAN((%0) , (%%REGa) , (%%REGa, %1)) 1958MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4)) 1959MEDIAN((%0, %1, 4) , (%%REGd) , (%%REGd, %1)) 1960MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8)) 1961 1962 : : "r" (src), "r" ((x86_reg)stride) 1963 : "%"REG_a, "%"REG_d 1964 ); 1965#endif //TEMPLATE_PP_MMXEXT 1966#else //TEMPLATE_PP_MMX 1967 int x, y; 1968 src+= 4*stride; 1969 // FIXME - there should be a way to do a few columns in parallel like w/mmx 1970 for(x=0; x<8; x++){ 1971 uint8_t *colsrc = src; 1972 for (y=0; y<4; y++){ 1973 int a, b, c, d, e, f; 1974 a = 
#if TEMPLATE_PP_MMX
/**
 * Transpose and shift the given 8x8 Block into dst1 and dst2.
 *
 * Source column k (k = 0..7) becomes an 8-byte output row. With the offsets
 * used below (16 bytes per output row):
 *  - columns 0..4 are written to dst1 at byte offsets 128 + 16*k
 *  - columns 3..7 are written to dst2 at byte offsets  48 + 16*(k-3)
 * Source rows 0-3 fill the first 4 bytes of each output row, source rows 4-7
 * the last 4 bytes.
 *
 * @param dst1      first destination buffer (written at offsets 128..199)
 * @param dst2      second destination buffer (written at offsets 48..119)
 * @param src       top-left byte of the 8x8 source block
 * @param srcStride distance in bytes between two consecutive lines of src
 */
static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
{
    // No output operands, so GCC treats this asm as implicitly volatile.
    __asm__(
        "lea (%0, %1), %%"REG_a" \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
        // source rows 0-3
        "movq (%0), %%mm0 \n\t" // 12345678
        "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh
        "movq %%mm0, %%mm2 \n\t" // 12345678
        "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
        "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h

        "movq (%%"REG_a", %1), %%mm1 \n\t"
        "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "punpcklbw %%mm3, %%mm1 \n\t"
        "punpckhbw %%mm3, %%mm4 \n\t"

        "movq %%mm0, %%mm3 \n\t"
        "punpcklwd %%mm1, %%mm0 \n\t"
        "punpckhwd %%mm1, %%mm3 \n\t"
        "movq %%mm2, %%mm1 \n\t"
        "punpcklwd %%mm4, %%mm2 \n\t"
        "punpckhwd %%mm4, %%mm1 \n\t"

        "movd %%mm0, 128(%2) \n\t"
        "psrlq $32, %%mm0 \n\t"
        "movd %%mm0, 144(%2) \n\t"
        "movd %%mm3, 160(%2) \n\t"
        "psrlq $32, %%mm3 \n\t"
        "movd %%mm3, 176(%2) \n\t"
        "movd %%mm3, 48(%3) \n\t"
        "movd %%mm2, 192(%2) \n\t"
        "movd %%mm2, 64(%3) \n\t"
        "psrlq $32, %%mm2 \n\t"
        "movd %%mm2, 80(%3) \n\t"
        "movd %%mm1, 96(%3) \n\t"
        "psrlq $32, %%mm1 \n\t"
        "movd %%mm1, 112(%3) \n\t"

        "lea (%%"REG_a", %1, 4), %%"REG_a" \n\t"

        // source rows 4-7
        "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
        "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh
        "movq %%mm0, %%mm2 \n\t" // 12345678
        "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
        "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h

        "movq (%%"REG_a", %1), %%mm1 \n\t"
        "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "punpcklbw %%mm3, %%mm1 \n\t"
        "punpckhbw %%mm3, %%mm4 \n\t"

        "movq %%mm0, %%mm3 \n\t"
        "punpcklwd %%mm1, %%mm0 \n\t"
        "punpckhwd %%mm1, %%mm3 \n\t"
        "movq %%mm2, %%mm1 \n\t"
        "punpcklwd %%mm4, %%mm2 \n\t"
        "punpckhwd %%mm4, %%mm1 \n\t"

        "movd %%mm0, 132(%2) \n\t"
        "psrlq $32, %%mm0 \n\t"
        "movd %%mm0, 148(%2) \n\t"
        "movd %%mm3, 164(%2) \n\t"
        "psrlq $32, %%mm3 \n\t"
        "movd %%mm3, 180(%2) \n\t"
        "movd %%mm3, 52(%3) \n\t"
        "movd %%mm2, 196(%2) \n\t"
        "movd %%mm2, 68(%3) \n\t"
        "psrlq $32, %%mm2 \n\t"
        "movd %%mm2, 84(%3) \n\t"
        "movd %%mm1, 100(%3) \n\t"
        "psrlq $32, %%mm1 \n\t"
        "movd %%mm1, 116(%3) \n\t"


        :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
        // "memory": src is read and dst1/dst2 are written through pointer
        // inputs, so the compiler must not cache or reorder those accesses.
        : "%"REG_a, "memory"
    );
}
#endif //TEMPLATE_PP_MMX
#if TEMPLATE_PP_MMX
/**
 * Transpose the given 8x8 block.
 *
 * The source is read as 8 rows of 8 bytes spaced 16 bytes apart
 * (offsets 0, 16, ..., 112 from src). Source column k is written as row k
 * of dst: source rows 0-3 supply bytes 0-3 of every destination row and
 * source rows 4-7 supply bytes 4-7.
 *
 * @param dst       top-left byte of the 8x8 destination block
 * @param dstStride distance in bytes between two consecutive lines of dst
 * @param src       source buffer with a fixed 16-byte row pitch
 */
static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
{
    // No output operands, so GCC treats this asm as implicitly volatile.
    __asm__(
        "lea (%0, %1), %%"REG_a" \n\t"
        "lea (%%"REG_a",%1,4), %%"REG_d" \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
        // source rows 0-3 -> bytes 0-3 of destination rows 0-7
        "movq (%2), %%mm0 \n\t" // 12345678
        "movq 16(%2), %%mm1 \n\t" // abcdefgh
        "movq %%mm0, %%mm2 \n\t" // 12345678
        "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
        "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h

        "movq 32(%2), %%mm1 \n\t"
        "movq 48(%2), %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "punpcklbw %%mm3, %%mm1 \n\t"
        "punpckhbw %%mm3, %%mm4 \n\t"

        "movq %%mm0, %%mm3 \n\t"
        "punpcklwd %%mm1, %%mm0 \n\t"
        "punpckhwd %%mm1, %%mm3 \n\t"
        "movq %%mm2, %%mm1 \n\t"
        "punpcklwd %%mm4, %%mm2 \n\t"
        "punpckhwd %%mm4, %%mm1 \n\t"

        "movd %%mm0, (%0) \n\t"
        "psrlq $32, %%mm0 \n\t"
        "movd %%mm0, (%%"REG_a") \n\t"
        "movd %%mm3, (%%"REG_a", %1) \n\t"
        "psrlq $32, %%mm3 \n\t"
        "movd %%mm3, (%%"REG_a", %1, 2) \n\t"
        "movd %%mm2, (%0, %1, 4) \n\t"
        "psrlq $32, %%mm2 \n\t"
        "movd %%mm2, (%%"REG_d") \n\t"
        "movd %%mm1, (%%"REG_d", %1) \n\t"
        "psrlq $32, %%mm1 \n\t"
        "movd %%mm1, (%%"REG_d", %1, 2) \n\t"


        // source rows 4-7 -> bytes 4-7 of destination rows 0-7
        "movq 64(%2), %%mm0 \n\t" // 12345678
        "movq 80(%2), %%mm1 \n\t" // abcdefgh
        "movq %%mm0, %%mm2 \n\t" // 12345678
        "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
        "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h

        "movq 96(%2), %%mm1 \n\t"
        "movq 112(%2), %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "punpcklbw %%mm3, %%mm1 \n\t"
        "punpckhbw %%mm3, %%mm4 \n\t"

        "movq %%mm0, %%mm3 \n\t"
        "punpcklwd %%mm1, %%mm0 \n\t"
        "punpckhwd %%mm1, %%mm3 \n\t"
        "movq %%mm2, %%mm1 \n\t"
        "punpcklwd %%mm4, %%mm2 \n\t"
        "punpckhwd %%mm4, %%mm1 \n\t"

        "movd %%mm0, 4(%0) \n\t"
        "psrlq $32, %%mm0 \n\t"
        "movd %%mm0, 4(%%"REG_a") \n\t"
        "movd %%mm3, 4(%%"REG_a", %1) \n\t"
        "psrlq $32, %%mm3 \n\t"
        "movd %%mm3, 4(%%"REG_a", %1, 2) \n\t"
        "movd %%mm2, 4(%0, %1, 4) \n\t"
        "psrlq $32, %%mm2 \n\t"
        "movd %%mm2, 4(%%"REG_d") \n\t"
        "movd %%mm1, 4(%%"REG_d", %1) \n\t"
        "psrlq $32, %%mm1 \n\t"
        "movd %%mm1, 4(%%"REG_d", %1, 2) \n\t"

        :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
        // "memory": src is read and dst is written through pointer inputs,
        // so the compiler must not cache or reorder those accesses.
        : "%"REG_a, "%"REG_d, "memory"
    );
}
#endif //TEMPLATE_PP_MMX
/**
 * Temporal noise reducer for one 8x8 block.
 *
 * Measures how much the current block (src) differs from the temporally
 * blurred reference (tempBlurred), smooths that measure with the values
 * stored for neighbouring blocks in tempBlurredPast, and then, depending on
 * the three thresholds in maxNoise, either copies src into tempBlurred or
 * blends the two and writes the blend to both buffers.
 *
 * @param src             top-left byte of the current 8x8 block (may be modified)
 * @param stride          distance in bytes between two lines, used for both
 *                        src and tempBlurred
 * @param tempBlurred     reference block at the same position (always updated)
 * @param tempBlurredPast per-block difference history; entries at -256, -1,
 *                        +1 and +256 relative to this pointer are read, entry
 *                        0 is written, entries 127..129 are overwritten with
 *                        the thresholds
 * @param maxNoise        three thresholds, maxNoise[0..2]
 *
 * NOTE(review): the asm path stores the smoothed difference into
 * *tempBlurredPast while the C path stores the raw sum of squares (i);
 * the asm also branches with unsigned "below" tests (d < maxNoise[1]) where
 * the C path tests d > maxNoise[1]. Confirm whether this is intended.
 */
static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
                                    uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise)
{
    // to save a register (FIXME do this outside of the loops)
    // The asm below reads these back at byte offsets 508, 512 and 516
    // from tempBlurredPast.
    tempBlurredPast[127]= maxNoise[0];
    tempBlurredPast[128]= maxNoise[1];
    tempBlurredPast[129]= maxNoise[2];

#define FAST_L2_DIFF
//#define L1_DIFF // the thresholds need to be changed too if this one is tried
#if (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
    __asm__ volatile(
        "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride
        "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride
        "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
//      0       1       2       3       4       5       6       7       8       9
//      %x      %x+%2   %x+2%2  %x+eax  %x+4%2  %x+edx  %x+2eax %x+ecx  %x+8%2
//FIXME reorder?
// Step 1: accumulate the block difference in mm0 (L = src lines, R =
// tempBlurred lines).
#ifdef L1_DIFF //needs mmx2
        "movq (%0), %%mm0 \n\t" // L0
        "psadbw (%1), %%mm0 \n\t" // |L0-R0|
        "movq (%0, %2), %%mm1 \n\t" // L1
        "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
        "movq (%0, %2, 2), %%mm2 \n\t" // L2
        "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
        "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
        "psadbw (%1, %%"REG_a"), %%mm3 \n\t" // |L3-R3|

        "movq (%0, %2, 4), %%mm4 \n\t" // L4
        "paddw %%mm1, %%mm0 \n\t"
        "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
        "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
        "paddw %%mm2, %%mm0 \n\t"
        "psadbw (%1, %%"REG_d"), %%mm5 \n\t" // |L5-R5|
        "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
        "paddw %%mm3, %%mm0 \n\t"
        "psadbw (%1, %%"REG_a", 2), %%mm6 \n\t" // |L6-R6|
        "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
        "paddw %%mm4, %%mm0 \n\t"
        "psadbw (%1, %%"REG_c"), %%mm7 \n\t" // |L7-R7|
        "paddw %%mm5, %%mm6 \n\t"
        "paddw %%mm7, %%mm6 \n\t"
        "paddw %%mm6, %%mm0 \n\t"
#else //L1_DIFF
#if defined (FAST_L2_DIFF)
        // NOTE(review): this variant appears to approximate the squared
        // difference from a byte average of L and ~R - confirm.
        "pcmpeqb %%mm7, %%mm7 \n\t"
        "movq "MANGLE(b80)", %%mm6 \n\t"
        "pxor %%mm0, %%mm0 \n\t"
#define REAL_L2_DIFF_CORE(a, b)\
        "movq " #a ", %%mm5 \n\t"\
        "movq " #b ", %%mm2 \n\t"\
        "pxor %%mm7, %%mm2 \n\t"\
        PAVGB(%%mm2, %%mm5)\
        "paddb %%mm6, %%mm5 \n\t"\
        "movq %%mm5, %%mm2 \n\t"\
        "psllw $8, %%mm5 \n\t"\
        "pmaddwd %%mm5, %%mm5 \n\t"\
        "pmaddwd %%mm2, %%mm2 \n\t"\
        "paddd %%mm2, %%mm5 \n\t"\
        "psrld $14, %%mm5 \n\t"\
        "paddd %%mm5, %%mm0 \n\t"

#else //defined (FAST_L2_DIFF)
        // Exact variant: widen both lines to 16 bit, subtract, square and sum.
        "pxor %%mm7, %%mm7 \n\t"
        "pxor %%mm0, %%mm0 \n\t"
#define REAL_L2_DIFF_CORE(a, b)\
        "movq " #a ", %%mm5 \n\t"\
        "movq " #b ", %%mm2 \n\t"\
        "movq %%mm5, %%mm1 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm5 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "psubw %%mm2, %%mm5 \n\t"\
        "psubw %%mm3, %%mm1 \n\t"\
        "pmaddwd %%mm5, %%mm5 \n\t"\
        "pmaddwd %%mm1, %%mm1 \n\t"\
        "paddd %%mm1, %%mm5 \n\t"\
        "paddd %%mm5, %%mm0 \n\t"

#endif //defined (FAST_L2_DIFF)

// Two-level macro so that REGa/REGd/REGc expand before stringification.
#define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)

L2_DIFF_CORE((%0) , (%1))
L2_DIFF_CORE((%0, %2) , (%1, %2))
L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
L2_DIFF_CORE((%0, %%REGa) , (%1, %%REGa))
L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
L2_DIFF_CORE((%0, %%REGd) , (%1, %%REGd))
L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2))
L2_DIFF_CORE((%0, %%REGc) , (%1, %%REGc))

#endif //L1_DIFF

        // Step 2: ecx = (4*d + the four neighbouring history entries + 4) >> 3,
        // stored back into *tempBlurredPast.
        "movq %%mm0, %%mm4 \n\t"
        "psrlq $32, %%mm0 \n\t"
        "paddd %%mm0, %%mm4 \n\t"
        "movd %%mm4, %%ecx \n\t"
        "shll $2, %%ecx \n\t"
        "mov %3, %%"REG_d" \n\t"
        "addl -4(%%"REG_d"), %%ecx \n\t"
        "addl 4(%%"REG_d"), %%ecx \n\t"
        "addl -1024(%%"REG_d"), %%ecx \n\t"
        "addl $4, %%ecx \n\t"
        "addl 1024(%%"REG_d"), %%ecx \n\t"
        "shrl $3, %%ecx \n\t"
        "movl %%ecx, (%%"REG_d") \n\t"

//      "mov %3, %%"REG_c" \n\t"
//      "mov %%"REG_c", test \n\t"
//      "jmp 4f \n\t"
        // Step 3: pick the filter. 512/516/508 are the byte offsets of
        // tempBlurredPast[128], [129] and [127], i.e. maxNoise[1], [2], [0].
        "cmpl 512(%%"REG_d"), %%ecx \n\t"
        " jb 2f \n\t"
        "cmpl 516(%%"REG_d"), %%ecx \n\t"
        " jb 1f \n\t"

        // Not below maxNoise[2]: copy src into tempBlurred unchanged.
        "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
        "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
        "movq (%0), %%mm0 \n\t" // L0
        "movq (%0, %2), %%mm1 \n\t" // L1
        "movq (%0, %2, 2), %%mm2 \n\t" // L2
        "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
        "movq (%0, %2, 4), %%mm4 \n\t" // L4
        "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
        "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
        "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
        "movq %%mm0, (%1) \n\t" // L0
        "movq %%mm1, (%1, %2) \n\t" // L1
        "movq %%mm2, (%1, %2, 2) \n\t" // L2
        "movq %%mm3, (%1, %%"REG_a") \n\t" // L3
        "movq %%mm4, (%1, %2, 4) \n\t" // L4
        "movq %%mm5, (%1, %%"REG_d") \n\t" // L5
        "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // L6
        "movq %%mm7, (%1, %%"REG_c") \n\t" // L7
        "jmp 4f \n\t"

        // One averaging pass: both buffers get avg(L, R).
        "1: \n\t"
        "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
        "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
        "movq (%0), %%mm0 \n\t" // L0
        PAVGB((%1), %%mm0) // L0
        "movq (%0, %2), %%mm1 \n\t" // L1
        PAVGB((%1, %2), %%mm1) // L1
        "movq (%0, %2, 2), %%mm2 \n\t" // L2
        PAVGB((%1, %2, 2), %%mm2) // L2
        "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
        PAVGB((%1, %%REGa), %%mm3) // L3
        "movq (%0, %2, 4), %%mm4 \n\t" // L4
        PAVGB((%1, %2, 4), %%mm4) // L4
        "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
        PAVGB((%1, %%REGd), %%mm5) // L5
        "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
        PAVGB((%1, %%REGa, 2), %%mm6) // L6
        "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
        PAVGB((%1, %%REGc), %%mm7) // L7
        "movq %%mm0, (%1) \n\t" // R0
        "movq %%mm1, (%1, %2) \n\t" // R1
        "movq %%mm2, (%1, %2, 2) \n\t" // R2
        "movq %%mm3, (%1, %%"REG_a") \n\t" // R3
        "movq %%mm4, (%1, %2, 4) \n\t" // R4
        "movq %%mm5, (%1, %%"REG_d") \n\t" // R5
        "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // R6
        "movq %%mm7, (%1, %%"REG_c") \n\t" // R7
        "movq %%mm0, (%0) \n\t" // L0
        "movq %%mm1, (%0, %2) \n\t" // L1
        "movq %%mm2, (%0, %2, 2) \n\t" // L2
        "movq %%mm3, (%0, %%"REG_a") \n\t" // L3
        "movq %%mm4, (%0, %2, 4) \n\t" // L4
        "movq %%mm5, (%0, %%"REG_d") \n\t" // L5
        "movq %%mm6, (%0, %%"REG_a", 2) \n\t" // L6
        "movq %%mm7, (%0, %%"REG_c") \n\t" // L7
        "jmp 4f \n\t"

        "2: \n\t"
        "cmpl 508(%%"REG_d"), %%ecx \n\t"
        " jb 3f \n\t"

        // Two averaging passes against R, four lines at a time.
        "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
        "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
        "movq (%0), %%mm0 \n\t" // L0
        "movq (%0, %2), %%mm1 \n\t" // L1
        "movq (%0, %2, 2), %%mm2 \n\t" // L2
        "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
        "movq (%1), %%mm4 \n\t" // R0
        "movq (%1, %2), %%mm5 \n\t" // R1
        "movq (%1, %2, 2), %%mm6 \n\t" // R2
        "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        "movq %%mm0, (%1) \n\t" // R0
        "movq %%mm1, (%1, %2) \n\t" // R1
        "movq %%mm2, (%1, %2, 2) \n\t" // R2
        "movq %%mm3, (%1, %%"REG_a") \n\t" // R3
        "movq %%mm0, (%0) \n\t" // L0
        "movq %%mm1, (%0, %2) \n\t" // L1
        "movq %%mm2, (%0, %2, 2) \n\t" // L2
        "movq %%mm3, (%0, %%"REG_a") \n\t" // L3

        "movq (%0, %2, 4), %%mm0 \n\t" // L4
        "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5
        "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6
        "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7
        "movq (%1, %2, 4), %%mm4 \n\t" // R4
        "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5
        "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6
        "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        "movq %%mm0, (%1, %2, 4) \n\t" // R4
        "movq %%mm1, (%1, %%"REG_d") \n\t" // R5
        "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6
        "movq %%mm3, (%1, %%"REG_c") \n\t" // R7
        "movq %%mm0, (%0, %2, 4) \n\t" // L4
        "movq %%mm1, (%0, %%"REG_d") \n\t" // L5
        "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6
        "movq %%mm3, (%0, %%"REG_c") \n\t" // L7
        "jmp 4f \n\t"

        // Three averaging passes against R, four lines at a time.
        "3: \n\t"
        "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
        "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
        "movq (%0), %%mm0 \n\t" // L0
        "movq (%0, %2), %%mm1 \n\t" // L1
        "movq (%0, %2, 2), %%mm2 \n\t" // L2
        "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
        "movq (%1), %%mm4 \n\t" // R0
        "movq (%1, %2), %%mm5 \n\t" // R1
        "movq (%1, %2, 2), %%mm6 \n\t" // R2
        "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        "movq %%mm0, (%1) \n\t" // R0
        "movq %%mm1, (%1, %2) \n\t" // R1
        "movq %%mm2, (%1, %2, 2) \n\t" // R2
        "movq %%mm3, (%1, %%"REG_a") \n\t" // R3
        "movq %%mm0, (%0) \n\t" // L0
        "movq %%mm1, (%0, %2) \n\t" // L1
        "movq %%mm2, (%0, %2, 2) \n\t" // L2
        "movq %%mm3, (%0, %%"REG_a") \n\t" // L3

        "movq (%0, %2, 4), %%mm0 \n\t" // L4
        "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5
        "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6
        "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7
        "movq (%1, %2, 4), %%mm4 \n\t" // R4
        "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5
        "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6
        "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        "movq %%mm0, (%1, %2, 4) \n\t" // R4
        "movq %%mm1, (%1, %%"REG_d") \n\t" // R5
        "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6
        "movq %%mm3, (%1, %%"REG_c") \n\t" // R7
        "movq %%mm0, (%0, %2, 4) \n\t" // L4
        "movq %%mm1, (%0, %%"REG_d") \n\t" // L5
        "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6
        "movq %%mm3, (%0, %%"REG_c") \n\t" // L7

        "4: \n\t"

        :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
          NAMED_CONSTRAINTS_ADD(b80)
        : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
    );
#else //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
{
    int y;
    int d=0;
//    int sysd=0;
    int i;

    // d = sum of squared differences between the reference and the current
    // block over all 8x8 pixels
    for(y=0; y<8; y++){
        int x;
        for(x=0; x<8; x++){
            int ref= tempBlurred[ x + y*stride ];
            int cur= src[ x + y*stride ];
            int d1=ref - cur;
//            if(x==0 || x==7) d1+= d1>>1;
//            if(y==0 || y==7) d1+= d1>>1;
//            d+= FFABS(d1);
            d+= d1*d1;
//            sysd+= d1;
        }
    }
    // Keep the raw value in i, then smooth d with the history entries at
    // -256, -1, +1 and +256: (4*d + neighbours + 4) >> 3.
    i=d;
    d=  (
        4*d
        +(*(tempBlurredPast-256))
        +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
        +(*(tempBlurredPast+256))
        +4)>>3;
    *tempBlurredPast=i;
//    ((*tempBlurredPast)*3 + d + 2)>>2;

/*
Switch between
 1  0  0  0  0  0  0  (0)
64 32 16  8  4  2  1  (1)
64 48 36 27 20 15 11 (33) (approx)
64 56 49 43 37 33 29 (200) (approx)
*/
    if(d > maxNoise[1]){
        if(d < maxNoise[2]){
            // blend 1:1 and write the result to both buffers
            for(y=0; y<8; y++){
                int x;
                for(x=0; x<8; x++){
                    int ref= tempBlurred[ x + y*stride ];
                    int cur= src[ x + y*stride ];
                    tempBlurred[ x + y*stride ]=
                    src[ x + y*stride ]=
                        (ref + cur + 1)>>1;
                }
            }
        }else{
            // no filtering: the reference simply follows the current block
            for(y=0; y<8; y++){
                int x;
                for(x=0; x<8; x++){
                    tempBlurred[ x + y*stride ]= src[ x + y*stride ];
                }
            }
        }
    }else{
        if(d < maxNoise[0]){
            // 7 parts reference, 1 part current
            for(y=0; y<8; y++){
                int x;
                for(x=0; x<8; x++){
                    int ref= tempBlurred[ x + y*stride ];
                    int cur= src[ x + y*stride ];
                    tempBlurred[ x + y*stride ]=
                    src[ x + y*stride ]=
                        (ref*7 + cur + 4)>>3;
                }
            }
        }else{
            // 3 parts reference, 1 part current
            for(y=0; y<8; y++){
                int x;
                for(x=0; x<8; x++){
                    int ref= tempBlurred[ x + y*stride ];
                    int cur= src[ x + y*stride ];
                    tempBlurred[ x + y*stride ]=
                    src[ x + y*stride ]=
                        (ref*3 + cur + 2)>>2;
                }
            }
        }
    }
}
#endif //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
}
]= 2533 (ref*3 + cur + 2)>>2; 2534 } 2535 } 2536 } 2537 } 2538} 2539#endif //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS 2540} 2541#endif //TEMPLATE_PP_ALTIVEC 2542 2543#if TEMPLATE_PP_MMX 2544/** 2545 * accurate deblock filter 2546 */ 2547static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ 2548 int64_t dc_mask, eq_mask, both_masks; 2549 int64_t sums[10*8*2]; 2550 src+= step*3; // src points to begin of the 8x8 Block 2551 //{ START_TIMER 2552 __asm__ volatile( 2553 "movq %0, %%mm7 \n\t" 2554 "movq %1, %%mm6 \n\t" 2555 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) 2556 ); 2557 2558 __asm__ volatile( 2559 "lea (%2, %3), %%"REG_a" \n\t" 2560// 0 1 2 3 4 5 6 7 8 9 2561// %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 2562 2563 "movq (%2), %%mm0 \n\t" 2564 "movq (%%"REG_a"), %%mm1 \n\t" 2565 "movq %%mm1, %%mm3 \n\t" 2566 "movq %%mm1, %%mm4 \n\t" 2567 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference 2568 "paddb %%mm7, %%mm0 \n\t" 2569 "pcmpgtb %%mm6, %%mm0 \n\t" 2570 2571 "movq (%%"REG_a",%3), %%mm2 \n\t" 2572 PMAXUB(%%mm2, %%mm4) 2573 PMINUB(%%mm2, %%mm3, %%mm5) 2574 "psubb %%mm2, %%mm1 \n\t" 2575 "paddb %%mm7, %%mm1 \n\t" 2576 "pcmpgtb %%mm6, %%mm1 \n\t" 2577 "paddb %%mm1, %%mm0 \n\t" 2578 2579 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 2580 PMAXUB(%%mm1, %%mm4) 2581 PMINUB(%%mm1, %%mm3, %%mm5) 2582 "psubb %%mm1, %%mm2 \n\t" 2583 "paddb %%mm7, %%mm2 \n\t" 2584 "pcmpgtb %%mm6, %%mm2 \n\t" 2585 "paddb %%mm2, %%mm0 \n\t" 2586 2587 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" 2588 2589 "movq (%2, %3, 4), %%mm2 \n\t" 2590 PMAXUB(%%mm2, %%mm4) 2591 PMINUB(%%mm2, %%mm3, %%mm5) 2592 "psubb %%mm2, %%mm1 \n\t" 2593 "paddb %%mm7, %%mm1 \n\t" 2594 "pcmpgtb %%mm6, %%mm1 \n\t" 2595 "paddb %%mm1, %%mm0 \n\t" 2596 2597 "movq (%%"REG_a"), %%mm1 \n\t" 2598 PMAXUB(%%mm1, %%mm4) 2599 PMINUB(%%mm1, %%mm3, %%mm5) 2600 "psubb %%mm1, %%mm2 \n\t" 2601 "paddb %%mm7, %%mm2 \n\t" 2602 "pcmpgtb %%mm6, %%mm2 
\n\t" 2603 "paddb %%mm2, %%mm0 \n\t" 2604 2605 "movq (%%"REG_a", %3), %%mm2 \n\t" 2606 PMAXUB(%%mm2, %%mm4) 2607 PMINUB(%%mm2, %%mm3, %%mm5) 2608 "psubb %%mm2, %%mm1 \n\t" 2609 "paddb %%mm7, %%mm1 \n\t" 2610 "pcmpgtb %%mm6, %%mm1 \n\t" 2611 "paddb %%mm1, %%mm0 \n\t" 2612 2613 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 2614 PMAXUB(%%mm1, %%mm4) 2615 PMINUB(%%mm1, %%mm3, %%mm5) 2616 "psubb %%mm1, %%mm2 \n\t" 2617 "paddb %%mm7, %%mm2 \n\t" 2618 "pcmpgtb %%mm6, %%mm2 \n\t" 2619 "paddb %%mm2, %%mm0 \n\t" 2620 2621 "movq (%2, %3, 8), %%mm2 \n\t" 2622 PMAXUB(%%mm2, %%mm4) 2623 PMINUB(%%mm2, %%mm3, %%mm5) 2624 "psubb %%mm2, %%mm1 \n\t" 2625 "paddb %%mm7, %%mm1 \n\t" 2626 "pcmpgtb %%mm6, %%mm1 \n\t" 2627 "paddb %%mm1, %%mm0 \n\t" 2628 2629 "movq (%%"REG_a", %3, 4), %%mm1 \n\t" 2630 "psubb %%mm1, %%mm2 \n\t" 2631 "paddb %%mm7, %%mm2 \n\t" 2632 "pcmpgtb %%mm6, %%mm2 \n\t" 2633 "paddb %%mm2, %%mm0 \n\t" 2634 "psubusb %%mm3, %%mm4 \n\t" 2635 2636 "pxor %%mm6, %%mm6 \n\t" 2637 "movq %4, %%mm7 \n\t" // QP,..., QP 2638 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 
2QP 2639 "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0 2640 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0 2641 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0 2642 "movq %%mm7, %1 \n\t" 2643 2644 "movq %5, %%mm7 \n\t" 2645 "punpcklbw %%mm7, %%mm7 \n\t" 2646 "punpcklbw %%mm7, %%mm7 \n\t" 2647 "punpcklbw %%mm7, %%mm7 \n\t" 2648 "psubb %%mm0, %%mm6 \n\t" 2649 "pcmpgtb %%mm7, %%mm6 \n\t" 2650 "movq %%mm6, %0 \n\t" 2651 2652 : "=m" (eq_mask), "=m" (dc_mask) 2653 : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) 2654 : "%"REG_a 2655 ); 2656 2657 both_masks = dc_mask & eq_mask; 2658 2659 if(both_masks){ 2660 x86_reg offset= -8*step; 2661 int64_t *temp_sums= sums; 2662 2663 __asm__ volatile( 2664 "movq %2, %%mm0 \n\t" // QP,..., QP 2665 "pxor %%mm4, %%mm4 \n\t" 2666 2667 "movq (%0), %%mm6 \n\t" 2668 "movq (%0, %1), %%mm5 \n\t" 2669 "movq %%mm5, %%mm1 \n\t" 2670 "movq %%mm6, %%mm2 \n\t" 2671 "psubusb %%mm6, %%mm5 \n\t" 2672 "psubusb %%mm1, %%mm2 \n\t" 2673 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 2674 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 2675 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF 2676 2677 "pxor %%mm6, %%mm1 \n\t" 2678 "pand %%mm0, %%mm1 \n\t" 2679 "pxor %%mm1, %%mm6 \n\t" 2680 // 0:QP 6:First 2681 2682 "movq (%0, %1, 8), %%mm5 \n\t" 2683 "add %1, %0 \n\t" // %0 points to line 1 not 0 2684 "movq (%0, %1, 8), %%mm7 \n\t" 2685 "movq %%mm5, %%mm1 \n\t" 2686 "movq %%mm7, %%mm2 \n\t" 2687 "psubusb %%mm7, %%mm5 \n\t" 2688 "psubusb %%mm1, %%mm2 \n\t" 2689 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 2690 "movq %2, %%mm0 \n\t" // QP,..., QP 2691 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 2692 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF 2693 2694 "pxor %%mm7, %%mm1 \n\t" 2695 "pand %%mm0, %%mm1 \n\t" 2696 "pxor %%mm1, %%mm7 \n\t" 2697 2698 "movq %%mm6, %%mm5 \n\t" 2699 "punpckhbw %%mm4, %%mm6 \n\t" 2700 "punpcklbw %%mm4, %%mm5 \n\t" 2701 // 4:0 5/6:First 7:Last 2702 2703 "movq %%mm5, %%mm0 \n\t" 2704 "movq %%mm6, %%mm1 
\n\t" 2705 "psllw $2, %%mm0 \n\t" 2706 "psllw $2, %%mm1 \n\t" 2707 "paddw "MANGLE(w04)", %%mm0 \n\t" 2708 "paddw "MANGLE(w04)", %%mm1 \n\t" 2709 2710#define NEXT\ 2711 "movq (%0), %%mm2 \n\t"\ 2712 "movq (%0), %%mm3 \n\t"\ 2713 "add %1, %0 \n\t"\ 2714 "punpcklbw %%mm4, %%mm2 \n\t"\ 2715 "punpckhbw %%mm4, %%mm3 \n\t"\ 2716 "paddw %%mm2, %%mm0 \n\t"\ 2717 "paddw %%mm3, %%mm1 \n\t" 2718 2719#define PREV\ 2720 "movq (%0), %%mm2 \n\t"\ 2721 "movq (%0), %%mm3 \n\t"\ 2722 "add %1, %0 \n\t"\ 2723 "punpcklbw %%mm4, %%mm2 \n\t"\ 2724 "punpckhbw %%mm4, %%mm3 \n\t"\ 2725 "psubw %%mm2, %%mm0 \n\t"\ 2726 "psubw %%mm3, %%mm1 \n\t" 2727 2728 2729 NEXT //0 2730 NEXT //1 2731 NEXT //2 2732 "movq %%mm0, (%3) \n\t" 2733 "movq %%mm1, 8(%3) \n\t" 2734 2735 NEXT //3 2736 "psubw %%mm5, %%mm0 \n\t" 2737 "psubw %%mm6, %%mm1 \n\t" 2738 "movq %%mm0, 16(%3) \n\t" 2739 "movq %%mm1, 24(%3) \n\t" 2740 2741 NEXT //4 2742 "psubw %%mm5, %%mm0 \n\t" 2743 "psubw %%mm6, %%mm1 \n\t" 2744 "movq %%mm0, 32(%3) \n\t" 2745 "movq %%mm1, 40(%3) \n\t" 2746 2747 NEXT //5 2748 "psubw %%mm5, %%mm0 \n\t" 2749 "psubw %%mm6, %%mm1 \n\t" 2750 "movq %%mm0, 48(%3) \n\t" 2751 "movq %%mm1, 56(%3) \n\t" 2752 2753 NEXT //6 2754 "psubw %%mm5, %%mm0 \n\t" 2755 "psubw %%mm6, %%mm1 \n\t" 2756 "movq %%mm0, 64(%3) \n\t" 2757 "movq %%mm1, 72(%3) \n\t" 2758 2759 "movq %%mm7, %%mm6 \n\t" 2760 "punpckhbw %%mm4, %%mm7 \n\t" 2761 "punpcklbw %%mm4, %%mm6 \n\t" 2762 2763 NEXT //7 2764 "mov %4, %0 \n\t" 2765 "add %1, %0 \n\t" 2766 PREV //0 2767 "movq %%mm0, 80(%3) \n\t" 2768 "movq %%mm1, 88(%3) \n\t" 2769 2770 PREV //1 2771 "paddw %%mm6, %%mm0 \n\t" 2772 "paddw %%mm7, %%mm1 \n\t" 2773 "movq %%mm0, 96(%3) \n\t" 2774 "movq %%mm1, 104(%3) \n\t" 2775 2776 PREV //2 2777 "paddw %%mm6, %%mm0 \n\t" 2778 "paddw %%mm7, %%mm1 \n\t" 2779 "movq %%mm0, 112(%3) \n\t" 2780 "movq %%mm1, 120(%3) \n\t" 2781 2782 PREV //3 2783 "paddw %%mm6, %%mm0 \n\t" 2784 "paddw %%mm7, %%mm1 \n\t" 2785 "movq %%mm0, 128(%3) \n\t" 2786 "movq %%mm1, 136(%3) \n\t" 2787 2788 
PREV //4 2789 "paddw %%mm6, %%mm0 \n\t" 2790 "paddw %%mm7, %%mm1 \n\t" 2791 "movq %%mm0, 144(%3) \n\t" 2792 "movq %%mm1, 152(%3) \n\t" 2793 2794 "mov %4, %0 \n\t" //FIXME 2795 2796 : "+&r"(src) 2797 : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src) 2798 NAMED_CONSTRAINTS_ADD(w04) 2799 ); 2800 2801 src+= step; // src points to begin of the 8x8 Block 2802 2803 __asm__ volatile( 2804 "movq %4, %%mm6 \n\t" 2805 "pcmpeqb %%mm5, %%mm5 \n\t" 2806 "pxor %%mm6, %%mm5 \n\t" 2807 "pxor %%mm7, %%mm7 \n\t" 2808 2809 "1: \n\t" 2810 "movq (%1), %%mm0 \n\t" 2811 "movq 8(%1), %%mm1 \n\t" 2812 "paddw 32(%1), %%mm0 \n\t" 2813 "paddw 40(%1), %%mm1 \n\t" 2814 "movq (%0, %3), %%mm2 \n\t" 2815 "movq %%mm2, %%mm3 \n\t" 2816 "movq %%mm2, %%mm4 \n\t" 2817 "punpcklbw %%mm7, %%mm2 \n\t" 2818 "punpckhbw %%mm7, %%mm3 \n\t" 2819 "paddw %%mm2, %%mm0 \n\t" 2820 "paddw %%mm3, %%mm1 \n\t" 2821 "paddw %%mm2, %%mm0 \n\t" 2822 "paddw %%mm3, %%mm1 \n\t" 2823 "psrlw $4, %%mm0 \n\t" 2824 "psrlw $4, %%mm1 \n\t" 2825 "packuswb %%mm1, %%mm0 \n\t" 2826 "pand %%mm6, %%mm0 \n\t" 2827 "pand %%mm5, %%mm4 \n\t" 2828 "por %%mm4, %%mm0 \n\t" 2829 "movq %%mm0, (%0, %3) \n\t" 2830 "add $16, %1 \n\t" 2831 "add %2, %0 \n\t" 2832 " js 1b \n\t" 2833 2834 : "+r"(offset), "+r"(temp_sums) 2835 : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks) 2836 ); 2837 }else 2838 src+= step; // src points to begin of the 8x8 Block 2839 2840 if(eq_mask != -1LL){ 2841 uint8_t *temp_src= src; 2842 DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars 2843 __asm__ volatile( 2844 "pxor %%mm7, %%mm7 \n\t" 2845// 0 1 2 3 4 5 6 7 8 9 2846// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1 2847 2848 "movq (%0), %%mm0 \n\t" 2849 "movq %%mm0, %%mm1 \n\t" 2850 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 2851 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 2852 2853 "movq (%0, %1), %%mm2 \n\t" 2854 "lea (%0, %1, 2), %%"REG_a" \n\t" 2855 "movq %%mm2, %%mm3 \n\t" 2856 "punpcklbw %%mm7, %%mm2 
\n\t" // low part of line 1 2857 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 2858 2859 "movq (%%"REG_a"), %%mm4 \n\t" 2860 "movq %%mm4, %%mm5 \n\t" 2861 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 2862 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 2863 2864 "paddw %%mm0, %%mm0 \n\t" // 2L0 2865 "paddw %%mm1, %%mm1 \n\t" // 2H0 2866 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 2867 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 2868 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 2869 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 2870 2871 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 2872 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 2873 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 2874 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 2875 2876 "movq (%%"REG_a", %1), %%mm2 \n\t" 2877 "movq %%mm2, %%mm3 \n\t" 2878 "punpcklbw %%mm7, %%mm2 \n\t" // L3 2879 "punpckhbw %%mm7, %%mm3 \n\t" // H3 2880 2881 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 2882 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 2883 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 2884 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 2885 "movq %%mm0, (%4) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 2886 "movq %%mm1, 8(%4) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 2887 2888 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 2889 "movq %%mm0, %%mm1 \n\t" 2890 "punpcklbw %%mm7, %%mm0 \n\t" // L4 2891 "punpckhbw %%mm7, %%mm1 \n\t" // H4 2892 2893 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 2894 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 2895 "movq %%mm2, 16(%4) \n\t" // L3 - L4 2896 "movq %%mm3, 24(%4) \n\t" // H3 - H4 2897 "paddw %%mm4, %%mm4 \n\t" // 2L2 2898 "paddw %%mm5, %%mm5 \n\t" // 2H2 2899 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 2900 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 2901 2902 "lea (%%"REG_a", %1), %0 \n\t" 2903 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 2904 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 2905 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 2906 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 2907//50 opcodes so far 2908 "movq (%0, %1, 2), %%mm2 
\n\t" 2909 "movq %%mm2, %%mm3 \n\t" 2910 "punpcklbw %%mm7, %%mm2 \n\t" // L5 2911 "punpckhbw %%mm7, %%mm3 \n\t" // H5 2912 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 2913 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 2914 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 2915 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 2916 2917 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 2918 "punpcklbw %%mm7, %%mm6 \n\t" // L6 2919 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 2920 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 2921 "punpckhbw %%mm7, %%mm6 \n\t" // H6 2922 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 2923 2924 "paddw %%mm0, %%mm0 \n\t" // 2L4 2925 "paddw %%mm1, %%mm1 \n\t" // 2H4 2926 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 2927 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 2928 2929 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 2930 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 2931 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 2932 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 2933 2934 "movq (%0, %1, 4), %%mm2 \n\t" 2935 "movq %%mm2, %%mm3 \n\t" 2936 "punpcklbw %%mm7, %%mm2 \n\t" // L7 2937 "punpckhbw %%mm7, %%mm3 \n\t" // H7 2938 2939 "paddw %%mm2, %%mm2 \n\t" // 2L7 2940 "paddw %%mm3, %%mm3 \n\t" // 2H7 2941 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 2942 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 2943 2944 "movq (%4), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 2945 "movq 8(%4), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 2946 2947#if TEMPLATE_PP_MMXEXT 2948 "movq %%mm7, %%mm6 \n\t" // 0 2949 "psubw %%mm0, %%mm6 \n\t" 2950 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 2951 "movq %%mm7, %%mm6 \n\t" // 0 2952 "psubw %%mm1, %%mm6 \n\t" 2953 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 2954 "movq %%mm7, %%mm6 \n\t" // 0 2955 "psubw %%mm2, %%mm6 \n\t" 2956 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 2957 "movq %%mm7, %%mm6 \n\t" // 0 2958 "psubw %%mm3, %%mm6 \n\t" 2959 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 2960#else 2961 "movq %%mm7, %%mm6 \n\t" // 0 
2962 "pcmpgtw %%mm0, %%mm6 \n\t" 2963 "pxor %%mm6, %%mm0 \n\t" 2964 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 2965 "movq %%mm7, %%mm6 \n\t" // 0 2966 "pcmpgtw %%mm1, %%mm6 \n\t" 2967 "pxor %%mm6, %%mm1 \n\t" 2968 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 2969 "movq %%mm7, %%mm6 \n\t" // 0 2970 "pcmpgtw %%mm2, %%mm6 \n\t" 2971 "pxor %%mm6, %%mm2 \n\t" 2972 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 2973 "movq %%mm7, %%mm6 \n\t" // 0 2974 "pcmpgtw %%mm3, %%mm6 \n\t" 2975 "pxor %%mm6, %%mm3 \n\t" 2976 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 2977#endif 2978 2979#if TEMPLATE_PP_MMXEXT 2980 "pminsw %%mm2, %%mm0 \n\t" 2981 "pminsw %%mm3, %%mm1 \n\t" 2982#else 2983 "movq %%mm0, %%mm6 \n\t" 2984 "psubusw %%mm2, %%mm6 \n\t" 2985 "psubw %%mm6, %%mm0 \n\t" 2986 "movq %%mm1, %%mm6 \n\t" 2987 "psubusw %%mm3, %%mm6 \n\t" 2988 "psubw %%mm6, %%mm1 \n\t" 2989#endif 2990 2991 "movd %2, %%mm2 \n\t" // QP 2992 "punpcklbw %%mm7, %%mm2 \n\t" 2993 2994 "movq %%mm7, %%mm6 \n\t" // 0 2995 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) 2996 "pxor %%mm6, %%mm4 \n\t" 2997 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| 2998 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) 2999 "pxor %%mm7, %%mm5 \n\t" 3000 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| 3001// 100 opcodes 3002 "psllw $3, %%mm2 \n\t" // 8QP 3003 "movq %%mm2, %%mm3 \n\t" // 8QP 3004 "pcmpgtw %%mm4, %%mm2 \n\t" 3005 "pcmpgtw %%mm5, %%mm3 \n\t" 3006 "pand %%mm2, %%mm4 \n\t" 3007 "pand %%mm3, %%mm5 \n\t" 3008 3009 3010 "psubusw %%mm0, %%mm4 \n\t" // hd 3011 "psubusw %%mm1, %%mm5 \n\t" // ld 3012 3013 3014 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 3015 "pmullw %%mm2, %%mm4 \n\t" 3016 "pmullw %%mm2, %%mm5 \n\t" 3017 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 3018 "paddw %%mm2, %%mm4 \n\t" 3019 "paddw %%mm2, %%mm5 \n\t" 3020 "psrlw $6, %%mm4 \n\t" 3021 "psrlw $6, %%mm5 \n\t" 3022 3023 "movq 16(%4), %%mm0 \n\t" // L3 - L4 3024 "movq 24(%4), %%mm1 \n\t" // H3 - H4 3025 
3026 "pxor %%mm2, %%mm2 \n\t" 3027 "pxor %%mm3, %%mm3 \n\t" 3028 3029 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) 3030 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) 3031 "pxor %%mm2, %%mm0 \n\t" 3032 "pxor %%mm3, %%mm1 \n\t" 3033 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| 3034 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| 3035 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 3036 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 3037 3038 "pxor %%mm6, %%mm2 \n\t" 3039 "pxor %%mm7, %%mm3 \n\t" 3040 "pand %%mm2, %%mm4 \n\t" 3041 "pand %%mm3, %%mm5 \n\t" 3042 3043#if TEMPLATE_PP_MMXEXT 3044 "pminsw %%mm0, %%mm4 \n\t" 3045 "pminsw %%mm1, %%mm5 \n\t" 3046#else 3047 "movq %%mm4, %%mm2 \n\t" 3048 "psubusw %%mm0, %%mm2 \n\t" 3049 "psubw %%mm2, %%mm4 \n\t" 3050 "movq %%mm5, %%mm2 \n\t" 3051 "psubusw %%mm1, %%mm2 \n\t" 3052 "psubw %%mm2, %%mm5 \n\t" 3053#endif 3054 "pxor %%mm6, %%mm4 \n\t" 3055 "pxor %%mm7, %%mm5 \n\t" 3056 "psubw %%mm6, %%mm4 \n\t" 3057 "psubw %%mm7, %%mm5 \n\t" 3058 "packsswb %%mm5, %%mm4 \n\t" 3059 "movq %3, %%mm1 \n\t" 3060 "pandn %%mm4, %%mm1 \n\t" 3061 "movq (%0), %%mm0 \n\t" 3062 "paddb %%mm1, %%mm0 \n\t" 3063 "movq %%mm0, (%0) \n\t" 3064 "movq (%0, %1), %%mm0 \n\t" 3065 "psubb %%mm1, %%mm0 \n\t" 3066 "movq %%mm0, (%0, %1) \n\t" 3067 3068 : "+r" (temp_src) 3069 : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp) 3070 NAMED_CONSTRAINTS_ADD(w05,w20) 3071 : "%"REG_a 3072 ); 3073 } 3074/*if(step==16){ 3075 STOP_TIMER("step16") 3076}else{ 3077 STOP_TIMER("stepX") 3078} 3079 } */ 3080} 3081#endif //TEMPLATE_PP_MMX 3082 3083static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 3084 const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); 3085 3086/** 3087 * Copy a block from src to dst and fixes the blacklevel. 
3088 * levelFix == 0 -> do not touch the brightness & contrast 3089 */ 3090#undef REAL_SCALED_CPY 3091#undef SCALED_CPY 3092 3093static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride, 3094 int levelFix, int64_t *packedOffsetAndScale) 3095{ 3096#if !TEMPLATE_PP_MMX || !HAVE_6REGS 3097 int i; 3098#endif 3099 if(levelFix){ 3100#if TEMPLATE_PP_MMX && HAVE_6REGS 3101 __asm__ volatile( 3102 "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset 3103 "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale 3104 "lea (%2,%4), %%"REG_a" \n\t" 3105 "lea (%3,%5), %%"REG_d" \n\t" 3106 "pxor %%mm4, %%mm4 \n\t" 3107#if TEMPLATE_PP_MMXEXT 3108#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ 3109 "movq " #src1 ", %%mm0 \n\t"\ 3110 "movq " #src1 ", %%mm5 \n\t"\ 3111 "movq " #src2 ", %%mm1 \n\t"\ 3112 "movq " #src2 ", %%mm6 \n\t"\ 3113 "punpcklbw %%mm0, %%mm0 \n\t"\ 3114 "punpckhbw %%mm5, %%mm5 \n\t"\ 3115 "punpcklbw %%mm1, %%mm1 \n\t"\ 3116 "punpckhbw %%mm6, %%mm6 \n\t"\ 3117 "pmulhuw %%mm3, %%mm0 \n\t"\ 3118 "pmulhuw %%mm3, %%mm5 \n\t"\ 3119 "pmulhuw %%mm3, %%mm1 \n\t"\ 3120 "pmulhuw %%mm3, %%mm6 \n\t"\ 3121 "psubw %%mm2, %%mm0 \n\t"\ 3122 "psubw %%mm2, %%mm5 \n\t"\ 3123 "psubw %%mm2, %%mm1 \n\t"\ 3124 "psubw %%mm2, %%mm6 \n\t"\ 3125 "packuswb %%mm5, %%mm0 \n\t"\ 3126 "packuswb %%mm6, %%mm1 \n\t"\ 3127 "movq %%mm0, " #dst1 " \n\t"\ 3128 "movq %%mm1, " #dst2 " \n\t"\ 3129 3130#else //TEMPLATE_PP_MMXEXT 3131#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ 3132 "movq " #src1 ", %%mm0 \n\t"\ 3133 "movq " #src1 ", %%mm5 \n\t"\ 3134 "punpcklbw %%mm4, %%mm0 \n\t"\ 3135 "punpckhbw %%mm4, %%mm5 \n\t"\ 3136 "psubw %%mm2, %%mm0 \n\t"\ 3137 "psubw %%mm2, %%mm5 \n\t"\ 3138 "movq " #src2 ", %%mm1 \n\t"\ 3139 "psllw $6, %%mm0 \n\t"\ 3140 "psllw $6, %%mm5 \n\t"\ 3141 "pmulhw %%mm3, %%mm0 \n\t"\ 3142 "movq " #src2 ", %%mm6 \n\t"\ 3143 "pmulhw %%mm3, %%mm5 \n\t"\ 3144 "punpcklbw %%mm4, %%mm1 \n\t"\ 3145 "punpckhbw %%mm4, %%mm6 \n\t"\ 3146 "psubw %%mm2, %%mm1 \n\t"\ 
3147 "psubw %%mm2, %%mm6 \n\t"\ 3148 "psllw $6, %%mm1 \n\t"\ 3149 "psllw $6, %%mm6 \n\t"\ 3150 "pmulhw %%mm3, %%mm1 \n\t"\ 3151 "pmulhw %%mm3, %%mm6 \n\t"\ 3152 "packuswb %%mm5, %%mm0 \n\t"\ 3153 "packuswb %%mm6, %%mm1 \n\t"\ 3154 "movq %%mm0, " #dst1 " \n\t"\ 3155 "movq %%mm1, " #dst2 " \n\t"\ 3156 3157#endif //TEMPLATE_PP_MMXEXT 3158#define SCALED_CPY(src1, src2, dst1, dst2)\ 3159 REAL_SCALED_CPY(src1, src2, dst1, dst2) 3160 3161SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) 3162SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2)) 3163SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4)) 3164 "lea (%%"REG_a",%4,4), %%"REG_a" \n\t" 3165 "lea (%%"REG_d",%5,4), %%"REG_d" \n\t" 3166SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2)) 3167 3168 3169 : "=&a" (packedOffsetAndScale) 3170 : "0" (packedOffsetAndScale), 3171 "r"(src), 3172 "r"(dst), 3173 "r" ((x86_reg)srcStride), 3174 "r" ((x86_reg)dstStride) 3175 : "%"REG_d 3176 ); 3177#else //TEMPLATE_PP_MMX && HAVE_6REGS 3178 for(i=0; i<8; i++) 3179 memcpy( &(dst[dstStride*i]), 3180 &(src[srcStride*i]), BLOCK_SIZE); 3181#endif //TEMPLATE_PP_MMX && HAVE_6REGS 3182 }else{ 3183#if TEMPLATE_PP_MMX && HAVE_6REGS 3184 __asm__ volatile( 3185 "lea (%0,%2), %%"REG_a" \n\t" 3186 "lea (%1,%3), %%"REG_d" \n\t" 3187 3188#define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \ 3189 "movq " #src1 ", %%mm0 \n\t"\ 3190 "movq " #src2 ", %%mm1 \n\t"\ 3191 "movq %%mm0, " #dst1 " \n\t"\ 3192 "movq %%mm1, " #dst2 " \n\t"\ 3193 3194#define SIMPLE_CPY(src1, src2, dst1, dst2)\ 3195 REAL_SIMPLE_CPY(src1, src2, dst1, dst2) 3196 3197SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) 3198SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2)) 3199SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4)) 3200 "lea (%%"REG_a",%2,4), %%"REG_a" \n\t" 3201 "lea (%%"REG_d",%3,4), %%"REG_d" \n\t" 3202SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2)) 3203 3204 : : 
"r" (src), 3205 "r" (dst), 3206 "r" ((x86_reg)srcStride), 3207 "r" ((x86_reg)dstStride) 3208 : "%"REG_a, "%"REG_d 3209 ); 3210#else //TEMPLATE_PP_MMX && HAVE_6REGS 3211 for(i=0; i<8; i++) 3212 memcpy( &(dst[dstStride*i]), 3213 &(src[srcStride*i]), BLOCK_SIZE); 3214#endif //TEMPLATE_PP_MMX && HAVE_6REGS 3215 } 3216} 3217 3218/** 3219 * Duplicate the given 8 src pixels ? times upward 3220 */ 3221static inline void RENAME(duplicate)(uint8_t src[], int stride) 3222{ 3223#if TEMPLATE_PP_MMX 3224 __asm__ volatile( 3225 "movq (%0), %%mm0 \n\t" 3226 "movq %%mm0, (%0, %1, 4) \n\t" 3227 "add %1, %0 \n\t" 3228 "movq %%mm0, (%0) \n\t" 3229 "movq %%mm0, (%0, %1) \n\t" 3230 "movq %%mm0, (%0, %1, 2) \n\t" 3231 "movq %%mm0, (%0, %1, 4) \n\t" 3232 : "+r" (src) 3233 : "r" ((x86_reg)-stride) 3234 ); 3235#else 3236 int i; 3237 uint8_t *p=src; 3238 for(i=0; i<5; i++){ 3239 p-= stride; 3240 memcpy(p, src, 8); 3241 } 3242#endif 3243} 3244 3245/** 3246 * Filter array of bytes (Y or U or V values) 3247 */ 3248static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 3249 const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2) 3250{ 3251 DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access 3252 int x,y; 3253#ifdef TEMPLATE_PP_TIME_MODE 3254 const int mode= TEMPLATE_PP_TIME_MODE; 3255#else 3256 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode; 3257#endif 3258 int black=0, white=255; // blackest black and whitest white in the picture 3259 int QPCorrecture= 256*256; 3260 3261 int copyAhead; 3262#if TEMPLATE_PP_MMX 3263 int i; 3264#endif 3265 3266 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; 3267 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; 3268 3269 //FIXME remove 3270 uint64_t * const yHistogram= c.yHistogram; 3271 uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride; 3272 uint8_t * const tempDst= (dstStride > 0 ? 
c.tempDst : c.tempDst - 23*dstStride) + 32; 3273 //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; 3274 3275#if TEMPLATE_PP_MMX 3276 for(i=0; i<57; i++){ 3277 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1; 3278 int threshold= offset*2 + 1; 3279 c.mmxDcOffset[i]= 0x7F - offset; 3280 c.mmxDcThreshold[i]= 0x7F - threshold; 3281 c.mmxDcOffset[i]*= 0x0101010101010101LL; 3282 c.mmxDcThreshold[i]*= 0x0101010101010101LL; 3283 } 3284#endif 3285 3286 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; 3287 else if( (mode & LINEAR_BLEND_DEINT_FILTER) 3288 || (mode & FFMPEG_DEINT_FILTER) 3289 || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14; 3290 else if( (mode & V_DEBLOCK) 3291 || (mode & LINEAR_IPOL_DEINT_FILTER) 3292 || (mode & MEDIAN_DEINT_FILTER) 3293 || (mode & V_A_DEBLOCK)) copyAhead=13; 3294 else if(mode & V_X1_FILTER) copyAhead=11; 3295// else if(mode & V_RK1_FILTER) copyAhead=10; 3296 else if(mode & DERING) copyAhead=9; 3297 else copyAhead=8; 3298 3299 copyAhead-= 8; 3300 3301 if(!isColor){ 3302 uint64_t sum= 0; 3303 int i; 3304 uint64_t maxClipped; 3305 uint64_t clipped; 3306 double scale; 3307 3308 c.frameNum++; 3309 // first frame is fscked so we ignore it 3310 if(c.frameNum == 1) yHistogram[0]= width*(uint64_t)height/64*15/256; 3311 3312 for(i=0; i<256; i++){ 3313 sum+= yHistogram[i]; 3314 } 3315 3316 /* We always get a completely black picture first. 
*/ 3317 maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold); 3318 3319 clipped= sum; 3320 for(black=255; black>0; black--){ 3321 if(clipped < maxClipped) break; 3322 clipped-= yHistogram[black]; 3323 } 3324 3325 clipped= sum; 3326 for(white=0; white<256; white++){ 3327 if(clipped < maxClipped) break; 3328 clipped-= yHistogram[white]; 3329 } 3330 3331 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); 3332 3333#if TEMPLATE_PP_MMXEXT 3334 c.packedYScale= (uint16_t)(scale*256.0 + 0.5); 3335 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; 3336#else 3337 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); 3338 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; 3339#endif 3340 3341 c.packedYOffset|= c.packedYOffset<<32; 3342 c.packedYOffset|= c.packedYOffset<<16; 3343 3344 c.packedYScale|= c.packedYScale<<32; 3345 c.packedYScale|= c.packedYScale<<16; 3346 3347 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); 3348 else QPCorrecture= 256*256; 3349 }else{ 3350 c.packedYScale= 0x0100010001000100LL; 3351 c.packedYOffset= 0; 3352 QPCorrecture= 256*256; 3353 } 3354 3355 /* copy & deinterlace first row of blocks */ 3356 y=-BLOCK_SIZE; 3357 { 3358 const uint8_t *srcBlock= &(src[y*srcStride]); 3359 uint8_t *dstBlock= tempDst + dstStride; 3360 3361 // From this point on it is guaranteed that we can read and write 16 lines downward 3362 // finish 1 block before the next otherwise we might have a problem 3363 // with the L1 Cache of the P4 ... 
or only a few blocks at a time or something 3364 for(x=0; x<width; x+=BLOCK_SIZE){ 3365 3366#if TEMPLATE_PP_MMXEXT && HAVE_6REGS 3367/* 3368 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); 3369 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); 3370 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); 3371 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); 3372*/ 3373 3374 __asm__( 3375 "mov %4, %%"REG_a" \n\t" 3376 "shr $2, %%"REG_a" \n\t" 3377 "and $6, %%"REG_a" \n\t" 3378 "add %5, %%"REG_a" \n\t" 3379 "mov %%"REG_a", %%"REG_d" \n\t" 3380 "imul %1, %%"REG_a" \n\t" 3381 "imul %3, %%"REG_d" \n\t" 3382 "prefetchnta 32(%%"REG_a", %0) \n\t" 3383 "prefetcht0 32(%%"REG_d", %2) \n\t" 3384 "add %1, %%"REG_a" \n\t" 3385 "add %3, %%"REG_d" \n\t" 3386 "prefetchnta 32(%%"REG_a", %0) \n\t" 3387 "prefetcht0 32(%%"REG_d", %2) \n\t" 3388 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), 3389 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) 3390 : "%"REG_a, "%"REG_d 3391 ); 3392 3393#elif TEMPLATE_PP_3DNOW 3394//FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ... 
3395/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); 3396 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); 3397 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 3398 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 3399*/ 3400#endif 3401 3402 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, 3403 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); 3404 3405 RENAME(duplicate)(dstBlock + dstStride*8, dstStride); 3406 3407 if(mode & LINEAR_IPOL_DEINT_FILTER) 3408 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); 3409 else if(mode & LINEAR_BLEND_DEINT_FILTER) 3410 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); 3411 else if(mode & MEDIAN_DEINT_FILTER) 3412 RENAME(deInterlaceMedian)(dstBlock, dstStride); 3413 else if(mode & CUBIC_IPOL_DEINT_FILTER) 3414 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); 3415 else if(mode & FFMPEG_DEINT_FILTER) 3416 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); 3417 else if(mode & LOWPASS5_DEINT_FILTER) 3418 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); 3419/* else if(mode & CUBIC_BLEND_DEINT_FILTER) 3420 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); 3421*/ 3422 dstBlock+=8; 3423 srcBlock+=8; 3424 } 3425 if(width==FFABS(dstStride)) 3426 linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride); 3427 else{ 3428 int i; 3429 for(i=0; i<copyAhead; i++){ 3430 memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width); 3431 } 3432 } 3433 } 3434 3435 for(y=0; y<height; y+=BLOCK_SIZE){ 3436 //1% speedup if these are here instead of the inner loop 3437 const uint8_t *srcBlock= &(src[y*srcStride]); 3438 uint8_t *dstBlock= &(dst[y*dstStride]); 3439#if TEMPLATE_PP_MMX 3440 uint8_t *tempBlock1= c.tempBlocks; 3441 uint8_t *tempBlock2= c.tempBlocks + 8; 3442#endif 3443 const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; 3444 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)]; 3445 int QP=0; 
3446 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards 3447 if not than use a temporary buffer */ 3448 if(y+15 >= height){ 3449 int i; 3450 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with 3451 blockcopy to dst later */ 3452 linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, 3453 FFMAX(height-y-copyAhead, 0), srcStride); 3454 3455 /* duplicate last line of src to fill the void up to line (copyAhead+7) */ 3456 for(i=FFMAX(height-y, 8); i<copyAhead+8; i++) 3457 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride)); 3458 3459 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ 3460 linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride); 3461 3462 /* duplicate last line of dst to fill the void up to line (copyAhead) */ 3463 for(i=height-y+1; i<=copyAhead; i++) 3464 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride)); 3465 3466 dstBlock= tempDst + dstStride; 3467 srcBlock= tempSrc; 3468 } 3469 3470 // From this point on it is guaranteed that we can read and write 16 lines downward 3471 // finish 1 block before the next otherwise we might have a problem 3472 // with the L1 Cache of the P4 ... 
or only a few blocks at a time or something 3473 for(x=0; x<width; x+=BLOCK_SIZE){ 3474 const int stride= dstStride; 3475#if TEMPLATE_PP_MMX 3476 uint8_t *tmpXchg; 3477#endif 3478 if(isColor){ 3479 QP= QPptr[x>>qpHShift]; 3480 c.nonBQP= nonBQPptr[x>>qpHShift]; 3481 }else{ 3482 QP= QPptr[x>>4]; 3483 QP= (QP* QPCorrecture + 256*128)>>16; 3484 c.nonBQP= nonBQPptr[x>>4]; 3485 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16; 3486 yHistogram[ srcBlock[srcStride*12 + 4] ]++; 3487 } 3488 c.QP= QP; 3489#if TEMPLATE_PP_MMX 3490 __asm__ volatile( 3491 "movd %1, %%mm7 \n\t" 3492 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP 3493 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP 3494 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP 3495 "movq %%mm7, %0 \n\t" 3496 : "=m" (c.pQPb) 3497 : "r" (QP) 3498 ); 3499#endif 3500 3501 3502#if TEMPLATE_PP_MMXEXT && HAVE_6REGS 3503/* 3504 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); 3505 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); 3506 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); 3507 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); 3508*/ 3509 3510 __asm__( 3511 "mov %4, %%"REG_a" \n\t" 3512 "shr $2, %%"REG_a" \n\t" 3513 "and $6, %%"REG_a" \n\t" 3514 "add %5, %%"REG_a" \n\t" 3515 "mov %%"REG_a", %%"REG_d" \n\t" 3516 "imul %1, %%"REG_a" \n\t" 3517 "imul %3, %%"REG_d" \n\t" 3518 "prefetchnta 32(%%"REG_a", %0) \n\t" 3519 "prefetcht0 32(%%"REG_d", %2) \n\t" 3520 "add %1, %%"REG_a" \n\t" 3521 "add %3, %%"REG_d" \n\t" 3522 "prefetchnta 32(%%"REG_a", %0) \n\t" 3523 "prefetcht0 32(%%"REG_d", %2) \n\t" 3524 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), 3525 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) 3526 : "%"REG_a, "%"REG_d 3527 ); 3528 3529#elif TEMPLATE_PP_3DNOW 3530//FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ... 
3531/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); 3532 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); 3533 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 3534 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 3535*/ 3536#endif 3537 3538 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, 3539 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); 3540 3541 if(mode & LINEAR_IPOL_DEINT_FILTER) 3542 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); 3543 else if(mode & LINEAR_BLEND_DEINT_FILTER) 3544 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); 3545 else if(mode & MEDIAN_DEINT_FILTER) 3546 RENAME(deInterlaceMedian)(dstBlock, dstStride); 3547 else if(mode & CUBIC_IPOL_DEINT_FILTER) 3548 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); 3549 else if(mode & FFMPEG_DEINT_FILTER) 3550 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); 3551 else if(mode & LOWPASS5_DEINT_FILTER) 3552 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); 3553/* else if(mode & CUBIC_BLEND_DEINT_FILTER) 3554 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); 3555*/ 3556 3557 /* only deblock if we have 2 blocks */ 3558 if(y + 8 < height){ 3559 if(mode & V_X1_FILTER) 3560 RENAME(vertX1Filter)(dstBlock, stride, &c); 3561 else if(mode & V_DEBLOCK){ 3562 const int t= RENAME(vertClassify)(dstBlock, stride, &c); 3563 3564 if(t==1) 3565 RENAME(doVertLowPass)(dstBlock, stride, &c); 3566 else if(t==2) 3567 RENAME(doVertDefFilter)(dstBlock, stride, &c); 3568 }else if(mode & V_A_DEBLOCK){ 3569 RENAME(do_a_deblock)(dstBlock, stride, 1, &c); 3570 } 3571 } 3572 3573#if TEMPLATE_PP_MMX 3574 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); 3575#endif 3576 /* check if we have a previous block to deblock it with dstBlock */ 3577 if(x - 8 >= 0){ 3578#if TEMPLATE_PP_MMX 3579 if(mode & H_X1_FILTER) 3580 RENAME(vertX1Filter)(tempBlock1, 16, &c); 3581 else 
if(mode & H_DEBLOCK){ 3582//START_TIMER 3583 const int t= RENAME(vertClassify)(tempBlock1, 16, &c); 3584//STOP_TIMER("dc & minmax") 3585 if(t==1) 3586 RENAME(doVertLowPass)(tempBlock1, 16, &c); 3587 else if(t==2) 3588 RENAME(doVertDefFilter)(tempBlock1, 16, &c); 3589 }else if(mode & H_A_DEBLOCK){ 3590 RENAME(do_a_deblock)(tempBlock1, 16, 1, &c); 3591 } 3592 3593 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); 3594 3595#else 3596 if(mode & H_X1_FILTER) 3597 horizX1Filter(dstBlock-4, stride, QP); 3598 else if(mode & H_DEBLOCK){ 3599#if TEMPLATE_PP_ALTIVEC 3600 DECLARE_ALIGNED(16, unsigned char, tempBlock)[272]; 3601 int t; 3602 transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride); 3603 3604 t = vertClassify_altivec(tempBlock-48, 16, &c); 3605 if(t==1) { 3606 doVertLowPass_altivec(tempBlock-48, 16, &c); 3607 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); 3608 } 3609 else if(t==2) { 3610 doVertDefFilter_altivec(tempBlock-48, 16, &c); 3611 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); 3612 } 3613#else 3614 const int t= RENAME(horizClassify)(dstBlock-4, stride, &c); 3615 3616 if(t==1) 3617 RENAME(doHorizLowPass)(dstBlock-4, stride, &c); 3618 else if(t==2) 3619 RENAME(doHorizDefFilter)(dstBlock-4, stride, &c); 3620#endif 3621 }else if(mode & H_A_DEBLOCK){ 3622 RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c); 3623 } 3624#endif //TEMPLATE_PP_MMX 3625 if(mode & DERING){ 3626 //FIXME filter first line 3627 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c); 3628 } 3629 3630 if(mode & TEMP_NOISE_FILTER) 3631 { 3632 RENAME(tempNoiseReducer)(dstBlock-8, stride, 3633 c.tempBlurred[isColor] + y*dstStride + x, 3634 c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256, 3635 c.ppMode.maxTmpNoise); 3636 } 3637 } 3638 3639 dstBlock+=8; 3640 srcBlock+=8; 3641 3642#if TEMPLATE_PP_MMX 3643 tmpXchg= tempBlock1; 3644 tempBlock1= tempBlock2; 3645 tempBlock2 = tmpXchg; 
3646#endif 3647 } 3648 3649 if(mode & DERING){ 3650 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c); 3651 } 3652 3653 if((mode & TEMP_NOISE_FILTER)){ 3654 RENAME(tempNoiseReducer)(dstBlock-8, dstStride, 3655 c.tempBlurred[isColor] + y*dstStride + x, 3656 c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256, 3657 c.ppMode.maxTmpNoise); 3658 } 3659 3660 /* did we use a tmp buffer for the last lines*/ 3661 if(y+15 >= height){ 3662 uint8_t *dstBlock= &(dst[y*dstStride]); 3663 if(width==FFABS(dstStride)) 3664 linecpy(dstBlock, tempDst + dstStride, height-y, dstStride); 3665 else{ 3666 int i; 3667 for(i=0; i<height-y; i++){ 3668 memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width); 3669 } 3670 } 3671 } 3672/* 3673 for(x=0; x<width; x+=32){ 3674 volatile int i; 3675 i+= dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] 3676 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] 3677 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; 3678 + dstBlock[x +13*dstStride] 3679 + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; 3680 }*/ 3681 } 3682#if TEMPLATE_PP_3DNOW 3683 __asm__ volatile("femms"); 3684#elif TEMPLATE_PP_MMX 3685 __asm__ volatile("emms"); 3686#endif 3687 3688#ifdef DEBUG_BRIGHTNESS 3689 if(!isColor){ 3690 int max=1; 3691 int i; 3692 for(i=0; i<256; i++) 3693 if(yHistogram[i] > max) max=yHistogram[i]; 3694 3695 for(i=1; i<256; i++){ 3696 int x; 3697 int start=yHistogram[i-1]/(max/256+1); 3698 int end=yHistogram[i]/(max/256+1); 3699 int inc= end > start ? 
1 : -1; 3700 for(x=start; x!=end+inc; x+=inc) 3701 dst[ i*dstStride + x]+=128; 3702 } 3703 3704 for(i=0; i<100; i+=2){ 3705 dst[ (white)*dstStride + i]+=128; 3706 dst[ (black)*dstStride + i]+=128; 3707 } 3708 } 3709#endif 3710 3711 *c2= c; //copy local context back 3712 3713} 3714 3715#undef RENAME 3716#undef TEMPLATE_PP_C 3717#undef TEMPLATE_PP_ALTIVEC 3718#undef TEMPLATE_PP_MMX 3719#undef TEMPLATE_PP_MMXEXT 3720#undef TEMPLATE_PP_3DNOW 3721#undef TEMPLATE_PP_SSE2 3722