1/* 2 * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) 3 * 4 * This file is part of Libav. 5 * 6 * Libav is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; either version 2 of the License, or 9 * (at your option) any later version. 10 * 11 * Libav is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with Libav; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21/** 22 * @file 23 * mmx/mmx2/3dnow postprocess code. 24 */ 25 26#include "libavutil/x86_cpu.h" 27 28#define ALIGN_MASK "$-8" 29 30#undef REAL_PAVGB 31#undef PAVGB 32#undef PMINUB 33#undef PMAXUB 34 35#if HAVE_MMX2 36#define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t" 37#elif HAVE_AMD3DNOW 38#define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" 39#endif 40#define PAVGB(a,b) REAL_PAVGB(a,b) 41 42#if HAVE_MMX2 43#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" 44#elif HAVE_MMX 45#define PMINUB(b,a,t) \ 46 "movq " #a ", " #t " \n\t"\ 47 "psubusb " #b ", " #t " \n\t"\ 48 "psubb " #t ", " #a " \n\t" 49#endif 50 51#if HAVE_MMX2 52#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" 53#elif HAVE_MMX 54#define PMAXUB(a,b) \ 55 "psubusb " #a ", " #b " \n\t"\ 56 "paddb " #a ", " #b " \n\t" 57#endif 58 59//FIXME? |255-0| = 1 (should not be a problem ...) 
60#if HAVE_MMX 61/** 62 * Check if the middle 8x8 Block in the given 8x16 block is flat 63 */ 64static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){ 65 int numEq= 0, dcOk; 66 src+= stride*4; // src points to begin of the 8x8 Block 67 __asm__ volatile( 68 "movq %0, %%mm7 \n\t" 69 "movq %1, %%mm6 \n\t" 70 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) 71 ); 72 73 __asm__ volatile( 74 "lea (%2, %3), %%"REG_a" \n\t" 75// 0 1 2 3 4 5 6 7 8 9 76// %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 77 78 "movq (%2), %%mm0 \n\t" 79 "movq (%%"REG_a"), %%mm1 \n\t" 80 "movq %%mm0, %%mm3 \n\t" 81 "movq %%mm0, %%mm4 \n\t" 82 PMAXUB(%%mm1, %%mm4) 83 PMINUB(%%mm1, %%mm3, %%mm5) 84 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece 85 "paddb %%mm7, %%mm0 \n\t" 86 "pcmpgtb %%mm6, %%mm0 \n\t" 87 88 "movq (%%"REG_a",%3), %%mm2 \n\t" 89 PMAXUB(%%mm2, %%mm4) 90 PMINUB(%%mm2, %%mm3, %%mm5) 91 "psubb %%mm2, %%mm1 \n\t" 92 "paddb %%mm7, %%mm1 \n\t" 93 "pcmpgtb %%mm6, %%mm1 \n\t" 94 "paddb %%mm1, %%mm0 \n\t" 95 96 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 97 PMAXUB(%%mm1, %%mm4) 98 PMINUB(%%mm1, %%mm3, %%mm5) 99 "psubb %%mm1, %%mm2 \n\t" 100 "paddb %%mm7, %%mm2 \n\t" 101 "pcmpgtb %%mm6, %%mm2 \n\t" 102 "paddb %%mm2, %%mm0 \n\t" 103 104 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" 105 106 "movq (%2, %3, 4), %%mm2 \n\t" 107 PMAXUB(%%mm2, %%mm4) 108 PMINUB(%%mm2, %%mm3, %%mm5) 109 "psubb %%mm2, %%mm1 \n\t" 110 "paddb %%mm7, %%mm1 \n\t" 111 "pcmpgtb %%mm6, %%mm1 \n\t" 112 "paddb %%mm1, %%mm0 \n\t" 113 114 "movq (%%"REG_a"), %%mm1 \n\t" 115 PMAXUB(%%mm1, %%mm4) 116 PMINUB(%%mm1, %%mm3, %%mm5) 117 "psubb %%mm1, %%mm2 \n\t" 118 "paddb %%mm7, %%mm2 \n\t" 119 "pcmpgtb %%mm6, %%mm2 \n\t" 120 "paddb %%mm2, %%mm0 \n\t" 121 122 "movq (%%"REG_a", %3), %%mm2 \n\t" 123 PMAXUB(%%mm2, %%mm4) 124 PMINUB(%%mm2, %%mm3, %%mm5) 125 "psubb %%mm2, %%mm1 \n\t" 126 "paddb %%mm7, %%mm1 \n\t" 127 "pcmpgtb %%mm6, %%mm1 \n\t" 128 "paddb %%mm1, %%mm0 \n\t" 129 130 "movq 
(%%"REG_a", %3, 2), %%mm1 \n\t" 131 PMAXUB(%%mm1, %%mm4) 132 PMINUB(%%mm1, %%mm3, %%mm5) 133 "psubb %%mm1, %%mm2 \n\t" 134 "paddb %%mm7, %%mm2 \n\t" 135 "pcmpgtb %%mm6, %%mm2 \n\t" 136 "paddb %%mm2, %%mm0 \n\t" 137 "psubusb %%mm3, %%mm4 \n\t" 138 139 " \n\t" 140#if HAVE_MMX2 141 "pxor %%mm7, %%mm7 \n\t" 142 "psadbw %%mm7, %%mm0 \n\t" 143#else 144 "movq %%mm0, %%mm1 \n\t" 145 "psrlw $8, %%mm0 \n\t" 146 "paddb %%mm1, %%mm0 \n\t" 147 "movq %%mm0, %%mm1 \n\t" 148 "psrlq $16, %%mm0 \n\t" 149 "paddb %%mm1, %%mm0 \n\t" 150 "movq %%mm0, %%mm1 \n\t" 151 "psrlq $32, %%mm0 \n\t" 152 "paddb %%mm1, %%mm0 \n\t" 153#endif 154 "movq %4, %%mm7 \n\t" // QP,..., QP 155 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP 156 "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0 157 "packssdw %%mm4, %%mm4 \n\t" 158 "movd %%mm0, %0 \n\t" 159 "movd %%mm4, %1 \n\t" 160 161 : "=r" (numEq), "=r" (dcOk) 162 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) 163 : "%"REG_a 164 ); 165 166 numEq= (-numEq) &0xFF; 167 if(numEq > c->ppMode.flatnessThreshold){ 168 if(dcOk) return 0; 169 else return 1; 170 }else{ 171 return 2; 172 } 173} 174#endif //HAVE_MMX 175 176/** 177 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) 178 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 179 */ 180#if !HAVE_ALTIVEC 181static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) 182{ 183#if HAVE_MMX2 || HAVE_AMD3DNOW 184 src+= stride*3; 185 __asm__ volatile( //"movv %0 %1 %2\n\t" 186 "movq %2, %%mm0 \n\t" // QP,..., QP 187 "pxor %%mm4, %%mm4 \n\t" 188 189 "movq (%0), %%mm6 \n\t" 190 "movq (%0, %1), %%mm5 \n\t" 191 "movq %%mm5, %%mm1 \n\t" 192 "movq %%mm6, %%mm2 \n\t" 193 "psubusb %%mm6, %%mm5 \n\t" 194 "psubusb %%mm1, %%mm2 \n\t" 195 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 196 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 197 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF 198 199 "pand %%mm2, %%mm6 \n\t" 200 "pandn %%mm1, %%mm2 \n\t" 201 "por %%mm2, %%mm6 
\n\t"// First Line to Filter 202 203 "movq (%0, %1, 8), %%mm5 \n\t" 204 "lea (%0, %1, 4), %%"REG_a" \n\t" 205 "lea (%0, %1, 8), %%"REG_c" \n\t" 206 "sub %1, %%"REG_c" \n\t" 207 "add %1, %0 \n\t" // %0 points to line 1 not 0 208 "movq (%0, %1, 8), %%mm7 \n\t" 209 "movq %%mm5, %%mm1 \n\t" 210 "movq %%mm7, %%mm2 \n\t" 211 "psubusb %%mm7, %%mm5 \n\t" 212 "psubusb %%mm1, %%mm2 \n\t" 213 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 214 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 215 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF 216 217 "pand %%mm2, %%mm7 \n\t" 218 "pandn %%mm1, %%mm2 \n\t" 219 "por %%mm2, %%mm7 \n\t" // First Line to Filter 220 221 222 // 1 2 3 4 5 6 7 8 223 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 224 // 6 4 2 2 1 1 225 // 6 4 4 2 226 // 6 8 2 227 228 "movq (%0, %1), %%mm0 \n\t" // 1 229 "movq %%mm0, %%mm1 \n\t" // 1 230 PAVGB(%%mm6, %%mm0) //1 1 /2 231 PAVGB(%%mm6, %%mm0) //3 1 /4 232 233 "movq (%0, %1, 4), %%mm2 \n\t" // 1 234 "movq %%mm2, %%mm5 \n\t" // 1 235 PAVGB((%%REGa), %%mm2) // 11 /2 236 PAVGB((%0, %1, 2), %%mm2) // 211 /4 237 "movq %%mm2, %%mm3 \n\t" // 211 /4 238 "movq (%0), %%mm4 \n\t" // 1 239 PAVGB(%%mm4, %%mm3) // 4 211 /8 240 PAVGB(%%mm0, %%mm3) //642211 /16 241 "movq %%mm3, (%0) \n\t" // X 242 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 243 "movq %%mm1, %%mm0 \n\t" // 1 244 PAVGB(%%mm6, %%mm0) //1 1 /2 245 "movq %%mm4, %%mm3 \n\t" // 1 246 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 247 PAVGB((%%REGa,%1,2), %%mm5) // 11 /2 248 PAVGB((%%REGa), %%mm5) // 211 /4 249 PAVGB(%%mm5, %%mm3) // 2 2211 /8 250 PAVGB(%%mm0, %%mm3) //4242211 /16 251 "movq %%mm3, (%0,%1) \n\t" // X 252 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 253 PAVGB(%%mm4, %%mm6) //11 /2 254 "movq (%%"REG_c"), %%mm0 \n\t" // 1 255 PAVGB((%%REGa, %1, 2), %%mm0) // 11/2 256 "movq %%mm0, %%mm3 \n\t" // 11/2 257 PAVGB(%%mm1, %%mm0) // 2 11/4 258 PAVGB(%%mm6, %%mm0) //222 11/8 259 PAVGB(%%mm2, %%mm0) //22242211/16 260 "movq (%0, %1, 2), %%mm2 \n\t" // 1 261 "movq %%mm0, 
(%0, %1, 2) \n\t" // X 262 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 263 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 264 PAVGB((%%REGc), %%mm0) // 11 /2 265 PAVGB(%%mm0, %%mm6) //11 11 /4 266 PAVGB(%%mm1, %%mm4) // 11 /2 267 PAVGB(%%mm2, %%mm1) // 11 /2 268 PAVGB(%%mm1, %%mm6) //1122 11 /8 269 PAVGB(%%mm5, %%mm6) //112242211 /16 270 "movq (%%"REG_a"), %%mm5 \n\t" // 1 271 "movq %%mm6, (%%"REG_a") \n\t" // X 272 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 273 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1 274 PAVGB(%%mm7, %%mm6) // 11 /2 275 PAVGB(%%mm4, %%mm6) // 11 11 /4 276 PAVGB(%%mm3, %%mm6) // 11 2211 /8 277 PAVGB(%%mm5, %%mm2) // 11 /2 278 "movq (%0, %1, 4), %%mm4 \n\t" // 1 279 PAVGB(%%mm4, %%mm2) // 112 /4 280 PAVGB(%%mm2, %%mm6) // 112242211 /16 281 "movq %%mm6, (%0, %1, 4) \n\t" // X 282 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 283 PAVGB(%%mm7, %%mm1) // 11 2 /4 284 PAVGB(%%mm4, %%mm5) // 11 /2 285 PAVGB(%%mm5, %%mm0) // 11 11 /4 286 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1 287 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 288 PAVGB(%%mm0, %%mm1) // 11224222 /16 289 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X 290 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 291 PAVGB((%%REGc), %%mm2) // 112 4 /8 292 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 293 PAVGB(%%mm0, %%mm6) // 1 1 /2 294 PAVGB(%%mm7, %%mm6) // 1 12 /4 295 PAVGB(%%mm2, %%mm6) // 1122424 /4 296 "movq %%mm6, (%%"REG_c") \n\t" // X 297 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 298 PAVGB(%%mm7, %%mm5) // 11 2 /4 299 PAVGB(%%mm7, %%mm5) // 11 6 /8 300 301 PAVGB(%%mm3, %%mm0) // 112 /4 302 PAVGB(%%mm0, %%mm5) // 112246 /16 303 "movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X 304 "sub %1, %0 \n\t" 305 306 : 307 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) 308 : "%"REG_a, "%"REG_c 309 ); 310#else //HAVE_MMX2 || HAVE_AMD3DNOW 311 const int l1= stride; 312 const int l2= stride + l1; 313 const int l3= stride + l2; 314 const int l4= stride + l3; 315 const int l5= 
stride + l4; 316 const int l6= stride + l5; 317 const int l7= stride + l6; 318 const int l8= stride + l7; 319 const int l9= stride + l8; 320 int x; 321 src+= stride*3; 322 for(x=0; x<BLOCK_SIZE; x++){ 323 const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; 324 const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; 325 326 int sums[10]; 327 sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4; 328 sums[1] = sums[0] - first + src[l4]; 329 sums[2] = sums[1] - first + src[l5]; 330 sums[3] = sums[2] - first + src[l6]; 331 sums[4] = sums[3] - first + src[l7]; 332 sums[5] = sums[4] - src[l1] + src[l8]; 333 sums[6] = sums[5] - src[l2] + last; 334 sums[7] = sums[6] - src[l3] + last; 335 sums[8] = sums[7] - src[l4] + last; 336 sums[9] = sums[8] - src[l5] + last; 337 338 src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4; 339 src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4; 340 src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4; 341 src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4; 342 src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4; 343 src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4; 344 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; 345 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; 346 347 src++; 348 } 349#endif //HAVE_MMX2 || HAVE_AMD3DNOW 350} 351#endif //HAVE_ALTIVEC 352 353/** 354 * Experimental Filter 1 355 * will not damage linear gradients 356 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter 357 * can only smooth blocks at the expected locations (it cannot smooth them if they did move) 358 * MMX2 version does correct clipping C version does not 359 */ 360static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) 361{ 362#if HAVE_MMX2 || HAVE_AMD3DNOW 363 src+= stride*3; 364 365 __asm__ volatile( 366 "pxor %%mm7, %%mm7 \n\t" // 0 367 "lea (%0, %1), %%"REG_a" \n\t" 368 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 369// 0 1 2 3 4 5 6 7 8 9 370// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 371 
"movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 372 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 373 "movq %%mm1, %%mm2 \n\t" // line 4 374 "psubusb %%mm0, %%mm1 \n\t" 375 "psubusb %%mm2, %%mm0 \n\t" 376 "por %%mm1, %%mm0 \n\t" // |l2 - l3| 377 "movq (%%"REG_c"), %%mm3 \n\t" // line 5 378 "movq (%%"REG_c", %1), %%mm4 \n\t" // line 6 379 "movq %%mm3, %%mm5 \n\t" // line 5 380 "psubusb %%mm4, %%mm3 \n\t" 381 "psubusb %%mm5, %%mm4 \n\t" 382 "por %%mm4, %%mm3 \n\t" // |l5 - l6| 383 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 384 "movq %%mm2, %%mm1 \n\t" // line 4 385 "psubusb %%mm5, %%mm2 \n\t" 386 "movq %%mm2, %%mm4 \n\t" 387 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 388 "psubusb %%mm1, %%mm5 \n\t" 389 "por %%mm5, %%mm4 \n\t" // |l4 - l5| 390 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) 391 "movq %%mm4, %%mm3 \n\t" // d 392 "movq %2, %%mm0 \n\t" 393 "paddusb %%mm0, %%mm0 \n\t" 394 "psubusb %%mm0, %%mm4 \n\t" 395 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 396 "psubusb "MANGLE(b01)", %%mm3 \n\t" 397 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 398 399 PAVGB(%%mm7, %%mm3) // d/2 400 "movq %%mm3, %%mm1 \n\t" // d/2 401 PAVGB(%%mm7, %%mm3) // d/4 402 PAVGB(%%mm1, %%mm3) // 3*d/8 403 404 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 405 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 406 "psubusb %%mm3, %%mm0 \n\t" 407 "pxor %%mm2, %%mm0 \n\t" 408 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 409 410 "movq (%%"REG_c"), %%mm0 \n\t" // line 5 411 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 412 "paddusb %%mm3, %%mm0 \n\t" 413 "pxor %%mm2, %%mm0 \n\t" 414 "movq %%mm0, (%%"REG_c") \n\t" // line 5 415 416 PAVGB(%%mm7, %%mm1) // d/4 417 418 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 419 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? 
-l4-1 : l4 420 "psubusb %%mm1, %%mm0 \n\t" 421 "pxor %%mm2, %%mm0 \n\t" 422 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3 423 424 "movq (%%"REG_c", %1), %%mm0 \n\t" // line 6 425 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 426 "paddusb %%mm1, %%mm0 \n\t" 427 "pxor %%mm2, %%mm0 \n\t" 428 "movq %%mm0, (%%"REG_c", %1) \n\t" // line 6 429 430 PAVGB(%%mm7, %%mm1) // d/8 431 432 "movq (%%"REG_a", %1), %%mm0 \n\t" // line 2 433 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 434 "psubusb %%mm1, %%mm0 \n\t" 435 "pxor %%mm2, %%mm0 \n\t" 436 "movq %%mm0, (%%"REG_a", %1) \n\t" // line 2 437 438 "movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7 439 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 440 "paddusb %%mm1, %%mm0 \n\t" 441 "pxor %%mm2, %%mm0 \n\t" 442 "movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7 443 444 : 445 : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb) 446 : "%"REG_a, "%"REG_c 447 ); 448#else //HAVE_MMX2 || HAVE_AMD3DNOW 449 450 const int l1= stride; 451 const int l2= stride + l1; 452 const int l3= stride + l2; 453 const int l4= stride + l3; 454 const int l5= stride + l4; 455 const int l6= stride + l5; 456 const int l7= stride + l6; 457// const int l8= stride + l7; 458// const int l9= stride + l8; 459 int x; 460 461 src+= stride*3; 462 for(x=0; x<BLOCK_SIZE; x++){ 463 int a= src[l3] - src[l4]; 464 int b= src[l4] - src[l5]; 465 int c= src[l5] - src[l6]; 466 467 int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1); 468 d= FFMAX(d, 0); 469 470 if(d < co->QP*2){ 471 int v = d * FFSIGN(-b); 472 473 src[l2] +=v>>3; 474 src[l3] +=v>>2; 475 src[l4] +=(3*v)>>3; 476 src[l5] -=(3*v)>>3; 477 src[l6] -=v>>2; 478 src[l7] -=v>>3; 479 } 480 src++; 481 } 482#endif //HAVE_MMX2 || HAVE_AMD3DNOW 483} 484 485#if !HAVE_ALTIVEC 486static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) 487{ 488#if HAVE_MMX2 || HAVE_AMD3DNOW 489/* 490 uint8_t tmp[16]; 491 const int l1= stride; 492 const int l2= stride + l1; 493 const int l3= stride + l2; 
494 const int l4= (int)tmp - (int)src - stride*3; 495 const int l5= (int)tmp - (int)src - stride*3 + 8; 496 const int l6= stride*3 + l3; 497 const int l7= stride + l6; 498 const int l8= stride + l7; 499 500 memcpy(tmp, src+stride*7, 8); 501 memcpy(tmp+8, src+stride*8, 8); 502*/ 503 src+= stride*4; 504 __asm__ volatile( 505 506#if 0 //slightly more accurate and slightly slower 507 "pxor %%mm7, %%mm7 \n\t" // 0 508 "lea (%0, %1), %%"REG_a" \n\t" 509 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 510// 0 1 2 3 4 5 6 7 511// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 512// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 513 514 515 "movq (%0, %1, 2), %%mm0 \n\t" // l2 516 "movq (%0), %%mm1 \n\t" // l0 517 "movq %%mm0, %%mm2 \n\t" // l2 518 PAVGB(%%mm7, %%mm0) // ~l2/2 519 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 520 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 521 522 "movq (%%"REG_a"), %%mm1 \n\t" // l1 523 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3 524 "movq %%mm1, %%mm4 \n\t" // l1 525 PAVGB(%%mm7, %%mm1) // ~l1/2 526 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 527 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 528 529 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 530 "psubusb %%mm1, %%mm0 \n\t" 531 "psubusb %%mm4, %%mm1 \n\t" 532 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 533// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 534 535 "movq (%0, %1, 4), %%mm0 \n\t" // l4 536 "movq %%mm0, %%mm4 \n\t" // l4 537 PAVGB(%%mm7, %%mm0) // ~l4/2 538 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 539 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 540 541 "movq (%%"REG_c"), %%mm2 \n\t" // l5 542 "movq %%mm3, %%mm5 \n\t" // l3 543 PAVGB(%%mm7, %%mm3) // ~l3/2 544 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 545 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 546 547 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 548 "psubusb %%mm3, %%mm0 \n\t" 549 "psubusb %%mm6, %%mm3 \n\t" 550 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 551 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) 552// mm0= SIGN(menergy), mm1= |lenergy|, 
mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 553 554 "movq (%%"REG_c", %1), %%mm6 \n\t" // l6 555 "movq %%mm6, %%mm5 \n\t" // l6 556 PAVGB(%%mm7, %%mm6) // ~l6/2 557 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 558 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 559 560 "movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7 561 "movq %%mm2, %%mm4 \n\t" // l5 562 PAVGB(%%mm7, %%mm2) // ~l5/2 563 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 564 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 565 566 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 567 "psubusb %%mm2, %%mm6 \n\t" 568 "psubusb %%mm4, %%mm2 \n\t" 569 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 570// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 571 572 573 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 574 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? 575 "paddusb "MANGLE(b01)", %%mm4 \n\t" 576 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP 577 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 578 "pand %%mm4, %%mm3 \n\t" 579 580 "movq %%mm3, %%mm1 \n\t" 581// "psubusb "MANGLE(b01)", %%mm3 \n\t" 582 PAVGB(%%mm7, %%mm3) 583 PAVGB(%%mm7, %%mm3) 584 "paddusb %%mm1, %%mm3 \n\t" 585// "paddusb "MANGLE(b01)", %%mm3 \n\t" 586 587 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3 588 "movq (%0, %1, 4), %%mm5 \n\t" //l4 589 "movq (%0, %1, 4), %%mm4 \n\t" //l4 590 "psubusb %%mm6, %%mm5 \n\t" 591 "psubusb %%mm4, %%mm6 \n\t" 592 "por %%mm6, %%mm5 \n\t" // |l3-l4| 593 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) 594 "pxor %%mm6, %%mm0 \n\t" 595 "pand %%mm0, %%mm3 \n\t" 596 PMINUB(%%mm5, %%mm3, %%mm0) 597 598 "psubusb "MANGLE(b01)", %%mm3 \n\t" 599 PAVGB(%%mm7, %%mm3) 600 601 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 602 "movq (%0, %1, 4), %%mm2 \n\t" 603 "pxor %%mm6, %%mm0 \n\t" 604 "pxor %%mm6, %%mm2 \n\t" 605 "psubb %%mm3, %%mm0 \n\t" 606 "paddb %%mm3, %%mm2 \n\t" 607 "pxor %%mm6, %%mm0 \n\t" 608 "pxor %%mm6, %%mm2 \n\t" 609 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" 610 "movq %%mm2, (%0, %1, 4) \n\t" 
611#endif //0 612 613 "lea (%0, %1), %%"REG_a" \n\t" 614 "pcmpeqb %%mm6, %%mm6 \n\t" // -1 615// 0 1 2 3 4 5 6 7 616// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 617// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 618 619 620 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3 621 "movq (%0, %1, 4), %%mm0 \n\t" // l4 622 "pxor %%mm6, %%mm1 \n\t" // -l3-1 623 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 624// mm1=-l3-1, mm0=128-q 625 626 "movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5 627 "movq (%%"REG_a", %1), %%mm3 \n\t" // l2 628 "pxor %%mm6, %%mm2 \n\t" // -l5-1 629 "movq %%mm2, %%mm5 \n\t" // -l5-1 630 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 631 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 632 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 633 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 634 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 635 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 636// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 637 638 "movq (%%"REG_a"), %%mm2 \n\t" // l1 639 "pxor %%mm6, %%mm2 \n\t" // -l1-1 640 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 641 PAVGB((%0), %%mm1) // (l0-l3+256)/2 642 "movq "MANGLE(b80)", %%mm3 \n\t" // 128 643 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 644 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 645 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 646// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 647 648 PAVGB((%%REGc, %1), %%mm5) // (l6-l5+256)/2 649 "movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7 650 "pxor %%mm6, %%mm1 \n\t" // -l7-1 651 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 652 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 653 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 654 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 655 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 656// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 657 658 "movq "MANGLE(b00)", %%mm1 \n\t" // 0 659 "movq "MANGLE(b00)", %%mm5 \n\t" // 0 660 "psubb %%mm2, %%mm1 \n\t" // 128 - 
renergy/16 661 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 662 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| 663 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| 664 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 665 666// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 667 668 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 669 "movq %2, %%mm2 \n\t" // QP 670 PAVGB(%%mm6, %%mm2) // 128 + QP/2 671 "psubb %%mm6, %%mm2 \n\t" 672 673 "movq %%mm4, %%mm1 \n\t" 674 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) 675 "pxor %%mm1, %%mm4 \n\t" 676 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 677 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 678 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 679// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 680 681 "movq %%mm4, %%mm3 \n\t" // d 682 "psubusb "MANGLE(b01)", %%mm4 \n\t" 683 PAVGB(%%mm7, %%mm4) // d/32 684 PAVGB(%%mm7, %%mm4) // (d + 32)/64 685 "paddb %%mm3, %%mm4 \n\t" // 5d/64 686 "pand %%mm2, %%mm4 \n\t" 687 688 "movq "MANGLE(b80)", %%mm5 \n\t" // 128 689 "psubb %%mm0, %%mm5 \n\t" // q 690 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding 691 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) 692 "pxor %%mm7, %%mm5 \n\t" 693 694 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) 695 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) 696 697 "pand %%mm7, %%mm4 \n\t" 698 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 699 "movq (%0, %1, 4), %%mm2 \n\t" 700 "pxor %%mm1, %%mm0 \n\t" 701 "pxor %%mm1, %%mm2 \n\t" 702 "paddb %%mm4, %%mm0 \n\t" 703 "psubb %%mm4, %%mm2 \n\t" 704 "pxor %%mm1, %%mm0 \n\t" 705 "pxor %%mm1, %%mm2 \n\t" 706 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" 707 "movq %%mm2, (%0, %1, 4) \n\t" 708 709 : 710 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) 711 : "%"REG_a, "%"REG_c 712 ); 713 714/* 715 { 716 int x; 717 src-= stride; 718 for(x=0; x<BLOCK_SIZE; x++){ 719 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); 720 if(FFABS(middleEnergy)< 8*QP){ 721 const 
int q=(src[l4] - src[l5])/2; 722 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); 723 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); 724 725 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 726 d= FFMAX(d, 0); 727 728 d= (5*d + 32) >> 6; 729 d*= FFSIGN(-middleEnergy); 730 731 if(q>0){ 732 d= d<0 ? 0 : d; 733 d= d>q ? q : d; 734 }else{ 735 d= d>0 ? 0 : d; 736 d= d<q ? q : d; 737 } 738 739 src[l4]-= d; 740 src[l5]+= d; 741 } 742 src++; 743 } 744 src-=8; 745 for(x=0; x<8; x++){ 746 int y; 747 for(y=4; y<6; y++){ 748 int d= src[x+y*stride] - tmp[x+(y-4)*8]; 749 int ad= FFABS(d); 750 static int max=0; 751 static int sum=0; 752 static int num=0; 753 static int bias=0; 754 755 if(max<ad) max=ad; 756 sum+= ad>3 ? 1 : 0; 757 if(ad>3){ 758 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; 759 } 760 if(y==4) bias+=d; 761 num++; 762 if(num%1000000 == 0){ 763 av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias); 764 } 765 } 766 } 767} 768*/ 769#elif HAVE_MMX 770 src+= stride*4; 771 __asm__ volatile( 772 "pxor %%mm7, %%mm7 \n\t" 773 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars 774 "and "ALIGN_MASK", %%"REG_c" \n\t" // align 775// 0 1 2 3 4 5 6 7 776// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 777// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 778 779 "movq (%0), %%mm0 \n\t" 780 "movq %%mm0, %%mm1 \n\t" 781 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 782 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 783 784 "movq (%0, %1), %%mm2 \n\t" 785 "lea (%0, %1, 2), %%"REG_a" \n\t" 786 "movq %%mm2, %%mm3 \n\t" 787 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 788 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 789 790 "movq (%%"REG_a"), %%mm4 \n\t" 791 "movq %%mm4, %%mm5 \n\t" 792 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 793 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 794 795 "paddw %%mm0, %%mm0 \n\t" // 2L0 796 "paddw 
%%mm1, %%mm1 \n\t" // 2H0 797 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 798 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 799 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 800 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 801 802 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 803 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 804 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 805 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 806 807 "movq (%%"REG_a", %1), %%mm2 \n\t" 808 "movq %%mm2, %%mm3 \n\t" 809 "punpcklbw %%mm7, %%mm2 \n\t" // L3 810 "punpckhbw %%mm7, %%mm3 \n\t" // H3 811 812 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 813 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 814 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 815 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 816 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3 817 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3 818 819 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 820 "movq %%mm0, %%mm1 \n\t" 821 "punpcklbw %%mm7, %%mm0 \n\t" // L4 822 "punpckhbw %%mm7, %%mm1 \n\t" // H4 823 824 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 825 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 826 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4 827 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4 828 "paddw %%mm4, %%mm4 \n\t" // 2L2 829 "paddw %%mm5, %%mm5 \n\t" // 2H2 830 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 831 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 832 833 "lea (%%"REG_a", %1), %0 \n\t" 834 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 835 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 836 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 837 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 838//50 opcodes so far 839 "movq (%0, %1, 2), %%mm2 \n\t" 840 "movq %%mm2, %%mm3 \n\t" 841 "punpcklbw %%mm7, %%mm2 \n\t" // L5 842 "punpckhbw %%mm7, %%mm3 \n\t" // H5 843 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 844 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 845 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 846 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 847 848 "movq 
(%%"REG_a", %1, 4), %%mm6 \n\t" 849 "punpcklbw %%mm7, %%mm6 \n\t" // L6 850 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 851 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 852 "punpckhbw %%mm7, %%mm6 \n\t" // H6 853 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 854 855 "paddw %%mm0, %%mm0 \n\t" // 2L4 856 "paddw %%mm1, %%mm1 \n\t" // 2H4 857 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 858 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 859 860 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 861 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 862 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 863 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 864 865 "movq (%0, %1, 4), %%mm2 \n\t" 866 "movq %%mm2, %%mm3 \n\t" 867 "punpcklbw %%mm7, %%mm2 \n\t" // L7 868 "punpckhbw %%mm7, %%mm3 \n\t" // H7 869 870 "paddw %%mm2, %%mm2 \n\t" // 2L7 871 "paddw %%mm3, %%mm3 \n\t" // 2H7 872 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 873 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 874 875 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 876 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 877 878#if HAVE_MMX2 879 "movq %%mm7, %%mm6 \n\t" // 0 880 "psubw %%mm0, %%mm6 \n\t" 881 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 882 "movq %%mm7, %%mm6 \n\t" // 0 883 "psubw %%mm1, %%mm6 \n\t" 884 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 885 "movq %%mm7, %%mm6 \n\t" // 0 886 "psubw %%mm2, %%mm6 \n\t" 887 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 888 "movq %%mm7, %%mm6 \n\t" // 0 889 "psubw %%mm3, %%mm6 \n\t" 890 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 891#else 892 "movq %%mm7, %%mm6 \n\t" // 0 893 "pcmpgtw %%mm0, %%mm6 \n\t" 894 "pxor %%mm6, %%mm0 \n\t" 895 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 896 "movq %%mm7, %%mm6 \n\t" // 0 897 "pcmpgtw %%mm1, %%mm6 \n\t" 898 "pxor %%mm6, %%mm1 \n\t" 899 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 900 "movq %%mm7, %%mm6 \n\t" // 0 901 "pcmpgtw %%mm2, %%mm6 \n\t" 902 "pxor %%mm6, %%mm2 \n\t" 903 "psubw %%mm6, %%mm2 \n\t" 
// |2L0 - 5L1 + 5L2 - 2L3| 904 "movq %%mm7, %%mm6 \n\t" // 0 905 "pcmpgtw %%mm3, %%mm6 \n\t" 906 "pxor %%mm6, %%mm3 \n\t" 907 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 908#endif 909 910#if HAVE_MMX2 911 "pminsw %%mm2, %%mm0 \n\t" 912 "pminsw %%mm3, %%mm1 \n\t" 913#else 914 "movq %%mm0, %%mm6 \n\t" 915 "psubusw %%mm2, %%mm6 \n\t" 916 "psubw %%mm6, %%mm0 \n\t" 917 "movq %%mm1, %%mm6 \n\t" 918 "psubusw %%mm3, %%mm6 \n\t" 919 "psubw %%mm6, %%mm1 \n\t" 920#endif 921 922 "movd %2, %%mm2 \n\t" // QP 923 "punpcklbw %%mm7, %%mm2 \n\t" 924 925 "movq %%mm7, %%mm6 \n\t" // 0 926 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) 927 "pxor %%mm6, %%mm4 \n\t" 928 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| 929 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) 930 "pxor %%mm7, %%mm5 \n\t" 931 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| 932// 100 opcodes 933 "psllw $3, %%mm2 \n\t" // 8QP 934 "movq %%mm2, %%mm3 \n\t" // 8QP 935 "pcmpgtw %%mm4, %%mm2 \n\t" 936 "pcmpgtw %%mm5, %%mm3 \n\t" 937 "pand %%mm2, %%mm4 \n\t" 938 "pand %%mm3, %%mm5 \n\t" 939 940 941 "psubusw %%mm0, %%mm4 \n\t" // hd 942 "psubusw %%mm1, %%mm5 \n\t" // ld 943 944 945 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 946 "pmullw %%mm2, %%mm4 \n\t" 947 "pmullw %%mm2, %%mm5 \n\t" 948 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 949 "paddw %%mm2, %%mm4 \n\t" 950 "paddw %%mm2, %%mm5 \n\t" 951 "psrlw $6, %%mm4 \n\t" 952 "psrlw $6, %%mm5 \n\t" 953 954 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4 955 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4 956 957 "pxor %%mm2, %%mm2 \n\t" 958 "pxor %%mm3, %%mm3 \n\t" 959 960 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) 961 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) 962 "pxor %%mm2, %%mm0 \n\t" 963 "pxor %%mm3, %%mm1 \n\t" 964 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| 965 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| 966 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 967 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 968 969 "pxor %%mm6, %%mm2 \n\t" 970 "pxor %%mm7, %%mm3 \n\t" 971 "pand 
%%mm2, %%mm4 \n\t" 972 "pand %%mm3, %%mm5 \n\t" 973 974#if HAVE_MMX2 975 "pminsw %%mm0, %%mm4 \n\t" 976 "pminsw %%mm1, %%mm5 \n\t" 977#else 978 "movq %%mm4, %%mm2 \n\t" 979 "psubusw %%mm0, %%mm2 \n\t" 980 "psubw %%mm2, %%mm4 \n\t" 981 "movq %%mm5, %%mm2 \n\t" 982 "psubusw %%mm1, %%mm2 \n\t" 983 "psubw %%mm2, %%mm5 \n\t" 984#endif 985 "pxor %%mm6, %%mm4 \n\t" 986 "pxor %%mm7, %%mm5 \n\t" 987 "psubw %%mm6, %%mm4 \n\t" 988 "psubw %%mm7, %%mm5 \n\t" 989 "packsswb %%mm5, %%mm4 \n\t" 990 "movq (%0), %%mm0 \n\t" 991 "paddb %%mm4, %%mm0 \n\t" 992 "movq %%mm0, (%0) \n\t" 993 "movq (%0, %1), %%mm0 \n\t" 994 "psubb %%mm4, %%mm0 \n\t" 995 "movq %%mm0, (%0, %1) \n\t" 996 997 : "+r" (src) 998 : "r" ((x86_reg)stride), "m" (c->pQPb) 999 : "%"REG_a, "%"REG_c 1000 ); 1001#else //HAVE_MMX2 || HAVE_AMD3DNOW 1002 const int l1= stride; 1003 const int l2= stride + l1; 1004 const int l3= stride + l2; 1005 const int l4= stride + l3; 1006 const int l5= stride + l4; 1007 const int l6= stride + l5; 1008 const int l7= stride + l6; 1009 const int l8= stride + l7; 1010// const int l9= stride + l8; 1011 int x; 1012 src+= stride*3; 1013 for(x=0; x<BLOCK_SIZE; x++){ 1014 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); 1015 if(FFABS(middleEnergy) < 8*c->QP){ 1016 const int q=(src[l4] - src[l5])/2; 1017 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); 1018 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); 1019 1020 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 1021 d= FFMAX(d, 0); 1022 1023 d= (5*d + 32) >> 6; 1024 d*= FFSIGN(-middleEnergy); 1025 1026 if(q>0){ 1027 d= d<0 ? 0 : d; 1028 d= d>q ? q : d; 1029 }else{ 1030 d= d>0 ? 0 : d; 1031 d= d<q ? 
q : d; 1032 } 1033 1034 src[l4]-= d; 1035 src[l5]+= d; 1036 } 1037 src++; 1038 } 1039#endif //HAVE_MMX2 || HAVE_AMD3DNOW 1040} 1041#endif //HAVE_ALTIVEC 1042 1043#if !HAVE_ALTIVEC 1044static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) 1045{ 1046#if HAVE_MMX2 || HAVE_AMD3DNOW 1047 __asm__ volatile( 1048 "pxor %%mm6, %%mm6 \n\t" 1049 "pcmpeqb %%mm7, %%mm7 \n\t" 1050 "movq %2, %%mm0 \n\t" 1051 "punpcklbw %%mm6, %%mm0 \n\t" 1052 "psrlw $1, %%mm0 \n\t" 1053 "psubw %%mm7, %%mm0 \n\t" 1054 "packuswb %%mm0, %%mm0 \n\t" 1055 "movq %%mm0, %3 \n\t" 1056 1057 "lea (%0, %1), %%"REG_a" \n\t" 1058 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1059 1060// 0 1 2 3 4 5 6 7 8 9 1061// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1062 1063#undef REAL_FIND_MIN_MAX 1064#undef FIND_MIN_MAX 1065#if HAVE_MMX2 1066#define REAL_FIND_MIN_MAX(addr)\ 1067 "movq " #addr ", %%mm0 \n\t"\ 1068 "pminub %%mm0, %%mm7 \n\t"\ 1069 "pmaxub %%mm0, %%mm6 \n\t" 1070#else 1071#define REAL_FIND_MIN_MAX(addr)\ 1072 "movq " #addr ", %%mm0 \n\t"\ 1073 "movq %%mm7, %%mm1 \n\t"\ 1074 "psubusb %%mm0, %%mm6 \n\t"\ 1075 "paddb %%mm0, %%mm6 \n\t"\ 1076 "psubusb %%mm0, %%mm1 \n\t"\ 1077 "psubb %%mm1, %%mm7 \n\t" 1078#endif 1079#define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr) 1080 1081FIND_MIN_MAX((%%REGa)) 1082FIND_MIN_MAX((%%REGa, %1)) 1083FIND_MIN_MAX((%%REGa, %1, 2)) 1084FIND_MIN_MAX((%0, %1, 4)) 1085FIND_MIN_MAX((%%REGd)) 1086FIND_MIN_MAX((%%REGd, %1)) 1087FIND_MIN_MAX((%%REGd, %1, 2)) 1088FIND_MIN_MAX((%0, %1, 8)) 1089 1090 "movq %%mm7, %%mm4 \n\t" 1091 "psrlq $8, %%mm7 \n\t" 1092#if HAVE_MMX2 1093 "pminub %%mm4, %%mm7 \n\t" // min of pixels 1094 "pshufw $0xF9, %%mm7, %%mm4 \n\t" 1095 "pminub %%mm4, %%mm7 \n\t" // min of pixels 1096 "pshufw $0xFE, %%mm7, %%mm4 \n\t" 1097 "pminub %%mm4, %%mm7 \n\t" 1098#else 1099 "movq %%mm7, %%mm1 \n\t" 1100 "psubusb %%mm4, %%mm1 \n\t" 1101 "psubb %%mm1, %%mm7 \n\t" 1102 "movq %%mm7, %%mm4 \n\t" 1103 "psrlq $16, %%mm7 \n\t" 1104 "movq %%mm7, 
%%mm1 \n\t" 1105 "psubusb %%mm4, %%mm1 \n\t" 1106 "psubb %%mm1, %%mm7 \n\t" 1107 "movq %%mm7, %%mm4 \n\t" 1108 "psrlq $32, %%mm7 \n\t" 1109 "movq %%mm7, %%mm1 \n\t" 1110 "psubusb %%mm4, %%mm1 \n\t" 1111 "psubb %%mm1, %%mm7 \n\t" 1112#endif 1113 1114 1115 "movq %%mm6, %%mm4 \n\t" 1116 "psrlq $8, %%mm6 \n\t" 1117#if HAVE_MMX2 1118 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels 1119 "pshufw $0xF9, %%mm6, %%mm4 \n\t" 1120 "pmaxub %%mm4, %%mm6 \n\t" 1121 "pshufw $0xFE, %%mm6, %%mm4 \n\t" 1122 "pmaxub %%mm4, %%mm6 \n\t" 1123#else 1124 "psubusb %%mm4, %%mm6 \n\t" 1125 "paddb %%mm4, %%mm6 \n\t" 1126 "movq %%mm6, %%mm4 \n\t" 1127 "psrlq $16, %%mm6 \n\t" 1128 "psubusb %%mm4, %%mm6 \n\t" 1129 "paddb %%mm4, %%mm6 \n\t" 1130 "movq %%mm6, %%mm4 \n\t" 1131 "psrlq $32, %%mm6 \n\t" 1132 "psubusb %%mm4, %%mm6 \n\t" 1133 "paddb %%mm4, %%mm6 \n\t" 1134#endif 1135 "movq %%mm6, %%mm0 \n\t" // max 1136 "psubb %%mm7, %%mm6 \n\t" // max - min 1137 "movd %%mm6, %%ecx \n\t" 1138 "cmpb "MANGLE(deringThreshold)", %%cl \n\t" 1139 " jb 1f \n\t" 1140 "lea -24(%%"REG_SP"), %%"REG_c" \n\t" 1141 "and "ALIGN_MASK", %%"REG_c" \n\t" 1142 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 1143 "punpcklbw %%mm7, %%mm7 \n\t" 1144 "punpcklbw %%mm7, %%mm7 \n\t" 1145 "punpcklbw %%mm7, %%mm7 \n\t" 1146 "movq %%mm7, (%%"REG_c") \n\t" 1147 1148 "movq (%0), %%mm0 \n\t" // L10 1149 "movq %%mm0, %%mm1 \n\t" // L10 1150 "movq %%mm0, %%mm2 \n\t" // L10 1151 "psllq $8, %%mm1 \n\t" 1152 "psrlq $8, %%mm2 \n\t" 1153 "movd -4(%0), %%mm3 \n\t" 1154 "movd 8(%0), %%mm4 \n\t" 1155 "psrlq $24, %%mm3 \n\t" 1156 "psllq $56, %%mm4 \n\t" 1157 "por %%mm3, %%mm1 \n\t" // L00 1158 "por %%mm4, %%mm2 \n\t" // L20 1159 "movq %%mm1, %%mm3 \n\t" // L00 1160 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 1161 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 1162 "psubusb %%mm7, %%mm0 \n\t" 1163 "psubusb %%mm7, %%mm2 \n\t" 1164 "psubusb %%mm7, %%mm3 \n\t" 1165 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 1166 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 
0 : -1 1167 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 1168 "paddb %%mm2, %%mm0 \n\t" 1169 "paddb %%mm3, %%mm0 \n\t" 1170 1171 "movq (%%"REG_a"), %%mm2 \n\t" // L11 1172 "movq %%mm2, %%mm3 \n\t" // L11 1173 "movq %%mm2, %%mm4 \n\t" // L11 1174 "psllq $8, %%mm3 \n\t" 1175 "psrlq $8, %%mm4 \n\t" 1176 "movd -4(%%"REG_a"), %%mm5 \n\t" 1177 "movd 8(%%"REG_a"), %%mm6 \n\t" 1178 "psrlq $24, %%mm5 \n\t" 1179 "psllq $56, %%mm6 \n\t" 1180 "por %%mm5, %%mm3 \n\t" // L01 1181 "por %%mm6, %%mm4 \n\t" // L21 1182 "movq %%mm3, %%mm5 \n\t" // L01 1183 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 1184 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 1185 "psubusb %%mm7, %%mm2 \n\t" 1186 "psubusb %%mm7, %%mm4 \n\t" 1187 "psubusb %%mm7, %%mm5 \n\t" 1188 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 1189 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 1190 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 1191 "paddb %%mm4, %%mm2 \n\t" 1192 "paddb %%mm5, %%mm2 \n\t" 1193// 0, 2, 3, 1 1194#define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ 1195 "movq " #src ", " #sx " \n\t" /* src[0] */\ 1196 "movq " #sx ", " #lx " \n\t" /* src[0] */\ 1197 "movq " #sx ", " #t0 " \n\t" /* src[0] */\ 1198 "psllq $8, " #lx " \n\t"\ 1199 "psrlq $8, " #t0 " \n\t"\ 1200 "movd -4" #src ", " #t1 " \n\t"\ 1201 "psrlq $24, " #t1 " \n\t"\ 1202 "por " #t1 ", " #lx " \n\t" /* src[-1] */\ 1203 "movd 8" #src ", " #t1 " \n\t"\ 1204 "psllq $56, " #t1 " \n\t"\ 1205 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ 1206 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ 1207 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ 1208 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ 1209 PAVGB(lx, pplx) \ 1210 "movq " #lx ", 8(%%"REG_c") \n\t"\ 1211 "movq (%%"REG_c"), " #lx " \n\t"\ 1212 "psubusb " #lx ", " #t1 " \n\t"\ 1213 "psubusb " #lx ", " #t0 " \n\t"\ 1214 "psubusb " #lx ", " #sx " \n\t"\ 1215 "movq "MANGLE(b00)", " #lx " \n\t"\ 1216 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 
0 : -1*/\ 1217 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ 1218 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ 1219 "paddb " #t1 ", " #t0 " \n\t"\ 1220 "paddb " #t0 ", " #sx " \n\t"\ 1221\ 1222 PAVGB(plx, pplx) /* filtered */\ 1223 "movq " #dst ", " #t0 " \n\t" /* dst */\ 1224 "movq " #t0 ", " #t1 " \n\t" /* dst */\ 1225 "psubusb %3, " #t0 " \n\t"\ 1226 "paddusb %3, " #t1 " \n\t"\ 1227 PMAXUB(t0, pplx)\ 1228 PMINUB(t1, pplx, t0)\ 1229 "paddb " #sx ", " #ppsx " \n\t"\ 1230 "paddb " #psx ", " #ppsx " \n\t"\ 1231 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ 1232 "pand "MANGLE(b08)", " #ppsx " \n\t"\ 1233 "pcmpeqb " #lx ", " #ppsx " \n\t"\ 1234 "pand " #ppsx ", " #pplx " \n\t"\ 1235 "pandn " #dst ", " #ppsx " \n\t"\ 1236 "por " #pplx ", " #ppsx " \n\t"\ 1237 "movq " #ppsx ", " #dst " \n\t"\ 1238 "movq 8(%%"REG_c"), " #lx " \n\t" 1239 1240#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ 1241 REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) 1242/* 12430000000 12441111111 1245 12461111110 12471111101 12481111100 12491111011 12501111010 12511111001 1252 12531111000 12541110111 1255 1256*/ 1257//DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) 1258DERING_CORE((%%REGa) ,(%%REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) 1259DERING_CORE((%%REGa, %1) ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 1260DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) 1261DERING_CORE((%0, %1, 4) ,(%%REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) 1262DERING_CORE((%%REGd) ,(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 1263DERING_CORE((%%REGd, %1) ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) 1264DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) 1265DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 1266 1267 "1: \n\t" 1268 : : "r" (src), "r" 
((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2) 1269 : "%"REG_a, "%"REG_d, "%"REG_c 1270 ); 1271#else //HAVE_MMX2 || HAVE_AMD3DNOW 1272 int y; 1273 int min=255; 1274 int max=0; 1275 int avg; 1276 uint8_t *p; 1277 int s[10]; 1278 const int QP2= c->QP/2 + 1; 1279 1280 for(y=1; y<9; y++){ 1281 int x; 1282 p= src + stride*y; 1283 for(x=1; x<9; x++){ 1284 p++; 1285 if(*p > max) max= *p; 1286 if(*p < min) min= *p; 1287 } 1288 } 1289 avg= (min + max + 1)>>1; 1290 1291 if(max - min <deringThreshold) return; 1292 1293 for(y=0; y<10; y++){ 1294 int t = 0; 1295 1296 if(src[stride*y + 0] > avg) t+= 1; 1297 if(src[stride*y + 1] > avg) t+= 2; 1298 if(src[stride*y + 2] > avg) t+= 4; 1299 if(src[stride*y + 3] > avg) t+= 8; 1300 if(src[stride*y + 4] > avg) t+= 16; 1301 if(src[stride*y + 5] > avg) t+= 32; 1302 if(src[stride*y + 6] > avg) t+= 64; 1303 if(src[stride*y + 7] > avg) t+= 128; 1304 if(src[stride*y + 8] > avg) t+= 256; 1305 if(src[stride*y + 9] > avg) t+= 512; 1306 1307 t |= (~t)<<16; 1308 t &= (t<<1) & (t>>1); 1309 s[y] = t; 1310 } 1311 1312 for(y=1; y<9; y++){ 1313 int t = s[y-1] & s[y] & s[y+1]; 1314 t|= t>>16; 1315 s[y-1]= t; 1316 } 1317 1318 for(y=1; y<9; y++){ 1319 int x; 1320 int t = s[y-1]; 1321 1322 p= src + stride*y; 1323 for(x=1; x<9; x++){ 1324 p++; 1325 if(t & (1<<x)){ 1326 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) 1327 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) 1328 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); 1329 f= (f + 8)>>4; 1330 1331#ifdef DEBUG_DERING_THRESHOLD 1332 __asm__ volatile("emms\n\t":); 1333 { 1334 static long long numPixels=0; 1335 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; 1336// if((max-min)<20 || (max-min)*QP<200) 1337// if((max-min)*QP < 500) 1338// if(max-min<QP/2) 1339 if(max-min < 20){ 1340 static int numSkipped=0; 1341 static int errorSum=0; 1342 static int worstQP=0; 1343 static int worstRange=0; 1344 static int worstDiff=0; 1345 int diff= (f - *p); 1346 int absDiff= FFABS(diff); 1347 int error= diff*diff; 1348 
1349 if(x==1 || x==8 || y==1 || y==8) continue; 1350 1351 numSkipped++; 1352 if(absDiff > worstDiff){ 1353 worstDiff= absDiff; 1354 worstQP= QP; 1355 worstRange= max-min; 1356 } 1357 errorSum+= error; 1358 1359 if(1024LL*1024LL*1024LL % numSkipped == 0){ 1360 av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, " 1361 "wRange:%d, wDiff:%d, relSkip:%1.3f\n", 1362 (float)errorSum/numSkipped, numSkipped, worstQP, worstRange, 1363 worstDiff, (float)numSkipped/numPixels); 1364 } 1365 } 1366 } 1367#endif 1368 if (*p + QP2 < f) *p= *p + QP2; 1369 else if(*p - QP2 > f) *p= *p - QP2; 1370 else *p=f; 1371 } 1372 } 1373 } 1374#ifdef DEBUG_DERING_THRESHOLD 1375 if(max-min < 20){ 1376 for(y=1; y<9; y++){ 1377 int x; 1378 int t = 0; 1379 p= src + stride*y; 1380 for(x=1; x<9; x++){ 1381 p++; 1382 *p = FFMIN(*p + 20, 255); 1383 } 1384 } 1385// src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; 1386 } 1387#endif 1388#endif //HAVE_MMX2 || HAVE_AMD3DNOW 1389} 1390#endif //HAVE_ALTIVEC 1391 1392/** 1393 * Deinterlace the given block by linearly interpolating every second line. 1394 * will be called for every 8x8 block and can read & write from line 4-15 1395 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. 
1396 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1397 */ 1398static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) 1399{ 1400#if HAVE_MMX2 || HAVE_AMD3DNOW 1401 src+= 4*stride; 1402 __asm__ volatile( 1403 "lea (%0, %1), %%"REG_a" \n\t" 1404 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 1405// 0 1 2 3 4 5 6 7 8 9 1406// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 1407 1408 "movq (%0), %%mm0 \n\t" 1409 "movq (%%"REG_a", %1), %%mm1 \n\t" 1410 PAVGB(%%mm1, %%mm0) 1411 "movq %%mm0, (%%"REG_a") \n\t" 1412 "movq (%0, %1, 4), %%mm0 \n\t" 1413 PAVGB(%%mm0, %%mm1) 1414 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" 1415 "movq (%%"REG_c", %1), %%mm1 \n\t" 1416 PAVGB(%%mm1, %%mm0) 1417 "movq %%mm0, (%%"REG_c") \n\t" 1418 "movq (%0, %1, 8), %%mm0 \n\t" 1419 PAVGB(%%mm0, %%mm1) 1420 "movq %%mm1, (%%"REG_c", %1, 2) \n\t" 1421 1422 : : "r" (src), "r" ((x86_reg)stride) 1423 : "%"REG_a, "%"REG_c 1424 ); 1425#else 1426 int a, b, x; 1427 src+= 4*stride; 1428 1429 for(x=0; x<2; x++){ 1430 a= *(uint32_t*)&src[stride*0]; 1431 b= *(uint32_t*)&src[stride*2]; 1432 *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1433 a= *(uint32_t*)&src[stride*4]; 1434 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1435 b= *(uint32_t*)&src[stride*6]; 1436 *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1437 a= *(uint32_t*)&src[stride*8]; 1438 *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1439 src += 4; 1440 } 1441#endif 1442} 1443 1444/** 1445 * Deinterlace the given block by cubic interpolating every second line. 1446 * will be called for every 8x8 block and can read & write from line 4-15 1447 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. 
1448 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1449 * this filter will read lines 3-15 and write 7-13 1450 */ 1451static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) 1452{ 1453#if HAVE_MMX2 || HAVE_AMD3DNOW 1454 src+= stride*3; 1455 __asm__ volatile( 1456 "lea (%0, %1), %%"REG_a" \n\t" 1457 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1458 "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t" 1459 "add %1, %%"REG_c" \n\t" 1460 "pxor %%mm7, %%mm7 \n\t" 1461// 0 1 2 3 4 5 6 7 8 9 10 1462// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx 1463 1464#define REAL_DEINT_CUBIC(a,b,c,d,e)\ 1465 "movq " #a ", %%mm0 \n\t"\ 1466 "movq " #b ", %%mm1 \n\t"\ 1467 "movq " #d ", %%mm2 \n\t"\ 1468 "movq " #e ", %%mm3 \n\t"\ 1469 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ 1470 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\ 1471 "movq %%mm0, %%mm2 \n\t"\ 1472 "punpcklbw %%mm7, %%mm0 \n\t"\ 1473 "punpckhbw %%mm7, %%mm2 \n\t"\ 1474 "movq %%mm1, %%mm3 \n\t"\ 1475 "punpcklbw %%mm7, %%mm1 \n\t"\ 1476 "punpckhbw %%mm7, %%mm3 \n\t"\ 1477 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ 1478 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ 1479 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ 1480 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ 1481 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ 1482 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ 1483 "packuswb %%mm3, %%mm1 \n\t"\ 1484 "movq %%mm1, " #c " \n\t" 1485#define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e) 1486 1487DEINT_CUBIC((%0) , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1)) 1488DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%0, %1, 8)) 1489DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc)) 1490DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2)) 1491 1492 : : "r" (src), "r" ((x86_reg)stride) 1493 : "%"REG_a, "%"REG_d, "%"REG_c 1494 ); 1495#else //HAVE_MMX2 || 
HAVE_AMD3DNOW 1496 int x; 1497 src+= stride*3; 1498 for(x=0; x<8; x++){ 1499 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); 1500 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); 1501 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); 1502 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); 1503 src++; 1504 } 1505#endif //HAVE_MMX2 || HAVE_AMD3DNOW 1506} 1507 1508/** 1509 * Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter. 1510 * will be called for every 8x8 block and can read & write from line 4-15 1511 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. 1512 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1513 * this filter will read lines 4-13 and write 5-11 1514 */ 1515static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) 1516{ 1517#if HAVE_MMX2 || HAVE_AMD3DNOW 1518 src+= stride*4; 1519 __asm__ volatile( 1520 "lea (%0, %1), %%"REG_a" \n\t" 1521 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1522 "pxor %%mm7, %%mm7 \n\t" 1523 "movq (%2), %%mm0 \n\t" 1524// 0 1 2 3 4 5 6 7 8 9 10 1525// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx 1526 1527#define REAL_DEINT_FF(a,b,c,d)\ 1528 "movq " #a ", %%mm1 \n\t"\ 1529 "movq " #b ", %%mm2 \n\t"\ 1530 "movq " #c ", %%mm3 \n\t"\ 1531 "movq " #d ", %%mm4 \n\t"\ 1532 PAVGB(%%mm3, %%mm1) \ 1533 PAVGB(%%mm4, %%mm0) \ 1534 "movq %%mm0, %%mm3 \n\t"\ 1535 "punpcklbw %%mm7, %%mm0 \n\t"\ 1536 "punpckhbw %%mm7, %%mm3 \n\t"\ 1537 "movq %%mm1, %%mm4 \n\t"\ 1538 "punpcklbw %%mm7, %%mm1 \n\t"\ 1539 "punpckhbw %%mm7, %%mm4 \n\t"\ 1540 "psllw $2, %%mm1 \n\t"\ 1541 "psllw $2, %%mm4 \n\t"\ 1542 "psubw %%mm0, %%mm1 \n\t"\ 1543 "psubw %%mm3, %%mm4 \n\t"\ 1544 "movq %%mm2, %%mm5 \n\t"\ 1545 "movq %%mm2, %%mm0 \n\t"\ 
1546 "punpcklbw %%mm7, %%mm2 \n\t"\ 1547 "punpckhbw %%mm7, %%mm5 \n\t"\ 1548 "paddw %%mm2, %%mm1 \n\t"\ 1549 "paddw %%mm5, %%mm4 \n\t"\ 1550 "psraw $2, %%mm1 \n\t"\ 1551 "psraw $2, %%mm4 \n\t"\ 1552 "packuswb %%mm4, %%mm1 \n\t"\ 1553 "movq %%mm1, " #b " \n\t"\ 1554 1555#define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d) 1556 1557DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2)) 1558DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) ) 1559DEINT_FF((%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2)) 1560DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4)) 1561 1562 "movq %%mm0, (%2) \n\t" 1563 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp) 1564 : "%"REG_a, "%"REG_d 1565 ); 1566#else //HAVE_MMX2 || HAVE_AMD3DNOW 1567 int x; 1568 src+= stride*4; 1569 for(x=0; x<8; x++){ 1570 int t1= tmp[x]; 1571 int t2= src[stride*1]; 1572 1573 src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3); 1574 t1= src[stride*4]; 1575 src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3); 1576 t2= src[stride*6]; 1577 src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3); 1578 t1= src[stride*8]; 1579 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); 1580 tmp[x]= t1; 1581 1582 src++; 1583 } 1584#endif //HAVE_MMX2 || HAVE_AMD3DNOW 1585} 1586 1587/** 1588 * Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter. 1589 * will be called for every 8x8 block and can read & write from line 4-15 1590 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. 
1591 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1592 * this filter will read lines 4-13 and write 4-11 1593 */ 1594static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) 1595{ 1596#if HAVE_MMX2 || HAVE_AMD3DNOW 1597 src+= stride*4; 1598 __asm__ volatile( 1599 "lea (%0, %1), %%"REG_a" \n\t" 1600 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1601 "pxor %%mm7, %%mm7 \n\t" 1602 "movq (%2), %%mm0 \n\t" 1603 "movq (%3), %%mm1 \n\t" 1604// 0 1 2 3 4 5 6 7 8 9 10 1605// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx 1606 1607#define REAL_DEINT_L5(t1,t2,a,b,c)\ 1608 "movq " #a ", %%mm2 \n\t"\ 1609 "movq " #b ", %%mm3 \n\t"\ 1610 "movq " #c ", %%mm4 \n\t"\ 1611 PAVGB(t2, %%mm3) \ 1612 PAVGB(t1, %%mm4) \ 1613 "movq %%mm2, %%mm5 \n\t"\ 1614 "movq %%mm2, " #t1 " \n\t"\ 1615 "punpcklbw %%mm7, %%mm2 \n\t"\ 1616 "punpckhbw %%mm7, %%mm5 \n\t"\ 1617 "movq %%mm2, %%mm6 \n\t"\ 1618 "paddw %%mm2, %%mm2 \n\t"\ 1619 "paddw %%mm6, %%mm2 \n\t"\ 1620 "movq %%mm5, %%mm6 \n\t"\ 1621 "paddw %%mm5, %%mm5 \n\t"\ 1622 "paddw %%mm6, %%mm5 \n\t"\ 1623 "movq %%mm3, %%mm6 \n\t"\ 1624 "punpcklbw %%mm7, %%mm3 \n\t"\ 1625 "punpckhbw %%mm7, %%mm6 \n\t"\ 1626 "paddw %%mm3, %%mm3 \n\t"\ 1627 "paddw %%mm6, %%mm6 \n\t"\ 1628 "paddw %%mm3, %%mm2 \n\t"\ 1629 "paddw %%mm6, %%mm5 \n\t"\ 1630 "movq %%mm4, %%mm6 \n\t"\ 1631 "punpcklbw %%mm7, %%mm4 \n\t"\ 1632 "punpckhbw %%mm7, %%mm6 \n\t"\ 1633 "psubw %%mm4, %%mm2 \n\t"\ 1634 "psubw %%mm6, %%mm5 \n\t"\ 1635 "psraw $2, %%mm2 \n\t"\ 1636 "psraw $2, %%mm5 \n\t"\ 1637 "packuswb %%mm5, %%mm2 \n\t"\ 1638 "movq %%mm2, " #a " \n\t"\ 1639 1640#define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c) 1641 1642DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) ) 1643DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2)) 1644DEINT_L5(%%mm0, %%mm1, (%%REGa, %1) , (%%REGa, %1, 2), (%0, %1, 4) ) 1645DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) ) 
1646DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) ) 1647DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2)) 1648DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) ) 1649DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4)) 1650 1651 "movq %%mm0, (%2) \n\t" 1652 "movq %%mm1, (%3) \n\t" 1653 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2) 1654 : "%"REG_a, "%"REG_d 1655 ); 1656#else //HAVE_MMX2 || HAVE_AMD3DNOW 1657 int x; 1658 src+= stride*4; 1659 for(x=0; x<8; x++){ 1660 int t1= tmp[x]; 1661 int t2= tmp2[x]; 1662 int t3= src[0]; 1663 1664 src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3); 1665 t1= src[stride*1]; 1666 src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3); 1667 t2= src[stride*2]; 1668 src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3); 1669 t3= src[stride*3]; 1670 src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3); 1671 t1= src[stride*4]; 1672 src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3); 1673 t2= src[stride*5]; 1674 src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3); 1675 t3= src[stride*6]; 1676 src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3); 1677 t1= src[stride*7]; 1678 src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3); 1679 1680 tmp[x]= t3; 1681 tmp2[x]= t1; 1682 1683 src++; 1684 } 1685#endif //HAVE_MMX2 || HAVE_AMD3DNOW 1686} 1687 1688/** 1689 * Deinterlace the given block by filtering all lines with a (1 2 1) filter. 1690 * will be called for every 8x8 block and can read & write from line 4-15 1691 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. 
1692 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1693 * this filter will read lines 4-13 and write 4-11 1694 */ 1695static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp) 1696{ 1697#if HAVE_MMX2 || HAVE_AMD3DNOW 1698 src+= 4*stride; 1699 __asm__ volatile( 1700 "lea (%0, %1), %%"REG_a" \n\t" 1701 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1702// 0 1 2 3 4 5 6 7 8 9 1703// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1704 1705 "movq (%2), %%mm0 \n\t" // L0 1706 "movq (%%"REG_a"), %%mm1 \n\t" // L2 1707 PAVGB(%%mm1, %%mm0) // L0+L2 1708 "movq (%0), %%mm2 \n\t" // L1 1709 PAVGB(%%mm2, %%mm0) 1710 "movq %%mm0, (%0) \n\t" 1711 "movq (%%"REG_a", %1), %%mm0 \n\t" // L3 1712 PAVGB(%%mm0, %%mm2) // L1+L3 1713 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 1714 "movq %%mm2, (%%"REG_a") \n\t" 1715 "movq (%%"REG_a", %1, 2), %%mm2 \n\t" // L4 1716 PAVGB(%%mm2, %%mm1) // L2+L4 1717 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 1718 "movq %%mm1, (%%"REG_a", %1) \n\t" 1719 "movq (%0, %1, 4), %%mm1 \n\t" // L5 1720 PAVGB(%%mm1, %%mm0) // L3+L5 1721 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 1722 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" 1723 "movq (%%"REG_d"), %%mm0 \n\t" // L6 1724 PAVGB(%%mm0, %%mm2) // L4+L6 1725 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 1726 "movq %%mm2, (%0, %1, 4) \n\t" 1727 "movq (%%"REG_d", %1), %%mm2 \n\t" // L7 1728 PAVGB(%%mm2, %%mm1) // L5+L7 1729 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 1730 "movq %%mm1, (%%"REG_d") \n\t" 1731 "movq (%%"REG_d", %1, 2), %%mm1 \n\t" // L8 1732 PAVGB(%%mm1, %%mm0) // L6+L8 1733 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 1734 "movq %%mm0, (%%"REG_d", %1) \n\t" 1735 "movq (%0, %1, 8), %%mm0 \n\t" // L9 1736 PAVGB(%%mm0, %%mm2) // L7+L9 1737 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 1738 "movq %%mm2, (%%"REG_d", %1, 2) \n\t" 1739 "movq %%mm1, (%2) \n\t" 1740 1741 : : "r" (src), "r" ((x86_reg)stride), "r" (tmp) 1742 : "%"REG_a, "%"REG_d 1743 ); 1744#else //HAVE_MMX2 || HAVE_AMD3DNOW 1745 
int a, b, c, x; 1746 src+= 4*stride; 1747 1748 for(x=0; x<2; x++){ 1749 a= *(uint32_t*)&tmp[stride*0]; 1750 b= *(uint32_t*)&src[stride*0]; 1751 c= *(uint32_t*)&src[stride*1]; 1752 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); 1753 *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1754 1755 a= *(uint32_t*)&src[stride*2]; 1756 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); 1757 *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); 1758 1759 b= *(uint32_t*)&src[stride*3]; 1760 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); 1761 *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); 1762 1763 c= *(uint32_t*)&src[stride*4]; 1764 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); 1765 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1766 1767 a= *(uint32_t*)&src[stride*5]; 1768 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); 1769 *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); 1770 1771 b= *(uint32_t*)&src[stride*6]; 1772 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); 1773 *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); 1774 1775 c= *(uint32_t*)&src[stride*7]; 1776 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); 1777 *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1778 1779 a= *(uint32_t*)&src[stride*8]; 1780 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); 1781 *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); 1782 1783 *(uint32_t*)&tmp[stride*0]= c; 1784 src += 4; 1785 tmp += 4; 1786 } 1787#endif //HAVE_MMX2 || HAVE_AMD3DNOW 1788} 1789 1790/** 1791 * Deinterlace the given block by applying a median filter to every second line. 1792 * will be called for every 8x8 block and can read & write from line 4-15, 1793 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. 
1794 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1795 */ 1796static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) 1797{ 1798#if HAVE_MMX 1799 src+= 4*stride; 1800#if HAVE_MMX2 1801 __asm__ volatile( 1802 "lea (%0, %1), %%"REG_a" \n\t" 1803 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1804// 0 1 2 3 4 5 6 7 8 9 1805// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1806 1807 "movq (%0), %%mm0 \n\t" // 1808 "movq (%%"REG_a", %1), %%mm2 \n\t" // 1809 "movq (%%"REG_a"), %%mm1 \n\t" // 1810 "movq %%mm0, %%mm3 \n\t" 1811 "pmaxub %%mm1, %%mm0 \n\t" // 1812 "pminub %%mm3, %%mm1 \n\t" // 1813 "pmaxub %%mm2, %%mm1 \n\t" // 1814 "pminub %%mm1, %%mm0 \n\t" 1815 "movq %%mm0, (%%"REG_a") \n\t" 1816 1817 "movq (%0, %1, 4), %%mm0 \n\t" // 1818 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // 1819 "movq %%mm2, %%mm3 \n\t" 1820 "pmaxub %%mm1, %%mm2 \n\t" // 1821 "pminub %%mm3, %%mm1 \n\t" // 1822 "pmaxub %%mm0, %%mm1 \n\t" // 1823 "pminub %%mm1, %%mm2 \n\t" 1824 "movq %%mm2, (%%"REG_a", %1, 2) \n\t" 1825 1826 "movq (%%"REG_d"), %%mm2 \n\t" // 1827 "movq (%%"REG_d", %1), %%mm1 \n\t" // 1828 "movq %%mm2, %%mm3 \n\t" 1829 "pmaxub %%mm0, %%mm2 \n\t" // 1830 "pminub %%mm3, %%mm0 \n\t" // 1831 "pmaxub %%mm1, %%mm0 \n\t" // 1832 "pminub %%mm0, %%mm2 \n\t" 1833 "movq %%mm2, (%%"REG_d") \n\t" 1834 1835 "movq (%%"REG_d", %1, 2), %%mm2 \n\t" // 1836 "movq (%0, %1, 8), %%mm0 \n\t" // 1837 "movq %%mm2, %%mm3 \n\t" 1838 "pmaxub %%mm0, %%mm2 \n\t" // 1839 "pminub %%mm3, %%mm0 \n\t" // 1840 "pmaxub %%mm1, %%mm0 \n\t" // 1841 "pminub %%mm0, %%mm2 \n\t" 1842 "movq %%mm2, (%%"REG_d", %1, 2) \n\t" 1843 1844 1845 : : "r" (src), "r" ((x86_reg)stride) 1846 : "%"REG_a, "%"REG_d 1847 ); 1848 1849#else // MMX without MMX2 1850 __asm__ volatile( 1851 "lea (%0, %1), %%"REG_a" \n\t" 1852 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1853// 0 1 2 3 4 5 6 7 8 9 1854// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1855 "pxor %%mm7, %%mm7 \n\t" 
1856 1857#define REAL_MEDIAN(a,b,c)\ 1858 "movq " #a ", %%mm0 \n\t"\ 1859 "movq " #b ", %%mm2 \n\t"\ 1860 "movq " #c ", %%mm1 \n\t"\ 1861 "movq %%mm0, %%mm3 \n\t"\ 1862 "movq %%mm1, %%mm4 \n\t"\ 1863 "movq %%mm2, %%mm5 \n\t"\ 1864 "psubusb %%mm1, %%mm3 \n\t"\ 1865 "psubusb %%mm2, %%mm4 \n\t"\ 1866 "psubusb %%mm0, %%mm5 \n\t"\ 1867 "pcmpeqb %%mm7, %%mm3 \n\t"\ 1868 "pcmpeqb %%mm7, %%mm4 \n\t"\ 1869 "pcmpeqb %%mm7, %%mm5 \n\t"\ 1870 "movq %%mm3, %%mm6 \n\t"\ 1871 "pxor %%mm4, %%mm3 \n\t"\ 1872 "pxor %%mm5, %%mm4 \n\t"\ 1873 "pxor %%mm6, %%mm5 \n\t"\ 1874 "por %%mm3, %%mm1 \n\t"\ 1875 "por %%mm4, %%mm2 \n\t"\ 1876 "por %%mm5, %%mm0 \n\t"\ 1877 "pand %%mm2, %%mm0 \n\t"\ 1878 "pand %%mm1, %%mm0 \n\t"\ 1879 "movq %%mm0, " #b " \n\t" 1880#define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c) 1881 1882MEDIAN((%0) , (%%REGa) , (%%REGa, %1)) 1883MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4)) 1884MEDIAN((%0, %1, 4) , (%%REGd) , (%%REGd, %1)) 1885MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8)) 1886 1887 : : "r" (src), "r" ((x86_reg)stride) 1888 : "%"REG_a, "%"REG_d 1889 ); 1890#endif //HAVE_MMX2 1891#else //HAVE_MMX 1892 int x, y; 1893 src+= 4*stride; 1894 // FIXME - there should be a way to do a few columns in parallel like w/mmx 1895 for(x=0; x<8; x++){ 1896 uint8_t *colsrc = src; 1897 for (y=0; y<4; y++){ 1898 int a, b, c, d, e, f; 1899 a = colsrc[0 ]; 1900 b = colsrc[stride ]; 1901 c = colsrc[stride*2]; 1902 d = (a-b)>>31; 1903 e = (b-c)>>31; 1904 f = (c-a)>>31; 1905 colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f)); 1906 colsrc += stride*2; 1907 } 1908 src++; 1909 } 1910#endif //HAVE_MMX 1911} 1912 1913#if HAVE_MMX 1914/** 1915 * Transpose and shift the given 8x8 Block into dst1 and dst2. 
1916 */ 1917static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) 1918{ 1919 __asm__( 1920 "lea (%0, %1), %%"REG_a" \n\t" 1921// 0 1 2 3 4 5 6 7 8 9 1922// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1923 "movq (%0), %%mm0 \n\t" // 12345678 1924 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh 1925 "movq %%mm0, %%mm2 \n\t" // 12345678 1926 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 1927 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 1928 1929 "movq (%%"REG_a", %1), %%mm1 \n\t" 1930 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" 1931 "movq %%mm1, %%mm4 \n\t" 1932 "punpcklbw %%mm3, %%mm1 \n\t" 1933 "punpckhbw %%mm3, %%mm4 \n\t" 1934 1935 "movq %%mm0, %%mm3 \n\t" 1936 "punpcklwd %%mm1, %%mm0 \n\t" 1937 "punpckhwd %%mm1, %%mm3 \n\t" 1938 "movq %%mm2, %%mm1 \n\t" 1939 "punpcklwd %%mm4, %%mm2 \n\t" 1940 "punpckhwd %%mm4, %%mm1 \n\t" 1941 1942 "movd %%mm0, 128(%2) \n\t" 1943 "psrlq $32, %%mm0 \n\t" 1944 "movd %%mm0, 144(%2) \n\t" 1945 "movd %%mm3, 160(%2) \n\t" 1946 "psrlq $32, %%mm3 \n\t" 1947 "movd %%mm3, 176(%2) \n\t" 1948 "movd %%mm3, 48(%3) \n\t" 1949 "movd %%mm2, 192(%2) \n\t" 1950 "movd %%mm2, 64(%3) \n\t" 1951 "psrlq $32, %%mm2 \n\t" 1952 "movd %%mm2, 80(%3) \n\t" 1953 "movd %%mm1, 96(%3) \n\t" 1954 "psrlq $32, %%mm1 \n\t" 1955 "movd %%mm1, 112(%3) \n\t" 1956 1957 "lea (%%"REG_a", %1, 4), %%"REG_a" \n\t" 1958 1959 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 1960 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh 1961 "movq %%mm0, %%mm2 \n\t" // 12345678 1962 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 1963 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 1964 1965 "movq (%%"REG_a", %1), %%mm1 \n\t" 1966 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" 1967 "movq %%mm1, %%mm4 \n\t" 1968 "punpcklbw %%mm3, %%mm1 \n\t" 1969 "punpckhbw %%mm3, %%mm4 \n\t" 1970 1971 "movq %%mm0, %%mm3 \n\t" 1972 "punpcklwd %%mm1, %%mm0 \n\t" 1973 "punpckhwd %%mm1, %%mm3 \n\t" 1974 "movq %%mm2, %%mm1 \n\t" 1975 "punpcklwd %%mm4, %%mm2 \n\t" 1976 "punpckhwd %%mm4, %%mm1 \n\t" 
1977 1978 "movd %%mm0, 132(%2) \n\t" 1979 "psrlq $32, %%mm0 \n\t" 1980 "movd %%mm0, 148(%2) \n\t" 1981 "movd %%mm3, 164(%2) \n\t" 1982 "psrlq $32, %%mm3 \n\t" 1983 "movd %%mm3, 180(%2) \n\t" 1984 "movd %%mm3, 52(%3) \n\t" 1985 "movd %%mm2, 196(%2) \n\t" 1986 "movd %%mm2, 68(%3) \n\t" 1987 "psrlq $32, %%mm2 \n\t" 1988 "movd %%mm2, 84(%3) \n\t" 1989 "movd %%mm1, 100(%3) \n\t" 1990 "psrlq $32, %%mm1 \n\t" 1991 "movd %%mm1, 116(%3) \n\t" 1992 1993 1994 :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2) 1995 : "%"REG_a 1996 ); 1997} 1998 1999/** 2000 * Transpose the given 8x8 block. 2001 */ 2002static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) 2003{ 2004 __asm__( 2005 "lea (%0, %1), %%"REG_a" \n\t" 2006 "lea (%%"REG_a",%1,4), %%"REG_d" \n\t" 2007// 0 1 2 3 4 5 6 7 8 9 2008// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 2009 "movq (%2), %%mm0 \n\t" // 12345678 2010 "movq 16(%2), %%mm1 \n\t" // abcdefgh 2011 "movq %%mm0, %%mm2 \n\t" // 12345678 2012 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 2013 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 2014 2015 "movq 32(%2), %%mm1 \n\t" 2016 "movq 48(%2), %%mm3 \n\t" 2017 "movq %%mm1, %%mm4 \n\t" 2018 "punpcklbw %%mm3, %%mm1 \n\t" 2019 "punpckhbw %%mm3, %%mm4 \n\t" 2020 2021 "movq %%mm0, %%mm3 \n\t" 2022 "punpcklwd %%mm1, %%mm0 \n\t" 2023 "punpckhwd %%mm1, %%mm3 \n\t" 2024 "movq %%mm2, %%mm1 \n\t" 2025 "punpcklwd %%mm4, %%mm2 \n\t" 2026 "punpckhwd %%mm4, %%mm1 \n\t" 2027 2028 "movd %%mm0, (%0) \n\t" 2029 "psrlq $32, %%mm0 \n\t" 2030 "movd %%mm0, (%%"REG_a") \n\t" 2031 "movd %%mm3, (%%"REG_a", %1) \n\t" 2032 "psrlq $32, %%mm3 \n\t" 2033 "movd %%mm3, (%%"REG_a", %1, 2) \n\t" 2034 "movd %%mm2, (%0, %1, 4) \n\t" 2035 "psrlq $32, %%mm2 \n\t" 2036 "movd %%mm2, (%%"REG_d") \n\t" 2037 "movd %%mm1, (%%"REG_d", %1) \n\t" 2038 "psrlq $32, %%mm1 \n\t" 2039 "movd %%mm1, (%%"REG_d", %1, 2) \n\t" 2040 2041 2042 "movq 64(%2), %%mm0 \n\t" // 12345678 2043 "movq 80(%2), %%mm1 \n\t" // abcdefgh 
2044 "movq %%mm0, %%mm2 \n\t" // 12345678 2045 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 2046 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 2047 2048 "movq 96(%2), %%mm1 \n\t" 2049 "movq 112(%2), %%mm3 \n\t" 2050 "movq %%mm1, %%mm4 \n\t" 2051 "punpcklbw %%mm3, %%mm1 \n\t" 2052 "punpckhbw %%mm3, %%mm4 \n\t" 2053 2054 "movq %%mm0, %%mm3 \n\t" 2055 "punpcklwd %%mm1, %%mm0 \n\t" 2056 "punpckhwd %%mm1, %%mm3 \n\t" 2057 "movq %%mm2, %%mm1 \n\t" 2058 "punpcklwd %%mm4, %%mm2 \n\t" 2059 "punpckhwd %%mm4, %%mm1 \n\t" 2060 2061 "movd %%mm0, 4(%0) \n\t" 2062 "psrlq $32, %%mm0 \n\t" 2063 "movd %%mm0, 4(%%"REG_a") \n\t" 2064 "movd %%mm3, 4(%%"REG_a", %1) \n\t" 2065 "psrlq $32, %%mm3 \n\t" 2066 "movd %%mm3, 4(%%"REG_a", %1, 2) \n\t" 2067 "movd %%mm2, 4(%0, %1, 4) \n\t" 2068 "psrlq $32, %%mm2 \n\t" 2069 "movd %%mm2, 4(%%"REG_d") \n\t" 2070 "movd %%mm1, 4(%%"REG_d", %1) \n\t" 2071 "psrlq $32, %%mm1 \n\t" 2072 "movd %%mm1, 4(%%"REG_d", %1, 2) \n\t" 2073 2074 :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src) 2075 : "%"REG_a, "%"REG_d 2076 ); 2077} 2078#endif //HAVE_MMX 2079//static long test=0; 2080 2081#if !HAVE_ALTIVEC 2082static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, 2083 uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise) 2084{ 2085 // to save a register (FIXME do this outside of the loops) 2086 tempBlurredPast[127]= maxNoise[0]; 2087 tempBlurredPast[128]= maxNoise[1]; 2088 tempBlurredPast[129]= maxNoise[2]; 2089 2090#define FAST_L2_DIFF 2091//#define L1_DIFF //u should change the thresholds too if u try that one 2092#if HAVE_MMX2 || HAVE_AMD3DNOW 2093 __asm__ volatile( 2094 "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride 2095 "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride 2096 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2097// 0 1 2 3 4 5 6 7 8 9 2098// %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2 2099//FIXME reorder? 
2100#ifdef L1_DIFF //needs mmx2 2101 "movq (%0), %%mm0 \n\t" // L0 2102 "psadbw (%1), %%mm0 \n\t" // |L0-R0| 2103 "movq (%0, %2), %%mm1 \n\t" // L1 2104 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| 2105 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2106 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| 2107 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 2108 "psadbw (%1, %%"REG_a"), %%mm3 \n\t" // |L3-R3| 2109 2110 "movq (%0, %2, 4), %%mm4 \n\t" // L4 2111 "paddw %%mm1, %%mm0 \n\t" 2112 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| 2113 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 2114 "paddw %%mm2, %%mm0 \n\t" 2115 "psadbw (%1, %%"REG_d"), %%mm5 \n\t" // |L5-R5| 2116 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 2117 "paddw %%mm3, %%mm0 \n\t" 2118 "psadbw (%1, %%"REG_a", 2), %%mm6 \n\t" // |L6-R6| 2119 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 2120 "paddw %%mm4, %%mm0 \n\t" 2121 "psadbw (%1, %%"REG_c"), %%mm7 \n\t" // |L7-R7| 2122 "paddw %%mm5, %%mm6 \n\t" 2123 "paddw %%mm7, %%mm6 \n\t" 2124 "paddw %%mm6, %%mm0 \n\t" 2125#else //L1_DIFF 2126#if defined (FAST_L2_DIFF) 2127 "pcmpeqb %%mm7, %%mm7 \n\t" 2128 "movq "MANGLE(b80)", %%mm6 \n\t" 2129 "pxor %%mm0, %%mm0 \n\t" 2130#define REAL_L2_DIFF_CORE(a, b)\ 2131 "movq " #a ", %%mm5 \n\t"\ 2132 "movq " #b ", %%mm2 \n\t"\ 2133 "pxor %%mm7, %%mm2 \n\t"\ 2134 PAVGB(%%mm2, %%mm5)\ 2135 "paddb %%mm6, %%mm5 \n\t"\ 2136 "movq %%mm5, %%mm2 \n\t"\ 2137 "psllw $8, %%mm5 \n\t"\ 2138 "pmaddwd %%mm5, %%mm5 \n\t"\ 2139 "pmaddwd %%mm2, %%mm2 \n\t"\ 2140 "paddd %%mm2, %%mm5 \n\t"\ 2141 "psrld $14, %%mm5 \n\t"\ 2142 "paddd %%mm5, %%mm0 \n\t" 2143 2144#else //defined (FAST_L2_DIFF) 2145 "pxor %%mm7, %%mm7 \n\t" 2146 "pxor %%mm0, %%mm0 \n\t" 2147#define REAL_L2_DIFF_CORE(a, b)\ 2148 "movq " #a ", %%mm5 \n\t"\ 2149 "movq " #b ", %%mm2 \n\t"\ 2150 "movq %%mm5, %%mm1 \n\t"\ 2151 "movq %%mm2, %%mm3 \n\t"\ 2152 "punpcklbw %%mm7, %%mm5 \n\t"\ 2153 "punpckhbw %%mm7, %%mm1 \n\t"\ 2154 "punpcklbw %%mm7, %%mm2 \n\t"\ 2155 "punpckhbw %%mm7, %%mm3 \n\t"\ 2156 "psubw %%mm2, %%mm5 
\n\t"\ 2157 "psubw %%mm3, %%mm1 \n\t"\ 2158 "pmaddwd %%mm5, %%mm5 \n\t"\ 2159 "pmaddwd %%mm1, %%mm1 \n\t"\ 2160 "paddd %%mm1, %%mm5 \n\t"\ 2161 "paddd %%mm5, %%mm0 \n\t" 2162 2163#endif //defined (FAST_L2_DIFF) 2164 2165#define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b) 2166 2167L2_DIFF_CORE((%0) , (%1)) 2168L2_DIFF_CORE((%0, %2) , (%1, %2)) 2169L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2)) 2170L2_DIFF_CORE((%0, %%REGa) , (%1, %%REGa)) 2171L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4)) 2172L2_DIFF_CORE((%0, %%REGd) , (%1, %%REGd)) 2173L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2)) 2174L2_DIFF_CORE((%0, %%REGc) , (%1, %%REGc)) 2175 2176#endif //L1_DIFF 2177 2178 "movq %%mm0, %%mm4 \n\t" 2179 "psrlq $32, %%mm0 \n\t" 2180 "paddd %%mm0, %%mm4 \n\t" 2181 "movd %%mm4, %%ecx \n\t" 2182 "shll $2, %%ecx \n\t" 2183 "mov %3, %%"REG_d" \n\t" 2184 "addl -4(%%"REG_d"), %%ecx \n\t" 2185 "addl 4(%%"REG_d"), %%ecx \n\t" 2186 "addl -1024(%%"REG_d"), %%ecx \n\t" 2187 "addl $4, %%ecx \n\t" 2188 "addl 1024(%%"REG_d"), %%ecx \n\t" 2189 "shrl $3, %%ecx \n\t" 2190 "movl %%ecx, (%%"REG_d") \n\t" 2191 2192// "mov %3, %%"REG_c" \n\t" 2193// "mov %%"REG_c", test \n\t" 2194// "jmp 4f \n\t" 2195 "cmpl 512(%%"REG_d"), %%ecx \n\t" 2196 " jb 2f \n\t" 2197 "cmpl 516(%%"REG_d"), %%ecx \n\t" 2198 " jb 1f \n\t" 2199 2200 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 2201 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2202 "movq (%0), %%mm0 \n\t" // L0 2203 "movq (%0, %2), %%mm1 \n\t" // L1 2204 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2205 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 2206 "movq (%0, %2, 4), %%mm4 \n\t" // L4 2207 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 2208 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 2209 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 2210 "movq %%mm0, (%1) \n\t" // L0 2211 "movq %%mm1, (%1, %2) \n\t" // L1 2212 "movq %%mm2, (%1, %2, 2) \n\t" // L2 2213 "movq %%mm3, (%1, %%"REG_a") \n\t" // L3 2214 "movq %%mm4, (%1, %2, 4) \n\t" // L4 2215 "movq %%mm5, (%1, %%"REG_d") \n\t" // L5 
2216 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // L6 2217 "movq %%mm7, (%1, %%"REG_c") \n\t" // L7 2218 "jmp 4f \n\t" 2219 2220 "1: \n\t" 2221 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 2222 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2223 "movq (%0), %%mm0 \n\t" // L0 2224 PAVGB((%1), %%mm0) // L0 2225 "movq (%0, %2), %%mm1 \n\t" // L1 2226 PAVGB((%1, %2), %%mm1) // L1 2227 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2228 PAVGB((%1, %2, 2), %%mm2) // L2 2229 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 2230 PAVGB((%1, %%REGa), %%mm3) // L3 2231 "movq (%0, %2, 4), %%mm4 \n\t" // L4 2232 PAVGB((%1, %2, 4), %%mm4) // L4 2233 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 2234 PAVGB((%1, %%REGd), %%mm5) // L5 2235 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 2236 PAVGB((%1, %%REGa, 2), %%mm6) // L6 2237 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 2238 PAVGB((%1, %%REGc), %%mm7) // L7 2239 "movq %%mm0, (%1) \n\t" // R0 2240 "movq %%mm1, (%1, %2) \n\t" // R1 2241 "movq %%mm2, (%1, %2, 2) \n\t" // R2 2242 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 2243 "movq %%mm4, (%1, %2, 4) \n\t" // R4 2244 "movq %%mm5, (%1, %%"REG_d") \n\t" // R5 2245 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // R6 2246 "movq %%mm7, (%1, %%"REG_c") \n\t" // R7 2247 "movq %%mm0, (%0) \n\t" // L0 2248 "movq %%mm1, (%0, %2) \n\t" // L1 2249 "movq %%mm2, (%0, %2, 2) \n\t" // L2 2250 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 2251 "movq %%mm4, (%0, %2, 4) \n\t" // L4 2252 "movq %%mm5, (%0, %%"REG_d") \n\t" // L5 2253 "movq %%mm6, (%0, %%"REG_a", 2) \n\t" // L6 2254 "movq %%mm7, (%0, %%"REG_c") \n\t" // L7 2255 "jmp 4f \n\t" 2256 2257 "2: \n\t" 2258 "cmpl 508(%%"REG_d"), %%ecx \n\t" 2259 " jb 3f \n\t" 2260 2261 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 2262 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2263 "movq (%0), %%mm0 \n\t" // L0 2264 "movq (%0, %2), %%mm1 \n\t" // L1 2265 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2266 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 2267 "movq (%1), %%mm4 \n\t" // R0 2268 
"movq (%1, %2), %%mm5 \n\t" // R1 2269 "movq (%1, %2, 2), %%mm6 \n\t" // R2 2270 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 2271 PAVGB(%%mm4, %%mm0) 2272 PAVGB(%%mm5, %%mm1) 2273 PAVGB(%%mm6, %%mm2) 2274 PAVGB(%%mm7, %%mm3) 2275 PAVGB(%%mm4, %%mm0) 2276 PAVGB(%%mm5, %%mm1) 2277 PAVGB(%%mm6, %%mm2) 2278 PAVGB(%%mm7, %%mm3) 2279 "movq %%mm0, (%1) \n\t" // R0 2280 "movq %%mm1, (%1, %2) \n\t" // R1 2281 "movq %%mm2, (%1, %2, 2) \n\t" // R2 2282 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 2283 "movq %%mm0, (%0) \n\t" // L0 2284 "movq %%mm1, (%0, %2) \n\t" // L1 2285 "movq %%mm2, (%0, %2, 2) \n\t" // L2 2286 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 2287 2288 "movq (%0, %2, 4), %%mm0 \n\t" // L4 2289 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 2290 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6 2291 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 2292 "movq (%1, %2, 4), %%mm4 \n\t" // R4 2293 "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5 2294 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 2295 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 2296 PAVGB(%%mm4, %%mm0) 2297 PAVGB(%%mm5, %%mm1) 2298 PAVGB(%%mm6, %%mm2) 2299 PAVGB(%%mm7, %%mm3) 2300 PAVGB(%%mm4, %%mm0) 2301 PAVGB(%%mm5, %%mm1) 2302 PAVGB(%%mm6, %%mm2) 2303 PAVGB(%%mm7, %%mm3) 2304 "movq %%mm0, (%1, %2, 4) \n\t" // R4 2305 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 2306 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 2307 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 2308 "movq %%mm0, (%0, %2, 4) \n\t" // L4 2309 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 2310 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 2311 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 2312 "jmp 4f \n\t" 2313 2314 "3: \n\t" 2315 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 2316 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2317 "movq (%0), %%mm0 \n\t" // L0 2318 "movq (%0, %2), %%mm1 \n\t" // L1 2319 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2320 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 2321 "movq (%1), %%mm4 \n\t" // R0 2322 "movq (%1, %2), %%mm5 \n\t" // R1 2323 "movq (%1, %2, 2), 
%%mm6 \n\t" // R2 2324 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 2325 PAVGB(%%mm4, %%mm0) 2326 PAVGB(%%mm5, %%mm1) 2327 PAVGB(%%mm6, %%mm2) 2328 PAVGB(%%mm7, %%mm3) 2329 PAVGB(%%mm4, %%mm0) 2330 PAVGB(%%mm5, %%mm1) 2331 PAVGB(%%mm6, %%mm2) 2332 PAVGB(%%mm7, %%mm3) 2333 PAVGB(%%mm4, %%mm0) 2334 PAVGB(%%mm5, %%mm1) 2335 PAVGB(%%mm6, %%mm2) 2336 PAVGB(%%mm7, %%mm3) 2337 "movq %%mm0, (%1) \n\t" // R0 2338 "movq %%mm1, (%1, %2) \n\t" // R1 2339 "movq %%mm2, (%1, %2, 2) \n\t" // R2 2340 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 2341 "movq %%mm0, (%0) \n\t" // L0 2342 "movq %%mm1, (%0, %2) \n\t" // L1 2343 "movq %%mm2, (%0, %2, 2) \n\t" // L2 2344 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 2345 2346 "movq (%0, %2, 4), %%mm0 \n\t" // L4 2347 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 2348 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6 2349 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 2350 "movq (%1, %2, 4), %%mm4 \n\t" // R4 2351 "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5 2352 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 2353 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 2354 PAVGB(%%mm4, %%mm0) 2355 PAVGB(%%mm5, %%mm1) 2356 PAVGB(%%mm6, %%mm2) 2357 PAVGB(%%mm7, %%mm3) 2358 PAVGB(%%mm4, %%mm0) 2359 PAVGB(%%mm5, %%mm1) 2360 PAVGB(%%mm6, %%mm2) 2361 PAVGB(%%mm7, %%mm3) 2362 PAVGB(%%mm4, %%mm0) 2363 PAVGB(%%mm5, %%mm1) 2364 PAVGB(%%mm6, %%mm2) 2365 PAVGB(%%mm7, %%mm3) 2366 "movq %%mm0, (%1, %2, 4) \n\t" // R4 2367 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 2368 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 2369 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 2370 "movq %%mm0, (%0, %2, 4) \n\t" // L4 2371 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 2372 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 2373 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 2374 2375 "4: \n\t" 2376 2377 :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast) 2378 : "%"REG_a, "%"REG_d, "%"REG_c, "memory" 2379 ); 2380#else //HAVE_MMX2 || HAVE_AMD3DNOW 2381{ 2382 int y; 2383 int d=0; 2384// int sysd=0; 2385 int i; 2386 2387 
for(y=0; y<8; y++){ 2388 int x; 2389 for(x=0; x<8; x++){ 2390 int ref= tempBlurred[ x + y*stride ]; 2391 int cur= src[ x + y*stride ]; 2392 int d1=ref - cur; 2393// if(x==0 || x==7) d1+= d1>>1; 2394// if(y==0 || y==7) d1+= d1>>1; 2395// d+= FFABS(d1); 2396 d+= d1*d1; 2397// sysd+= d1; 2398 } 2399 } 2400 i=d; 2401 d= ( 2402 4*d 2403 +(*(tempBlurredPast-256)) 2404 +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1)) 2405 +(*(tempBlurredPast+256)) 2406 +4)>>3; 2407 *tempBlurredPast=i; 2408// ((*tempBlurredPast)*3 + d + 2)>>2; 2409 2410/* 2411Switch between 2412 1 0 0 0 0 0 0 (0) 241364 32 16 8 4 2 1 (1) 241464 48 36 27 20 15 11 (33) (approx) 241564 56 49 43 37 33 29 (200) (approx) 2416*/ 2417 if(d > maxNoise[1]){ 2418 if(d < maxNoise[2]){ 2419 for(y=0; y<8; y++){ 2420 int x; 2421 for(x=0; x<8; x++){ 2422 int ref= tempBlurred[ x + y*stride ]; 2423 int cur= src[ x + y*stride ]; 2424 tempBlurred[ x + y*stride ]= 2425 src[ x + y*stride ]= 2426 (ref + cur + 1)>>1; 2427 } 2428 } 2429 }else{ 2430 for(y=0; y<8; y++){ 2431 int x; 2432 for(x=0; x<8; x++){ 2433 tempBlurred[ x + y*stride ]= src[ x + y*stride ]; 2434 } 2435 } 2436 } 2437 }else{ 2438 if(d < maxNoise[0]){ 2439 for(y=0; y<8; y++){ 2440 int x; 2441 for(x=0; x<8; x++){ 2442 int ref= tempBlurred[ x + y*stride ]; 2443 int cur= src[ x + y*stride ]; 2444 tempBlurred[ x + y*stride ]= 2445 src[ x + y*stride ]= 2446 (ref*7 + cur + 4)>>3; 2447 } 2448 } 2449 }else{ 2450 for(y=0; y<8; y++){ 2451 int x; 2452 for(x=0; x<8; x++){ 2453 int ref= tempBlurred[ x + y*stride ]; 2454 int cur= src[ x + y*stride ]; 2455 tempBlurred[ x + y*stride ]= 2456 src[ x + y*stride ]= 2457 (ref*3 + cur + 2)>>2; 2458 } 2459 } 2460 } 2461 } 2462} 2463#endif //HAVE_MMX2 || HAVE_AMD3DNOW 2464} 2465#endif //HAVE_ALTIVEC 2466 2467#if HAVE_MMX 2468/** 2469 * accurate deblock filter 2470 */ 2471static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ 2472 int64_t dc_mask, eq_mask, both_masks; 2473 int64_t sums[10*8*2]; 
2474 src+= step*3; // src points to begin of the 8x8 Block 2475 //{ START_TIMER 2476 __asm__ volatile( 2477 "movq %0, %%mm7 \n\t" 2478 "movq %1, %%mm6 \n\t" 2479 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) 2480 ); 2481 2482 __asm__ volatile( 2483 "lea (%2, %3), %%"REG_a" \n\t" 2484// 0 1 2 3 4 5 6 7 8 9 2485// %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 2486 2487 "movq (%2), %%mm0 \n\t" 2488 "movq (%%"REG_a"), %%mm1 \n\t" 2489 "movq %%mm1, %%mm3 \n\t" 2490 "movq %%mm1, %%mm4 \n\t" 2491 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece 2492 "paddb %%mm7, %%mm0 \n\t" 2493 "pcmpgtb %%mm6, %%mm0 \n\t" 2494 2495 "movq (%%"REG_a",%3), %%mm2 \n\t" 2496 PMAXUB(%%mm2, %%mm4) 2497 PMINUB(%%mm2, %%mm3, %%mm5) 2498 "psubb %%mm2, %%mm1 \n\t" 2499 "paddb %%mm7, %%mm1 \n\t" 2500 "pcmpgtb %%mm6, %%mm1 \n\t" 2501 "paddb %%mm1, %%mm0 \n\t" 2502 2503 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 2504 PMAXUB(%%mm1, %%mm4) 2505 PMINUB(%%mm1, %%mm3, %%mm5) 2506 "psubb %%mm1, %%mm2 \n\t" 2507 "paddb %%mm7, %%mm2 \n\t" 2508 "pcmpgtb %%mm6, %%mm2 \n\t" 2509 "paddb %%mm2, %%mm0 \n\t" 2510 2511 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" 2512 2513 "movq (%2, %3, 4), %%mm2 \n\t" 2514 PMAXUB(%%mm2, %%mm4) 2515 PMINUB(%%mm2, %%mm3, %%mm5) 2516 "psubb %%mm2, %%mm1 \n\t" 2517 "paddb %%mm7, %%mm1 \n\t" 2518 "pcmpgtb %%mm6, %%mm1 \n\t" 2519 "paddb %%mm1, %%mm0 \n\t" 2520 2521 "movq (%%"REG_a"), %%mm1 \n\t" 2522 PMAXUB(%%mm1, %%mm4) 2523 PMINUB(%%mm1, %%mm3, %%mm5) 2524 "psubb %%mm1, %%mm2 \n\t" 2525 "paddb %%mm7, %%mm2 \n\t" 2526 "pcmpgtb %%mm6, %%mm2 \n\t" 2527 "paddb %%mm2, %%mm0 \n\t" 2528 2529 "movq (%%"REG_a", %3), %%mm2 \n\t" 2530 PMAXUB(%%mm2, %%mm4) 2531 PMINUB(%%mm2, %%mm3, %%mm5) 2532 "psubb %%mm2, %%mm1 \n\t" 2533 "paddb %%mm7, %%mm1 \n\t" 2534 "pcmpgtb %%mm6, %%mm1 \n\t" 2535 "paddb %%mm1, %%mm0 \n\t" 2536 2537 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 2538 PMAXUB(%%mm1, %%mm4) 2539 PMINUB(%%mm1, %%mm3, %%mm5) 2540 "psubb %%mm1, %%mm2 \n\t" 2541 "paddb %%mm7, 
%%mm2 \n\t" 2542 "pcmpgtb %%mm6, %%mm2 \n\t" 2543 "paddb %%mm2, %%mm0 \n\t" 2544 2545 "movq (%2, %3, 8), %%mm2 \n\t" 2546 PMAXUB(%%mm2, %%mm4) 2547 PMINUB(%%mm2, %%mm3, %%mm5) 2548 "psubb %%mm2, %%mm1 \n\t" 2549 "paddb %%mm7, %%mm1 \n\t" 2550 "pcmpgtb %%mm6, %%mm1 \n\t" 2551 "paddb %%mm1, %%mm0 \n\t" 2552 2553 "movq (%%"REG_a", %3, 4), %%mm1 \n\t" 2554 "psubb %%mm1, %%mm2 \n\t" 2555 "paddb %%mm7, %%mm2 \n\t" 2556 "pcmpgtb %%mm6, %%mm2 \n\t" 2557 "paddb %%mm2, %%mm0 \n\t" 2558 "psubusb %%mm3, %%mm4 \n\t" 2559 2560 "pxor %%mm6, %%mm6 \n\t" 2561 "movq %4, %%mm7 \n\t" // QP,..., QP 2562 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP 2563 "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0 2564 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0 2565 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0 2566 "movq %%mm7, %1 \n\t" 2567 2568 "movq %5, %%mm7 \n\t" 2569 "punpcklbw %%mm7, %%mm7 \n\t" 2570 "punpcklbw %%mm7, %%mm7 \n\t" 2571 "punpcklbw %%mm7, %%mm7 \n\t" 2572 "psubb %%mm0, %%mm6 \n\t" 2573 "pcmpgtb %%mm7, %%mm6 \n\t" 2574 "movq %%mm6, %0 \n\t" 2575 2576 : "=m" (eq_mask), "=m" (dc_mask) 2577 : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) 2578 : "%"REG_a 2579 ); 2580 2581 both_masks = dc_mask & eq_mask; 2582 2583 if(both_masks){ 2584 x86_reg offset= -8*step; 2585 int64_t *temp_sums= sums; 2586 2587 __asm__ volatile( 2588 "movq %2, %%mm0 \n\t" // QP,..., QP 2589 "pxor %%mm4, %%mm4 \n\t" 2590 2591 "movq (%0), %%mm6 \n\t" 2592 "movq (%0, %1), %%mm5 \n\t" 2593 "movq %%mm5, %%mm1 \n\t" 2594 "movq %%mm6, %%mm2 \n\t" 2595 "psubusb %%mm6, %%mm5 \n\t" 2596 "psubusb %%mm1, %%mm2 \n\t" 2597 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 2598 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 2599 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF 2600 2601 "pxor %%mm6, %%mm1 \n\t" 2602 "pand %%mm0, %%mm1 \n\t" 2603 "pxor %%mm1, %%mm6 \n\t" 2604 // 0:QP 6:First 2605 2606 "movq (%0, %1, 8), %%mm5 \n\t" 2607 "add %1, %0 \n\t" // %0 points to line 1 not 0 2608 "movq (%0, 
%1, 8), %%mm7 \n\t" 2609 "movq %%mm5, %%mm1 \n\t" 2610 "movq %%mm7, %%mm2 \n\t" 2611 "psubusb %%mm7, %%mm5 \n\t" 2612 "psubusb %%mm1, %%mm2 \n\t" 2613 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 2614 "movq %2, %%mm0 \n\t" // QP,..., QP 2615 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 2616 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF 2617 2618 "pxor %%mm7, %%mm1 \n\t" 2619 "pand %%mm0, %%mm1 \n\t" 2620 "pxor %%mm1, %%mm7 \n\t" 2621 2622 "movq %%mm6, %%mm5 \n\t" 2623 "punpckhbw %%mm4, %%mm6 \n\t" 2624 "punpcklbw %%mm4, %%mm5 \n\t" 2625 // 4:0 5/6:First 7:Last 2626 2627 "movq %%mm5, %%mm0 \n\t" 2628 "movq %%mm6, %%mm1 \n\t" 2629 "psllw $2, %%mm0 \n\t" 2630 "psllw $2, %%mm1 \n\t" 2631 "paddw "MANGLE(w04)", %%mm0 \n\t" 2632 "paddw "MANGLE(w04)", %%mm1 \n\t" 2633 2634#define NEXT\ 2635 "movq (%0), %%mm2 \n\t"\ 2636 "movq (%0), %%mm3 \n\t"\ 2637 "add %1, %0 \n\t"\ 2638 "punpcklbw %%mm4, %%mm2 \n\t"\ 2639 "punpckhbw %%mm4, %%mm3 \n\t"\ 2640 "paddw %%mm2, %%mm0 \n\t"\ 2641 "paddw %%mm3, %%mm1 \n\t" 2642 2643#define PREV\ 2644 "movq (%0), %%mm2 \n\t"\ 2645 "movq (%0), %%mm3 \n\t"\ 2646 "add %1, %0 \n\t"\ 2647 "punpcklbw %%mm4, %%mm2 \n\t"\ 2648 "punpckhbw %%mm4, %%mm3 \n\t"\ 2649 "psubw %%mm2, %%mm0 \n\t"\ 2650 "psubw %%mm3, %%mm1 \n\t" 2651 2652 2653 NEXT //0 2654 NEXT //1 2655 NEXT //2 2656 "movq %%mm0, (%3) \n\t" 2657 "movq %%mm1, 8(%3) \n\t" 2658 2659 NEXT //3 2660 "psubw %%mm5, %%mm0 \n\t" 2661 "psubw %%mm6, %%mm1 \n\t" 2662 "movq %%mm0, 16(%3) \n\t" 2663 "movq %%mm1, 24(%3) \n\t" 2664 2665 NEXT //4 2666 "psubw %%mm5, %%mm0 \n\t" 2667 "psubw %%mm6, %%mm1 \n\t" 2668 "movq %%mm0, 32(%3) \n\t" 2669 "movq %%mm1, 40(%3) \n\t" 2670 2671 NEXT //5 2672 "psubw %%mm5, %%mm0 \n\t" 2673 "psubw %%mm6, %%mm1 \n\t" 2674 "movq %%mm0, 48(%3) \n\t" 2675 "movq %%mm1, 56(%3) \n\t" 2676 2677 NEXT //6 2678 "psubw %%mm5, %%mm0 \n\t" 2679 "psubw %%mm6, %%mm1 \n\t" 2680 "movq %%mm0, 64(%3) \n\t" 2681 "movq %%mm1, 72(%3) \n\t" 2682 2683 "movq %%mm7, %%mm6 \n\t" 2684 "punpckhbw %%mm4, 
%%mm7 \n\t" 2685 "punpcklbw %%mm4, %%mm6 \n\t" 2686 2687 NEXT //7 2688 "mov %4, %0 \n\t" 2689 "add %1, %0 \n\t" 2690 PREV //0 2691 "movq %%mm0, 80(%3) \n\t" 2692 "movq %%mm1, 88(%3) \n\t" 2693 2694 PREV //1 2695 "paddw %%mm6, %%mm0 \n\t" 2696 "paddw %%mm7, %%mm1 \n\t" 2697 "movq %%mm0, 96(%3) \n\t" 2698 "movq %%mm1, 104(%3) \n\t" 2699 2700 PREV //2 2701 "paddw %%mm6, %%mm0 \n\t" 2702 "paddw %%mm7, %%mm1 \n\t" 2703 "movq %%mm0, 112(%3) \n\t" 2704 "movq %%mm1, 120(%3) \n\t" 2705 2706 PREV //3 2707 "paddw %%mm6, %%mm0 \n\t" 2708 "paddw %%mm7, %%mm1 \n\t" 2709 "movq %%mm0, 128(%3) \n\t" 2710 "movq %%mm1, 136(%3) \n\t" 2711 2712 PREV //4 2713 "paddw %%mm6, %%mm0 \n\t" 2714 "paddw %%mm7, %%mm1 \n\t" 2715 "movq %%mm0, 144(%3) \n\t" 2716 "movq %%mm1, 152(%3) \n\t" 2717 2718 "mov %4, %0 \n\t" //FIXME 2719 2720 : "+&r"(src) 2721 : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src) 2722 ); 2723 2724 src+= step; // src points to begin of the 8x8 Block 2725 2726 __asm__ volatile( 2727 "movq %4, %%mm6 \n\t" 2728 "pcmpeqb %%mm5, %%mm5 \n\t" 2729 "pxor %%mm6, %%mm5 \n\t" 2730 "pxor %%mm7, %%mm7 \n\t" 2731 2732 "1: \n\t" 2733 "movq (%1), %%mm0 \n\t" 2734 "movq 8(%1), %%mm1 \n\t" 2735 "paddw 32(%1), %%mm0 \n\t" 2736 "paddw 40(%1), %%mm1 \n\t" 2737 "movq (%0, %3), %%mm2 \n\t" 2738 "movq %%mm2, %%mm3 \n\t" 2739 "movq %%mm2, %%mm4 \n\t" 2740 "punpcklbw %%mm7, %%mm2 \n\t" 2741 "punpckhbw %%mm7, %%mm3 \n\t" 2742 "paddw %%mm2, %%mm0 \n\t" 2743 "paddw %%mm3, %%mm1 \n\t" 2744 "paddw %%mm2, %%mm0 \n\t" 2745 "paddw %%mm3, %%mm1 \n\t" 2746 "psrlw $4, %%mm0 \n\t" 2747 "psrlw $4, %%mm1 \n\t" 2748 "packuswb %%mm1, %%mm0 \n\t" 2749 "pand %%mm6, %%mm0 \n\t" 2750 "pand %%mm5, %%mm4 \n\t" 2751 "por %%mm4, %%mm0 \n\t" 2752 "movq %%mm0, (%0, %3) \n\t" 2753 "add $16, %1 \n\t" 2754 "add %2, %0 \n\t" 2755 " js 1b \n\t" 2756 2757 : "+r"(offset), "+r"(temp_sums) 2758 : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks) 2759 ); 2760 }else 2761 src+= step; // src points to begin of the 8x8 Block 
2762 2763 if(eq_mask != -1LL){ 2764 uint8_t *temp_src= src; 2765 __asm__ volatile( 2766 "pxor %%mm7, %%mm7 \n\t" 2767 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars 2768 "and "ALIGN_MASK", %%"REG_c" \n\t" // align 2769// 0 1 2 3 4 5 6 7 8 9 2770// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1 2771 2772 "movq (%0), %%mm0 \n\t" 2773 "movq %%mm0, %%mm1 \n\t" 2774 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 2775 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 2776 2777 "movq (%0, %1), %%mm2 \n\t" 2778 "lea (%0, %1, 2), %%"REG_a" \n\t" 2779 "movq %%mm2, %%mm3 \n\t" 2780 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 2781 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 2782 2783 "movq (%%"REG_a"), %%mm4 \n\t" 2784 "movq %%mm4, %%mm5 \n\t" 2785 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 2786 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 2787 2788 "paddw %%mm0, %%mm0 \n\t" // 2L0 2789 "paddw %%mm1, %%mm1 \n\t" // 2H0 2790 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 2791 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 2792 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 2793 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 2794 2795 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 2796 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 2797 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 2798 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 2799 2800 "movq (%%"REG_a", %1), %%mm2 \n\t" 2801 "movq %%mm2, %%mm3 \n\t" 2802 "punpcklbw %%mm7, %%mm2 \n\t" // L3 2803 "punpckhbw %%mm7, %%mm3 \n\t" // H3 2804 2805 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 2806 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 2807 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 2808 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 2809 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3 2810 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3 2811 2812 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 2813 "movq %%mm0, %%mm1 \n\t" 2814 "punpcklbw %%mm7, %%mm0 \n\t" // L4 2815 
"punpckhbw %%mm7, %%mm1 \n\t" // H4 2816 2817 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 2818 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 2819 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4 2820 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4 2821 "paddw %%mm4, %%mm4 \n\t" // 2L2 2822 "paddw %%mm5, %%mm5 \n\t" // 2H2 2823 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 2824 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 2825 2826 "lea (%%"REG_a", %1), %0 \n\t" 2827 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 2828 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 2829 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 2830 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 2831//50 opcodes so far 2832 "movq (%0, %1, 2), %%mm2 \n\t" 2833 "movq %%mm2, %%mm3 \n\t" 2834 "punpcklbw %%mm7, %%mm2 \n\t" // L5 2835 "punpckhbw %%mm7, %%mm3 \n\t" // H5 2836 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 2837 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 2838 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 2839 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 2840 2841 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 2842 "punpcklbw %%mm7, %%mm6 \n\t" // L6 2843 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 2844 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 2845 "punpckhbw %%mm7, %%mm6 \n\t" // H6 2846 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 2847 2848 "paddw %%mm0, %%mm0 \n\t" // 2L4 2849 "paddw %%mm1, %%mm1 \n\t" // 2H4 2850 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 2851 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 2852 2853 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 2854 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 2855 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 2856 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 2857 2858 "movq (%0, %1, 4), %%mm2 \n\t" 2859 "movq %%mm2, %%mm3 \n\t" 2860 "punpcklbw %%mm7, %%mm2 \n\t" // L7 2861 "punpckhbw %%mm7, %%mm3 \n\t" // H7 2862 2863 "paddw %%mm2, %%mm2 \n\t" // 2L7 2864 "paddw %%mm3, %%mm3 \n\t" // 2H7 2865 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 2866 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 2867 2868 "movq 
(%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 2869 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 2870 2871#if HAVE_MMX2 2872 "movq %%mm7, %%mm6 \n\t" // 0 2873 "psubw %%mm0, %%mm6 \n\t" 2874 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 2875 "movq %%mm7, %%mm6 \n\t" // 0 2876 "psubw %%mm1, %%mm6 \n\t" 2877 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 2878 "movq %%mm7, %%mm6 \n\t" // 0 2879 "psubw %%mm2, %%mm6 \n\t" 2880 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 2881 "movq %%mm7, %%mm6 \n\t" // 0 2882 "psubw %%mm3, %%mm6 \n\t" 2883 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 2884#else 2885 "movq %%mm7, %%mm6 \n\t" // 0 2886 "pcmpgtw %%mm0, %%mm6 \n\t" 2887 "pxor %%mm6, %%mm0 \n\t" 2888 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 2889 "movq %%mm7, %%mm6 \n\t" // 0 2890 "pcmpgtw %%mm1, %%mm6 \n\t" 2891 "pxor %%mm6, %%mm1 \n\t" 2892 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 2893 "movq %%mm7, %%mm6 \n\t" // 0 2894 "pcmpgtw %%mm2, %%mm6 \n\t" 2895 "pxor %%mm6, %%mm2 \n\t" 2896 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 2897 "movq %%mm7, %%mm6 \n\t" // 0 2898 "pcmpgtw %%mm3, %%mm6 \n\t" 2899 "pxor %%mm6, %%mm3 \n\t" 2900 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 2901#endif 2902 2903#if HAVE_MMX2 2904 "pminsw %%mm2, %%mm0 \n\t" 2905 "pminsw %%mm3, %%mm1 \n\t" 2906#else 2907 "movq %%mm0, %%mm6 \n\t" 2908 "psubusw %%mm2, %%mm6 \n\t" 2909 "psubw %%mm6, %%mm0 \n\t" 2910 "movq %%mm1, %%mm6 \n\t" 2911 "psubusw %%mm3, %%mm6 \n\t" 2912 "psubw %%mm6, %%mm1 \n\t" 2913#endif 2914 2915 "movd %2, %%mm2 \n\t" // QP 2916 "punpcklbw %%mm7, %%mm2 \n\t" 2917 2918 "movq %%mm7, %%mm6 \n\t" // 0 2919 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) 2920 "pxor %%mm6, %%mm4 \n\t" 2921 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| 2922 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) 2923 "pxor %%mm7, %%mm5 \n\t" 2924 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| 2925// 
100 opcodes 2926 "psllw $3, %%mm2 \n\t" // 8QP 2927 "movq %%mm2, %%mm3 \n\t" // 8QP 2928 "pcmpgtw %%mm4, %%mm2 \n\t" 2929 "pcmpgtw %%mm5, %%mm3 \n\t" 2930 "pand %%mm2, %%mm4 \n\t" 2931 "pand %%mm3, %%mm5 \n\t" 2932 2933 2934 "psubusw %%mm0, %%mm4 \n\t" // hd 2935 "psubusw %%mm1, %%mm5 \n\t" // ld 2936 2937 2938 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 2939 "pmullw %%mm2, %%mm4 \n\t" 2940 "pmullw %%mm2, %%mm5 \n\t" 2941 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 2942 "paddw %%mm2, %%mm4 \n\t" 2943 "paddw %%mm2, %%mm5 \n\t" 2944 "psrlw $6, %%mm4 \n\t" 2945 "psrlw $6, %%mm5 \n\t" 2946 2947 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4 2948 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4 2949 2950 "pxor %%mm2, %%mm2 \n\t" 2951 "pxor %%mm3, %%mm3 \n\t" 2952 2953 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) 2954 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) 2955 "pxor %%mm2, %%mm0 \n\t" 2956 "pxor %%mm3, %%mm1 \n\t" 2957 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| 2958 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| 2959 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 2960 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 2961 2962 "pxor %%mm6, %%mm2 \n\t" 2963 "pxor %%mm7, %%mm3 \n\t" 2964 "pand %%mm2, %%mm4 \n\t" 2965 "pand %%mm3, %%mm5 \n\t" 2966 2967#if HAVE_MMX2 2968 "pminsw %%mm0, %%mm4 \n\t" 2969 "pminsw %%mm1, %%mm5 \n\t" 2970#else 2971 "movq %%mm4, %%mm2 \n\t" 2972 "psubusw %%mm0, %%mm2 \n\t" 2973 "psubw %%mm2, %%mm4 \n\t" 2974 "movq %%mm5, %%mm2 \n\t" 2975 "psubusw %%mm1, %%mm2 \n\t" 2976 "psubw %%mm2, %%mm5 \n\t" 2977#endif 2978 "pxor %%mm6, %%mm4 \n\t" 2979 "pxor %%mm7, %%mm5 \n\t" 2980 "psubw %%mm6, %%mm4 \n\t" 2981 "psubw %%mm7, %%mm5 \n\t" 2982 "packsswb %%mm5, %%mm4 \n\t" 2983 "movq %3, %%mm1 \n\t" 2984 "pandn %%mm4, %%mm1 \n\t" 2985 "movq (%0), %%mm0 \n\t" 2986 "paddb %%mm1, %%mm0 \n\t" 2987 "movq %%mm0, (%0) \n\t" 2988 "movq (%0, %1), %%mm0 \n\t" 2989 "psubb %%mm1, %%mm0 \n\t" 2990 "movq %%mm0, (%0, %1) \n\t" 2991 2992 : "+r" (temp_src) 2993 : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask) 2994 : "%"REG_a, 
"%"REG_c
        );
    }
/*if(step==16){
    STOP_TIMER("step16")
}else{
    STOP_TIMER("stepX")
}
    } */
}
#endif //HAVE_MMX

static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                                const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);

/**
 * Copy an 8x8 block (8 rows of 8 bytes) from src to dst, optionally fixing
 * the black level.
 *
 * levelFix == 0 -> plain copy, brightness & contrast are not touched.
 * levelFix != 0 -> (MMX builds only) every pixel is offset and scaled with the
 *                  two packed 64-bit values found at packedOffsetAndScale
 *                  before it is stored with unsigned saturation.
 *
 * Without MMX both branches are the same plain memcpy() loop, i.e. levelFix
 * has no effect in the C fallback.
 *
 * @param dst                  first destination row
 * @param dstStride            distance in bytes between destination rows
 * @param src                  first source row
 * @param srcStride            distance in bytes between source rows
 * @param levelFix             nonzero to apply offset/scale (MMX only)
 * @param packedOffsetAndScale the asm reads the packed offset at byte 0 and the
 *                             packed scale at byte 8 of this pointer; the only
 *                             caller in this file passes &c.packedYOffset, so
 *                             this presumably relies on packedYScale directly
 *                             following packedYOffset in PPContext -- confirm
 *                             against the struct definition
 */
#undef REAL_SCALED_CPY
#undef SCALED_CPY

static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
                                     int levelFix, int64_t *packedOffsetAndScale)
{
#if !HAVE_MMX
    int i;
#endif
    if(levelFix){
#if HAVE_MMX
    /* The pointer arrives in REG_a (constraint "0" tied to the "=&a" output);
     * once both constants are loaded REG_a is reused as src + srcStride, which
     * is why it is declared as an output rather than a plain input.
     * Operands: %2 = src, %3 = dst, %4 = srcStride, %5 = dstStride. */
    __asm__ volatile(
        "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset
        "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale
        "lea (%2,%4), %%"REG_a" \n\t" // REG_a = src + srcStride
        "lea (%3,%5), %%"REG_d" \n\t" // REG_d = dst + dstStride
        "pxor %%mm4, %%mm4 \n\t" // mm4 = 0, used for unpacking in the non-MMX2 variant
/* REAL_SCALED_CPY processes two rows (src1 -> dst1, src2 -> dst2).
 * MMX2 variant: each byte is duplicated into both halves of a word, multiplied
 * (unsigned, high half) by the scale, then the offset is subtracted.
 * Plain MMX variant: each byte is zero-extended, the offset is subtracted, the
 * result is shifted left by 6 and multiplied (signed, high half) by the scale.
 * Both variants pack back to bytes with unsigned saturation. */
#if HAVE_MMX2
#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
        "movq " #src1 ", %%mm0 \n\t"\
        "movq " #src1 ", %%mm5 \n\t"\
        "movq " #src2 ", %%mm1 \n\t"\
        "movq " #src2 ", %%mm6 \n\t"\
        "punpcklbw %%mm0, %%mm0 \n\t"\
        "punpckhbw %%mm5, %%mm5 \n\t"\
        "punpcklbw %%mm1, %%mm1 \n\t"\
        "punpckhbw %%mm6, %%mm6 \n\t"\
        "pmulhuw %%mm3, %%mm0 \n\t"\
        "pmulhuw %%mm3, %%mm5 \n\t"\
        "pmulhuw %%mm3, %%mm1 \n\t"\
        "pmulhuw %%mm3, %%mm6 \n\t"\
        "psubw %%mm2, %%mm0 \n\t"\
        "psubw %%mm2, %%mm5 \n\t"\
        "psubw %%mm2, %%mm1 \n\t"\
        "psubw %%mm2, %%mm6 \n\t"\
        "packuswb %%mm5, %%mm0 \n\t"\
        "packuswb %%mm6, %%mm1 \n\t"\
        "movq %%mm0, " #dst1 " \n\t"\
        "movq %%mm1, " #dst2 " \n\t"\

#else //HAVE_MMX2
#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
        "movq " #src1 ", %%mm0 \n\t"\
        "movq " #src1 ", %%mm5 \n\t"\
        "punpcklbw %%mm4, %%mm0 \n\t"\
        "punpckhbw %%mm4, %%mm5 \n\t"\
        "psubw %%mm2, %%mm0 \n\t"\
        "psubw %%mm2, %%mm5 \n\t"\
        "movq " #src2 ", %%mm1 \n\t"\
        "psllw $6, %%mm0 \n\t"\
        "psllw $6, %%mm5 \n\t"\
        "pmulhw %%mm3, %%mm0 \n\t"\
        "movq " #src2 ", %%mm6 \n\t"\
        "pmulhw %%mm3, %%mm5 \n\t"\
        "punpcklbw %%mm4, %%mm1 \n\t"\
        "punpckhbw %%mm4, %%mm6 \n\t"\
        "psubw %%mm2, %%mm1 \n\t"\
        "psubw %%mm2, %%mm6 \n\t"\
        "psllw $6, %%mm1 \n\t"\
        "psllw $6, %%mm6 \n\t"\
        "pmulhw %%mm3, %%mm1 \n\t"\
        "pmulhw %%mm3, %%mm6 \n\t"\
        "packuswb %%mm5, %%mm0 \n\t"\
        "packuswb %%mm6, %%mm1 \n\t"\
        "movq %%mm0, " #dst1 " \n\t"\
        "movq %%mm1, " #dst2 " \n\t"\

#endif //HAVE_MMX2
/* The extra level of indirection lets the arguments be macro-expanded before
 * REAL_SCALED_CPY stringifies them (REGa/REGd are presumably macros from
 * libavutil/x86_cpu.h -- confirm). */
#define SCALED_CPY(src1, src2, dst1, dst2)\
   REAL_SCALED_CPY(src1, src2, dst1, dst2)

/* rows 0 and 1 */
SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
/* rows 2 and 3 (REG_a/REG_d point at row 1) */
SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2))
/* rows 4 and 5 */
SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4))
        "lea (%%"REG_a",%4,4), %%"REG_a" \n\t" // REG_a = src + 5*srcStride
        "lea (%%"REG_d",%5,4), %%"REG_d" \n\t" // REG_d = dst + 5*dstStride
/* rows 6 and 7 */
SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))


        : "=&a" (packedOffsetAndScale)
        : "0" (packedOffsetAndScale),
        "r"(src),
        "r"(dst),
        "r" ((x86_reg)srcStride),
        "r" ((x86_reg)dstStride)
        : "%"REG_d
    );
#else //HAVE_MMX
    /* C fallback: plain copy, no level correction is applied here */
    for(i=0; i<8; i++)
        memcpy( &(dst[dstStride*i]),
                &(src[srcStride*i]), BLOCK_SIZE);
#endif //HAVE_MMX
    }else{
#if HAVE_MMX
    /* Plain copy, two rows per macro invocation.
     * Operands: %0 = src, %1 = dst, %2 = srcStride, %3 = dstStride. */
    __asm__ volatile(
        "lea (%0,%2), %%"REG_a" \n\t" // REG_a = src + srcStride
        "lea (%1,%3), %%"REG_d" \n\t" // REG_d = dst + dstStride

#define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
        "movq " #src1 ", %%mm0 \n\t"\
        "movq " #src2 ", %%mm1 \n\t"\
        "movq %%mm0, " #dst1 " \n\t"\
        "movq %%mm1, " #dst2 " \n\t"\

#define SIMPLE_CPY(src1, src2, dst1, dst2)\
   REAL_SIMPLE_CPY(src1, src2, dst1, dst2)

/* rows 0 and 1 */
SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
/* rows 2 and 3 */
SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2))
/* rows 4 and 5 */
SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4))
        "lea (%%"REG_a",%2,4), %%"REG_a" \n\t" // REG_a = src + 5*srcStride
        "lea (%%"REG_d",%3,4), %%"REG_d" \n\t" // REG_d = dst + 5*dstStride
/* rows 6 and 7 */
SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))

        : : "r" (src),
        "r" (dst),
        "r" ((x86_reg)srcStride),
        "r" ((x86_reg)dstStride)
        : "%"REG_a, "%"REG_d
    );
#else //HAVE_MMX
    for(i=0; i<8; i++)
        memcpy( &(dst[dstStride*i]),
                &(src[srcStride*i]), BLOCK_SIZE);
#endif //HAVE_MMX
    }
}

/**
 * Duplicate the given 8 src pixels 3 times upward.
 *
 * The 8 bytes at src are written to the rows at src - stride,
 * src - 2*stride and src - 3*stride; the row at src itself is only read.
 *
 * @param src    row to replicate
 * @param stride distance in bytes between rows
 */
static inline void RENAME(duplicate)(uint8_t src[], int stride)
{
#if HAVE_MMX
    /* %1 holds -stride, so after the add %0 points one row above src and the
     * three stores hit rows -1, -2 and -3 */
    __asm__ volatile(
        "movq (%0), %%mm0 \n\t"
        "add %1, %0 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm0, (%0, %1) \n\t"
        "movq %%mm0, (%0, %1, 2) \n\t"
        : "+r" (src)
        : "r" ((x86_reg)-stride)
    );
#else
    int i;
    uint8_t *p=src;
    for(i=0; i<3; i++){
        p-= stride;
        memcpy(p, src, 8);
    }
#endif
}

/**
 * Filter array of bytes (Y or U or V values).
 *
 * The plane is processed in rows of 8x8 blocks. Each block is first copied
 * from src to dst (copyAhead lines ahead of the lines being filtered), then
 * optionally deinterlaced, vertically deblocked, horizontally deblocked
 * against the block to its left, deringed and temporally denoised, depending
 * on the bits set in the mode.
 *
 * The function works on a stack copy of *c2 and writes that copy back to *c2
 * before returning.
 *
 * @param src        source plane
 * @param srcStride  distance in bytes between source rows (may be negative)
 * @param dst        destination plane
 * @param dstStride  distance in bytes between destination rows (may be negative)
 * @param width      plane width in pixels
 * @param height     plane height in pixels
 * @param QPs        quantizer table; for luma it is indexed with x>>4 / y>>4,
 *                   for chroma the shifts are reduced by the chroma subsampling
 *                   stored in the context
 * @param QPStride   distance in entries between rows of QPs
 * @param isColor    nonzero for a chroma plane: selects the chroma mode and
 *                   skips the luma level (histogram) analysis
 * @param c2         postprocessing context, updated on return
 */
static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                                const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
{
    DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
    int x,y;
#ifdef COMPILE_TIME_MODE
    const int mode= COMPILE_TIME_MODE;
#else
    const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
#endif
    int black=0, white=255; // blackest black and whitest white in the picture
    int QPCorrecture= 256*256; // QP scale factor in 16.16 fixed point, 256*256 == 1.0

    int copyAhead; // number of lines, beyond the current 8, that blockCopy runs ahead of the filters
#if HAVE_MMX
    int i;
#endif

    // shifts converting a pixel position into an index into the QP tables
    const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
    const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;

    //FIXME remove
    uint64_t * const yHistogram= c.yHistogram;
    // with a negative stride the rows grow downward in memory, so start 23 rows
    // into the temporary buffers (presumably they hold 24 rows -- confirm)
    uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
    uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride;
    //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;

#if HAVE_MMX
    // precompute, for every table index 0..56, the DC offset and threshold
    // replicated into all 8 bytes of a 64-bit word (vertClassify indexes these
    // tables with c->nonBQP)
    for(i=0; i<57; i++){
        int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
        int threshold= offset*2 + 1;
        c.mmxDcOffset[i]= 0x7F - offset;
        c.mmxDcThreshold[i]= 0x7F - threshold;
        c.mmxDcOffset[i]*= 0x0101010101010101LL;
        c.mmxDcThreshold[i]*= 0x0101010101010101LL;
    }
#endif

    // how many lines must be available below the current block depends on the
    // most demanding filter that is enabled
    if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
    else if( (mode & LINEAR_BLEND_DEINT_FILTER)
          || (mode & FFMPEG_DEINT_FILTER)
          || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
    else if( (mode & V_DEBLOCK)
          || (mode & LINEAR_IPOL_DEINT_FILTER)
          || (mode & MEDIAN_DEINT_FILTER)
          || (mode & V_A_DEBLOCK)) copyAhead=13;
    else if(mode & V_X1_FILTER) copyAhead=11;
//    else if(mode & V_RK1_FILTER) copyAhead=10;
    else if(mode & DERING) copyAhead=9;
    else copyAhead=8;

    copyAhead-= 8; // from here on: lines ahead of the current 8-line block (0..8)

    if(!isColor){
        uint64_t sum= 0;
        int i;
        uint64_t maxClipped;
        uint64_t clipped;
        double scale;

        c.frameNum++;
        // the histogram of the first frame is unusable, so seed bin 0 instead
        if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;

        for(i=0; i<256; i++){
            sum+= yHistogram[i];
        }

        /* We always get a completely black picture first. */
        maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);

        // black: walk down from 255 until fewer than maxClipped samples lie at
        // or below the current level
        clipped= sum;
        for(black=255; black>0; black--){
            if(clipped < maxClipped) break;
            clipped-= yHistogram[black];
        }

        // white: walk up from 0 until fewer than maxClipped samples lie at or
        // above the current level
        clipped= sum;
        for(white=0; white<256; white++){
            if(clipped < maxClipped) break;
            clipped-= yHistogram[white];
        }

        // NOTE(review): white == black would divide by zero here; nothing in
        // this function rules it out
        scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);

        // the fixed-point formats match the two REAL_SCALED_CPY variants in
        // blockCopy: 8 fractional bits with a pre-scaled offset for MMX2,
        // 10 fractional bits with an unscaled offset otherwise
#if HAVE_MMX2
        c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
        c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
#else
        c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
        c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
#endif

        // replicate the 16-bit values into all four words
        c.packedYOffset|= c.packedYOffset<<32;
        c.packedYOffset|= c.packedYOffset<<16;

        c.packedYScale|= c.packedYScale<<32;
        c.packedYScale|= c.packedYScale<<16;

        if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5);
        else QPCorrecture= 256*256;
    }else{
        c.packedYScale= 0x0100010001000100LL;
        c.packedYOffset= 0;
        QPCorrecture= 256*256;
    }

    /* copy & deinterlace first row of blocks */
    y=-BLOCK_SIZE;
    {
        const uint8_t *srcBlock= &(src[y*srcStride]);
        uint8_t *dstBlock= tempDst + dstStride;

        // From this point on it is guaranteed that we can read and write 16 lines downward
        // finish 1 block before the next otherwise we might have a problem
        // with the L1 Cache of the P4 ... or only a few blocks at a time or something
        for(x=0; x<width; x+=BLOCK_SIZE){

#if HAVE_MMX2
/*
            prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
            prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
            prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
            prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
*/

            /* asm form of the prefetches above, except that the row offset is
             * ((x>>2)&6) + copyAhead (operand %5) rather than the constant 5 */
            __asm__(
                "mov %4, %%"REG_a" \n\t"
                "shr $2, %%"REG_a" \n\t"
                "and $6, %%"REG_a" \n\t"
                "add %5, %%"REG_a" \n\t"
                "mov %%"REG_a", %%"REG_d" \n\t"
                "imul %1, %%"REG_a" \n\t"
                "imul %3, %%"REG_d" \n\t"
                "prefetchnta 32(%%"REG_a", %0) \n\t"
                "prefetcht0 32(%%"REG_d", %2) \n\t"
                "add %1, %%"REG_a" \n\t"
                "add %3, %%"REG_d" \n\t"
                "prefetchnta 32(%%"REG_a", %0) \n\t"
                "prefetcht0 32(%%"REG_d", %2) \n\t"
                :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
                "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
                : "%"REG_a, "%"REG_d
            );

#elif HAVE_AMD3DNOW
//FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
/*          prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
            prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
            prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
            prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
*/
#endif

            // srcBlock starts 8 lines above the picture, so line 8 is picture line 0
            RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
                              srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);

            // replicate the first copied line into the 3 lines above it
            RENAME(duplicate)(dstBlock + dstStride*8, dstStride);

            if(mode & LINEAR_IPOL_DEINT_FILTER)
                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
            else if(mode & LINEAR_BLEND_DEINT_FILTER)
                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
            else if(mode & MEDIAN_DEINT_FILTER)
                RENAME(deInterlaceMedian)(dstBlock, dstStride);
            else if(mode & CUBIC_IPOL_DEINT_FILTER)
                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
            else if(mode & FFMPEG_DEINT_FILTER)
                RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
            else if(mode & LOWPASS5_DEINT_FILTER)
                RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
/*          else if(mode & CUBIC_BLEND_DEINT_FILTER)
                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
*/
            dstBlock+=8;
            srcBlock+=8;
        }
        // move the first copyAhead lines from the temporary buffer into dst
        if(width==FFABS(dstStride))
            linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
        else{
            int i;
            for(i=0; i<copyAhead; i++){
                memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
            }
        }
    }

    for(y=0; y<height; y+=BLOCK_SIZE){
        //1% speedup if these are here instead of the inner loop
        const uint8_t *srcBlock= &(src[y*srcStride]);
        uint8_t *dstBlock= &(dst[y*dstStride]);
#if HAVE_MMX
        // two scratch blocks that are swapped after every block of the row
        uint8_t *tempBlock1= c.tempBlocks;
        uint8_t *tempBlock2= c.tempBlocks + 8;
#endif
        const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
        int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
        int QP=0;
        /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
           if not then use a temporary buffer */
        if(y+15 >= height){
            int i;
            /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
               blockcopy to dst later */
            linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
                    FFMAX(height-y-copyAhead, 0), srcStride);

            /* duplicate last line of src to fill the void up to line (copyAhead+7) */
            for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
                    memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));

            /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
            linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);

            /* duplicate last line of dst to fill the void up to line (copyAhead) */
            for(i=height-y+1; i<=copyAhead; i++)
                    memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));

            // work inside the temporary buffers for this row of blocks
            dstBlock= tempDst + dstStride;
            srcBlock= tempSrc;
        }

        // From this point on it is guaranteed that we can read and write 16 lines downward
        // finish 1 block before the next otherwise we might have a problem
        // with the L1 Cache of the P4 ... or only a few blocks at a time or something
        for(x=0; x<width; x+=BLOCK_SIZE){
            const int stride= dstStride;
#if HAVE_MMX
            uint8_t *tmpXchg;
#endif
            if(isColor){
                QP= QPptr[x>>qpHShift];
                c.nonBQP= nonBQPptr[x>>qpHShift];
            }else{
                // luma: rescale both QPs by QPCorrecture (16.16 fixed point, rounded)
                QP= QPptr[x>>4];
                QP= (QP* QPCorrecture + 256*128)>>16;
                c.nonBQP= nonBQPptr[x>>4];
                c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
                // one histogram sample per block, taken at row 12, column 4
                yHistogram[ srcBlock[srcStride*12 + 4] ]++;
            }
            c.QP= QP;
#if HAVE_MMX
            // broadcast QP into all 8 bytes of c.pQPb
            __asm__ volatile(
                "movd %1, %%mm7 \n\t"
                "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
                "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
                "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
                "movq %%mm7, %0 \n\t"
                : "=m" (c.pQPb)
                : "r" (QP)
            );
#endif


#if HAVE_MMX2
/*
            prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
            prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
            prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
            prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
*/

            /* asm form of the prefetches above, except that the row offset is
             * ((x>>2)&6) + copyAhead (operand %5) rather than the constant 5 */
            __asm__(
                "mov %4, %%"REG_a" \n\t"
                "shr $2, %%"REG_a" \n\t"
                "and $6, %%"REG_a" \n\t"
                "add %5, %%"REG_a" \n\t"
                "mov %%"REG_a", %%"REG_d" \n\t"
                "imul %1, %%"REG_a" \n\t"
                "imul %3, %%"REG_d" \n\t"
                "prefetchnta 32(%%"REG_a", %0) \n\t"
                "prefetcht0 32(%%"REG_d", %2) \n\t"
                "add %1, %%"REG_a" \n\t"
                "add %3, %%"REG_d" \n\t"
                "prefetchnta 32(%%"REG_a", %0) \n\t"
                "prefetcht0 32(%%"REG_d", %2) \n\t"
                :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
                "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
                : "%"REG_a, "%"REG_d
            );

#elif HAVE_AMD3DNOW
//FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
/*          prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
            prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
            prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
            prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
*/
#endif

            // bring in the 8 lines that lie copyAhead lines below the current block
            RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
                              srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);

            if(mode & LINEAR_IPOL_DEINT_FILTER)
                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
            else if(mode & LINEAR_BLEND_DEINT_FILTER)
                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
            else if(mode & MEDIAN_DEINT_FILTER)
                RENAME(deInterlaceMedian)(dstBlock, dstStride);
            else if(mode & CUBIC_IPOL_DEINT_FILTER)
                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
            else if(mode & FFMPEG_DEINT_FILTER)
                RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
            else if(mode & LOWPASS5_DEINT_FILTER)
                RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
/*          else if(mode & CUBIC_BLEND_DEINT_FILTER)
                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
*/

            /* only deblock if we have 2 blocks */
            if(y + 8 < height){
                if(mode & V_X1_FILTER)
                    RENAME(vertX1Filter)(dstBlock, stride, &c);
                else if(mode & V_DEBLOCK){
                    // the classifier result selects the filter: 1 -> low pass, 2 -> default filter
                    const int t= RENAME(vertClassify)(dstBlock, stride, &c);

                    if(t==1)
                        RENAME(doVertLowPass)(dstBlock, stride, &c);
                    else if(t==2)
                        RENAME(doVertDefFilter)(dstBlock, stride, &c);
                }else if(mode & V_A_DEBLOCK){
                    RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
                }
            }

#if HAVE_MMX
            // MMX builds do horizontal deblocking by running the vertical
            // filters on transposed data held in tempBlock1 (row stride 16)
            RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
#endif
            /* check if we have a previous block to deblock it with dstBlock */
            if(x - 8 >= 0){
#if HAVE_MMX
                if(mode & H_X1_FILTER)
                        RENAME(vertX1Filter)(tempBlock1, 16, &c);
                else if(mode & H_DEBLOCK){
//START_TIMER
                    const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
//STOP_TIMER("dc & minmax")
                    if(t==1)
                        RENAME(doVertLowPass)(tempBlock1, 16, &c);
                    else if(t==2)
                        RENAME(doVertDefFilter)(tempBlock1, 16, &c);
                }else if(mode & H_A_DEBLOCK){
                        RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
                }

                // write the filtered columns back around the block boundary
                RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);

#else
                if(mode & H_X1_FILTER)
                    horizX1Filter(dstBlock-4, stride, QP);
                else if(mode & H_DEBLOCK){
#if HAVE_ALTIVEC
                    DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
                    int t;
                    transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);

                    t = vertClassify_altivec(tempBlock-48, 16, &c);
                    if(t==1) {
                        doVertLowPass_altivec(tempBlock-48, 16, &c);
                        transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
                    }
                    else if(t==2) {
                        doVertDefFilter_altivec(tempBlock-48, 16, &c);
                        transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
                    }
#else
                    const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);

                    if(t==1)
                        RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
                    else if(t==2)
                        RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
#endif
                }else if(mode & H_A_DEBLOCK){
                    RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
                }
#endif //HAVE_MMX
                // dering and temporal denoising run on the block to the left
                // (dstBlock-8), whose right edge has just been deblocked
                if(mode & DERING){
                //FIXME filter first line
                    if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
                }

                if(mode & TEMP_NOISE_FILTER)
                {
                    RENAME(tempNoiseReducer)(dstBlock-8, stride,
                            c.tempBlurred[isColor] + y*dstStride + x,
                            c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3),
                            c.ppMode.maxTmpNoise);
                }
            }

            dstBlock+=8;
            srcBlock+=8;

#if HAVE_MMX
            tmpXchg= tempBlock1;
            tempBlock1= tempBlock2;
            tempBlock2 = tmpXchg;
#endif
        }

        // the loop above only post-filters the block left of the current one,
        // so the last block of the row is handled here
        if(mode & DERING){
            if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
        }

        if((mode & TEMP_NOISE_FILTER)){
            RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
                    c.tempBlurred[isColor] + y*dstStride + x,
                    c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3),
                    c.ppMode.maxTmpNoise);
        }

        /* did we use a tmp buffer for the last lines*/
        if(y+15 >= height){
            // copy the remaining height-y lines from the temporary buffer back to dst
            uint8_t *dstBlock= &(dst[y*dstStride]);
            if(width==FFABS(dstStride))
                linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
            else{
                int i;
                for(i=0; i<height-y; i++){
                    memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
                }
            }
        }
/*
        for(x=0; x<width; x+=32){
            volatile int i;
            i+=   + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
                + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
                + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
                + dstBlock[x +13*dstStride]
                + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
        }*/
    }
    // leave MMX state so that x87 floating point code can run again
#if HAVE_AMD3DNOW
    __asm__ volatile("femms");
#elif HAVE_MMX
    __asm__ volatile("emms");
#endif

#ifdef DEBUG_BRIGHTNESS
    // debug aid: draw the luma histogram and the black/white levels into the picture
    if(!isColor){
        int max=1;
        int i;
        for(i=0; i<256; i++)
            if(yHistogram[i] > max) max=yHistogram[i];

        for(i=1; i<256; i++){
            int x;
            int start=yHistogram[i-1]/(max/256+1);
            int end=yHistogram[i]/(max/256+1);
            int inc= end > start ? 1 : -1;
            for(x=start; x!=end+inc; x+=inc)
                dst[ i*dstStride + x]+=128;
        }

        for(i=0; i<100; i+=2){
            dst[ (white)*dstStride + i]+=128;
            dst[ (black)*dstStride + i]+=128;
        }
    }
#endif

    *c2= c; //copy local context back

}