/*
 * DSP utils mmx functions are compiled twice for rnd/no_rnd
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * Reader notes. The helper macros used below are defined outside this file,
 * so the descriptions here are assumptions to be confirmed against their
 * definitions:
 *  - DEF(a, b)      presumably builds the per-variant (rnd / no_rnd) function name.
 *  - PAVGB(a, b, r, m)            presumably r = byte-wise average of a and b,
 *                                 using m as a constant mask register.
 *  - PAVGBP(a, b, r1, c, d, r2)   presumably two such averages at once
 *                                 (r1 from a,b and r2 from c,d), implicitly using mm6.
 *  - MOVQ_BFE(mm6)  presumably loads the 0xFE byte mask those macros need.
 *  - MOVQ_ZERO(mm7) presumably clears mm7; it is used below as the zero
 *                   operand when unpacking bytes to words.
 *  - REG_a          the accumulator register name; it is listed as a clobber
 *                   wherever it is used as a scratch register.
 *
 * Every hand-written loop below counts h down and exits only when it reaches
 * exactly zero, so h has to be a positive multiple of the number of rows
 * processed per iteration (stated per function).
 */

// put_pixels
/*
 * Horizontal half-pel put, 8 pixels wide: each output byte is the average of
 * pixels[x] and pixels[x + 1]. Source and destination share line_size.
 * Processes 4 rows per loop iteration.
 */
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        /* REG_a = 2 * line_size: both pointers advance two rows at a time */
        "lea (%3, %3), %%"REG_a" \n\t"
        ASMALIGN(3)
        "1: \n\t"
        /* rows 0 and 1: load each row at x and at x + 1 */
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        /* rows 2 and 3: same sequence */
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

/*
 * Put the average of two sources, 8 pixels wide.
 * src1 advances by src1Stride per row; src2 is read as consecutive 8-byte
 * rows (it advances by 8 bytes per row); dst advances by dstStride.
 * If h is odd, one row is handled up front and h is decremented; the main
 * loop then processes 4 rows per iteration.
 */
static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        /* odd h: peel off a single row first */
        "testl $1, %0 \n\t"
        " jz 1f \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "add $8, %2 \n\t"
        PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
        "movq %%mm4, (%3) \n\t"
        "add %5, %3 \n\t"
        "decl %0 \n\t"
        ASMALIGN(3)
        "1: \n\t"
        /* rows 0 and 1: src2 rows are at offsets 0 and 8 */
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "movq (%1), %%mm2 \n\t"
        "movq 8(%2), %%mm3 \n\t"
        "add %4, %1 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "add %5, %3 \n\t"
        "movq %%mm5, (%3) \n\t"
        "add %5, %3 \n\t"
        /* rows 2 and 3: src2 rows are at offsets 16 and 24 */
        "movq (%1), %%mm0 \n\t"
        "movq 16(%2), %%mm1 \n\t"
        "add %4, %1 \n\t"
        "movq (%1), %%mm2 \n\t"
        "movq 24(%2), %%mm3 \n\t"
        "add %4, %1 \n\t"
        /* four 8-byte src2 rows consumed */
        "add $32, %2 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "add %5, %3 \n\t"
        "movq %%mm5, (%3) \n\t"
        "add %5, %3 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        /* NOTE(review): h is kept in memory under PIC, presumably because the
         * "b" register is reserved there - confirm. */
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
}

/*
 * Horizontal half-pel put, 16 pixels wide: same scheme as the 8-wide version,
 * with the right-hand 8 bytes handled at offsets 8 / 9.
 * Processes 4 rows per loop iteration.
 */
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        /* REG_a = 2 * line_size */
        "lea (%3, %3), %%"REG_a" \n\t"
        ASMALIGN(3)
        "1: \n\t"
        /* rows 0 and 1, left 8 bytes */
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        /* rows 0 and 1, right 8 bytes */
        "movq 8(%1), %%mm0 \n\t"
        "movq 9(%1), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm2 \n\t"
        "movq 9(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        /* rows 2 and 3, left 8 bytes */
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        /* rows 2 and 3, right 8 bytes */
        "movq 8(%1), %%mm0 \n\t"
        "movq 9(%1), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm2 \n\t"
        "movq 9(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

/*
 * Put the average of two sources, 16 pixels wide.
 * src1 advances by src1Stride per row; src2 is read as consecutive 16-byte
 * rows; dst advances by dstStride.
 * If h is odd, one row is handled up front and h is decremented; the main
 * loop then processes 2 rows per iteration.
 */
static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        /* odd h: peel off a single 16-byte row first */
        "testl $1, %0 \n\t"
        " jz 1f \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "movq 8(%1), %%mm2 \n\t"
        "movq 8(%2), %%mm3 \n\t"
        "add %4, %1 \n\t"
        "add $16, %2 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "movq %%mm5, 8(%3) \n\t"
        "add %5, %3 \n\t"
        "decl %0 \n\t"
        ASMALIGN(3)
        "1: \n\t"
        /* row 0: src2 bytes 0..15 */
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "movq 8(%1), %%mm2 \n\t"
        "movq 8(%2), %%mm3 \n\t"
        "add %4, %1 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "movq %%mm5, 8(%3) \n\t"
        "add %5, %3 \n\t"
        /* row 1: src2 bytes 16..31 */
        "movq (%1), %%mm0 \n\t"
        "movq 16(%2), %%mm1 \n\t"
        "movq 8(%1), %%mm2 \n\t"
        "movq 24(%2), %%mm3 \n\t"
        "add %4, %1 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "movq %%mm5, 8(%3) \n\t"
        "add %5, %3 \n\t"
        /* two 16-byte src2 rows consumed */
        "add $32, %2 \n\t"
        "subl $2, %0 \n\t"
        "jnz 1b \n\t"
        /* NOTE(review): h is kept in memory under PIC, presumably because the
         * "b" register is reserved there - confirm. */
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
}

/*
 * Vertical half-pel put, 8 pixels wide: each output row is the average of
 * source row n and source row n + 1, so h + 1 source rows are read.
 * The most recently loaded row is carried across iterations in a register
 * (mm0 and mm2 alternate in that role). Processes 4 rows per loop iteration.
 */
static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        /* REG_a = 2 * line_size */
        "lea (%3, %3), %%"REG_a" \n\t"
        /* preload source row 0 */
        "movq (%1), %%mm0 \n\t"
        ASMALIGN(3)
        "1: \n\t"
        /* mm0 = row n (carried), mm1 = row n+1, mm2 = row n+2 */
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"),%%mm2 \n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        /* same again with mm2 as the carried row and mm0 as the new last row */
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"),%%mm0 \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

/*
 * Diagonal half-pel put, 8 pixels wide: each output byte is
 *   (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + mm6) >> 2
 * computed in 16-bit lanes. h + 1 source rows are read.
 * The horizontal pair sums of the previous row are kept in registers
 * (mm4/mm5 and mm0/mm1 alternate) so each source row is unpacked only once.
 * REG_a is the running byte offset into both buffers; pixels is pre-advanced
 * by one row so (%1, REG_a) addresses the "next" source row.
 * Processes 2 rows per loop iteration.
 */
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_ZERO(mm7);
    SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        /* row 0: widen to words and form p[x] + p[x+1] in mm4 (low) / mm5 (high) */
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm4 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        /* offset = 0, source pointer moves to row 1 */
        "xor %%"REG_a", %%"REG_a" \n\t"
        "add %3, %1 \n\t"
        ASMALIGN(3)
        "1: \n\t"
        /* next row: pair sums into mm0 (low) / mm1 (high) */
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddusw %%mm2, %%mm0 \n\t"
        "paddusw %%mm3, %%mm1 \n\t"
        /* previous-row sums + rounding + this-row sums, then divide by 4 */
        "paddusw %%mm6, %%mm4 \n\t"
        "paddusw %%mm6, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        "psrlw $2, %%mm4 \n\t"
        "psrlw $2, %%mm5 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "movq %%mm4, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"

        /* second row of the pair: mm0/mm1 now hold the previous-row sums,
         * the new row's sums go to mm4/mm5 ready for the next iteration */
        "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
        "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm2, %%mm4 \n\t"
        "paddusw %%mm3, %%mm5 \n\t"
        "paddusw %%mm6, %%mm0 \n\t"
        "paddusw %%mm6, %%mm1 \n\t"
        "paddusw %%mm4, %%mm0 \n\t"
        "paddusw %%mm5, %%mm1 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"

        "subl $2, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :REG_a, "memory");
}

// avg_pixels
/*
 * Average 4 source pixels per row into the destination: block is read,
 * averaged with pixels, and written back. One row per C loop iteration.
 * The do/while runs the body before testing h, so h must be at least 1.
 */
static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movd %0, %%mm0 \n\t"
            "movd %1, %%mm1 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            "movd %%mm2, %0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    }
    while (--h);
}

// in case more speed is needed - unrolling would certainly help
/*
 * Average 8 source pixels per row into the destination (read-modify-write of
 * block). One row per C loop iteration; h must be at least 1.
 */
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %0, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, %0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    }
    while (--h);
}

/*
 * Average 16 source pixels per row into the destination, as two 8-byte
 * halves. One row per C loop iteration; h must be at least 1.
 *
 * NOTE(review): "8%0" / "8%1" paste a displacement of 8 in front of the
 * memory operand text to reach the second half; this presumably relies on
 * the compiler emitting the operand as a plain "(reg)" address - confirm.
 */
static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            /* left 8 bytes */
            "movq %0, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, %0 \n\t"
            /* right 8 bytes */
            "movq 8%0, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, 8%0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    }
    while (--h);
}

/*
 * Horizontal half-pel avg, 8 pixels wide: first averages pixels[x] with
 * pixels[x + 1], then averages that result with the existing destination.
 * One row per C loop iteration; h must be at least 1.
 */
static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0 \n\t"
            "movq 1%1, %%mm1 \n\t"
            "movq %0, %%mm3 \n\t"
            /* mm2 = interpolated source, mm0 = average of that with dst */
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    } while (--h);
}

/*
 * Average of two sources, then averaged into the destination, 8 pixels wide.
 * src1 advances by src1Stride, src2 by 8 bytes, dst by dstStride per row.
 * One row per C loop iteration; h must be at least 1.
 */
static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0 \n\t"
            "movq %2, %%mm1 \n\t"
            "movq %0, %%mm3 \n\t"
            /* mm2 = average of the two sources, mm0 = average of that with dst */
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0 \n\t"
            :"+m"(*dst)
            :"m"(*src1), "m"(*src2)
            :"memory");
        dst += dstStride;
        src1 += src1Stride;
        src2 += 8;
    } while (--h);
}

/*
 * Horizontal half-pel avg, 16 pixels wide: the 8-wide scheme applied to both
 * halves of the row (second half via the "8%…" / "9%…" displacement prefix).
 * One row per C loop iteration; h must be at least 1.
 */
static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            /* left 8 bytes */
            "movq %1, %%mm0 \n\t"
            "movq 1%1, %%mm1 \n\t"
            "movq %0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0 \n\t"
            /* right 8 bytes */
            "movq 8%1, %%mm0 \n\t"
            "movq 9%1, %%mm1 \n\t"
            "movq 8%0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, 8%0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    } while (--h);
}

/*
 * Average of two sources, then averaged into the destination, 16 pixels wide.
 * src1 advances by src1Stride, src2 by 16 bytes, dst by dstStride per row.
 * One row per C loop iteration; h must be at least 1.
 */
static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            /* left 8 bytes */
            "movq %1, %%mm0 \n\t"
            "movq %2, %%mm1 \n\t"
            "movq %0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0 \n\t"
            /* right 8 bytes */
            "movq 8%1, %%mm0 \n\t"
            "movq 8%2, %%mm1 \n\t"
            "movq 8%0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, 8%0 \n\t"
            :"+m"(*dst)
            :"m"(*src1), "m"(*src2)
            :"memory");
        dst += dstStride;
        src1 += src1Stride;
        src2 += 16;
    } while (--h);
}

/*
 * Vertical half-pel avg, 8 pixels wide: averages source rows n and n + 1,
 * then averages that result with the existing destination row.
 * h + 1 source rows are read; the last loaded row is carried in a register
 * (mm0 and mm2 alternate). Processes 4 rows per loop iteration.
 */
static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        /* REG_a = 2 * line_size */
        "lea (%3, %3), %%"REG_a" \n\t"
        /* preload source row 0 */
        "movq (%1), %%mm0 \n\t"
        ASMALIGN(3)
        "1: \n\t"
        /* mm0 = row n (carried), mm1 = row n+1, mm2 = row n+2 */
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm2 \n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
        /* blend both interpolated rows with the destination */
        "movq (%2), %%mm3 \n\t"
        PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
        "movq (%2, %3), %%mm3 \n\t"
        PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"

        /* same again with mm2 as the carried row and mm0 as the new last row */
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
        "movq (%2), %%mm3 \n\t"
        PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
        "movq (%2, %3), %%mm3 \n\t"
        PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq %%mm2, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"

        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

// this routine is 'slightly' suboptimal but mostly unused
/*
 * Diagonal half-pel avg, 8 pixels wide: computes the same 4-neighbour value
 * as the put variant,
 *   (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + mm6) >> 2,
 * then averages it with the existing destination row.
 * mm6 holds the rounding constant here, so the byte mask PAVGB takes as its
 * fourth argument is rebuilt in mm2 for every row (pcmpeqd sets all bits,
 * paddb doubles each 0xFF byte to 0xFE).
 * h + 1 source rows are read. Processes 2 rows per loop iteration.
 */
static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_ZERO(mm7);
    SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        /* row 0: widen to words and form p[x] + p[x+1] in mm4 (low) / mm5 (high) */
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm4 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        /* offset = 0, source pointer moves to row 1 */
        "xor %%"REG_a", %%"REG_a" \n\t"
        "add %3, %1 \n\t"
        ASMALIGN(3)
        "1: \n\t"
        /* next row: pair sums into mm0 (low) / mm1 (high) */
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddusw %%mm2, %%mm0 \n\t"
        "paddusw %%mm3, %%mm1 \n\t"
        /* previous-row sums + rounding + this-row sums, then divide by 4 */
        "paddusw %%mm6, %%mm4 \n\t"
        "paddusw %%mm6, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        "psrlw $2, %%mm4 \n\t"
        "psrlw $2, %%mm5 \n\t"
        /* fetch the destination row and blend */
        "movq (%2, %%"REG_a"), %%mm3 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        /* mm2 = 0xFE in every byte */
        "pcmpeqd %%mm2, %%mm2 \n\t"
        "paddb %%mm2, %%mm2 \n\t"
        PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
        "movq %%mm5, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"

        /* second row of the pair: mm0/mm1 now hold the previous-row sums,
         * the new row's sums go to mm4/mm5 ready for the next iteration */
        "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
        "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm2, %%mm4 \n\t"
        "paddusw %%mm3, %%mm5 \n\t"
        "paddusw %%mm6, %%mm0 \n\t"
        "paddusw %%mm6, %%mm1 \n\t"
        "paddusw %%mm4, %%mm0 \n\t"
        "paddusw %%mm5, %%mm1 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm1 \n\t"
        "movq (%2, %%"REG_a"), %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        /* mm2 = 0xFE in every byte */
        "pcmpeqd %%mm2, %%mm2 \n\t"
        "paddb %%mm2, %%mm2 \n\t"
        PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
        "movq %%mm1, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"

        "subl $2, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :REG_a, "memory");
}

//FIXME optimize
/* 16-wide vertical half-pel put: two 8-wide passes, the second at byte offset 8. */
static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(put, pixels8_y2)(block , pixels , line_size, h);
    DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
}

/* 16-wide diagonal half-pel put: two 8-wide passes, the second at byte offset 8. */
static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(put, pixels8_xy2)(block , pixels , line_size, h);
    DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
}

/* 16-wide vertical half-pel avg: two 8-wide passes, the second at byte offset 8. */
static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg, pixels8_y2)(block , pixels , line_size, h);
    DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
}

/* 16-wide diagonal half-pel avg: two 8-wide passes, the second at byte offset 8. */
static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
    DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
}