1/* 2 * software RGB to RGB converter 3 * pluralize by software PAL8 to RGB converter 4 * software YUV to YUV converter 5 * software YUV to RGB converter 6 * Written by Nick Kurshev. 7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) 8 * lot of big-endian byte order fixes by Alex Beregszaszi 9 * 10 * This file is part of FFmpeg. 11 * 12 * FFmpeg is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU Lesser General Public 14 * License as published by the Free Software Foundation; either 15 * version 2.1 of the License, or (at your option) any later version. 16 * 17 * FFmpeg is distributed in the hope that it will be useful, 18 * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 * Lesser General Public License for more details. 21 * 22 * You should have received a copy of the GNU Lesser General Public 23 * License along with FFmpeg; if not, write to the Free Software 24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 25 */ 26 27#include <stddef.h> 28#include <stdint.h> 29 30#include "libavutil/attributes.h" 31#include "libavutil/x86/asm.h" 32 33#undef PREFETCH 34#undef MOVNTQ 35#undef EMMS 36#undef SFENCE 37#undef PAVGB 38 39#if COMPILE_TEMPLATE_AMD3DNOW 40#define PREFETCH "prefetch" 41#define PAVGB "pavgusb" 42#elif COMPILE_TEMPLATE_MMXEXT 43#define PREFETCH "prefetchnta" 44#define PAVGB "pavgb" 45#else 46#define PREFETCH " # nop" 47#endif 48 49#if COMPILE_TEMPLATE_AMD3DNOW 50/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. 
 */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#if COMPILE_TEMPLATE_MMXEXT
/* Non-temporal store + store fence when MMXEXT is available … */
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
/* … otherwise fall back to a plain store and a no-op fence. */
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif

#if !COMPILE_TEMPLATE_SSE2

#if !COMPILE_TEMPLATE_AMD3DNOW

/*
 * Expand packed 24-bit pixels to 32-bit ones; the fourth byte of every
 * output pixel is forced on via mask32a (OR with mm7), i.e. opaque alpha.
 * The MMX loop consumes 24 src bytes / 32 dst bytes per iteration; the
 * scalar tail handles the remaining 0..7 pixels.
 */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;  /* need 24 src bytes per MMX iteration */
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");  /* alpha mask */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "punpckldq 3(%1), %%mm0 \n\t"
            "movd 6(%1), %%mm1 \n\t"
            "punpckldq 9(%1), %%mm1 \n\t"
            "movd 12(%1), %%mm2 \n\t"
            "punpckldq 15(%1), %%mm2 \n\t"
            "movd 18(%1), %%mm3 \n\t"
            "punpckldq 21(%1), %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm1, 8(%0) \n\t"
            MOVNTQ" %%mm2, 16(%0) \n\t"
            MOVNTQ" %%mm3, 24(%0)"
            :: "r"(dest), "r"(s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;  /* opaque alpha byte */
    }
}

/*
 * Repack four quadwords of 32-bit pixels (in mm0/mm2, mm1/mm3, mm4/mm6,
 * mm5/mm7 pairs, as set up by the caller) into three quadwords of packed
 * 24-bit pixels and store them at (%0). Requires mask24l/mask24h to be
 * reachable via MANGLE().
 */
#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm1, 8(%0) \n\t" \
    MOVNTQ" %%mm4, 16(%0)"


/*
 * Drop the fourth byte of every 32-bit pixel, producing packed 24-bit
 * output (32 src bytes -> 24 dst bytes per MMX iteration via
 * STORE_BGR24_MMX); scalar tail copies 3 bytes and skips the 4th.
 */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;  /* need 32 src bytes per MMX iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 16(%1), %%mm4 \n\t"
            "movq 24(%1), %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            STORE_BGR24_MMX
            :: "r"(dest), "r"(s)
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;  /* discard the 4th (alpha/filler) byte */
    }
}

/*
 original by Strepto/Astral
 ported to gcc & bugfixed: A'rpi
 MMXEXT, 3DNOW optimization by Nick Kurshev
 32-bit C version, and and&add trick by Michael Niedermayer
*/
/*
 * RGB15 -> RGB16: widen the 5-bit green field to 6 bits by shifting
 * R/G up one position. Implemented as (x & mask) + x: adding x to the
 * r/g bits (mask15s) shifts them left while blue stays in place.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));  /* mm4 = r/g field mask */
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    /* 32-bit C version of the same and&add trick, two pixels at a time */
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if (s < end) {  /* at most one trailing 16-bit pixel */
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}

/*
 * RGB16 -> RGB15: shift the red/green fields down one bit (dropping the
 * low green bit) while keeping the 5-bit blue field in place.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));  /* r/g fields after >>1 */
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));   /* blue field */
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    while (s < mm_end) {  /* 32-bit C fallback, two pixels at a time */
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}

/*
 * 32-bit pixels -> RGB565. The green field is extracted with mask3216g
 * and positioned via pmaddwd with mul3216; red+blue come through
 * mask3216br. Single asm loop, 4 pixels per iteration.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {  /* scalar tail: 8-bit fields cut down to 5/6/5 */
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}

/*
 * 32-bit pixels -> RGB565 with R/B swapped relative to rgb32to16:
 * the per-component shift/mask path positions byte 0 in the top field.
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}

/*
 * 32-bit pixels -> RGB555; same pmaddwd technique as rgb32to16 but with
 * 5-bit green (mask3215g / mul3215, shifts 6 and 10).
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}

/* 32-bit pixels -> RGB555 with R/B swapped (cf. rgb32tobgr16, 5-bit green). */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}

/*
 * Packed 24-bit -> RGB565 with the first byte landing in the low field
 * (see the scalar tail: byte order b, g, r). 12 src bytes / 4 output
 * pixels per MMX iteration.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

/*
 * Packed 24-bit -> RGB565 with the first byte landing in the high field
 * (scalar tail reads r, g, b). Mirror of rgb24tobgr16.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

/* Packed 24-bit -> RGB555, first byte in the low field (tail reads b, g, r). */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

/* Packed 24-bit -> RGB555, first byte in the high field (tail reads r, g, b). */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

/*
 * RGB555 -> packed 24-bit. Each 5-bit field is scaled to 8 bits with
 * pmulhw (mul15_mid/mul15_hi) and the three planes are interleaved.
 * NOTE: the first asm block leaves results in mm0..mm7 which the second
 * asm block consumes ("borrowed 32 to 24") — the two asm statements form
 * one unit and must not be separated or reordered.
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;  /* 8 input pixels per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
             NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        /* replicate the top bits of each 5-bit field into the low bits */
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
    }
}

/*
 * RGB565 -> packed 24-bit. Same structure as rgb15tobgr24 but with the
 * 6-bit green field (mask16g / mul16_mid, and psrlq $1 on the red field).
 * The two asm blocks share MMX register state; do not separate them.
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
             NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        /* replicate top bits of the 5/6/5 fields into the low bits */
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
    }
}

/*
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm3, 8(%0) \n\t" \

/*
 * RGB555 -> 32-bit pixels with the 4th byte set to 255 (mm6 all-ones via
 * pcmpeqd). Fields are scaled 5->8 bits with pmulhw and packed by
 * PACK_RGB32; 4 input pixels per iteration.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");    /* zeros for PACK_RGB32 */
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); /* all-ones = alpha */
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "pmulhw %5, %%mm0 \n\t"
            "pmulhw %5, %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r),"m"(mul15_mid)
             NAMED_CONSTRAINTS_ADD(mul15_hi)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
        *d++ = 255;  /* opaque alpha */
    }
}

/* RGB565 -> 32-bit pixels; like rgb15to32 but with 6-bit green (mul16_mid). */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pmulhw %5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
             NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
        *d++ = 255;  /* opaque alpha */
    }
}

1037static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size) 1038{ 1039 x86_reg idx = 15 - src_size; 1040 const uint8_t *s = src-idx; 1041 uint8_t *d = dst-idx; 1042 __asm__ volatile( 1043 "test %0, %0 \n\t" 1044 "jns 2f \n\t" 1045 PREFETCH" (%1, %0) \n\t" 1046 "movq %3, %%mm7 \n\t" 1047 "pxor %4, %%mm7 \n\t" 1048 "movq %%mm7, %%mm6 \n\t" 1049 "pxor %5, %%mm7 \n\t" 1050 ".p2align 4 \n\t" 1051 "1: \n\t" 1052 PREFETCH" 32(%1, %0) \n\t" 1053 "movq (%1, %0), %%mm0 \n\t" 1054 "movq 8(%1, %0), %%mm1 \n\t" 1055# if COMPILE_TEMPLATE_MMXEXT 1056 "pshufw $177, %%mm0, %%mm3 \n\t" 1057 "pshufw $177, %%mm1, %%mm5 \n\t" 1058 "pand %%mm7, %%mm0 \n\t" 1059 "pand %%mm6, %%mm3 \n\t" 1060 "pand %%mm7, %%mm1 \n\t" 1061 "pand %%mm6, %%mm5 \n\t" 1062 "por %%mm3, %%mm0 \n\t" 1063 "por %%mm5, %%mm1 \n\t" 1064# else 1065 "movq %%mm0, %%mm2 \n\t" 1066 "movq %%mm1, %%mm4 \n\t" 1067 "pand %%mm7, %%mm0 \n\t" 1068 "pand %%mm6, %%mm2 \n\t" 1069 "pand %%mm7, %%mm1 \n\t" 1070 "pand %%mm6, %%mm4 \n\t" 1071 "movq %%mm2, %%mm3 \n\t" 1072 "movq %%mm4, %%mm5 \n\t" 1073 "pslld $16, %%mm2 \n\t" 1074 "psrld $16, %%mm3 \n\t" 1075 "pslld $16, %%mm4 \n\t" 1076 "psrld $16, %%mm5 \n\t" 1077 "por %%mm2, %%mm0 \n\t" 1078 "por %%mm4, %%mm1 \n\t" 1079 "por %%mm3, %%mm0 \n\t" 1080 "por %%mm5, %%mm1 \n\t" 1081# endif 1082 MOVNTQ" %%mm0, (%2, %0) \n\t" 1083 MOVNTQ" %%mm1, 8(%2, %0) \n\t" 1084 "add $16, %0 \n\t" 1085 "js 1b \n\t" 1086 SFENCE" \n\t" 1087 EMMS" \n\t" 1088 "2: \n\t" 1089 : "+&r"(idx) 1090 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one) 1091 : "memory"); 1092 for (; idx<15; idx+=4) { 1093 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00; 1094 v &= 0xff00ff; 1095 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16); 1096 } 1097} 1098 1099static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size) 1100{ 1101 unsigned i; 1102 x86_reg mmx_size= 23 - src_size; 1103 __asm__ volatile ( 1104 "test %%"REG_a", %%"REG_a" \n\t" 
1105 "jns 2f \n\t" 1106 "movq "MANGLE(mask24r)", %%mm5 \n\t" 1107 "movq "MANGLE(mask24g)", %%mm6 \n\t" 1108 "movq "MANGLE(mask24b)", %%mm7 \n\t" 1109 ".p2align 4 \n\t" 1110 "1: \n\t" 1111 PREFETCH" 32(%1, %%"REG_a") \n\t" 1112 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG 1113 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG 1114 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B 1115 "psllq $16, %%mm0 \n\t" // 00 BGR BGR 1116 "pand %%mm5, %%mm0 \n\t" 1117 "pand %%mm6, %%mm1 \n\t" 1118 "pand %%mm7, %%mm2 \n\t" 1119 "por %%mm0, %%mm1 \n\t" 1120 "por %%mm2, %%mm1 \n\t" 1121 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG 1122 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG 1123 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B 1124 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR 1125 "pand %%mm7, %%mm0 \n\t" 1126 "pand %%mm5, %%mm1 \n\t" 1127 "pand %%mm6, %%mm2 \n\t" 1128 "por %%mm0, %%mm1 \n\t" 1129 "por %%mm2, %%mm1 \n\t" 1130 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B 1131 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R 1132 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR 1133 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG 1134 "pand %%mm6, %%mm0 \n\t" 1135 "pand %%mm7, %%mm1 \n\t" 1136 "pand %%mm5, %%mm2 \n\t" 1137 "por %%mm0, %%mm1 \n\t" 1138 "por %%mm2, %%mm1 \n\t" 1139 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t" 1140 "add $24, %%"REG_a" \n\t" 1141 " js 1b \n\t" 1142 "2: \n\t" 1143 : "+a" (mmx_size) 1144 : "r" (src-mmx_size), "r"(dst-mmx_size) 1145 NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b) 1146 ); 1147 1148 __asm__ volatile(SFENCE:::"memory"); 1149 __asm__ volatile(EMMS:::"memory"); 1150 1151 if (mmx_size==23) return; //finished, was multiple of 8 1152 1153 src+= src_size; 1154 dst+= src_size; 1155 src_size= 23-mmx_size; 1156 src-= src_size; 1157 dst-= src_size; 1158 for (i=0; i<src_size; i+=3) { 1159 register uint8_t x; 1160 x = src[i + 2]; 1161 dst[i + 1] = src[i + 1]; 1162 dst[i + 2] = src[i + 0]; 1163 dst[i + 0] = x; 1164 } 
}

/**
 * Interleave one luma plane with horizontally subsampled U/V planes into
 * packed YUY2 (byte order Y0 U0 Y1 V0 ...).
 *
 * vertLumPerChroma is the number of luma lines sharing one chroma line
 * (2 for 4:2:0 input, 1 for 4:2:2 input); it must be a power of two because
 * the "advance chroma" test below masks y with vertLumPerChroma-1.
 *
 * NOTE(review): the MMX loop consumes 8 chroma samples (16 luma pixels) per
 * iteration and there is no scalar tail, so width presumably must be a
 * multiple of 16 -- confirm against callers.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"           \n\t"
            ".p2align 4                         \n\t"
            "1:                                 \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2)      \n\t"
            PREFETCH" 32(%2, %%"REG_a")         \n\t"
            PREFETCH" 32(%3, %%"REG_a")         \n\t"
            "movq (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
            "movq %%mm0, %%mm2                  \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0             \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2             \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3      \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5     \n\t" // Y(8)
            "movq %%mm3, %%mm4                  \n\t" // Y(0)
            "movq %%mm5, %%mm6                  \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3             \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4             \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5             \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6             \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"REG_a", 4)   \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)  \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a"                  \n\t"
            "cmp %4, %%"REG_a"                  \n\t"
            " jb 1b                             \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        // Advance the chroma pointers only every vertLumPerChroma-th luma line
        // (power-of-two mask trick).
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    __asm__(EMMS"       \n\t"
            SFENCE"     \n\t"
            :::"memory");
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    // 4:2:0 input: one chroma line per 2 luma lines.
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Interleave one luma plane with horizontally subsampled U/V planes into
 * packed UYVY (byte order U0 Y0 V0 Y1 ...).  Same structure as
 * yuvPlanartoyuy2 above, with luma and chroma swapped in the interleave.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"           \n\t"
            ".p2align 4                         \n\t"
            "1:                                 \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2)      \n\t"
            PREFETCH" 32(%2, %%"REG_a")         \n\t"
            PREFETCH" 32(%3, %%"REG_a")         \n\t"
            "movq (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
            "movq %%mm0, %%mm2                  \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0             \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2             \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3      \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5     \n\t" // Y(8)
            "movq %%mm0, %%mm4                  \n\t" // UVUV UVUV(0)
            "movq %%mm2, %%mm6                  \n\t" // UVUV UVUV(8)
            "punpcklbw %%mm3, %%mm0             \n\t" // UYVY UYVY(0)
            "punpckhbw %%mm3, %%mm4             \n\t" // UYVY UYVY(4)
            "punpcklbw %%mm5, %%mm2             \n\t" // UYVY UYVY(8)
            "punpckhbw %%mm5, %%mm6             \n\t" // UYVY UYVY(12)

            MOVNTQ" %%mm0, (%0, %%"REG_a", 4)   \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)  \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a"                  \n\t"
            "cmp %4, %%"REG_a"                  \n\t"
            " jb 1b                             \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        // Advance the chroma pointers only every vertLumPerChroma-th luma line.
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    __asm__(EMMS"       \n\t"
            SFENCE"     \n\t"
            :::"memory");
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    // 4:2:0 input: one chroma line per 2 luma lines.
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    // 4:2:2 input: one chroma line per luma line.
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    // 4:2:2 input: one chroma line per luma line.
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride)
{
    // Packed YUY2 -> planar 4:2:0.  Lines are handled in pairs: the first
    // (even) line yields luma + the pair's chroma, the second (odd) line
    // yields luma only, i.e. chroma is taken from every second input line.
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            "pcmpeqw %%mm7, %%mm7                   \n\t"
            "psrlw $8, %%mm7                        \n\t" // FF,00,FF,00...
            ".p2align 4                             \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0         \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1        \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2                      \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3                      \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0                        \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1                        \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2                      \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3                      \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0                  \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2                  \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2)       \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3                      \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4                      \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1                        \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2                        \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3                      \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4                      \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1                  \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3                  \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)      \n\t"

            "movq %%mm0, %%mm2                      \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3                      \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0                        \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1                        \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2                      \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3                      \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0                  \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2                  \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a")          \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a")          \n\t"

            "add $8, %%"REG_a"                      \n\t"
            "cmp %4, %%"REG_a"                      \n\t"
            " jb 1b                                 \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        // NOTE(review): this second loop reuses the FF,00 mask left in %%mm7
        // by the asm statement above; nothing between the two statements may
        // clobber MMX state.
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            ".p2align 4                             \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0         \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1        \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0                      \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1                      \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2                      \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3                      \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0                  \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2                  \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2)       \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)      \n\t"

            "add $8, %%"REG_a"                      \n\t"
            "cmp %4, %%"REG_a"                      \n\t"
            " jb 1b                                 \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
/**
 * Upscale one plane by 2x in both directions.  Interior output pixels are
 * weighted 3:1 / 1:3 blends of the four nearest source pixels (the repeated
 * PAVGB pairs compute (3a+b)>>2 via two rounds of averaging); edges are
 * replicated or blended 3:1 vertically.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    int x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++) {
        // MMX part covers the first srcWidth&~15 columns; the scalar loop
        // below finishes the row.
        // NOTE(review): the asm runs even when mmxSize is 0 (srcWidth < 16)
        // and then processes one 8-pixel group unconditionally -- confirm
        // that callers never pass such small widths.
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
            "mov %4, %%"REG_a"                  \n\t"
            "movq "MANGLE(mmx_ff)", %%mm0       \n\t"
            // Prime mm4/mm5 with the row's leading pixel duplicated into the
            // "previous pixel" byte so the first blend has a left neighbour.
            "movq (%0, %%"REG_a"), %%mm4        \n\t"
            "movq %%mm4, %%mm2                  \n\t"
            "psllq $8, %%mm4                    \n\t"
            "pand %%mm0, %%mm2                  \n\t"
            "por %%mm2, %%mm4                   \n\t"
            "movq (%1, %%"REG_a"), %%mm5        \n\t"
            "movq %%mm5, %%mm3                  \n\t"
            "psllq $8, %%mm5                    \n\t"
            "pand %%mm0, %%mm3                  \n\t"
            "por %%mm3, %%mm5                   \n\t"
            "1:                                 \n\t"
            "movq (%0, %%"REG_a"), %%mm0        \n\t"
            "movq (%1, %%"REG_a"), %%mm1        \n\t"
            "movq 1(%0, %%"REG_a"), %%mm2       \n\t"
            "movq 1(%1, %%"REG_a"), %%mm3       \n\t"
            PAVGB" %%mm0, %%mm5                 \n\t"
            PAVGB" %%mm0, %%mm3                 \n\t"
            PAVGB" %%mm0, %%mm5                 \n\t"
            PAVGB" %%mm0, %%mm3                 \n\t"
            PAVGB" %%mm1, %%mm4                 \n\t"
            PAVGB" %%mm1, %%mm2                 \n\t"
            PAVGB" %%mm1, %%mm4                 \n\t"
            PAVGB" %%mm1, %%mm2                 \n\t"
            "movq %%mm5, %%mm7                  \n\t"
            "movq %%mm4, %%mm6                  \n\t"
            "punpcklbw %%mm3, %%mm5             \n\t"
            "punpckhbw %%mm3, %%mm7             \n\t"
            "punpcklbw %%mm2, %%mm4             \n\t"
            "punpckhbw %%mm2, %%mm6             \n\t"
            MOVNTQ" %%mm5, (%2, %%"REG_a", 2)   \n\t"
            MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
            MOVNTQ" %%mm4, (%3, %%"REG_a", 2)   \n\t"
            MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
            "add $8, %%"REG_a"                  \n\t"
            "movq -1(%0, %%"REG_a"), %%mm4      \n\t"
            "movq -1(%1, %%"REG_a"), %%mm5      \n\t"
            " js 1b                             \n\t"
            :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
               NAMED_CONSTRAINTS_ADD(mmx_ff)
            : "%"REG_a
        );

        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
}
#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */

#if !COMPILE_TEMPLATE_AMD3DNOW
/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride)
{
    // Packed UYVY -> planar 4:2:0.  Same structure as yuy2toyv12 above, with
    // the pand/psrlw roles swapped because luma sits in the high bytes here.
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            "pcmpeqw %%mm7, %%mm7                   \n\t"
            "psrlw $8, %%mm7                        \n\t" // FF,00,FF,00...
            ".p2align 4                             \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0         \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1        \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2                      \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3                      \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0                      \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1                      \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2                        \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3                        \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0                  \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2                  \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2)       \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1       \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2       \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3                      \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4                      \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1                      \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2                      \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3                        \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4                        \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1                  \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3                  \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)      \n\t"

            "movq %%mm0, %%mm2                      \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3                      \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0                        \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1                        \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2                      \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3                      \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0                  \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2                  \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a")          \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a")          \n\t"

            "add $8, %%"REG_a"                      \n\t"
            "cmp %4, %%"REG_a"                      \n\t"
            " jb 1b                                 \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        // Odd line: extract luma only (chroma of this line is ignored).
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            ".p2align 4                             \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0         \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1        \n\t" // UYVY UYVY(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2       \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3       \n\t" // UYVY UYVY(12)
            "psrlw $8, %%mm0                        \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1                        \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2                        \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3                        \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0                  \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2                  \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2)       \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)      \n\t"

            "add $8, %%"REG_a"                      \n\t"
            "cmp %4, %%"REG_a"                      \n\t"
            " jb 1b                                 \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

/**
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line,
 * others are ignored in the C version.
 * FIXME: Write HQ version.
 */
#if HAVE_7REGS
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                       int width, int height,
                                       int lumStride, int chromStride, int srcStride,
                                       int32_t *rgb2yuv)
{
// Byte offsets of the Y/U/V coefficient rows inside the rgb2yuv table,
// spliced as strings into the asm below.
// NOTE(review): the coefficient constants are named ff_bgr2* -- the input is
// presumably BGR byte order despite the function name; confirm with callers.
#define BGR2Y_IDX "16*4+16*32"
#define BGR2U_IDX "16*4+16*33"
#define BGR2V_IDX "16*4+16*34"
    int y;
    const x86_reg chromWidth= width>>1;
    // The asm handles pairs of lines up to height-2; the remaining lines are
    // finished by the C fallback at the bottom (src/ydst/udst/vdst have been
    // advanced past the processed area by then).
    for (y=0; y<height-2; y+=2) {
        int i;
        // Luma: two consecutive lines, 8 output pixels per iteration.
        for (i=0; i<2; i++) {
            __asm__ volatile(
                "mov %2, %%"REG_a"                  \n\t"
                "movq "BGR2Y_IDX"(%3), %%mm6        \n\t"
                "movq "MANGLE(ff_w1111)", %%mm5     \n\t"
                "pxor %%mm7, %%mm7                  \n\t"
                // REG_d = 3*REG_a: byte offset into the 3-bytes-per-pixel source.
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
                ".p2align 4                         \n\t"
                "1:                                 \n\t"
                PREFETCH" 64(%0, %%"REG_d")         \n\t"
                "movd (%0, %%"REG_d"), %%mm0        \n\t"
                "movd 3(%0, %%"REG_d"), %%mm1       \n\t"
                "punpcklbw %%mm7, %%mm0             \n\t"
                "punpcklbw %%mm7, %%mm1             \n\t"
                "movd 6(%0, %%"REG_d"), %%mm2       \n\t"
                "movd 9(%0, %%"REG_d"), %%mm3       \n\t"
                "punpcklbw %%mm7, %%mm2             \n\t"
                "punpcklbw %%mm7, %%mm3             \n\t"
                "pmaddwd %%mm6, %%mm0               \n\t"
                "pmaddwd %%mm6, %%mm1               \n\t"
                "pmaddwd %%mm6, %%mm2               \n\t"
                "pmaddwd %%mm6, %%mm3               \n\t"
                "psrad $8, %%mm0                    \n\t"
                "psrad $8, %%mm1                    \n\t"
                "psrad $8, %%mm2                    \n\t"
                "psrad $8, %%mm3                    \n\t"
                "packssdw %%mm1, %%mm0              \n\t"
                "packssdw %%mm3, %%mm2              \n\t"
                "pmaddwd %%mm5, %%mm0               \n\t"
                "pmaddwd %%mm5, %%mm2               \n\t"
                "packssdw %%mm2, %%mm0              \n\t"
                "psraw $7, %%mm0                    \n\t"

                "movd 12(%0, %%"REG_d"), %%mm4      \n\t"
                "movd 15(%0, %%"REG_d"), %%mm1      \n\t"
                "punpcklbw %%mm7, %%mm4             \n\t"
                "punpcklbw %%mm7, %%mm1             \n\t"
                "movd 18(%0, %%"REG_d"), %%mm2      \n\t"
                "movd 21(%0, %%"REG_d"), %%mm3      \n\t"
                "punpcklbw %%mm7, %%mm2             \n\t"
                "punpcklbw %%mm7, %%mm3             \n\t"
                "pmaddwd %%mm6, %%mm4               \n\t"
                "pmaddwd %%mm6, %%mm1               \n\t"
                "pmaddwd %%mm6, %%mm2               \n\t"
                "pmaddwd %%mm6, %%mm3               \n\t"
                "psrad $8, %%mm4                    \n\t"
                "psrad $8, %%mm1                    \n\t"
                "psrad $8, %%mm2                    \n\t"
                "psrad $8, %%mm3                    \n\t"
                "packssdw %%mm1, %%mm4              \n\t"
                "packssdw %%mm3, %%mm2              \n\t"
                "pmaddwd %%mm5, %%mm4               \n\t"
                "pmaddwd %%mm5, %%mm2               \n\t"
                "add $24, %%"REG_d"                 \n\t"
                "packssdw %%mm2, %%mm4              \n\t"
                "psraw $7, %%mm4                    \n\t"

                "packuswb %%mm4, %%mm0              \n\t"
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"

                MOVNTQ" %%mm0, (%1, %%"REG_a")      \n\t"
                "add $8, %%"REG_a"                  \n\t"
                " js 1b                             \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
                NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset)
                : "%"REG_a, "%"REG_d
            );
            ydst += lumStride;
            src  += srcStride;
        }
        // Chroma: average 2x2 source pixels, then apply the U/V coefficients.
        src -= srcStride*2;
        __asm__ volatile(
            "mov %4, %%"REG_a"                  \n\t"
            "movq "MANGLE(ff_w1111)", %%mm5     \n\t"
            "movq "BGR2U_IDX"(%5), %%mm6        \n\t"
            "pxor %%mm7, %%mm7                  \n\t"
            // REG_d = 6*REG_a: byte offset for 2 source pixels per chroma sample.
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
            "add %%"REG_d", %%"REG_d"           \n\t"
            ".p2align 4                         \n\t"
            "1:                                 \n\t"
            PREFETCH" 64(%0, %%"REG_d")         \n\t"
            PREFETCH" 64(%1, %%"REG_d")         \n\t"
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
            // Fast path: pavgb-based 2x2 average.
            "movq (%0, %%"REG_d"), %%mm0        \n\t"
            "movq (%1, %%"REG_d"), %%mm1        \n\t"
            "movq 6(%0, %%"REG_d"), %%mm2       \n\t"
            "movq 6(%1, %%"REG_d"), %%mm3       \n\t"
            PAVGB" %%mm1, %%mm0                 \n\t"
            PAVGB" %%mm3, %%mm2                 \n\t"
            "movq %%mm0, %%mm1                  \n\t"
            "movq %%mm2, %%mm3                  \n\t"
            "psrlq $24, %%mm0                   \n\t"
            "psrlq $24, %%mm2                   \n\t"
            PAVGB" %%mm1, %%mm0                 \n\t"
            PAVGB" %%mm3, %%mm2                 \n\t"
            "punpcklbw %%mm7, %%mm0             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
#else
            // Plain MMX path: widen to words and add the four pixels.
            "movd (%0, %%"REG_d"), %%mm0        \n\t"
            "movd (%1, %%"REG_d"), %%mm1        \n\t"
            "movd 3(%0, %%"REG_d"), %%mm2       \n\t"
            "movd 3(%1, %%"REG_d"), %%mm3       \n\t"
            "punpcklbw %%mm7, %%mm0             \n\t"
            "punpcklbw %%mm7, %%mm1             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm3             \n\t"
            "paddw %%mm1, %%mm0                 \n\t"
            "paddw %%mm3, %%mm2                 \n\t"
            "paddw %%mm2, %%mm0                 \n\t"
            "movd 6(%0, %%"REG_d"), %%mm4       \n\t"
            "movd 6(%1, %%"REG_d"), %%mm1       \n\t"
            "movd 9(%0, %%"REG_d"), %%mm2       \n\t"
            "movd 9(%1, %%"REG_d"), %%mm3       \n\t"
            "punpcklbw %%mm7, %%mm4             \n\t"
            "punpcklbw %%mm7, %%mm1             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm3             \n\t"
            "paddw %%mm1, %%mm4                 \n\t"
            "paddw %%mm3, %%mm2                 \n\t"
            "paddw %%mm4, %%mm2                 \n\t"
            "psrlw $2, %%mm0                    \n\t"
            "psrlw $2, %%mm2                    \n\t"
#endif
            "movq "BGR2V_IDX"(%5), %%mm1        \n\t"
            "movq "BGR2V_IDX"(%5), %%mm3        \n\t"

            "pmaddwd %%mm0, %%mm1               \n\t"
            "pmaddwd %%mm2, %%mm3               \n\t"
            "pmaddwd %%mm6, %%mm0               \n\t"
            "pmaddwd %%mm6, %%mm2               \n\t"
            "psrad $8, %%mm0                    \n\t"
            "psrad $8, %%mm1                    \n\t"
            "psrad $8, %%mm2                    \n\t"
            "psrad $8, %%mm3                    \n\t"
            "packssdw %%mm2, %%mm0              \n\t"
            "packssdw %%mm3, %%mm1              \n\t"
            "pmaddwd %%mm5, %%mm0               \n\t"
            "pmaddwd %%mm5, %%mm1               \n\t"
            "packssdw %%mm1, %%mm0              \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0                    \n\t"

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
            "movq 12(%0, %%"REG_d"), %%mm4      \n\t"
            "movq 12(%1, %%"REG_d"), %%mm1      \n\t"
            "movq 18(%0, %%"REG_d"), %%mm2      \n\t"
            "movq 18(%1, %%"REG_d"), %%mm3      \n\t"
            PAVGB" %%mm1, %%mm4                 \n\t"
            PAVGB" %%mm3, %%mm2                 \n\t"
            "movq %%mm4, %%mm1                  \n\t"
            "movq %%mm2, %%mm3                  \n\t"
            "psrlq $24, %%mm4                   \n\t"
            "psrlq $24, %%mm2                   \n\t"
            PAVGB" %%mm1, %%mm4                 \n\t"
            PAVGB" %%mm3, %%mm2                 \n\t"
            "punpcklbw %%mm7, %%mm4             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
#else
            "movd 12(%0, %%"REG_d"), %%mm4      \n\t"
            "movd 12(%1, %%"REG_d"), %%mm1      \n\t"
            "movd 15(%0, %%"REG_d"), %%mm2      \n\t"
            "movd 15(%1, %%"REG_d"), %%mm3      \n\t"
            "punpcklbw %%mm7, %%mm4             \n\t"
            "punpcklbw %%mm7, %%mm1             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm3             \n\t"
            "paddw %%mm1, %%mm4                 \n\t"
            "paddw %%mm3, %%mm2                 \n\t"
            "paddw %%mm2, %%mm4                 \n\t"
            "movd 18(%0, %%"REG_d"), %%mm5      \n\t"
            "movd 18(%1, %%"REG_d"), %%mm1      \n\t"
            "movd 21(%0, %%"REG_d"), %%mm2      \n\t"
            "movd 21(%1, %%"REG_d"), %%mm3      \n\t"
            "punpcklbw %%mm7, %%mm5             \n\t"
            "punpcklbw %%mm7, %%mm1             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm3             \n\t"
            "paddw %%mm1, %%mm5                 \n\t"
            "paddw %%mm3, %%mm2                 \n\t"
            "paddw %%mm5, %%mm2                 \n\t"
            // mm5 was used as scratch above; reload the 1,1,1,1 word constant.
            "movq "MANGLE(ff_w1111)", %%mm5     \n\t"
            "psrlw $2, %%mm4                    \n\t"
            "psrlw $2, %%mm2                    \n\t"
#endif
            "movq "BGR2V_IDX"(%5), %%mm1        \n\t"
            "movq "BGR2V_IDX"(%5), %%mm3        \n\t"

            "pmaddwd %%mm4, %%mm1               \n\t"
            "pmaddwd %%mm2, %%mm3               \n\t"
            "pmaddwd %%mm6, %%mm4               \n\t"
            "pmaddwd %%mm6, %%mm2               \n\t"
            "psrad $8, %%mm4                    \n\t"
            "psrad $8, %%mm1                    \n\t"
            "psrad $8, %%mm2                    \n\t"
            "psrad $8, %%mm3                    \n\t"
            "packssdw %%mm2, %%mm4              \n\t"
            "packssdw %%mm3, %%mm1              \n\t"
            "pmaddwd %%mm5, %%mm4               \n\t"
            "pmaddwd %%mm5, %%mm1               \n\t"
            "add $24, %%"REG_d"                 \n\t"
            "packssdw %%mm1, %%mm4              \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4                    \n\t"

            "movq %%mm0, %%mm1                  \n\t"
            "punpckldq %%mm4, %%mm0             \n\t"
            "punpckhdq %%mm4, %%mm1             \n\t"
            "packsswb %%mm1, %%mm0              \n\t"
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
            "movd %%mm0, (%2, %%"REG_a")        \n\t"
            "punpckhdq %%mm0, %%mm0             \n\t"
            "movd %%mm0, (%3, %%"REG_a")        \n\t"
            "add $4, %%"REG_a"                  \n\t"
            " js 1b                             \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
            NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset)
            : "%"REG_a, "%"REG_d
        );

        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }

    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");

    // Let the C implementation finish the remaining height-y lines.
    ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
}
#endif /* HAVE_7REGS */
1868#endif /* !COMPILE_TEMPLATE_SSE2 */ 1869 1870#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX 1871static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, 1872 int width, int height, int src1Stride, 1873 int src2Stride, int dstStride) 1874{ 1875 int h; 1876 1877 for (h=0; h < height; h++) { 1878 int w; 1879 1880#if COMPILE_TEMPLATE_SSE2 1881 __asm__( 1882 "xor %%"REG_a", %%"REG_a" \n\t" 1883 "1: \n\t" 1884 PREFETCH" 64(%1, %%"REG_a") \n\t" 1885 PREFETCH" 64(%2, %%"REG_a") \n\t" 1886 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" 1887 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" 1888 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" 1889 "punpcklbw %%xmm2, %%xmm0 \n\t" 1890 "punpckhbw %%xmm2, %%xmm1 \n\t" 1891 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t" 1892 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t" 1893 "add $16, %%"REG_a" \n\t" 1894 "cmp %3, %%"REG_a" \n\t" 1895 " jb 1b \n\t" 1896 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) 1897 : "memory", "%"REG_a"" 1898 ); 1899#else 1900 __asm__( 1901 "xor %%"REG_a", %%"REG_a" \n\t" 1902 "1: \n\t" 1903 PREFETCH" 64(%1, %%"REG_a") \n\t" 1904 PREFETCH" 64(%2, %%"REG_a") \n\t" 1905 "movq (%1, %%"REG_a"), %%mm0 \n\t" 1906 "movq 8(%1, %%"REG_a"), %%mm2 \n\t" 1907 "movq %%mm0, %%mm1 \n\t" 1908 "movq %%mm2, %%mm3 \n\t" 1909 "movq (%2, %%"REG_a"), %%mm4 \n\t" 1910 "movq 8(%2, %%"REG_a"), %%mm5 \n\t" 1911 "punpcklbw %%mm4, %%mm0 \n\t" 1912 "punpckhbw %%mm4, %%mm1 \n\t" 1913 "punpcklbw %%mm5, %%mm2 \n\t" 1914 "punpckhbw %%mm5, %%mm3 \n\t" 1915 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t" 1916 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t" 1917 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t" 1918 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t" 1919 "add $16, %%"REG_a" \n\t" 1920 "cmp %3, %%"REG_a" \n\t" 1921 " jb 1b \n\t" 1922 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) 1923 : "memory", "%"REG_a 1924 ); 1925#endif 1926 for (w= (width&(~15)); w < width; w++) { 1927 dest[2*w+0] = src1[w]; 1928 dest[2*w+1] = 
src2[w]; 1929 } 1930 dest += dstStride; 1931 src1 += src1Stride; 1932 src2 += src2Stride; 1933 } 1934 __asm__( 1935 EMMS" \n\t" 1936 SFENCE" \n\t" 1937 ::: "memory" 1938 ); 1939} 1940#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */ 1941 1942#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL 1943#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM 1944void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV, 1945 const uint8_t *unused, 1946 const uint8_t *src1, 1947 const uint8_t *src2, 1948 int w, 1949 uint32_t *unused2); 1950static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, 1951 int width, int height, int srcStride, 1952 int dst1Stride, int dst2Stride) 1953{ 1954 int h; 1955 1956 for (h = 0; h < height; h++) { 1957 RENAME(ff_nv12ToUV)(dst1, dst2, NULL, src, NULL, width, NULL); 1958 src += srcStride; 1959 dst1 += dst1Stride; 1960 dst2 += dst2Stride; 1961 } 1962 __asm__( 1963 EMMS" \n\t" 1964 SFENCE" \n\t" 1965 ::: "memory" 1966 ); 1967} 1968#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ 1969#endif /* !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL */ 1970 1971#if !COMPILE_TEMPLATE_SSE2 1972#if !COMPILE_TEMPLATE_AMD3DNOW 1973static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, 1974 uint8_t *dst1, uint8_t *dst2, 1975 int width, int height, 1976 int srcStride1, int srcStride2, 1977 int dstStride1, int dstStride2) 1978{ 1979 x86_reg x, y; 1980 int w,h; 1981 w=width/2; h=height/2; 1982 __asm__ volatile( 1983 PREFETCH" %0 \n\t" 1984 PREFETCH" %1 \n\t" 1985 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); 1986 for (y=0;y<h;y++) { 1987 const uint8_t* s1=src1+srcStride1*(y>>1); 1988 uint8_t* d=dst1+dstStride1*y; 1989 x=0; 1990 for (;x<w-31;x+=32) { 1991 __asm__ volatile( 1992 PREFETCH" 32(%1,%2) \n\t" 1993 "movq (%1,%2), %%mm0 \n\t" 1994 "movq 8(%1,%2), %%mm2 \n\t" 1995 "movq 16(%1,%2), %%mm4 \n\t" 1996 "movq 
24(%1,%2), %%mm6 \n\t" 1997 "movq %%mm0, %%mm1 \n\t" 1998 "movq %%mm2, %%mm3 \n\t" 1999 "movq %%mm4, %%mm5 \n\t" 2000 "movq %%mm6, %%mm7 \n\t" 2001 "punpcklbw %%mm0, %%mm0 \n\t" 2002 "punpckhbw %%mm1, %%mm1 \n\t" 2003 "punpcklbw %%mm2, %%mm2 \n\t" 2004 "punpckhbw %%mm3, %%mm3 \n\t" 2005 "punpcklbw %%mm4, %%mm4 \n\t" 2006 "punpckhbw %%mm5, %%mm5 \n\t" 2007 "punpcklbw %%mm6, %%mm6 \n\t" 2008 "punpckhbw %%mm7, %%mm7 \n\t" 2009 MOVNTQ" %%mm0, (%0,%2,2) \n\t" 2010 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" 2011 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" 2012 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" 2013 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" 2014 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" 2015 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" 2016 MOVNTQ" %%mm7, 56(%0,%2,2)" 2017 :: "r"(d), "r"(s1), "r"(x) 2018 :"memory"); 2019 } 2020 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; 2021 } 2022 for (y=0;y<h;y++) { 2023 const uint8_t* s2=src2+srcStride2*(y>>1); 2024 uint8_t* d=dst2+dstStride2*y; 2025 x=0; 2026 for (;x<w-31;x+=32) { 2027 __asm__ volatile( 2028 PREFETCH" 32(%1,%2) \n\t" 2029 "movq (%1,%2), %%mm0 \n\t" 2030 "movq 8(%1,%2), %%mm2 \n\t" 2031 "movq 16(%1,%2), %%mm4 \n\t" 2032 "movq 24(%1,%2), %%mm6 \n\t" 2033 "movq %%mm0, %%mm1 \n\t" 2034 "movq %%mm2, %%mm3 \n\t" 2035 "movq %%mm4, %%mm5 \n\t" 2036 "movq %%mm6, %%mm7 \n\t" 2037 "punpcklbw %%mm0, %%mm0 \n\t" 2038 "punpckhbw %%mm1, %%mm1 \n\t" 2039 "punpcklbw %%mm2, %%mm2 \n\t" 2040 "punpckhbw %%mm3, %%mm3 \n\t" 2041 "punpcklbw %%mm4, %%mm4 \n\t" 2042 "punpckhbw %%mm5, %%mm5 \n\t" 2043 "punpcklbw %%mm6, %%mm6 \n\t" 2044 "punpckhbw %%mm7, %%mm7 \n\t" 2045 MOVNTQ" %%mm0, (%0,%2,2) \n\t" 2046 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" 2047 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" 2048 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" 2049 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" 2050 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" 2051 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" 2052 MOVNTQ" %%mm7, 56(%0,%2,2)" 2053 :: "r"(d), "r"(s2), "r"(x) 2054 :"memory"); 2055 } 2056 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; 2057 } 2058 __asm__( 2059 EMMS" \n\t" 2060 
        SFENCE"     \n\t"
        ::: "memory"
        );
}

/*
 * Interleave a planar 4:1:0 picture (one chroma sample per 4x4 luma block)
 * into packed YUY2 (Y U Y V ...).  Each chroma row is reused for four luma
 * rows (y>>2) and each chroma sample is repeated across its output group.
 * NOTE(review): despite the name, src2 feeds the U positions (d[8x+1],
 * d[8x+5]) and src3 the V positions -- the caller is expected to pass the
 * planes accordingly; confirm against call sites.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        int width, int height,
                                        int srcStride1, int srcStride2,
                                        int srcStride3, int dstStride)
{
    x86_reg x;
    int y,w,h;
    /* NOTE(review): each x step consumes 4 luma samples and emits 8 output
     * bytes, so with w = width/2 a row covers 2*width luma samples --
     * "width" is presumably in 2-pixel units; confirm with callers. */
    w=width/2; h=height;
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2); /* chroma row advances once per 4 luma rows */
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
        /* SIMD main loop: 8 chroma samples -> 64 output bytes per iteration */
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH" 32(%1, %0) \n\t"
                PREFETCH" 32(%2, %0) \n\t"
                PREFETCH" 32(%3, %0) \n\t"
                "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq %%mm1, %%mm6 \n\t"
                "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"

                "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq 8(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"

                "movq %%mm4, %%mm6 \n\t"
                "movq 16(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm5, %%mm4 \n\t"
                "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"

                "punpckhbw %%mm5, %%mm6 \n\t"
                "movq 24(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
        /* scalar tail: one U/V pair and four luma samples per x */
        for (; x<w; x++) {
            const int x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
    /* leave MMX state / flush nontemporal stores (both no-ops unless the
     * template defines real EMMS/SFENCE, see the macros at the top) */
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
        );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

/*
 * dst[i] = src[2*i] for i in [0, count): copy every byte at an even offset,
 * e.g. the luma bytes of a packed YUYV line.
 * Both pointers are pre-biased so the (negated) loop counter runs up toward
 * zero.  %%mm7 is built as the 0x00FF word mask that keeps the low (even)
 * byte of each 16-bit pair.  The asm emits 16 output bytes per iteration;
 * the +15/-15 bias leaves 0..15 trailing bytes for the scalar loop.
 */
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst += count;
    src += 2*count;
    count= - count;

    if(count <= -16) {
        count += 15;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t" /* mm7 = all ones */
            "psrlw $8, %%mm7 \n\t"      /* mm7 = 0x00FF word mask */
            "1: \n\t"
            "movq -30(%1, %0, 2), %%mm0 \n\t"
            "movq -22(%1, %0, 2), %%mm1 \n\t"
            "movq -14(%1, %0, 2), %%mm2 \n\t"
            "movq -6(%1, %0, 2), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0,-15(%2, %0) \n\t"
            MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
            "add $16, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}

/*
 * dst[i] = src[2*i+1] for i in [0, count): copy every byte at an odd offset,
 * e.g. the luma bytes of a packed UYVY line.
 */
static void RENAME(extract_odd)(const uint8_t *src, uint8_t
*dst, x86_reg count)
{
    /* step past the first byte so "even" offsets now address the odd bytes */
    src ++;
    dst += count;
    src += 2*count;
    count= - count;

    /* NOTE(review): strict '<' here (extract_even uses '<='): with the +16
     * bias the qword loads below would touch one byte past the end of the
     * advanced src when count is an exact multiple of 16, so that case is
     * left entirely to the scalar loop. */
    if(count < -16) {
        count += 16;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t" /* mm7 = all ones */
            "psrlw $8, %%mm7 \n\t"      /* mm7 = 0x00FF word mask */
            "1: \n\t"
            "movq -32(%1, %0, 2), %%mm0 \n\t"
            "movq -24(%1, %0, 2), %%mm1 \n\t"
            "movq -16(%1, %0, 2), %%mm2 \n\t"
            "movq -8(%1, %0, 2), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0,-16(%2, %0) \n\t"
            MOVNTQ" %%mm2,- 8(%2, %0) \n\t"
            "add $16, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 16;
    }
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}

#if !COMPILE_TEMPLATE_AMD3DNOW
/*
 * Deinterleave two byte planes out of 4-byte groups:
 *     dst0[i] = src[4*i + 0], dst1[i] = src[4*i + 2]
 * e.g. the U and V planes of a packed UYVY line.  8 output bytes per plane
 * and iteration; the +7/-7 bias leaves 0..7 bytes for the scalar tail.
 */
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" /* 0x00FF word mask */
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t" /* keep bytes 0 and 2 of each group */
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t" /* select byte 2 -> dst1 plane */
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t" /* select byte 0 -> dst0 plane */
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

/*
 * Like extract_even2, but averages the two source rows first:
 *     dst0[i] = avg(src0[4*i+0], src1[4*i+0])
 *     dst1[i] = avg(src0[4*i+2], src1[4*i+2])
 * Used for vertical chroma downsampling (4:2:2 -> 4:2:0).
 * NOTE(review): PAVGB/PAVGUSB round up ((a+b+1)>>1) while the scalar tail
 * truncates ((a+b)>>1), so the last up-to-7 samples can differ by one LSB
 * from the SIMD part.
 */
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB /* no SIMD path without a byte-average instruction */
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" /* 0x00FF word mask */
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t" /* average with second row */
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t" /* byte 2 -> dst1 plane */
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t" /* byte 0 -> dst0 plane */
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}

#if !COMPILE_TEMPLATE_AMD3DNOW
/*
 * Deinterleave the odd bytes of 4-byte groups:
 *     dst0[i] = src[4*i + 1], dst1[i] = src[4*i + 3]
 * e.g. the U and V planes of a packed YUYV line.  The asm selects the odd
 * bytes with psrlw $8; src is advanced only for the scalar tail.
 */
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" /* 0x00FF word mask */
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t" /* shift bytes 1/3 of each group down */
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t" /* select byte 3 -> dst1 plane */
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t" /* select byte 1 -> dst0 plane */
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    src++; /* scalar tail addresses the odd bytes via src+1 */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

/*
 * Like extract_odd2, but averages the two source rows first:
 *     dst0[i] = avg(src0[4*i+1], src1[4*i+1])
 *     dst1[i] = avg(src0[4*i+3], src1[4*i+3])
 * NOTE(review): as in extract_even2avg, the SIMD path rounds up while the
 * scalar tail truncates (possible 1 LSB difference in the tail).
 */
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB /* no SIMD path without a byte-average instruction */
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" /* 0x00FF word mask */
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t" /* average with second row */
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t" /* shift bytes 1/3 of each group down */
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t" /* byte 3 -> dst1 plane */
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t" /* byte 1 -> dst0 plane */
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++; /* scalar tail addresses the odd bytes via src+1 */
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}

/*
 * Packed YUYV -> planar YUV 4:2:0.  Luma is extracted on every line; on odd
 * lines the chroma of the current and the previous source line is averaged
 * (vertical 2:1 chroma downsampling).
 */
static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth = FF_CEIL_RSHIFT(width, 1); /* ceil(width/2) chroma samples */

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src, ydst, width); /* Y bytes sit at even offsets in YUYV */
        if(y&1) {
            RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
    /* leave MMX state / flush nontemporal stores */
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
        );
}

#if !COMPILE_TEMPLATE_AMD3DNOW
/*
 * Packed YUYV -> planar YUV 4:2:2: luma and chroma extracted on every line,
 * no vertical averaging.
 */
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth = FF_CEIL_RSHIFT(width, 1);

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src, ydst, width);
        RENAME(extract_odd2)(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
    /* leave MMX state / flush nontemporal stores */
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
        );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

/*
 * Packed UYVY -> planar YUV 4:2:0.  Same structure as yuyvtoyuv420, but in
 * UYVY the luma sits at odd byte offsets and the chroma at even ones.
 */
static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t
*src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth = FF_CEIL_RSHIFT(width, 1); /* ceil(width/2) chroma samples */

    for (y=0; y<height; y++) {
        RENAME(extract_odd)(src, ydst, width); /* Y bytes sit at odd offsets in UYVY */
        if(y&1) {
            /* average the chroma of this and the previous line -> 4:2:0 */
            RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
    /* leave MMX state / flush nontemporal stores */
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
        );
}

#if !COMPILE_TEMPLATE_AMD3DNOW
/*
 * Packed UYVY -> planar YUV 4:2:2: luma and chroma extracted on every line,
 * no vertical averaging.
 */
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth = FF_CEIL_RSHIFT(width, 1);

    for (y=0; y<height; y++) {
        RENAME(extract_odd)(src, ydst, width);
        RENAME(extract_even2)(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
    /* leave MMX state / flush nontemporal stores */
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
        );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
#endif /* !COMPILE_TEMPLATE_SSE2 */

/*
 * Install this template's implementations into the global conversion
 * function pointers.  Which pointers are wired up depends on the template
 * flags (3DNow!, MMXEXT, SSE2, AVX) and the build-time feature tests below.
 */
static av_cold void RENAME(rgb2rgb_init)(void)
{
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
    rgb15to16          = RENAME(rgb15to16);
    rgb15tobgr24       = RENAME(rgb15tobgr24);
    rgb15to32          = RENAME(rgb15to32);
    rgb16tobgr24       = RENAME(rgb16tobgr24);
    rgb16to32          = RENAME(rgb16to32);
    rgb16to15          = RENAME(rgb16to15);
    rgb24tobgr16       = RENAME(rgb24tobgr16);
    rgb24tobgr15       = RENAME(rgb24tobgr15);
    rgb24tobgr32       = RENAME(rgb24tobgr32);
    rgb32to16          = RENAME(rgb32to16);
    rgb32to15          = RENAME(rgb32to15);
    rgb32tobgr24       = RENAME(rgb32tobgr24);
    rgb24to15          = RENAME(rgb24to15);
    rgb24to16          = RENAME(rgb24to16);
    rgb24tobgr24       = RENAME(rgb24tobgr24);
    shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
    rgb32tobgr16       = RENAME(rgb32tobgr16);
    rgb32tobgr15       = RENAME(rgb32tobgr15);
    yv12toyuy2         = RENAME(yv12toyuy2);
    yv12touyvy         = RENAME(yv12touyvy);
    yuv422ptoyuy2      = RENAME(yuv422ptoyuy2);
    yuv422ptouyvy      = RENAME(yuv422ptouyvy);
    yuy2toyv12         = RENAME(yuy2toyv12);
    vu9_to_vu12        = RENAME(vu9_to_vu12);
    yvu9_to_yuy2       = RENAME(yvu9_to_yuy2);
    uyvytoyuv422       = RENAME(uyvytoyuv422);
    yuyvtoyuv422       = RENAME(yuyvtoyuv422);
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
    planar2x           = RENAME(planar2x);
#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
#if HAVE_7REGS
    ff_rgb24toyv12     = RENAME(rgb24toyv12);
#endif /* HAVE_7REGS */

    yuyvtoyuv420       = RENAME(yuyvtoyuv420);
    uyvytoyuv420       = RENAME(uyvytoyuv420);
#endif /* !COMPILE_TEMPLATE_SSE2 */

#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
    interleaveBytes    = RENAME(interleaveBytes);
#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
    deinterleaveBytes  = RENAME(deinterleaveBytes);
#endif
#endif
}