1/* 2 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> 3 * 4 * This file is part of FFmpeg. 5 * 6 * FFmpeg is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with FFmpeg; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21#include <stdint.h> 22 23#include "libavutil/x86/asm.h" 24#include "libswscale/swscale_internal.h" 25 26#undef REAL_MOVNTQ 27#undef MOVNTQ 28#undef MOVNTQ2 29#undef PREFETCH 30 31#if COMPILE_TEMPLATE_MMXEXT 32#define PREFETCH "prefetchnta" 33#else 34#define PREFETCH " # nop" 35#endif 36 37#if COMPILE_TEMPLATE_MMXEXT 38#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" 39#define MOVNTQ2 "movntq " 40#else 41#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" 42#define MOVNTQ2 "movq " 43#endif 44#define MOVNTQ(a,b) REAL_MOVNTQ(a,b) 45 46#if !COMPILE_TEMPLATE_MMXEXT 47static av_always_inline void 48dither_8to16(const uint8_t *srcDither, int rot) 49{ 50 if (rot) { 51 __asm__ volatile("pxor %%mm0, %%mm0\n\t" 52 "movq (%0), %%mm3\n\t" 53 "movq %%mm3, %%mm4\n\t" 54 "psrlq $24, %%mm3\n\t" 55 "psllq $40, %%mm4\n\t" 56 "por %%mm4, %%mm3\n\t" 57 "movq %%mm3, %%mm4\n\t" 58 "punpcklbw %%mm0, %%mm3\n\t" 59 "punpckhbw %%mm0, %%mm4\n\t" 60 :: "r"(srcDither) 61 ); 62 } else { 63 __asm__ volatile("pxor %%mm0, %%mm0\n\t" 64 "movq (%0), %%mm3\n\t" 65 "movq %%mm3, %%mm4\n\t" 66 "punpcklbw %%mm0, %%mm3\n\t" 67 "punpckhbw %%mm0, %%mm4\n\t" 68 :: "r"(srcDither) 69 ); 70 } 71} 72#endif 73 74static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize, 75 const int16_t **src, uint8_t *dest, int dstW, 76 const uint8_t *dither, int offset) 77{ 78 dither_8to16(dither, offset); 79 filterSize--; 80 __asm__ volatile( 81 "movd %0, %%mm1\n\t" 82 "punpcklwd %%mm1, %%mm1\n\t" 83 "punpckldq %%mm1, %%mm1\n\t" 84 "psllw $3, %%mm1\n\t" 85 "paddw %%mm1, %%mm3\n\t" 86 "paddw %%mm1, %%mm4\n\t" 87 "psraw $4, %%mm3\n\t" 88 "psraw $4, %%mm4\n\t" 89 ::"m"(filterSize) 90 ); 91 92 __asm__ volatile(\ 93 "movq %%mm3, %%mm6\n\t" 94 "movq %%mm4, %%mm7\n\t" 95 "movl %3, %%ecx\n\t" 96 "mov %0, %%"REG_d" \n\t"\ 97 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 98 ".p2align 4 \n\t" /* FIXME Unroll? */\ 99 "1: \n\t"\ 100 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ 101 "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\ 102 "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\ 103 "add $16, %%"REG_d" \n\t"\ 104 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 105 "test %%"REG_S", %%"REG_S" \n\t"\ 106 "pmulhw %%mm0, %%mm2 \n\t"\ 107 "pmulhw %%mm0, %%mm5 \n\t"\ 108 "paddw %%mm2, %%mm3 \n\t"\ 109 "paddw %%mm5, %%mm4 \n\t"\ 110 " jnz 1b \n\t"\ 111 "psraw $3, %%mm3 \n\t"\ 112 "psraw $3, %%mm4 \n\t"\ 113 "packuswb %%mm4, %%mm3 \n\t" 114 MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t" 115 "add $8, %%"REG_c" \n\t"\ 116 "cmp %2, %%"REG_c" \n\t"\ 117 "movq %%mm6, %%mm3\n\t" 118 "movq %%mm7, %%mm4\n\t" 119 "mov %0, %%"REG_d" \n\t"\ 120 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 121 "jb 1b \n\t"\ 122 :: "g" (filter), 123 "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset) 124 : "%"REG_d, "%"REG_S, "%"REG_c 125 ); 126} 127 128#define YSCALEYUV2PACKEDX_UV \ 129 __asm__ volatile(\ 130 "xor %%"REG_a", %%"REG_a" \n\t"\ 131 ".p2align 4 \n\t"\ 132 "nop \n\t"\ 133 "1: \n\t"\ 134 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ 135 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 136 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ 137 "movq %%mm3, %%mm4 \n\t"\ 138 ".p2align 4 \n\t"\ 139 "2: \n\t"\ 140 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ 141 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\ 142 "add %6, %%"REG_S" \n\t" \ 143 "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\ 144 "add $16, %%"REG_d" \n\t"\ 145 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 146 "pmulhw %%mm0, %%mm2 \n\t"\ 147 "pmulhw %%mm0, %%mm5 \n\t"\ 148 "paddw %%mm2, %%mm3 \n\t"\ 149 "paddw %%mm5, %%mm4 \n\t"\ 150 "test %%"REG_S", %%"REG_S" \n\t"\ 151 " jnz 2b \n\t"\ 152 153#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \ 154 "lea "offset"(%0), %%"REG_d" \n\t"\ 155 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 156 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\ 157 "movq "#dst1", "#dst2" \n\t"\ 158 ".p2align 4 \n\t"\ 159 "2: \n\t"\ 160 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\ 161 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\ 162 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\ 163 "add $16, %%"REG_d" \n\t"\ 164 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 165 "pmulhw "#coeff", "#src1" \n\t"\ 166 "pmulhw "#coeff", "#src2" \n\t"\ 167 "paddw "#src1", "#dst1" \n\t"\ 168 "paddw "#src2", "#dst2" \n\t"\ 169 "test %%"REG_S", %%"REG_S" \n\t"\ 170 " jnz 2b \n\t"\ 171 172#define YSCALEYUV2PACKEDX \ 173 YSCALEYUV2PACKEDX_UV \ 174 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \ 175 176#define YSCALEYUV2PACKEDX_END \ 177 :: "r" (&c->redDither), \ 178 "m" (dummy), "m" (dummy), "m" (dummy),\ 179 "r" (dest), "m" (dstW_reg), "m"(uv_off) \ 180 NAMED_CONSTRAINTS_ADD(bF8,bFC) \ 181 : "%"REG_a, "%"REG_d, "%"REG_S \ 182 ); 183 184#define YSCALEYUV2PACKEDX_ACCURATE_UV \ 185 __asm__ volatile(\ 186 "xor %%"REG_a", %%"REG_a" \n\t"\ 187 ".p2align 4 \n\t"\ 188 "nop \n\t"\ 189 "1: \n\t"\ 190 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ 191 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 192 "pxor %%mm4, %%mm4 \n\t"\ 193 "pxor %%mm5, %%mm5 \n\t"\ 194 "pxor %%mm6, %%mm6 \n\t"\ 195 "pxor %%mm7, %%mm7 \n\t"\ 196 ".p2align 4 \n\t"\ 197 "2: \n\t"\ 198 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ 199 "add %6, %%"REG_S" \n\t" \ 200 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ 201 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 202 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\ 203 "movq %%mm0, %%mm3 \n\t"\ 204 "punpcklwd %%mm1, %%mm0 \n\t"\ 205 "punpckhwd %%mm1, %%mm3 \n\t"\ 206 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\ 207 "pmaddwd %%mm1, %%mm0 \n\t"\ 208 "pmaddwd %%mm1, %%mm3 \n\t"\ 209 "paddd %%mm0, %%mm4 \n\t"\ 210 "paddd %%mm3, %%mm5 \n\t"\ 211 "add %6, %%"REG_S" \n\t" \ 212 "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\ 213 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 214 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ 215 "test %%"REG_S", %%"REG_S" \n\t"\ 216 "movq %%mm2, %%mm0 \n\t"\ 217 "punpcklwd %%mm3, %%mm2 \n\t"\ 218 "punpckhwd %%mm3, %%mm0 \n\t"\ 219 "pmaddwd %%mm1, %%mm2 \n\t"\ 220 "pmaddwd %%mm1, %%mm0 \n\t"\ 221 "paddd %%mm2, %%mm6 \n\t"\ 222 "paddd %%mm0, %%mm7 \n\t"\ 223 " jnz 2b \n\t"\ 224 "psrad $16, %%mm4 \n\t"\ 225 "psrad $16, %%mm5 \n\t"\ 226 "psrad $16, %%mm6 \n\t"\ 227 "psrad $16, %%mm7 \n\t"\ 228 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 229 "packssdw %%mm5, %%mm4 \n\t"\ 230 "packssdw %%mm7, %%mm6 \n\t"\ 231 "paddw %%mm0, %%mm4 \n\t"\ 232 "paddw %%mm0, %%mm6 \n\t"\ 233 "movq %%mm4, "U_TEMP"(%0) \n\t"\ 234 "movq %%mm6, "V_TEMP"(%0) \n\t"\ 235 236#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \ 237 "lea "offset"(%0), %%"REG_d" \n\t"\ 238 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 239 "pxor %%mm1, %%mm1 \n\t"\ 240 "pxor %%mm5, %%mm5 \n\t"\ 241 "pxor %%mm7, %%mm7 \n\t"\ 242 "pxor %%mm6, %%mm6 \n\t"\ 243 ".p2align 4 \n\t"\ 244 "2: \n\t"\ 245 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ 246 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ 247 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 248 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ 249 "movq %%mm0, %%mm3 \n\t"\ 250 "punpcklwd %%mm4, %%mm0 \n\t"\ 251 "punpckhwd %%mm4, %%mm3 \n\t"\ 252 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\ 253 "pmaddwd %%mm4, %%mm0 \n\t"\ 254 "pmaddwd %%mm4, %%mm3 \n\t"\ 255 "paddd %%mm0, %%mm1 \n\t"\ 256 "paddd %%mm3, %%mm5 \n\t"\ 257 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\ 258 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 259 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ 260 "test %%"REG_S", %%"REG_S" \n\t"\ 261 "movq %%mm2, %%mm0 \n\t"\ 262 "punpcklwd %%mm3, %%mm2 \n\t"\ 263 "punpckhwd %%mm3, %%mm0 \n\t"\ 264 "pmaddwd %%mm4, %%mm2 \n\t"\ 265 "pmaddwd %%mm4, %%mm0 \n\t"\ 266 "paddd %%mm2, %%mm7 \n\t"\ 267 "paddd %%mm0, %%mm6 \n\t"\ 268 " jnz 2b \n\t"\ 269 "psrad $16, %%mm1 \n\t"\ 270 "psrad $16, %%mm5 \n\t"\ 271 "psrad $16, %%mm7 \n\t"\ 272 "psrad $16, %%mm6 \n\t"\ 273 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 274 "packssdw %%mm5, %%mm1 \n\t"\ 275 "packssdw %%mm6, %%mm7 \n\t"\ 276 "paddw %%mm0, %%mm1 \n\t"\ 277 "paddw %%mm0, %%mm7 \n\t"\ 278 "movq "U_TEMP"(%0), %%mm3 \n\t"\ 279 "movq "V_TEMP"(%0), %%mm4 \n\t"\ 280 281#define YSCALEYUV2PACKEDX_ACCURATE \ 282 YSCALEYUV2PACKEDX_ACCURATE_UV \ 283 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET) 284 285#define YSCALEYUV2RGBX \ 286 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ 287 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ 288 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 289 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 290 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ 291 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ 292 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 293 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ 294 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ 295 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ 296 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ 297 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ 298 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ 299 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 300 "paddw %%mm3, %%mm4 \n\t"\ 301 "movq %%mm2, %%mm0 \n\t"\ 302 "movq %%mm5, %%mm6 \n\t"\ 303 "movq %%mm4, %%mm3 \n\t"\ 304 "punpcklwd %%mm2, %%mm2 \n\t"\ 305 "punpcklwd %%mm5, %%mm5 \n\t"\ 306 "punpcklwd %%mm4, %%mm4 \n\t"\ 307 "paddw %%mm1, %%mm2 \n\t"\ 308 "paddw %%mm1, %%mm5 \n\t"\ 309 "paddw %%mm1, %%mm4 \n\t"\ 310 "punpckhwd %%mm0, %%mm0 \n\t"\ 311 "punpckhwd %%mm6, %%mm6 \n\t"\ 312 "punpckhwd %%mm3, %%mm3 \n\t"\ 313 "paddw %%mm7, %%mm0 \n\t"\ 314 "paddw %%mm7, %%mm6 \n\t"\ 315 "paddw %%mm7, %%mm3 \n\t"\ 316 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 317 "packuswb %%mm0, %%mm2 \n\t"\ 318 "packuswb %%mm6, %%mm5 \n\t"\ 319 "packuswb %%mm3, %%mm4 \n\t"\ 320 321#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \ 322 "movq "#b", "#q2" \n\t" /* B */\ 323 "movq "#r", "#t" \n\t" /* R */\ 324 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\ 325 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\ 326 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\ 327 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\ 328 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\ 329 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\ 330 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\ 331 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\ 332 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\ 333 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\ 334\ 335 MOVNTQ( q0, (dst, index, 4))\ 336 MOVNTQ( b, 8(dst, index, 4))\ 337 MOVNTQ( q2, 16(dst, index, 4))\ 338 MOVNTQ( q3, 24(dst, index, 4))\ 339\ 340 "add $8, "#index" \n\t"\ 341 "cmp "dstw", "#index" \n\t"\ 342 " jb 1b \n\t" 343#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) 344 345static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, 346 const int16_t **lumSrc, int lumFilterSize, 347 const int16_t *chrFilter, const int16_t **chrUSrc, 348 const int16_t **chrVSrc, 349 int chrFilterSize, const int16_t **alpSrc, 350 uint8_t *dest, int dstW, int dstY) 351{ 352 x86_reg dummy=0; 353 x86_reg dstW_reg = dstW; 354 x86_reg uv_off = c->uv_offx2; 355 356 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 357 YSCALEYUV2PACKEDX_ACCURATE 358 YSCALEYUV2RGBX 359 "movq %%mm2, "U_TEMP"(%0) \n\t" 360 "movq %%mm4, "V_TEMP"(%0) \n\t" 361 "movq %%mm5, "Y_TEMP"(%0) \n\t" 362 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET) 363 "movq "Y_TEMP"(%0), %%mm5 \n\t" 364 "psraw $3, %%mm1 \n\t" 365 "psraw $3, %%mm7 \n\t" 366 "packuswb %%mm7, %%mm1 \n\t" 367 WRITEBGR32(%4, "%5", %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) 368 YSCALEYUV2PACKEDX_END 369 } else { 370 YSCALEYUV2PACKEDX_ACCURATE 371 YSCALEYUV2RGBX 372 "pcmpeqd %%mm7, %%mm7 \n\t" 373 WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 374 YSCALEYUV2PACKEDX_END 375 } 376} 377 378static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, 379 const int16_t **lumSrc, int lumFilterSize, 380 const int16_t *chrFilter, const int16_t **chrUSrc, 381 const int16_t **chrVSrc, 382 int chrFilterSize, const int16_t **alpSrc, 383 uint8_t *dest, int dstW, int dstY) 384{ 385 x86_reg dummy=0; 386 x86_reg dstW_reg = dstW; 387 x86_reg uv_off = c->uv_offx2; 388 389 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 390 YSCALEYUV2PACKEDX 391 YSCALEYUV2RGBX 392 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) 393 "psraw $3, %%mm1 \n\t" 394 "psraw $3, %%mm7 \n\t" 395 "packuswb %%mm7, %%mm1 \n\t" 396 WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 397 YSCALEYUV2PACKEDX_END 398 } else { 399 YSCALEYUV2PACKEDX 400 YSCALEYUV2RGBX 401 "pcmpeqd %%mm7, %%mm7 \n\t" 402 WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 403 YSCALEYUV2PACKEDX_END 404 } 405} 406 407#define REAL_WRITERGB16(dst, dstw, index) \ 408 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ 409 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\ 410 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ 411 "psrlq $3, %%mm2 \n\t"\ 412\ 413 "movq %%mm2, %%mm1 \n\t"\ 414 "movq %%mm4, %%mm3 \n\t"\ 415\ 416 "punpcklbw %%mm7, %%mm3 \n\t"\ 417 "punpcklbw %%mm5, %%mm2 \n\t"\ 418 "punpckhbw %%mm7, %%mm4 \n\t"\ 419 "punpckhbw %%mm5, %%mm1 \n\t"\ 420\ 421 "psllq $3, %%mm3 \n\t"\ 422 "psllq $3, %%mm4 \n\t"\ 423\ 424 "por %%mm3, %%mm2 \n\t"\ 425 "por %%mm4, %%mm1 \n\t"\ 426\ 427 MOVNTQ(%%mm2, (dst, index, 2))\ 428 MOVNTQ(%%mm1, 8(dst, index, 2))\ 429\ 430 "add $8, "#index" \n\t"\ 431 "cmp "dstw", "#index" \n\t"\ 432 " jb 1b \n\t" 433#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) 434 435static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, 436 const int16_t **lumSrc, int lumFilterSize, 437 const int16_t *chrFilter, const int16_t **chrUSrc, 438 const int16_t **chrVSrc, 439 int chrFilterSize, const int16_t **alpSrc, 440 uint8_t *dest, int dstW, int dstY) 441{ 442 x86_reg dummy=0; 443 x86_reg dstW_reg = dstW; 444 x86_reg uv_off = c->uv_offx2; 445 446 YSCALEYUV2PACKEDX_ACCURATE 447 YSCALEYUV2RGBX 448 "pxor %%mm7, %%mm7 \n\t" 449 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 450#ifdef DITHER1XBPP 451 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" 452 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" 453 "paddusb "RED_DITHER"(%0), %%mm5\n\t" 454#endif 455 WRITERGB16(%4, "%5", %%REGa) 456 YSCALEYUV2PACKEDX_END 457} 458 459static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, 460 const int16_t **lumSrc, int lumFilterSize, 461 const int16_t *chrFilter, const int16_t **chrUSrc, 462 const int16_t **chrVSrc, 463 int chrFilterSize, const int16_t **alpSrc, 464 uint8_t *dest, int dstW, int dstY) 465{ 466 x86_reg dummy=0; 467 x86_reg dstW_reg = dstW; 468 x86_reg uv_off = c->uv_offx2; 469 470 YSCALEYUV2PACKEDX 471 YSCALEYUV2RGBX 472 "pxor %%mm7, %%mm7 \n\t" 473 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 474#ifdef DITHER1XBPP 475 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" 476 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" 477 "paddusb "RED_DITHER"(%0), %%mm5 \n\t" 478#endif 479 WRITERGB16(%4, "%5", %%REGa) 480 YSCALEYUV2PACKEDX_END 481} 482 483#define REAL_WRITERGB15(dst, dstw, index) \ 484 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ 485 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\ 486 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ 487 "psrlq $3, %%mm2 \n\t"\ 488 "psrlq $1, %%mm5 \n\t"\ 489\ 490 "movq %%mm2, %%mm1 \n\t"\ 491 "movq %%mm4, %%mm3 \n\t"\ 492\ 493 "punpcklbw %%mm7, %%mm3 \n\t"\ 494 "punpcklbw %%mm5, %%mm2 \n\t"\ 495 "punpckhbw %%mm7, %%mm4 \n\t"\ 496 "punpckhbw %%mm5, %%mm1 \n\t"\ 497\ 498 "psllq $2, %%mm3 \n\t"\ 499 "psllq $2, %%mm4 \n\t"\ 500\ 501 "por %%mm3, %%mm2 \n\t"\ 502 "por %%mm4, %%mm1 \n\t"\ 503\ 504 MOVNTQ(%%mm2, (dst, index, 2))\ 505 MOVNTQ(%%mm1, 8(dst, index, 2))\ 506\ 507 "add $8, "#index" \n\t"\ 508 "cmp "dstw", "#index" \n\t"\ 509 " jb 1b \n\t" 510#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) 511 512static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, 513 const int16_t **lumSrc, int lumFilterSize, 514 const int16_t *chrFilter, const int16_t **chrUSrc, 515 const int16_t **chrVSrc, 516 int chrFilterSize, const int16_t **alpSrc, 517 uint8_t *dest, int dstW, int dstY) 518{ 519 x86_reg dummy=0; 520 x86_reg dstW_reg = dstW; 521 x86_reg uv_off = c->uv_offx2; 522 523 YSCALEYUV2PACKEDX_ACCURATE 524 YSCALEYUV2RGBX 525 "pxor %%mm7, %%mm7 \n\t" 526 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 527#ifdef DITHER1XBPP 528 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" 529 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" 530 "paddusb "RED_DITHER"(%0), %%mm5\n\t" 531#endif 532 WRITERGB15(%4, "%5", %%REGa) 533 YSCALEYUV2PACKEDX_END 534} 535 536static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, 537 const int16_t **lumSrc, int lumFilterSize, 538 const int16_t *chrFilter, const int16_t **chrUSrc, 539 const int16_t **chrVSrc, 540 int chrFilterSize, const int16_t **alpSrc, 541 uint8_t *dest, int dstW, int dstY) 542{ 543 x86_reg dummy=0; 544 x86_reg dstW_reg = dstW; 545 x86_reg uv_off = c->uv_offx2; 546 547 YSCALEYUV2PACKEDX 548 YSCALEYUV2RGBX 549 "pxor %%mm7, %%mm7 \n\t" 550 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 551#ifdef DITHER1XBPP 552 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" 553 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" 554 "paddusb "RED_DITHER"(%0), %%mm5 \n\t" 555#endif 556 WRITERGB15(%4, "%5", %%REGa) 557 YSCALEYUV2PACKEDX_END 558} 559 560#define WRITEBGR24MMX(dst, dstw, index) \ 561 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 562 "movq %%mm2, %%mm1 \n\t" /* B */\ 563 "movq %%mm5, %%mm6 \n\t" /* R */\ 564 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ 565 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ 566 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ 567 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ 568 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ 569 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ 570 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ 571 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 572 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ 573 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ 574\ 575 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ 576 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ 577 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ 578 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ 579\ 580 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ 581 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ 582 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ 583 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ 584\ 585 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ 586 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ 587 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ 588 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ 589\ 590 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ 591 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ 592 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ 593 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ 594 MOVNTQ(%%mm0, (dst))\ 595\ 596 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ 597 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ 598 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ 599 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ 600 MOVNTQ(%%mm6, 8(dst))\ 601\ 602 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ 603 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ 604 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ 605 MOVNTQ(%%mm5, 16(dst))\ 606\ 607 "add $24, "#dst" \n\t"\ 608\ 609 "add $8, "#index" \n\t"\ 610 "cmp "dstw", "#index" \n\t"\ 611 " jb 1b \n\t" 612 613#define WRITEBGR24MMXEXT(dst, dstw, index) \ 614 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 615 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\ 616 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\ 617 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ 618 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ 619 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ 620\ 621 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ 622 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ 623 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ 624\ 625 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ 626 "por %%mm1, %%mm6 \n\t"\ 627 "por %%mm3, %%mm6 \n\t"\ 628 MOVNTQ(%%mm6, (dst))\ 629\ 630 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ 631 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ 632 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ 633 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ 634\ 635 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\ 636 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ 637 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\ 638\ 639 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ 640 "por %%mm3, %%mm6 \n\t"\ 641 MOVNTQ(%%mm6, 8(dst))\ 642\ 643 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\ 644 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ 645 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\ 646\ 647 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ 648 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ 649 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\ 650\ 651 "por %%mm1, %%mm3 \n\t"\ 652 "por %%mm3, %%mm6 \n\t"\ 653 MOVNTQ(%%mm6, 16(dst))\ 654\ 655 "add $24, "#dst" \n\t"\ 656\ 657 "add $8, "#index" \n\t"\ 658 "cmp "dstw", "#index" \n\t"\ 659 " jb 1b \n\t" 660 661#if COMPILE_TEMPLATE_MMXEXT 662#undef WRITEBGR24 663#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index) 664#else 665#undef WRITEBGR24 666#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) 667#endif 668 669#if HAVE_6REGS 670static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, 671 const int16_t **lumSrc, int lumFilterSize, 672 const int16_t *chrFilter, const int16_t **chrUSrc, 673 const int16_t **chrVSrc, 674 int chrFilterSize, const int16_t **alpSrc, 675 uint8_t *dest, int dstW, int dstY) 676{ 677 x86_reg dummy=0; 678 x86_reg dstW_reg = dstW; 679 x86_reg uv_off = c->uv_offx2; 680 681 YSCALEYUV2PACKEDX_ACCURATE 682 YSCALEYUV2RGBX 683 "pxor %%mm7, %%mm7 \n\t" 684 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize 685 "add %4, %%"REG_c" \n\t" 686 WRITEBGR24(%%REGc, "%5", %%REGa) 687 :: "r" (&c->redDither), 688 "m" (dummy), "m" (dummy), "m" (dummy), 689 "r" (dest), "m" (dstW_reg), "m"(uv_off) 690 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) 691 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S 692 ); 693} 694 695static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, 696 const int16_t **lumSrc, int lumFilterSize, 697 const int16_t *chrFilter, const int16_t **chrUSrc, 698 const int16_t **chrVSrc, 699 int chrFilterSize, const int16_t **alpSrc, 700 uint8_t *dest, int dstW, int dstY) 701{ 702 x86_reg dummy=0; 703 x86_reg dstW_reg = dstW; 704 x86_reg uv_off = c->uv_offx2; 705 706 YSCALEYUV2PACKEDX 707 YSCALEYUV2RGBX 708 "pxor %%mm7, %%mm7 \n\t" 709 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize 710 "add %4, %%"REG_c" \n\t" 711 WRITEBGR24(%%REGc, "%5", %%REGa) 712 :: "r" (&c->redDither), 713 "m" (dummy), "m" (dummy), "m" (dummy), 714 "r" (dest), "m" (dstW_reg), "m"(uv_off) 715 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) 716 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S 717 ); 718} 719#endif /* HAVE_6REGS */ 720 721#define REAL_WRITEYUY2(dst, dstw, index) \ 722 "packuswb %%mm3, %%mm3 \n\t"\ 723 "packuswb %%mm4, %%mm4 \n\t"\ 724 "packuswb %%mm7, %%mm1 \n\t"\ 725 "punpcklbw %%mm4, %%mm3 \n\t"\ 726 "movq %%mm1, %%mm7 \n\t"\ 727 "punpcklbw %%mm3, %%mm1 \n\t"\ 728 "punpckhbw %%mm3, %%mm7 \n\t"\ 729\ 730 MOVNTQ(%%mm1, (dst, index, 2))\ 731 MOVNTQ(%%mm7, 8(dst, index, 2))\ 732\ 733 "add $8, "#index" \n\t"\ 734 "cmp "dstw", "#index" \n\t"\ 735 " jb 1b \n\t" 736#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) 737 738static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, 739 const int16_t **lumSrc, int lumFilterSize, 740 const int16_t *chrFilter, const int16_t **chrUSrc, 741 const int16_t **chrVSrc, 742 int chrFilterSize, const int16_t **alpSrc, 743 uint8_t *dest, int dstW, int dstY) 744{ 745 x86_reg dummy=0; 746 x86_reg dstW_reg = dstW; 747 x86_reg uv_off = c->uv_offx2; 748 749 YSCALEYUV2PACKEDX_ACCURATE 750 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 751 "psraw $3, %%mm3 \n\t" 752 "psraw $3, %%mm4 \n\t" 753 "psraw $3, %%mm1 \n\t" 754 "psraw $3, %%mm7 \n\t" 755 WRITEYUY2(%4, "%5", %%REGa) 756 YSCALEYUV2PACKEDX_END 757} 758 759static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, 760 const int16_t **lumSrc, int lumFilterSize, 761 const int16_t *chrFilter, const int16_t **chrUSrc, 762 const int16_t **chrVSrc, 763 int chrFilterSize, const int16_t **alpSrc, 764 uint8_t *dest, int dstW, int dstY) 765{ 766 x86_reg dummy=0; 767 x86_reg dstW_reg = dstW; 768 x86_reg uv_off = c->uv_offx2; 769 770 YSCALEYUV2PACKEDX 771 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 772 "psraw $3, %%mm3 \n\t" 773 "psraw $3, %%mm4 \n\t" 774 "psraw $3, %%mm1 \n\t" 775 "psraw $3, %%mm7 \n\t" 776 WRITEYUY2(%4, "%5", %%REGa) 777 YSCALEYUV2PACKEDX_END 778} 779 780#define REAL_YSCALEYUV2RGB_UV(index, c) \ 781 "xor "#index", "#index" \n\t"\ 782 ".p2align 4 \n\t"\ 783 "1: \n\t"\ 784 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 785 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 786 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 787 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 788 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 789 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 790 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ 791 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ 792 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 793 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ 794 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ 795 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 796 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 797 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ 798 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ 799 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 800 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 801 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 802 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 803 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 804 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 805 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 806 807#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \ 808 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ 809 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ 810 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ 811 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ 812 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ 813 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ 814 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 815 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 816 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 817 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 818 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 819 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 820 821#define REAL_YSCALEYUV2RGB_COEFF(c) \ 822 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 823 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 824 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 825 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 826 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 827 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 828 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 829 "paddw %%mm3, %%mm4 \n\t"\ 830 "movq %%mm2, %%mm0 \n\t"\ 831 "movq %%mm5, %%mm6 \n\t"\ 832 "movq %%mm4, %%mm3 \n\t"\ 833 "punpcklwd %%mm2, %%mm2 \n\t"\ 834 "punpcklwd %%mm5, %%mm5 \n\t"\ 835 "punpcklwd %%mm4, %%mm4 \n\t"\ 836 "paddw %%mm1, %%mm2 \n\t"\ 837 "paddw %%mm1, %%mm5 \n\t"\ 838 "paddw %%mm1, %%mm4 \n\t"\ 839 "punpckhwd %%mm0, %%mm0 \n\t"\ 840 "punpckhwd %%mm6, %%mm6 \n\t"\ 841 "punpckhwd %%mm3, %%mm3 \n\t"\ 842 "paddw %%mm7, %%mm0 \n\t"\ 843 "paddw %%mm7, %%mm6 \n\t"\ 844 "paddw %%mm7, %%mm3 \n\t"\ 845 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 846 "packuswb %%mm0, %%mm2 \n\t"\ 847 "packuswb %%mm6, %%mm5 \n\t"\ 848 "packuswb %%mm3, %%mm4 \n\t"\ 849 850#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) 851 852#define YSCALEYUV2RGB(index, c) \ 853 REAL_YSCALEYUV2RGB_UV(index, c) \ 854 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ 855 REAL_YSCALEYUV2RGB_COEFF(c) 856 857/** 858 * vertical bilinear scale YV12 to RGB 859 */ 860static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], 861 const int16_t *ubuf[2], const int16_t *vbuf[2], 862 const int16_t *abuf[2], uint8_t *dest, 863 int dstW, int yalpha, int uvalpha, int y) 864{ 865 const int16_t *buf0 = buf[0], *buf1 = buf[1], 866 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; 867 868 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 869 const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1]; 870#if ARCH_X86_64 871 __asm__ volatile( 872 YSCALEYUV2RGB(%%r8, %5) 873 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7) 874 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 875 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 876 "packuswb %%mm7, %%mm1 \n\t" 877 WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 878 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest), 879 "a" (&c->redDither), 880 "r" (abuf0), "r" (abuf1) 881 : "%r8" 882 ); 883#else 884 c->u_temp=(intptr_t)abuf0; 885 c->v_temp=(intptr_t)abuf1; 886 __asm__ volatile( 887 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 888 "mov %4, %%"REG_b" \n\t" 889 "push %%"REG_BP" \n\t" 890 YSCALEYUV2RGB(%%REGBP, %5) 891 "push %0 \n\t" 892 "push %1 \n\t" 893 "mov "U_TEMP"(%5), %0 \n\t" 894 "mov "V_TEMP"(%5), %1 \n\t" 895 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1) 896 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 897 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 898 "packuswb %%mm7, %%mm1 \n\t" 899 "pop %1 \n\t" 900 "pop %0 \n\t" 901 WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 902 "pop %%"REG_BP" \n\t" 903 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 904 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 905 "a" (&c->redDither) 906 ); 907#endif 908 } else { 909 __asm__ volatile( 910 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 911 "mov %4, %%"REG_b" \n\t" 912 "push %%"REG_BP" \n\t" 913 YSCALEYUV2RGB(%%REGBP, %5) 914 "pcmpeqd %%mm7, %%mm7 \n\t" 915 WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 916 "pop %%"REG_BP" \n\t" 917 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 918 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 919 "a" (&c->redDither) 920 ); 921 } 922} 923 924static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2], 925 const int16_t *ubuf[2], const int16_t *vbuf[2], 926 const int16_t *abuf[2], uint8_t *dest, 927 int dstW, int yalpha, int uvalpha, int y) 928{ 929 const int16_t *buf0 = buf[0], *buf1 = buf[1], 930 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; 931 932 __asm__ volatile( 933 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 934 "mov %4, %%"REG_b" \n\t" 935 "push %%"REG_BP" \n\t" 936 YSCALEYUV2RGB(%%REGBP, %5) 937 "pxor %%mm7, %%mm7 \n\t" 938 WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) 939 "pop %%"REG_BP" \n\t" 940 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 941 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 942 "a" (&c->redDither) 943 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) 944 ); 945} 946 947static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2], 948 const int16_t *ubuf[2], const int16_t *vbuf[2], 949 const int16_t *abuf[2], uint8_t *dest, 950 int dstW, int yalpha, int uvalpha, int y) 951{ 952 const int16_t *buf0 = buf[0], *buf1 = buf[1], 953 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; 954 955 __asm__ volatile( 956 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 957 "mov %4, %%"REG_b" \n\t" 958 "push %%"REG_BP" \n\t" 959 YSCALEYUV2RGB(%%REGBP, %5) 960 "pxor %%mm7, %%mm7 \n\t" 961 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 962#ifdef DITHER1XBPP 963 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 964 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 965 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 966#endif 967 WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) 968 "pop %%"REG_BP" \n\t" 969 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 970 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 971 "a" (&c->redDither) 972 NAMED_CONSTRAINTS_ADD(bF8) 973 ); 974} 975 976static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2], 977 const int16_t *ubuf[2], const int16_t *vbuf[2], 978 const int16_t *abuf[2], uint8_t *dest, 979 int dstW, int yalpha, int uvalpha, int y) 980{ 981 const int16_t *buf0 = buf[0], *buf1 = buf[1], 982 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; 983 984 __asm__ volatile( 985 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 986 "mov %4, %%"REG_b" \n\t" 987 "push %%"REG_BP" \n\t" 988 YSCALEYUV2RGB(%%REGBP, %5) 989 "pxor %%mm7, %%mm7 \n\t" 990 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 991#ifdef DITHER1XBPP 992 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 993 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 994 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 995#endif 996 WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) 997 "pop %%"REG_BP" \n\t" 998 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 999 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1000 "a" (&c->redDither) 1001 NAMED_CONSTRAINTS_ADD(bF8,bFC) 1002 ); 1003} 1004 1005#define REAL_YSCALEYUV2PACKED(index, c) \ 1006 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 1007 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\ 1008 "psraw $3, %%mm0 \n\t"\ 1009 "psraw $3, %%mm1 \n\t"\ 1010 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 1011 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 1012 "xor "#index", "#index" \n\t"\ 1013 ".p2align 4 \n\t"\ 1014 "1: \n\t"\ 1015 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 1016 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 1017 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1018 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 1019 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 1020 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1021 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ 1022 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ 1023 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 1024 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ 1025 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ 1026 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 1027 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 1028 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ 1029 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ 1030 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ 1031 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ 1032 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ 1033 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ 1034 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ 1035 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ 1036 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 1037 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 1038 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 1039 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 1040 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 1041 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 1042 1043#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) 1044 1045static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2], 1046 const int16_t *ubuf[2], const int16_t *vbuf[2], 1047 const int16_t *abuf[2], uint8_t *dest, 1048 int dstW, int yalpha, int uvalpha, int y) 1049{ 1050 const int16_t *buf0 = buf[0], *buf1 = buf[1], 1051 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; 1052 1053 __asm__ volatile( 1054 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1055 "mov %4, %%"REG_b" \n\t" 1056 "push %%"REG_BP" \n\t" 1057 YSCALEYUV2PACKED(%%REGBP, %5) 1058 WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) 1059 "pop %%"REG_BP" \n\t" 1060 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1061 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1062 "a" (&c->redDither) 1063 ); 1064} 1065 1066#define REAL_YSCALEYUV2RGB1(index, c) \ 1067 "xor "#index", "#index" \n\t"\ 1068 ".p2align 4 \n\t"\ 1069 "1: \n\t"\ 1070 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ 1071 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1072 "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ 1073 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1074 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 1075 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 1076 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 1077 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 1078 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 1079 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 1080 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 1081 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 1082 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 1083 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 1084 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 1085 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 1086 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 1087 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 1088 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 1089 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 1090 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 1091 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 1092 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 1093 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 1094 "paddw %%mm3, %%mm4 \n\t"\ 1095 "movq %%mm2, %%mm0 \n\t"\ 1096 "movq %%mm5, %%mm6 \n\t"\ 1097 "movq %%mm4, %%mm3 \n\t"\ 1098 "punpcklwd %%mm2, %%mm2 \n\t"\ 1099 "punpcklwd %%mm5, %%mm5 \n\t"\ 1100 "punpcklwd %%mm4, %%mm4 \n\t"\ 1101 "paddw %%mm1, %%mm2 \n\t"\ 1102 "paddw %%mm1, %%mm5 \n\t"\ 1103 "paddw %%mm1, %%mm4 \n\t"\ 1104 "punpckhwd %%mm0, %%mm0 \n\t"\ 1105 "punpckhwd %%mm6, %%mm6 \n\t"\ 1106 "punpckhwd %%mm3, %%mm3 \n\t"\ 1107 "paddw %%mm7, %%mm0 \n\t"\ 1108 "paddw %%mm7, %%mm6 \n\t"\ 1109 "paddw %%mm7, %%mm3 \n\t"\ 1110 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 1111 "packuswb %%mm0, %%mm2 \n\t"\ 1112 "packuswb %%mm6, %%mm5 \n\t"\ 1113 "packuswb %%mm3, %%mm4 \n\t"\ 1114 1115#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) 1116 1117// do vertical chrominance interpolation 1118#define REAL_YSCALEYUV2RGB1b(index, c) \ 1119 "xor "#index", "#index" \n\t"\ 1120 ".p2align 4 \n\t"\ 1121 "1: \n\t"\ 1122 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 1123 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 1124 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1125 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 1126 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 1127 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1128 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ 1129 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ 1130 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ 1131 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ 1132 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 1133 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 1134 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 1135 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 1136 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 1137 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 1138 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 1139 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 1140 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 1141 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 1142 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 1143 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 1144 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 1145 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 1146 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 1147 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 1148 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 1149 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 1150 "paddw %%mm3, %%mm4 \n\t"\ 1151 "movq %%mm2, %%mm0 \n\t"\ 1152 "movq %%mm5, %%mm6 \n\t"\ 1153 "movq %%mm4, %%mm3 \n\t"\ 1154 "punpcklwd %%mm2, %%mm2 \n\t"\ 1155 "punpcklwd %%mm5, %%mm5 \n\t"\ 1156 "punpcklwd %%mm4, %%mm4 \n\t"\ 1157 "paddw %%mm1, %%mm2 \n\t"\ 1158 "paddw %%mm1, %%mm5 \n\t"\ 1159 "paddw %%mm1, %%mm4 \n\t"\ 1160 "punpckhwd %%mm0, %%mm0 \n\t"\ 1161 "punpckhwd %%mm6, %%mm6 \n\t"\ 1162 "punpckhwd %%mm3, %%mm3 \n\t"\ 1163 "paddw %%mm7, %%mm0 \n\t"\ 1164 "paddw %%mm7, %%mm6 \n\t"\ 1165 "paddw %%mm7, %%mm3 \n\t"\ 1166 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 1167 "packuswb %%mm0, %%mm2 \n\t"\ 1168 "packuswb %%mm6, %%mm5 \n\t"\ 1169 "packuswb %%mm3, %%mm4 \n\t"\ 1170 1171#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) 1172 1173#define REAL_YSCALEYUV2RGB1_ALPHA(index) \ 1174 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\ 1175 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\ 1176 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\ 1177 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\ 1178 "packuswb %%mm1, %%mm7 \n\t" 1179#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) 1180 1181/** 1182 * YV12 to RGB without scaling or interpolating 1183 */ 1184static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, 1185 const int16_t *ubuf[2], const int16_t *vbuf[2], 1186 const int16_t *abuf0, uint8_t *dest, 1187 int dstW, int uvalpha, int y) 1188{ 1189 const int16_t *ubuf0 = ubuf[0]; 1190 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 1191 1192 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 1193 const int16_t *ubuf1 = ubuf[0]; 1194 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 1195 __asm__ volatile( 1196 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1197 "mov %4, %%"REG_b" \n\t" 1198 "push %%"REG_BP" \n\t" 1199 YSCALEYUV2RGB1(%%REGBP, %5) 1200 YSCALEYUV2RGB1_ALPHA(%%REGBP) 1201 WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1202 "pop %%"REG_BP" \n\t" 1203 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1204 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1205 "a" (&c->redDither) 1206 ); 1207 } else { 1208 __asm__ volatile( 1209 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1210 "mov %4, %%"REG_b" \n\t" 1211 "push %%"REG_BP" \n\t" 1212 YSCALEYUV2RGB1(%%REGBP, %5) 1213 "pcmpeqd %%mm7, %%mm7 \n\t" 1214 WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1215 "pop %%"REG_BP" \n\t" 1216 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1217 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1218 "a" (&c->redDither) 1219 ); 1220 } 1221 } else { 1222 const int16_t *ubuf1 = ubuf[1]; 1223 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 1224 __asm__ volatile( 1225 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1226 "mov %4, %%"REG_b" \n\t" 1227 "push %%"REG_BP" \n\t" 1228 YSCALEYUV2RGB1b(%%REGBP, %5) 1229 YSCALEYUV2RGB1_ALPHA(%%REGBP) 1230 WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1231 "pop %%"REG_BP" \n\t" 1232 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1233 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1234 "a" (&c->redDither) 1235 ); 1236 } else { 1237 __asm__ volatile( 1238 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1239 "mov %4, %%"REG_b" \n\t" 1240 "push %%"REG_BP" \n\t" 1241 YSCALEYUV2RGB1b(%%REGBP, %5) 1242 "pcmpeqd %%mm7, %%mm7 \n\t" 1243 WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1244 "pop %%"REG_BP" \n\t" 1245 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1246 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1247 "a" (&c->redDither) 1248 ); 1249 } 1250 } 1251} 1252 1253static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0, 1254 const int16_t *ubuf[2], const int16_t *vbuf[2], 1255 const int16_t *abuf0, uint8_t *dest, 1256 int dstW, int uvalpha, int y) 1257{ 1258 const int16_t *ubuf0 = ubuf[0]; 1259 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 1260 1261 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 1262 const int16_t *ubuf1 = ubuf[0]; 1263 __asm__ volatile( 1264 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1265 "mov %4, %%"REG_b" \n\t" 1266 "push %%"REG_BP" \n\t" 1267 YSCALEYUV2RGB1(%%REGBP, %5) 1268 "pxor %%mm7, %%mm7 \n\t" 1269 WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) 1270 "pop %%"REG_BP" \n\t" 1271 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1272 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1273 "a" (&c->redDither) 1274 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) 1275 ); 1276 } else { 1277 const int16_t *ubuf1 = ubuf[1]; 1278 __asm__ volatile( 1279 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1280 "mov %4, %%"REG_b" \n\t" 1281 "push %%"REG_BP" \n\t" 1282 YSCALEYUV2RGB1b(%%REGBP, %5) 1283 "pxor %%mm7, %%mm7 \n\t" 1284 WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) 1285 "pop %%"REG_BP" \n\t" 1286 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1287 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1288 "a" (&c->redDither) 1289 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) 1290 ); 1291 } 1292} 1293 1294static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0, 1295 const int16_t *ubuf[2], const int16_t *vbuf[2], 1296 const int16_t *abuf0, uint8_t *dest, 1297 int dstW, int uvalpha, int y) 1298{ 1299 const int16_t *ubuf0 = ubuf[0]; 1300 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 1301 1302 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 1303 const int16_t *ubuf1 = ubuf[0]; 1304 __asm__ volatile( 1305 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1306 "mov %4, %%"REG_b" \n\t" 1307 "push %%"REG_BP" \n\t" 1308 YSCALEYUV2RGB1(%%REGBP, %5) 1309 "pxor %%mm7, %%mm7 \n\t" 1310 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1311#ifdef DITHER1XBPP 1312 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1313 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1314 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1315#endif 1316 WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) 1317 "pop %%"REG_BP" \n\t" 1318 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1319 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1320 "a" (&c->redDither) 1321 NAMED_CONSTRAINTS_ADD(bF8) 1322 ); 1323 } else { 1324 const int16_t *ubuf1 = ubuf[1]; 1325 __asm__ volatile( 1326 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1327 "mov %4, %%"REG_b" \n\t" 1328 "push %%"REG_BP" \n\t" 1329 YSCALEYUV2RGB1b(%%REGBP, %5) 1330 "pxor %%mm7, %%mm7 \n\t" 1331 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1332#ifdef DITHER1XBPP 1333 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1334 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1335 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1336#endif 1337 WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) 1338 "pop %%"REG_BP" \n\t" 1339 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1340 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1341 "a" (&c->redDither) 1342 NAMED_CONSTRAINTS_ADD(bF8) 1343 ); 1344 } 1345} 1346 1347static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, 1348 const int16_t *ubuf[2], const int16_t *vbuf[2], 1349 const int16_t *abuf0, uint8_t *dest, 1350 int dstW, int uvalpha, int y) 1351{ 1352 const int16_t *ubuf0 = ubuf[0]; 1353 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 1354 1355 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 1356 const int16_t *ubuf1 = ubuf[0]; 1357 __asm__ volatile( 1358 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1359 "mov %4, %%"REG_b" \n\t" 1360 "push %%"REG_BP" \n\t" 1361 YSCALEYUV2RGB1(%%REGBP, %5) 1362 "pxor %%mm7, %%mm7 \n\t" 1363 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1364#ifdef DITHER1XBPP 1365 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1366 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1367 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1368#endif 1369 WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) 1370 "pop %%"REG_BP" \n\t" 1371 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1372 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1373 "a" (&c->redDither) 1374 NAMED_CONSTRAINTS_ADD(bF8,bFC) 1375 ); 1376 } else { 1377 const int16_t *ubuf1 = ubuf[1]; 1378 __asm__ volatile( 1379 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1380 "mov %4, %%"REG_b" \n\t" 1381 "push %%"REG_BP" \n\t" 1382 YSCALEYUV2RGB1b(%%REGBP, %5) 1383 "pxor %%mm7, %%mm7 \n\t" 1384 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1385#ifdef DITHER1XBPP 1386 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1387 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1388 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1389#endif 1390 WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) 1391 "pop %%"REG_BP" \n\t" 1392 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1393 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1394 "a" (&c->redDither) 1395 NAMED_CONSTRAINTS_ADD(bF8,bFC) 1396 ); 1397 } 1398} 1399 1400#define REAL_YSCALEYUV2PACKED1(index, c) \ 1401 "xor "#index", "#index" \n\t"\ 1402 ".p2align 4 \n\t"\ 1403 "1: \n\t"\ 1404 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ 1405 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1406 "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ 1407 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1408 "psraw $7, %%mm3 \n\t" \ 1409 "psraw $7, %%mm4 \n\t" \ 1410 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 1411 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 1412 "psraw $7, %%mm1 \n\t" \ 1413 "psraw $7, %%mm7 \n\t" \ 1414 1415#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) 1416 1417#define REAL_YSCALEYUV2PACKED1b(index, c) \ 1418 "xor "#index", "#index" \n\t"\ 1419 ".p2align 4 \n\t"\ 1420 "1: \n\t"\ 1421 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 1422 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 1423 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1424 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 1425 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 1426 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1427 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ 1428 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ 1429 "psrlw $8, %%mm3 \n\t" \ 1430 "psrlw $8, %%mm4 \n\t" \ 1431 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 1432 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 1433 "psraw $7, %%mm1 \n\t" \ 1434 "psraw $7, %%mm7 \n\t" 1435#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) 1436 1437static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, 1438 const int16_t *ubuf[2], const int16_t *vbuf[2], 1439 const int16_t *abuf0, uint8_t *dest, 1440 int dstW, int uvalpha, int y) 1441{ 1442 const int16_t *ubuf0 = ubuf[0]; 1443 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 1444 1445 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 1446 const int16_t *ubuf1 = ubuf[0]; 1447 __asm__ volatile( 1448 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1449 "mov %4, %%"REG_b" \n\t" 1450 "push %%"REG_BP" \n\t" 1451 YSCALEYUV2PACKED1(%%REGBP, %5) 1452 WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) 1453 "pop %%"REG_BP" \n\t" 1454 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1455 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1456 "a" (&c->redDither) 1457 ); 1458 } else { 1459 const int16_t *ubuf1 = ubuf[1]; 1460 __asm__ volatile( 1461 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1462 "mov %4, %%"REG_b" \n\t" 1463 "push %%"REG_BP" \n\t" 1464 YSCALEYUV2PACKED1b(%%REGBP, %5) 1465 WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) 1466 "pop %%"REG_BP" \n\t" 1467 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1468 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), 1469 "a" (&c->redDither) 1470 ); 1471 } 1472} 1473 1474#if COMPILE_TEMPLATE_MMXEXT 1475static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, 1476 int dstWidth, const uint8_t *src, 1477 int srcW, int xInc) 1478{ 1479 int32_t *filterPos = c->hLumFilterPos; 1480 int16_t *filter = c->hLumFilter; 1481 void *mmxextFilterCode = c->lumMmxextFilterCode; 1482 int i; 1483#if defined(PIC) 1484 uint64_t ebxsave; 1485#endif 1486#if ARCH_X86_64 1487 uint64_t retsave; 1488#endif 1489 1490 __asm__ volatile( 1491#if defined(PIC) 1492 "mov %%"REG_b", %5 \n\t" 1493#if ARCH_X86_64 1494 "mov -8(%%rsp), %%"REG_a" \n\t" 1495 "mov %%"REG_a", %6 \n\t" 1496#endif 1497#else 1498#if ARCH_X86_64 1499 "mov -8(%%rsp), %%"REG_a" \n\t" 1500 "mov %%"REG_a", %5 \n\t" 1501#endif 1502#endif 1503 "pxor %%mm7, %%mm7 \n\t" 1504 "mov %0, %%"REG_c" \n\t" 1505 "mov %1, %%"REG_D" \n\t" 1506 "mov %2, %%"REG_d" \n\t" 1507 "mov %3, %%"REG_b" \n\t" 1508 "xor %%"REG_a", %%"REG_a" \n\t" // i 1509 PREFETCH" (%%"REG_c") \n\t" 1510 PREFETCH" 32(%%"REG_c") \n\t" 1511 PREFETCH" 64(%%"REG_c") \n\t" 1512 1513#if ARCH_X86_64 1514#define CALL_MMXEXT_FILTER_CODE \ 1515 "movl (%%"REG_b"), %%esi \n\t"\ 1516 "call *%4 \n\t"\ 1517 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ 1518 "add %%"REG_S", %%"REG_c" \n\t"\ 1519 "add %%"REG_a", %%"REG_D" \n\t"\ 1520 "xor %%"REG_a", %%"REG_a" \n\t"\ 1521 1522#else 1523#define CALL_MMXEXT_FILTER_CODE \ 1524 "movl (%%"REG_b"), %%esi \n\t"\ 1525 "call *%4 \n\t"\ 1526 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ 1527 "add %%"REG_a", %%"REG_D" \n\t"\ 1528 "xor %%"REG_a", %%"REG_a" \n\t"\ 1529 1530#endif /* ARCH_X86_64 */ 1531 1532 CALL_MMXEXT_FILTER_CODE 1533 CALL_MMXEXT_FILTER_CODE 1534 CALL_MMXEXT_FILTER_CODE 1535 CALL_MMXEXT_FILTER_CODE 1536 CALL_MMXEXT_FILTER_CODE 1537 CALL_MMXEXT_FILTER_CODE 1538 CALL_MMXEXT_FILTER_CODE 1539 CALL_MMXEXT_FILTER_CODE 1540 1541#if defined(PIC) 1542 "mov %5, %%"REG_b" \n\t" 1543#if ARCH_X86_64 1544 "mov %6, %%"REG_a" \n\t" 1545 "mov %%"REG_a", -8(%%rsp) \n\t" 1546#endif 1547#else 1548#if ARCH_X86_64 1549 "mov %5, %%"REG_a" \n\t" 1550 "mov %%"REG_a", -8(%%rsp) \n\t" 1551#endif 1552#endif 1553 :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos), 1554 "m" (mmxextFilterCode) 1555#if defined(PIC) 1556 ,"m" (ebxsave) 1557#endif 1558#if ARCH_X86_64 1559 ,"m"(retsave) 1560#endif 1561 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D 1562#if !defined(PIC) 1563 ,"%"REG_b 1564#endif 1565 ); 1566 1567 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) 1568 dst[i] = src[srcW-1]*128; 1569} 1570 1571static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2, 1572 int dstWidth, const uint8_t *src1, 1573 const uint8_t *src2, int srcW, int xInc) 1574{ 1575 int32_t *filterPos = c->hChrFilterPos; 1576 int16_t *filter = c->hChrFilter; 1577 void *mmxextFilterCode = c->chrMmxextFilterCode; 1578 int i; 1579#if defined(PIC) 1580 DECLARE_ALIGNED(8, uint64_t, ebxsave); 1581#endif 1582#if ARCH_X86_64 1583 DECLARE_ALIGNED(8, uint64_t, retsave); 1584#endif 1585 1586 __asm__ volatile( 1587#if defined(PIC) 1588 "mov %%"REG_b", %7 \n\t" 1589#if ARCH_X86_64 1590 "mov -8(%%rsp), %%"REG_a" \n\t" 1591 "mov %%"REG_a", %8 \n\t" 1592#endif 1593#else 1594#if ARCH_X86_64 1595 "mov -8(%%rsp), %%"REG_a" \n\t" 1596 "mov %%"REG_a", %7 \n\t" 1597#endif 1598#endif 1599 "pxor %%mm7, %%mm7 \n\t" 1600 "mov %0, %%"REG_c" \n\t" 1601 "mov %1, %%"REG_D" \n\t" 1602 "mov %2, %%"REG_d" \n\t" 1603 "mov %3, %%"REG_b" \n\t" 1604 "xor %%"REG_a", %%"REG_a" \n\t" // i 1605 PREFETCH" (%%"REG_c") \n\t" 1606 PREFETCH" 32(%%"REG_c") \n\t" 1607 PREFETCH" 64(%%"REG_c") \n\t" 1608 1609 CALL_MMXEXT_FILTER_CODE 1610 CALL_MMXEXT_FILTER_CODE 1611 CALL_MMXEXT_FILTER_CODE 1612 CALL_MMXEXT_FILTER_CODE 1613 "xor %%"REG_a", %%"REG_a" \n\t" // i 1614 "mov %5, %%"REG_c" \n\t" // src 1615 "mov %6, %%"REG_D" \n\t" // buf2 1616 PREFETCH" (%%"REG_c") \n\t" 1617 PREFETCH" 32(%%"REG_c") \n\t" 1618 PREFETCH" 64(%%"REG_c") \n\t" 1619 1620 CALL_MMXEXT_FILTER_CODE 1621 CALL_MMXEXT_FILTER_CODE 1622 CALL_MMXEXT_FILTER_CODE 1623 CALL_MMXEXT_FILTER_CODE 1624 1625#if defined(PIC) 1626 "mov %7, %%"REG_b" \n\t" 1627#if ARCH_X86_64 1628 "mov %8, %%"REG_a" \n\t" 1629 "mov %%"REG_a", -8(%%rsp) \n\t" 1630#endif 1631#else 1632#if ARCH_X86_64 1633 "mov %7, %%"REG_a" \n\t" 1634 "mov %%"REG_a", -8(%%rsp) \n\t" 1635#endif 1636#endif 1637 :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos), 1638 "m" (mmxextFilterCode), "m" (src2), "m"(dst2) 1639#if defined(PIC) 1640 ,"m" (ebxsave) 1641#endif 1642#if ARCH_X86_64 1643 ,"m"(retsave) 1644#endif 1645 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D 1646#if !defined(PIC) 1647 ,"%"REG_b 1648#endif 1649 ); 1650 1651 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) { 1652 dst1[i] = src1[srcW-1]*128; 1653 dst2[i] = src2[srcW-1]*128; 1654 } 1655} 1656#endif /* COMPILE_TEMPLATE_MMXEXT */ 1657 1658static av_cold void RENAME(sws_init_swscale)(SwsContext *c) 1659{ 1660 enum AVPixelFormat dstFormat = c->dstFormat; 1661 1662 c->use_mmx_vfilter= 0; 1663 if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12 1664 && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) { 1665 if (c->flags & SWS_ACCURATE_RND) { 1666 if (!(c->flags & SWS_FULL_CHR_H_INT)) { 1667 switch (c->dstFormat) { 1668 case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break; 1669#if HAVE_6REGS 1670 case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break; 1671#endif 1672 case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break; 1673 case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break; 1674 case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break; 1675 default: break; 1676 } 1677 } 1678 } else { 1679 c->use_mmx_vfilter= 1; 1680 c->yuv2planeX = RENAME(yuv2yuvX ); 1681 if (!(c->flags & SWS_FULL_CHR_H_INT)) { 1682 switch (c->dstFormat) { 1683 case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break; 1684#if HAVE_6REGS 1685 case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break; 1686#endif 1687 case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break; 1688 case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break; 1689 case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break; 1690 default: break; 1691 } 1692 } 1693 } 1694 if (!(c->flags & SWS_FULL_CHR_H_INT)) { 1695 switch (c->dstFormat) { 1696 case AV_PIX_FMT_RGB32: 1697 c->yuv2packed1 = RENAME(yuv2rgb32_1); 1698 c->yuv2packed2 = RENAME(yuv2rgb32_2); 1699 break; 1700 case AV_PIX_FMT_BGR24: 1701 c->yuv2packed1 = RENAME(yuv2bgr24_1); 1702 c->yuv2packed2 = RENAME(yuv2bgr24_2); 1703 break; 1704 case AV_PIX_FMT_RGB555: 1705 c->yuv2packed1 = RENAME(yuv2rgb555_1); 1706 c->yuv2packed2 = RENAME(yuv2rgb555_2); 1707 break; 1708 case AV_PIX_FMT_RGB565: 1709 c->yuv2packed1 = RENAME(yuv2rgb565_1); 1710 c->yuv2packed2 = RENAME(yuv2rgb565_2); 1711 break; 1712 case AV_PIX_FMT_YUYV422: 1713 c->yuv2packed1 = RENAME(yuv2yuyv422_1); 1714 c->yuv2packed2 = RENAME(yuv2yuyv422_2); 1715 break; 1716 default: 1717 break; 1718 } 1719 } 1720 } 1721 1722 if (c->srcBpc == 8 && c->dstBpc <= 14) { 1723 // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one). 1724#if COMPILE_TEMPLATE_MMXEXT 1725 if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) { 1726 c->hyscale_fast = RENAME(hyscale_fast); 1727 c->hcscale_fast = RENAME(hcscale_fast); 1728 } else { 1729#endif /* COMPILE_TEMPLATE_MMXEXT */ 1730 c->hyscale_fast = NULL; 1731 c->hcscale_fast = NULL; 1732#if COMPILE_TEMPLATE_MMXEXT 1733 } 1734#endif /* COMPILE_TEMPLATE_MMXEXT */ 1735 } 1736} 1737