/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* This template is compiled several times with different
 * COMPILE_TEMPLATE_* settings, so helper macros from any previous
 * inclusion must be dropped before being redefined below. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH

/* Prefetch instruction for the target CPU; plain MMX has none, so
 * fall back to an assembler comment that assembles to nothing. */
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

/* Byte-wise average: MMX2 and 3DNow! spell the instruction differently. */
#if COMPILE_TEMPLATE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif COMPILE_TEMPLATE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* Quadword store; on MMX2 use the non-temporal movntq (output is
 * written once and not read back, so bypassing the cache helps). */
#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)

#if COMPILE_TEMPLATE_ALTIVEC
#include "ppc/swscale_altivec_template.c"
#endif

/* Vertical scale pass: multiply-accumulate the filter taps over the
 * source lines and store 8 clamped bytes per iteration.
 * %0 = &c->redDither (base address for the *_OFFSET fields),
 * %1 = dest, %2 = width.  REG_d walks the filter list, which is
 * terminated by a NULL source pointer (the test/jnz below). */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ASMALIGN(4) /* FIXME Unroll?
*/\ 59 "1: \n\t"\ 60 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ 61 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\ 62 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\ 63 "add $16, %%"REG_d" \n\t"\ 64 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 65 "test %%"REG_S", %%"REG_S" \n\t"\ 66 "pmulhw %%mm0, %%mm2 \n\t"\ 67 "pmulhw %%mm0, %%mm5 \n\t"\ 68 "paddw %%mm2, %%mm3 \n\t"\ 69 "paddw %%mm5, %%mm4 \n\t"\ 70 " jnz 1b \n\t"\ 71 "psraw $3, %%mm3 \n\t"\ 72 "psraw $3, %%mm4 \n\t"\ 73 "packuswb %%mm4, %%mm3 \n\t"\ 74 MOVNTQ(%%mm3, (%1, %%REGa))\ 75 "add $8, %%"REG_a" \n\t"\ 76 "cmp %2, %%"REG_a" \n\t"\ 77 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ 78 "movq %%mm3, %%mm4 \n\t"\ 79 "lea " offset "(%0), %%"REG_d" \n\t"\ 80 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 81 "jb 1b \n\t"\ 82 :: "r" (&c->redDither),\ 83 "r" (dest), "g" (width)\ 84 : "%"REG_a, "%"REG_d, "%"REG_S\ 85 ); 86 87#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \ 88 __asm__ volatile(\ 89 "lea " offset "(%0), %%"REG_d" \n\t"\ 90 "xor %%"REG_a", %%"REG_a" \n\t"\ 91 "pxor %%mm4, %%mm4 \n\t"\ 92 "pxor %%mm5, %%mm5 \n\t"\ 93 "pxor %%mm6, %%mm6 \n\t"\ 94 "pxor %%mm7, %%mm7 \n\t"\ 95 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 96 ASMALIGN(4) \ 97 "1: \n\t"\ 98 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\ 99 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\ 100 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 101 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\ 102 "movq %%mm0, %%mm3 \n\t"\ 103 "punpcklwd %%mm1, %%mm0 \n\t"\ 104 "punpckhwd %%mm1, %%mm3 \n\t"\ 105 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ 106 "pmaddwd %%mm1, %%mm0 \n\t"\ 107 "pmaddwd %%mm1, %%mm3 \n\t"\ 108 "paddd %%mm0, %%mm4 \n\t"\ 109 "paddd %%mm3, %%mm5 \n\t"\ 110 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\ 111 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 112 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ 113 "test %%"REG_S", 
%%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 1b \n\t"\
        /* reduce the 32 bit accumulators to words, add the rounder,
           shift down and clamp to unsigned bytes */\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "psraw $3, %%mm6 \n\t"\
        "packuswb %%mm6, %%mm4 \n\t"\
        MOVNTQ(%%mm4, (%1, %%REGa))\
        "add $8, %%"REG_a" \n\t"\
        "cmp %2, %%"REG_a" \n\t"\
        /* reset accumulators and filter pointer for the next 8 pixels */\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "r" (&c->redDither),\
        "r" (dest), "g" (width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );

/* 1:1 vertical pass: shift the 16 bit intermediate samples down by 7
 * and clamp to unsigned bytes (no rounding).  %0/%1 point at the END
 * of source/dest and %2 holds -width, so "add $8 / jnc" terminates on
 * the carry flag when the negative index wraps through zero. */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

/* Same as YSCALEYUV2YV121 but adds 0x0040 per word (built in mm7 via
 * pcmpeqw/psrlw/psllw) before the >>7 so the result is rounded
 * instead of truncated. */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

/*
 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
 "r" (dest), "m" (dstW),
 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/* Vertical chroma pass for the packed-output scalers: accumulates the
 * chroma filter taps, leaving U in mm3 and V in mm4.  Label 1 is the
 * per-8-pixel outer loop closed later by the WRITE* macros; label 2
 * loops over the filter taps until the NULL terminator. */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ASMALIGN(4)\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        ASMALIGN(4)\
        "2: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"\

/* Matching vertical luma pass: accumulates two groups of 4 samples
 * into dst1/dst2; the registers and filter offset are macro
 * parameters so the same code serves different planes. */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw
"#coeff", "#src2" \n\t"\ 225 "paddw "#src1", "#dst1" \n\t"\ 226 "paddw "#src2", "#dst2" \n\t"\ 227 "test %%"REG_S", %%"REG_S" \n\t"\ 228 " jnz 2b \n\t"\ 229 230#define YSCALEYUV2PACKEDX \ 231 YSCALEYUV2PACKEDX_UV \ 232 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \ 233 234#define YSCALEYUV2PACKEDX_END \ 235 :: "r" (&c->redDither), \ 236 "m" (dummy), "m" (dummy), "m" (dummy),\ 237 "r" (dest), "m" (dstW) \ 238 : "%"REG_a, "%"REG_d, "%"REG_S \ 239 ); 240 241#define YSCALEYUV2PACKEDX_ACCURATE_UV \ 242 __asm__ volatile(\ 243 "xor %%"REG_a", %%"REG_a" \n\t"\ 244 ASMALIGN(4)\ 245 "nop \n\t"\ 246 "1: \n\t"\ 247 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ 248 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 249 "pxor %%mm4, %%mm4 \n\t"\ 250 "pxor %%mm5, %%mm5 \n\t"\ 251 "pxor %%mm6, %%mm6 \n\t"\ 252 "pxor %%mm7, %%mm7 \n\t"\ 253 ASMALIGN(4)\ 254 "2: \n\t"\ 255 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ 256 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ 257 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 258 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\ 259 "movq %%mm0, %%mm3 \n\t"\ 260 "punpcklwd %%mm1, %%mm0 \n\t"\ 261 "punpckhwd %%mm1, %%mm3 \n\t"\ 262 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\ 263 "pmaddwd %%mm1, %%mm0 \n\t"\ 264 "pmaddwd %%mm1, %%mm3 \n\t"\ 265 "paddd %%mm0, %%mm4 \n\t"\ 266 "paddd %%mm3, %%mm5 \n\t"\ 267 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\ 268 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 269 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ 270 "test %%"REG_S", %%"REG_S" \n\t"\ 271 "movq %%mm2, %%mm0 \n\t"\ 272 "punpcklwd %%mm3, %%mm2 \n\t"\ 273 "punpckhwd %%mm3, %%mm0 \n\t"\ 274 "pmaddwd %%mm1, %%mm2 \n\t"\ 275 "pmaddwd %%mm1, %%mm0 \n\t"\ 276 "paddd %%mm2, %%mm6 \n\t"\ 277 "paddd %%mm0, %%mm7 \n\t"\ 278 " jnz 2b \n\t"\ 279 "psrad $16, %%mm4 \n\t"\ 280 "psrad $16, %%mm5 \n\t"\ 281 "psrad $16, %%mm6 \n\t"\ 282 
"psrad $16, %%mm7 \n\t"\ 283 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 284 "packssdw %%mm5, %%mm4 \n\t"\ 285 "packssdw %%mm7, %%mm6 \n\t"\ 286 "paddw %%mm0, %%mm4 \n\t"\ 287 "paddw %%mm0, %%mm6 \n\t"\ 288 "movq %%mm4, "U_TEMP"(%0) \n\t"\ 289 "movq %%mm6, "V_TEMP"(%0) \n\t"\ 290 291#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \ 292 "lea "offset"(%0), %%"REG_d" \n\t"\ 293 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 294 "pxor %%mm1, %%mm1 \n\t"\ 295 "pxor %%mm5, %%mm5 \n\t"\ 296 "pxor %%mm7, %%mm7 \n\t"\ 297 "pxor %%mm6, %%mm6 \n\t"\ 298 ASMALIGN(4)\ 299 "2: \n\t"\ 300 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ 301 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ 302 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 303 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ 304 "movq %%mm0, %%mm3 \n\t"\ 305 "punpcklwd %%mm4, %%mm0 \n\t"\ 306 "punpckhwd %%mm4, %%mm3 \n\t"\ 307 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\ 308 "pmaddwd %%mm4, %%mm0 \n\t"\ 309 "pmaddwd %%mm4, %%mm3 \n\t"\ 310 "paddd %%mm0, %%mm1 \n\t"\ 311 "paddd %%mm3, %%mm5 \n\t"\ 312 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\ 313 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 314 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ 315 "test %%"REG_S", %%"REG_S" \n\t"\ 316 "movq %%mm2, %%mm0 \n\t"\ 317 "punpcklwd %%mm3, %%mm2 \n\t"\ 318 "punpckhwd %%mm3, %%mm0 \n\t"\ 319 "pmaddwd %%mm4, %%mm2 \n\t"\ 320 "pmaddwd %%mm4, %%mm0 \n\t"\ 321 "paddd %%mm2, %%mm7 \n\t"\ 322 "paddd %%mm0, %%mm6 \n\t"\ 323 " jnz 2b \n\t"\ 324 "psrad $16, %%mm1 \n\t"\ 325 "psrad $16, %%mm5 \n\t"\ 326 "psrad $16, %%mm7 \n\t"\ 327 "psrad $16, %%mm6 \n\t"\ 328 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 329 "packssdw %%mm5, %%mm1 \n\t"\ 330 "packssdw %%mm6, %%mm7 \n\t"\ 331 "paddw %%mm0, %%mm1 \n\t"\ 332 "paddw %%mm0, %%mm7 \n\t"\ 333 "movq "U_TEMP"(%0), %%mm3 \n\t"\ 334 "movq "V_TEMP"(%0), %%mm4 \n\t"\ 335 336#define YSCALEYUV2PACKEDX_ACCURATE \ 337 
YSCALEYUV2PACKEDX_ACCURATE_UV \ 338 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET) 339 340#define YSCALEYUV2RGBX \ 341 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ 342 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ 343 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 344 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 345 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ 346 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ 347 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 348 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ 349 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ 350 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ 351 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ 352 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ 353 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ 354 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 355 "paddw %%mm3, %%mm4 \n\t"\ 356 "movq %%mm2, %%mm0 \n\t"\ 357 "movq %%mm5, %%mm6 \n\t"\ 358 "movq %%mm4, %%mm3 \n\t"\ 359 "punpcklwd %%mm2, %%mm2 \n\t"\ 360 "punpcklwd %%mm5, %%mm5 \n\t"\ 361 "punpcklwd %%mm4, %%mm4 \n\t"\ 362 "paddw %%mm1, %%mm2 \n\t"\ 363 "paddw %%mm1, %%mm5 \n\t"\ 364 "paddw %%mm1, %%mm4 \n\t"\ 365 "punpckhwd %%mm0, %%mm0 \n\t"\ 366 "punpckhwd %%mm6, %%mm6 \n\t"\ 367 "punpckhwd %%mm3, %%mm3 \n\t"\ 368 "paddw %%mm7, %%mm0 \n\t"\ 369 "paddw %%mm7, %%mm6 \n\t"\ 370 "paddw %%mm7, %%mm3 \n\t"\ 371 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 372 "packuswb %%mm0, %%mm2 \n\t"\ 373 "packuswb %%mm6, %%mm5 \n\t"\ 374 "packuswb %%mm3, %%mm4 \n\t"\ 375 376#define REAL_YSCALEYUV2PACKED(index, c) \ 377 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 378 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\ 379 "psraw $3, %%mm0 \n\t"\ 380 "psraw $3, %%mm1 \n\t"\ 381 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 382 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 383 "xor "#index", "#index" \n\t"\ 384 ASMALIGN(4)\ 385 "1: \n\t"\ 386 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 387 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 388 "movq "AV_STRINGIFY(VOF)"(%2, 
"#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 389 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 390 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ 391 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ 392 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 393 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ 394 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ 395 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 396 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 397 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ 398 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ 399 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ 400 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ 401 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ 402 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ 403 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ 404 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ 405 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 406 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 407 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 408 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 409 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 410 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 411 412#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) 413 414#define REAL_YSCALEYUV2RGB_UV(index, c) \ 415 "xor "#index", "#index" \n\t"\ 416 ASMALIGN(4)\ 417 "1: \n\t"\ 418 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 419 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 420 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 421 "movq 
"AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 422 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ 423 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ 424 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 425 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ 426 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ 427 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 428 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 429 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ 430 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ 431 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 432 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 433 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 434 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 435 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 436 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 437 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 438 439#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \ 440 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ 441 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ 442 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ 443 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ 444 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ 445 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ 446 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 447 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 448 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 449 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 450 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 451 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 452 453#define 
REAL_YSCALEYUV2RGB_COEFF(c) \ 454 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 455 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 456 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 457 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 458 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 459 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 460 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 461 "paddw %%mm3, %%mm4 \n\t"\ 462 "movq %%mm2, %%mm0 \n\t"\ 463 "movq %%mm5, %%mm6 \n\t"\ 464 "movq %%mm4, %%mm3 \n\t"\ 465 "punpcklwd %%mm2, %%mm2 \n\t"\ 466 "punpcklwd %%mm5, %%mm5 \n\t"\ 467 "punpcklwd %%mm4, %%mm4 \n\t"\ 468 "paddw %%mm1, %%mm2 \n\t"\ 469 "paddw %%mm1, %%mm5 \n\t"\ 470 "paddw %%mm1, %%mm4 \n\t"\ 471 "punpckhwd %%mm0, %%mm0 \n\t"\ 472 "punpckhwd %%mm6, %%mm6 \n\t"\ 473 "punpckhwd %%mm3, %%mm3 \n\t"\ 474 "paddw %%mm7, %%mm0 \n\t"\ 475 "paddw %%mm7, %%mm6 \n\t"\ 476 "paddw %%mm7, %%mm3 \n\t"\ 477 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 478 "packuswb %%mm0, %%mm2 \n\t"\ 479 "packuswb %%mm6, %%mm5 \n\t"\ 480 "packuswb %%mm3, %%mm4 \n\t"\ 481 482#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) 483 484#define YSCALEYUV2RGB(index, c) \ 485 REAL_YSCALEYUV2RGB_UV(index, c) \ 486 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ 487 REAL_YSCALEYUV2RGB_COEFF(c) 488 489#define REAL_YSCALEYUV2PACKED1(index, c) \ 490 "xor "#index", "#index" \n\t"\ 491 ASMALIGN(4)\ 492 "1: \n\t"\ 493 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ 494 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ 495 "psraw $7, %%mm3 \n\t" \ 496 "psraw $7, %%mm4 \n\t" \ 497 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 498 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 499 "psraw $7, %%mm1 \n\t" \ 500 "psraw $7, %%mm7 \n\t" \ 501 502#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) 503 504#define REAL_YSCALEYUV2RGB1(index, c) \ 505 "xor "#index", "#index" \n\t"\ 506 ASMALIGN(4)\ 507 "1: \n\t"\ 508 "movq (%2, "#index"), %%mm3 
\n\t" /* uvbuf0[eax]*/\ 509 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ 510 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 511 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 512 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 513 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 514 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 515 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 516 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 517 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 518 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 519 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 520 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 521 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 522 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 523 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 524 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 525 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 526 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 527 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 528 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 529 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 530 "paddw %%mm3, %%mm4 \n\t"\ 531 "movq %%mm2, %%mm0 \n\t"\ 532 "movq %%mm5, %%mm6 \n\t"\ 533 "movq %%mm4, %%mm3 \n\t"\ 534 "punpcklwd %%mm2, %%mm2 \n\t"\ 535 "punpcklwd %%mm5, %%mm5 \n\t"\ 536 "punpcklwd %%mm4, %%mm4 \n\t"\ 537 "paddw %%mm1, %%mm2 \n\t"\ 538 "paddw %%mm1, %%mm5 \n\t"\ 539 "paddw %%mm1, %%mm4 \n\t"\ 540 "punpckhwd %%mm0, %%mm0 \n\t"\ 541 "punpckhwd %%mm6, %%mm6 \n\t"\ 542 "punpckhwd %%mm3, %%mm3 \n\t"\ 543 "paddw %%mm7, %%mm0 \n\t"\ 544 "paddw %%mm7, %%mm6 \n\t"\ 545 "paddw %%mm7, %%mm3 \n\t"\ 546 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 547 "packuswb %%mm0, %%mm2 \n\t"\ 548 "packuswb %%mm6, %%mm5 \n\t"\ 549 "packuswb %%mm3, %%mm4 \n\t"\ 550 551#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) 552 553#define REAL_YSCALEYUV2PACKED1b(index, c) \ 554 "xor "#index", "#index" \n\t"\ 555 ASMALIGN(4)\ 
556 "1: \n\t"\ 557 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 558 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 559 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 560 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 561 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ 562 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ 563 "psrlw $8, %%mm3 \n\t" \ 564 "psrlw $8, %%mm4 \n\t" \ 565 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 566 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 567 "psraw $7, %%mm1 \n\t" \ 568 "psraw $7, %%mm7 \n\t" 569#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) 570 571// do vertical chrominance interpolation 572#define REAL_YSCALEYUV2RGB1b(index, c) \ 573 "xor "#index", "#index" \n\t"\ 574 ASMALIGN(4)\ 575 "1: \n\t"\ 576 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 577 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 578 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 579 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 580 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ 581 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ 582 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ 583 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ 584 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 585 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 586 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 587 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 588 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 589 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 590 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 591 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 592 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 593 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 594 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 595 "pmulhw "UB_COEFF"("#c"), %%mm2 
\n\t"\ 596 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 597 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 598 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 599 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 600 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 601 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 602 "paddw %%mm3, %%mm4 \n\t"\ 603 "movq %%mm2, %%mm0 \n\t"\ 604 "movq %%mm5, %%mm6 \n\t"\ 605 "movq %%mm4, %%mm3 \n\t"\ 606 "punpcklwd %%mm2, %%mm2 \n\t"\ 607 "punpcklwd %%mm5, %%mm5 \n\t"\ 608 "punpcklwd %%mm4, %%mm4 \n\t"\ 609 "paddw %%mm1, %%mm2 \n\t"\ 610 "paddw %%mm1, %%mm5 \n\t"\ 611 "paddw %%mm1, %%mm4 \n\t"\ 612 "punpckhwd %%mm0, %%mm0 \n\t"\ 613 "punpckhwd %%mm6, %%mm6 \n\t"\ 614 "punpckhwd %%mm3, %%mm3 \n\t"\ 615 "paddw %%mm7, %%mm0 \n\t"\ 616 "paddw %%mm7, %%mm6 \n\t"\ 617 "paddw %%mm7, %%mm3 \n\t"\ 618 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 619 "packuswb %%mm0, %%mm2 \n\t"\ 620 "packuswb %%mm6, %%mm5 \n\t"\ 621 "packuswb %%mm3, %%mm4 \n\t"\ 622 623#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) 624 625#define REAL_YSCALEYUV2RGB1_ALPHA(index) \ 626 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\ 627 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\ 628 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\ 629 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\ 630 "packuswb %%mm1, %%mm7 \n\t" 631#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) 632 633#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \ 634 "movq "#b", "#q2" \n\t" /* B */\ 635 "movq "#r", "#t" \n\t" /* R */\ 636 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\ 637 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\ 638 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\ 639 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\ 640 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\ 641 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\ 642 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\ 643 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\ 644 "punpcklwd 
"#t", "#q2" \n\t" /* ARGBARGB 2 */\ 645 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\ 646\ 647 MOVNTQ( q0, (dst, index, 4))\ 648 MOVNTQ( b, 8(dst, index, 4))\ 649 MOVNTQ( q2, 16(dst, index, 4))\ 650 MOVNTQ( q3, 24(dst, index, 4))\ 651\ 652 "add $8, "#index" \n\t"\ 653 "cmp "#dstw", "#index" \n\t"\ 654 " jb 1b \n\t" 655#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) 656 657#define REAL_WRITERGB16(dst, dstw, index) \ 658 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ 659 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\ 660 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ 661 "psrlq $3, %%mm2 \n\t"\ 662\ 663 "movq %%mm2, %%mm1 \n\t"\ 664 "movq %%mm4, %%mm3 \n\t"\ 665\ 666 "punpcklbw %%mm7, %%mm3 \n\t"\ 667 "punpcklbw %%mm5, %%mm2 \n\t"\ 668 "punpckhbw %%mm7, %%mm4 \n\t"\ 669 "punpckhbw %%mm5, %%mm1 \n\t"\ 670\ 671 "psllq $3, %%mm3 \n\t"\ 672 "psllq $3, %%mm4 \n\t"\ 673\ 674 "por %%mm3, %%mm2 \n\t"\ 675 "por %%mm4, %%mm1 \n\t"\ 676\ 677 MOVNTQ(%%mm2, (dst, index, 2))\ 678 MOVNTQ(%%mm1, 8(dst, index, 2))\ 679\ 680 "add $8, "#index" \n\t"\ 681 "cmp "#dstw", "#index" \n\t"\ 682 " jb 1b \n\t" 683#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) 684 685#define REAL_WRITERGB15(dst, dstw, index) \ 686 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ 687 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\ 688 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ 689 "psrlq $3, %%mm2 \n\t"\ 690 "psrlq $1, %%mm5 \n\t"\ 691\ 692 "movq %%mm2, %%mm1 \n\t"\ 693 "movq %%mm4, %%mm3 \n\t"\ 694\ 695 "punpcklbw %%mm7, %%mm3 \n\t"\ 696 "punpcklbw %%mm5, %%mm2 \n\t"\ 697 "punpckhbw %%mm7, %%mm4 \n\t"\ 698 "punpckhbw %%mm5, %%mm1 \n\t"\ 699\ 700 "psllq $2, %%mm3 \n\t"\ 701 "psllq $2, %%mm4 \n\t"\ 702\ 703 "por %%mm3, %%mm2 \n\t"\ 704 "por %%mm4, %%mm1 \n\t"\ 705\ 706 MOVNTQ(%%mm2, (dst, index, 2))\ 707 MOVNTQ(%%mm1, 8(dst, index, 2))\ 708\ 709 "add $8, "#index" \n\t"\ 710 "cmp "#dstw", "#index" \n\t"\ 711 " jb 1b \n\t" 712#define 
WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) 713 714#define WRITEBGR24OLD(dst, dstw, index) \ 715 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 716 "movq %%mm2, %%mm1 \n\t" /* B */\ 717 "movq %%mm5, %%mm6 \n\t" /* R */\ 718 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ 719 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ 720 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ 721 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ 722 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ 723 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ 724 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ 725 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 726 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ 727 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ 728\ 729 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ 730 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\ 731 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\ 732 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\ 733 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\ 734 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\ 735 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\ 736 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ 737\ 738 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 739 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\ 740 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\ 741 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\ 742 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\ 743 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\ 744 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\ 745 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\ 746 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\ 747 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\ 748 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\ 749 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\ 750 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\ 751\ 752 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\ 753 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\ 754 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\ 755 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 
00000RGB 3 */\ 756 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\ 757 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\ 758 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\ 759 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\ 760\ 761 MOVNTQ(%%mm0, (dst))\ 762 MOVNTQ(%%mm2, 8(dst))\ 763 MOVNTQ(%%mm3, 16(dst))\ 764 "add $24, "#dst" \n\t"\ 765\ 766 "add $8, "#index" \n\t"\ 767 "cmp "#dstw", "#index" \n\t"\ 768 " jb 1b \n\t" 769 770#define WRITEBGR24MMX(dst, dstw, index) \ 771 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 772 "movq %%mm2, %%mm1 \n\t" /* B */\ 773 "movq %%mm5, %%mm6 \n\t" /* R */\ 774 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ 775 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ 776 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ 777 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ 778 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ 779 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ 780 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ 781 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 782 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ 783 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ 784\ 785 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ 786 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ 787 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ 788 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ 789\ 790 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ 791 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ 792 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ 793 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ 794\ 795 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ 796 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ 797 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ 798 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ 799\ 800 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ 801 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ 802 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ 803 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ 804 MOVNTQ(%%mm0, (dst))\ 805\ 806 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ 807 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 
2 */\ 808 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ 809 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ 810 MOVNTQ(%%mm6, 8(dst))\ 811\ 812 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ 813 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ 814 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ 815 MOVNTQ(%%mm5, 16(dst))\ 816\ 817 "add $24, "#dst" \n\t"\ 818\ 819 "add $8, "#index" \n\t"\ 820 "cmp "#dstw", "#index" \n\t"\ 821 " jb 1b \n\t" 822 823#define WRITEBGR24MMX2(dst, dstw, index) \ 824 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 825 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\ 826 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\ 827 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ 828 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ 829 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ 830\ 831 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ 832 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ 833 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ 834\ 835 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ 836 "por %%mm1, %%mm6 \n\t"\ 837 "por %%mm3, %%mm6 \n\t"\ 838 MOVNTQ(%%mm6, (dst))\ 839\ 840 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ 841 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ 842 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ 843 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ 844\ 845 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\ 846 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ 847 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\ 848\ 849 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ 850 "por %%mm3, %%mm6 \n\t"\ 851 MOVNTQ(%%mm6, 8(dst))\ 852\ 853 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\ 854 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ 855 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\ 856\ 857 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ 858 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ 859 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\ 860\ 861 "por %%mm1, %%mm3 \n\t"\ 862 
"por %%mm3, %%mm6 \n\t"\ 863 MOVNTQ(%%mm6, 16(dst))\ 864\ 865 "add $24, "#dst" \n\t"\ 866\ 867 "add $8, "#index" \n\t"\ 868 "cmp "#dstw", "#index" \n\t"\ 869 " jb 1b \n\t" 870 871#if COMPILE_TEMPLATE_MMX2 872#undef WRITEBGR24 873#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index) 874#else 875#undef WRITEBGR24 876#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) 877#endif 878 879#define REAL_WRITEYUY2(dst, dstw, index) \ 880 "packuswb %%mm3, %%mm3 \n\t"\ 881 "packuswb %%mm4, %%mm4 \n\t"\ 882 "packuswb %%mm7, %%mm1 \n\t"\ 883 "punpcklbw %%mm4, %%mm3 \n\t"\ 884 "movq %%mm1, %%mm7 \n\t"\ 885 "punpcklbw %%mm3, %%mm1 \n\t"\ 886 "punpckhbw %%mm3, %%mm7 \n\t"\ 887\ 888 MOVNTQ(%%mm1, (dst, index, 2))\ 889 MOVNTQ(%%mm7, 8(dst, index, 2))\ 890\ 891 "add $8, "#index" \n\t"\ 892 "cmp "#dstw", "#index" \n\t"\ 893 " jb 1b \n\t" 894#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) 895 896 897static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, 898 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc, 899 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW) 900{ 901#if COMPILE_TEMPLATE_MMX 902 if(!(c->flags & SWS_BITEXACT)) { 903 if (c->flags & SWS_ACCURATE_RND) { 904 if (uDest) { 905 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW) 906 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW) 907 } 908 if (CONFIG_SWSCALE_ALPHA && aDest) { 909 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW) 910 } 911 912 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW) 913 } else { 914 if (uDest) { 915 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW) 916 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW) 917 } 918 if (CONFIG_SWSCALE_ALPHA && aDest) { 919 YSCALEYUV2YV12X( "0", 
ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    /* AltiVec path: no alpha plane support, alpSrc/aDest are not passed. */
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}

/**
 * Vertical multi-tap scale to NV12/NV21 (interleaved chroma).
 * Thin wrapper: always defers to the generic C implementation,
 * there is no SIMD version of this path.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}

/**
 * 1-tap vertical "scale": convert one 16-bit intermediate line per plane
 * (Y, U, V and optionally A) to 8-bit output with rounding (+64 >> 7).
 * Any of uDest/vDest/aDest may be NULL to skip that plane.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        /* NOTE(review): int16_t source planes stored through uint8_t pointers;
         * the asm below addresses them relative to the plane end — presumably
         * intentional, but the implicit pointer conversion is worth confirming. */
        const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            /* process A, Y, U, V planes in turn, skipping NULL destinations */
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    /* C fallback: round, shift to 8 bits and clip. */
    for (i=0; i<dstW; i++) {
        int val=
(lumSrc[i]+64)>>7; 986 987 if (val&256) { 988 if (val<0) val=0; 989 else val=255; 990 } 991 992 dest[i]= val; 993 } 994 995 if (uDest) 996 for (i=0; i<chrDstW; i++) { 997 int u=(chrSrc[i ]+64)>>7; 998 int v=(chrSrc[i + VOFW]+64)>>7; 999 1000 if ((u|v)&256) { 1001 if (u<0) u=0; 1002 else if (u>255) u=255; 1003 if (v<0) v=0; 1004 else if (v>255) v=255; 1005 } 1006 1007 uDest[i]= u; 1008 vDest[i]= v; 1009 } 1010 1011 if (CONFIG_SWSCALE_ALPHA && aDest) 1012 for (i=0; i<dstW; i++) { 1013 int val= (alpSrc[i]+64)>>7; 1014 aDest[i]= av_clip_uint8(val); 1015 } 1016} 1017 1018 1019/** 1020 * vertical scale YV12 to RGB 1021 */ 1022static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, 1023 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, 1024 const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY) 1025{ 1026#if COMPILE_TEMPLATE_MMX 1027 x86_reg dummy=0; 1028 if(!(c->flags & SWS_BITEXACT)) { 1029 if (c->flags & SWS_ACCURATE_RND) { 1030 switch(c->dstFormat) { 1031 case PIX_FMT_RGB32: 1032 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 1033 YSCALEYUV2PACKEDX_ACCURATE 1034 YSCALEYUV2RGBX 1035 "movq %%mm2, "U_TEMP"(%0) \n\t" 1036 "movq %%mm4, "V_TEMP"(%0) \n\t" 1037 "movq %%mm5, "Y_TEMP"(%0) \n\t" 1038 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET) 1039 "movq "Y_TEMP"(%0), %%mm5 \n\t" 1040 "psraw $3, %%mm1 \n\t" 1041 "psraw $3, %%mm7 \n\t" 1042 "packuswb %%mm7, %%mm1 \n\t" 1043 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) 1044 1045 YSCALEYUV2PACKEDX_END 1046 } else { 1047 YSCALEYUV2PACKEDX_ACCURATE 1048 YSCALEYUV2RGBX 1049 "pcmpeqd %%mm7, %%mm7 \n\t" 1050 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1051 1052 YSCALEYUV2PACKEDX_END 1053 } 1054 return; 1055 case PIX_FMT_BGR24: 1056 YSCALEYUV2PACKEDX_ACCURATE 1057 YSCALEYUV2RGBX 1058 "pxor %%mm7, %%mm7 \n\t" 1059 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" 
//FIXME optimize 1060 "add %4, %%"REG_c" \n\t" 1061 WRITEBGR24(%%REGc, %5, %%REGa) 1062 1063 1064 :: "r" (&c->redDither), 1065 "m" (dummy), "m" (dummy), "m" (dummy), 1066 "r" (dest), "m" (dstW) 1067 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S 1068 ); 1069 return; 1070 case PIX_FMT_RGB555: 1071 YSCALEYUV2PACKEDX_ACCURATE 1072 YSCALEYUV2RGBX 1073 "pxor %%mm7, %%mm7 \n\t" 1074 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1075#ifdef DITHER1XBPP 1076 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" 1077 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" 1078 "paddusb "RED_DITHER"(%0), %%mm5\n\t" 1079#endif 1080 1081 WRITERGB15(%4, %5, %%REGa) 1082 YSCALEYUV2PACKEDX_END 1083 return; 1084 case PIX_FMT_RGB565: 1085 YSCALEYUV2PACKEDX_ACCURATE 1086 YSCALEYUV2RGBX 1087 "pxor %%mm7, %%mm7 \n\t" 1088 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1089#ifdef DITHER1XBPP 1090 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" 1091 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" 1092 "paddusb "RED_DITHER"(%0), %%mm5\n\t" 1093#endif 1094 1095 WRITERGB16(%4, %5, %%REGa) 1096 YSCALEYUV2PACKEDX_END 1097 return; 1098 case PIX_FMT_YUYV422: 1099 YSCALEYUV2PACKEDX_ACCURATE 1100 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1101 1102 "psraw $3, %%mm3 \n\t" 1103 "psraw $3, %%mm4 \n\t" 1104 "psraw $3, %%mm1 \n\t" 1105 "psraw $3, %%mm7 \n\t" 1106 WRITEYUY2(%4, %5, %%REGa) 1107 YSCALEYUV2PACKEDX_END 1108 return; 1109 } 1110 } else { 1111 switch(c->dstFormat) { 1112 case PIX_FMT_RGB32: 1113 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 1114 YSCALEYUV2PACKEDX 1115 YSCALEYUV2RGBX 1116 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) 1117 "psraw $3, %%mm1 \n\t" 1118 "psraw $3, %%mm7 \n\t" 1119 "packuswb %%mm7, %%mm1 \n\t" 1120 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 1121 YSCALEYUV2PACKEDX_END 1122 } else { 1123 YSCALEYUV2PACKEDX 1124 YSCALEYUV2RGBX 1125 "pcmpeqd %%mm7, %%mm7 \n\t" 1126 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1127 
YSCALEYUV2PACKEDX_END 1128 } 1129 return; 1130 case PIX_FMT_BGR24: 1131 YSCALEYUV2PACKEDX 1132 YSCALEYUV2RGBX 1133 "pxor %%mm7, %%mm7 \n\t" 1134 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize 1135 "add %4, %%"REG_c" \n\t" 1136 WRITEBGR24(%%REGc, %5, %%REGa) 1137 1138 :: "r" (&c->redDither), 1139 "m" (dummy), "m" (dummy), "m" (dummy), 1140 "r" (dest), "m" (dstW) 1141 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S 1142 ); 1143 return; 1144 case PIX_FMT_RGB555: 1145 YSCALEYUV2PACKEDX 1146 YSCALEYUV2RGBX 1147 "pxor %%mm7, %%mm7 \n\t" 1148 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1149#ifdef DITHER1XBPP 1150 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" 1151 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" 1152 "paddusb "RED_DITHER"(%0), %%mm5 \n\t" 1153#endif 1154 1155 WRITERGB15(%4, %5, %%REGa) 1156 YSCALEYUV2PACKEDX_END 1157 return; 1158 case PIX_FMT_RGB565: 1159 YSCALEYUV2PACKEDX 1160 YSCALEYUV2RGBX 1161 "pxor %%mm7, %%mm7 \n\t" 1162 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1163#ifdef DITHER1XBPP 1164 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" 1165 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" 1166 "paddusb "RED_DITHER"(%0), %%mm5 \n\t" 1167#endif 1168 1169 WRITERGB16(%4, %5, %%REGa) 1170 YSCALEYUV2PACKEDX_END 1171 return; 1172 case PIX_FMT_YUYV422: 1173 YSCALEYUV2PACKEDX 1174 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1175 1176 "psraw $3, %%mm3 \n\t" 1177 "psraw $3, %%mm4 \n\t" 1178 "psraw $3, %%mm1 \n\t" 1179 "psraw $3, %%mm7 \n\t" 1180 WRITEYUY2(%4, %5, %%REGa) 1181 YSCALEYUV2PACKEDX_END 1182 return; 1183 } 1184 } 1185 } 1186#endif /* COMPILE_TEMPLATE_MMX */ 1187#if COMPILE_TEMPLATE_ALTIVEC 1188 /* The following list of supported dstFormat values should 1189 match what's found in the body of ff_yuv2packedX_altivec() */ 1190 if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf && 1191 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA || 1192 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 || 1193 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)) 1194 
ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, 1195 chrFilter, chrSrc, chrFilterSize, 1196 dest, dstW, dstY); 1197 else 1198#endif 1199 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize, 1200 chrFilter, chrSrc, chrFilterSize, 1201 alpSrc, dest, dstW, dstY); 1202} 1203 1204/** 1205 * vertical bilinear scale YV12 to RGB 1206 */ 1207static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1, 1208 const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y) 1209{ 1210 int yalpha1=4095- yalpha; 1211 int uvalpha1=4095-uvalpha; 1212 int i; 1213 1214#if COMPILE_TEMPLATE_MMX 1215 if(!(c->flags & SWS_BITEXACT)) { 1216 switch(c->dstFormat) { 1217 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( 1218 case PIX_FMT_RGB32: 1219 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 1220#if ARCH_X86_64 1221 __asm__ volatile( 1222 YSCALEYUV2RGB(%%r8, %5) 1223 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7) 1224 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 1225 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 1226 "packuswb %%mm7, %%mm1 \n\t" 1227 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 1228 1229 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest), 1230 "a" (&c->redDither) 1231 ,"r" (abuf0), "r" (abuf1) 1232 : "%r8" 1233 ); 1234#else 1235 *(const uint16_t **)(&c->u_temp)=abuf0; 1236 *(const uint16_t **)(&c->v_temp)=abuf1; 1237 __asm__ volatile( 1238 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1239 "mov %4, %%"REG_b" \n\t" 1240 "push %%"REG_BP" \n\t" 1241 YSCALEYUV2RGB(%%REGBP, %5) 1242 "push %0 \n\t" 1243 "push %1 \n\t" 1244 "mov "U_TEMP"(%5), %0 \n\t" 1245 "mov "V_TEMP"(%5), %1 \n\t" 1246 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1) 1247 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 1248 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ 1249 "packuswb %%mm7, 
%%mm1 \n\t" 1250 "pop %1 \n\t" 1251 "pop %0 \n\t" 1252 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) 1253 "pop %%"REG_BP" \n\t" 1254 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1255 1256 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1257 "a" (&c->redDither) 1258 ); 1259#endif 1260 } else { 1261 __asm__ volatile( 1262 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1263 "mov %4, %%"REG_b" \n\t" 1264 "push %%"REG_BP" \n\t" 1265 YSCALEYUV2RGB(%%REGBP, %5) 1266 "pcmpeqd %%mm7, %%mm7 \n\t" 1267 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1268 "pop %%"REG_BP" \n\t" 1269 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1270 1271 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1272 "a" (&c->redDither) 1273 ); 1274 } 1275 return; 1276 case PIX_FMT_BGR24: 1277 __asm__ volatile( 1278 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1279 "mov %4, %%"REG_b" \n\t" 1280 "push %%"REG_BP" \n\t" 1281 YSCALEYUV2RGB(%%REGBP, %5) 1282 "pxor %%mm7, %%mm7 \n\t" 1283 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) 1284 "pop %%"REG_BP" \n\t" 1285 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1286 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1287 "a" (&c->redDither) 1288 ); 1289 return; 1290 case PIX_FMT_RGB555: 1291 __asm__ volatile( 1292 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1293 "mov %4, %%"REG_b" \n\t" 1294 "push %%"REG_BP" \n\t" 1295 YSCALEYUV2RGB(%%REGBP, %5) 1296 "pxor %%mm7, %%mm7 \n\t" 1297 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1298#ifdef DITHER1XBPP 1299 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1300 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1301 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1302#endif 1303 1304 WRITERGB15(%%REGb, 8280(%5), %%REGBP) 1305 "pop %%"REG_BP" \n\t" 1306 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1307 1308 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1309 "a" (&c->redDither) 1310 ); 1311 return; 1312 case PIX_FMT_RGB565: 1313 __asm__ 
volatile( 1314 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1315 "mov %4, %%"REG_b" \n\t" 1316 "push %%"REG_BP" \n\t" 1317 YSCALEYUV2RGB(%%REGBP, %5) 1318 "pxor %%mm7, %%mm7 \n\t" 1319 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1320#ifdef DITHER1XBPP 1321 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1322 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1323 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1324#endif 1325 1326 WRITERGB16(%%REGb, 8280(%5), %%REGBP) 1327 "pop %%"REG_BP" \n\t" 1328 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1329 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1330 "a" (&c->redDither) 1331 ); 1332 return; 1333 case PIX_FMT_YUYV422: 1334 __asm__ volatile( 1335 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1336 "mov %4, %%"REG_b" \n\t" 1337 "push %%"REG_BP" \n\t" 1338 YSCALEYUV2PACKED(%%REGBP, %5) 1339 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) 1340 "pop %%"REG_BP" \n\t" 1341 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1342 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1343 "a" (&c->redDither) 1344 ); 1345 return; 1346 default: break; 1347 } 1348 } 1349#endif //COMPILE_TEMPLATE_MMX 1350 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C) 1351} 1352 1353/** 1354 * YV12 to RGB without scaling or interpolating 1355 */ 1356static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1, 1357 const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y) 1358{ 1359 const int yalpha1=0; 1360 int i; 1361 1362 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 1363 const int yalpha= 4096; //FIXME ... 
1364 1365 if (flags&SWS_FULL_CHR_H_INT) { 1366 c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y); 1367 return; 1368 } 1369 1370#if COMPILE_TEMPLATE_MMX 1371 if(!(flags & SWS_BITEXACT)) { 1372 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 1373 switch(dstFormat) { 1374 case PIX_FMT_RGB32: 1375 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 1376 __asm__ volatile( 1377 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1378 "mov %4, %%"REG_b" \n\t" 1379 "push %%"REG_BP" \n\t" 1380 YSCALEYUV2RGB1(%%REGBP, %5) 1381 YSCALEYUV2RGB1_ALPHA(%%REGBP) 1382 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1383 "pop %%"REG_BP" \n\t" 1384 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1385 1386 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1387 "a" (&c->redDither) 1388 ); 1389 } else { 1390 __asm__ volatile( 1391 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1392 "mov %4, %%"REG_b" \n\t" 1393 "push %%"REG_BP" \n\t" 1394 YSCALEYUV2RGB1(%%REGBP, %5) 1395 "pcmpeqd %%mm7, %%mm7 \n\t" 1396 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1397 "pop %%"REG_BP" \n\t" 1398 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1399 1400 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1401 "a" (&c->redDither) 1402 ); 1403 } 1404 return; 1405 case PIX_FMT_BGR24: 1406 __asm__ volatile( 1407 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1408 "mov %4, %%"REG_b" \n\t" 1409 "push %%"REG_BP" \n\t" 1410 YSCALEYUV2RGB1(%%REGBP, %5) 1411 "pxor %%mm7, %%mm7 \n\t" 1412 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) 1413 "pop %%"REG_BP" \n\t" 1414 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1415 1416 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1417 "a" (&c->redDither) 1418 ); 1419 return; 1420 case PIX_FMT_RGB555: 1421 __asm__ volatile( 1422 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1423 "mov %4, %%"REG_b" \n\t" 1424 "push 
%%"REG_BP" \n\t" 1425 YSCALEYUV2RGB1(%%REGBP, %5) 1426 "pxor %%mm7, %%mm7 \n\t" 1427 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1428#ifdef DITHER1XBPP 1429 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1430 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1431 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1432#endif 1433 WRITERGB15(%%REGb, 8280(%5), %%REGBP) 1434 "pop %%"REG_BP" \n\t" 1435 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1436 1437 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1438 "a" (&c->redDither) 1439 ); 1440 return; 1441 case PIX_FMT_RGB565: 1442 __asm__ volatile( 1443 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1444 "mov %4, %%"REG_b" \n\t" 1445 "push %%"REG_BP" \n\t" 1446 YSCALEYUV2RGB1(%%REGBP, %5) 1447 "pxor %%mm7, %%mm7 \n\t" 1448 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1449#ifdef DITHER1XBPP 1450 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1451 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1452 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1453#endif 1454 1455 WRITERGB16(%%REGb, 8280(%5), %%REGBP) 1456 "pop %%"REG_BP" \n\t" 1457 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1458 1459 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1460 "a" (&c->redDither) 1461 ); 1462 return; 1463 case PIX_FMT_YUYV422: 1464 __asm__ volatile( 1465 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1466 "mov %4, %%"REG_b" \n\t" 1467 "push %%"REG_BP" \n\t" 1468 YSCALEYUV2PACKED1(%%REGBP, %5) 1469 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) 1470 "pop %%"REG_BP" \n\t" 1471 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1472 1473 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1474 "a" (&c->redDither) 1475 ); 1476 return; 1477 } 1478 } else { 1479 switch(dstFormat) { 1480 case PIX_FMT_RGB32: 1481 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { 1482 __asm__ volatile( 1483 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1484 "mov %4, %%"REG_b" \n\t" 1485 "push %%"REG_BP" \n\t" 1486 YSCALEYUV2RGB1b(%%REGBP, %5) 1487 YSCALEYUV2RGB1_ALPHA(%%REGBP) 1488 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, 
%%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1489 "pop %%"REG_BP" \n\t" 1490 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1491 1492 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1493 "a" (&c->redDither) 1494 ); 1495 } else { 1496 __asm__ volatile( 1497 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1498 "mov %4, %%"REG_b" \n\t" 1499 "push %%"REG_BP" \n\t" 1500 YSCALEYUV2RGB1b(%%REGBP, %5) 1501 "pcmpeqd %%mm7, %%mm7 \n\t" 1502 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1503 "pop %%"REG_BP" \n\t" 1504 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1505 1506 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1507 "a" (&c->redDither) 1508 ); 1509 } 1510 return; 1511 case PIX_FMT_BGR24: 1512 __asm__ volatile( 1513 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1514 "mov %4, %%"REG_b" \n\t" 1515 "push %%"REG_BP" \n\t" 1516 YSCALEYUV2RGB1b(%%REGBP, %5) 1517 "pxor %%mm7, %%mm7 \n\t" 1518 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) 1519 "pop %%"REG_BP" \n\t" 1520 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1521 1522 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1523 "a" (&c->redDither) 1524 ); 1525 return; 1526 case PIX_FMT_RGB555: 1527 __asm__ volatile( 1528 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1529 "mov %4, %%"REG_b" \n\t" 1530 "push %%"REG_BP" \n\t" 1531 YSCALEYUV2RGB1b(%%REGBP, %5) 1532 "pxor %%mm7, %%mm7 \n\t" 1533 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1534#ifdef DITHER1XBPP 1535 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1536 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1537 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1538#endif 1539 WRITERGB15(%%REGb, 8280(%5), %%REGBP) 1540 "pop %%"REG_BP" \n\t" 1541 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1542 1543 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1544 "a" (&c->redDither) 1545 ); 1546 return; 1547 case PIX_FMT_RGB565: 1548 __asm__ volatile( 1549 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1550 "mov %4, %%"REG_b" \n\t" 1551 "push %%"REG_BP" \n\t" 
YSCALEYUV2RGB1b(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED1b(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        } /* switch (dstFormat) */
    } /* else: uvalpha >= 2048, average the two chroma lines */
    } /* if (!(flags & SWS_BITEXACT)) */
#endif /* COMPILE_TEMPLATE_MMX */
    /* C fallback; the two macro variants differ only in chroma handling
     * (single line vs. average of uvbuf0/uvbuf1). */
    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}

//FIXME yuy2* can read up to 7 samples too much

/**
 * Extract the luma plane from packed YUYV input.
 * Y samples sit at even byte offsets (src[2*i]); the MMX path masks them
 * out with bm01010101 and packs two quadwords per iteration.
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}

/**
 * Extract the chroma planes from packed YUYV input.
 * U is at byte offset 4*i+1, V at 4*i+3. The MMX path shifts out the odd
 * bytes (UV pairs), then splits them into the two destination planes.
 * Only src1 is read; the API passes a second pointer that must be equal
 * (asserted at the end).
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t" /* mm0 = U V U V ... */
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"       /* mm0 = V samples */
        "pand %%mm4, %%mm1 \n\t"     /* mm1 = U samples */
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}

/**
 * Extract chroma from two interleaved 16-bit little-endian source lines:
 * takes the byte at offset 2*i+1 of each line (the high byte of each
 * 16-bit sample) — src1 feeds dstU, src2 feeds dstV.
 */
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0;
i<width; i++) {
        dstU[i]= src1[2*i + 1];
        dstV[i]= src2[2*i + 1];
    }
#endif
}

/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */

/**
 * Extract the luma plane from packed UYVY input: Y sits at odd byte
 * offsets (src[2*i+1]), so the MMX path shifts instead of masking.
 */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}

/**
 * Extract the chroma planes from packed UYVY input.
 * U is at byte offset 4*i+0, V at 4*i+2 (even bytes), masked out with
 * bm01010101 and split into the two destination planes.
 * Only src1 is read; src2 must be the same pointer (asserted).
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t" /* mm0 = U V U V ... */
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"       /* mm0 = V samples */
        "pand %%mm4, %%mm1 \n\t"     /* mm1 = U samples */
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}

static inline void RENAME(BEToUV)(uint8_t *dstU,
uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "pand %%mm4, %%mm2 \n\t"
        "pand %%mm4, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    /* Takes the byte at offset 2*i of each 16-bit sample (the low-address
     * byte, i.e. the MSB of a big-endian sample — hence "BE"). */
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i];
        dstV[i]= src2[2*i];
    }
#endif
}

/**
 * De-interleave one NV12/NV21-style chroma line: even bytes go to dst1,
 * odd bytes to dst2. Shared worker for nv12ToUV/nv21ToUV below.
 */
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, long width)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"  /* even bytes */
        "pand %%mm4, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"    /* odd bytes */
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "movq %%mm2, (%3, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
#else
    int i;
    for (i = 0; i < width; i++) {
        dst1[i] = src[2*i+0];
        dst2[i] = src[2*i+1];
    }
#endif
}

static inline void RENAME(nv12ToUV)(uint8_t
*dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    /* NV12: interleaved chroma is U,V — U to dstU, V to dstV. */
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}

static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    /* NV21: interleaved chroma is V,U — swap the destinations. */
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}

#if COMPILE_TEMPLATE_MMX
/**
 * Convert packed 24-bit BGR or RGB to luma with MMX.
 * Loads the per-format coefficient pair into %%mm5/%%mm6 first, then runs
 * a shared conversion loop (4 pixels = 12 bytes per iteration).
 * NOTE(review): relies on %%mm5/%%mm6 surviving between the separate
 * __asm__ statements — no clobber list declares them, so no C code may
 * touch MMX state in between.
 */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
        "mov %2, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "movd 6(%0), %%mm2 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm4, %%mm0 \n\t" /* add rounding/offset constant */
        "paddd %%mm4, %%mm2 \n\t"
        "psrad $15, %%mm0 \n\t"
        "psrad $15, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%1, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dst+width), "g" ((x86_reg)-width)
        : "%"REG_a
    );
}

/**
 * Convert packed 24-bit BGR/RGB to the U and V planes with MMX.
 * The coefficient table is passed as memory operand %4
 * (ff_bgr24toUV[...] selected by srcFormat); 4 pixels per iteration.
 */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV,
const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    /* Chroma counterpart of bgr24ToY_mmx: the U and V coefficient table is
     * selected via operand %4 (ff_bgr24toUV indexed by srcFormat), so one
     * asm body handles both BGR24 and RGB24 input. Two pixels per pass,
     * results for U in mm0/mm1 and V in mm2/mm4. */
    __asm__ volatile(
        "movq 24+%4, %%mm6 \n\t"
        "mov %3, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pmaddwd %4, %%mm0 \n\t"
        "pmaddwd 8+%4, %%mm1 \n\t"
        "pmaddwd 16+%4, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"

        "movd 6(%0), %%mm1 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "pmaddwd %4, %%mm1 \n\t"
        "pmaddwd 8+%4, %%mm3 \n\t"
        "pmaddwd 16+%4, %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm5 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm5, %%mm4 \n\t"

        /* Add the rounding/bias offset, scale down and pack to bytes. */
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
        "paddd %%mm3, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm3, %%mm4 \n\t"
        "psrad $15, %%mm0 \n\t"
        "psrad $15, %%mm2 \n\t"
        "psrad $15, %%mm1 \n\t"
        "psrad $15, %%mm4 \n\t"
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm4, %%mm2 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm2, %%mm2 \n\t"
        "movd %%mm0, (%1, %%"REG_a") \n\t"
        "movd %%mm2, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
        : "%"REG_a
    );
}
#endif

/** BGR24 -> luma; MMX path delegates to the shared bgr24ToY_mmx helper. */
static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width;
i++) {
        int b= src[i*3+0];
        int g= src[i*3+1];
        int r= src[i*3+2];

        /* Fixed-point luma; the 33<<(SHIFT-1) term folds in rounding plus
         * the 16-level luma offset (matches the other ToY fallbacks). */
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif /* COMPILE_TEMPLATE_MMX */
}

/** BGR24 -> chroma; src2 is unused (asserted equal to src1 below). */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++) {
        int b= src1[3*i + 0];
        int g= src1[3*i + 1];
        int r= src1[3*i + 2];

        /* 257<<(SHIFT-1): rounding plus the 128 chroma bias. */
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif /* COMPILE_TEMPLATE_MMX */
    assert(src1 == src2);
}

/** Horizontally averaged (2:1) BGR24 -> chroma; sums two adjacent pixels
 *  and divides by 2 via the extra +1 in the final shift. C only. */
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    int i;
    for (i=0; i<width; i++) {
        int b= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int r= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
    }
    assert(src1 == src2);
}

/** RGB24 -> luma; identical to bgr24ToY with the R/B channels swapped. */
static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
#else
    int i;
    for (i=0; i<width; i++) {
        int r= src[i*3+0];
        int g= src[i*3+1];
        int b= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif
}

/** RGB24 -> chroma; R/B swapped counterpart of bgr24ToUV. */
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    assert(src1==src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
#else
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++) {
        int r= src1[3*i + 0];
        int g= src1[3*i + 1];
        int b= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif
}

/** Horizontally averaged (2:1) RGB24 -> chroma; C only, see bgr24ToUV_half. */
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++) {
        int r= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int b= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
    }
}


// bilinear / bicubic scaling
/**
 * Horizontal scaler: for each output sample, applies an FIR filter of
 * filterSize taps read from `filter` at the source position given by
 * `filterPos[i]`, producing 14-ish bit intermediates in dst (val>>7).
 * MMX builds have specialized paths for filterSize 4 and 8 plus a generic
 * two-level loop; otherwise AltiVec or plain C is used.
 */
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
{
#if COMPILE_TEMPLATE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
        /* Process two output samples per iteration; counter runs from
         * -2*dstW up to 0, and the pointers are pre-biased so indexed
         * addressing lands on the right elements. */
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"

            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        /* Same scheme as filterSize==4 but with two pmaddwd pairs per
         * output sample to cover 8 taps. */
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"

            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
            "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm4 \n\t"
            "pmaddwd %%mm2, %%mm5 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "paddd %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"

            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        /* Generic filter size: inner loop (label 2) accumulates 4 taps at
         * a time for two output samples until REG_c reaches `offset`. */
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor %%mm7, %%mm7 \n\t"
            ASMALIGN(4)
            "1: \n\t"
            "mov %2, %%"REG_c" \n\t"
            "movzwl (%%"REG_c", %0), %%eax \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c" \n\t"
            "pxor %%mm4, %%mm4 \n\t"
            "pxor %%mm5, %%mm5 \n\t"
            "2: \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1, %6), %%mm3 \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "paddd %%mm3, %%mm5 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "add $8, %1 \n\t"
            "add $4, %%"REG_c" \n\t"
            "cmp %4, %%"REG_c" \n\t"
            " jb 2b \n\t"
            "add %6, %1 \n\t"
            "movq %%mm4, %%mm0 \n\t"
            "punpckldq %%mm5, %%mm4 \n\t"
            "punpckhdq %%mm5, %%mm0 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "psrad $7, %%mm4 \n\t"
            "packssdw %%mm4, %%mm4 \n\t"
            "mov %3, %%"REG_a" \n\t"
            "movd %%mm4, (%%"REG_a", %0) \n\t"
            "add $4, %0 \n\t"
            " jnc 1b \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#if COMPILE_TEMPLATE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    /* Plain C reference implementation. */
    int i;
    for (i=0; i<dstW; i++) {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++) {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* COMPILE_TEMPLATE_ALTIVEC */
#endif /* COMPILE_MMX */
}

//FIXME all pal and rgb srcFormats could do this convertion as well
//FIXME all scalers more complex than bilinear could do half of this transform
/* Limited-range <-> full-range (JPEG) conversion on the 15-bit intermediate
 * planes; chroma variants touch both the U (dst[i]) and V (dst[i+VOFW])
 * halves of the buffer. */
static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
        dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
    }
}
static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
        dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
    }
}
static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
}
static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = (dst[i]*14071 + 33561947)>>14;
}

/* One bilinear interpolation step: edi=src[xx], esi=src[xx+1], ecx=xalpha;
 * leaves the 7-bit-scaled result in esi and reloads REG_D from operand %1. */
#define FAST_BILINEAR_X86 \
    "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
    "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
    "shll $16, %%edi \n\t" \
    "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
    "mov %1, %%"REG_D"\n\t" \
    "shrl $9, %%esi \n\t" \

/**
 * Fast bilinear horizontal scaling of one luma line.
 * On MMX2 it dispatches into runtime-generated scaler code
 * (c->lumMmx2FilterCode) via CALL_MMX2_FILTER_CODE; otherwise a generic
 * x86 asm loop or plain C performs the per-pixel interpolation.
 */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
#if ARCH_X86
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    int canMMX2BeUsed = c->canMMX2BeUsed;
    void *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);   // EBX is reserved under PIC; spill it
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov %%"REG_b", %5 \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "mov %0, %%"REG_c" \n\t"
            "mov %1, %%"REG_D" \n\t"
            "mov %2, %%"REG_d" \n\t"
            "mov %3, %%"REG_b" \n\t"
            "xor %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

#if ARCH_X86_64

/* Invoke one chunk of generated filter code, then advance src (REG_c) and
 * dst (REG_D) by the increments the filterPos table provides. */
#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
            "add %%"REG_S", %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#else

#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
               "m" (mmx2FilterCode)
#if defined(PIC)
              ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
        );
        /* The generated code over-reads near the right edge; patch the
         * trailing destination samples with the last source pixel. */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = xInc >> 16;
    uint16_t xInc_mask = xInc & 0xffff;
    //NO MMX just normal asm ...
    /* 16.16 fixed-point DDA: xalpha accumulates in cx, the carry of
     * "addw" bumps the integer source index via "adc". Two output
     * samples per loop iteration. */
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "xor %%"REG_d", %%"REG_d" \n\t" // xx
        "xorl %%ecx, %%ecx \n\t" // xalpha
        ASMALIGN(4)
        "1: \n\t"
        "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
        "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
        "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry

        "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
        "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
        "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry


        "add $2, %%"REG_a" \n\t"
        "cmp %2, %%"REG_a" \n\t"
        " jb 1b \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    /* Portable C fallback of the same 16.16 fixed-point interpolation. */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}

// *** horizontal scale Y line to temp buffer
/**
 * Horizontally scale one luma (or alpha, when isAlpha) line: optional
 * input-format conversion into formatConvBuffer, then either the generic
 * hScale or the fast-bilinear path, then optional range conversion.
 */
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
                                   const int16_t *hLumFilter,
                                   const int16_t *hLumFilterPos, int hLumFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal, int isAlpha)
{
    void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
    void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;

    src += isAlpha ?
c->alpSrcOffset : c->lumSrcOffset;

    if (toYV12) {
        toYV12(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }

    if (!c->hyscale_fast) {
        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    } else { // fast bilinear upscale / crap downscale
        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
    }

    if (convertRange)
        convertRange(dst, dstWidth);
}

/**
 * Fast bilinear horizontal scaling of one chroma line pair (U then V).
 * U results land at dst[0..], V results at dst[VOFW..]. Mirrors
 * hyscale_fast but runs the generated MMX2 filter code twice, once per
 * plane, and patches the right edge of both planes afterwards.
 */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
#if ARCH_X86
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    int canMMX2BeUsed = c->canMMX2BeUsed;
    void *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);   // EBX is reserved under PIC; spill it
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov %%"REG_b", %6 \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "mov %0, %%"REG_c" \n\t"
            "mov %1, %%"REG_D" \n\t"
            "mov %2, %%"REG_d" \n\t"
            "mov %3, %%"REG_b" \n\t"
            "xor %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            /* Second pass: switch source to src2 and destination to the
             * V half of the buffer (dst + VOF). */
            "xor %%"REG_a", %%"REG_a" \n\t" // i
            "mov %5, %%"REG_c" \n\t" // src
            "mov %1, %%"REG_D" \n\t" // buf1
            "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %6, %%"REG_b" \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
               "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
              ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
        );
        /* Patch right-edge samples of both chroma planes. */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
            //printf("%d %d %d\n", dstWidth, i, srcW);
            dst[i] = src1[srcW-1]*128;
            dst[i+VOFW] = src2[srcW-1]*128;
        }
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
    uint16_t xInc_mask = xInc & 0xffff;
    /* Generic asm: one U and one V sample per iteration, sharing the
     * same 16.16 fixed-point position (carry-driven via adc). */
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "xor %%"REG_d", %%"REG_d" \n\t" // xx
        "xorl %%ecx, %%ecx \n\t" // xalpha
        ASMALIGN(4)
        "1: \n\t"
        "mov %0, %%"REG_S" \n\t"
        "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"

        "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"

        "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
        "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
        "add $1, %%"REG_a" \n\t"
        "cmp %2, %%"REG_a" \n\t"
        " jb 1b \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
which is needed to support GCC 4.0. */
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
        :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
        :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
        "r" (src2)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    /* Portable C fallback for both chroma planes. */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
        /* slower
        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
        dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
        */
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}

/**
 * Horizontally scale one chroma line pair: optional format conversion of
 * both planes into formatConvBuffer (U at offset 0, V at VOFW), then the
 * generic hScale per plane or the fast-bilinear path, then optional range
 * conversion.
 */
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
                                   int srcW, int xInc, const int16_t *hChrFilter,
                                   const int16_t *hChrFilterPos, int hChrFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal)
{

    src1 += c->chrSrcOffset;
    src2 += c->chrSrcOffset;

    if (c->chrToYV12) {
        c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }

    if (!c->hcscale_fast) {
        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    } else { // fast bilinear upscale / crap downscale
        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
    }

    if (c->chrConvertRange)
        c->chrConvertRange(dst, dstWidth);
}

#define DEBUG_SWSCALE_BUFFERS 0
#define DEBUG_BUFFERS(...) \
if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__) 2553 2554static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY, 2555 int srcSliceH, uint8_t* dst[], int dstStride[]) 2556{ 2557 /* load a few things into local vars to make the code more readable? and faster */ 2558 const int srcW= c->srcW; 2559 const int dstW= c->dstW; 2560 const int dstH= c->dstH; 2561 const int chrDstW= c->chrDstW; 2562 const int chrSrcW= c->chrSrcW; 2563 const int lumXInc= c->lumXInc; 2564 const int chrXInc= c->chrXInc; 2565 const enum PixelFormat dstFormat= c->dstFormat; 2566 const int flags= c->flags; 2567 int16_t *vLumFilterPos= c->vLumFilterPos; 2568 int16_t *vChrFilterPos= c->vChrFilterPos; 2569 int16_t *hLumFilterPos= c->hLumFilterPos; 2570 int16_t *hChrFilterPos= c->hChrFilterPos; 2571 int16_t *vLumFilter= c->vLumFilter; 2572 int16_t *vChrFilter= c->vChrFilter; 2573 int16_t *hLumFilter= c->hLumFilter; 2574 int16_t *hChrFilter= c->hChrFilter; 2575 int32_t *lumMmxFilter= c->lumMmxFilter; 2576 int32_t *chrMmxFilter= c->chrMmxFilter; 2577 int32_t av_unused *alpMmxFilter= c->alpMmxFilter; 2578 const int vLumFilterSize= c->vLumFilterSize; 2579 const int vChrFilterSize= c->vChrFilterSize; 2580 const int hLumFilterSize= c->hLumFilterSize; 2581 const int hChrFilterSize= c->hChrFilterSize; 2582 int16_t **lumPixBuf= c->lumPixBuf; 2583 int16_t **chrPixBuf= c->chrPixBuf; 2584 int16_t **alpPixBuf= c->alpPixBuf; 2585 const int vLumBufSize= c->vLumBufSize; 2586 const int vChrBufSize= c->vChrBufSize; 2587 uint8_t *formatConvBuffer= c->formatConvBuffer; 2588 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample; 2589 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); 2590 int lastDstY; 2591 uint32_t *pal=c->pal_yuv; 2592 2593 /* vars which will change and which we need to store back in the context */ 2594 int dstY= c->dstY; 2595 int lumBufIndex= c->lumBufIndex; 2596 int chrBufIndex= c->chrBufIndex; 2597 int lastInLumBuf= 
c->lastInLumBuf; 2598 int lastInChrBuf= c->lastInChrBuf; 2599 2600 if (isPacked(c->srcFormat)) { 2601 src[0]= 2602 src[1]= 2603 src[2]= 2604 src[3]= src[0]; 2605 srcStride[0]= 2606 srcStride[1]= 2607 srcStride[2]= 2608 srcStride[3]= srcStride[0]; 2609 } 2610 srcStride[1]<<= c->vChrDrop; 2611 srcStride[2]<<= c->vChrDrop; 2612 2613 DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n", 2614 src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3], 2615 dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]); 2616 DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n", 2617 srcSliceY, srcSliceH, dstY, dstH); 2618 DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n", 2619 vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize); 2620 2621 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) { 2622 static int warnedAlready=0; //FIXME move this into the context perhaps 2623 if (flags & SWS_PRINT_INFO && !warnedAlready) { 2624 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n" 2625 " ->cannot do aligned memory accesses anymore\n"); 2626 warnedAlready=1; 2627 } 2628 } 2629 2630 /* Note the user might start scaling the picture in the middle so this 2631 will not get executed. This is not really intended but works 2632 currently, so people might do it. */ 2633 if (srcSliceY ==0) { 2634 lumBufIndex=-1; 2635 chrBufIndex=-1; 2636 dstY=0; 2637 lastInLumBuf= -1; 2638 lastInChrBuf= -1; 2639 } 2640 2641 lastDstY= dstY; 2642 2643 for (;dstY < dstH; dstY++) { 2644 unsigned char *dest =dst[0]+dstStride[0]*dstY; 2645 const int chrDstY= dstY>>c->chrDstVSubSample; 2646 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY; 2647 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY; 2648 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? 
dst[3]+dstStride[3]*dstY : NULL; 2649 2650 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input 2651 const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)]; 2652 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input 2653 int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input 2654 int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input 2655 int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input 2656 int enough_lines; 2657 2658 //handle holes (FAST_BILINEAR & weird filters) 2659 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; 2660 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; 2661 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1); 2662 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1); 2663 2664 DEBUG_BUFFERS("dstY: %d\n", dstY); 2665 DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n", 2666 firstLumSrcY, lastLumSrcY, lastInLumBuf); 2667 DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n", 2668 firstChrSrcY, lastChrSrcY, lastInChrBuf); 2669 2670 // Do we have enough lines in this slice to output the dstY line 2671 enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample); 2672 2673 if (!enough_lines) { 2674 lastLumSrcY = srcSliceY + srcSliceH - 1; 2675 lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1; 2676 DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n", 2677 lastLumSrcY, lastChrSrcY); 2678 } 2679 2680 //Do horizontal scaling 2681 while(lastInLumBuf < lastLumSrcY) { 2682 const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; 2683 const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3]; 2684 lumBufIndex++; 2685 assert(lumBufIndex < 2*vLumBufSize); 2686 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH); 2687 assert(lastInLumBuf + 1 - 
srcSliceY >= 0); 2688 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc, 2689 hLumFilter, hLumFilterPos, hLumFilterSize, 2690 formatConvBuffer, 2691 pal, 0); 2692 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) 2693 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc, 2694 hLumFilter, hLumFilterPos, hLumFilterSize, 2695 formatConvBuffer, 2696 pal, 1); 2697 lastInLumBuf++; 2698 DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n", 2699 lumBufIndex, lastInLumBuf); 2700 } 2701 while(lastInChrBuf < lastChrSrcY) { 2702 const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; 2703 const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; 2704 chrBufIndex++; 2705 assert(chrBufIndex < 2*vChrBufSize); 2706 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)); 2707 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0); 2708 //FIXME replace parameters through context struct (some at least) 2709 2710 if (c->needs_hcscale) 2711 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, 2712 hChrFilter, hChrFilterPos, hChrFilterSize, 2713 formatConvBuffer, 2714 pal); 2715 lastInChrBuf++; 2716 DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n", 2717 chrBufIndex, lastInChrBuf); 2718 } 2719 //wrap buf index around to stay inside the ring buffer 2720 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize; 2721 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize; 2722 if (!enough_lines) 2723 break; //we can't output a dstY line so let's try with the next slice 2724 2725#if COMPILE_TEMPLATE_MMX 2726 c->blueDither= ff_dither8[dstY&1]; 2727 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555) 2728 c->greenDither= ff_dither8[dstY&1]; 2729 else 2730 c->greenDither= ff_dither4[dstY&1]; 2731 c->redDither= ff_dither8[(dstY+1)&1]; 2732#endif 2733 if (dstY < dstH-2) { 2734 const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + 
vLumBufSize; 2735 const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; 2736 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; 2737#if COMPILE_TEMPLATE_MMX 2738 int i; 2739 if (flags & SWS_ACCURATE_RND) { 2740 int s= APCK_SIZE / 8; 2741 for (i=0; i<vLumFilterSize; i+=2) { 2742 *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ]; 2743 *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)]; 2744 lumMmxFilter[s*i+APCK_COEF/4 ]= 2745 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ] 2746 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0); 2747 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { 2748 *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ]; 2749 *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)]; 2750 alpMmxFilter[s*i+APCK_COEF/4 ]= 2751 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ]; 2752 } 2753 } 2754 for (i=0; i<vChrFilterSize; i+=2) { 2755 *(const void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ]; 2756 *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)]; 2757 chrMmxFilter[s*i+APCK_COEF/4 ]= 2758 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ] 2759 + (vChrFilterSize>1 ? 
vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0); 2760 } 2761 } else { 2762 for (i=0; i<vLumFilterSize; i++) { 2763 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i]; 2764 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32; 2765 lumMmxFilter[4*i+2]= 2766 lumMmxFilter[4*i+3]= 2767 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; 2768 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { 2769 alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i]; 2770 alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32; 2771 alpMmxFilter[4*i+2]= 2772 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2]; 2773 } 2774 } 2775 for (i=0; i<vChrFilterSize; i++) { 2776 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i]; 2777 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32; 2778 chrMmxFilter[4*i+2]= 2779 chrMmxFilter[4*i+3]= 2780 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; 2781 } 2782 } 2783#endif 2784 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) { 2785 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; 2786 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi 2787 c->yuv2nv12X(c, 2788 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 2789 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2790 dest, uDest, dstW, chrDstW, dstFormat); 2791 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like 2792 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; 2793 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi 2794 if (is16BPS(dstFormat)) { 2795 yuv2yuvX16inC( 2796 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 2797 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2798 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW, 2799 dstFormat); 2800 } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12 2801 const int16_t *lumBuf = lumSrcPtr[0]; 2802 const int16_t *chrBuf= chrSrcPtr[0]; 2803 const 
int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL; 2804 c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW); 2805 } else { //General YV12 2806 c->yuv2yuvX(c, 2807 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 2808 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2809 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW); 2810 } 2811 } else { 2812 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); 2813 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); 2814 if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB 2815 int chrAlpha= vChrFilter[2*dstY+1]; 2816 if(flags & SWS_FULL_CHR_H_INT) { 2817 yuv2rgbXinC_full(c, //FIXME write a packed1_full function 2818 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 2819 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2820 alpSrcPtr, dest, dstW, dstY); 2821 } else { 2822 c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), 2823 alpPixBuf ? *alpSrcPtr : NULL, 2824 dest, dstW, chrAlpha, dstFormat, flags, dstY); 2825 } 2826 } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB 2827 int lumAlpha= vLumFilter[2*dstY+1]; 2828 int chrAlpha= vChrFilter[2*dstY+1]; 2829 lumMmxFilter[2]= 2830 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001; 2831 chrMmxFilter[2]= 2832 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001; 2833 if(flags & SWS_FULL_CHR_H_INT) { 2834 yuv2rgbXinC_full(c, //FIXME write a packed2_full function 2835 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 2836 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2837 alpSrcPtr, dest, dstW, dstY); 2838 } else { 2839 c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), 2840 alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? 
*(alpSrcPtr+1) : NULL, 2841 dest, dstW, lumAlpha, chrAlpha, dstY); 2842 } 2843 } else { //general RGB 2844 if(flags & SWS_FULL_CHR_H_INT) { 2845 yuv2rgbXinC_full(c, 2846 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 2847 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2848 alpSrcPtr, dest, dstW, dstY); 2849 } else { 2850 c->yuv2packedX(c, 2851 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 2852 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2853 alpSrcPtr, dest, dstW, dstY); 2854 } 2855 } 2856 } 2857 } else { // hmm looks like we can't use MMX here without overwriting this array's tail 2858 const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; 2859 const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; 2860 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; 2861 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) { 2862 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; 2863 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi 2864 yuv2nv12XinC( 2865 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 2866 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2867 dest, uDest, dstW, chrDstW, dstFormat); 2868 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 2869 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; 2870 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi 2871 if (is16BPS(dstFormat)) { 2872 yuv2yuvX16inC( 2873 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 2874 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2875 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW, 2876 dstFormat); 
2877 } else { 2878 yuv2yuvXinC( 2879 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 2880 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2881 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW); 2882 } 2883 } else { 2884 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); 2885 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); 2886 if(flags & SWS_FULL_CHR_H_INT) { 2887 yuv2rgbXinC_full(c, 2888 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 2889 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2890 alpSrcPtr, dest, dstW, dstY); 2891 } else { 2892 yuv2packedXinC(c, 2893 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 2894 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2895 alpSrcPtr, dest, dstW, dstY); 2896 } 2897 } 2898 } 2899 } 2900 2901 if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf) 2902 fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255); 2903 2904#if COMPILE_TEMPLATE_MMX 2905 if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory"); 2906 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. 
*/
    if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
    else                            __asm__ volatile("emms"  :::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    /* how far dstY advanced during this call -- presumably the number of
     * output lines produced; confirm against callers of swScale */
    return dstY - lastDstY;
}

/**
 * Set up the per-format function pointers of the SwsContext for this
 * template instantiation.
 *
 * This file is compiled several times with different COMPILE_TEMPLATE_*
 * macros defined (plain C, MMX, MMX2, 3DNow -- see the PREFETCH/PAVGB
 * selection at the top of the file); RENAME() expands each symbol to the
 * variant belonging to this particular build, so the pointers stored here
 * pick the matching implementation for the current CPU capability set.
 */
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;

    /* vertical scale + output stage: planar YUV, NV12/NV21 and packed
     * (1-tap, 2-tap and general X-tap) variants */
    c->yuv2nv12X    = RENAME(yuv2nv12X   );
    c->yuv2yuv1     = RENAME(yuv2yuv1    );
    c->yuv2yuvX     = RENAME(yuv2yuvX    );
    c->yuv2packed1  = RENAME(yuv2packed1 );
    c->yuv2packed2  = RENAME(yuv2packed2 );
    c->yuv2packedX  = RENAME(yuv2packedX );

    /* general horizontal scaler */
    c->hScale       = RENAME(hScale      );

#if COMPILE_TEMPLATE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
#else
    if (c->flags & SWS_FAST_BILINEAR)
#endif
    {
        /* fast bilinear horizontal scalers for luma and chroma */
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    }

    /* input unpacking: convert one line of source chroma to the internal
     * planar representation; NULL means no conversion needed/available */
    c->chrToYV12 = NULL;
    switch(srcFormat) {
        case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
        case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
        case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
        case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
        case PIX_FMT_RGB8     :
        case PIX_FMT_BGR8     :
        case PIX_FMT_PAL8     :
        case PIX_FMT_BGR4_BYTE:
        case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
        case PIX_FMT_YUV420P16BE:
        case PIX_FMT_YUV422P16BE:
        case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
        case PIX_FMT_YUV420P16LE:
        case PIX_FMT_YUV422P16LE:
        case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
    }
    if (c->chrSrcHSubSample) {
        /* chroma is horizontally subsampled: use the *_half RGB->UV
         * readers (presumably they combine two adjacent pixels -- confirm
         * in their definitions).
         * NOTE(review): the RGB<->BGR crossover below (e.g. PIX_FMT_RGB32
         * -> bgr32ToUV_half) mirrors the non-half table and looks
         * intentional (component-order naming); do not "fix" it blindly. */
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV_half; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV_half; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
        }
    } else {
        /* no horizontal chroma subsampling: full-resolution RGB->UV readers */
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
        }
    }

    /* input unpacking: luma and (optional) alpha extraction */
    c->lumToYV12 = NULL;
    c->alpToYV12 = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUYV422  :
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
    case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422  :
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
    case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY; break;
    case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY; break;
    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
    case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY; break;
    case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY; break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
    case PIX_FMT_RGB32  :
    case PIX_FMT_RGB32_1: c->lumToYV12 = bgr32ToY; break;
    case PIX_FMT_BGR32  :
    case PIX_FMT_BGR32_1: c->lumToYV12 = rgb32ToY; break;
    case PIX_FMT_RGB48BE:
    case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
    }
    if (c->alpPixBuf) {
        /* alpha plane requested: only the 32-bit RGB/BGR layouts carry alpha */
        switch (srcFormat) {
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
        }
    }

    /* per-format byte offsets applied when reading source pixels
     * (ALT32_CORR is defined elsewhere in this file -- see its definition
     * for the _1 layouts' correction) */
    switch (srcFormat) {
    case PIX_FMT_RGB32  :
    case PIX_FMT_BGR32  :
        c->alpSrcOffset = 3;
        break;
    case PIX_FMT_RGB32_1:
    case PIX_FMT_BGR32_1:
        c->lumSrcOffset = ALT32_CORR;
        c->chrSrcOffset = ALT32_CORR;
        break;
    case PIX_FMT_RGB48LE:
        c->lumSrcOffset = 1;
        c->chrSrcOffset = 1;
        c->alpSrcOffset = 1;
        break;
    }

    /* YUV range conversion (names suggest JPEG/full-range <-> limited
     * range); only needed when src and dst ranges differ and the output
     * is not RGB */
    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
        if (c->srcRange) {
            c->lumConvertRange = RENAME(lumRangeFromJpeg);
            c->chrConvertRange = RENAME(chrRangeFromJpeg);
        } else {
            c->lumConvertRange = RENAME(lumRangeToJpeg);
            c->chrConvertRange = RENAME(chrRangeToJpeg);
        }
    }

    /* horizontal chroma scaling is needed unless either end is gray or
     * the source is 1-bit monochrome */
    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
          srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
        c->needs_hcscale = 1;
}