/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

/* This template is included once per CPU-capability variant (different
 * HAVE_* combinations), so undo any definitions left by a previous pass
 * before selecting the instruction strings for this one. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS

#undef SFENCE

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Prefetch mnemonics: 3DNow! has its own prefetch/prefetchw pair; MMX2
 * (SSE) provides prefetchnta/prefetcht0; otherwise expand to a comment
 * so the asm strings stay valid with no prefetching. */
#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

/* sfence orders the non-temporal MOVNTQ stores below; it is only needed
 * (and only available) when MMX2 movntq is used. */
#if HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif

/* Packed byte average: pavgb on MMX2, pavgusb on 3DNow!.
 * Intentionally left undefined on plain MMX. */
#if HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif HAVE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* MOVNTQ: cache-bypassing (non-temporal) store when MMX2 is available,
 * plain movq otherwise. The REAL_/wrapper pair forces macro-argument
 * expansion before stringification. */
#if HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

#if HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

/* Vertical scaling of one plane: walks the filter list at "offset" inside
 * the context (%0 = &c->redDither base), multiply-accumulates each tap
 * with pmulhw, then rounds/packs 8 output pixels per iteration into dest.
 * The filter list is terminated by a NULL source pointer (tested in the
 * inner loop). */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) /* FIXME Unroll?
*/\ 81 "1: \n\t"\ 82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ 83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\ 84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\ 85 "add $16, %%"REG_d" \n\t"\ 86 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 87 "test %%"REG_S", %%"REG_S" \n\t"\ 88 "pmulhw %%mm0, %%mm2 \n\t"\ 89 "pmulhw %%mm0, %%mm5 \n\t"\ 90 "paddw %%mm2, %%mm3 \n\t"\ 91 "paddw %%mm5, %%mm4 \n\t"\ 92 " jnz 1b \n\t"\ 93 "psraw $3, %%mm3 \n\t"\ 94 "psraw $3, %%mm4 \n\t"\ 95 "packuswb %%mm4, %%mm3 \n\t"\ 96 MOVNTQ(%%mm3, (%1, %%REGa))\ 97 "add $8, %%"REG_a" \n\t"\ 98 "cmp %2, %%"REG_a" \n\t"\ 99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ 100 "movq %%mm3, %%mm4 \n\t"\ 101 "lea " offset "(%0), %%"REG_d" \n\t"\ 102 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 103 "jb 1b \n\t"\ 104 :: "r" (&c->redDither),\ 105 "r" (dest), "g" (width)\ 106 : "%"REG_a, "%"REG_d, "%"REG_S\ 107 ); 108 109#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \ 110 __asm__ volatile(\ 111 "lea " offset "(%0), %%"REG_d" \n\t"\ 112 "xor %%"REG_a", %%"REG_a" \n\t"\ 113 "pxor %%mm4, %%mm4 \n\t"\ 114 "pxor %%mm5, %%mm5 \n\t"\ 115 "pxor %%mm6, %%mm6 \n\t"\ 116 "pxor %%mm7, %%mm7 \n\t"\ 117 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 118 ASMALIGN(4) \ 119 "1: \n\t"\ 120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\ 121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\ 122 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\ 124 "movq %%mm0, %%mm3 \n\t"\ 125 "punpcklwd %%mm1, %%mm0 \n\t"\ 126 "punpckhwd %%mm1, %%mm3 \n\t"\ 127 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ 128 "pmaddwd %%mm1, %%mm0 \n\t"\ 129 "pmaddwd %%mm1, %%mm3 \n\t"\ 130 "paddd %%mm0, %%mm4 \n\t"\ 131 "paddd %%mm3, %%mm5 \n\t"\ 132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\ 133 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 134 "add $"STR(APCK_SIZE)", %%"REG_d" 
\n\t"\ 135 "test %%"REG_S", %%"REG_S" \n\t"\ 136 "movq %%mm2, %%mm0 \n\t"\ 137 "punpcklwd %%mm3, %%mm2 \n\t"\ 138 "punpckhwd %%mm3, %%mm0 \n\t"\ 139 "pmaddwd %%mm1, %%mm2 \n\t"\ 140 "pmaddwd %%mm1, %%mm0 \n\t"\ 141 "paddd %%mm2, %%mm6 \n\t"\ 142 "paddd %%mm0, %%mm7 \n\t"\ 143 " jnz 1b \n\t"\ 144 "psrad $16, %%mm4 \n\t"\ 145 "psrad $16, %%mm5 \n\t"\ 146 "psrad $16, %%mm6 \n\t"\ 147 "psrad $16, %%mm7 \n\t"\ 148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 149 "packssdw %%mm5, %%mm4 \n\t"\ 150 "packssdw %%mm7, %%mm6 \n\t"\ 151 "paddw %%mm0, %%mm4 \n\t"\ 152 "paddw %%mm0, %%mm6 \n\t"\ 153 "psraw $3, %%mm4 \n\t"\ 154 "psraw $3, %%mm6 \n\t"\ 155 "packuswb %%mm6, %%mm4 \n\t"\ 156 MOVNTQ(%%mm4, (%1, %%REGa))\ 157 "add $8, %%"REG_a" \n\t"\ 158 "cmp %2, %%"REG_a" \n\t"\ 159 "lea " offset "(%0), %%"REG_d" \n\t"\ 160 "pxor %%mm4, %%mm4 \n\t"\ 161 "pxor %%mm5, %%mm5 \n\t"\ 162 "pxor %%mm6, %%mm6 \n\t"\ 163 "pxor %%mm7, %%mm7 \n\t"\ 164 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 165 "jb 1b \n\t"\ 166 :: "r" (&c->redDither),\ 167 "r" (dest), "g" (width)\ 168 : "%"REG_a, "%"REG_d, "%"REG_S\ 169 ); 170 171#define YSCALEYUV2YV121 \ 172 "mov %2, %%"REG_a" \n\t"\ 173 ASMALIGN(4) /* FIXME Unroll? */\ 174 "1: \n\t"\ 175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\ 176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\ 177 "psraw $7, %%mm0 \n\t"\ 178 "psraw $7, %%mm1 \n\t"\ 179 "packuswb %%mm1, %%mm0 \n\t"\ 180 MOVNTQ(%%mm0, (%1, %%REGa))\ 181 "add $8, %%"REG_a" \n\t"\ 182 "jnc 1b \n\t" 183 184#define YSCALEYUV2YV121_ACCURATE \ 185 "mov %2, %%"REG_a" \n\t"\ 186 "pcmpeqw %%mm7, %%mm7 \n\t"\ 187 "psrlw $15, %%mm7 \n\t"\ 188 "psllw $6, %%mm7 \n\t"\ 189 ASMALIGN(4) /* FIXME Unroll? 
*/\ 190 "1: \n\t"\ 191 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\ 192 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\ 193 "paddsw %%mm7, %%mm0 \n\t"\ 194 "paddsw %%mm7, %%mm1 \n\t"\ 195 "psraw $7, %%mm0 \n\t"\ 196 "psraw $7, %%mm1 \n\t"\ 197 "packuswb %%mm1, %%mm0 \n\t"\ 198 MOVNTQ(%%mm0, (%1, %%REGa))\ 199 "add $8, %%"REG_a" \n\t"\ 200 "jnc 1b \n\t" 201 202/* 203 :: "m" (-lumFilterSize), "m" (-chrFilterSize), 204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), 205 "r" (dest), "m" (dstW), 206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) 207 : "%eax", "%ebx", "%ecx", "%edx", "%esi" 208*/ 209#define YSCALEYUV2PACKEDX_UV \ 210 __asm__ volatile(\ 211 "xor %%"REG_a", %%"REG_a" \n\t"\ 212 ASMALIGN(4)\ 213 "nop \n\t"\ 214 "1: \n\t"\ 215 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ 216 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 217 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ 218 "movq %%mm3, %%mm4 \n\t"\ 219 ASMALIGN(4)\ 220 "2: \n\t"\ 221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ 222 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\ 223 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\ 224 "add $16, %%"REG_d" \n\t"\ 225 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 226 "pmulhw %%mm0, %%mm2 \n\t"\ 227 "pmulhw %%mm0, %%mm5 \n\t"\ 228 "paddw %%mm2, %%mm3 \n\t"\ 229 "paddw %%mm5, %%mm4 \n\t"\ 230 "test %%"REG_S", %%"REG_S" \n\t"\ 231 " jnz 2b \n\t"\ 232 233#define YSCALEYUV2PACKEDX_YA(offset) \ 234 "lea "offset"(%0), %%"REG_d" \n\t"\ 235 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 236 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\ 237 "movq %%mm1, %%mm7 \n\t"\ 238 ASMALIGN(4)\ 239 "2: \n\t"\ 240 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ 241 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\ 242 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\ 243 "add $16, %%"REG_d" \n\t"\ 244 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 245 "pmulhw %%mm0, %%mm2 \n\t"\ 246 "pmulhw %%mm0, %%mm5 \n\t"\ 247 "paddw %%mm2, %%mm1 \n\t"\ 
248 "paddw %%mm5, %%mm7 \n\t"\ 249 "test %%"REG_S", %%"REG_S" \n\t"\ 250 " jnz 2b \n\t"\ 251 252#define YSCALEYUV2PACKEDX \ 253 YSCALEYUV2PACKEDX_UV \ 254 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET) \ 255 256#define YSCALEYUV2PACKEDX_END \ 257 :: "r" (&c->redDither), \ 258 "m" (dummy), "m" (dummy), "m" (dummy),\ 259 "r" (dest), "m" (dstW) \ 260 : "%"REG_a, "%"REG_d, "%"REG_S \ 261 ); 262 263#define YSCALEYUV2PACKEDX_ACCURATE_UV \ 264 __asm__ volatile(\ 265 "xor %%"REG_a", %%"REG_a" \n\t"\ 266 ASMALIGN(4)\ 267 "nop \n\t"\ 268 "1: \n\t"\ 269 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ 270 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 271 "pxor %%mm4, %%mm4 \n\t"\ 272 "pxor %%mm5, %%mm5 \n\t"\ 273 "pxor %%mm6, %%mm6 \n\t"\ 274 "pxor %%mm7, %%mm7 \n\t"\ 275 ASMALIGN(4)\ 276 "2: \n\t"\ 277 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ 278 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ 279 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 280 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\ 281 "movq %%mm0, %%mm3 \n\t"\ 282 "punpcklwd %%mm1, %%mm0 \n\t"\ 283 "punpckhwd %%mm1, %%mm3 \n\t"\ 284 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\ 285 "pmaddwd %%mm1, %%mm0 \n\t"\ 286 "pmaddwd %%mm1, %%mm3 \n\t"\ 287 "paddd %%mm0, %%mm4 \n\t"\ 288 "paddd %%mm3, %%mm5 \n\t"\ 289 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\ 290 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 291 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ 292 "test %%"REG_S", %%"REG_S" \n\t"\ 293 "movq %%mm2, %%mm0 \n\t"\ 294 "punpcklwd %%mm3, %%mm2 \n\t"\ 295 "punpckhwd %%mm3, %%mm0 \n\t"\ 296 "pmaddwd %%mm1, %%mm2 \n\t"\ 297 "pmaddwd %%mm1, %%mm0 \n\t"\ 298 "paddd %%mm2, %%mm6 \n\t"\ 299 "paddd %%mm0, %%mm7 \n\t"\ 300 " jnz 2b \n\t"\ 301 "psrad $16, %%mm4 \n\t"\ 302 "psrad $16, %%mm5 \n\t"\ 303 "psrad $16, %%mm6 \n\t"\ 304 "psrad $16, %%mm7 \n\t"\ 305 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 306 "packssdw %%mm5, 
%%mm4 \n\t"\ 307 "packssdw %%mm7, %%mm6 \n\t"\ 308 "paddw %%mm0, %%mm4 \n\t"\ 309 "paddw %%mm0, %%mm6 \n\t"\ 310 "movq %%mm4, "U_TEMP"(%0) \n\t"\ 311 "movq %%mm6, "V_TEMP"(%0) \n\t"\ 312 313#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \ 314 "lea "offset"(%0), %%"REG_d" \n\t"\ 315 "mov (%%"REG_d"), %%"REG_S" \n\t"\ 316 "pxor %%mm1, %%mm1 \n\t"\ 317 "pxor %%mm5, %%mm5 \n\t"\ 318 "pxor %%mm7, %%mm7 \n\t"\ 319 "pxor %%mm6, %%mm6 \n\t"\ 320 ASMALIGN(4)\ 321 "2: \n\t"\ 322 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ 323 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ 324 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ 325 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ 326 "movq %%mm0, %%mm3 \n\t"\ 327 "punpcklwd %%mm4, %%mm0 \n\t"\ 328 "punpckhwd %%mm4, %%mm3 \n\t"\ 329 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\ 330 "pmaddwd %%mm4, %%mm0 \n\t"\ 331 "pmaddwd %%mm4, %%mm3 \n\t"\ 332 "paddd %%mm0, %%mm1 \n\t"\ 333 "paddd %%mm3, %%mm5 \n\t"\ 334 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\ 335 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ 336 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ 337 "test %%"REG_S", %%"REG_S" \n\t"\ 338 "movq %%mm2, %%mm0 \n\t"\ 339 "punpcklwd %%mm3, %%mm2 \n\t"\ 340 "punpckhwd %%mm3, %%mm0 \n\t"\ 341 "pmaddwd %%mm4, %%mm2 \n\t"\ 342 "pmaddwd %%mm4, %%mm0 \n\t"\ 343 "paddd %%mm2, %%mm7 \n\t"\ 344 "paddd %%mm0, %%mm6 \n\t"\ 345 " jnz 2b \n\t"\ 346 "psrad $16, %%mm1 \n\t"\ 347 "psrad $16, %%mm5 \n\t"\ 348 "psrad $16, %%mm7 \n\t"\ 349 "psrad $16, %%mm6 \n\t"\ 350 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ 351 "packssdw %%mm5, %%mm1 \n\t"\ 352 "packssdw %%mm6, %%mm7 \n\t"\ 353 "paddw %%mm0, %%mm1 \n\t"\ 354 "paddw %%mm0, %%mm7 \n\t"\ 355 "movq "U_TEMP"(%0), %%mm3 \n\t"\ 356 "movq "V_TEMP"(%0), %%mm4 \n\t"\ 357 358#define YSCALEYUV2PACKEDX_ACCURATE \ 359 YSCALEYUV2PACKEDX_ACCURATE_UV \ 360 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET) 361 362#define 
YSCALEYUV2RGBX \ 363 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ 364 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ 365 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 366 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 367 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ 368 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ 369/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 370 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ 371 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ 372 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ 373 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ 374 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ 375 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ 376/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 377 "paddw %%mm3, %%mm4 \n\t"\ 378 "movq %%mm2, %%mm0 \n\t"\ 379 "movq %%mm5, %%mm6 \n\t"\ 380 "movq %%mm4, %%mm3 \n\t"\ 381 "punpcklwd %%mm2, %%mm2 \n\t"\ 382 "punpcklwd %%mm5, %%mm5 \n\t"\ 383 "punpcklwd %%mm4, %%mm4 \n\t"\ 384 "paddw %%mm1, %%mm2 \n\t"\ 385 "paddw %%mm1, %%mm5 \n\t"\ 386 "paddw %%mm1, %%mm4 \n\t"\ 387 "punpckhwd %%mm0, %%mm0 \n\t"\ 388 "punpckhwd %%mm6, %%mm6 \n\t"\ 389 "punpckhwd %%mm3, %%mm3 \n\t"\ 390 "paddw %%mm7, %%mm0 \n\t"\ 391 "paddw %%mm7, %%mm6 \n\t"\ 392 "paddw %%mm7, %%mm3 \n\t"\ 393 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 394 "packuswb %%mm0, %%mm2 \n\t"\ 395 "packuswb %%mm6, %%mm5 \n\t"\ 396 "packuswb %%mm3, %%mm4 \n\t"\ 397 398#define REAL_YSCALEYUV2PACKED(index, c) \ 399 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 400 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\ 401 "psraw $3, %%mm0 \n\t"\ 402 "psraw $3, %%mm1 \n\t"\ 403 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 404 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 405 "xor "#index", "#index" \n\t"\ 406 ASMALIGN(4)\ 407 "1: \n\t"\ 408 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 409 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 410 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 411 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* 
uvbuf1[eax+2048]*/\ 412 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ 413 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ 414 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 415 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ 416 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ 417 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 418 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 419 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ 420 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ 421 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ 422 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ 423 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ 424 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ 425 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ 426 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ 427 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 428 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 429 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 430 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 431 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 432 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 433 434#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) 435 436#define REAL_YSCALEYUV2RGB_UV(index, c) \ 437 "xor "#index", "#index" \n\t"\ 438 ASMALIGN(4)\ 439 "1: \n\t"\ 440 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 441 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 442 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 443 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 444 "psubw %%mm3, %%mm2 \n\t" /* 
uvbuf0[eax] - uvbuf1[eax]*/\ 445 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ 446 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 447 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ 448 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ 449 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 450 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 451 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ 452 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ 453 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 454 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 455 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 456 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 457 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 458 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 459 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 460 461#define REAL_YSCALEYUV2RGB_YA(index, c) \ 462 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ 463 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ 464 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ 465 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ 466 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ 467 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ 468 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 469 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ 470 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 471 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 472 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 473 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ 474 475#define REAL_YSCALEYUV2RGB_COEFF(c) \ 476 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 477 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 478 "psubw "Y_OFFSET"("#c"), 
%%mm1 \n\t" /* 8(Y-16)*/\ 479 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 480 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 481 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 482 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 483 "paddw %%mm3, %%mm4 \n\t"\ 484 "movq %%mm2, %%mm0 \n\t"\ 485 "movq %%mm5, %%mm6 \n\t"\ 486 "movq %%mm4, %%mm3 \n\t"\ 487 "punpcklwd %%mm2, %%mm2 \n\t"\ 488 "punpcklwd %%mm5, %%mm5 \n\t"\ 489 "punpcklwd %%mm4, %%mm4 \n\t"\ 490 "paddw %%mm1, %%mm2 \n\t"\ 491 "paddw %%mm1, %%mm5 \n\t"\ 492 "paddw %%mm1, %%mm4 \n\t"\ 493 "punpckhwd %%mm0, %%mm0 \n\t"\ 494 "punpckhwd %%mm6, %%mm6 \n\t"\ 495 "punpckhwd %%mm3, %%mm3 \n\t"\ 496 "paddw %%mm7, %%mm0 \n\t"\ 497 "paddw %%mm7, %%mm6 \n\t"\ 498 "paddw %%mm7, %%mm3 \n\t"\ 499 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 500 "packuswb %%mm0, %%mm2 \n\t"\ 501 "packuswb %%mm6, %%mm5 \n\t"\ 502 "packuswb %%mm3, %%mm4 \n\t"\ 503 504#define YSCALEYUV2RGB_YA(index, c) REAL_YSCALEYUV2RGB_YA(index, c) 505 506#define YSCALEYUV2RGB(index, c) \ 507 REAL_YSCALEYUV2RGB_UV(index, c) \ 508 REAL_YSCALEYUV2RGB_YA(index, c) \ 509 REAL_YSCALEYUV2RGB_COEFF(c) 510 511#define REAL_YSCALEYUV2PACKED1(index, c) \ 512 "xor "#index", "#index" \n\t"\ 513 ASMALIGN(4)\ 514 "1: \n\t"\ 515 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ 516 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ 517 "psraw $7, %%mm3 \n\t" \ 518 "psraw $7, %%mm4 \n\t" \ 519 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 520 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 521 "psraw $7, %%mm1 \n\t" \ 522 "psraw $7, %%mm7 \n\t" \ 523 524#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) 525 526#define REAL_YSCALEYUV2RGB1(index, c) \ 527 "xor "#index", "#index" \n\t"\ 528 ASMALIGN(4)\ 529 "1: \n\t"\ 530 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ 531 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ 532 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ 533 
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ 534 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 535 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 536 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 537 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 538 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 539 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 540 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 541 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 542 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 543 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 544 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 545 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 546 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 547 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 548 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 549 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 550 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 551 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 552 "paddw %%mm3, %%mm4 \n\t"\ 553 "movq %%mm2, %%mm0 \n\t"\ 554 "movq %%mm5, %%mm6 \n\t"\ 555 "movq %%mm4, %%mm3 \n\t"\ 556 "punpcklwd %%mm2, %%mm2 \n\t"\ 557 "punpcklwd %%mm5, %%mm5 \n\t"\ 558 "punpcklwd %%mm4, %%mm4 \n\t"\ 559 "paddw %%mm1, %%mm2 \n\t"\ 560 "paddw %%mm1, %%mm5 \n\t"\ 561 "paddw %%mm1, %%mm4 \n\t"\ 562 "punpckhwd %%mm0, %%mm0 \n\t"\ 563 "punpckhwd %%mm6, %%mm6 \n\t"\ 564 "punpckhwd %%mm3, %%mm3 \n\t"\ 565 "paddw %%mm7, %%mm0 \n\t"\ 566 "paddw %%mm7, %%mm6 \n\t"\ 567 "paddw %%mm7, %%mm3 \n\t"\ 568 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 569 "packuswb %%mm0, %%mm2 \n\t"\ 570 "packuswb %%mm6, %%mm5 \n\t"\ 571 "packuswb %%mm3, %%mm4 \n\t"\ 572 573#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) 574 575#define REAL_YSCALEYUV2PACKED1b(index, c) \ 576 "xor "#index", "#index" \n\t"\ 577 ASMALIGN(4)\ 578 "1: \n\t"\ 579 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 580 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 581 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), 
%%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 582 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 583 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ 584 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ 585 "psrlw $8, %%mm3 \n\t" \ 586 "psrlw $8, %%mm4 \n\t" \ 587 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 588 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 589 "psraw $7, %%mm1 \n\t" \ 590 "psraw $7, %%mm7 \n\t" 591#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) 592 593// do vertical chrominance interpolation 594#define REAL_YSCALEYUV2RGB1b(index, c) \ 595 "xor "#index", "#index" \n\t"\ 596 ASMALIGN(4)\ 597 "1: \n\t"\ 598 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ 599 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ 600 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 601 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 602 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ 603 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ 604 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ 605 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ 606 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ 607 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ 608 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ 609 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ 610 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 611 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 612 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ 613 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ 614 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ 615 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 616 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ 617 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 618 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 619 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ 620 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ 621 "pmulhw 
"Y_COEFF"("#c"), %%mm1 \n\t"\ 622 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 623 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 624 "paddw %%mm3, %%mm4 \n\t"\ 625 "movq %%mm2, %%mm0 \n\t"\ 626 "movq %%mm5, %%mm6 \n\t"\ 627 "movq %%mm4, %%mm3 \n\t"\ 628 "punpcklwd %%mm2, %%mm2 \n\t"\ 629 "punpcklwd %%mm5, %%mm5 \n\t"\ 630 "punpcklwd %%mm4, %%mm4 \n\t"\ 631 "paddw %%mm1, %%mm2 \n\t"\ 632 "paddw %%mm1, %%mm5 \n\t"\ 633 "paddw %%mm1, %%mm4 \n\t"\ 634 "punpckhwd %%mm0, %%mm0 \n\t"\ 635 "punpckhwd %%mm6, %%mm6 \n\t"\ 636 "punpckhwd %%mm3, %%mm3 \n\t"\ 637 "paddw %%mm7, %%mm0 \n\t"\ 638 "paddw %%mm7, %%mm6 \n\t"\ 639 "paddw %%mm7, %%mm3 \n\t"\ 640 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 641 "packuswb %%mm0, %%mm2 \n\t"\ 642 "packuswb %%mm6, %%mm5 \n\t"\ 643 "packuswb %%mm3, %%mm4 \n\t"\ 644 645#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) 646 647#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \ 648 "movq "#b", "#q2" \n\t" /* B */\ 649 "movq "#r", "#t" \n\t" /* R */\ 650 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\ 651 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\ 652 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\ 653 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\ 654 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\ 655 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\ 656 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\ 657 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\ 658 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\ 659 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\ 660\ 661 MOVNTQ( q0, (dst, index, 4))\ 662 MOVNTQ( b, 8(dst, index, 4))\ 663 MOVNTQ( q2, 16(dst, index, 4))\ 664 MOVNTQ( q3, 24(dst, index, 4))\ 665\ 666 "add $8, "#index" \n\t"\ 667 "cmp "#dstw", "#index" \n\t"\ 668 " jb 1b \n\t" 669#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) 670 671#define REAL_WRITERGB16(dst, dstw, index) \ 672 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ 673 "pand 
"MANGLE(bFC)", %%mm4 \n\t" /* G */\ 674 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ 675 "psrlq $3, %%mm2 \n\t"\ 676\ 677 "movq %%mm2, %%mm1 \n\t"\ 678 "movq %%mm4, %%mm3 \n\t"\ 679\ 680 "punpcklbw %%mm7, %%mm3 \n\t"\ 681 "punpcklbw %%mm5, %%mm2 \n\t"\ 682 "punpckhbw %%mm7, %%mm4 \n\t"\ 683 "punpckhbw %%mm5, %%mm1 \n\t"\ 684\ 685 "psllq $3, %%mm3 \n\t"\ 686 "psllq $3, %%mm4 \n\t"\ 687\ 688 "por %%mm3, %%mm2 \n\t"\ 689 "por %%mm4, %%mm1 \n\t"\ 690\ 691 MOVNTQ(%%mm2, (dst, index, 2))\ 692 MOVNTQ(%%mm1, 8(dst, index, 2))\ 693\ 694 "add $8, "#index" \n\t"\ 695 "cmp "#dstw", "#index" \n\t"\ 696 " jb 1b \n\t" 697#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) 698 699#define REAL_WRITERGB15(dst, dstw, index) \ 700 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ 701 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\ 702 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ 703 "psrlq $3, %%mm2 \n\t"\ 704 "psrlq $1, %%mm5 \n\t"\ 705\ 706 "movq %%mm2, %%mm1 \n\t"\ 707 "movq %%mm4, %%mm3 \n\t"\ 708\ 709 "punpcklbw %%mm7, %%mm3 \n\t"\ 710 "punpcklbw %%mm5, %%mm2 \n\t"\ 711 "punpckhbw %%mm7, %%mm4 \n\t"\ 712 "punpckhbw %%mm5, %%mm1 \n\t"\ 713\ 714 "psllq $2, %%mm3 \n\t"\ 715 "psllq $2, %%mm4 \n\t"\ 716\ 717 "por %%mm3, %%mm2 \n\t"\ 718 "por %%mm4, %%mm1 \n\t"\ 719\ 720 MOVNTQ(%%mm2, (dst, index, 2))\ 721 MOVNTQ(%%mm1, 8(dst, index, 2))\ 722\ 723 "add $8, "#index" \n\t"\ 724 "cmp "#dstw", "#index" \n\t"\ 725 " jb 1b \n\t" 726#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) 727 728#define WRITEBGR24OLD(dst, dstw, index) \ 729 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 730 "movq %%mm2, %%mm1 \n\t" /* B */\ 731 "movq %%mm5, %%mm6 \n\t" /* R */\ 732 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ 733 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ 734 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ 735 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ 736 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ 737 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ 738 "punpcklwd %%mm5, %%mm0 
\n\t" /* 0RGB0RGB 0 */\ 739 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 740 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ 741 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ 742\ 743 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ 744 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\ 745 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\ 746 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\ 747 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\ 748 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\ 749 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\ 750 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ 751\ 752 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 753 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\ 754 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\ 755 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\ 756 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\ 757 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\ 758 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\ 759 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\ 760 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\ 761 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\ 762 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\ 763 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\ 764 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\ 765\ 766 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\ 767 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\ 768 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\ 769 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\ 770 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\ 771 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\ 772 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\ 773 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\ 774\ 775 MOVNTQ(%%mm0, (dst))\ 776 MOVNTQ(%%mm2, 8(dst))\ 777 MOVNTQ(%%mm3, 16(dst))\ 778 "add $24, "#dst" \n\t"\ 779\ 780 "add $8, "#index" \n\t"\ 781 "cmp "#dstw", "#index" \n\t"\ 782 " jb 1b \n\t" 783 784#define WRITEBGR24MMX(dst, dstw, index) \ 785 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 786 "movq %%mm2, %%mm1 \n\t" /* B */\ 787 "movq 
%%mm5, %%mm6 \n\t" /* R */\ 788 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ 789 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ 790 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ 791 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ 792 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ 793 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ 794 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ 795 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 796 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ 797 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ 798\ 799 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ 800 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ 801 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ 802 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ 803\ 804 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ 805 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ 806 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ 807 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ 808\ 809 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ 810 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ 811 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ 812 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ 813\ 814 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ 815 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ 816 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ 817 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ 818 MOVNTQ(%%mm0, (dst))\ 819\ 820 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ 821 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ 822 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ 823 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ 824 MOVNTQ(%%mm6, 8(dst))\ 825\ 826 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ 827 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ 828 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ 829 MOVNTQ(%%mm5, 16(dst))\ 830\ 831 "add $24, "#dst" \n\t"\ 832\ 833 "add $8, "#index" \n\t"\ 834 "cmp "#dstw", "#index" \n\t"\ 835 " jb 1b \n\t" 836 837#define WRITEBGR24MMX2(dst, dstw, index) \ 838 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 839 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\ 840 "movq 
"MANGLE(ff_M24C)", %%mm7 \n\t"\ 841 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ 842 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ 843 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ 844\ 845 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ 846 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ 847 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ 848\ 849 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ 850 "por %%mm1, %%mm6 \n\t"\ 851 "por %%mm3, %%mm6 \n\t"\ 852 MOVNTQ(%%mm6, (dst))\ 853\ 854 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ 855 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ 856 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ 857 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ 858\ 859 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\ 860 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ 861 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\ 862\ 863 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ 864 "por %%mm3, %%mm6 \n\t"\ 865 MOVNTQ(%%mm6, 8(dst))\ 866\ 867 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\ 868 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ 869 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\ 870\ 871 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ 872 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ 873 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\ 874\ 875 "por %%mm1, %%mm3 \n\t"\ 876 "por %%mm3, %%mm6 \n\t"\ 877 MOVNTQ(%%mm6, 16(dst))\ 878\ 879 "add $24, "#dst" \n\t"\ 880\ 881 "add $8, "#index" \n\t"\ 882 "cmp "#dstw", "#index" \n\t"\ 883 " jb 1b \n\t" 884 885#if HAVE_MMX2 886#undef WRITEBGR24 887#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index) 888#else 889#undef WRITEBGR24 890#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) 891#endif 892 893#define REAL_WRITEYUY2(dst, dstw, index) \ 894 "packuswb %%mm3, %%mm3 \n\t"\ 895 "packuswb %%mm4, %%mm4 \n\t"\ 896 "packuswb %%mm7, %%mm1 \n\t"\ 897 
    "punpcklbw %%mm4, %%mm3           \n\t"\
    "movq      %%mm1, %%mm7           \n\t"\
    "punpcklbw %%mm3, %%mm1           \n\t"\
    "punpckhbw %%mm3, %%mm7           \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add       $8, "#index"           \n\t"\
    "cmp       "#dstw", "#index"      \n\t"\
    " jb       1b                     \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)


/**
 * Vertical scaling/filter pass producing planar YUV output.
 * Applies lumFilter over lumFilterSize source lines into dest, and (when
 * uDest is non-NULL) chrFilter over chrFilterSize chroma lines into
 * uDest/vDest. Uses the MMX macros above unless SWS_BITEXACT is set;
 * otherwise falls back to the AltiVec or plain C implementation.
 * The V plane lives at offset VOF within the chroma buffers (see the
 * AV_STRINGIFY(VOF) source offset passed for vDest).
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            /* Higher-precision pmaddwd-based accumulation. */
            if (uDest){
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }else{
            /* Faster pmulhw-based accumulation. */
            if (uDest){
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
}

/**
 * Vertical scaling pass for NV12/NV21 (interleaved-chroma) output.
 * No SIMD path here: always delegates to the C implementation.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
             chrFilter, chrSrc, chrFilterSize,
             dest, uDest, dstW, chrDstW, dstFormat);
}

/**
 * Special case of yuv2yuvX with a single vertical tap: converts the
 * 15-bit intermediate samples to 8 bits ((x+64)>>7 with clipping).
 * V-plane samples are read at offset VOFW within chrSrc.
 * vDest is only written when uDest is non-NULL.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
    int i;
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        long p= uDest ? 3 : 1;  /* process Y only, or Y+U+V */
        uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[3]= {dest, uDest, vDest};
        long counter[3] = {dstW, chrDstW, chrDstW};

        /* The asm loops run a negative index up to zero, hence the
         * end-pointer (src/dst + count) and negated counter operands. */
        if (c->flags & SWS_ACCURATE_RND){
            while(p--){
                __asm__ volatile(
                    YSCALEYUV2YV121_ACCURATE
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                    "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }else{
            while(p--){
                __asm__ volatile(
                    YSCALEYUV2YV121
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                    "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;

        /* bit 8 set => value left the 0..255 range; clamp */
        if (val&256){
            if (val<0) val=0;
            else val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            /* clamp both only when at least one is out of range */
            if ((u|v)&256){
                if (u<0) u=0;
                else if (u>255) u=255;
                if (v<0) v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
}


/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                       uint8_t *dest, long dstW, long dstY)
{
#if HAVE_MMX
    long dummy=0;
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            switch(c->dstFormat){
            case PIX_FMT_RGB32:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_BGR24:
YSCALEYUV2PACKEDX_ACCURATE 1041 YSCALEYUV2RGBX 1042 "pxor %%mm7, %%mm7 \n\t" 1043 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize 1044 "add %4, %%"REG_c" \n\t" 1045 WRITEBGR24(%%REGc, %5, %%REGa) 1046 1047 1048 :: "r" (&c->redDither), 1049 "m" (dummy), "m" (dummy), "m" (dummy), 1050 "r" (dest), "m" (dstW) 1051 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S 1052 ); 1053 return; 1054 case PIX_FMT_RGB555: 1055 YSCALEYUV2PACKEDX_ACCURATE 1056 YSCALEYUV2RGBX 1057 "pxor %%mm7, %%mm7 \n\t" 1058 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1059#ifdef DITHER1XBPP 1060 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" 1061 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" 1062 "paddusb "RED_DITHER"(%0), %%mm5\n\t" 1063#endif 1064 1065 WRITERGB15(%4, %5, %%REGa) 1066 YSCALEYUV2PACKEDX_END 1067 return; 1068 case PIX_FMT_RGB565: 1069 YSCALEYUV2PACKEDX_ACCURATE 1070 YSCALEYUV2RGBX 1071 "pxor %%mm7, %%mm7 \n\t" 1072 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1073#ifdef DITHER1XBPP 1074 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" 1075 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" 1076 "paddusb "RED_DITHER"(%0), %%mm5\n\t" 1077#endif 1078 1079 WRITERGB16(%4, %5, %%REGa) 1080 YSCALEYUV2PACKEDX_END 1081 return; 1082 case PIX_FMT_YUYV422: 1083 YSCALEYUV2PACKEDX_ACCURATE 1084 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1085 1086 "psraw $3, %%mm3 \n\t" 1087 "psraw $3, %%mm4 \n\t" 1088 "psraw $3, %%mm1 \n\t" 1089 "psraw $3, %%mm7 \n\t" 1090 WRITEYUY2(%4, %5, %%REGa) 1091 YSCALEYUV2PACKEDX_END 1092 return; 1093 } 1094 }else{ 1095 switch(c->dstFormat) 1096 { 1097 case PIX_FMT_RGB32: 1098 YSCALEYUV2PACKEDX 1099 YSCALEYUV2RGBX 1100 "pcmpeqd %%mm7, %%mm7 \n\t" 1101 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1102 YSCALEYUV2PACKEDX_END 1103 return; 1104 case PIX_FMT_BGR24: 1105 YSCALEYUV2PACKEDX 1106 YSCALEYUV2RGBX 1107 "pxor %%mm7, %%mm7 \n\t" 1108 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize 1109 "add %4, %%"REG_c" \n\t" 1110 WRITEBGR24(%%REGc, %5, %%REGa) 1111 1112 :: 
"r" (&c->redDither), 1113 "m" (dummy), "m" (dummy), "m" (dummy), 1114 "r" (dest), "m" (dstW) 1115 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S 1116 ); 1117 return; 1118 case PIX_FMT_RGB555: 1119 YSCALEYUV2PACKEDX 1120 YSCALEYUV2RGBX 1121 "pxor %%mm7, %%mm7 \n\t" 1122 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1123#ifdef DITHER1XBPP 1124 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" 1125 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" 1126 "paddusb "RED_DITHER"(%0), %%mm5 \n\t" 1127#endif 1128 1129 WRITERGB15(%4, %5, %%REGa) 1130 YSCALEYUV2PACKEDX_END 1131 return; 1132 case PIX_FMT_RGB565: 1133 YSCALEYUV2PACKEDX 1134 YSCALEYUV2RGBX 1135 "pxor %%mm7, %%mm7 \n\t" 1136 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1137#ifdef DITHER1XBPP 1138 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" 1139 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" 1140 "paddusb "RED_DITHER"(%0), %%mm5 \n\t" 1141#endif 1142 1143 WRITERGB16(%4, %5, %%REGa) 1144 YSCALEYUV2PACKEDX_END 1145 return; 1146 case PIX_FMT_YUYV422: 1147 YSCALEYUV2PACKEDX 1148 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1149 1150 "psraw $3, %%mm3 \n\t" 1151 "psraw $3, %%mm4 \n\t" 1152 "psraw $3, %%mm1 \n\t" 1153 "psraw $3, %%mm7 \n\t" 1154 WRITEYUY2(%4, %5, %%REGa) 1155 YSCALEYUV2PACKEDX_END 1156 return; 1157 } 1158 } 1159 } 1160#endif /* HAVE_MMX */ 1161#if HAVE_ALTIVEC 1162 /* The following list of supported dstFormat values should 1163 match what's found in the body of altivec_yuv2packedX() */ 1164 if (!(c->flags & SWS_BITEXACT) && 1165 (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA || 1166 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 || 1167 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)) 1168 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize, 1169 chrFilter, chrSrc, chrFilterSize, 1170 dest, dstW, dstY); 1171 else 1172#endif 1173 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize, 1174 chrFilter, chrSrc, chrFilterSize, 1175 dest, dstW, dstY); 1176} 1177 1178/** 1179 * vertical bilinear scale YV12 to RGB 1180 */ 1181static 
inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, 1182 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y) 1183{ 1184 int yalpha1=4095- yalpha; 1185 int uvalpha1=4095-uvalpha; 1186 int i; 1187 1188#if HAVE_MMX 1189 if(!(c->flags & SWS_BITEXACT)){ 1190 switch(c->dstFormat) 1191 { 1192 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( 1193 case PIX_FMT_RGB32: 1194 __asm__ volatile( 1195 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1196 "mov %4, %%"REG_b" \n\t" 1197 "push %%"REG_BP" \n\t" 1198 YSCALEYUV2RGB(%%REGBP, %5) 1199 "pcmpeqd %%mm7, %%mm7 \n\t" 1200 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1201 "pop %%"REG_BP" \n\t" 1202 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1203 1204 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1205 "a" (&c->redDither) 1206 ); 1207 return; 1208 case PIX_FMT_BGR24: 1209 __asm__ volatile( 1210 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1211 "mov %4, %%"REG_b" \n\t" 1212 "push %%"REG_BP" \n\t" 1213 YSCALEYUV2RGB(%%REGBP, %5) 1214 "pxor %%mm7, %%mm7 \n\t" 1215 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) 1216 "pop %%"REG_BP" \n\t" 1217 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1218 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1219 "a" (&c->redDither) 1220 ); 1221 return; 1222 case PIX_FMT_RGB555: 1223 __asm__ volatile( 1224 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1225 "mov %4, %%"REG_b" \n\t" 1226 "push %%"REG_BP" \n\t" 1227 YSCALEYUV2RGB(%%REGBP, %5) 1228 "pxor %%mm7, %%mm7 \n\t" 1229 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1230#ifdef DITHER1XBPP 1231 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1232 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1233 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1234#endif 1235 1236 WRITERGB15(%%REGb, 8280(%5), %%REGBP) 1237 "pop %%"REG_BP" \n\t" 1238 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1239 1240 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" 
(dest), 1241 "a" (&c->redDither) 1242 ); 1243 return; 1244 case PIX_FMT_RGB565: 1245 __asm__ volatile( 1246 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1247 "mov %4, %%"REG_b" \n\t" 1248 "push %%"REG_BP" \n\t" 1249 YSCALEYUV2RGB(%%REGBP, %5) 1250 "pxor %%mm7, %%mm7 \n\t" 1251 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1252#ifdef DITHER1XBPP 1253 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1254 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1255 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1256#endif 1257 1258 WRITERGB16(%%REGb, 8280(%5), %%REGBP) 1259 "pop %%"REG_BP" \n\t" 1260 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1261 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1262 "a" (&c->redDither) 1263 ); 1264 return; 1265 case PIX_FMT_YUYV422: 1266 __asm__ volatile( 1267 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1268 "mov %4, %%"REG_b" \n\t" 1269 "push %%"REG_BP" \n\t" 1270 YSCALEYUV2PACKED(%%REGBP, %5) 1271 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) 1272 "pop %%"REG_BP" \n\t" 1273 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1274 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1275 "a" (&c->redDither) 1276 ); 1277 return; 1278 default: break; 1279 } 1280 } 1281#endif //HAVE_MMX 1282YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C) 1283} 1284 1285/** 1286 * YV12 to RGB without scaling or interpolating 1287 */ 1288static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, 1289 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y) 1290{ 1291 const int yalpha1=0; 1292 int i; 1293 1294 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 1295 const int yalpha= 4096; //FIXME ... 
1296 1297 if (flags&SWS_FULL_CHR_H_INT) 1298 { 1299 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y); 1300 return; 1301 } 1302 1303#if HAVE_MMX 1304 if(!(flags & SWS_BITEXACT)){ 1305 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster 1306 { 1307 switch(dstFormat) 1308 { 1309 case PIX_FMT_RGB32: 1310 __asm__ volatile( 1311 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1312 "mov %4, %%"REG_b" \n\t" 1313 "push %%"REG_BP" \n\t" 1314 YSCALEYUV2RGB1(%%REGBP, %5) 1315 "pcmpeqd %%mm7, %%mm7 \n\t" 1316 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1317 "pop %%"REG_BP" \n\t" 1318 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1319 1320 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1321 "a" (&c->redDither) 1322 ); 1323 return; 1324 case PIX_FMT_BGR24: 1325 __asm__ volatile( 1326 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1327 "mov %4, %%"REG_b" \n\t" 1328 "push %%"REG_BP" \n\t" 1329 YSCALEYUV2RGB1(%%REGBP, %5) 1330 "pxor %%mm7, %%mm7 \n\t" 1331 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) 1332 "pop %%"REG_BP" \n\t" 1333 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1334 1335 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1336 "a" (&c->redDither) 1337 ); 1338 return; 1339 case PIX_FMT_RGB555: 1340 __asm__ volatile( 1341 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1342 "mov %4, %%"REG_b" \n\t" 1343 "push %%"REG_BP" \n\t" 1344 YSCALEYUV2RGB1(%%REGBP, %5) 1345 "pxor %%mm7, %%mm7 \n\t" 1346 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1347#ifdef DITHER1XBPP 1348 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1349 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1350 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1351#endif 1352 WRITERGB15(%%REGb, 8280(%5), %%REGBP) 1353 "pop %%"REG_BP" \n\t" 1354 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1355 1356 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1357 "a" (&c->redDither) 1358 ); 1359 return; 1360 case 
PIX_FMT_RGB565: 1361 __asm__ volatile( 1362 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1363 "mov %4, %%"REG_b" \n\t" 1364 "push %%"REG_BP" \n\t" 1365 YSCALEYUV2RGB1(%%REGBP, %5) 1366 "pxor %%mm7, %%mm7 \n\t" 1367 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1368#ifdef DITHER1XBPP 1369 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1370 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1371 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1372#endif 1373 1374 WRITERGB16(%%REGb, 8280(%5), %%REGBP) 1375 "pop %%"REG_BP" \n\t" 1376 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1377 1378 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1379 "a" (&c->redDither) 1380 ); 1381 return; 1382 case PIX_FMT_YUYV422: 1383 __asm__ volatile( 1384 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1385 "mov %4, %%"REG_b" \n\t" 1386 "push %%"REG_BP" \n\t" 1387 YSCALEYUV2PACKED1(%%REGBP, %5) 1388 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) 1389 "pop %%"REG_BP" \n\t" 1390 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1391 1392 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1393 "a" (&c->redDither) 1394 ); 1395 return; 1396 } 1397 } 1398 else 1399 { 1400 switch(dstFormat) 1401 { 1402 case PIX_FMT_RGB32: 1403 __asm__ volatile( 1404 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1405 "mov %4, %%"REG_b" \n\t" 1406 "push %%"REG_BP" \n\t" 1407 YSCALEYUV2RGB1b(%%REGBP, %5) 1408 "pcmpeqd %%mm7, %%mm7 \n\t" 1409 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) 1410 "pop %%"REG_BP" \n\t" 1411 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1412 1413 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1414 "a" (&c->redDither) 1415 ); 1416 return; 1417 case PIX_FMT_BGR24: 1418 __asm__ volatile( 1419 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1420 "mov %4, %%"REG_b" \n\t" 1421 "push %%"REG_BP" \n\t" 1422 YSCALEYUV2RGB1b(%%REGBP, %5) 1423 "pxor %%mm7, %%mm7 \n\t" 1424 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) 1425 "pop %%"REG_BP" \n\t" 1426 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1427 1428 :: "c" 
(buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1429 "a" (&c->redDither) 1430 ); 1431 return; 1432 case PIX_FMT_RGB555: 1433 __asm__ volatile( 1434 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1435 "mov %4, %%"REG_b" \n\t" 1436 "push %%"REG_BP" \n\t" 1437 YSCALEYUV2RGB1b(%%REGBP, %5) 1438 "pxor %%mm7, %%mm7 \n\t" 1439 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1440#ifdef DITHER1XBPP 1441 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1442 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1443 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1444#endif 1445 WRITERGB15(%%REGb, 8280(%5), %%REGBP) 1446 "pop %%"REG_BP" \n\t" 1447 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1448 1449 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1450 "a" (&c->redDither) 1451 ); 1452 return; 1453 case PIX_FMT_RGB565: 1454 __asm__ volatile( 1455 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1456 "mov %4, %%"REG_b" \n\t" 1457 "push %%"REG_BP" \n\t" 1458 YSCALEYUV2RGB1b(%%REGBP, %5) 1459 "pxor %%mm7, %%mm7 \n\t" 1460 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 1461#ifdef DITHER1XBPP 1462 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" 1463 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" 1464 "paddusb "RED_DITHER"(%5), %%mm5 \n\t" 1465#endif 1466 1467 WRITERGB16(%%REGb, 8280(%5), %%REGBP) 1468 "pop %%"REG_BP" \n\t" 1469 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1470 1471 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1472 "a" (&c->redDither) 1473 ); 1474 return; 1475 case PIX_FMT_YUYV422: 1476 __asm__ volatile( 1477 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" 1478 "mov %4, %%"REG_b" \n\t" 1479 "push %%"REG_BP" \n\t" 1480 YSCALEYUV2PACKED1b(%%REGBP, %5) 1481 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) 1482 "pop %%"REG_BP" \n\t" 1483 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" 1484 1485 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), 1486 "a" (&c->redDither) 1487 ); 1488 return; 1489 } 1490 } 1491 } 1492#endif /* HAVE_MMX */ 1493 if (uvalpha < 2048) 1494 { 1495 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, 
YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }else{
        /* uvalpha >= 2048: blend the two chroma input lines */
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}

//FIXME yuy2* can read up to 7 samples too much

/**
 * Extract the luma plane from one line of packed YUYV (YUY2) input:
 * dst[i] = src[2*i].  width counts luma samples.
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    /* Masks the even (luma) bytes with bm01010101 and packs them down;
     * 8 output bytes per iteration, REG_a counts up from -width to 0. */
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm2 \n\t"
    "mov %0, %%"REG_a" \n\t"
    "1: \n\t"
    "movq (%1, %%"REG_a",2), %%mm0 \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
    "pand %%mm2, %%mm0 \n\t"
    "pand %%mm2, %%mm1 \n\t"
    "packuswb %%mm1, %%mm0 \n\t"
    "movq %%mm0, (%2, %%"REG_a") \n\t"
    "add $8, %%"REG_a" \n\t"
    " js 1b \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}

/**
 * Deinterleave the chroma samples from one line of packed YUYV (YUY2) input:
 * dstU[i] = src1[4*i+1], dstV[i] = src1[4*i+3].  src1 and src2 must be the
 * same line (asserted at the end); width counts chroma samples per plane.
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    /* Shifts out the luma bytes, then splits the remaining UVUV stream into
     * separate U and V; 4 output bytes per plane per iteration. */
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm4 \n\t"
    "mov %0, %%"REG_a" \n\t"
    "1: \n\t"
    "movq (%1, %%"REG_a",4), %%mm0 \n\t"
    "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
    "psrlw $8, %%mm0 \n\t"
    "psrlw $8, %%mm1 \n\t"
    "packuswb %%mm1, %%mm0 \n\t"
    "movq %%mm0, %%mm1 \n\t"
    "psrlw $8, %%mm0 \n\t"
    "pand %%mm4, %%mm1 \n\t"
    "packuswb %%mm0, %%mm0 \n\t"
    "packuswb %%mm1, %%mm1 \n\t"
    "movd %%mm0, (%3, %%"REG_a") \n\t"
    "movd %%mm1, (%2, %%"REG_a") \n\t"
    "add $4, %%"REG_a" \n\t"
    " js 1b \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}

/* This is almost identical to the
previous, end exists only because 1564 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */ 1565static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused) 1566{ 1567#if HAVE_MMX 1568 __asm__ volatile( 1569 "mov %0, %%"REG_a" \n\t" 1570 "1: \n\t" 1571 "movq (%1, %%"REG_a",2), %%mm0 \n\t" 1572 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" 1573 "psrlw $8, %%mm0 \n\t" 1574 "psrlw $8, %%mm1 \n\t" 1575 "packuswb %%mm1, %%mm0 \n\t" 1576 "movq %%mm0, (%2, %%"REG_a") \n\t" 1577 "add $8, %%"REG_a" \n\t" 1578 " js 1b \n\t" 1579 : : "g" (-width), "r" (src+width*2), "r" (dst+width) 1580 : "%"REG_a 1581 ); 1582#else 1583 int i; 1584 for (i=0; i<width; i++) 1585 dst[i]= src[2*i+1]; 1586#endif 1587} 1588 1589static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused) 1590{ 1591#if HAVE_MMX 1592 __asm__ volatile( 1593 "movq "MANGLE(bm01010101)", %%mm4 \n\t" 1594 "mov %0, %%"REG_a" \n\t" 1595 "1: \n\t" 1596 "movq (%1, %%"REG_a",4), %%mm0 \n\t" 1597 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" 1598 "pand %%mm4, %%mm0 \n\t" 1599 "pand %%mm4, %%mm1 \n\t" 1600 "packuswb %%mm1, %%mm0 \n\t" 1601 "movq %%mm0, %%mm1 \n\t" 1602 "psrlw $8, %%mm0 \n\t" 1603 "pand %%mm4, %%mm1 \n\t" 1604 "packuswb %%mm0, %%mm0 \n\t" 1605 "packuswb %%mm1, %%mm1 \n\t" 1606 "movd %%mm0, (%3, %%"REG_a") \n\t" 1607 "movd %%mm1, (%2, %%"REG_a") \n\t" 1608 "add $4, %%"REG_a" \n\t" 1609 " js 1b \n\t" 1610 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) 1611 : "%"REG_a 1612 ); 1613#else 1614 int i; 1615 for (i=0; i<width; i++) 1616 { 1617 dstU[i]= src1[4*i + 0]; 1618 dstV[i]= src1[4*i + 2]; 1619 } 1620#endif 1621 assert(src1 == src2); 1622} 1623 1624#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\ 1625static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\ 1626{\ 1627 int i;\ 1628 for (i=0; i<width; i++)\ 1629 {\ 1630 int b= 
(((type*)src)[i]>>shb)&maskb;\ 1631 int g= (((type*)src)[i]>>shg)&maskg;\ 1632 int r= (((type*)src)[i]>>shr)&maskr;\ 1633\ 1634 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\ 1635 }\ 1636} 1637 1638BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8) 1639BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8) 1640BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8) 1641BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7) 1642BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8) 1643BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7) 1644 1645#define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\ 1646static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\ 1647{\ 1648 int i;\ 1649 for (i=0; i<width; i++)\ 1650 {\ 1651 int b= (((type*)src)[i]&maskb)>>shb;\ 1652 int g= (((type*)src)[i]&maskg)>>shg;\ 1653 int r= (((type*)src)[i]&maskr)>>shr;\ 1654\ 1655 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\ 1656 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\ 1657 }\ 1658}\ 1659static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\ 1660{\ 1661 int i;\ 1662 for (i=0; i<width; i++)\ 1663 {\ 1664 int pix0= ((type*)src)[2*i+0];\ 1665 int pix1= ((type*)src)[2*i+1];\ 1666 int g= (pix0&(maskg|maska))+(pix1&(maskg|maska));\ 1667 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\ 1668 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\ 1669 g&= maskg|(2*maskg);\ 1670\ 1671 g>>=shg;\ 1672\ 1673 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\ 1674 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\ 1675 
}\ 1676} 1677 1678BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8) 1679BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8) 1680BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8) 1681BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7) 1682BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8) 1683BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7) 1684 1685#if HAVE_MMX 1686static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat) 1687{ 1688 1689 if(srcFormat == PIX_FMT_BGR24){ 1690 __asm__ volatile( 1691 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t" 1692 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t" 1693 : 1694 ); 1695 }else{ 1696 __asm__ volatile( 1697 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t" 1698 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t" 1699 : 1700 ); 1701 } 1702 1703 __asm__ volatile( 1704 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t" 1705 "mov %2, %%"REG_a" \n\t" 1706 "pxor %%mm7, %%mm7 \n\t" 1707 "1: \n\t" 1708 PREFETCH" 64(%0) \n\t" 1709 "movd (%0), %%mm0 \n\t" 1710 "movd 2(%0), %%mm1 \n\t" 1711 "movd 6(%0), %%mm2 \n\t" 1712 "movd 8(%0), %%mm3 \n\t" 1713 "add $12, %0 \n\t" 1714 "punpcklbw %%mm7, %%mm0 \n\t" 1715 "punpcklbw %%mm7, %%mm1 \n\t" 1716 "punpcklbw %%mm7, %%mm2 \n\t" 1717 "punpcklbw %%mm7, %%mm3 \n\t" 1718 "pmaddwd %%mm5, %%mm0 \n\t" 1719 "pmaddwd %%mm6, %%mm1 \n\t" 1720 "pmaddwd %%mm5, %%mm2 \n\t" 1721 "pmaddwd %%mm6, %%mm3 \n\t" 1722 "paddd %%mm1, %%mm0 \n\t" 1723 "paddd %%mm3, %%mm2 \n\t" 1724 "paddd %%mm4, %%mm0 \n\t" 1725 "paddd %%mm4, %%mm2 \n\t" 1726 "psrad 
$15, %%mm0 \n\t" 1727 "psrad $15, %%mm2 \n\t" 1728 "packssdw %%mm2, %%mm0 \n\t" 1729 "packuswb %%mm0, %%mm0 \n\t" 1730 "movd %%mm0, (%1, %%"REG_a") \n\t" 1731 "add $4, %%"REG_a" \n\t" 1732 " js 1b \n\t" 1733 : "+r" (src) 1734 : "r" (dst+width), "g" (-width) 1735 : "%"REG_a 1736 ); 1737} 1738 1739static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat) 1740{ 1741 __asm__ volatile( 1742 "movq 24+%4, %%mm6 \n\t" 1743 "mov %3, %%"REG_a" \n\t" 1744 "pxor %%mm7, %%mm7 \n\t" 1745 "1: \n\t" 1746 PREFETCH" 64(%0) \n\t" 1747 "movd (%0), %%mm0 \n\t" 1748 "movd 2(%0), %%mm1 \n\t" 1749 "punpcklbw %%mm7, %%mm0 \n\t" 1750 "punpcklbw %%mm7, %%mm1 \n\t" 1751 "movq %%mm0, %%mm2 \n\t" 1752 "movq %%mm1, %%mm3 \n\t" 1753 "pmaddwd %4, %%mm0 \n\t" 1754 "pmaddwd 8+%4, %%mm1 \n\t" 1755 "pmaddwd 16+%4, %%mm2 \n\t" 1756 "pmaddwd %%mm6, %%mm3 \n\t" 1757 "paddd %%mm1, %%mm0 \n\t" 1758 "paddd %%mm3, %%mm2 \n\t" 1759 1760 "movd 6(%0), %%mm1 \n\t" 1761 "movd 8(%0), %%mm3 \n\t" 1762 "add $12, %0 \n\t" 1763 "punpcklbw %%mm7, %%mm1 \n\t" 1764 "punpcklbw %%mm7, %%mm3 \n\t" 1765 "movq %%mm1, %%mm4 \n\t" 1766 "movq %%mm3, %%mm5 \n\t" 1767 "pmaddwd %4, %%mm1 \n\t" 1768 "pmaddwd 8+%4, %%mm3 \n\t" 1769 "pmaddwd 16+%4, %%mm4 \n\t" 1770 "pmaddwd %%mm6, %%mm5 \n\t" 1771 "paddd %%mm3, %%mm1 \n\t" 1772 "paddd %%mm5, %%mm4 \n\t" 1773 1774 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t" 1775 "paddd %%mm3, %%mm0 \n\t" 1776 "paddd %%mm3, %%mm2 \n\t" 1777 "paddd %%mm3, %%mm1 \n\t" 1778 "paddd %%mm3, %%mm4 \n\t" 1779 "psrad $15, %%mm0 \n\t" 1780 "psrad $15, %%mm2 \n\t" 1781 "psrad $15, %%mm1 \n\t" 1782 "psrad $15, %%mm4 \n\t" 1783 "packssdw %%mm1, %%mm0 \n\t" 1784 "packssdw %%mm4, %%mm2 \n\t" 1785 "packuswb %%mm0, %%mm0 \n\t" 1786 "packuswb %%mm2, %%mm2 \n\t" 1787 "movd %%mm0, (%1, %%"REG_a") \n\t" 1788 "movd %%mm2, (%2, %%"REG_a") \n\t" 1789 "add $4, %%"REG_a" \n\t" 1790 " js 1b \n\t" 1791 : "+r" (src) 1792 : "r" (dstU+width), "r" (dstV+width), "g" (-width), 
"m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0]) 1793 : "%"REG_a 1794 ); 1795} 1796#endif 1797 1798static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused) 1799{ 1800#if HAVE_MMX 1801 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24); 1802#else 1803 int i; 1804 for (i=0; i<width; i++) 1805 { 1806 int b= src[i*3+0]; 1807 int g= src[i*3+1]; 1808 int r= src[i*3+2]; 1809 1810 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT); 1811 } 1812#endif /* HAVE_MMX */ 1813} 1814 1815static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused) 1816{ 1817#if HAVE_MMX 1818 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24); 1819#else 1820 int i; 1821 for (i=0; i<width; i++) 1822 { 1823 int b= src1[3*i + 0]; 1824 int g= src1[3*i + 1]; 1825 int r= src1[3*i + 2]; 1826 1827 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; 1828 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; 1829 } 1830#endif /* HAVE_MMX */ 1831 assert(src1 == src2); 1832} 1833 1834static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused) 1835{ 1836 int i; 1837 for (i=0; i<width; i++) 1838 { 1839 int b= src1[6*i + 0] + src1[6*i + 3]; 1840 int g= src1[6*i + 1] + src1[6*i + 4]; 1841 int r= src1[6*i + 2] + src1[6*i + 5]; 1842 1843 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); 1844 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); 1845 } 1846 assert(src1 == src2); 1847} 1848 1849static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused) 1850{ 1851#if HAVE_MMX 1852 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24); 1853#else 1854 int i; 1855 for (i=0; i<width; i++) 1856 { 1857 int r= src[i*3+0]; 1858 int g= src[i*3+1]; 1859 int b= src[i*3+2]; 1860 1861 dst[i]= ((RY*r + GY*g + BY*b 
+ (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif
}

/**
 * Packed RGB24 -> chroma conversion; src1 must equal src2 (asserted).
 * Shares the MMX kernel with bgr24ToUV, selecting via PIX_FMT_RGB24.
 */
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    assert(src1==src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
#else
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
        int r= src1[3*i + 0];
        int g= src1[3*i + 1];
        int b= src1[3*i + 2];

        /* 257<<(S-1) == 128.5<<S: +128 chroma bias plus 0.5 for rounding */
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif
}

/**
 * Horizontally-subsampling chroma conversion for RGB24: r/g/b hold the sum
 * of two adjacent pixels, compensated by the extra shift (RGB2YUV_SHIFT+1).
 */
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
        int r= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int b= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
    }
}


/**
 * Paletted (pal8) input -> luma: looks each source byte up in pal and keeps
 * the low byte of the 32-bit palette entry.
 */
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
{
    int i;
    for (i=0; i<width; i++)
    {
        int d= src[i];

        dst[i]= pal[d] & 0xFF;
    }
}

/**
 * Paletted input -> chroma: U from bits 8-15 and V from bits 16-23 of the
 * palette entry (the stores truncate to 8 bits); src1 must equal src2.
 */
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
{
    int i;
    assert(src1 == src2);
    for (i=0; i<width; i++)
    {
        int p= pal[src1[i]];

        dstU[i]= p>>8;
        dstV[i]= p>>16;
    }
}

/**
 * 1 bpp monochrome (white-is-zero) -> 8 bpp luma: the byte is inverted, then
 * each bit, MSB first, expands to 0 or 255.  Only whole input bytes
 * (width/8) are processed; a trailing partial byte is ignored.
 */
static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
    int i, j;
    for (i=0; i<width/8; i++){
        int d= ~src[i];
        for(j=0; j<8; j++)
            dst[8*i+j]= ((d>>(7-j))&1)*255;
    }
}

/* Same as monowhite2Y but without the inversion: 1 bits expand to 255. */
static inline void
RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused) 1937{ 1938 int i, j; 1939 for (i=0; i<width/8; i++){ 1940 int d= src[i]; 1941 for(j=0; j<8; j++) 1942 dst[8*i+j]= ((d>>(7-j))&1)*255; 1943 } 1944} 1945 1946// bilinear / bicubic scaling 1947static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, 1948 int16_t *filter, int16_t *filterPos, long filterSize) 1949{ 1950#if HAVE_MMX 1951 assert(filterSize % 4 == 0 && filterSize>0); 1952 if (filterSize==4) // Always true for upscaling, sometimes for down, too. 1953 { 1954 long counter= -2*dstW; 1955 filter-= counter*2; 1956 filterPos-= counter/2; 1957 dst-= counter/2; 1958 __asm__ volatile( 1959#if defined(PIC) 1960 "push %%"REG_b" \n\t" 1961#endif 1962 "pxor %%mm7, %%mm7 \n\t" 1963 "push %%"REG_BP" \n\t" // we use 7 regs here ... 1964 "mov %%"REG_a", %%"REG_BP" \n\t" 1965 ASMALIGN(4) 1966 "1: \n\t" 1967 "movzwl (%2, %%"REG_BP"), %%eax \n\t" 1968 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" 1969 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t" 1970 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t" 1971 "movd (%3, %%"REG_a"), %%mm0 \n\t" 1972 "movd (%3, %%"REG_b"), %%mm2 \n\t" 1973 "punpcklbw %%mm7, %%mm0 \n\t" 1974 "punpcklbw %%mm7, %%mm2 \n\t" 1975 "pmaddwd %%mm1, %%mm0 \n\t" 1976 "pmaddwd %%mm2, %%mm3 \n\t" 1977 "movq %%mm0, %%mm4 \n\t" 1978 "punpckldq %%mm3, %%mm0 \n\t" 1979 "punpckhdq %%mm3, %%mm4 \n\t" 1980 "paddd %%mm4, %%mm0 \n\t" 1981 "psrad $7, %%mm0 \n\t" 1982 "packssdw %%mm0, %%mm0 \n\t" 1983 "movd %%mm0, (%4, %%"REG_BP") \n\t" 1984 "add $4, %%"REG_BP" \n\t" 1985 " jnc 1b \n\t" 1986 1987 "pop %%"REG_BP" \n\t" 1988#if defined(PIC) 1989 "pop %%"REG_b" \n\t" 1990#endif 1991 : "+a" (counter) 1992 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) 1993#if !defined(PIC) 1994 : "%"REG_b 1995#endif 1996 ); 1997 } 1998 else if (filterSize==8) 1999 { 2000 long counter= -2*dstW; 2001 filter-= counter*4; 2002 filterPos-= counter/2; 2003 dst-= counter/2; 2004 __asm__ volatile( 2005#if 
defined(PIC) 2006 "push %%"REG_b" \n\t" 2007#endif 2008 "pxor %%mm7, %%mm7 \n\t" 2009 "push %%"REG_BP" \n\t" // we use 7 regs here ... 2010 "mov %%"REG_a", %%"REG_BP" \n\t" 2011 ASMALIGN(4) 2012 "1: \n\t" 2013 "movzwl (%2, %%"REG_BP"), %%eax \n\t" 2014 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" 2015 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t" 2016 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t" 2017 "movd (%3, %%"REG_a"), %%mm0 \n\t" 2018 "movd (%3, %%"REG_b"), %%mm2 \n\t" 2019 "punpcklbw %%mm7, %%mm0 \n\t" 2020 "punpcklbw %%mm7, %%mm2 \n\t" 2021 "pmaddwd %%mm1, %%mm0 \n\t" 2022 "pmaddwd %%mm2, %%mm3 \n\t" 2023 2024 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t" 2025 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t" 2026 "movd 4(%3, %%"REG_a"), %%mm4 \n\t" 2027 "movd 4(%3, %%"REG_b"), %%mm2 \n\t" 2028 "punpcklbw %%mm7, %%mm4 \n\t" 2029 "punpcklbw %%mm7, %%mm2 \n\t" 2030 "pmaddwd %%mm1, %%mm4 \n\t" 2031 "pmaddwd %%mm2, %%mm5 \n\t" 2032 "paddd %%mm4, %%mm0 \n\t" 2033 "paddd %%mm5, %%mm3 \n\t" 2034 "movq %%mm0, %%mm4 \n\t" 2035 "punpckldq %%mm3, %%mm0 \n\t" 2036 "punpckhdq %%mm3, %%mm4 \n\t" 2037 "paddd %%mm4, %%mm0 \n\t" 2038 "psrad $7, %%mm0 \n\t" 2039 "packssdw %%mm0, %%mm0 \n\t" 2040 "movd %%mm0, (%4, %%"REG_BP") \n\t" 2041 "add $4, %%"REG_BP" \n\t" 2042 " jnc 1b \n\t" 2043 2044 "pop %%"REG_BP" \n\t" 2045#if defined(PIC) 2046 "pop %%"REG_b" \n\t" 2047#endif 2048 : "+a" (counter) 2049 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) 2050#if !defined(PIC) 2051 : "%"REG_b 2052#endif 2053 ); 2054 } 2055 else 2056 { 2057 uint8_t *offset = src+filterSize; 2058 long counter= -2*dstW; 2059 //filter-= counter*filterSize/2; 2060 filterPos-= counter/2; 2061 dst-= counter/2; 2062 __asm__ volatile( 2063 "pxor %%mm7, %%mm7 \n\t" 2064 ASMALIGN(4) 2065 "1: \n\t" 2066 "mov %2, %%"REG_c" \n\t" 2067 "movzwl (%%"REG_c", %0), %%eax \n\t" 2068 "movzwl 2(%%"REG_c", %0), %%edx \n\t" 2069 "mov %5, %%"REG_c" \n\t" 2070 "pxor %%mm4, %%mm4 \n\t" 2071 "pxor %%mm5, %%mm5 \n\t" 2072 "2: \n\t" 2073 "movq (%1), %%mm1 \n\t" 
2074 "movq (%1, %6), %%mm3 \n\t" 2075 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t" 2076 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t" 2077 "punpcklbw %%mm7, %%mm0 \n\t" 2078 "punpcklbw %%mm7, %%mm2 \n\t" 2079 "pmaddwd %%mm1, %%mm0 \n\t" 2080 "pmaddwd %%mm2, %%mm3 \n\t" 2081 "paddd %%mm3, %%mm5 \n\t" 2082 "paddd %%mm0, %%mm4 \n\t" 2083 "add $8, %1 \n\t" 2084 "add $4, %%"REG_c" \n\t" 2085 "cmp %4, %%"REG_c" \n\t" 2086 " jb 2b \n\t" 2087 "add %6, %1 \n\t" 2088 "movq %%mm4, %%mm0 \n\t" 2089 "punpckldq %%mm5, %%mm4 \n\t" 2090 "punpckhdq %%mm5, %%mm0 \n\t" 2091 "paddd %%mm0, %%mm4 \n\t" 2092 "psrad $7, %%mm4 \n\t" 2093 "packssdw %%mm4, %%mm4 \n\t" 2094 "mov %3, %%"REG_a" \n\t" 2095 "movd %%mm4, (%%"REG_a", %0) \n\t" 2096 "add $4, %0 \n\t" 2097 " jnc 1b \n\t" 2098 2099 : "+r" (counter), "+r" (filter) 2100 : "m" (filterPos), "m" (dst), "m"(offset), 2101 "m" (src), "r" (filterSize*2) 2102 : "%"REG_a, "%"REG_c, "%"REG_d 2103 ); 2104 } 2105#else 2106#if HAVE_ALTIVEC 2107 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize); 2108#else 2109 int i; 2110 for (i=0; i<dstW; i++) 2111 { 2112 int j; 2113 int srcPos= filterPos[i]; 2114 int val=0; 2115 //printf("filterPos: %d\n", filterPos[i]); 2116 for (j=0; j<filterSize; j++) 2117 { 2118 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); 2119 val += ((int)src[srcPos + j])*filter[filterSize*i + j]; 2120 } 2121 //filter += hFilterSize; 2122 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ... 
        //dst[i] = val>>7;
    }
#endif /* HAVE_ALTIVEC */
#endif /* HAVE_MMX */
}
      // *** horizontal scale Y line to temp buffer

/*
 * Horizontally scales one luma (Y) input line into the 16-bit temp buffer dst.
 *
 * First, packed/RGB/paletted source formats are converted to a plain 8-bit
 * luma line in formatConvBuffer via the per-format RENAME(...ToY) helpers;
 * src is then repointed at that buffer.  The actual scaling is done by one of:
 *   - RENAME(hScale)            (generic filter, used unless SWS_FAST_BILINEAR)
 *   - runtime-generated MMX2 code reached through "call *%4" on funnyYCode
 *     (presumably emitted elsewhere in this file -- not visible here)
 *   - a plain-x86 bilinear asm loop, or the C bilinear fallback.
 *
 * xInc is a 16.16 fixed-point source step (see the xpos>>16 / xpos&0xFFFF use
 * in the C fallback).  Finally, if the source and destination ranges differ
 * and the destination is not RGB/BGR, the luma samples are remapped between
 * limited and full range with the integer-rounded affine transforms below.
 */
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint32_t *pal)
{
    // Convert any non-planar-Y source into an 8-bit luma line in
    // formatConvBuffer, then scale from there.
    if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32_1)
    {
        // ALT32_CORR skips the leading alpha/filler byte of the _1 variant.
        RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32_1)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_MONOBLACK)
    {
        RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_MONOWHITE)
    {
        RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }

#if HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        // Generic filtered horizontal scaling path.
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if ARCH_X86 && CONFIG_GPL
#if HAVE_MMX2
        int i;
#if defined(PIC)
        // %ebx is the PIC/GOT register and cannot be clobbered; save/restore
        // it manually through this memory slot instead of listing it as a clobber.
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            // Scale by repeatedly calling the runtime-generated code block
            // funnyYCode ("call *%4"); mmx2FilterPos (%3) supplies source
            // offsets, mmx2Filter (%2) the coefficients.
            __asm__ volatile(
#if defined(PIC)
            "mov %%"REG_b", %5                      \n\t"
#endif
            "pxor %%mm7, %%mm7                      \n\t"
            "mov %0, %%"REG_c"                      \n\t"
            "mov %1, %%"REG_D"                      \n\t"
            "mov %2, %%"REG_d"                      \n\t"
            "mov %3, %%"REG_b"                      \n\t"
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            PREFETCH" (%%"REG_c")                   \n\t"
            PREFETCH" 32(%%"REG_c")                 \n\t"
            PREFETCH" 64(%%"REG_c")                 \n\t"

#if ARCH_X86_64

#define FUNNY_Y_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi                \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add   %%"REG_a", %%"REG_D"             \n\t"\
            "xor   %%"REG_a", %%"REG_a"             \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

#if defined(PIC)
            "mov %5, %%"REG_b"                      \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
               "m" (funnyYCode)
#if defined(PIC)
              ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
            );
            // The generated code cannot read past src[srcW-1]; fill the tail
            // of dst with the (scaled-by-128) last source pixel.
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif /* HAVE_MMX2 */
        long xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
        // Plain-x86 bilinear loop, unrolled 2x; the addw/adc pair walks the
        // 16.16 fixed-point source position (integer part in REG_d).
        __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"                   \n\t" // i
        "xor %%"REG_d", %%"REG_d"                   \n\t" // xx
        "xorl %%ecx, %%ecx                          \n\t" // 2*xalpha
        ASMALIGN(4)
        "1:                                         \n\t"
        "movzbl  (%0, %%"REG_d"), %%edi             \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_d"), %%esi             \n\t" //src[xx+1]
        "subl    %%edi, %%esi                       \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                       \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                       \n\t"
        "addl    %%edi, %%esi                       \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"                   \n\t"
        "shrl       $9, %%esi                       \n\t"
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                        \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"                   \n\t" //xx+= xInc>>8 + carry

        "movzbl  (%0, %%"REG_d"), %%edi             \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_d"), %%esi             \n\t" //src[xx+1]
        "subl    %%edi, %%esi                       \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                       \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                       \n\t"
        "addl    %%edi, %%esi                       \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"                   \n\t"
        "shrl       $9, %%esi                       \n\t"
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                        \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"                   \n\t" //xx+= xInc>>8 + carry


        "add $2, %%"REG_a"                          \n\t"
        "cmp %2, %%"REG_a"                          \n\t"
        " jb 1b                                     \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        // Portable C bilinear fallback (non-x86 or no GPL asm).
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
            xpos+=xInc;
        }
#endif /* ARCH_X86 */
    }

    // Luma range remap (limited <-> full) for non-RGB destinations.
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
        int i;
        //FIXME all pal and rgb srcFormats could do this convertion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
            for (i=0; i<dstWidth; i++)
                dst[i]= (dst[i]*14071 + 33561947)>>14;
        }else{
            for (i=0; i<dstWidth; i++)
                dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
        }
    }
}

/*
 * Horizontally scales one pair of chroma (U and V) input lines.
 *
 * Mirrors RENAME(hyscale) but works on two planes at once: U goes to dst[0..]
 * and V to dst[VOFW..].  Packed/RGB/paletted sources are first converted via
 * the RENAME(...ToUV) helpers into formatConvBuffer / formatConvBuffer+VOFW;
 * when the chroma is horizontally subsampled (c->chrSrcHSubSample) the
 * ..._half converters are used instead.  Gray/mono sources have no chroma
 * and return early.  Scaling paths (generic hScale, generated MMX2 code via
 * funnyUVCode, plain asm, C fallback) and the final range remap parallel
 * the luma function above.
 */
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint32_t *pal)
{
    if (srcFormat==PIX_FMT_YUYV422)
    {
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_UYVY422)
    {
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB32_1)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        else
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR32_1)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        else
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
    {
        // No chroma in grayscale/mono sources; nothing to scale.
        return;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }

#if HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        // Generic path: scale U and V planes independently.
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if ARCH_X86 && CONFIG_GPL
#if HAVE_MMX2
        int i;
#if defined(PIC)
        // Manual save slot for the PIC register %ebx (cannot be a clobber).
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            // First pass scales U (src1) via the generated funnyUVCode; the
            // registers are then re-seeded for V (src2) with dst advanced by
            // VOF, and the same generated code is run again.
            __asm__ volatile(
#if defined(PIC)
            "mov %%"REG_b", %6                      \n\t"
#endif
            "pxor %%mm7, %%mm7                      \n\t"
            "mov %0, %%"REG_c"                      \n\t"
            "mov %1, %%"REG_D"                      \n\t"
            "mov %2, %%"REG_d"                      \n\t"
            "mov %3, %%"REG_b"                      \n\t"
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            PREFETCH" (%%"REG_c")                   \n\t"
            PREFETCH" 32(%%"REG_c")                 \n\t"
            PREFETCH" 64(%%"REG_c")                 \n\t"

#if ARCH_X86_64

#define FUNNY_UV_CODE \
            "movl       (%%"REG_b"), %%esi          \n\t"\
            "call               *%4                 \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add          %%"REG_S", %%"REG_c"      \n\t"\
            "add          %%"REG_a", %%"REG_D"      \n\t"\
            "xor          %%"REG_a", %%"REG_a"      \n\t"\

#else

#define FUNNY_UV_CODE \
            "movl (%%"REG_b"), %%esi                \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add   %%"REG_a", %%"REG_D"             \n\t"\
            "xor   %%"REG_a", %%"REG_a"             \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            "mov %5, %%"REG_c"                      \n\t" // src
            "mov %1, %%"REG_D"                      \n\t" // buf1
            "add $"AV_STRINGIFY(VOF)", %%"REG_D"    \n\t"
            PREFETCH" (%%"REG_c")                   \n\t"
            PREFETCH" 32(%%"REG_c")                 \n\t"
            PREFETCH" 64(%%"REG_c")                 \n\t"

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"                      \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
               "m" (funnyUVCode), "m" (src2)
#if defined(PIC)
              ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
            );
            // Tail fill: pad both planes with the last source pixel (scaled
            // by 128), matching the luma path.
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
            {
                //printf("%d %d %d\n", dstWidth, i, srcW);
                dst[i] = src1[srcW-1]*128;
                dst[i+VOFW] = src2[srcW-1]*128;
            }
        }
        else
        {
#endif /* HAVE_MMX2 */
            long xInc_shr16 = (long) (xInc >> 16);
            uint16_t xInc_mask = xInc & 0xffff;
            // Plain-x86 bilinear loop; each iteration produces one U sample
            // (at REG_a*2) and one V sample (at VOF + REG_a*2).
            __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
            "xorl %%ecx, %%ecx                      \n\t" // 2*xalpha
            ASMALIGN(4)
            "1:                                     \n\t"
            "mov        %0, %%"REG_S"               \n\t"
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll      $16, %%edi                   \n\t"
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "mov        %1, %%"REG_D"               \n\t"
            "shrl       $9, %%esi                   \n\t"
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"

            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll      $16, %%edi                   \n\t"
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "mov        %1, %%"REG_D"               \n\t"
            "shrl       $9, %%esi                   \n\t"
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"

            "addw       %4, %%cx                    \n\t" //2*xalpha += xInc&0xFF
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>8 + carry
            "add        $1, %%"REG_a"               \n\t"
            "cmp        %2, %%"REG_a"               \n\t"
            " jb 1b                                 \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC 4.0. */
#if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
            :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
            :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
            "r" (src2)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
            );
#if HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        // Portable C bilinear fallback for both chroma planes.
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
            dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
            /* slower
            dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
            dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
            */
            xpos+=xInc;
        }
#endif /* ARCH_X86 */
    }
    // Chroma range remap (limited <-> full), applied to both planes.
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
        int i;
        //FIXME all pal and rgb srcFormats could do this convertion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
            for (i=0; i<dstWidth; i++){
                dst[i     ]= (dst[i     ]*1799 + 4081085)>>11; //1469
                dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
            }
        }else{
            for (i=0; i<dstWidth; i++){
                dst[i     ]= (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
                dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
            }
        }
    }
}

static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){

    /* load a few things into local vars to make the code more readable?
and faster */ 2665 const int srcW= c->srcW; 2666 const int dstW= c->dstW; 2667 const int dstH= c->dstH; 2668 const int chrDstW= c->chrDstW; 2669 const int chrSrcW= c->chrSrcW; 2670 const int lumXInc= c->lumXInc; 2671 const int chrXInc= c->chrXInc; 2672 const int dstFormat= c->dstFormat; 2673 const int srcFormat= c->srcFormat; 2674 const int flags= c->flags; 2675 const int canMMX2BeUsed= c->canMMX2BeUsed; 2676 int16_t *vLumFilterPos= c->vLumFilterPos; 2677 int16_t *vChrFilterPos= c->vChrFilterPos; 2678 int16_t *hLumFilterPos= c->hLumFilterPos; 2679 int16_t *hChrFilterPos= c->hChrFilterPos; 2680 int16_t *vLumFilter= c->vLumFilter; 2681 int16_t *vChrFilter= c->vChrFilter; 2682 int16_t *hLumFilter= c->hLumFilter; 2683 int16_t *hChrFilter= c->hChrFilter; 2684 int32_t *lumMmxFilter= c->lumMmxFilter; 2685 int32_t *chrMmxFilter= c->chrMmxFilter; 2686 const int vLumFilterSize= c->vLumFilterSize; 2687 const int vChrFilterSize= c->vChrFilterSize; 2688 const int hLumFilterSize= c->hLumFilterSize; 2689 const int hChrFilterSize= c->hChrFilterSize; 2690 int16_t **lumPixBuf= c->lumPixBuf; 2691 int16_t **chrPixBuf= c->chrPixBuf; 2692 const int vLumBufSize= c->vLumBufSize; 2693 const int vChrBufSize= c->vChrBufSize; 2694 uint8_t *funnyYCode= c->funnyYCode; 2695 uint8_t *funnyUVCode= c->funnyUVCode; 2696 uint8_t *formatConvBuffer= c->formatConvBuffer; 2697 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample; 2698 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); 2699 int lastDstY; 2700 uint32_t *pal=c->pal_yuv; 2701 2702 /* vars which will change and which we need to store back in the context */ 2703 int dstY= c->dstY; 2704 int lumBufIndex= c->lumBufIndex; 2705 int chrBufIndex= c->chrBufIndex; 2706 int lastInLumBuf= c->lastInLumBuf; 2707 int lastInChrBuf= c->lastInChrBuf; 2708 2709 if (isPacked(c->srcFormat)){ 2710 src[0]= 2711 src[1]= 2712 src[2]= src[0]; 2713 srcStride[0]= 2714 srcStride[1]= 2715 srcStride[2]= srcStride[0]; 2716 } 2717 srcStride[1]<<= 
c->vChrDrop; 2718 srcStride[2]<<= c->vChrDrop; 2719 2720 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2], 2721 // (int)dst[0], (int)dst[1], (int)dst[2]); 2722 2723#if 0 //self test FIXME move to a vfilter or something 2724 { 2725 static volatile int i=0; 2726 i++; 2727 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH) 2728 selfTest(src, srcStride, c->srcW, c->srcH); 2729 i--; 2730 } 2731#endif 2732 2733 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2], 2734 //dstStride[0],dstStride[1],dstStride[2]); 2735 2736 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0) 2737 { 2738 static int warnedAlready=0; //FIXME move this into the context perhaps 2739 if (flags & SWS_PRINT_INFO && !warnedAlready) 2740 { 2741 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n" 2742 " ->cannot do aligned memory accesses anymore\n"); 2743 warnedAlready=1; 2744 } 2745 } 2746 2747 /* Note the user might start scaling the picture in the middle so this 2748 will not get executed. This is not really intended but works 2749 currently, so people might do it. 
*/ 2750 if (srcSliceY ==0){ 2751 lumBufIndex=0; 2752 chrBufIndex=0; 2753 dstY=0; 2754 lastInLumBuf= -1; 2755 lastInChrBuf= -1; 2756 } 2757 2758 lastDstY= dstY; 2759 2760 for (;dstY < dstH; dstY++){ 2761 unsigned char *dest =dst[0]+dstStride[0]*dstY; 2762 const int chrDstY= dstY>>c->chrDstVSubSample; 2763 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY; 2764 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY; 2765 2766 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input 2767 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input 2768 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input 2769 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input 2770 2771 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n", 2772 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample); 2773 //handle holes (FAST_BILINEAR & weird filters) 2774 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; 2775 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; 2776 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize); 2777 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1); 2778 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1); 2779 2780 // Do we have enough lines in this slice to output the dstY line 2781 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample)) 2782 { 2783 //Do horizontal scaling 2784 while(lastInLumBuf < lastLumSrcY) 2785 { 2786 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; 2787 lumBufIndex++; 2788 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY); 2789 assert(lumBufIndex < 2*vLumBufSize); 2790 
assert(lastInLumBuf + 1 - srcSliceY < srcSliceH); 2791 assert(lastInLumBuf + 1 - srcSliceY >= 0); 2792 //printf("%d %d\n", lumBufIndex, vLumBufSize); 2793 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, 2794 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, 2795 funnyYCode, c->srcFormat, formatConvBuffer, 2796 c->lumMmx2Filter, c->lumMmx2FilterPos, pal); 2797 lastInLumBuf++; 2798 } 2799 while(lastInChrBuf < lastChrSrcY) 2800 { 2801 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; 2802 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; 2803 chrBufIndex++; 2804 assert(chrBufIndex < 2*vChrBufSize); 2805 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)); 2806 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0); 2807 //FIXME replace parameters through context struct (some at least) 2808 2809 if (!(isGray(srcFormat) || isGray(dstFormat))) 2810 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, 2811 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, 2812 funnyUVCode, c->srcFormat, formatConvBuffer, 2813 c->chrMmx2Filter, c->chrMmx2FilterPos, pal); 2814 lastInChrBuf++; 2815 } 2816 //wrap buf index around to stay inside the ring buffer 2817 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize; 2818 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize; 2819 } 2820 else // not enough lines left in this slice -> load the rest in the buffer 2821 { 2822 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n", 2823 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY, 2824 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize, 2825 vChrBufSize, vLumBufSize);*/ 2826 2827 //Do horizontal scaling 2828 while(lastInLumBuf+1 < srcSliceY + srcSliceH) 2829 { 2830 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; 2831 lumBufIndex++; 2832 assert(lumBufIndex < 2*vLumBufSize); 2833 
assert(lastInLumBuf + 1 - srcSliceY < srcSliceH); 2834 assert(lastInLumBuf + 1 - srcSliceY >= 0); 2835 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, 2836 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, 2837 funnyYCode, c->srcFormat, formatConvBuffer, 2838 c->lumMmx2Filter, c->lumMmx2FilterPos, pal); 2839 lastInLumBuf++; 2840 } 2841 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH)) 2842 { 2843 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; 2844 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; 2845 chrBufIndex++; 2846 assert(chrBufIndex < 2*vChrBufSize); 2847 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH); 2848 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0); 2849 2850 if (!(isGray(srcFormat) || isGray(dstFormat))) 2851 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, 2852 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, 2853 funnyUVCode, c->srcFormat, formatConvBuffer, 2854 c->chrMmx2Filter, c->chrMmx2FilterPos, pal); 2855 lastInChrBuf++; 2856 } 2857 //wrap buf index around to stay inside the ring buffer 2858 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize; 2859 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize; 2860 break; //we can't output a dstY line so let's try with the next slice 2861 } 2862 2863#if HAVE_MMX 2864 c->blueDither= ff_dither8[dstY&1]; 2865 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555) 2866 c->greenDither= ff_dither8[dstY&1]; 2867 else 2868 c->greenDither= ff_dither4[dstY&1]; 2869 c->redDither= ff_dither8[(dstY+1)&1]; 2870#endif 2871 if (dstY < dstH-2) 2872 { 2873 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; 2874 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; 2875#if HAVE_MMX 2876 int i; 2877 if (flags & SWS_ACCURATE_RND){ 2878 int s= APCK_SIZE / 8; 2879 for (i=0; 
i<vLumFilterSize; i+=2){ 2880 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ]; 2881 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)]; 2882 lumMmxFilter[s*i+APCK_COEF/4 ]= 2883 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ] 2884 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0); 2885 } 2886 for (i=0; i<vChrFilterSize; i+=2){ 2887 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ]; 2888 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)]; 2889 chrMmxFilter[s*i+APCK_COEF/4 ]= 2890 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ] 2891 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0); 2892 } 2893 }else{ 2894 for (i=0; i<vLumFilterSize; i++) 2895 { 2896 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i]; 2897 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32; 2898 lumMmxFilter[4*i+2]= 2899 lumMmxFilter[4*i+3]= 2900 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; 2901 } 2902 for (i=0; i<vChrFilterSize; i++) 2903 { 2904 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i]; 2905 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32; 2906 chrMmxFilter[4*i+2]= 2907 chrMmxFilter[4*i+3]= 2908 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; 2909 } 2910 } 2911#endif 2912 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){ 2913 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; 2914 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi 2915 RENAME(yuv2nv12X)(c, 2916 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 2917 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2918 dest, uDest, dstW, chrDstW, dstFormat); 2919 } 2920 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like 2921 { 2922 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; 2923 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi 2924 if (vLumFilterSize == 1 && 
vChrFilterSize == 1) // unscaled YV12 2925 { 2926 int16_t *lumBuf = lumPixBuf[0]; 2927 int16_t *chrBuf= chrPixBuf[0]; 2928 RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW); 2929 } 2930 else //General YV12 2931 { 2932 RENAME(yuv2yuvX)(c, 2933 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 2934 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2935 dest, uDest, vDest, dstW, chrDstW); 2936 } 2937 } 2938 else 2939 { 2940 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); 2941 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); 2942 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB 2943 { 2944 int chrAlpha= vChrFilter[2*dstY+1]; 2945 if(flags & SWS_FULL_CHR_H_INT){ 2946 yuv2rgbXinC_full(c, //FIXME write a packed1_full function 2947 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 2948 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2949 dest, dstW, dstY); 2950 }else{ 2951 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), 2952 dest, dstW, chrAlpha, dstFormat, flags, dstY); 2953 } 2954 } 2955 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB 2956 { 2957 int lumAlpha= vLumFilter[2*dstY+1]; 2958 int chrAlpha= vChrFilter[2*dstY+1]; 2959 lumMmxFilter[2]= 2960 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001; 2961 chrMmxFilter[2]= 2962 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001; 2963 if(flags & SWS_FULL_CHR_H_INT){ 2964 yuv2rgbXinC_full(c, //FIXME write a packed2_full function 2965 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 2966 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2967 dest, dstW, dstY); 2968 }else{ 2969 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), 2970 dest, dstW, lumAlpha, chrAlpha, dstY); 2971 } 2972 } 2973 else //general RGB 2974 { 2975 if(flags & SWS_FULL_CHR_H_INT){ 2976 yuv2rgbXinC_full(c, 2977 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, 
vLumFilterSize, 2978 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2979 dest, dstW, dstY); 2980 }else{ 2981 RENAME(yuv2packedX)(c, 2982 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 2983 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2984 dest, dstW, dstY); 2985 } 2986 } 2987 } 2988 } 2989 else // hmm looks like we can't use MMX here without overwriting this array's tail 2990 { 2991 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; 2992 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; 2993 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){ 2994 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; 2995 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi 2996 yuv2nv12XinC( 2997 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 2998 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 2999 dest, uDest, dstW, chrDstW, dstFormat); 3000 } 3001 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 3002 { 3003 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; 3004 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi 3005 yuv2yuvXinC( 3006 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, 3007 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 3008 dest, uDest, vDest, dstW, chrDstW); 3009 } 3010 else 3011 { 3012 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); 3013 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); 3014 if(flags & SWS_FULL_CHR_H_INT){ 3015 yuv2rgbXinC_full(c, 3016 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 3017 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 3018 dest, dstW, dstY); 3019 }else{ 3020 yuv2packedXinC(c, 3021 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, 3022 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, 
3023 dest, dstW, dstY); 3024 } 3025 } 3026 } 3027 } 3028 3029#if HAVE_MMX 3030 __asm__ volatile(SFENCE:::"memory"); 3031 __asm__ volatile(EMMS:::"memory"); 3032#endif 3033 /* store changed local vars back in the context */ 3034 c->dstY= dstY; 3035 c->lumBufIndex= lumBufIndex; 3036 c->chrBufIndex= chrBufIndex; 3037 c->lastInLumBuf= lastInLumBuf; 3038 c->lastInChrBuf= lastInChrBuf; 3039 3040 return dstY - lastDstY; 3041} 3042