/*
 * VC-1 and WMV3 - DSP functions MMX-optimized
 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "fpel.h"
#include "vc1dsp.h"

#if HAVE_6REGS && HAVE_INLINE_ASM

#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"

/** Add the rounder in mm7 to mm3 and mm4, then shift to normalize */
#define NORMALIZE_MMX(SHIFT)                            \
    "paddw     %%mm7, %%mm3           \n\t" /* +bias-r */ \
    "paddw     %%mm7, %%mm4           \n\t" /* +bias-r */ \
    "psraw     "SHIFT", %%mm3         \n\t"             \
    "psraw     "SHIFT", %%mm4         \n\t"

#define TRANSFER_DO_PACK(OP)                            \
    "packuswb  %%mm4, %%mm3           \n\t"             \
    OP((%2), %%mm3)                                     \
    "movq      %%mm3, (%2)            \n\t"

#define TRANSFER_DONT_PACK(OP)                          \
    OP(0(%2), %%mm3)                                    \
    OP(8(%2), %%mm4)                                    \
    "movq      %%mm3, 0(%2)           \n\t"             \
    "movq      %%mm4, 8(%2)           \n\t"

/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
#define DO_UNPACK(reg)   "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)

/** Compute the rounder 32-r or 8-r and unpack it to mm7 */
#define LOAD_ROUNDER_MMX(ROUND)                         \
    "movd      "ROUND", %%mm7         \n\t"             \
    "punpcklwd %%mm7, %%mm7           \n\t"             \
    "punpckldq %%mm7, %%mm7           \n\t"

#define SHIFT2_LINE(OFF, R0, R1, R2, R3)                \
    "paddw     %%mm"#R2", %%mm"#R1"   \n\t"             \
    "movd      (%0,%3), %%mm"#R0"     \n\t"             \
    "pmullw    %%mm6, %%mm"#R1"       \n\t"             \
    "punpcklbw %%mm0, %%mm"#R0"       \n\t"             \
    "movd      (%0,%2), %%mm"#R3"     \n\t"             \
    "psubw     %%mm"#R0", %%mm"#R1"   \n\t"             \
    "punpcklbw %%mm0, %%mm"#R3"       \n\t"             \
    "paddw     %%mm7, %%mm"#R1"       \n\t"             \
    "psubw     %%mm"#R3", %%mm"#R1"   \n\t"             \
    "psraw     %4, %%mm"#R1"          \n\t"             \
    "movq      %%mm"#R1", "#OFF"(%1)  \n\t"             \
    "add       %2, %0                 \n\t"

/** Sacrificing mm6 makes it possible to pipeline loads from src */
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                       const uint8_t *src, x86_reg stride,
                                       int rnd, int64_t shift)
{
    __asm__ volatile(
        "mov       $3, %%"REG_c"           \n\t"
        LOAD_ROUNDER_MMX("%5")
        "movq      "MANGLE(ff_pw_9)", %%mm6 \n\t"
        "1:                                \n\t"
        "movd      (%0), %%mm2             \n\t"
        "add       %2, %0                  \n\t"
        "movd      (%0), %%mm3             \n\t"
        "punpcklbw %%mm0, %%mm2            \n\t"
        "punpcklbw %%mm0, %%mm3            \n\t"
        SHIFT2_LINE(  0, 1, 2, 3, 4)
        SHIFT2_LINE( 24, 2, 3, 4, 1)
        SHIFT2_LINE( 48, 3, 4, 1, 2)
        SHIFT2_LINE( 72, 4, 1, 2, 3)
        SHIFT2_LINE( 96, 1, 2, 3, 4)
        SHIFT2_LINE(120, 2, 3, 4, 1)
        SHIFT2_LINE(144, 3, 4, 1, 2)
        SHIFT2_LINE(168, 4, 1, 2, 3)
        "sub       %6, %0                  \n\t"
        "add       $8, %1                  \n\t"
        "dec       %%"REG_c"               \n\t"
        "jnz 1b                            \n\t"
        : "+r"(src), "+r"(dst)
        : "r"(stride), "r"(-2*stride),
          "m"(shift), "m"(rnd), "r"(9*stride-4)
          NAMED_CONSTRAINTS_ADD(ff_pw_9)
        : "%"REG_c, "memory"
    );
}

/**
 * The data is already unpacked, so some operations can be performed directly
 * on memory operands.
 */
#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
                                             const int16_t *src, int rnd)\
{\
    int h = 8;\
\
    src -= 1;\
    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
    __asm__ volatile(\
        LOAD_ROUNDER_MMX("%4")\
        "movq      "MANGLE(ff_pw_128)", %%mm6\n\t"\
        "movq      "MANGLE(ff_pw_9)", %%mm5 \n\t"\
        "1:                                \n\t"\
        "movq      2*0+0(%1), %%mm1        \n\t"\
        "movq      2*0+8(%1), %%mm2        \n\t"\
        "movq      2*1+0(%1), %%mm3        \n\t"\
        "movq      2*1+8(%1), %%mm4        \n\t"\
        "paddw     2*3+0(%1), %%mm1        \n\t"\
        "paddw     2*3+8(%1), %%mm2        \n\t"\
        "paddw     2*2+0(%1), %%mm3        \n\t"\
        "paddw     2*2+8(%1), %%mm4        \n\t"\
        "pmullw    %%mm5, %%mm3            \n\t"\
        "pmullw    %%mm5, %%mm4            \n\t"\
        "psubw     %%mm1, %%mm3            \n\t"\
        "psubw     %%mm2, %%mm4            \n\t"\
        NORMALIZE_MMX("$7")\
        /* Remove bias */\
        "paddw     %%mm6, %%mm3            \n\t"\
        "paddw     %%mm6, %%mm4            \n\t"\
        TRANSFER_DO_PACK(OP)\
        "add       $24, %1                 \n\t"\
        "add       %3, %2                  \n\t"\
        "decl      %0                      \n\t"\
        "jnz 1b                            \n\t"\
        : "+r"(h), "+r" (src), "+r" (dst)\
        : "r"(stride), "m"(rnd)\
          NAMED_CONSTRAINTS_ADD(ff_pw_128,ff_pw_9)\
        : "memory"\
    );\
}

VC1_HOR_16b_SHIFT2(OP_PUT, put_)
VC1_HOR_16b_SHIFT2(OP_AVG, avg_)


/**
 * Purely vertical or horizontal 1/2 shift interpolation.
 * Sacrifice mm6 for the *9 factor.
 */
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
                                     x86_reg stride, int rnd, x86_reg offset)\
{\
    rnd = 8-rnd;\
    __asm__ volatile(\
        "mov       $8, %%"REG_c"           \n\t"\
        LOAD_ROUNDER_MMX("%5")\
        "movq      "MANGLE(ff_pw_9)", %%mm6\n\t"\
        "1:                                \n\t"\
        "movd      0(%0   ), %%mm3         \n\t"\
        "movd      4(%0   ), %%mm4         \n\t"\
        "movd      0(%0,%2), %%mm1         \n\t"\
        "movd      4(%0,%2), %%mm2         \n\t"\
        "add       %2, %0                  \n\t"\
        "punpcklbw %%mm0, %%mm3            \n\t"\
        "punpcklbw %%mm0, %%mm4            \n\t"\
        "punpcklbw %%mm0, %%mm1            \n\t"\
        "punpcklbw %%mm0, %%mm2            \n\t"\
        "paddw     %%mm1, %%mm3            \n\t"\
        "paddw     %%mm2, %%mm4            \n\t"\
        "movd      0(%0,%3), %%mm1         \n\t"\
        "movd      4(%0,%3), %%mm2         \n\t"\
        "pmullw    %%mm6, %%mm3            \n\t" /* 0,9,9,0*/\
        "pmullw    %%mm6, %%mm4            \n\t" /* 0,9,9,0*/\
        "punpcklbw %%mm0, %%mm1            \n\t"\
        "punpcklbw %%mm0, %%mm2            \n\t"\
        "psubw     %%mm1, %%mm3            \n\t" /*-1,9,9,0*/\
        "psubw     %%mm2, %%mm4            \n\t" /*-1,9,9,0*/\
        "movd      0(%0,%2), %%mm1         \n\t"\
        "movd      4(%0,%2), %%mm2         \n\t"\
        "punpcklbw %%mm0, %%mm1            \n\t"\
        "punpcklbw %%mm0, %%mm2            \n\t"\
        "psubw     %%mm1, %%mm3            \n\t" /*-1,9,9,-1*/\
        "psubw     %%mm2, %%mm4            \n\t" /*-1,9,9,-1*/\
        NORMALIZE_MMX("$4")\
        "packuswb  %%mm4, %%mm3            \n\t"\
        OP((%1), %%mm3)\
        "movq      %%mm3, (%1)             \n\t"\
        "add       %6, %0                  \n\t"\
        "add       %4, %1                  \n\t"\
        "dec       %%"REG_c"               \n\t"\
        "jnz 1b                            \n\t"\
        : "+r"(src), "+r"(dst)\
        : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
          "g"(stride-offset)\
          NAMED_CONSTRAINTS_ADD(ff_pw_9)\
        : "%"REG_c, "memory"\
    );\
}

VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)

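#if 0
/* Illustrative plain-C reference for the 1/2 shift filter implemented by
 * VC1_SHIFT2 above.  This is an assumption added for documentation only: it
 * is not part of the original file, is never compiled, and vc1_shift2_ref()
 * is a hypothetical name.  The taps are (-1, 9, 9, -1), rounded and
 * normalized by >> 4; `off' is the step of the pass (1 for horizontal, the
 * line stride for vertical) and `rnd' is the value passed to the function,
 * i.e. before the internal `rnd = 8 - rnd' conversion.  av_clip_uint8() is
 * the saturation helper from libavutil/common.h, matching packuswb here. */
static int vc1_shift2_ref(const uint8_t *p, int off, int rnd)
{
    int v = -p[-off] + 9 * p[0] + 9 * p[off] - p[2 * off];
    return av_clip_uint8((v + 8 - rnd) >> 4);
}
#endif
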
/**
 * Core of the 1/4 and 3/4 shift bicubic interpolation.
 *
 * @param UNPACK  Macro unpacking arguments from 8 to 16 bits (can be empty).
 * @param MOVQ    "movd 1" or "movq 2", depending on whether the data read is
 *                already unpacked.
 * @param A1      Address of 1st tap (beware of unpacked/packed).
 * @param A2      Address of 2nd tap.
 * @param A3      Address of 3rd tap.
 * @param A4      Address of 4th tap.
 */
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4)       \
    MOVQ "*0+"A1", %%mm1           \n\t"                        \
    MOVQ "*4+"A1", %%mm2           \n\t"                        \
    UNPACK("%%mm1")                                             \
    UNPACK("%%mm2")                                             \
    "pmullw    "MANGLE(ff_pw_3)", %%mm1\n\t"                    \
    "pmullw    "MANGLE(ff_pw_3)", %%mm2\n\t"                    \
    MOVQ "*0+"A2", %%mm3           \n\t"                        \
    MOVQ "*4+"A2", %%mm4           \n\t"                        \
    UNPACK("%%mm3")                                             \
    UNPACK("%%mm4")                                             \
    "pmullw    %%mm6, %%mm3        \n\t" /* *18 */              \
    "pmullw    %%mm6, %%mm4        \n\t" /* *18 */              \
    "psubw     %%mm1, %%mm3        \n\t" /* 18,-3 */            \
    "psubw     %%mm2, %%mm4        \n\t" /* 18,-3 */            \
    MOVQ "*0+"A4", %%mm1           \n\t"                        \
    MOVQ "*4+"A4", %%mm2           \n\t"                        \
    UNPACK("%%mm1")                                             \
    UNPACK("%%mm2")                                             \
    "psllw     $2, %%mm1           \n\t" /* 4* */               \
    "psllw     $2, %%mm2           \n\t" /* 4* */               \
    "psubw     %%mm1, %%mm3        \n\t" /* -4,18,-3 */         \
    "psubw     %%mm2, %%mm4        \n\t" /* -4,18,-3 */         \
    MOVQ "*0+"A3", %%mm1           \n\t"                        \
    MOVQ "*4+"A3", %%mm2           \n\t"                        \
    UNPACK("%%mm1")                                             \
    UNPACK("%%mm2")                                             \
    "pmullw    %%mm5, %%mm1        \n\t" /* *53 */              \
    "pmullw    %%mm5, %%mm2        \n\t" /* *53 */              \
    "paddw     %%mm1, %%mm3        \n\t" /* 4,53,18,-3 */       \
    "paddw     %%mm2, %%mm4        \n\t" /* 4,53,18,-3 */

/**
 * Macro to build the vertical 16-bit version of vc1_put_shift[13].
 * Here, offset=src_stride. The parameters passed as A1 to A4 must use
 * %3 (src_stride) and %4 (3*src_stride).
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)            \
static void                                                     \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
                                 x86_reg src_stride,            \
                                 int rnd, int64_t shift)        \
{                                                               \
    int h = 8;                                                  \
    src -= src_stride;                                          \
    __asm__ volatile(                                           \
        LOAD_ROUNDER_MMX("%5")                                  \
        "movq      "MANGLE(ff_pw_53)", %%mm5\n\t"               \
        "movq      "MANGLE(ff_pw_18)", %%mm6\n\t"               \
        ".p2align 3                        \n\t"                \
        "1:                                \n\t"                \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("%6")                                     \
        TRANSFER_DONT_PACK(OP_PUT)                              \
        /* Last 3 (in fact 4) bytes on the line */              \
        "movd      8+"A1", %%mm1           \n\t"                \
        DO_UNPACK("%%mm1")                                      \
        "movq      %%mm1, %%mm3            \n\t"                \
        "paddw     %%mm1, %%mm1            \n\t"                \
        "paddw     %%mm3, %%mm1            \n\t" /* 3* */       \
        "movd      8+"A2", %%mm3           \n\t"                \
        DO_UNPACK("%%mm3")                                      \
        "pmullw    %%mm6, %%mm3            \n\t" /* *18 */      \
        "psubw     %%mm1, %%mm3            \n\t" /* 18,-3 */    \
        "movd      8+"A3", %%mm1           \n\t"                \
        DO_UNPACK("%%mm1")                                      \
        "pmullw    %%mm5, %%mm1            \n\t" /* *53 */      \
        "paddw     %%mm1, %%mm3            \n\t" /* 53,18,-3 */ \
        "movd      8+"A4", %%mm1           \n\t"                \
        DO_UNPACK("%%mm1")                                      \
        "psllw     $2, %%mm1               \n\t" /* 4* */       \
        "psubw     %%mm1, %%mm3            \n\t"                \
        "paddw     %%mm7, %%mm3            \n\t"                \
        "psraw     %6, %%mm3               \n\t"                \
        "movq      %%mm3, 16(%2)           \n\t"                \
        "add       %3, %1                  \n\t"                \
        "add       $24, %2                 \n\t"                \
        "decl      %0                      \n\t"                \
        "jnz 1b                            \n\t"                \
        : "+r"(h), "+r" (src), "+r" (dst)                       \
        : "r"(src_stride), "r"(3*src_stride),                   \
          "m"(rnd), "m"(shift)                                  \
          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18)      \
        : "memory"                                              \
    );                                                          \
}

/**
 * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
 * Here the offsets are fixed 16-bit steps, so the parameters passed as A1 to
 * A4 should be simple.
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void                                                     \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
                                       const int16_t *src, int rnd) \
{                                                               \
    int h = 8;                                                  \
    src -= 1;                                                   \
    rnd -= (-4+58+13-3)*256; /* Add -256 bias */                \
    __asm__ volatile(                                           \
        LOAD_ROUNDER_MMX("%4")                                  \
        "movq      "MANGLE(ff_pw_18)", %%mm6   \n\t"            \
        "movq      "MANGLE(ff_pw_53)", %%mm5   \n\t"            \
        ".p2align 3                        \n\t"                \
        "1:                                \n\t"                \
        MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
        NORMALIZE_MMX("$7")                                     \
        /* Remove bias */                                       \
        "paddw     "MANGLE(ff_pw_128)", %%mm3  \n\t"            \
        "paddw     "MANGLE(ff_pw_128)", %%mm4  \n\t"            \
        TRANSFER_DO_PACK(OP)                                    \
        "add       $24, %1                 \n\t"                \
        "add       %3, %2                  \n\t"                \
        "decl      %0                      \n\t"                \
        "jnz 1b                            \n\t"                \
        : "+r"(h), "+r" (src), "+r" (dst)                       \
        : "r"(stride), "m"(rnd)                                 \
          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128) \
        : "memory"                                              \
    );                                                          \
}

/**
 * Macro to build the 8-bit, any-direction version of vc1_put_shift[13].
 * Here, offset=src_stride. The parameters passed as A1 to A4 must use
 * %3 (offset) and %4 (3*offset).
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)     \
static void                                                     \
OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
                              x86_reg stride, int rnd, x86_reg offset) \
{                                                               \
    int h = 8;                                                  \
    src -= offset;                                              \
    rnd = 32-rnd;                                               \
    __asm__ volatile (                                          \
        LOAD_ROUNDER_MMX("%6")                                  \
        "movq      "MANGLE(ff_pw_53)", %%mm5   \n\t"            \
        "movq      "MANGLE(ff_pw_18)", %%mm6   \n\t"            \
        ".p2align 3                        \n\t"                \
        "1:                                \n\t"                \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("$6")                                     \
        TRANSFER_DO_PACK(OP)                                    \
        "add       %5, %1                  \n\t"                \
        "add       %5, %2                  \n\t"                \
        "decl      %0                      \n\t"                \
        "jnz 1b                            \n\t"                \
        : "+r"(h), "+r" (src), "+r" (dst)                       \
        : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd)     \
          NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3)      \
        : "memory"                                              \
    );                                                          \
}

/** 1/4 shift bicubic interpolation */
MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)

/** 3/4 shift bicubic interpolation */
MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)

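#if 0
/* Illustrative plain-C reference for the single-pass 8-bit bicubic filters
 * instantiated above.  This is an assumption added for documentation only:
 * it is not part of the original file, is never compiled, and the _ref names
 * are hypothetical.  For a sample p[0] and a tap distance `off' (1 or the
 * line stride), the 1/4 shift uses taps (-4, 53, 18, -3) and the 3/4 shift
 * the mirrored (-3, 18, 53, -4), both rounded and normalized by >> 6.
 * `rnd' is the value passed to the function, i.e. before the internal
 * `rnd = 32 - rnd' conversion done by MSPEL_FILTER13_8B. */
static int vc1_shift1_ref(const uint8_t *p, int off, int rnd)
{
    int v = -4 * p[-off] + 53 * p[0] + 18 * p[off] - 3 * p[2 * off];
    return av_clip_uint8((v + 32 - rnd) >> 6);
}

static int vc1_shift3_ref(const uint8_t *p, int off, int rnd)
{
    int v = -3 * p[-off] + 18 * p[0] + 53 * p[off] - 4 * p[2 * off];
    return av_clip_uint8((v + 32 - rnd) >> 6);
}
#endif
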
typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);

/**
 * Interpolate fractional pel values by applying the proper vertical then
 * horizontal filter.
 *
 * @param dst    Destination buffer for interpolated pels.
 * @param src    Source buffer.
 * @param stride Stride for both src and dst buffers.
 * @param hmode  Horizontal filter (expressed in quarter-pel shifts).
 * @param vmode  Vertical filter (expressed in quarter-pel shifts).
 * @param rnd    Rounding bias.
 */
#define VC1_MSPEL_MC(OP)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                               int hmode, int vmode, int rnd)\
{\
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
        { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
        { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
        { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
    __asm__ volatile(\
        "pxor %%mm0, %%mm0         \n\t"\
        ::: "memory"\
    );\
\
    if (vmode) { /* Vertical filter to apply */\
        if (hmode) { /* Horizontal filter to apply, output to tmp */\
            static const int shift_value[] = { 0, 5, 1, 5 };\
            int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
            int r;\
            DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
\
            r = (1<<(shift-1)) + rnd-1;\
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
            return;\
        }\
        else { /* No horizontal filter, output 8 lines to dst */\
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
            return;\
        }\
    }\
\
    /* Horizontal mode with no vertical mode */\
    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
} \
static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
                                  int stride, int hmode, int vmode, int rnd)\
{ \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
    dst += 8*stride; src += 8*stride; \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
}

VC1_MSPEL_MC(put_)
VC1_MSPEL_MC(avg_)

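#if 0
/* Illustrative plain-C outline of the two-pass path taken by vc1_mspel_mc()
 * above when both hmode and vmode are non-zero.  This is an assumption added
 * for documentation only: it is not part of the original file, is never
 * compiled, and ver_filter()/hor_filter() are hypothetical stand-ins for the
 * vc1_put_shift_ver_16bits[]/vc1_put_shift_hor_16bits[] entries; the per-pass
 * rounding and bias handling is omitted.  The block is first filtered
 * vertically at 16-bit precision into an 8x12 temporary that also covers the
 * neighbouring columns needed by the horizontal taps, then filtered
 * horizontally and packed back to 8 bits. */
static void vc1_mspel_mc_ref(uint8_t *dst, const uint8_t *src, int stride,
                             int hmode, int vmode)
{
    int16_t tmp[8][12];
    int x, y;

    for (y = 0; y < 8; y++)
        for (x = 0; x < 12; x++)        /* columns -1 .. 10 of the block */
            tmp[y][x] = ver_filter(src + (x - 1) + y * stride, stride, vmode);

    for (y = 0; y < 8; y++)
        for (x = 0; x < 8; x++)
            dst[y * stride + x] = hor_filter(&tmp[y][x + 1], hmode);
}
#endif
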
/** Macro to ease declaring the bicubic filter interpolation functions */
#define DECLARE_FUNCTION(a, b)                                        \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst,          \
                                               const uint8_t *src,    \
                                               ptrdiff_t stride,      \
                                               int rnd)               \
{                                                                     \
    put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                    \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst,       \
                                                  const uint8_t *src, \
                                                  ptrdiff_t stride,   \
                                                  int rnd)            \
{                                                                     \
    avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                    \
}\
static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst,       \
                                                  const uint8_t *src, \
                                                  ptrdiff_t stride,   \
                                                  int rnd)            \
{                                                                     \
    put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                 \
}\
static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst,       \
                                                     const uint8_t *src, \
                                                     ptrdiff_t stride,   \
                                                     int rnd)            \
{                                                                     \
    avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                 \
}

DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)

static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd          %0, %%mm0   \n\t"
        "pshufw $0, %%mm0, %%mm0   \n\t"
        "pxor       %%mm1, %%mm1   \n\t"
        "psubw      %%mm0, %%mm1   \n\t"
        "packuswb   %%mm0, %%mm0   \n\t"
        "packuswb   %%mm1, %%mm1   \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd          %0, %%mm2   \n\t"
        "movd          %1, %%mm3   \n\t"
        "movd          %2, %%mm4   \n\t"
        "movd          %3, %%mm5   \n\t"
        "paddusb    %%mm0, %%mm2   \n\t"
        "paddusb    %%mm0, %%mm3   \n\t"
        "paddusb    %%mm0, %%mm4   \n\t"
        "paddusb    %%mm0, %%mm5   \n\t"
        "psubusb    %%mm1, %%mm2   \n\t"
        "psubusb    %%mm1, %%mm3   \n\t"
        "psubusb    %%mm1, %%mm4   \n\t"
        "psubusb    %%mm1, %%mm5   \n\t"
        "movd       %%mm2, %0      \n\t"
        "movd       %%mm3, %1      \n\t"
        "movd       %%mm4, %2      \n\t"
        "movd       %%mm5, %3      \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;
    __asm__ volatile(
        "movd          %0, %%mm0   \n\t"
        "pshufw $0, %%mm0, %%mm0   \n\t"
        "pxor       %%mm1, %%mm1   \n\t"
        "psubw      %%mm0, %%mm1   \n\t"
        "packuswb   %%mm0, %%mm0   \n\t"
        "packuswb   %%mm1, %%mm1   \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd          %0, %%mm2   \n\t"
        "movd          %1, %%mm3   \n\t"
        "movd          %2, %%mm4   \n\t"
        "movd          %3, %%mm5   \n\t"
        "paddusb    %%mm0, %%mm2   \n\t"
        "paddusb    %%mm0, %%mm3   \n\t"
        "paddusb    %%mm0, %%mm4   \n\t"
        "paddusb    %%mm0, %%mm5   \n\t"
        "psubusb    %%mm1, %%mm2   \n\t"
        "psubusb    %%mm1, %%mm3   \n\t"
        "psubusb    %%mm1, %%mm4   \n\t"
        "psubusb    %%mm1, %%mm5   \n\t"
        "movd       %%mm2, %0      \n\t"
        "movd       %%mm3, %1      \n\t"
        "movd       %%mm4, %2      \n\t"
        "movd       %%mm5, %3      \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movd          %0, %%mm2   \n\t"
        "movd          %1, %%mm3   \n\t"
        "movd          %2, %%mm4   \n\t"
        "movd          %3, %%mm5   \n\t"
        "paddusb    %%mm0, %%mm2   \n\t"
        "paddusb    %%mm0, %%mm3   \n\t"
        "paddusb    %%mm0, %%mm4   \n\t"
        "paddusb    %%mm0, %%mm5   \n\t"
        "psubusb    %%mm1, %%mm2   \n\t"
        "psubusb    %%mm1, %%mm3   \n\t"
        "psubusb    %%mm1, %%mm4   \n\t"
        "psubusb    %%mm1, %%mm5   \n\t"
        "movd       %%mm2, %0      \n\t"
        "movd       %%mm3, %1      \n\t"
        "movd       %%mm4, %2      \n\t"
        "movd       %%mm5, %3      \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd          %0, %%mm0   \n\t"
        "pshufw $0, %%mm0, %%mm0   \n\t"
        "pxor       %%mm1, %%mm1   \n\t"
        "psubw      %%mm0, %%mm1   \n\t"
        "packuswb   %%mm0, %%mm0   \n\t"
        "packuswb   %%mm1, %%mm1   \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movq          %0, %%mm2   \n\t"
        "movq          %1, %%mm3   \n\t"
        "movq          %2, %%mm4   \n\t"
        "movq          %3, %%mm5   \n\t"
        "paddusb    %%mm0, %%mm2   \n\t"
        "paddusb    %%mm0, %%mm3   \n\t"
        "paddusb    %%mm0, %%mm4   \n\t"
        "paddusb    %%mm0, %%mm5   \n\t"
        "psubusb    %%mm1, %%mm2   \n\t"
        "psubusb    %%mm1, %%mm3   \n\t"
        "psubusb    %%mm1, %%mm4   \n\t"
        "psubusb    %%mm1, %%mm5   \n\t"
        "movq       %%mm2, %0      \n\t"
        "movq       %%mm3, %1      \n\t"
        "movq       %%mm4, %2      \n\t"
        "movq       %%mm5, %3      \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;
    __asm__ volatile(
        "movd          %0, %%mm0   \n\t"
        "pshufw $0, %%mm0, %%mm0   \n\t"
        "pxor       %%mm1, %%mm1   \n\t"
        "psubw      %%mm0, %%mm1   \n\t"
        "packuswb   %%mm0, %%mm0   \n\t"
        "packuswb   %%mm1, %%mm1   \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movq          %0, %%mm2   \n\t"
        "movq          %1, %%mm3   \n\t"
        "movq          %2, %%mm4   \n\t"
        "movq          %3, %%mm5   \n\t"
        "paddusb    %%mm0, %%mm2   \n\t"
        "paddusb    %%mm0, %%mm3   \n\t"
        "paddusb    %%mm0, %%mm4   \n\t"
        "paddusb    %%mm0, %%mm5   \n\t"
        "psubusb    %%mm1, %%mm2   \n\t"
        "psubusb    %%mm1, %%mm3   \n\t"
        "psubusb    %%mm1, %%mm4   \n\t"
        "psubusb    %%mm1, %%mm5   \n\t"
        "movq       %%mm2, %0      \n\t"
        "movq       %%mm3, %1      \n\t"
        "movq       %%mm4, %2      \n\t"
        "movq       %%mm5, %3      \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movq          %0, %%mm2   \n\t"
        "movq          %1, %%mm3   \n\t"
        "movq          %2, %%mm4   \n\t"
        "movq          %3, %%mm5   \n\t"
        "paddusb    %%mm0, %%mm2   \n\t"
        "paddusb    %%mm0, %%mm3   \n\t"
        "paddusb    %%mm0, %%mm4   \n\t"
        "paddusb    %%mm0, %%mm5   \n\t"
        "psubusb    %%mm1, %%mm2   \n\t"
        "psubusb    %%mm1, %%mm3   \n\t"
        "psubusb    %%mm1, %%mm4   \n\t"
        "psubusb    %%mm1, %%mm5   \n\t"
        "movq       %%mm2, %0      \n\t"
        "movq       %%mm3, %1      \n\t"
        "movq       %%mm4, %2      \n\t"
        "movq       %%mm5, %3      \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

#if HAVE_MMX_EXTERNAL
static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int rnd)
{
    ff_put_pixels8_mmx(dst, src, stride, 8);
}
static void put_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride, int rnd)
{
    ff_put_pixels16_mmx(dst, src, stride, 16);
}
static void avg_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int rnd)
{
    ff_avg_pixels8_mmx(dst, src, stride, 8);
}
static void avg_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride, int rnd)
{
    ff_avg_pixels16_mmx(dst, src, stride, 16);
}
#endif

#define FN_ASSIGN(OP, X, Y, INSN) \
    dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
    dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN

av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
#if HAVE_MMX_EXTERNAL
    FN_ASSIGN(put_, 0, 0, _mmx);
    FN_ASSIGN(avg_, 0, 0, _mmx);
#endif
    FN_ASSIGN(put_, 0, 1, _mmx);
    FN_ASSIGN(put_, 0, 2, _mmx);
    FN_ASSIGN(put_, 0, 3, _mmx);

    FN_ASSIGN(put_, 1, 0, _mmx);
    FN_ASSIGN(put_, 1, 1, _mmx);
    FN_ASSIGN(put_, 1, 2, _mmx);
    FN_ASSIGN(put_, 1, 3, _mmx);

    FN_ASSIGN(put_, 2, 0, _mmx);
    FN_ASSIGN(put_, 2, 1, _mmx);
    FN_ASSIGN(put_, 2, 2, _mmx);
    FN_ASSIGN(put_, 2, 3, _mmx);

    FN_ASSIGN(put_, 3, 0, _mmx);
    FN_ASSIGN(put_, 3, 1, _mmx);
    FN_ASSIGN(put_, 3, 2, _mmx);
    FN_ASSIGN(put_, 3, 3, _mmx);
}

av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
    FN_ASSIGN(avg_, 0, 1, _mmxext);
    FN_ASSIGN(avg_, 0, 2, _mmxext);
    FN_ASSIGN(avg_, 0, 3, _mmxext);

    FN_ASSIGN(avg_, 1, 0, _mmxext);
    FN_ASSIGN(avg_, 1, 1, _mmxext);
    FN_ASSIGN(avg_, 1, 2, _mmxext);
    FN_ASSIGN(avg_, 1, 3, _mmxext);

    FN_ASSIGN(avg_, 2, 0, _mmxext);
    FN_ASSIGN(avg_, 2, 1, _mmxext);
    FN_ASSIGN(avg_, 2, 2, _mmxext);
    FN_ASSIGN(avg_, 2, 3, _mmxext);

    FN_ASSIGN(avg_, 3, 0, _mmxext);
    FN_ASSIGN(avg_, 3, 1, _mmxext);
    FN_ASSIGN(avg_, 3, 2, _mmxext);
    FN_ASSIGN(avg_, 3, 3, _mmxext);

    dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
    dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
    dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
    dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
}
#endif /* HAVE_6REGS && HAVE_INLINE_ASM */