1/* 2 * MMX optimized DSP utils 3 * Copyright (c) 2000, 2001 Fabrice Bellard 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> 5 * 6 * This file is part of FFmpeg. 7 * 8 * FFmpeg is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU Lesser General Public 10 * License as published by the Free Software Foundation; either 11 * version 2.1 of the License, or (at your option) any later version. 12 * 13 * FFmpeg is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public 19 * License along with FFmpeg; if not, write to the Free Software 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21 * 22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> 23 */ 24 25#include "libavutil/x86_cpu.h" 26#include "libavcodec/dsputil.h" 27#include "libavcodec/h263.h" 28#include "libavcodec/mpegvideo.h" 29#include "libavcodec/simple_idct.h" 30#include "dsputil_mmx.h" 31#include "mmx.h" 32#include "vp3dsp_mmx.h" 33#include "vp3dsp_sse2.h" 34#include "vp6dsp_mmx.h" 35#include "vp6dsp_sse2.h" 36#include "idct_xvid.h" 37 38//#undef NDEBUG 39//#include <assert.h> 40 41int mm_flags; /* multimedia extension flags */ 42 43/* pixel operations */ 44DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL; 45DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL; 46 47DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) = 48{0x8000000080000000ULL, 0x8000000080000000ULL}; 49 50DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL; 51DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL; 52DECLARE_ALIGNED_16(const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; 53DECLARE_ALIGNED_16(const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL}; 54DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL; 55DECLARE_ALIGNED_16(const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL}; 56DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL; 57DECLARE_ALIGNED_16(const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL}; 58DECLARE_ALIGNED_16(const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL}; 59DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL; 60DECLARE_ALIGNED_16(const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL}; 61DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; 62DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL; 63DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; 64 65DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL; 66DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL; 67DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; 68DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; 69DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; 70DECLARE_ALIGNED_8 (const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; 71DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL; 72DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; 73 74DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 }; 75DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 }; 76 77#define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::) 78#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::) 79 80#define MOVQ_BFE(regd) \ 81 __asm__ volatile ( \ 82 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ 83 "paddb %%" #regd ", %%" #regd " \n\t" ::) 84 85#ifndef PIC 86#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone)) 87#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo)) 88#else 89// for shared library it's better to use this way for accessing constants 90// pcmpeqd -> -1 91#define MOVQ_BONE(regd) \ 92 __asm__ volatile ( \ 93 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ 94 "psrlw $15, %%" #regd " \n\t" \ 95 "packuswb %%" #regd ", %%" #regd " \n\t" ::) 96 97#define MOVQ_WTWO(regd) \ 98 __asm__ volatile ( \ 99 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ 100 "psrlw $15, %%" #regd " \n\t" \ 101 "psllw $1, %%" #regd " \n\t"::) 102 103#endif 104 105// using regr as temporary and for the output result 106// first argument is unmodifed and second is trashed 107// regfe is supposed to contain 0xfefefefefefefefe 108#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ 109 "movq " #rega ", " #regr " \n\t"\ 110 "pand " #regb ", " #regr " \n\t"\ 111 "pxor " #rega ", " #regb " \n\t"\ 112 "pand " #regfe "," #regb " \n\t"\ 113 "psrlq $1, " #regb " \n\t"\ 114 "paddb " #regb ", " #regr " \n\t" 115 116#define PAVGB_MMX(rega, regb, regr, regfe) \ 117 "movq " #rega ", " #regr " \n\t"\ 118 "por " #regb ", " #regr " \n\t"\ 119 "pxor " #rega ", " #regb " \n\t"\ 120 "pand " #regfe "," #regb " \n\t"\ 121 "psrlq $1, " #regb " \n\t"\ 122 "psubb " #regb ", " #regr " \n\t" 123 124// mm6 is supposed to contain 0xfefefefefefefefe 125#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ 126 "movq " #rega ", " #regr " \n\t"\ 127 "movq " #regc ", " #regp " \n\t"\ 128 "pand " #regb ", " #regr " \n\t"\ 129 "pand " #regd ", " #regp " \n\t"\ 130 "pxor " #rega ", " #regb " \n\t"\ 131 "pxor " #regc ", " #regd " \n\t"\ 132 "pand %%mm6, " #regb " \n\t"\ 133 "pand %%mm6, " #regd " \n\t"\ 134 "psrlq $1, " #regb " \n\t"\ 135 "psrlq $1, " #regd " \n\t"\ 136 "paddb " #regb ", " #regr " \n\t"\ 137 "paddb " #regd ", " #regp " \n\t" 138 139#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ 140 "movq " #rega ", " #regr " \n\t"\ 141 "movq " #regc ", " #regp " \n\t"\ 142 "por " #regb ", " #regr " \n\t"\ 143 "por " #regd ", " #regp " \n\t"\ 144 "pxor " #rega ", " #regb " \n\t"\ 145 "pxor " #regc ", " #regd " \n\t"\ 146 "pand %%mm6, " #regb " \n\t"\ 147 "pand %%mm6, " #regd " \n\t"\ 148 "psrlq $1, " #regd " \n\t"\ 149 "psrlq $1, " #regb " \n\t"\ 150 "psubb " #regb ", " #regr " \n\t"\ 151 "psubb " #regd ", " #regp " \n\t" 152 153/***********************************/ 154/* MMX no rounding */ 155#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx 156#define SET_RND MOVQ_WONE 157#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) 158#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) 159 160#include "dsputil_mmx_rnd_template.c" 161 162#undef DEF 163#undef SET_RND 164#undef PAVGBP 165#undef PAVGB 166/***********************************/ 167/* MMX rounding */ 168 169#define DEF(x, y) x ## _ ## y ##_mmx 170#define SET_RND MOVQ_WTWO 171#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) 172#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) 173 174#include "dsputil_mmx_rnd_template.c" 175 176#undef DEF 177#undef SET_RND 178#undef PAVGBP 179#undef PAVGB 180 181/***********************************/ 182/* 3Dnow specific */ 183 184#define DEF(x) x ## _3dnow 185#define PAVGB "pavgusb" 186 187#include "dsputil_mmx_avg_template.c" 188 189#undef DEF 190#undef PAVGB 191 192/***********************************/ 193/* MMX2 specific */ 194 195#define DEF(x) x ## _mmx2 196 197/* Introduced only in MMX2 set */ 198#define PAVGB "pavgb" 199 200#include "dsputil_mmx_avg_template.c" 201 202#undef DEF 203#undef PAVGB 204 205#define put_no_rnd_pixels16_mmx put_pixels16_mmx 206#define put_no_rnd_pixels8_mmx put_pixels8_mmx 207#define put_pixels16_mmx2 put_pixels16_mmx 208#define put_pixels8_mmx2 put_pixels8_mmx 209#define put_pixels4_mmx2 put_pixels4_mmx 210#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx 211#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx 212#define put_pixels16_3dnow put_pixels16_mmx 213#define put_pixels8_3dnow put_pixels8_mmx 214#define put_pixels4_3dnow put_pixels4_mmx 215#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx 216#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx 217 218/***********************************/ 219/* standard MMX */ 220 221void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) 222{ 223 const DCTELEM *p; 224 uint8_t *pix; 225 226 /* read the pixels */ 227 p = block; 228 pix = pixels; 229 /* unrolled loop */ 230 __asm__ volatile( 231 "movq %3, %%mm0 \n\t" 232 "movq 8%3, %%mm1 \n\t" 233 "movq 16%3, %%mm2 \n\t" 234 "movq 24%3, %%mm3 \n\t" 235 "movq 32%3, %%mm4 \n\t" 236 "movq 40%3, %%mm5 \n\t" 237 "movq 48%3, %%mm6 \n\t" 238 "movq 56%3, %%mm7 \n\t" 239 "packuswb %%mm1, %%mm0 \n\t" 240 "packuswb %%mm3, %%mm2 \n\t" 241 "packuswb %%mm5, %%mm4 \n\t" 242 "packuswb %%mm7, %%mm6 \n\t" 243 "movq %%mm0, (%0) \n\t" 244 "movq %%mm2, (%0, %1) \n\t" 245 "movq %%mm4, (%0, %1, 2) \n\t" 246 "movq %%mm6, (%0, %2) \n\t" 247 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p) 248 :"memory"); 249 pix += line_size*4; 250 p += 32; 251 252 // if here would be an exact copy of the code above 253 // compiler would generate some very strange code 254 // thus using "r" 255 __asm__ volatile( 256 "movq (%3), %%mm0 \n\t" 257 "movq 8(%3), %%mm1 \n\t" 258 "movq 16(%3), %%mm2 \n\t" 259 "movq 24(%3), %%mm3 \n\t" 260 "movq 32(%3), %%mm4 \n\t" 261 "movq 40(%3), %%mm5 \n\t" 262 "movq 48(%3), %%mm6 \n\t" 263 "movq 56(%3), %%mm7 \n\t" 264 "packuswb %%mm1, %%mm0 \n\t" 265 "packuswb %%mm3, %%mm2 \n\t" 266 "packuswb %%mm5, %%mm4 \n\t" 267 "packuswb %%mm7, %%mm6 \n\t" 268 "movq %%mm0, (%0) \n\t" 269 "movq %%mm2, (%0, %1) \n\t" 270 "movq %%mm4, (%0, %1, 2) \n\t" 271 "movq %%mm6, (%0, %2) \n\t" 272 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p) 273 :"memory"); 274} 275 276static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) = 277 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; 278 279void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) 280{ 281 int i; 282 283 movq_m2r(*vector128, mm1); 284 for (i = 0; i < 8; i++) { 285 movq_m2r(*(block), mm0); 286 packsswb_m2r(*(block + 4), mm0); 287 block += 8; 288 paddb_r2r(mm1, mm0); 289 movq_r2m(mm0, *pixels); 290 pixels += line_size; 291 } 292} 293 294void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) 295{ 296 const DCTELEM *p; 297 uint8_t *pix; 298 int i; 299 300 /* read the pixels */ 301 p = block; 302 pix = pixels; 303 MOVQ_ZERO(mm7); 304 i = 4; 305 do { 306 __asm__ volatile( 307 "movq (%2), %%mm0 \n\t" 308 "movq 8(%2), %%mm1 \n\t" 309 "movq 16(%2), %%mm2 \n\t" 310 "movq 24(%2), %%mm3 \n\t" 311 "movq %0, %%mm4 \n\t" 312 "movq %1, %%mm6 \n\t" 313 "movq %%mm4, %%mm5 \n\t" 314 "punpcklbw %%mm7, %%mm4 \n\t" 315 "punpckhbw %%mm7, %%mm5 \n\t" 316 "paddsw %%mm4, %%mm0 \n\t" 317 "paddsw %%mm5, %%mm1 \n\t" 318 "movq %%mm6, %%mm5 \n\t" 319 "punpcklbw %%mm7, %%mm6 \n\t" 320 "punpckhbw %%mm7, %%mm5 \n\t" 321 "paddsw %%mm6, %%mm2 \n\t" 322 "paddsw %%mm5, %%mm3 \n\t" 323 "packuswb %%mm1, %%mm0 \n\t" 324 "packuswb %%mm3, %%mm2 \n\t" 325 "movq %%mm0, %0 \n\t" 326 "movq %%mm2, %1 \n\t" 327 :"+m"(*pix), "+m"(*(pix+line_size)) 328 :"r"(p) 329 :"memory"); 330 pix += line_size*2; 331 p += 16; 332 } while (--i); 333} 334 335static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) 336{ 337 __asm__ volatile( 338 "lea (%3, %3), %%"REG_a" \n\t" 339 ASMALIGN(3) 340 "1: \n\t" 341 "movd (%1), %%mm0 \n\t" 342 "movd (%1, %3), %%mm1 \n\t" 343 "movd %%mm0, (%2) \n\t" 344 "movd %%mm1, (%2, %3) \n\t" 345 "add %%"REG_a", %1 \n\t" 346 "add %%"REG_a", %2 \n\t" 347 "movd (%1), %%mm0 \n\t" 348 "movd (%1, %3), %%mm1 \n\t" 349 "movd %%mm0, (%2) \n\t" 350 "movd %%mm1, (%2, %3) \n\t" 351 "add %%"REG_a", %1 \n\t" 352 "add %%"REG_a", %2 \n\t" 353 "subl $4, %0 \n\t" 354 "jnz 1b \n\t" 355 : "+g"(h), "+r" (pixels), "+r" (block) 356 : "r"((x86_reg)line_size) 357 : "%"REG_a, "memory" 358 ); 359} 360 361static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) 362{ 363 __asm__ volatile( 364 "lea (%3, %3), %%"REG_a" \n\t" 365 ASMALIGN(3) 366 "1: \n\t" 367 "movq (%1), %%mm0 \n\t" 368 "movq (%1, %3), %%mm1 \n\t" 369 "movq %%mm0, (%2) \n\t" 370 "movq %%mm1, (%2, %3) \n\t" 371 "add %%"REG_a", %1 \n\t" 372 "add %%"REG_a", %2 \n\t" 373 "movq (%1), %%mm0 \n\t" 374 "movq (%1, %3), %%mm1 \n\t" 375 "movq %%mm0, (%2) \n\t" 376 "movq %%mm1, (%2, %3) \n\t" 377 "add %%"REG_a", %1 \n\t" 378 "add %%"REG_a", %2 \n\t" 379 "subl $4, %0 \n\t" 380 "jnz 1b \n\t" 381 : "+g"(h), "+r" (pixels), "+r" (block) 382 : "r"((x86_reg)line_size) 383 : "%"REG_a, "memory" 384 ); 385} 386 387static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) 388{ 389 __asm__ volatile( 390 "lea (%3, %3), %%"REG_a" \n\t" 391 ASMALIGN(3) 392 "1: \n\t" 393 "movq (%1), %%mm0 \n\t" 394 "movq 8(%1), %%mm4 \n\t" 395 "movq (%1, %3), %%mm1 \n\t" 396 "movq 8(%1, %3), %%mm5 \n\t" 397 "movq %%mm0, (%2) \n\t" 398 "movq %%mm4, 8(%2) \n\t" 399 "movq %%mm1, (%2, %3) \n\t" 400 "movq %%mm5, 8(%2, %3) \n\t" 401 "add %%"REG_a", %1 \n\t" 402 "add %%"REG_a", %2 \n\t" 403 "movq (%1), %%mm0 \n\t" 404 "movq 8(%1), %%mm4 \n\t" 405 "movq (%1, %3), %%mm1 \n\t" 406 "movq 8(%1, %3), %%mm5 \n\t" 407 "movq %%mm0, (%2) \n\t" 408 "movq %%mm4, 8(%2) \n\t" 409 "movq %%mm1, (%2, %3) \n\t" 410 "movq %%mm5, 8(%2, %3) \n\t" 411 "add %%"REG_a", %1 \n\t" 412 "add %%"REG_a", %2 \n\t" 413 "subl $4, %0 \n\t" 414 "jnz 1b \n\t" 415 : "+g"(h), "+r" (pixels), "+r" (block) 416 : "r"((x86_reg)line_size) 417 : "%"REG_a, "memory" 418 ); 419} 420 421static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) 422{ 423 __asm__ volatile( 424 "1: \n\t" 425 "movdqu (%1), %%xmm0 \n\t" 426 "movdqu (%1,%3), %%xmm1 \n\t" 427 "movdqu (%1,%3,2), %%xmm2 \n\t" 428 "movdqu (%1,%4), %%xmm3 \n\t" 429 "movdqa %%xmm0, (%2) \n\t" 430 "movdqa %%xmm1, (%2,%3) \n\t" 431 "movdqa %%xmm2, (%2,%3,2) \n\t" 432 "movdqa %%xmm3, (%2,%4) \n\t" 433 "subl $4, %0 \n\t" 434 "lea (%1,%3,4), %1 \n\t" 435 "lea (%2,%3,4), %2 \n\t" 436 "jnz 1b \n\t" 437 : "+g"(h), "+r" (pixels), "+r" (block) 438 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) 439 : "memory" 440 ); 441} 442 443static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) 444{ 445 __asm__ volatile( 446 "1: \n\t" 447 "movdqu (%1), %%xmm0 \n\t" 448 "movdqu (%1,%3), %%xmm1 \n\t" 449 "movdqu (%1,%3,2), %%xmm2 \n\t" 450 "movdqu (%1,%4), %%xmm3 \n\t" 451 "pavgb (%2), %%xmm0 \n\t" 452 "pavgb (%2,%3), %%xmm1 \n\t" 453 "pavgb (%2,%3,2), %%xmm2 \n\t" 454 "pavgb (%2,%4), %%xmm3 \n\t" 455 "movdqa %%xmm0, (%2) \n\t" 456 "movdqa %%xmm1, (%2,%3) \n\t" 457 "movdqa %%xmm2, (%2,%3,2) \n\t" 458 "movdqa %%xmm3, (%2,%4) \n\t" 459 "subl $4, %0 \n\t" 460 "lea (%1,%3,4), %1 \n\t" 461 "lea (%2,%3,4), %2 \n\t" 462 "jnz 1b \n\t" 463 : "+g"(h), "+r" (pixels), "+r" (block) 464 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) 465 : "memory" 466 ); 467} 468 469#define CLEAR_BLOCKS(name,n) \ 470static void name(DCTELEM *blocks)\ 471{\ 472 __asm__ volatile(\ 473 "pxor %%mm7, %%mm7 \n\t"\ 474 "mov %1, %%"REG_a" \n\t"\ 475 "1: \n\t"\ 476 "movq %%mm7, (%0, %%"REG_a") \n\t"\ 477 "movq %%mm7, 8(%0, %%"REG_a") \n\t"\ 478 "movq %%mm7, 16(%0, %%"REG_a") \n\t"\ 479 "movq %%mm7, 24(%0, %%"REG_a") \n\t"\ 480 "add $32, %%"REG_a" \n\t"\ 481 " js 1b \n\t"\ 482 : : "r" (((uint8_t *)blocks)+128*n),\ 483 "i" (-128*n)\ 484 : "%"REG_a\ 485 );\ 486} 487CLEAR_BLOCKS(clear_blocks_mmx, 6) 488CLEAR_BLOCKS(clear_block_mmx, 1) 489 490static void clear_block_sse(DCTELEM *block) 491{ 492 __asm__ volatile( 493 "xorps %%xmm0, %%xmm0 \n" 494 "movaps %%xmm0, (%0) \n" 495 "movaps %%xmm0, 16(%0) \n" 496 "movaps %%xmm0, 32(%0) \n" 497 "movaps %%xmm0, 48(%0) \n" 498 "movaps %%xmm0, 64(%0) \n" 499 "movaps %%xmm0, 80(%0) \n" 500 "movaps %%xmm0, 96(%0) \n" 501 "movaps %%xmm0, 112(%0) \n" 502 :: "r"(block) 503 : "memory" 504 ); 505} 506 507static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ 508 x86_reg i=0; 509 __asm__ volatile( 510 "jmp 2f \n\t" 511 "1: \n\t" 512 "movq (%1, %0), %%mm0 \n\t" 513 "movq (%2, %0), %%mm1 \n\t" 514 "paddb %%mm0, %%mm1 \n\t" 515 "movq %%mm1, (%2, %0) \n\t" 516 "movq 8(%1, %0), %%mm0 \n\t" 517 "movq 8(%2, %0), %%mm1 \n\t" 518 "paddb %%mm0, %%mm1 \n\t" 519 "movq %%mm1, 8(%2, %0) \n\t" 520 "add $16, %0 \n\t" 521 "2: \n\t" 522 "cmp %3, %0 \n\t" 523 " js 1b \n\t" 524 : "+r" (i) 525 : "r"(src), "r"(dst), "r"((x86_reg)w-15) 526 ); 527 for(; i<w; i++) 528 dst[i+0] += src[i+0]; 529} 530 531static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ 532 x86_reg i=0; 533 __asm__ volatile( 534 "jmp 2f \n\t" 535 "1: \n\t" 536 "movq (%2, %0), %%mm0 \n\t" 537 "movq 8(%2, %0), %%mm1 \n\t" 538 "paddb (%3, %0), %%mm0 \n\t" 539 "paddb 8(%3, %0), %%mm1 \n\t" 540 "movq %%mm0, (%1, %0) \n\t" 541 "movq %%mm1, 8(%1, %0) \n\t" 542 "add $16, %0 \n\t" 543 "2: \n\t" 544 "cmp %4, %0 \n\t" 545 " js 1b \n\t" 546 : "+r" (i) 547 : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15) 548 ); 549 for(; i<w; i++) 550 dst[i] = src1[i] + src2[i]; 551} 552 553#if HAVE_7REGS && HAVE_TEN_OPERANDS 554static void add_hfyu_median_prediction_cmov(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top) { 555 x86_reg w2 = -w; 556 x86_reg x; 557 int l = *left & 0xff; 558 int tl = *left_top & 0xff; 559 int t; 560 __asm__ volatile( 561 "mov %7, %3 \n" 562 "1: \n" 563 "movzx (%3,%4), %2 \n" 564 "mov %2, %k3 \n" 565 "sub %b1, %b3 \n" 566 "add %b0, %b3 \n" 567 "mov %2, %1 \n" 568 "cmp %0, %2 \n" 569 "cmovg %0, %2 \n" 570 "cmovg %1, %0 \n" 571 "cmp %k3, %0 \n" 572 "cmovg %k3, %0 \n" 573 "mov %7, %3 \n" 574 "cmp %2, %0 \n" 575 "cmovl %2, %0 \n" 576 "add (%6,%4), %b0 \n" 577 "mov %b0, (%5,%4) \n" 578 "inc %4 \n" 579 "jl 1b \n" 580 :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2) 581 :"r"(dst+w), "r"(diff+w), "rm"(top+w) 582 ); 583 *left = l; 584 *left_top = tl; 585} 586#endif 587 588#define H263_LOOP_FILTER \ 589 "pxor %%mm7, %%mm7 \n\t"\ 590 "movq %0, %%mm0 \n\t"\ 591 "movq %0, %%mm1 \n\t"\ 592 "movq %3, %%mm2 \n\t"\ 593 "movq %3, %%mm3 \n\t"\ 594 "punpcklbw %%mm7, %%mm0 \n\t"\ 595 "punpckhbw %%mm7, %%mm1 \n\t"\ 596 "punpcklbw %%mm7, %%mm2 \n\t"\ 597 "punpckhbw %%mm7, %%mm3 \n\t"\ 598 "psubw %%mm2, %%mm0 \n\t"\ 599 "psubw %%mm3, %%mm1 \n\t"\ 600 "movq %1, %%mm2 \n\t"\ 601 "movq %1, %%mm3 \n\t"\ 602 "movq %2, %%mm4 \n\t"\ 603 "movq %2, %%mm5 \n\t"\ 604 "punpcklbw %%mm7, %%mm2 \n\t"\ 605 "punpckhbw %%mm7, %%mm3 \n\t"\ 606 "punpcklbw %%mm7, %%mm4 \n\t"\ 607 "punpckhbw %%mm7, %%mm5 \n\t"\ 608 "psubw %%mm2, %%mm4 \n\t"\ 609 "psubw %%mm3, %%mm5 \n\t"\ 610 "psllw $2, %%mm4 \n\t"\ 611 "psllw $2, %%mm5 \n\t"\ 612 "paddw %%mm0, %%mm4 \n\t"\ 613 "paddw %%mm1, %%mm5 \n\t"\ 614 "pxor %%mm6, %%mm6 \n\t"\ 615 "pcmpgtw %%mm4, %%mm6 \n\t"\ 616 "pcmpgtw %%mm5, %%mm7 \n\t"\ 617 "pxor %%mm6, %%mm4 \n\t"\ 618 "pxor %%mm7, %%mm5 \n\t"\ 619 "psubw %%mm6, %%mm4 \n\t"\ 620 "psubw %%mm7, %%mm5 \n\t"\ 621 "psrlw $3, %%mm4 \n\t"\ 622 "psrlw $3, %%mm5 \n\t"\ 623 "packuswb %%mm5, %%mm4 \n\t"\ 624 "packsswb %%mm7, %%mm6 \n\t"\ 625 "pxor %%mm7, %%mm7 \n\t"\ 626 "movd %4, %%mm2 \n\t"\ 627 "punpcklbw %%mm2, %%mm2 \n\t"\ 628 "punpcklbw %%mm2, %%mm2 \n\t"\ 629 "punpcklbw %%mm2, %%mm2 \n\t"\ 630 "psubusb %%mm4, %%mm2 \n\t"\ 631 "movq %%mm2, %%mm3 \n\t"\ 632 "psubusb %%mm4, %%mm3 \n\t"\ 633 "psubb %%mm3, %%mm2 \n\t"\ 634 "movq %1, %%mm3 \n\t"\ 635 "movq %2, %%mm4 \n\t"\ 636 "pxor %%mm6, %%mm3 \n\t"\ 637 "pxor %%mm6, %%mm4 \n\t"\ 638 "paddusb %%mm2, %%mm3 \n\t"\ 639 "psubusb %%mm2, %%mm4 \n\t"\ 640 "pxor %%mm6, %%mm3 \n\t"\ 641 "pxor %%mm6, %%mm4 \n\t"\ 642 "paddusb %%mm2, %%mm2 \n\t"\ 643 "packsswb %%mm1, %%mm0 \n\t"\ 644 "pcmpgtb %%mm0, %%mm7 \n\t"\ 645 "pxor %%mm7, %%mm0 \n\t"\ 646 "psubb %%mm7, %%mm0 \n\t"\ 647 "movq %%mm0, %%mm1 \n\t"\ 648 "psubusb %%mm2, %%mm0 \n\t"\ 649 "psubb %%mm0, %%mm1 \n\t"\ 650 "pand %5, %%mm1 \n\t"\ 651 "psrlw $2, %%mm1 \n\t"\ 652 "pxor %%mm7, %%mm1 \n\t"\ 653 "psubb %%mm7, %%mm1 \n\t"\ 654 "movq %0, %%mm5 \n\t"\ 655 "movq %3, %%mm6 \n\t"\ 656 "psubb %%mm1, %%mm5 \n\t"\ 657 "paddb %%mm1, %%mm6 \n\t" 658 659static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){ 660 if(CONFIG_ANY_H263) { 661 const int strength= ff_h263_loop_filter_strength[qscale]; 662 663 __asm__ volatile( 664 665 H263_LOOP_FILTER 666 667 "movq %%mm3, %1 \n\t" 668 "movq %%mm4, %2 \n\t" 669 "movq %%mm5, %0 \n\t" 670 "movq %%mm6, %3 \n\t" 671 : "+m" (*(uint64_t*)(src - 2*stride)), 672 "+m" (*(uint64_t*)(src - 1*stride)), 673 "+m" (*(uint64_t*)(src + 0*stride)), 674 "+m" (*(uint64_t*)(src + 1*stride)) 675 : "g" (2*strength), "m"(ff_pb_FC) 676 ); 677 } 678} 679 680static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ 681 __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ... 682 "movd %4, %%mm0 \n\t" 683 "movd %5, %%mm1 \n\t" 684 "movd %6, %%mm2 \n\t" 685 "movd %7, %%mm3 \n\t" 686 "punpcklbw %%mm1, %%mm0 \n\t" 687 "punpcklbw %%mm3, %%mm2 \n\t" 688 "movq %%mm0, %%mm1 \n\t" 689 "punpcklwd %%mm2, %%mm0 \n\t" 690 "punpckhwd %%mm2, %%mm1 \n\t" 691 "movd %%mm0, %0 \n\t" 692 "punpckhdq %%mm0, %%mm0 \n\t" 693 "movd %%mm0, %1 \n\t" 694 "movd %%mm1, %2 \n\t" 695 "punpckhdq %%mm1, %%mm1 \n\t" 696 "movd %%mm1, %3 \n\t" 697 698 : "=m" (*(uint32_t*)(dst + 0*dst_stride)), 699 "=m" (*(uint32_t*)(dst + 1*dst_stride)), 700 "=m" (*(uint32_t*)(dst + 2*dst_stride)), 701 "=m" (*(uint32_t*)(dst + 3*dst_stride)) 702 : "m" (*(uint32_t*)(src + 0*src_stride)), 703 "m" (*(uint32_t*)(src + 1*src_stride)), 704 "m" (*(uint32_t*)(src + 2*src_stride)), 705 "m" (*(uint32_t*)(src + 3*src_stride)) 706 ); 707} 708 709static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){ 710 if(CONFIG_ANY_H263) { 711 const int strength= ff_h263_loop_filter_strength[qscale]; 712 DECLARE_ALIGNED(8, uint64_t, temp[4]); 713 uint8_t *btemp= (uint8_t*)temp; 714 715 src -= 2; 716 717 transpose4x4(btemp , src , 8, stride); 718 transpose4x4(btemp+4, src + 4*stride, 8, stride); 719 __asm__ volatile( 720 H263_LOOP_FILTER // 5 3 4 6 721 722 : "+m" (temp[0]), 723 "+m" (temp[1]), 724 "+m" (temp[2]), 725 "+m" (temp[3]) 726 : "g" (2*strength), "m"(ff_pb_FC) 727 ); 728 729 __asm__ volatile( 730 "movq %%mm5, %%mm1 \n\t" 731 "movq %%mm4, %%mm0 \n\t" 732 "punpcklbw %%mm3, %%mm5 \n\t" 733 "punpcklbw %%mm6, %%mm4 \n\t" 734 "punpckhbw %%mm3, %%mm1 \n\t" 735 "punpckhbw %%mm6, %%mm0 \n\t" 736 "movq %%mm5, %%mm3 \n\t" 737 "movq %%mm1, %%mm6 \n\t" 738 "punpcklwd %%mm4, %%mm5 \n\t" 739 "punpcklwd %%mm0, %%mm1 \n\t" 740 "punpckhwd %%mm4, %%mm3 \n\t" 741 "punpckhwd %%mm0, %%mm6 \n\t" 742 "movd %%mm5, (%0) \n\t" 743 "punpckhdq %%mm5, %%mm5 \n\t" 744 "movd %%mm5, (%0,%2) \n\t" 745 "movd %%mm3, (%0,%2,2) \n\t" 746 "punpckhdq %%mm3, %%mm3 \n\t" 747 "movd %%mm3, (%0,%3) \n\t" 748 "movd %%mm1, (%1) \n\t" 749 "punpckhdq %%mm1, %%mm1 \n\t" 750 "movd %%mm1, (%1,%2) \n\t" 751 "movd %%mm6, (%1,%2,2) \n\t" 752 "punpckhdq %%mm6, %%mm6 \n\t" 753 "movd %%mm6, (%1,%3) \n\t" 754 :: "r" (src), 755 "r" (src + 4*stride), 756 "r" ((x86_reg) stride ), 757 "r" ((x86_reg)(3*stride)) 758 ); 759 } 760} 761 762/* draw the edges of width 'w' of an image of size width, height 763 this mmx version can only handle w==8 || w==16 */ 764static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) 765{ 766 uint8_t *ptr, *last_line; 767 int i; 768 769 last_line = buf + (height - 1) * wrap; 770 /* left and right */ 771 ptr = buf; 772 if(w==8) 773 { 774 __asm__ volatile( 775 "1: \n\t" 776 "movd (%0), %%mm0 \n\t" 777 "punpcklbw %%mm0, %%mm0 \n\t" 778 "punpcklwd %%mm0, %%mm0 \n\t" 779 "punpckldq %%mm0, %%mm0 \n\t" 780 "movq %%mm0, -8(%0) \n\t" 781 "movq -8(%0, %2), %%mm1 \n\t" 782 "punpckhbw %%mm1, %%mm1 \n\t" 783 "punpckhwd %%mm1, %%mm1 \n\t" 784 "punpckhdq %%mm1, %%mm1 \n\t" 785 "movq %%mm1, (%0, %2) \n\t" 786 "add %1, %0 \n\t" 787 "cmp %3, %0 \n\t" 788 " jb 1b \n\t" 789 : "+r" (ptr) 790 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) 791 ); 792 } 793 else 794 { 795 __asm__ volatile( 796 "1: \n\t" 797 "movd (%0), %%mm0 \n\t" 798 "punpcklbw %%mm0, %%mm0 \n\t" 799 "punpcklwd %%mm0, %%mm0 \n\t" 800 "punpckldq %%mm0, %%mm0 \n\t" 801 "movq %%mm0, -8(%0) \n\t" 802 "movq %%mm0, -16(%0) \n\t" 803 "movq -8(%0, %2), %%mm1 \n\t" 804 "punpckhbw %%mm1, %%mm1 \n\t" 805 "punpckhwd %%mm1, %%mm1 \n\t" 806 "punpckhdq %%mm1, %%mm1 \n\t" 807 "movq %%mm1, (%0, %2) \n\t" 808 "movq %%mm1, 8(%0, %2) \n\t" 809 "add %1, %0 \n\t" 810 "cmp %3, %0 \n\t" 811 " jb 1b \n\t" 812 : "+r" (ptr) 813 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) 814 ); 815 } 816 817 for(i=0;i<w;i+=4) { 818 /* top and bottom (and hopefully also the corners) */ 819 ptr= buf - (i + 1) * wrap - w; 820 __asm__ volatile( 821 "1: \n\t" 822 "movq (%1, %0), %%mm0 \n\t" 823 "movq %%mm0, (%0) \n\t" 824 "movq %%mm0, (%0, %2) \n\t" 825 "movq %%mm0, (%0, %2, 2) \n\t" 826 "movq %%mm0, (%0, %3) \n\t" 827 "add $8, %0 \n\t" 828 "cmp %4, %0 \n\t" 829 " jb 1b \n\t" 830 : "+r" (ptr) 831 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w) 832 ); 833 ptr= last_line + (i + 1) * wrap - w; 834 __asm__ volatile( 835 "1: \n\t" 836 "movq (%1, %0), %%mm0 \n\t" 837 "movq %%mm0, (%0) \n\t" 838 "movq %%mm0, (%0, %2) \n\t" 839 "movq %%mm0, (%0, %2, 2) \n\t" 840 "movq %%mm0, (%0, %3) \n\t" 841 "add $8, %0 \n\t" 842 "cmp %4, %0 \n\t" 843 " jb 1b \n\t" 844 : "+r" (ptr) 845 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w) 846 ); 847 } 848} 849 850#define PAETH(cpu, abs3)\ 851static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\ 852{\ 853 x86_reg i = -bpp;\ 854 x86_reg end = w-3;\ 855 __asm__ volatile(\ 856 "pxor %%mm7, %%mm7 \n"\ 857 "movd (%1,%0), %%mm0 \n"\ 858 "movd (%2,%0), %%mm1 \n"\ 859 "punpcklbw %%mm7, %%mm0 \n"\ 860 "punpcklbw %%mm7, %%mm1 \n"\ 861 "add %4, %0 \n"\ 862 "1: \n"\ 863 "movq %%mm1, %%mm2 \n"\ 864 "movd (%2,%0), %%mm1 \n"\ 865 "movq %%mm2, %%mm3 \n"\ 866 "punpcklbw %%mm7, %%mm1 \n"\ 867 "movq %%mm2, %%mm4 \n"\ 868 "psubw %%mm1, %%mm3 \n"\ 869 "psubw %%mm0, %%mm4 \n"\ 870 "movq %%mm3, %%mm5 \n"\ 871 "paddw %%mm4, %%mm5 \n"\ 872 abs3\ 873 "movq %%mm4, %%mm6 \n"\ 874 "pminsw %%mm5, %%mm6 \n"\ 875 "pcmpgtw %%mm6, %%mm3 \n"\ 876 "pcmpgtw %%mm5, %%mm4 \n"\ 877 "movq %%mm4, %%mm6 \n"\ 878 "pand %%mm3, %%mm4 \n"\ 879 "pandn %%mm3, %%mm6 \n"\ 880 "pandn %%mm0, %%mm3 \n"\ 881 "movd (%3,%0), %%mm0 \n"\ 882 "pand %%mm1, %%mm6 \n"\ 883 "pand %%mm4, %%mm2 \n"\ 884 "punpcklbw %%mm7, %%mm0 \n"\ 885 "movq %6, %%mm5 \n"\ 886 "paddw %%mm6, %%mm0 \n"\ 887 "paddw %%mm2, %%mm3 \n"\ 888 "paddw %%mm3, %%mm0 \n"\ 889 "pand %%mm5, %%mm0 \n"\ 890 "movq %%mm0, %%mm3 \n"\ 891 "packuswb %%mm3, %%mm3 \n"\ 892 "movd %%mm3, (%1,%0) \n"\ 893 "add %4, %0 \n"\ 894 "cmp %5, %0 \n"\ 895 "jle 1b \n"\ 896 :"+r"(i)\ 897 :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\ 898 "m"(ff_pw_255)\ 899 :"memory"\ 900 );\ 901} 902 903#define ABS3_MMX2\ 904 "psubw %%mm5, %%mm7 \n"\ 905 "pmaxsw %%mm7, %%mm5 \n"\ 906 "pxor %%mm6, %%mm6 \n"\ 907 "pxor %%mm7, %%mm7 \n"\ 908 "psubw %%mm3, %%mm6 \n"\ 909 "psubw %%mm4, %%mm7 \n"\ 910 "pmaxsw %%mm6, %%mm3 \n"\ 911 "pmaxsw %%mm7, %%mm4 \n"\ 912 "pxor %%mm7, %%mm7 \n" 913 914#define ABS3_SSSE3\ 915 "pabsw %%mm3, %%mm3 \n"\ 916 "pabsw %%mm4, %%mm4 \n"\ 917 "pabsw %%mm5, %%mm5 \n" 918 919PAETH(mmx2, ABS3_MMX2) 920#if HAVE_SSSE3 921PAETH(ssse3, ABS3_SSSE3) 922#endif 923 924#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ 925 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ 926 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ 927 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ 928 "movq "#in7", " #m3 " \n\t" /* d */\ 929 "movq "#in0", %%mm5 \n\t" /* D */\ 930 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\ 931 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\ 932 "movq "#in1", %%mm5 \n\t" /* C */\ 933 "movq "#in2", %%mm6 \n\t" /* B */\ 934 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\ 935 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ 936 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ 937 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ 938 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ 939 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\ 940 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ 941 "psraw $5, %%mm5 \n\t"\ 942 "packuswb %%mm5, %%mm5 \n\t"\ 943 OP(%%mm5, out, %%mm7, d) 944 945#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\ 946static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 947 uint64_t temp;\ 948\ 949 __asm__ volatile(\ 950 "pxor %%mm7, %%mm7 \n\t"\ 951 "1: \n\t"\ 952 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ 953 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ 954 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ 955 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ 956 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ 957 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ 958 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ 959 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ 960 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ 961 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ 962 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ 963 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ 964 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ 965 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ 966 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ 967 "paddw %%mm3, %%mm5 \n\t" /* b */\ 968 "paddw %%mm2, %%mm6 \n\t" /* c */\ 969 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ 970 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ 971 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ 972 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ 973 "paddw %%mm4, %%mm0 \n\t" /* a */\ 974 "paddw %%mm1, %%mm5 \n\t" /* d */\ 975 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ 976 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ 977 "paddw %6, %%mm6 \n\t"\ 978 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ 979 "psraw $5, %%mm0 \n\t"\ 980 "movq %%mm0, %5 \n\t"\ 981 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ 982 \ 983 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ 984 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\ 985 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\ 986 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\ 987 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\ 988 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\ 989 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\ 990 "paddw %%mm0, %%mm2 \n\t" /* b */\ 991 "paddw %%mm5, %%mm3 \n\t" /* c */\ 992 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ 993 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ 994 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\ 995 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ 996 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ 997 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ 998 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ 999 "paddw %%mm2, %%mm1 \n\t" /* a */\ 1000 "paddw %%mm6, %%mm4 \n\t" /* d */\ 1001 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ 1002 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ 1003 "paddw %6, %%mm1 \n\t"\ 1004 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ 1005 "psraw $5, %%mm3 \n\t"\ 1006 "movq %5, %%mm1 \n\t"\ 1007 "packuswb %%mm3, %%mm1 \n\t"\ 1008 OP_MMX2(%%mm1, (%1),%%mm4, q)\ 1009 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ 1010 \ 1011 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ 1012 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ 1013 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ 1014 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\ 1015 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\ 1016 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\ 1017 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\ 1018 "paddw %%mm1, %%mm5 \n\t" /* b */\ 1019 "paddw %%mm4, %%mm0 \n\t" /* c */\ 1020 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ 1021 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ 1022 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ 1023 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ 1024 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ 1025 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ 1026 "paddw %%mm3, %%mm2 \n\t" /* d */\ 1027 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ 1028 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\ 1029 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ 1030 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ 1031 "paddw %%mm2, %%mm6 \n\t" /* a */\ 1032 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ 1033 "paddw %6, %%mm0 \n\t"\ 1034 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ 1035 "psraw $5, %%mm0 \n\t"\ 1036 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ 1037 \ 1038 "paddw %%mm5, %%mm3 \n\t" /* a */\ 1039 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\ 1040 "paddw %%mm4, %%mm6 \n\t" /* b */\ 1041 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\ 1042 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\ 1043 "paddw %%mm1, %%mm4 \n\t" /* c */\ 1044 "paddw %%mm2, %%mm5 \n\t" /* d */\ 1045 "paddw %%mm6, %%mm6 \n\t" /* 2b */\ 1046 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ 1047 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ 1048 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ 1049 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ 1050 "paddw %6, %%mm4 \n\t"\ 1051 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ 1052 "psraw $5, %%mm4 \n\t"\ 1053 "packuswb %%mm4, %%mm0 \n\t"\ 1054 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ 1055 \ 1056 "add %3, %0 \n\t"\ 1057 "add %4, %1 \n\t"\ 1058 "decl %2 \n\t"\ 1059 " jnz 1b \n\t"\ 1060 : "+a"(src), "+c"(dst), "+D"(h)\ 1061 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ 1062 : "memory"\ 1063 );\ 1064}\ 1065\ 1066static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1067 int i;\ 1068 int16_t temp[16];\ 1069 /* quick HACK, XXX FIXME MUST be optimized */\ 1070 for(i=0; i<h; i++)\ 1071 {\ 1072 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ 1073 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ 1074 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ 1075 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ 1076 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ 1077 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\ 1078 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\ 1079 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\ 1080 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\ 1081 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\ 1082 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\ 1083 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\ 1084 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\ 1085 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\ 1086 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\ 1087 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\ 1088 __asm__ volatile(\ 1089 "movq (%0), %%mm0 \n\t"\ 1090 "movq 8(%0), %%mm1 \n\t"\ 1091 "paddw %2, %%mm0 \n\t"\ 1092 "paddw %2, %%mm1 \n\t"\ 1093 "psraw $5, %%mm0 \n\t"\ 1094 "psraw $5, %%mm1 \n\t"\ 1095 "packuswb %%mm1, %%mm0 \n\t"\ 1096 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ 1097 "movq 16(%0), %%mm0 \n\t"\ 1098 "movq 24(%0), %%mm1 \n\t"\ 1099 "paddw %2, %%mm0 \n\t"\ 1100 "paddw %2, %%mm1 \n\t"\ 1101 "psraw $5, %%mm0 \n\t"\ 1102 "psraw $5, %%mm1 \n\t"\ 1103 "packuswb %%mm1, %%mm0 \n\t"\ 1104 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\ 1105 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ 1106 : "memory"\ 1107 );\ 1108 dst+=dstStride;\ 1109 src+=srcStride;\ 1110 }\ 1111}\ 1112\ 1113static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1114 __asm__ volatile(\ 1115 "pxor %%mm7, %%mm7 \n\t"\ 1116 "1: \n\t"\ 1117 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ 1118 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ 1119 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ 1120 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ 1121 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ 1122 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ 1123 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ 1124 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ 1125 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ 1126 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ 1127 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ 1128 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ 1129 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ 1130 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ 1131 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ 1132 "paddw %%mm3, %%mm5 \n\t" /* b */\ 1133 "paddw %%mm2, %%mm6 \n\t" /* c */\ 1134 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ 1135 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ 1136 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ 1137 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ 1138 "paddw %%mm4, %%mm0 \n\t" /* a */\ 1139 "paddw %%mm1, %%mm5 \n\t" /* d */\ 1140 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ 1141 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ 1142 "paddw %5, %%mm6 \n\t"\ 1143 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ 1144 "psraw $5, %%mm0 \n\t"\ 1145 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ 1146 \ 1147 "movd 5(%0), %%mm5 \n\t" /* FGHI */\ 1148 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\ 1149 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\ 1150 "paddw %%mm5, %%mm1 \n\t" /* a */\ 1151 "paddw %%mm6, %%mm2 \n\t" /* b */\ 1152 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\ 1153 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\ 1154 "paddw %%mm6, %%mm3 \n\t" /* c */\ 1155 "paddw %%mm5, %%mm4 \n\t" /* d */\ 1156 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ 1157 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ 1158 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ 1159 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ 1160 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ 1161 "paddw %5, %%mm1 \n\t"\ 1162 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ 1163 "psraw $5, %%mm3 \n\t"\ 1164 "packuswb %%mm3, %%mm0 \n\t"\ 1165 OP_MMX2(%%mm0, (%1), %%mm4, q)\ 1166 \ 1167 "add %3, %0 \n\t"\ 1168 "add %4, %1 \n\t"\ 1169 "decl %2 \n\t"\ 1170 " jnz 1b \n\t"\ 1171 : "+a"(src), "+c"(dst), "+d"(h)\ 1172 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\ 1173 : "memory"\ 1174 );\ 1175}\ 1176\ 1177static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1178 int i;\ 1179 int16_t temp[8];\ 1180 /* quick HACK, XXX FIXME MUST be optimized */\ 1181 for(i=0; i<h; i++)\ 1182 {\ 1183 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ 1184 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ 1185 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ 1186 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ 1187 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ 1188 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\ 1189 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\ 1190 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\ 1191 __asm__ volatile(\ 1192 "movq (%0), %%mm0 \n\t"\ 1193 "movq 8(%0), %%mm1 \n\t"\ 1194 "paddw %2, %%mm0 \n\t"\ 1195 "paddw %2, %%mm1 \n\t"\ 1196 "psraw $5, %%mm0 \n\t"\ 1197 "psraw $5, %%mm1 \n\t"\ 1198 "packuswb %%mm1, %%mm0 \n\t"\ 1199 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ 1200 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ 1201 :"memory"\ 1202 );\ 1203 dst+=dstStride;\ 1204 src+=srcStride;\ 1205 }\ 1206} 1207 1208#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ 1209\ 1210static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1211 uint64_t temp[17*4];\ 1212 uint64_t *temp_ptr= temp;\ 1213 int count= 17;\ 1214\ 1215 /*FIXME unroll */\ 1216 __asm__ volatile(\ 1217 "pxor %%mm7, %%mm7 \n\t"\ 1218 "1: \n\t"\ 1219 "movq (%0), %%mm0 \n\t"\ 1220 "movq (%0), %%mm1 \n\t"\ 1221 "movq 8(%0), %%mm2 \n\t"\ 1222 "movq 8(%0), %%mm3 \n\t"\ 1223 "punpcklbw %%mm7, %%mm0 \n\t"\ 1224 "punpckhbw %%mm7, %%mm1 \n\t"\ 1225 "punpcklbw %%mm7, %%mm2 \n\t"\ 1226 "punpckhbw %%mm7, %%mm3 \n\t"\ 1227 "movq %%mm0, (%1) \n\t"\ 1228 "movq %%mm1, 17*8(%1) \n\t"\ 1229 "movq %%mm2, 2*17*8(%1) \n\t"\ 1230 "movq %%mm3, 3*17*8(%1) \n\t"\ 1231 "add $8, %1 \n\t"\ 1232 "add %3, %0 \n\t"\ 1233 "decl %2 \n\t"\ 1234 " jnz 1b \n\t"\ 1235 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ 1236 : "r" ((x86_reg)srcStride)\ 1237 : "memory"\ 1238 );\ 1239 \ 1240 temp_ptr= temp;\ 1241 count=4;\ 1242 \ 1243/*FIXME reorder for speed */\ 1244 __asm__ volatile(\ 1245 /*"pxor %%mm7, %%mm7 \n\t"*/\ 1246 "1: \n\t"\ 1247 "movq (%0), %%mm0 \n\t"\ 1248 "movq 8(%0), %%mm1 \n\t"\ 1249 "movq 16(%0), %%mm2 \n\t"\ 1250 "movq 24(%0), %%mm3 \n\t"\ 1251 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ 1252 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ 1253 "add %4, %1 \n\t"\ 1254 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ 1255 \ 1256 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ 1257 "add %4, %1 \n\t"\ 1258 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ 1259 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ 1260 "add %4, %1 \n\t"\ 1261 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ 1262 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ 1263 "add %4, %1 \n\t"\ 1264 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ 1265 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ 1266 "add %4, %1 \n\t"\ 1267 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ 1268 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ 1269 "add %4, %1 \n\t"\ 1270 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ 1271 \ 1272 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ 1273 "add %4, %1 \n\t" \ 1274 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ 1275 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ 1276 \ 1277 "add $136, %0 \n\t"\ 1278 "add %6, %1 \n\t"\ 1279 "decl %2 \n\t"\ 1280 " jnz 1b \n\t"\ 1281 \ 1282 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ 1283 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\ 1284 :"memory"\ 1285 );\ 1286}\ 1287\ 1288static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1289 uint64_t temp[9*2];\ 1290 uint64_t *temp_ptr= temp;\ 1291 int count= 9;\ 1292\ 1293 /*FIXME unroll */\ 1294 __asm__ volatile(\ 1295 "pxor %%mm7, %%mm7 \n\t"\ 1296 "1: \n\t"\ 1297 "movq (%0), %%mm0 \n\t"\ 1298 "movq (%0), %%mm1 \n\t"\ 1299 "punpcklbw %%mm7, %%mm0 \n\t"\ 1300 "punpckhbw %%mm7, %%mm1 \n\t"\ 1301 "movq %%mm0, (%1) \n\t"\ 1302 "movq %%mm1, 9*8(%1) \n\t"\ 1303 "add $8, %1 \n\t"\ 1304 "add %3, %0 \n\t"\ 1305 "decl %2 \n\t"\ 1306 " jnz 1b \n\t"\ 1307 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ 1308 : "r" ((x86_reg)srcStride)\ 1309 : "memory"\ 1310 );\ 1311 \ 1312 temp_ptr= temp;\ 1313 count=2;\ 1314 \ 1315/*FIXME reorder for speed */\ 1316 __asm__ volatile(\ 1317 /*"pxor %%mm7, %%mm7 \n\t"*/\ 1318 "1: \n\t"\ 1319 "movq (%0), %%mm0 \n\t"\ 1320 "movq 8(%0), %%mm1 \n\t"\ 1321 "movq 16(%0), %%mm2 \n\t"\ 1322 "movq 24(%0), %%mm3 \n\t"\ 1323 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ 1324 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ 1325 "add %4, %1 \n\t"\ 1326 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ 1327 \ 1328 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ 1329 "add %4, %1 \n\t"\ 1330 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ 1331 \ 1332 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ 1333 "add %4, %1 \n\t"\ 1334 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ 1335 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ 1336 \ 1337 "add $72, %0 \n\t"\ 1338 "add %6, %1 \n\t"\ 1339 "decl %2 \n\t"\ 1340 " jnz 1b \n\t"\ 1341 \ 1342 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ 1343 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\ 1344 : "memory"\ 1345 );\ 1346}\ 1347\ 1348static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ 1349 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\ 1350}\ 1351\ 1352static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1353 uint64_t temp[8];\ 1354 uint8_t * const half= (uint8_t*)temp;\ 1355 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ 1356 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ 1357}\ 1358\ 1359static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1360 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ 1361}\ 1362\ 1363static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1364 uint64_t temp[8];\ 1365 uint8_t * const half= (uint8_t*)temp;\ 1366 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ 1367 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\ 1368}\ 1369\ 1370static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1371 uint64_t temp[8];\ 1372 uint8_t * const half= (uint8_t*)temp;\ 1373 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ 1374 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ 1375}\ 1376\ 1377static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1378 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ 1379}\ 1380\ 1381static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1382 uint64_t temp[8];\ 1383 uint8_t * const half= (uint8_t*)temp;\ 1384 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ 1385 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\ 1386}\ 1387static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1388 uint64_t half[8 + 9];\ 1389 uint8_t * const halfH= ((uint8_t*)half) + 64;\ 1390 uint8_t * const halfHV= ((uint8_t*)half);\ 1391 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1392 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ 1393 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ 1394 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ 1395}\ 1396static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1397 uint64_t half[8 + 9];\ 1398 uint8_t * const halfH= ((uint8_t*)half) + 64;\ 1399 uint8_t * const halfHV= ((uint8_t*)half);\ 1400 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1401 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ 1402 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ 1403 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ 1404}\ 1405static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1406 uint64_t half[8 + 9];\ 1407 uint8_t * const halfH= ((uint8_t*)half) + 64;\ 1408 uint8_t * const halfHV= ((uint8_t*)half);\ 1409 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1410 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ 1411 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ 1412 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ 1413}\ 1414static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1415 uint64_t half[8 + 9];\ 1416 uint8_t * const halfH= ((uint8_t*)half) + 64;\ 1417 uint8_t * const halfHV= ((uint8_t*)half);\ 1418 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1419 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ 1420 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ 1421 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ 1422}\ 1423static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1424 uint64_t half[8 + 9];\ 1425 uint8_t * const halfH= ((uint8_t*)half) + 64;\ 1426 uint8_t * const halfHV= ((uint8_t*)half);\ 1427 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1428 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ 1429 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ 1430}\ 1431static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1432 uint64_t half[8 + 9];\ 1433 uint8_t * const halfH= ((uint8_t*)half) + 64;\ 1434 uint8_t * const halfHV= ((uint8_t*)half);\ 1435 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1436 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ 1437 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ 1438}\ 1439static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1440 uint64_t half[8 + 9];\ 1441 uint8_t * const halfH= ((uint8_t*)half);\ 1442 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1443 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ 1444 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ 1445}\ 1446static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1447 uint64_t half[8 + 9];\ 1448 uint8_t * const halfH= ((uint8_t*)half);\ 1449 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1450 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ 1451 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ 1452}\ 1453static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1454 uint64_t half[9];\ 1455 uint8_t * const halfH= ((uint8_t*)half);\ 1456 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 1457 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ 1458}\ 1459static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ 1460 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\ 1461}\ 1462\ 1463static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1464 uint64_t temp[32];\ 1465 uint8_t * const half= (uint8_t*)temp;\ 1466 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ 1467 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ 1468}\ 1469\ 1470static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1471 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ 1472}\ 1473\ 1474static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1475 uint64_t temp[32];\ 1476 uint8_t * const half= (uint8_t*)temp;\ 1477 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ 1478 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\ 1479}\ 1480\ 1481static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1482 uint64_t temp[32];\ 1483 uint8_t * const half= (uint8_t*)temp;\ 1484 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ 1485 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ 1486}\ 1487\ 1488static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1489 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ 1490}\ 1491\ 1492static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1493 uint64_t temp[32];\ 1494 uint8_t * const half= (uint8_t*)temp;\ 1495 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ 1496 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\ 1497}\ 1498static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1499 uint64_t half[16*2 + 17*2];\ 1500 uint8_t * const halfH= ((uint8_t*)half) + 256;\ 1501 uint8_t * const halfHV= ((uint8_t*)half);\ 1502 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1503 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ 1504 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ 1505 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ 1506}\ 1507static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1508 uint64_t half[16*2 + 17*2];\ 1509 uint8_t * const halfH= ((uint8_t*)half) + 256;\ 1510 uint8_t * const halfHV= ((uint8_t*)half);\ 1511 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1512 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ 1513 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ 1514 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ 1515}\ 1516static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1517 uint64_t half[16*2 + 17*2];\ 1518 uint8_t * const halfH= ((uint8_t*)half) + 256;\ 1519 uint8_t * const halfHV= ((uint8_t*)half);\ 1520 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1521 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ 1522 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ 1523 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ 1524}\ 1525static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1526 uint64_t half[16*2 + 17*2];\ 1527 uint8_t * const halfH= ((uint8_t*)half) + 256;\ 1528 uint8_t * const halfHV= ((uint8_t*)half);\ 1529 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1530 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ 1531 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ 1532 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ 1533}\ 1534static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1535 uint64_t half[16*2 + 17*2];\ 1536 uint8_t * const halfH= ((uint8_t*)half) + 256;\ 1537 uint8_t * const halfHV= ((uint8_t*)half);\ 1538 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1539 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ 1540 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ 1541}\ 1542static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1543 uint64_t half[16*2 + 17*2];\ 1544 uint8_t * const halfH= ((uint8_t*)half) + 256;\ 1545 uint8_t * const halfHV= ((uint8_t*)half);\ 1546 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1547 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ 1548 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ 1549}\ 1550static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1551 uint64_t half[17*2];\ 1552 uint8_t * const halfH= ((uint8_t*)half);\ 1553 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1554 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ 1555 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ 1556}\ 1557static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1558 uint64_t half[17*2];\ 1559 uint8_t * const halfH= ((uint8_t*)half);\ 1560 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1561 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ 1562 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ 1563}\ 1564static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1565 uint64_t half[17*2];\ 1566 uint8_t * const halfH= ((uint8_t*)half);\ 1567 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 1568 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ 1569} 1570 1571#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" 1572#define AVG_3DNOW_OP(a,b,temp, size) \ 1573"mov" #size " " #b ", " #temp " \n\t"\ 1574"pavgusb " #temp ", " #a " \n\t"\ 1575"mov" #size " " #a ", " #b " \n\t" 1576#define AVG_MMX2_OP(a,b,temp, size) \ 1577"mov" #size " " #b ", " #temp " \n\t"\ 1578"pavgb " #temp ", " #a " \n\t"\ 1579"mov" #size " " #a ", " #b " \n\t" 1580 1581QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP) 1582QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP) 1583QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP) 1584QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow) 1585QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow) 1586QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) 1587QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) 1588QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) 1589QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) 1590 1591/***********************************/ 1592/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */ 1593 1594#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\ 1595static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1596 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\ 1597} 1598#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\ 1599static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1600 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\ 1601} 1602 1603#define QPEL_2TAP(OPNAME, SIZE, MMX)\ 1604QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\ 1605QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\ 1606QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\ 1607static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\ 1608 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\ 1609static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\ 1610 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\ 1611static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\ 1612 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\ 1613static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1614 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\ 1615}\ 1616static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1617 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\ 1618}\ 1619QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\ 1620QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\ 1621QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\ 1622QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\ 1623QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\ 1624QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\ 1625QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\ 1626QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\ 1627 1628QPEL_2TAP(put_, 16, mmx2) 1629QPEL_2TAP(avg_, 16, mmx2) 1630QPEL_2TAP(put_, 8, mmx2) 1631QPEL_2TAP(avg_, 8, mmx2) 1632QPEL_2TAP(put_, 16, 3dnow) 1633QPEL_2TAP(avg_, 16, 3dnow) 1634QPEL_2TAP(put_, 8, 3dnow) 1635QPEL_2TAP(avg_, 8, 3dnow) 1636 1637 1638#if 0 1639static void just_return(void) { return; } 1640#endif 1641 1642static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, 1643 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){ 1644 const int w = 8; 1645 const int ix = ox>>(16+shift); 1646 const int iy = oy>>(16+shift); 1647 const int oxs = ox>>4; 1648 const int oys = oy>>4; 1649 const int dxxs = dxx>>4; 1650 const int dxys = dxy>>4; 1651 const int dyxs = dyx>>4; 1652 const int dyys = dyy>>4; 1653 const uint16_t r4[4] = {r,r,r,r}; 1654 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys}; 1655 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys}; 1656 const uint64_t shift2 = 2*shift; 1657 uint8_t edge_buf[(h+1)*stride]; 1658 int x, y; 1659 1660 const int dxw = (dxx-(1<<(16+shift)))*(w-1); 1661 const int dyh = (dyy-(1<<(16+shift)))*(h-1); 1662 const int dxh = dxy*(h-1); 1663 const int dyw = dyx*(w-1); 1664 if( // non-constant fullpel offset (3% of blocks) 1665 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) | 1666 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift) 1667 // uses more than 16 bits of subpel mv (only at huge resolution) 1668 || (dxx|dxy|dyx|dyy)&15 ) 1669 { 1670 //FIXME could still use mmx for some of the rows 1671 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height); 1672 return; 1673 } 1674 1675 src += ix + iy*stride; 1676 if( (unsigned)ix >= width-w || 1677 (unsigned)iy >= height-h ) 1678 { 1679 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height); 1680 src = edge_buf; 1681 } 1682 1683 __asm__ volatile( 1684 "movd %0, %%mm6 \n\t" 1685 "pxor %%mm7, %%mm7 \n\t" 1686 "punpcklwd %%mm6, %%mm6 \n\t" 1687 "punpcklwd %%mm6, %%mm6 \n\t" 1688 :: "r"(1<<shift) 1689 ); 1690 1691 for(x=0; x<w; x+=4){ 1692 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0), 1693 oxs - dxys + dxxs*(x+1), 1694 oxs - dxys + dxxs*(x+2), 1695 oxs - dxys + dxxs*(x+3) }; 1696 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0), 1697 oys - dyys + dyxs*(x+1), 1698 oys - dyys + dyxs*(x+2), 1699 oys - dyys + dyxs*(x+3) }; 1700 1701 for(y=0; y<h; y++){ 1702 __asm__ volatile( 1703 "movq %0, %%mm4 \n\t" 1704 "movq %1, %%mm5 \n\t" 1705 "paddw %2, %%mm4 \n\t" 1706 "paddw %3, %%mm5 \n\t" 1707 "movq %%mm4, %0 \n\t" 1708 "movq %%mm5, %1 \n\t" 1709 "psrlw $12, %%mm4 \n\t" 1710 "psrlw $12, %%mm5 \n\t" 1711 : "+m"(*dx4), "+m"(*dy4) 1712 : "m"(*dxy4), "m"(*dyy4) 1713 ); 1714 1715 __asm__ volatile( 1716 "movq %%mm6, %%mm2 \n\t" 1717 "movq %%mm6, %%mm1 \n\t" 1718 "psubw %%mm4, %%mm2 \n\t" 1719 "psubw %%mm5, %%mm1 \n\t" 1720 "movq %%mm2, %%mm0 \n\t" 1721 "movq %%mm4, %%mm3 \n\t" 1722 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy) 1723 "pmullw %%mm5, %%mm3 \n\t" // dx*dy 1724 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy 1725 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy) 1726 1727 "movd %4, %%mm5 \n\t" 1728 "movd %3, %%mm4 \n\t" 1729 "punpcklbw %%mm7, %%mm5 \n\t" 1730 "punpcklbw %%mm7, %%mm4 \n\t" 1731 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy 1732 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy 1733 1734 "movd %2, %%mm5 \n\t" 1735 "movd %1, %%mm4 \n\t" 1736 "punpcklbw %%mm7, %%mm5 \n\t" 1737 "punpcklbw %%mm7, %%mm4 \n\t" 1738 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy) 1739 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy) 1740 "paddw %5, %%mm1 \n\t" 1741 "paddw %%mm3, %%mm2 \n\t" 1742 "paddw %%mm1, %%mm0 \n\t" 1743 "paddw %%mm2, %%mm0 \n\t" 1744 1745 "psrlw %6, %%mm0 \n\t" 1746 "packuswb %%mm0, %%mm0 \n\t" 1747 "movd %%mm0, %0 \n\t" 1748 1749 : "=m"(dst[x+y*stride]) 1750 : "m"(src[0]), "m"(src[1]), 1751 "m"(src[stride]), "m"(src[stride+1]), 1752 "m"(*r4), "m"(shift2) 1753 ); 1754 src += stride; 1755 } 1756 src += 4-h*stride; 1757 } 1758} 1759 1760#define PREFETCH(name, op) \ 1761static void name(void *mem, int stride, int h){\ 1762 const uint8_t *p= mem;\ 1763 do{\ 1764 __asm__ volatile(#op" %0" :: "m"(*p));\ 1765 p+= stride;\ 1766 }while(--h);\ 1767} 1768PREFETCH(prefetch_mmx2, prefetcht0) 1769PREFETCH(prefetch_3dnow, prefetch) 1770#undef PREFETCH 1771 1772#include "h264dsp_mmx.c" 1773#include "rv40dsp_mmx.c" 1774 1775/* CAVS specific */ 1776void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx); 1777void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx); 1778 1779void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { 1780 put_pixels8_mmx(dst, src, stride, 8); 1781} 1782void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { 1783 avg_pixels8_mmx(dst, src, stride, 8); 1784} 1785void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { 1786 put_pixels16_mmx(dst, src, stride, 16); 1787} 1788void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { 1789 avg_pixels16_mmx(dst, src, stride, 16); 1790} 1791 1792/* VC1 specific */ 1793void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx); 1794 1795void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { 1796 put_pixels8_mmx(dst, src, stride, 8); 1797} 1798 1799/* external functions, from idct_mmx.c */ 1800void ff_mmx_idct(DCTELEM *block); 1801void ff_mmxext_idct(DCTELEM *block); 1802 1803/* XXX: those functions should be suppressed ASAP when all IDCTs are 1804 converted */ 1805#if CONFIG_GPL 1806static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block) 1807{ 1808 ff_mmx_idct (block); 1809 put_pixels_clamped_mmx(block, dest, line_size); 1810} 1811static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block) 1812{ 1813 ff_mmx_idct (block); 1814 add_pixels_clamped_mmx(block, dest, line_size); 1815} 1816static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block) 1817{ 1818 ff_mmxext_idct (block); 1819 put_pixels_clamped_mmx(block, dest, line_size); 1820} 1821static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block) 1822{ 1823 ff_mmxext_idct (block); 1824 add_pixels_clamped_mmx(block, dest, line_size); 1825} 1826#endif 1827static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block) 1828{ 1829 ff_idct_xvid_mmx (block); 1830 put_pixels_clamped_mmx(block, dest, line_size); 1831} 1832static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block) 1833{ 1834 ff_idct_xvid_mmx (block); 1835 add_pixels_clamped_mmx(block, dest, line_size); 1836} 1837static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block) 1838{ 1839 ff_idct_xvid_mmx2 (block); 1840 put_pixels_clamped_mmx(block, dest, line_size); 1841} 1842static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block) 1843{ 1844 ff_idct_xvid_mmx2 (block); 1845 add_pixels_clamped_mmx(block, dest, line_size); 1846} 1847 1848static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize) 1849{ 1850 int i; 1851 __asm__ volatile("pxor %%mm7, %%mm7":); 1852 for(i=0; i<blocksize; i+=2) { 1853 __asm__ volatile( 1854 "movq %0, %%mm0 \n\t" 1855 "movq %1, %%mm1 \n\t" 1856 "movq %%mm0, %%mm2 \n\t" 1857 "movq %%mm1, %%mm3 \n\t" 1858 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0 1859 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0 1860 "pslld $31, %%mm2 \n\t" // keep only the sign bit 1861 "pxor %%mm2, %%mm1 \n\t" 1862 "movq %%mm3, %%mm4 \n\t" 1863 "pand %%mm1, %%mm3 \n\t" 1864 "pandn %%mm1, %%mm4 \n\t" 1865 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) 1866 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) 1867 "movq %%mm3, %1 \n\t" 1868 "movq %%mm0, %0 \n\t" 1869 :"+m"(mag[i]), "+m"(ang[i]) 1870 ::"memory" 1871 ); 1872 } 1873 __asm__ volatile("femms"); 1874} 1875static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize) 1876{ 1877 int i; 1878 1879 __asm__ volatile( 1880 "movaps %0, %%xmm5 \n\t" 1881 ::"m"(ff_pdw_80000000[0]) 1882 ); 1883 for(i=0; i<blocksize; i+=4) { 1884 __asm__ volatile( 1885 "movaps %0, %%xmm0 \n\t" 1886 "movaps %1, %%xmm1 \n\t" 1887 "xorps %%xmm2, %%xmm2 \n\t" 1888 "xorps %%xmm3, %%xmm3 \n\t" 1889 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0 1890 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0 1891 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit 1892 "xorps %%xmm2, %%xmm1 \n\t" 1893 "movaps %%xmm3, %%xmm4 \n\t" 1894 "andps %%xmm1, %%xmm3 \n\t" 1895 "andnps %%xmm1, %%xmm4 \n\t" 1896 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) 1897 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) 1898 "movaps %%xmm3, %1 \n\t" 1899 "movaps %%xmm0, %0 \n\t" 1900 :"+m"(mag[i]), "+m"(ang[i]) 1901 ::"memory" 1902 ); 1903 } 1904} 1905 1906#define IF1(x) x 1907#define IF0(x) 1908 1909#define MIX5(mono,stereo)\ 1910 __asm__ volatile(\ 1911 "movss 0(%2), %%xmm5 \n"\ 1912 "movss 8(%2), %%xmm6 \n"\ 1913 "movss 24(%2), %%xmm7 \n"\ 1914 "shufps $0, %%xmm5, %%xmm5 \n"\ 1915 "shufps $0, %%xmm6, %%xmm6 \n"\ 1916 "shufps $0, %%xmm7, %%xmm7 \n"\ 1917 "1: \n"\ 1918 "movaps (%0,%1), %%xmm0 \n"\ 1919 "movaps 0x400(%0,%1), %%xmm1 \n"\ 1920 "movaps 0x800(%0,%1), %%xmm2 \n"\ 1921 "movaps 0xc00(%0,%1), %%xmm3 \n"\ 1922 "movaps 0x1000(%0,%1), %%xmm4 \n"\ 1923 "mulps %%xmm5, %%xmm0 \n"\ 1924 "mulps %%xmm6, %%xmm1 \n"\ 1925 "mulps %%xmm5, %%xmm2 \n"\ 1926 "mulps %%xmm7, %%xmm3 \n"\ 1927 "mulps %%xmm7, %%xmm4 \n"\ 1928 stereo("addps %%xmm1, %%xmm0 \n")\ 1929 "addps %%xmm1, %%xmm2 \n"\ 1930 "addps %%xmm3, %%xmm0 \n"\ 1931 "addps %%xmm4, %%xmm2 \n"\ 1932 mono("addps %%xmm2, %%xmm0 \n")\ 1933 "movaps %%xmm0, (%0,%1) \n"\ 1934 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\ 1935 "add $16, %0 \n"\ 1936 "jl 1b \n"\ 1937 :"+&r"(i)\ 1938 :"r"(samples[0]+len), "r"(matrix)\ 1939 :"memory"\ 1940 ); 1941 1942#define MIX_MISC(stereo)\ 1943 __asm__ volatile(\ 1944 "1: \n"\ 1945 "movaps (%3,%0), %%xmm0 \n"\ 1946 stereo("movaps %%xmm0, %%xmm1 \n")\ 1947 "mulps %%xmm6, %%xmm0 \n"\ 1948 stereo("mulps %%xmm7, %%xmm1 \n")\ 1949 "lea 1024(%3,%0), %1 \n"\ 1950 "mov %5, %2 \n"\ 1951 "2: \n"\ 1952 "movaps (%1), %%xmm2 \n"\ 1953 stereo("movaps %%xmm2, %%xmm3 \n")\ 1954 "mulps (%4,%2), %%xmm2 \n"\ 1955 stereo("mulps 16(%4,%2), %%xmm3 \n")\ 1956 "addps %%xmm2, %%xmm0 \n"\ 1957 stereo("addps %%xmm3, %%xmm1 \n")\ 1958 "add $1024, %1 \n"\ 1959 "add $32, %2 \n"\ 1960 "jl 2b \n"\ 1961 "movaps %%xmm0, (%3,%0) \n"\ 1962 stereo("movaps %%xmm1, 1024(%3,%0) \n")\ 1963 "add $16, %0 \n"\ 1964 "jl 1b \n"\ 1965 :"+&r"(i), "=&r"(j), "=&r"(k)\ 1966 :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\ 1967 :"memory"\ 1968 ); 1969 1970static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len) 1971{ 1972 int (*matrix_cmp)[2] = (int(*)[2])matrix; 1973 intptr_t i,j,k; 1974 1975 i = -len*sizeof(float); 1976 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) { 1977 MIX5(IF0,IF1); 1978 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) { 1979 MIX5(IF1,IF0); 1980 } else { 1981 DECLARE_ALIGNED_16(float, matrix_simd[in_ch][2][4]); 1982 j = 2*in_ch*sizeof(float); 1983 __asm__ volatile( 1984 "1: \n" 1985 "sub $8, %0 \n" 1986 "movss (%2,%0), %%xmm6 \n" 1987 "movss 4(%2,%0), %%xmm7 \n" 1988 "shufps $0, %%xmm6, %%xmm6 \n" 1989 "shufps $0, %%xmm7, %%xmm7 \n" 1990 "movaps %%xmm6, (%1,%0,4) \n" 1991 "movaps %%xmm7, 16(%1,%0,4) \n" 1992 "jg 1b \n" 1993 :"+&r"(j) 1994 :"r"(matrix_simd), "r"(matrix) 1995 :"memory" 1996 ); 1997 if(out_ch == 2) { 1998 MIX_MISC(IF1); 1999 } else { 2000 MIX_MISC(IF0); 2001 } 2002 } 2003} 2004 2005static void vector_fmul_3dnow(float *dst, const float *src, int len){ 2006 x86_reg i = (len-4)*4; 2007 __asm__ volatile( 2008 "1: \n\t" 2009 "movq (%1,%0), %%mm0 \n\t" 2010 "movq 8(%1,%0), %%mm1 \n\t" 2011 "pfmul (%2,%0), %%mm0 \n\t" 2012 "pfmul 8(%2,%0), %%mm1 \n\t" 2013 "movq %%mm0, (%1,%0) \n\t" 2014 "movq %%mm1, 8(%1,%0) \n\t" 2015 "sub $16, %0 \n\t" 2016 "jge 1b \n\t" 2017 "femms \n\t" 2018 :"+r"(i) 2019 :"r"(dst), "r"(src) 2020 :"memory" 2021 ); 2022} 2023static void vector_fmul_sse(float *dst, const float *src, int len){ 2024 x86_reg i = (len-8)*4; 2025 __asm__ volatile( 2026 "1: \n\t" 2027 "movaps (%1,%0), %%xmm0 \n\t" 2028 "movaps 16(%1,%0), %%xmm1 \n\t" 2029 "mulps (%2,%0), %%xmm0 \n\t" 2030 "mulps 16(%2,%0), %%xmm1 \n\t" 2031 "movaps %%xmm0, (%1,%0) \n\t" 2032 "movaps %%xmm1, 16(%1,%0) \n\t" 2033 "sub $32, %0 \n\t" 2034 "jge 1b \n\t" 2035 :"+r"(i) 2036 :"r"(dst), "r"(src) 2037 :"memory" 2038 ); 2039} 2040 2041static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){ 2042 x86_reg i = len*4-16; 2043 __asm__ volatile( 2044 "1: \n\t" 2045 "pswapd 8(%1), %%mm0 \n\t" 2046 "pswapd (%1), %%mm1 \n\t" 2047 "pfmul (%3,%0), %%mm0 \n\t" 2048 "pfmul 8(%3,%0), %%mm1 \n\t" 2049 "movq %%mm0, (%2,%0) \n\t" 2050 "movq %%mm1, 8(%2,%0) \n\t" 2051 "add $16, %1 \n\t" 2052 "sub $16, %0 \n\t" 2053 "jge 1b \n\t" 2054 :"+r"(i), "+r"(src1) 2055 :"r"(dst), "r"(src0) 2056 ); 2057 __asm__ volatile("femms"); 2058} 2059static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){ 2060 x86_reg i = len*4-32; 2061 __asm__ volatile( 2062 "1: \n\t" 2063 "movaps 16(%1), %%xmm0 \n\t" 2064 "movaps (%1), %%xmm1 \n\t" 2065 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" 2066 "shufps $0x1b, %%xmm1, %%xmm1 \n\t" 2067 "mulps (%3,%0), %%xmm0 \n\t" 2068 "mulps 16(%3,%0), %%xmm1 \n\t" 2069 "movaps %%xmm0, (%2,%0) \n\t" 2070 "movaps %%xmm1, 16(%2,%0) \n\t" 2071 "add $32, %1 \n\t" 2072 "sub $32, %0 \n\t" 2073 "jge 1b \n\t" 2074 :"+r"(i), "+r"(src1) 2075 :"r"(dst), "r"(src0) 2076 ); 2077} 2078 2079static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1, 2080 const float *src2, int src3, int len, int step){ 2081 x86_reg i = (len-4)*4; 2082 if(step == 2 && src3 == 0){ 2083 dst += (len-4)*2; 2084 __asm__ volatile( 2085 "1: \n\t" 2086 "movq (%2,%0), %%mm0 \n\t" 2087 "movq 8(%2,%0), %%mm1 \n\t" 2088 "pfmul (%3,%0), %%mm0 \n\t" 2089 "pfmul 8(%3,%0), %%mm1 \n\t" 2090 "pfadd (%4,%0), %%mm0 \n\t" 2091 "pfadd 8(%4,%0), %%mm1 \n\t" 2092 "movd %%mm0, (%1) \n\t" 2093 "movd %%mm1, 16(%1) \n\t" 2094 "psrlq $32, %%mm0 \n\t" 2095 "psrlq $32, %%mm1 \n\t" 2096 "movd %%mm0, 8(%1) \n\t" 2097 "movd %%mm1, 24(%1) \n\t" 2098 "sub $32, %1 \n\t" 2099 "sub $16, %0 \n\t" 2100 "jge 1b \n\t" 2101 :"+r"(i), "+r"(dst) 2102 :"r"(src0), "r"(src1), "r"(src2) 2103 :"memory" 2104 ); 2105 } 2106 else if(step == 1 && src3 == 0){ 2107 __asm__ volatile( 2108 "1: \n\t" 2109 "movq (%2,%0), %%mm0 \n\t" 2110 "movq 8(%2,%0), %%mm1 \n\t" 2111 "pfmul (%3,%0), %%mm0 \n\t" 2112 "pfmul 8(%3,%0), %%mm1 \n\t" 2113 "pfadd (%4,%0), %%mm0 \n\t" 2114 "pfadd 8(%4,%0), %%mm1 \n\t" 2115 "movq %%mm0, (%1,%0) \n\t" 2116 "movq %%mm1, 8(%1,%0) \n\t" 2117 "sub $16, %0 \n\t" 2118 "jge 1b \n\t" 2119 :"+r"(i) 2120 :"r"(dst), "r"(src0), "r"(src1), "r"(src2) 2121 :"memory" 2122 ); 2123 } 2124 else 2125 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); 2126 __asm__ volatile("femms"); 2127} 2128static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1, 2129 const float *src2, int src3, int len, int step){ 2130 x86_reg i = (len-8)*4; 2131 if(step == 2 && src3 == 0){ 2132 dst += (len-8)*2; 2133 __asm__ volatile( 2134 "1: \n\t" 2135 "movaps (%2,%0), %%xmm0 \n\t" 2136 "movaps 16(%2,%0), %%xmm1 \n\t" 2137 "mulps (%3,%0), %%xmm0 \n\t" 2138 "mulps 16(%3,%0), %%xmm1 \n\t" 2139 "addps (%4,%0), %%xmm0 \n\t" 2140 "addps 16(%4,%0), %%xmm1 \n\t" 2141 "movss %%xmm0, (%1) \n\t" 2142 "movss %%xmm1, 32(%1) \n\t" 2143 "movhlps %%xmm0, %%xmm2 \n\t" 2144 "movhlps %%xmm1, %%xmm3 \n\t" 2145 "movss %%xmm2, 16(%1) \n\t" 2146 "movss %%xmm3, 48(%1) \n\t" 2147 "shufps $0xb1, %%xmm0, %%xmm0 \n\t" 2148 "shufps $0xb1, %%xmm1, %%xmm1 \n\t" 2149 "movss %%xmm0, 8(%1) \n\t" 2150 "movss %%xmm1, 40(%1) \n\t" 2151 "movhlps %%xmm0, %%xmm2 \n\t" 2152 "movhlps %%xmm1, %%xmm3 \n\t" 2153 "movss %%xmm2, 24(%1) \n\t" 2154 "movss %%xmm3, 56(%1) \n\t" 2155 "sub $64, %1 \n\t" 2156 "sub $32, %0 \n\t" 2157 "jge 1b \n\t" 2158 :"+r"(i), "+r"(dst) 2159 :"r"(src0), "r"(src1), "r"(src2) 2160 :"memory" 2161 ); 2162 } 2163 else if(step == 1 && src3 == 0){ 2164 __asm__ volatile( 2165 "1: \n\t" 2166 "movaps (%2,%0), %%xmm0 \n\t" 2167 "movaps 16(%2,%0), %%xmm1 \n\t" 2168 "mulps (%3,%0), %%xmm0 \n\t" 2169 "mulps 16(%3,%0), %%xmm1 \n\t" 2170 "addps (%4,%0), %%xmm0 \n\t" 2171 "addps 16(%4,%0), %%xmm1 \n\t" 2172 "movaps %%xmm0, (%1,%0) \n\t" 2173 "movaps %%xmm1, 16(%1,%0) \n\t" 2174 "sub $32, %0 \n\t" 2175 "jge 1b \n\t" 2176 :"+r"(i) 2177 :"r"(dst), "r"(src0), "r"(src1), "r"(src2) 2178 :"memory" 2179 ); 2180 } 2181 else 2182 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); 2183} 2184 2185static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, 2186 const float *win, float add_bias, int len){ 2187#if HAVE_6REGS 2188 if(add_bias == 0){ 2189 x86_reg i = -len*4; 2190 x86_reg j = len*4-8; 2191 __asm__ volatile( 2192 "1: \n" 2193 "pswapd (%5,%1), %%mm1 \n" 2194 "movq (%5,%0), %%mm0 \n" 2195 "pswapd (%4,%1), %%mm5 \n" 2196 "movq (%3,%0), %%mm4 \n" 2197 "movq %%mm0, %%mm2 \n" 2198 "movq %%mm1, %%mm3 \n" 2199 "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i] 2200 "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j] 2201 "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j] 2202 "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i] 2203 "pfadd %%mm3, %%mm2 \n" 2204 "pfsub %%mm0, %%mm1 \n" 2205 "pswapd %%mm2, %%mm2 \n" 2206 "movq %%mm1, (%2,%0) \n" 2207 "movq %%mm2, (%2,%1) \n" 2208 "sub $8, %1 \n" 2209 "add $8, %0 \n" 2210 "jl 1b \n" 2211 "femms \n" 2212 :"+r"(i), "+r"(j) 2213 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) 2214 ); 2215 }else 2216#endif 2217 ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); 2218} 2219 2220static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1, 2221 const float *win, float add_bias, int len){ 2222#if HAVE_6REGS 2223 if(add_bias == 0){ 2224 x86_reg i = -len*4; 2225 x86_reg j = len*4-16; 2226 __asm__ volatile( 2227 "1: \n" 2228 "movaps (%5,%1), %%xmm1 \n" 2229 "movaps (%5,%0), %%xmm0 \n" 2230 "movaps (%4,%1), %%xmm5 \n" 2231 "movaps (%3,%0), %%xmm4 \n" 2232 "shufps $0x1b, %%xmm1, %%xmm1 \n" 2233 "shufps $0x1b, %%xmm5, %%xmm5 \n" 2234 "movaps %%xmm0, %%xmm2 \n" 2235 "movaps %%xmm1, %%xmm3 \n" 2236 "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i] 2237 "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j] 2238 "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j] 2239 "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i] 2240 "addps %%xmm3, %%xmm2 \n" 2241 "subps %%xmm0, %%xmm1 \n" 2242 "shufps $0x1b, %%xmm2, %%xmm2 \n" 2243 "movaps %%xmm1, (%2,%0) \n" 2244 "movaps %%xmm2, (%2,%1) \n" 2245 "sub $16, %1 \n" 2246 "add $16, %0 \n" 2247 "jl 1b \n" 2248 :"+r"(i), "+r"(j) 2249 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) 2250 ); 2251 }else 2252#endif 2253 ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); 2254} 2255 2256static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) 2257{ 2258 x86_reg i = -4*len; 2259 __asm__ volatile( 2260 "movss %3, %%xmm4 \n" 2261 "shufps $0, %%xmm4, %%xmm4 \n" 2262 "1: \n" 2263 "cvtpi2ps (%2,%0), %%xmm0 \n" 2264 "cvtpi2ps 8(%2,%0), %%xmm1 \n" 2265 "cvtpi2ps 16(%2,%0), %%xmm2 \n" 2266 "cvtpi2ps 24(%2,%0), %%xmm3 \n" 2267 "movlhps %%xmm1, %%xmm0 \n" 2268 "movlhps %%xmm3, %%xmm2 \n" 2269 "mulps %%xmm4, %%xmm0 \n" 2270 "mulps %%xmm4, %%xmm2 \n" 2271 "movaps %%xmm0, (%1,%0) \n" 2272 "movaps %%xmm2, 16(%1,%0) \n" 2273 "add $32, %0 \n" 2274 "jl 1b \n" 2275 :"+r"(i) 2276 :"r"(dst+len), "r"(src+len), "m"(mul) 2277 ); 2278} 2279 2280static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) 2281{ 2282 x86_reg i = -4*len; 2283 __asm__ volatile( 2284 "movss %3, %%xmm4 \n" 2285 "shufps $0, %%xmm4, %%xmm4 \n" 2286 "1: \n" 2287 "cvtdq2ps (%2,%0), %%xmm0 \n" 2288 "cvtdq2ps 16(%2,%0), %%xmm1 \n" 2289 "mulps %%xmm4, %%xmm0 \n" 2290 "mulps %%xmm4, %%xmm1 \n" 2291 "movaps %%xmm0, (%1,%0) \n" 2292 "movaps %%xmm1, 16(%1,%0) \n" 2293 "add $32, %0 \n" 2294 "jl 1b \n" 2295 :"+r"(i) 2296 :"r"(dst+len), "r"(src+len), "m"(mul) 2297 ); 2298} 2299 2300static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ 2301 x86_reg reglen = len; 2302 // not bit-exact: pf2id uses different rounding than C and SSE 2303 __asm__ volatile( 2304 "add %0 , %0 \n\t" 2305 "lea (%2,%0,2) , %2 \n\t" 2306 "add %0 , %1 \n\t" 2307 "neg %0 \n\t" 2308 "1: \n\t" 2309 "pf2id (%2,%0,2) , %%mm0 \n\t" 2310 "pf2id 8(%2,%0,2) , %%mm1 \n\t" 2311 "pf2id 16(%2,%0,2) , %%mm2 \n\t" 2312 "pf2id 24(%2,%0,2) , %%mm3 \n\t" 2313 "packssdw %%mm1 , %%mm0 \n\t" 2314 "packssdw %%mm3 , %%mm2 \n\t" 2315 "movq %%mm0 , (%1,%0) \n\t" 2316 "movq %%mm2 , 8(%1,%0) \n\t" 2317 "add $16 , %0 \n\t" 2318 " js 1b \n\t" 2319 "femms \n\t" 2320 :"+r"(reglen), "+r"(dst), "+r"(src) 2321 ); 2322} 2323static void float_to_int16_sse(int16_t *dst, const float *src, long len){ 2324 x86_reg reglen = len; 2325 __asm__ volatile( 2326 "add %0 , %0 \n\t" 2327 "lea (%2,%0,2) , %2 \n\t" 2328 "add %0 , %1 \n\t" 2329 "neg %0 \n\t" 2330 "1: \n\t" 2331 "cvtps2pi (%2,%0,2) , %%mm0 \n\t" 2332 "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" 2333 "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" 2334 "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" 2335 "packssdw %%mm1 , %%mm0 \n\t" 2336 "packssdw %%mm3 , %%mm2 \n\t" 2337 "movq %%mm0 , (%1,%0) \n\t" 2338 "movq %%mm2 , 8(%1,%0) \n\t" 2339 "add $16 , %0 \n\t" 2340 " js 1b \n\t" 2341 "emms \n\t" 2342 :"+r"(reglen), "+r"(dst), "+r"(src) 2343 ); 2344} 2345 2346static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ 2347 x86_reg reglen = len; 2348 __asm__ volatile( 2349 "add %0 , %0 \n\t" 2350 "lea (%2,%0,2) , %2 \n\t" 2351 "add %0 , %1 \n\t" 2352 "neg %0 \n\t" 2353 "1: \n\t" 2354 "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" 2355 "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" 2356 "packssdw %%xmm1 , %%xmm0 \n\t" 2357 "movdqa %%xmm0 , (%1,%0) \n\t" 2358 "add $16 , %0 \n\t" 2359 " js 1b \n\t" 2360 :"+r"(reglen), "+r"(dst), "+r"(src) 2361 ); 2362} 2363 2364#if HAVE_YASM 2365void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); 2366void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); 2367void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); 2368void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top); 2369void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); 2370void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); 2371void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); 2372void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); 2373#if ARCH_X86_32 2374static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) 2375{ 2376 ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); 2377 ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); 2378} 2379#endif 2380void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); 2381void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); 2382#else 2383#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) 2384#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) 2385#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) 2386#endif 2387#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse 2388 2389#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ 2390/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ 2391static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ 2392 DECLARE_ALIGNED_16(int16_t, tmp[len]);\ 2393 int i,j,c;\ 2394 for(c=0; c<channels; c++){\ 2395 float_to_int16_##cpu(tmp, src[c], len);\ 2396 for(i=0, j=c; i<len; i++, j+=channels)\ 2397 dst[j] = tmp[i];\ 2398 }\ 2399}\ 2400\ 2401static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\ 2402 if(channels==1)\ 2403 float_to_int16_##cpu(dst, src[0], len);\ 2404 else if(channels==2){\ 2405 x86_reg reglen = len; \ 2406 const float *src0 = src[0];\ 2407 const float *src1 = src[1];\ 2408 __asm__ volatile(\ 2409 "shl $2, %0 \n"\ 2410 "add %0, %1 \n"\ 2411 "add %0, %2 \n"\ 2412 "add %0, %3 \n"\ 2413 "neg %0 \n"\ 2414 body\ 2415 :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\ 2416 );\ 2417 }else if(channels==6){\ 2418 ff_float_to_int16_interleave6_##cpu(dst, src, len);\ 2419 }else\ 2420 float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ 2421} 2422 2423FLOAT_TO_INT16_INTERLEAVE(3dnow, 2424 "1: \n" 2425 "pf2id (%2,%0), %%mm0 \n" 2426 "pf2id 8(%2,%0), %%mm1 \n" 2427 "pf2id (%3,%0), %%mm2 \n" 2428 "pf2id 8(%3,%0), %%mm3 \n" 2429 "packssdw %%mm1, %%mm0 \n" 2430 "packssdw %%mm3, %%mm2 \n" 2431 "movq %%mm0, %%mm1 \n" 2432 "punpcklwd %%mm2, %%mm0 \n" 2433 "punpckhwd %%mm2, %%mm1 \n" 2434 "movq %%mm0, (%1,%0)\n" 2435 "movq %%mm1, 8(%1,%0)\n" 2436 "add $16, %0 \n" 2437 "js 1b \n" 2438 "femms \n" 2439) 2440 2441FLOAT_TO_INT16_INTERLEAVE(sse, 2442 "1: \n" 2443 "cvtps2pi (%2,%0), %%mm0 \n" 2444 "cvtps2pi 8(%2,%0), %%mm1 \n" 2445 "cvtps2pi (%3,%0), %%mm2 \n" 2446 "cvtps2pi 8(%3,%0), %%mm3 \n" 2447 "packssdw %%mm1, %%mm0 \n" 2448 "packssdw %%mm3, %%mm2 \n" 2449 "movq %%mm0, %%mm1 \n" 2450 "punpcklwd %%mm2, %%mm0 \n" 2451 "punpckhwd %%mm2, %%mm1 \n" 2452 "movq %%mm0, (%1,%0)\n" 2453 "movq %%mm1, 8(%1,%0)\n" 2454 "add $16, %0 \n" 2455 "js 1b \n" 2456 "emms \n" 2457) 2458 2459FLOAT_TO_INT16_INTERLEAVE(sse2, 2460 "1: \n" 2461 "cvtps2dq (%2,%0), %%xmm0 \n" 2462 "cvtps2dq (%3,%0), %%xmm1 \n" 2463 "packssdw %%xmm1, %%xmm0 \n" 2464 "movhlps %%xmm0, %%xmm1 \n" 2465 "punpcklwd %%xmm1, %%xmm0 \n" 2466 "movdqa %%xmm0, (%1,%0) \n" 2467 "add $16, %0 \n" 2468 "js 1b \n" 2469) 2470 2471static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ 2472 if(channels==6) 2473 ff_float_to_int16_interleave6_3dn2(dst, src, len); 2474 else 2475 float_to_int16_interleave_3dnow(dst, src, len, channels); 2476} 2477 2478 2479void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width); 2480void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width); 2481void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); 2482void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); 2483void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, 2484 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); 2485void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, 2486 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); 2487 2488 2489static void add_int16_sse2(int16_t * v1, int16_t * v2, int order) 2490{ 2491 x86_reg o = -(order << 1); 2492 v1 += order; 2493 v2 += order; 2494 __asm__ volatile( 2495 "1: \n\t" 2496 "movdqu (%1,%2), %%xmm0 \n\t" 2497 "movdqu 16(%1,%2), %%xmm1 \n\t" 2498 "paddw (%0,%2), %%xmm0 \n\t" 2499 "paddw 16(%0,%2), %%xmm1 \n\t" 2500 "movdqa %%xmm0, (%0,%2) \n\t" 2501 "movdqa %%xmm1, 16(%0,%2) \n\t" 2502 "add $32, %2 \n\t" 2503 "js 1b \n\t" 2504 : "+r"(v1), "+r"(v2), "+r"(o) 2505 ); 2506} 2507 2508static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order) 2509{ 2510 x86_reg o = -(order << 1); 2511 v1 += order; 2512 v2 += order; 2513 __asm__ volatile( 2514 "1: \n\t" 2515 "movdqa (%0,%2), %%xmm0 \n\t" 2516 "movdqa 16(%0,%2), %%xmm2 \n\t" 2517 "movdqu (%1,%2), %%xmm1 \n\t" 2518 "movdqu 16(%1,%2), %%xmm3 \n\t" 2519 "psubw %%xmm1, %%xmm0 \n\t" 2520 "psubw %%xmm3, %%xmm2 \n\t" 2521 "movdqa %%xmm0, (%0,%2) \n\t" 2522 "movdqa %%xmm2, 16(%0,%2) \n\t" 2523 "add $32, %2 \n\t" 2524 "js 1b \n\t" 2525 : "+r"(v1), "+r"(v2), "+r"(o) 2526 ); 2527} 2528 2529static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift) 2530{ 2531 int res = 0; 2532 DECLARE_ALIGNED_16(xmm_reg, sh); 2533 x86_reg o = -(order << 1); 2534 2535 v1 += order; 2536 v2 += order; 2537 sh.a = shift; 2538 __asm__ volatile( 2539 "pxor %%xmm7, %%xmm7 \n\t" 2540 "1: \n\t" 2541 "movdqu (%0,%3), %%xmm0 \n\t" 2542 "movdqu 16(%0,%3), %%xmm1 \n\t" 2543 "pmaddwd (%1,%3), %%xmm0 \n\t" 2544 "pmaddwd 16(%1,%3), %%xmm1 \n\t" 2545 "paddd %%xmm0, %%xmm7 \n\t" 2546 "paddd %%xmm1, %%xmm7 \n\t" 2547 "add $32, %3 \n\t" 2548 "js 1b \n\t" 2549 "movhlps %%xmm7, %%xmm2 \n\t" 2550 "paddd %%xmm2, %%xmm7 \n\t" 2551 "psrad %4, %%xmm7 \n\t" 2552 "pshuflw $0x4E, %%xmm7,%%xmm2 \n\t" 2553 "paddd %%xmm2, %%xmm7 \n\t" 2554 "movd %%xmm7, %2 \n\t" 2555 : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o) 2556 : "m"(sh) 2557 ); 2558 return res; 2559} 2560 2561void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) 2562{ 2563 mm_flags = mm_support(); 2564 2565 if (avctx->dsp_mask) { 2566 if (avctx->dsp_mask & FF_MM_FORCE) 2567 mm_flags |= (avctx->dsp_mask & 0xffff); 2568 else 2569 mm_flags &= ~(avctx->dsp_mask & 0xffff); 2570 } 2571 2572#if 0 2573 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:"); 2574 if (mm_flags & FF_MM_MMX) 2575 av_log(avctx, AV_LOG_INFO, " mmx"); 2576 if (mm_flags & FF_MM_MMXEXT) 2577 av_log(avctx, AV_LOG_INFO, " mmxext"); 2578 if (mm_flags & FF_MM_3DNOW) 2579 av_log(avctx, AV_LOG_INFO, " 3dnow"); 2580 if (mm_flags & FF_MM_SSE) 2581 av_log(avctx, AV_LOG_INFO, " sse"); 2582 if (mm_flags & FF_MM_SSE2) 2583 av_log(avctx, AV_LOG_INFO, " sse2"); 2584 av_log(avctx, AV_LOG_INFO, "\n"); 2585#endif 2586 2587 if (mm_flags & FF_MM_MMX) { 2588 const int idct_algo= avctx->idct_algo; 2589 2590 if(avctx->lowres==0){ 2591 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ 2592 c->idct_put= ff_simple_idct_put_mmx; 2593 c->idct_add= ff_simple_idct_add_mmx; 2594 c->idct = ff_simple_idct_mmx; 2595 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM; 2596#if CONFIG_GPL 2597 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ 2598 if(mm_flags & FF_MM_MMXEXT){ 2599 c->idct_put= ff_libmpeg2mmx2_idct_put; 2600 c->idct_add= ff_libmpeg2mmx2_idct_add; 2601 c->idct = ff_mmxext_idct; 2602 }else{ 2603 c->idct_put= ff_libmpeg2mmx_idct_put; 2604 c->idct_add= ff_libmpeg2mmx_idct_add; 2605 c->idct = ff_mmx_idct; 2606 } 2607 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; 2608#endif 2609 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER || CONFIG_THEORA_DECODER) && 2610 idct_algo==FF_IDCT_VP3){ 2611 if(mm_flags & FF_MM_SSE2){ 2612 c->idct_put= ff_vp3_idct_put_sse2; 2613 c->idct_add= ff_vp3_idct_add_sse2; 2614 c->idct = ff_vp3_idct_sse2; 2615 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; 2616 }else{ 2617 c->idct_put= ff_vp3_idct_put_mmx; 2618 c->idct_add= ff_vp3_idct_add_mmx; 2619 c->idct = ff_vp3_idct_mmx; 2620 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM; 2621 } 2622 }else if(idct_algo==FF_IDCT_CAVS){ 2623 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; 2624 }else if(idct_algo==FF_IDCT_XVIDMMX){ 2625 if(mm_flags & FF_MM_SSE2){ 2626 c->idct_put= ff_idct_xvid_sse2_put; 2627 c->idct_add= ff_idct_xvid_sse2_add; 2628 c->idct = ff_idct_xvid_sse2; 2629 c->idct_permutation_type= FF_SSE2_IDCT_PERM; 2630 }else if(mm_flags & FF_MM_MMXEXT){ 2631 c->idct_put= ff_idct_xvid_mmx2_put; 2632 c->idct_add= ff_idct_xvid_mmx2_add; 2633 c->idct = ff_idct_xvid_mmx2; 2634 }else{ 2635 c->idct_put= ff_idct_xvid_mmx_put; 2636 c->idct_add= ff_idct_xvid_mmx_add; 2637 c->idct = ff_idct_xvid_mmx; 2638 } 2639 } 2640 } 2641 2642 c->put_pixels_clamped = put_pixels_clamped_mmx; 2643 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx; 2644 c->add_pixels_clamped = add_pixels_clamped_mmx; 2645 c->clear_block = clear_block_mmx; 2646 c->clear_blocks = clear_blocks_mmx; 2647 if (mm_flags & FF_MM_SSE) 2648 c->clear_block = clear_block_sse; 2649 2650#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ 2651 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ 2652 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ 2653 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ 2654 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU 2655 2656 SET_HPEL_FUNCS(put, 0, 16, mmx); 2657 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx); 2658 SET_HPEL_FUNCS(avg, 0, 16, mmx); 2659 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx); 2660 SET_HPEL_FUNCS(put, 1, 8, mmx); 2661 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx); 2662 SET_HPEL_FUNCS(avg, 1, 8, mmx); 2663 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx); 2664 2665 c->gmc= gmc_mmx; 2666 2667 c->add_bytes= add_bytes_mmx; 2668 c->add_bytes_l2= add_bytes_l2_mmx; 2669 2670 c->draw_edges = draw_edges_mmx; 2671 2672 if (CONFIG_ANY_H263) { 2673 c->h263_v_loop_filter= h263_v_loop_filter_mmx; 2674 c->h263_h_loop_filter= h263_h_loop_filter_mmx; 2675 } 2676 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd; 2677 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx; 2678 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd; 2679 2680 c->put_rv40_chroma_pixels_tab[0]= put_rv40_chroma_mc8_mmx; 2681 c->put_rv40_chroma_pixels_tab[1]= put_rv40_chroma_mc4_mmx; 2682 2683 c->h264_idct_dc_add= 2684 c->h264_idct_add= ff_h264_idct_add_mmx; 2685 c->h264_idct8_dc_add= 2686 c->h264_idct8_add= ff_h264_idct8_add_mmx; 2687 2688 c->h264_idct_add16 = ff_h264_idct_add16_mmx; 2689 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; 2690 c->h264_idct_add8 = ff_h264_idct_add8_mmx; 2691 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; 2692 2693 if (CONFIG_VP6_DECODER) { 2694 c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx; 2695 } 2696 2697 if (mm_flags & FF_MM_MMXEXT) { 2698 c->prefetch = prefetch_mmx2; 2699 2700 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; 2701 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2; 2702 2703 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2; 2704 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2; 2705 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2; 2706 2707 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2; 2708 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2; 2709 2710 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; 2711 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; 2712 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; 2713 2714 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; 2715 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; 2716 c->h264_idct_add16 = ff_h264_idct_add16_mmx2; 2717 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; 2718 c->h264_idct_add8 = ff_h264_idct_add8_mmx2; 2719 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; 2720 2721 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 2722 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; 2723 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; 2724 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; 2725 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; 2726 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; 2727 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; 2728 2729 if (CONFIG_VP3_DECODER || CONFIG_THEORA_DECODER) { 2730 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2; 2731 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2; 2732 } 2733 } 2734 2735#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \ 2736 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \ 2737 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \ 2738 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \ 2739 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \ 2740 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \ 2741 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \ 2742 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \ 2743 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \ 2744 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \ 2745 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \ 2746 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \ 2747 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \ 2748 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \ 2749 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \ 2750 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \ 2751 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU 2752 2753 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2); 2754 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2); 2755 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2); 2756 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2); 2757 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2); 2758 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2); 2759 2760 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2); 2761 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2); 2762 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2); 2763 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2); 2764 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2); 2765 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2); 2766 2767 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2); 2768 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2); 2769 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2); 2770 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2); 2771 2772 c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_mmx2; 2773 c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_mmx2; 2774 2775 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd; 2776 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2; 2777 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2; 2778 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2; 2779 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; 2780 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; 2781 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; 2782 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; 2783 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; 2784 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; 2785 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; 2786 2787 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; 2788 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; 2789 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; 2790 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; 2791 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; 2792 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; 2793 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; 2794 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; 2795 2796 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; 2797 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; 2798 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; 2799 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; 2800 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; 2801 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; 2802 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; 2803 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; 2804 2805#if HAVE_YASM 2806 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2; 2807#endif 2808#if HAVE_7REGS && HAVE_TEN_OPERANDS 2809 if( mm_flags&FF_MM_3DNOW ) 2810 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov; 2811#endif 2812 2813 if (CONFIG_CAVS_DECODER) 2814 ff_cavsdsp_init_mmx2(c, avctx); 2815 2816 if (CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER) 2817 ff_vc1dsp_init_mmx(c, avctx); 2818 2819 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2; 2820 } else if (mm_flags & FF_MM_3DNOW) { 2821 c->prefetch = prefetch_3dnow; 2822 2823 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; 2824 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; 2825 2826 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; 2827 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; 2828 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; 2829 2830 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow; 2831 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow; 2832 2833 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow; 2834 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; 2835 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; 2836 2837 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 2838 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; 2839 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; 2840 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; 2841 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; 2842 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; 2843 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; 2844 } 2845 2846 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow); 2847 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow); 2848 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow); 2849 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow); 2850 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow); 2851 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow); 2852 2853 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow); 2854 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow); 2855 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow); 2856 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow); 2857 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow); 2858 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow); 2859 2860 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow); 2861 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow); 2862 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow); 2863 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow); 2864 2865 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd; 2866 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow; 2867 2868 c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_3dnow; 2869 c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_3dnow; 2870 2871 if (CONFIG_CAVS_DECODER) 2872 ff_cavsdsp_init_3dnow(c, avctx); 2873 } 2874 2875 2876#define H264_QPEL_FUNCS(x, y, CPU)\ 2877 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\ 2878 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\ 2879 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\ 2880 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; 2881 if((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)){ 2882 // these functions are slower than mmx on AMD, but faster on Intel 2883/* FIXME works in most codecs, but crashes svq1 due to unaligned chroma 2884 c->put_pixels_tab[0][0] = put_pixels16_sse2; 2885 c->avg_pixels_tab[0][0] = avg_pixels16_sse2; 2886*/ 2887 H264_QPEL_FUNCS(0, 0, sse2); 2888 } 2889 if(mm_flags & FF_MM_SSE2){ 2890 c->h264_idct8_add = ff_h264_idct8_add_sse2; 2891 c->h264_idct8_add4= ff_h264_idct8_add4_sse2; 2892 2893 H264_QPEL_FUNCS(0, 1, sse2); 2894 H264_QPEL_FUNCS(0, 2, sse2); 2895 H264_QPEL_FUNCS(0, 3, sse2); 2896 H264_QPEL_FUNCS(1, 1, sse2); 2897 H264_QPEL_FUNCS(1, 2, sse2); 2898 H264_QPEL_FUNCS(1, 3, sse2); 2899 H264_QPEL_FUNCS(2, 1, sse2); 2900 H264_QPEL_FUNCS(2, 2, sse2); 2901 H264_QPEL_FUNCS(2, 3, sse2); 2902 H264_QPEL_FUNCS(3, 1, sse2); 2903 H264_QPEL_FUNCS(3, 2, sse2); 2904 H264_QPEL_FUNCS(3, 3, sse2); 2905 2906 if (CONFIG_VP6_DECODER) { 2907 c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2; 2908 } 2909 } 2910#if HAVE_SSSE3 2911 if(mm_flags & FF_MM_SSSE3){ 2912 H264_QPEL_FUNCS(1, 0, ssse3); 2913 H264_QPEL_FUNCS(1, 1, ssse3); 2914 H264_QPEL_FUNCS(1, 2, ssse3); 2915 H264_QPEL_FUNCS(1, 3, ssse3); 2916 H264_QPEL_FUNCS(2, 0, ssse3); 2917 H264_QPEL_FUNCS(2, 1, ssse3); 2918 H264_QPEL_FUNCS(2, 2, ssse3); 2919 H264_QPEL_FUNCS(2, 3, ssse3); 2920 H264_QPEL_FUNCS(3, 0, ssse3); 2921 H264_QPEL_FUNCS(3, 1, ssse3); 2922 H264_QPEL_FUNCS(3, 2, ssse3); 2923 H264_QPEL_FUNCS(3, 3, ssse3); 2924 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_nornd; 2925 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd; 2926 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd; 2927 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3; 2928 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3; 2929 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3; 2930 } 2931#endif 2932 2933#if CONFIG_GPL && HAVE_YASM 2934 if( mm_flags&FF_MM_MMXEXT ){ 2935#if ARCH_X86_32 2936 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; 2937 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; 2938#endif 2939 if( mm_flags&FF_MM_SSE2 ){ 2940#if ARCH_X86_64 || !defined(__ICC) || __ICC > 1100 2941 c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; 2942 c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; 2943 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; 2944 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; 2945#endif 2946 c->h264_idct_add16 = ff_h264_idct_add16_sse2; 2947 c->h264_idct_add8 = ff_h264_idct_add8_sse2; 2948 c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2; 2949 } 2950 } 2951#endif 2952 2953#if CONFIG_SNOW_DECODER 2954 if(mm_flags & FF_MM_SSE2 & 0){ 2955 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; 2956#if HAVE_7REGS 2957 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; 2958#endif 2959 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; 2960 } 2961 else{ 2962 if(mm_flags & FF_MM_MMXEXT){ 2963 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; 2964#if HAVE_7REGS 2965 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; 2966#endif 2967 } 2968 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; 2969 } 2970#endif 2971 2972 if(mm_flags & FF_MM_3DNOW){ 2973 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; 2974 c->vector_fmul = vector_fmul_3dnow; 2975 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 2976 c->float_to_int16 = float_to_int16_3dnow; 2977 c->float_to_int16_interleave = float_to_int16_interleave_3dnow; 2978 } 2979 } 2980 if(mm_flags & FF_MM_3DNOWEXT){ 2981 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; 2982 c->vector_fmul_window = vector_fmul_window_3dnow2; 2983 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 2984 c->float_to_int16_interleave = float_to_int16_interleave_3dn2; 2985 } 2986 } 2987 if(mm_flags & FF_MM_SSE){ 2988 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; 2989 c->ac3_downmix = ac3_downmix_sse; 2990 c->vector_fmul = vector_fmul_sse; 2991 c->vector_fmul_reverse = vector_fmul_reverse_sse; 2992 c->vector_fmul_add_add = vector_fmul_add_add_sse; 2993 c->vector_fmul_window = vector_fmul_window_sse; 2994 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; 2995 c->float_to_int16 = float_to_int16_sse; 2996 c->float_to_int16_interleave = float_to_int16_interleave_sse; 2997 } 2998 if(mm_flags & FF_MM_3DNOW) 2999 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse 3000 if(mm_flags & FF_MM_SSE2){ 3001 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; 3002 c->float_to_int16 = float_to_int16_sse2; 3003 c->float_to_int16_interleave = float_to_int16_interleave_sse2; 3004 c->add_int16 = add_int16_sse2; 3005 c->sub_int16 = sub_int16_sse2; 3006 c->scalarproduct_int16 = scalarproduct_int16_sse2; 3007 } 3008 } 3009 3010 if (CONFIG_ENCODERS) 3011 dsputilenc_init_mmx(c, avctx); 3012 3013#if 0 3014 // for speed testing 3015 get_pixels = just_return; 3016 put_pixels_clamped = just_return; 3017 add_pixels_clamped = just_return; 3018 3019 pix_abs16x16 = just_return; 3020 pix_abs16x16_x2 = just_return; 3021 pix_abs16x16_y2 = just_return; 3022 pix_abs16x16_xy2 = just_return; 3023 3024 put_pixels_tab[0] = just_return; 3025 put_pixels_tab[1] = just_return; 3026 put_pixels_tab[2] = just_return; 3027 put_pixels_tab[3] = just_return; 3028 3029 put_no_rnd_pixels_tab[0] = just_return; 3030 put_no_rnd_pixels_tab[1] = just_return; 3031 put_no_rnd_pixels_tab[2] = just_return; 3032 put_no_rnd_pixels_tab[3] = just_return; 3033 3034 avg_pixels_tab[0] = just_return; 3035 avg_pixels_tab[1] = just_return; 3036 avg_pixels_tab[2] = just_return; 3037 avg_pixels_tab[3] = just_return; 3038 3039 avg_no_rnd_pixels_tab[0] = just_return; 3040 avg_no_rnd_pixels_tab[1] = just_return; 3041 avg_no_rnd_pixels_tab[2] = just_return; 3042 avg_no_rnd_pixels_tab[3] = just_return; 3043 3044 //av_fdct = just_return; 3045 //ff_idct = just_return; 3046#endif 3047} 3048