/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "libavcodec/ac3dec.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"

//#undef NDEBUG
//#include <assert.h>

/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
{0x8000000080000000ULL, 0x8000000080000000ULL};

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1  ) = {0x0001000100010001ULL, 0x0001000100010001ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2  ) = {0x0002000200020002ULL, 0x0002000200020002ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3  ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4  ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8  ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9  ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512) = {0x0200020002000200ULL, 0x0200020002000200ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019)= {0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL};

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0  ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1  ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3  ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4  ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };

#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd "  \n\t"\
    "paddb   %%" #regd ", %%" #regd "  \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else
// for a shared library it is better to generate these constants in registers
// this way rather than loading them from memory
// pcmpeqd -> -1
#define MOVQ_BONE(regd) \
    __asm__ volatile ( \
    "pcmpeqd  %%" #regd ", %%" #regd "  \n\t" \
    "psrlw $15, %%" #regd "             \n\t" \
    "packuswb %%" #regd ", %%" #regd "  \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd "   \n\t" \
    "psrlw $15, %%" #regd "             \n\t" \
    "psllw  $1, %%" #regd "             \n\t" ::)

#endif

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq  " #rega ", " #regr "  \n\t"\
    "pand  " #regb ", " #regr "  \n\t"\
    "pxor  " #rega ", " #regb "  \n\t"\
    "pand  " #regfe ", " #regb " \n\t"\
    "psrlq $1, " #regb "         \n\t"\
    "paddb " #regb ", " #regr "  \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq  " #rega ", " #regr "  \n\t"\
    "por   " #regb ", " #regr "  \n\t"\
    "pxor  " #rega ", " #regb "  \n\t"\
    "pand  " #regfe ", " #regb " \n\t"\
    "psrlq $1, " #regb "         \n\t"\
    "psubb " #regb ", " #regr "  \n\t"
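
/* For reference, the two macros above implement the classic SWAR byte
 * averages without needing a packed add with carry: since
 * a + b == (a ^ b) + 2*(a & b), the truncating average (a + b) >> 1 equals
 * (a & b) + (((a ^ b) & 0xfe) >> 1), and the rounding average
 * (a + b + 1) >> 1 equals (a | b) - (((a ^ b) & 0xfe) >> 1). The 0xfe mask
 * clears the bits that psrlq would otherwise shift across byte boundaries.
 * A scalar sketch of the same identities (illustrative only, not part of
 * the original code): */
#if 0
static uint8_t avg_no_rnd(uint8_t a, uint8_t b) { return (a & b) + (((a ^ b) & 0xfe) >> 1); }
static uint8_t avg_rnd   (uint8_t a, uint8_t b) { return (a | b) - (((a ^ b) & 0xfe) >> 1); }
#endif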
#regb " \n\t"\ 143 "pand %%mm6, " #regd " \n\t"\ 144 "psrlq $1, " #regb " \n\t"\ 145 "psrlq $1, " #regd " \n\t"\ 146 "paddb " #regb ", " #regr " \n\t"\ 147 "paddb " #regd ", " #regp " \n\t" 148 149#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ 150 "movq " #rega ", " #regr " \n\t"\ 151 "movq " #regc ", " #regp " \n\t"\ 152 "por " #regb ", " #regr " \n\t"\ 153 "por " #regd ", " #regp " \n\t"\ 154 "pxor " #rega ", " #regb " \n\t"\ 155 "pxor " #regc ", " #regd " \n\t"\ 156 "pand %%mm6, " #regb " \n\t"\ 157 "pand %%mm6, " #regd " \n\t"\ 158 "psrlq $1, " #regd " \n\t"\ 159 "psrlq $1, " #regb " \n\t"\ 160 "psubb " #regb ", " #regr " \n\t"\ 161 "psubb " #regd ", " #regp " \n\t" 162 163/***********************************/ 164/* MMX no rounding */ 165#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx 166#define SET_RND MOVQ_WONE 167#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) 168#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) 169#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e) 170 171#include "dsputil_mmx_rnd_template.c" 172 173#undef DEF 174#undef SET_RND 175#undef PAVGBP 176#undef PAVGB 177/***********************************/ 178/* MMX rounding */ 179 180#define DEF(x, y) x ## _ ## y ##_mmx 181#define SET_RND MOVQ_WTWO 182#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) 183#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) 184 185#include "dsputil_mmx_rnd_template.c" 186 187#undef DEF 188#undef SET_RND 189#undef PAVGBP 190#undef PAVGB 191#undef OP_AVG 192 193/***********************************/ 194/* 3Dnow specific */ 195 196#define DEF(x) x ## _3dnow 197#define PAVGB "pavgusb" 198#define OP_AVG PAVGB 199 200#include "dsputil_mmx_avg_template.c" 201 202#undef DEF 203#undef PAVGB 204#undef OP_AVG 205 206/***********************************/ 207/* MMX2 specific */ 208 209#define DEF(x) x ## _mmx2 210 211/* Introduced only in MMX2 set */ 212#define PAVGB "pavgb" 213#define OP_AVG PAVGB 214 215#include "dsputil_mmx_avg_template.c" 216 217#undef DEF 218#undef PAVGB 219#undef OP_AVG 220 221#define put_no_rnd_pixels16_mmx put_pixels16_mmx 222#define put_no_rnd_pixels8_mmx put_pixels8_mmx 223#define put_pixels16_mmx2 put_pixels16_mmx 224#define put_pixels8_mmx2 put_pixels8_mmx 225#define put_pixels4_mmx2 put_pixels4_mmx 226#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx 227#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx 228#define put_pixels16_3dnow put_pixels16_mmx 229#define put_pixels8_3dnow put_pixels8_mmx 230#define put_pixels4_3dnow put_pixels4_mmx 231#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx 232#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx 233 234/***********************************/ 235/* standard MMX */ 236 237void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) 238{ 239 const DCTELEM *p; 240 uint8_t *pix; 241 242 /* read the pixels */ 243 p = block; 244 pix = pixels; 245 /* unrolled loop */ 246 __asm__ volatile( 247 "movq %3, %%mm0 \n\t" 248 "movq 8%3, %%mm1 \n\t" 249 "movq 16%3, %%mm2 \n\t" 250 "movq 24%3, %%mm3 \n\t" 251 "movq 32%3, %%mm4 \n\t" 252 "movq 40%3, %%mm5 \n\t" 253 "movq 48%3, %%mm6 \n\t" 254 "movq 56%3, %%mm7 \n\t" 255 "packuswb %%mm1, %%mm0 \n\t" 256 "packuswb %%mm3, %%mm2 \n\t" 257 "packuswb %%mm5, %%mm4 \n\t" 258 "packuswb %%mm7, %%mm6 \n\t" 259 "movq %%mm0, (%0) \n\t" 260 "movq %%mm2, (%0, %1) \n\t" 261 "movq %%mm4, (%0, %1, 2) \n\t" 262 "movq %%mm6, (%0, %2) \n\t" 263 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p) 264 :"memory"); 
    pix += line_size*4;
    p   += 32;

    // if this were an exact copy of the code above, the compiler
    // would generate some very strange code, thus we use "r" here
    __asm__ volatile(
        "movq    (%3), %%mm0        \n\t"
        "movq   8(%3), %%mm1        \n\t"
        "movq  16(%3), %%mm2        \n\t"
        "movq  24(%3), %%mm3        \n\t"
        "movq  32(%3), %%mm4        \n\t"
        "movq  40(%3), %%mm5        \n\t"
        "movq  48(%3), %%mm6        \n\t"
        "movq  56(%3), %%mm7        \n\t"
        "packuswb %%mm1, %%mm0      \n\t"
        "packuswb %%mm3, %%mm2      \n\t"
        "packuswb %%mm5, %%mm4      \n\t"
        "packuswb %%mm7, %%mm6      \n\t"
        "movq    %%mm0, (%0)        \n\t"
        "movq    %%mm2, (%0, %1)    \n\t"
        "movq    %%mm4, (%0, %1, 2) \n\t"
        "movq    %%mm6, (%0, %2)    \n\t"
        ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
        :"memory");
}

#define put_signed_pixels_clamped_mmx_half(off) \
    "movq        "#off"(%2), %%mm1  \n\t"\
    "movq     16+"#off"(%2), %%mm2  \n\t"\
    "movq     32+"#off"(%2), %%mm3  \n\t"\
    "movq     48+"#off"(%2), %%mm4  \n\t"\
    "packsswb  8+"#off"(%2), %%mm1  \n\t"\
    "packsswb 24+"#off"(%2), %%mm2  \n\t"\
    "packsswb 40+"#off"(%2), %%mm3  \n\t"\
    "packsswb 56+"#off"(%2), %%mm4  \n\t"\
    "paddb %%mm0, %%mm1             \n\t"\
    "paddb %%mm0, %%mm2             \n\t"\
    "paddb %%mm0, %%mm3             \n\t"\
    "paddb %%mm0, %%mm4             \n\t"\
    "movq %%mm1, (%0)               \n\t"\
    "movq %%mm2, (%0, %3)           \n\t"\
    "movq %%mm3, (%0, %3, 2)        \n\t"\
    "movq %%mm4, (%0, %1)           \n\t"

void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea (%3, %3, 2), %1            \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea (%0, %3, 4), %0            \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        :"+&r" (pixels), "=&r" (line_skip3)
        :"r" (block), "r"(line_skip)
        :"memory");
}

void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile(
            "movq   (%2), %%mm0     \n\t"
            "movq  8(%2), %%mm1     \n\t"
            "movq 16(%2), %%mm2     \n\t"
            "movq 24(%2), %%mm3     \n\t"
            "movq %0, %%mm4         \n\t"
            "movq %1, %%mm6         \n\t"
            "movq %%mm4, %%mm5      \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm4, %%mm0    \n\t"
            "paddsw %%mm5, %%mm1    \n\t"
            "movq %%mm6, %%mm5      \n\t"
            "punpcklbw %%mm7, %%mm6 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm6, %%mm2    \n\t"
            "paddsw %%mm5, %%mm3    \n\t"
            "packuswb %%mm1, %%mm0  \n\t"
            "packuswb %%mm3, %%mm2  \n\t"
            "movq %%mm0, %0         \n\t"
            "movq %%mm2, %1         \n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p   += 16;
    } while (--i);
}
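
/* A scalar sketch of what ff_add_pixels_clamped_mmx above computes
 * (illustrative only, not part of the original code): each coefficient is
 * added to the corresponding pixel and the sum is clipped to 0..255, which
 * is what the punpck + paddsw + packuswb sequence achieves per 8-byte group.
 * The helper name add_pixels_clamped_ref is hypothetical. */
#if 0
static void add_pixels_clamped_ref(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = pixels[j] + block[j];
            pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        pixels += line_size;
        block  += 8;
    }
}
#endif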
"r"((x86_reg)line_size) 389 : "%"REG_a, "memory" 390 ); 391} 392 393static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) 394{ 395 __asm__ volatile( 396 "lea (%3, %3), %%"REG_a" \n\t" 397 ".p2align 3 \n\t" 398 "1: \n\t" 399 "movq (%1), %%mm0 \n\t" 400 "movq (%1, %3), %%mm1 \n\t" 401 "movq %%mm0, (%2) \n\t" 402 "movq %%mm1, (%2, %3) \n\t" 403 "add %%"REG_a", %1 \n\t" 404 "add %%"REG_a", %2 \n\t" 405 "movq (%1), %%mm0 \n\t" 406 "movq (%1, %3), %%mm1 \n\t" 407 "movq %%mm0, (%2) \n\t" 408 "movq %%mm1, (%2, %3) \n\t" 409 "add %%"REG_a", %1 \n\t" 410 "add %%"REG_a", %2 \n\t" 411 "subl $4, %0 \n\t" 412 "jnz 1b \n\t" 413 : "+g"(h), "+r" (pixels), "+r" (block) 414 : "r"((x86_reg)line_size) 415 : "%"REG_a, "memory" 416 ); 417} 418 419static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) 420{ 421 __asm__ volatile( 422 "lea (%3, %3), %%"REG_a" \n\t" 423 ".p2align 3 \n\t" 424 "1: \n\t" 425 "movq (%1), %%mm0 \n\t" 426 "movq 8(%1), %%mm4 \n\t" 427 "movq (%1, %3), %%mm1 \n\t" 428 "movq 8(%1, %3), %%mm5 \n\t" 429 "movq %%mm0, (%2) \n\t" 430 "movq %%mm4, 8(%2) \n\t" 431 "movq %%mm1, (%2, %3) \n\t" 432 "movq %%mm5, 8(%2, %3) \n\t" 433 "add %%"REG_a", %1 \n\t" 434 "add %%"REG_a", %2 \n\t" 435 "movq (%1), %%mm0 \n\t" 436 "movq 8(%1), %%mm4 \n\t" 437 "movq (%1, %3), %%mm1 \n\t" 438 "movq 8(%1, %3), %%mm5 \n\t" 439 "movq %%mm0, (%2) \n\t" 440 "movq %%mm4, 8(%2) \n\t" 441 "movq %%mm1, (%2, %3) \n\t" 442 "movq %%mm5, 8(%2, %3) \n\t" 443 "add %%"REG_a", %1 \n\t" 444 "add %%"REG_a", %2 \n\t" 445 "subl $4, %0 \n\t" 446 "jnz 1b \n\t" 447 : "+g"(h), "+r" (pixels), "+r" (block) 448 : "r"((x86_reg)line_size) 449 : "%"REG_a, "memory" 450 ); 451} 452 453static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) 454{ 455 __asm__ volatile( 456 "1: \n\t" 457 "movdqu (%1), %%xmm0 \n\t" 458 "movdqu (%1,%3), %%xmm1 \n\t" 459 "movdqu (%1,%3,2), %%xmm2 \n\t" 460 "movdqu (%1,%4), %%xmm3 \n\t" 461 "lea (%1,%3,4), %1 \n\t" 462 "movdqa %%xmm0, (%2) \n\t" 463 "movdqa %%xmm1, (%2,%3) \n\t" 464 "movdqa %%xmm2, (%2,%3,2) \n\t" 465 "movdqa %%xmm3, (%2,%4) \n\t" 466 "subl $4, %0 \n\t" 467 "lea (%2,%3,4), %2 \n\t" 468 "jnz 1b \n\t" 469 : "+g"(h), "+r" (pixels), "+r" (block) 470 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) 471 : "memory" 472 ); 473} 474 475static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) 476{ 477 __asm__ volatile( 478 "1: \n\t" 479 "movdqu (%1), %%xmm0 \n\t" 480 "movdqu (%1,%3), %%xmm1 \n\t" 481 "movdqu (%1,%3,2), %%xmm2 \n\t" 482 "movdqu (%1,%4), %%xmm3 \n\t" 483 "lea (%1,%3,4), %1 \n\t" 484 "pavgb (%2), %%xmm0 \n\t" 485 "pavgb (%2,%3), %%xmm1 \n\t" 486 "pavgb (%2,%3,2), %%xmm2 \n\t" 487 "pavgb (%2,%4), %%xmm3 \n\t" 488 "movdqa %%xmm0, (%2) \n\t" 489 "movdqa %%xmm1, (%2,%3) \n\t" 490 "movdqa %%xmm2, (%2,%3,2) \n\t" 491 "movdqa %%xmm3, (%2,%4) \n\t" 492 "subl $4, %0 \n\t" 493 "lea (%2,%3,4), %2 \n\t" 494 "jnz 1b \n\t" 495 : "+g"(h), "+r" (pixels), "+r" (block) 496 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) 497 : "memory" 498 ); 499} 500 501#define CLEAR_BLOCKS(name,n) \ 502static void name(DCTELEM *blocks)\ 503{\ 504 __asm__ volatile(\ 505 "pxor %%mm7, %%mm7 \n\t"\ 506 "mov %1, %%"REG_a" \n\t"\ 507 "1: \n\t"\ 508 "movq %%mm7, (%0, %%"REG_a") \n\t"\ 509 "movq %%mm7, 8(%0, %%"REG_a") \n\t"\ 510 "movq %%mm7, 16(%0, %%"REG_a") \n\t"\ 511 "movq %%mm7, 24(%0, %%"REG_a") \n\t"\ 512 "add $32, %%"REG_a" \n\t"\ 513 " js 1b \n\t"\ 514 : : "r" (((uint8_t *)blocks)+128*n),\ 
515 "i" (-128*n)\ 516 : "%"REG_a\ 517 );\ 518} 519CLEAR_BLOCKS(clear_blocks_mmx, 6) 520CLEAR_BLOCKS(clear_block_mmx, 1) 521 522static void clear_block_sse(DCTELEM *block) 523{ 524 __asm__ volatile( 525 "xorps %%xmm0, %%xmm0 \n" 526 "movaps %%xmm0, (%0) \n" 527 "movaps %%xmm0, 16(%0) \n" 528 "movaps %%xmm0, 32(%0) \n" 529 "movaps %%xmm0, 48(%0) \n" 530 "movaps %%xmm0, 64(%0) \n" 531 "movaps %%xmm0, 80(%0) \n" 532 "movaps %%xmm0, 96(%0) \n" 533 "movaps %%xmm0, 112(%0) \n" 534 :: "r"(block) 535 : "memory" 536 ); 537} 538 539static void clear_blocks_sse(DCTELEM *blocks) 540{\ 541 __asm__ volatile( 542 "xorps %%xmm0, %%xmm0 \n" 543 "mov %1, %%"REG_a" \n" 544 "1: \n" 545 "movaps %%xmm0, (%0, %%"REG_a") \n" 546 "movaps %%xmm0, 16(%0, %%"REG_a") \n" 547 "movaps %%xmm0, 32(%0, %%"REG_a") \n" 548 "movaps %%xmm0, 48(%0, %%"REG_a") \n" 549 "movaps %%xmm0, 64(%0, %%"REG_a") \n" 550 "movaps %%xmm0, 80(%0, %%"REG_a") \n" 551 "movaps %%xmm0, 96(%0, %%"REG_a") \n" 552 "movaps %%xmm0, 112(%0, %%"REG_a") \n" 553 "add $128, %%"REG_a" \n" 554 " js 1b \n" 555 : : "r" (((uint8_t *)blocks)+128*6), 556 "i" (-128*6) 557 : "%"REG_a 558 ); 559} 560 561static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ 562 x86_reg i=0; 563 __asm__ volatile( 564 "jmp 2f \n\t" 565 "1: \n\t" 566 "movq (%1, %0), %%mm0 \n\t" 567 "movq (%2, %0), %%mm1 \n\t" 568 "paddb %%mm0, %%mm1 \n\t" 569 "movq %%mm1, (%2, %0) \n\t" 570 "movq 8(%1, %0), %%mm0 \n\t" 571 "movq 8(%2, %0), %%mm1 \n\t" 572 "paddb %%mm0, %%mm1 \n\t" 573 "movq %%mm1, 8(%2, %0) \n\t" 574 "add $16, %0 \n\t" 575 "2: \n\t" 576 "cmp %3, %0 \n\t" 577 " js 1b \n\t" 578 : "+r" (i) 579 : "r"(src), "r"(dst), "r"((x86_reg)w-15) 580 ); 581 for(; i<w; i++) 582 dst[i+0] += src[i+0]; 583} 584 585static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ 586 x86_reg i=0; 587 __asm__ volatile( 588 "jmp 2f \n\t" 589 "1: \n\t" 590 "movq (%2, %0), %%mm0 \n\t" 591 "movq 8(%2, %0), %%mm1 \n\t" 592 "paddb (%3, %0), %%mm0 \n\t" 593 "paddb 8(%3, %0), %%mm1 \n\t" 594 "movq %%mm0, (%1, %0) \n\t" 595 "movq %%mm1, 8(%1, %0) \n\t" 596 "add $16, %0 \n\t" 597 "2: \n\t" 598 "cmp %4, %0 \n\t" 599 " js 1b \n\t" 600 : "+r" (i) 601 : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15) 602 ); 603 for(; i<w; i++) 604 dst[i] = src1[i] + src2[i]; 605} 606 607#if HAVE_7REGS 608static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) { 609 x86_reg w2 = -w; 610 x86_reg x; 611 int l = *left & 0xff; 612 int tl = *left_top & 0xff; 613 int t; 614 __asm__ volatile( 615 "mov %7, %3 \n" 616 "1: \n" 617 "movzbl (%3,%4), %2 \n" 618 "mov %2, %k3 \n" 619 "sub %b1, %b3 \n" 620 "add %b0, %b3 \n" 621 "mov %2, %1 \n" 622 "cmp %0, %2 \n" 623 "cmovg %0, %2 \n" 624 "cmovg %1, %0 \n" 625 "cmp %k3, %0 \n" 626 "cmovg %k3, %0 \n" 627 "mov %7, %3 \n" 628 "cmp %2, %0 \n" 629 "cmovl %2, %0 \n" 630 "add (%6,%4), %b0 \n" 631 "mov %b0, (%5,%4) \n" 632 "inc %4 \n" 633 "jl 1b \n" 634 :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2) 635 :"r"(dst+w), "r"(diff+w), "rm"(top+w) 636 ); 637 *left = l; 638 *left_top = tl; 639} 640#endif 641 642#define H263_LOOP_FILTER \ 643 "pxor %%mm7, %%mm7 \n\t"\ 644 "movq %0, %%mm0 \n\t"\ 645 "movq %0, %%mm1 \n\t"\ 646 "movq %3, %%mm2 \n\t"\ 647 "movq %3, %%mm3 \n\t"\ 648 "punpcklbw %%mm7, %%mm0 \n\t"\ 649 "punpckhbw %%mm7, %%mm1 \n\t"\ 650 "punpcklbw %%mm7, %%mm2 \n\t"\ 651 "punpckhbw %%mm7, %%mm3 \n\t"\ 652 "psubw %%mm2, %%mm0 \n\t"\ 653 "psubw %%mm3, %%mm1 \n\t"\ 654 "movq %1, %%mm2 \n\t"\ 655 "movq %1, %%mm3 
\n\t"\ 656 "movq %2, %%mm4 \n\t"\ 657 "movq %2, %%mm5 \n\t"\ 658 "punpcklbw %%mm7, %%mm2 \n\t"\ 659 "punpckhbw %%mm7, %%mm3 \n\t"\ 660 "punpcklbw %%mm7, %%mm4 \n\t"\ 661 "punpckhbw %%mm7, %%mm5 \n\t"\ 662 "psubw %%mm2, %%mm4 \n\t"\ 663 "psubw %%mm3, %%mm5 \n\t"\ 664 "psllw $2, %%mm4 \n\t"\ 665 "psllw $2, %%mm5 \n\t"\ 666 "paddw %%mm0, %%mm4 \n\t"\ 667 "paddw %%mm1, %%mm5 \n\t"\ 668 "pxor %%mm6, %%mm6 \n\t"\ 669 "pcmpgtw %%mm4, %%mm6 \n\t"\ 670 "pcmpgtw %%mm5, %%mm7 \n\t"\ 671 "pxor %%mm6, %%mm4 \n\t"\ 672 "pxor %%mm7, %%mm5 \n\t"\ 673 "psubw %%mm6, %%mm4 \n\t"\ 674 "psubw %%mm7, %%mm5 \n\t"\ 675 "psrlw $3, %%mm4 \n\t"\ 676 "psrlw $3, %%mm5 \n\t"\ 677 "packuswb %%mm5, %%mm4 \n\t"\ 678 "packsswb %%mm7, %%mm6 \n\t"\ 679 "pxor %%mm7, %%mm7 \n\t"\ 680 "movd %4, %%mm2 \n\t"\ 681 "punpcklbw %%mm2, %%mm2 \n\t"\ 682 "punpcklbw %%mm2, %%mm2 \n\t"\ 683 "punpcklbw %%mm2, %%mm2 \n\t"\ 684 "psubusb %%mm4, %%mm2 \n\t"\ 685 "movq %%mm2, %%mm3 \n\t"\ 686 "psubusb %%mm4, %%mm3 \n\t"\ 687 "psubb %%mm3, %%mm2 \n\t"\ 688 "movq %1, %%mm3 \n\t"\ 689 "movq %2, %%mm4 \n\t"\ 690 "pxor %%mm6, %%mm3 \n\t"\ 691 "pxor %%mm6, %%mm4 \n\t"\ 692 "paddusb %%mm2, %%mm3 \n\t"\ 693 "psubusb %%mm2, %%mm4 \n\t"\ 694 "pxor %%mm6, %%mm3 \n\t"\ 695 "pxor %%mm6, %%mm4 \n\t"\ 696 "paddusb %%mm2, %%mm2 \n\t"\ 697 "packsswb %%mm1, %%mm0 \n\t"\ 698 "pcmpgtb %%mm0, %%mm7 \n\t"\ 699 "pxor %%mm7, %%mm0 \n\t"\ 700 "psubb %%mm7, %%mm0 \n\t"\ 701 "movq %%mm0, %%mm1 \n\t"\ 702 "psubusb %%mm2, %%mm0 \n\t"\ 703 "psubb %%mm0, %%mm1 \n\t"\ 704 "pand %5, %%mm1 \n\t"\ 705 "psrlw $2, %%mm1 \n\t"\ 706 "pxor %%mm7, %%mm1 \n\t"\ 707 "psubb %%mm7, %%mm1 \n\t"\ 708 "movq %0, %%mm5 \n\t"\ 709 "movq %3, %%mm6 \n\t"\ 710 "psubb %%mm1, %%mm5 \n\t"\ 711 "paddb %%mm1, %%mm6 \n\t" 712 713static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){ 714 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { 715 const int strength= ff_h263_loop_filter_strength[qscale]; 716 717 __asm__ volatile( 718 719 H263_LOOP_FILTER 720 721 "movq %%mm3, %1 \n\t" 722 "movq %%mm4, %2 \n\t" 723 "movq %%mm5, %0 \n\t" 724 "movq %%mm6, %3 \n\t" 725 : "+m" (*(uint64_t*)(src - 2*stride)), 726 "+m" (*(uint64_t*)(src - 1*stride)), 727 "+m" (*(uint64_t*)(src + 0*stride)), 728 "+m" (*(uint64_t*)(src + 1*stride)) 729 : "g" (2*strength), "m"(ff_pb_FC) 730 ); 731 } 732} 733 734static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){ 735 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { 736 const int strength= ff_h263_loop_filter_strength[qscale]; 737 DECLARE_ALIGNED(8, uint64_t, temp)[4]; 738 uint8_t *btemp= (uint8_t*)temp; 739 740 src -= 2; 741 742 transpose4x4(btemp , src , 8, stride); 743 transpose4x4(btemp+4, src + 4*stride, 8, stride); 744 __asm__ volatile( 745 H263_LOOP_FILTER // 5 3 4 6 746 747 : "+m" (temp[0]), 748 "+m" (temp[1]), 749 "+m" (temp[2]), 750 "+m" (temp[3]) 751 : "g" (2*strength), "m"(ff_pb_FC) 752 ); 753 754 __asm__ volatile( 755 "movq %%mm5, %%mm1 \n\t" 756 "movq %%mm4, %%mm0 \n\t" 757 "punpcklbw %%mm3, %%mm5 \n\t" 758 "punpcklbw %%mm6, %%mm4 \n\t" 759 "punpckhbw %%mm3, %%mm1 \n\t" 760 "punpckhbw %%mm6, %%mm0 \n\t" 761 "movq %%mm5, %%mm3 \n\t" 762 "movq %%mm1, %%mm6 \n\t" 763 "punpcklwd %%mm4, %%mm5 \n\t" 764 "punpcklwd %%mm0, %%mm1 \n\t" 765 "punpckhwd %%mm4, %%mm3 \n\t" 766 "punpckhwd %%mm0, %%mm6 \n\t" 767 "movd %%mm5, (%0) \n\t" 768 "punpckhdq %%mm5, %%mm5 \n\t" 769 "movd %%mm5, (%0,%2) \n\t" 770 "movd %%mm3, (%0,%2,2) \n\t" 771 "punpckhdq %%mm3, %%mm3 \n\t" 772 "movd %%mm3, (%0,%3) \n\t" 773 "movd %%mm1, (%1) \n\t" 774 "punpckhdq %%mm1, %%mm1 
\n\t" 775 "movd %%mm1, (%1,%2) \n\t" 776 "movd %%mm6, (%1,%2,2) \n\t" 777 "punpckhdq %%mm6, %%mm6 \n\t" 778 "movd %%mm6, (%1,%3) \n\t" 779 :: "r" (src), 780 "r" (src + 4*stride), 781 "r" ((x86_reg) stride ), 782 "r" ((x86_reg)(3*stride)) 783 ); 784 } 785} 786 787/* draw the edges of width 'w' of an image of size width, height 788 this mmx version can only handle w==8 || w==16 */ 789static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides) 790{ 791 uint8_t *ptr, *last_line; 792 int i; 793 794 last_line = buf + (height - 1) * wrap; 795 /* left and right */ 796 ptr = buf; 797 if(w==8) 798 { 799 __asm__ volatile( 800 "1: \n\t" 801 "movd (%0), %%mm0 \n\t" 802 "punpcklbw %%mm0, %%mm0 \n\t" 803 "punpcklwd %%mm0, %%mm0 \n\t" 804 "punpckldq %%mm0, %%mm0 \n\t" 805 "movq %%mm0, -8(%0) \n\t" 806 "movq -8(%0, %2), %%mm1 \n\t" 807 "punpckhbw %%mm1, %%mm1 \n\t" 808 "punpckhwd %%mm1, %%mm1 \n\t" 809 "punpckhdq %%mm1, %%mm1 \n\t" 810 "movq %%mm1, (%0, %2) \n\t" 811 "add %1, %0 \n\t" 812 "cmp %3, %0 \n\t" 813 " jb 1b \n\t" 814 : "+r" (ptr) 815 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) 816 ); 817 } 818 else 819 { 820 __asm__ volatile( 821 "1: \n\t" 822 "movd (%0), %%mm0 \n\t" 823 "punpcklbw %%mm0, %%mm0 \n\t" 824 "punpcklwd %%mm0, %%mm0 \n\t" 825 "punpckldq %%mm0, %%mm0 \n\t" 826 "movq %%mm0, -8(%0) \n\t" 827 "movq %%mm0, -16(%0) \n\t" 828 "movq -8(%0, %2), %%mm1 \n\t" 829 "punpckhbw %%mm1, %%mm1 \n\t" 830 "punpckhwd %%mm1, %%mm1 \n\t" 831 "punpckhdq %%mm1, %%mm1 \n\t" 832 "movq %%mm1, (%0, %2) \n\t" 833 "movq %%mm1, 8(%0, %2) \n\t" 834 "add %1, %0 \n\t" 835 "cmp %3, %0 \n\t" 836 " jb 1b \n\t" 837 : "+r" (ptr) 838 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) 839 ); 840 } 841 842 /* top and bottom (and hopefully also the corners) */ 843 if (sides&EDGE_TOP) { 844 for(i = 0; i < h; i += 4) { 845 ptr= buf - (i + 1) * wrap - w; 846 __asm__ volatile( 847 "1: \n\t" 848 "movq (%1, %0), %%mm0 \n\t" 849 "movq %%mm0, (%0) \n\t" 850 "movq %%mm0, (%0, %2) \n\t" 851 "movq %%mm0, (%0, %2, 2) \n\t" 852 "movq %%mm0, (%0, %3) \n\t" 853 "add $8, %0 \n\t" 854 "cmp %4, %0 \n\t" 855 " jb 1b \n\t" 856 : "+r" (ptr) 857 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w) 858 ); 859 } 860 } 861 862 if (sides&EDGE_BOTTOM) { 863 for(i = 0; i < w; i += 4) { 864 ptr= last_line + (i + 1) * wrap - w; 865 __asm__ volatile( 866 "1: \n\t" 867 "movq (%1, %0), %%mm0 \n\t" 868 "movq %%mm0, (%0) \n\t" 869 "movq %%mm0, (%0, %2) \n\t" 870 "movq %%mm0, (%0, %2, 2) \n\t" 871 "movq %%mm0, (%0, %3) \n\t" 872 "add $8, %0 \n\t" 873 "cmp %4, %0 \n\t" 874 " jb 1b \n\t" 875 : "+r" (ptr) 876 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w) 877 ); 878 } 879 } 880} 881 882#define PAETH(cpu, abs3)\ 883static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\ 884{\ 885 x86_reg i = -bpp;\ 886 x86_reg end = w-3;\ 887 __asm__ volatile(\ 888 "pxor %%mm7, %%mm7 \n"\ 889 "movd (%1,%0), %%mm0 \n"\ 890 "movd (%2,%0), %%mm1 \n"\ 891 "punpcklbw %%mm7, %%mm0 \n"\ 892 "punpcklbw %%mm7, %%mm1 \n"\ 893 "add %4, %0 \n"\ 894 "1: \n"\ 895 "movq %%mm1, %%mm2 \n"\ 896 "movd (%2,%0), %%mm1 \n"\ 897 "movq %%mm2, %%mm3 \n"\ 898 "punpcklbw %%mm7, %%mm1 \n"\ 899 "movq %%mm2, %%mm4 \n"\ 900 "psubw %%mm1, %%mm3 \n"\ 901 "psubw %%mm0, %%mm4 \n"\ 902 "movq %%mm3, %%mm5 \n"\ 903 "paddw %%mm4, %%mm5 \n"\ 904 abs3\ 905 "movq %%mm4, %%mm6 \n"\ 
906 "pminsw %%mm5, %%mm6 \n"\ 907 "pcmpgtw %%mm6, %%mm3 \n"\ 908 "pcmpgtw %%mm5, %%mm4 \n"\ 909 "movq %%mm4, %%mm6 \n"\ 910 "pand %%mm3, %%mm4 \n"\ 911 "pandn %%mm3, %%mm6 \n"\ 912 "pandn %%mm0, %%mm3 \n"\ 913 "movd (%3,%0), %%mm0 \n"\ 914 "pand %%mm1, %%mm6 \n"\ 915 "pand %%mm4, %%mm2 \n"\ 916 "punpcklbw %%mm7, %%mm0 \n"\ 917 "movq %6, %%mm5 \n"\ 918 "paddw %%mm6, %%mm0 \n"\ 919 "paddw %%mm2, %%mm3 \n"\ 920 "paddw %%mm3, %%mm0 \n"\ 921 "pand %%mm5, %%mm0 \n"\ 922 "movq %%mm0, %%mm3 \n"\ 923 "packuswb %%mm3, %%mm3 \n"\ 924 "movd %%mm3, (%1,%0) \n"\ 925 "add %4, %0 \n"\ 926 "cmp %5, %0 \n"\ 927 "jle 1b \n"\ 928 :"+r"(i)\ 929 :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\ 930 "m"(ff_pw_255)\ 931 :"memory"\ 932 );\ 933} 934 935#define ABS3_MMX2\ 936 "psubw %%mm5, %%mm7 \n"\ 937 "pmaxsw %%mm7, %%mm5 \n"\ 938 "pxor %%mm6, %%mm6 \n"\ 939 "pxor %%mm7, %%mm7 \n"\ 940 "psubw %%mm3, %%mm6 \n"\ 941 "psubw %%mm4, %%mm7 \n"\ 942 "pmaxsw %%mm6, %%mm3 \n"\ 943 "pmaxsw %%mm7, %%mm4 \n"\ 944 "pxor %%mm7, %%mm7 \n" 945 946#define ABS3_SSSE3\ 947 "pabsw %%mm3, %%mm3 \n"\ 948 "pabsw %%mm4, %%mm4 \n"\ 949 "pabsw %%mm5, %%mm5 \n" 950 951PAETH(mmx2, ABS3_MMX2) 952#if HAVE_SSSE3 953PAETH(ssse3, ABS3_SSSE3) 954#endif 955 956#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ 957 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ 958 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ 959 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ 960 "movq "#in7", " #m3 " \n\t" /* d */\ 961 "movq "#in0", %%mm5 \n\t" /* D */\ 962 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\ 963 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\ 964 "movq "#in1", %%mm5 \n\t" /* C */\ 965 "movq "#in2", %%mm6 \n\t" /* B */\ 966 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\ 967 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ 968 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ 969 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ 970 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ 971 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\ 972 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ 973 "psraw $5, %%mm5 \n\t"\ 974 "packuswb %%mm5, %%mm5 \n\t"\ 975 OP(%%mm5, out, %%mm7, d) 976 977#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\ 978static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 979 uint64_t temp;\ 980\ 981 __asm__ volatile(\ 982 "pxor %%mm7, %%mm7 \n\t"\ 983 "1: \n\t"\ 984 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ 985 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ 986 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ 987 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ 988 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ 989 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ 990 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ 991 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ 992 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ 993 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ 994 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ 995 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ 996 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ 997 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ 998 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ 999 "paddw %%mm3, %%mm5 \n\t" /* b */\ 1000 "paddw %%mm2, %%mm6 \n\t" /* c */\ 1001 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ 1002 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ 1003 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ 1004 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ 1005 "paddw %%mm4, %%mm0 \n\t" /* a */\ 1006 "paddw %%mm1, %%mm5 \n\t" /* d */\ 1007 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ 1008 "psubw %%mm5, 
%%mm0 \n\t" /* 20a - d */\ 1009 "paddw %6, %%mm6 \n\t"\ 1010 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ 1011 "psraw $5, %%mm0 \n\t"\ 1012 "movq %%mm0, %5 \n\t"\ 1013 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ 1014 \ 1015 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ 1016 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\ 1017 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\ 1018 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\ 1019 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\ 1020 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\ 1021 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\ 1022 "paddw %%mm0, %%mm2 \n\t" /* b */\ 1023 "paddw %%mm5, %%mm3 \n\t" /* c */\ 1024 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ 1025 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ 1026 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\ 1027 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ 1028 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ 1029 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ 1030 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ 1031 "paddw %%mm2, %%mm1 \n\t" /* a */\ 1032 "paddw %%mm6, %%mm4 \n\t" /* d */\ 1033 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ 1034 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ 1035 "paddw %6, %%mm1 \n\t"\ 1036 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ 1037 "psraw $5, %%mm3 \n\t"\ 1038 "movq %5, %%mm1 \n\t"\ 1039 "packuswb %%mm3, %%mm1 \n\t"\ 1040 OP_MMX2(%%mm1, (%1),%%mm4, q)\ 1041 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ 1042 \ 1043 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ 1044 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ 1045 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ 1046 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\ 1047 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\ 1048 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\ 1049 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\ 1050 "paddw %%mm1, %%mm5 \n\t" /* b */\ 1051 "paddw %%mm4, %%mm0 \n\t" /* c */\ 1052 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ 1053 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ 1054 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ 1055 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ 1056 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ 1057 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ 1058 "paddw %%mm3, %%mm2 \n\t" /* d */\ 1059 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ 1060 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\ 1061 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ 1062 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ 1063 "paddw %%mm2, %%mm6 \n\t" /* a */\ 1064 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ 1065 "paddw %6, %%mm0 \n\t"\ 1066 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ 1067 "psraw $5, %%mm0 \n\t"\ 1068 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ 1069 \ 1070 "paddw %%mm5, %%mm3 \n\t" /* a */\ 1071 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\ 1072 "paddw %%mm4, %%mm6 \n\t" /* b */\ 1073 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\ 1074 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\ 1075 "paddw %%mm1, %%mm4 \n\t" /* c */\ 1076 "paddw %%mm2, %%mm5 \n\t" /* d */\ 1077 "paddw %%mm6, %%mm6 \n\t" /* 2b */\ 1078 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ 1079 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ 1080 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ 1081 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ 1082 "paddw %6, %%mm4 \n\t"\ 1083 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ 1084 "psraw $5, %%mm4 \n\t"\ 1085 "packuswb %%mm4, %%mm0 \n\t"\ 1086 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ 1087 \ 1088 "add %3, %0 \n\t"\ 1089 "add %4, %1 \n\t"\ 1090 "decl %2 \n\t"\ 1091 " jnz 1b \n\t"\ 1092 : "+a"(src), "+c"(dst), 
"+D"(h)\ 1093 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ 1094 : "memory"\ 1095 );\ 1096}\ 1097\ 1098static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1099 int i;\ 1100 int16_t temp[16];\ 1101 /* quick HACK, XXX FIXME MUST be optimized */\ 1102 for(i=0; i<h; i++)\ 1103 {\ 1104 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ 1105 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ 1106 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ 1107 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ 1108 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ 1109 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\ 1110 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\ 1111 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\ 1112 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\ 1113 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\ 1114 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\ 1115 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\ 1116 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\ 1117 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\ 1118 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\ 1119 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\ 1120 __asm__ volatile(\ 1121 "movq (%0), %%mm0 \n\t"\ 1122 "movq 8(%0), %%mm1 \n\t"\ 1123 "paddw %2, %%mm0 \n\t"\ 1124 "paddw %2, %%mm1 \n\t"\ 1125 "psraw $5, %%mm0 \n\t"\ 1126 "psraw $5, %%mm1 \n\t"\ 1127 "packuswb %%mm1, %%mm0 \n\t"\ 1128 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ 1129 "movq 16(%0), %%mm0 \n\t"\ 1130 "movq 24(%0), %%mm1 \n\t"\ 1131 "paddw %2, %%mm0 \n\t"\ 1132 "paddw %2, %%mm1 \n\t"\ 1133 "psraw $5, %%mm0 \n\t"\ 1134 "psraw $5, %%mm1 \n\t"\ 1135 "packuswb %%mm1, %%mm0 \n\t"\ 1136 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\ 1137 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ 1138 : "memory"\ 1139 );\ 1140 dst+=dstStride;\ 1141 src+=srcStride;\ 1142 }\ 1143}\ 1144\ 1145static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1146 __asm__ volatile(\ 1147 "pxor %%mm7, %%mm7 \n\t"\ 1148 "1: \n\t"\ 1149 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ 1150 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ 1151 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ 1152 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ 1153 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ 1154 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ 1155 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ 1156 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ 1157 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ 1158 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ 1159 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ 1160 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ 1161 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ 1162 "punpckhbw %%mm7, %%mm3 \n\t" 
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %5, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm2               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3               \n\t" /* c */\
        "paddw %%mm5, %%mm4               \n\t" /* d */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %5, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "packuswb %%mm3, %%mm0            \n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
        \
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+d"(h)\
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        __asm__ volatile(\
            "movq (%0), %%mm0           \n\t"\
            "movq 8(%0), %%mm1          \n\t"\
            "paddw %2, %%mm0            \n\t"\
            "paddw %2, %%mm1            \n\t"\
            "psraw $5, %%mm0            \n\t"\
            "psraw $5, %%mm1            \n\t"\
            "packuswb %%mm1, %%mm0      \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            :"memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}
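
/* QPEL_BASE above generates the horizontal lowpass filters; QPEL_OP below
 * generates, for one CPU flavor (MMX2 or 3DNow!), the vertical lowpass
 * filters plus the complete set of qpel8/qpel16 mcXY motion-compensation
 * functions that combine the two passes. */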

#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
    /*FIXME unroll */\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "1:                         \n\t"\
        "movq (%0), %%mm0           \n\t"\
        "movq (%0), %%mm1           \n\t"\
        "movq 8(%0), %%mm2          \n\t"\
        "movq 8(%0), %%mm3          \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpckhbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "movq %%mm0, (%1)           \n\t"\
        "movq %%mm1, 17*8(%1)       \n\t"\
        "movq %%mm2, 2*17*8(%1)     \n\t"\
        "movq %%mm3, 3*17*8(%1)     \n\t"\
        "add $8, %1                 \n\t"\
        "add %3, %0                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((x86_reg)srcStride)\
        : "memory"\
    );\
\
    temp_ptr= temp;\
    count=4;\
\
/*FIXME reorder for speed */\
    __asm__ volatile(\
        /*"pxor %%mm7, %%mm7      \n\t"*/\
        "1:                         \n\t"\
        "movq (%0), %%mm0           \n\t"\
        "movq 8(%0), %%mm1          \n\t"\
        "movq 16(%0), %%mm2         \n\t"\
        "movq 24(%0), %%mm3         \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
        "add %4, %1                 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
        \
        "add $136, %0               \n\t"\
        "add %6, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
        :"memory"\
    );\
}\
\
\n\t"\ 1334 "movq %%mm1, 9*8(%1) \n\t"\ 1335 "add $8, %1 \n\t"\ 1336 "add %3, %0 \n\t"\ 1337 "decl %2 \n\t"\ 1338 " jnz 1b \n\t"\ 1339 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ 1340 : "r" ((x86_reg)srcStride)\ 1341 : "memory"\ 1342 );\ 1343 \ 1344 temp_ptr= temp;\ 1345 count=2;\ 1346 \ 1347/*FIXME reorder for speed */\ 1348 __asm__ volatile(\ 1349 /*"pxor %%mm7, %%mm7 \n\t"*/\ 1350 "1: \n\t"\ 1351 "movq (%0), %%mm0 \n\t"\ 1352 "movq 8(%0), %%mm1 \n\t"\ 1353 "movq 16(%0), %%mm2 \n\t"\ 1354 "movq 24(%0), %%mm3 \n\t"\ 1355 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ 1356 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ 1357 "add %4, %1 \n\t"\ 1358 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ 1359 \ 1360 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ 1361 "add %4, %1 \n\t"\ 1362 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ 1363 \ 1364 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ 1365 "add %4, %1 \n\t"\ 1366 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ 1367 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ 1368 \ 1369 "add $72, %0 \n\t"\ 1370 "add %6, %1 \n\t"\ 1371 "decl %2 \n\t"\ 1372 " jnz 1b \n\t"\ 1373 \ 1374 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ 1375 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\ 1376 : "memory"\ 1377 );\ 1378}\ 1379\ 1380static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ 1381 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\ 1382}\ 1383\ 1384static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1385 uint64_t temp[8];\ 1386 uint8_t * const half= (uint8_t*)temp;\ 1387 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ 1388 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ 1389}\ 1390\ 1391static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1392 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ 1393}\ 1394\ 1395static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1396 uint64_t temp[8];\ 1397 uint8_t * const half= (uint8_t*)temp;\ 1398 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ 1399 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\ 1400}\ 1401\ 1402static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1403 uint64_t temp[8];\ 1404 uint8_t * const half= (uint8_t*)temp;\ 1405 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ 1406 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ 1407}\ 1408\ 1409static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1410 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ 1411}\ 1412\ 1413static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1414 uint64_t temp[8];\ 1415 uint8_t * const half= (uint8_t*)temp;\ 1416 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ 1417 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\ 1418}\ 1419static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t 
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
}\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}

#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"

#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"

#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
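
/* Instantiate the qpel code. The ROUNDER constant feeds the psraw $5 stage:
 * ff_pw_16 gives round-to-nearest, (x + 16) >> 5, for the normal variants,
 * while ff_pw_15 gives (x + 15) >> 5 (halves rounded down) for the no-rnd
 * variants. */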
QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)

/***********************************/
/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */

#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
}
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
}

#define QPEL_2TAP(OPNAME, SIZE, MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
}\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
}\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,        1,       0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,       -1,       0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,        stride,  0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,  -stride,  0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,        stride,  1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,        stride, -1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,  -stride,  1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\

QPEL_2TAP(put_, 16, mmx2)
QPEL_2TAP(avg_, 16, mmx2)
QPEL_2TAP(put_,  8, mmx2)
QPEL_2TAP(avg_,  8, mmx2)
QPEL_2TAP(put_, 16, 3dnow)
QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_,  8, 3dnow)
QPEL_2TAP(avg_,  8, 3dnow)

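/* edge emulation: when a motion vector points (partly) outside the
 * picture, the border pixels are replicated into a temporary buffer and
 * the MC is done from there */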
#if HAVE_YASM
typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
                                 x86_reg linesize, x86_reg start_y,
                                 x86_reg end_y, x86_reg block_h,
                                 x86_reg start_x, x86_reg end_x,
                                 x86_reg block_w);
extern emu_edge_core_func ff_emu_edge_core_mmx;
extern emu_edge_core_func ff_emu_edge_core_sse;

static av_always_inline
void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
                      int block_w, int block_h,
                      int src_x, int src_y, int w, int h,
                      emu_edge_core_func *core_fn)
{
    int start_y, start_x, end_y, end_x, src_y_add=0;

    if(src_y>= h){
        src_y_add = h-1-src_y;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src_y_add = 1-block_h-src_y;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);
    assert(start_x < end_x && block_w > 0);
    assert(start_y < end_y && block_h > 0);

    // fill in the to-be-copied part plus all above/below
    src += (src_y_add+start_y)*linesize + start_x;
    buf += start_x;
    core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
}

#if ARCH_X86_32
static av_noinline
void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
                          int block_w, int block_h,
                          int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_mmx);
}
#endif
static av_noinline
void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
                          int block_w, int block_h,
                          int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_sse);
}
#endif /* HAVE_YASM */

typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src,
                                    int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h);

static av_always_inline
void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
         int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
         emulated_edge_mc_func *emu_edge_fn)
{
    const int w = 8;
    const int ix = ox>>(16+shift);
    const int iy = oy>>(16+shift);
    const int oxs = ox>>4;
    const int oys = oy>>4;
    const int dxxs = dxx>>4;
    const int dxys = dxy>>4;
    const int dyxs = dyx>>4;
    const int dyys = dyy>>4;
    const uint16_t r4[4] = {r,r,r,r};
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
    const uint64_t shift2 = 2*shift;
    uint8_t edge_buf[(h+1)*stride];
    int x, y;

    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
    const int dxh = dxy*(h-1);
    const int dyw = dyx*(w-1);
    if( // non-constant fullpel offset (3% of blocks)
        ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
         (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
       || (dxx|dxy|dyx|dyy)&15 )
    {
        //FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }

    src += ix + iy*stride;
    if( (unsigned)ix >= width-w ||
        (unsigned)iy >= height-h )
    {
        emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
        src = edge_buf;
    }

    __asm__ volatile(
        "movd %0, %%mm6 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r"(1<<shift)
    );

    for(x=0; x<w; x+=4){
        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
                            oxs - dxys + dxxs*(x+1),
                            oxs - dxys + dxxs*(x+2),
                            oxs - dxys + dxxs*(x+3) };
        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
                            oys - dyys + dyxs*(x+1),
                            oys - dyys + dyxs*(x+2),
                            oys - dyys + dyxs*(x+3) };

        for(y=0; y<h; y++){
            __asm__ volatile(
                "movq %0, %%mm4 \n\t"
                "movq %1, %%mm5 \n\t"
                "paddw %2, %%mm4 \n\t"
                "paddw %3, %%mm5 \n\t"
                "movq %%mm4, %0 \n\t"
                "movq %%mm5, %1 \n\t"
                "psrlw $12, %%mm4 \n\t"
                "psrlw $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );
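
            /* mm4/mm5 now hold the fractional x/y offsets for four output
             * pixels; the block below turns them into the four bilinear
             * weights and blends the 2x2 source neighbourhood accordingly */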
            __asm__ volatile(
                "movq %%mm6, %%mm2 \n\t"
                "movq %%mm6, %%mm1 \n\t"
                "psubw %%mm4, %%mm2 \n\t"
                "psubw %%mm5, %%mm1 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)

                "movd %4, %%mm5 \n\t"
                "movd %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy

                "movd %2, %%mm5 \n\t"
                "movd %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
                "paddw %5, %%mm1 \n\t"
                "paddw %%mm3, %%mm2 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm2, %%mm0 \n\t"

                "psrlw %6, %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd %%mm0, %0 \n\t"

                : "=m"(dst[x+y*stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride+1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4-h*stride;
    }
}

#if HAVE_YASM
#if ARCH_X86_32
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_mmx);
}
#endif
static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_sse);
}
#else
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#endif

#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    do{\
        __asm__ volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }while(--h);\
}
PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH

#include "h264_qpel_mmx.c"

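/* H.264 chroma MC prototypes; the implementations are in external
 * assembly (_rnd = rounding variant) */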
void ff_put_h264_chroma_mc8_mmx_rnd   (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_mmx2_rnd  (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_mmx       (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmx2      (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc2_mmx2      (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmx2      (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
    (uint8_t *dst, uint8_t *src,\
     int stride, int h, int x, int y);

CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)

/* CAVS specific */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_mmx(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_mmx(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* VC-1 specific */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    avg_pixels8_mmx2(dst, src, stride, 8);
}

/* XXX: these functions should be removed as soon as all IDCTs are
   converted */
#if CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

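/* vorbis channel coupling: turn each (mag, ang) pair back into two
 * magnitudes, using the sign relations annotated in the asm below */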
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    __asm__ volatile("pxor %%mm7, %%mm7":);
    for(i=0; i<blocksize; i+=2) {
        __asm__ volatile(
            "movq %0, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
            "pslld $31, %%mm2 \n\t" // keep only the sign bit
            "pxor %%mm2, %%mm1 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "pand %%mm1, %%mm3 \n\t"
            "pandn %%mm1, %%mm4 \n\t"
            "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movq %%mm3, %1 \n\t"
            "movq %%mm0, %0 \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
    __asm__ volatile("femms");
}
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    __asm__ volatile(
        "movaps %0, %%xmm5 \n\t"
        ::"m"(ff_pdw_80000000[0])
    );
    for(i=0; i<blocksize; i+=4) {
        __asm__ volatile(
            "movaps %0, %%xmm0 \n\t"
            "movaps %1, %%xmm1 \n\t"
            "xorps %%xmm2, %%xmm2 \n\t"
            "xorps %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps %%xmm2, %%xmm1 \n\t"
            "movaps %%xmm3, %%xmm4 \n\t"
            "andps %%xmm1, %%xmm3 \n\t"
            "andnps %%xmm1, %%xmm4 \n\t"
            "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movaps %%xmm3, %1 \n\t"
            "movaps %%xmm0, %0 \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
}

#define IF1(x) x
#define IF0(x)

#define MIX5(mono,stereo)\
    __asm__ volatile(\
        "movss 0(%2), %%xmm5 \n"\
        "movss 8(%2), %%xmm6 \n"\
        "movss 24(%2), %%xmm7 \n"\
        "shufps $0, %%xmm5, %%xmm5 \n"\
        "shufps $0, %%xmm6, %%xmm6 \n"\
        "shufps $0, %%xmm7, %%xmm7 \n"\
        "1: \n"\
        "movaps (%0,%1), %%xmm0 \n"\
        "movaps 0x400(%0,%1), %%xmm1 \n"\
        "movaps 0x800(%0,%1), %%xmm2 \n"\
        "movaps 0xc00(%0,%1), %%xmm3 \n"\
        "movaps 0x1000(%0,%1), %%xmm4 \n"\
        "mulps %%xmm5, %%xmm0 \n"\
        "mulps %%xmm6, %%xmm1 \n"\
        "mulps %%xmm5, %%xmm2 \n"\
        "mulps %%xmm7, %%xmm3 \n"\
        "mulps %%xmm7, %%xmm4 \n"\
        stereo("addps %%xmm1, %%xmm0 \n")\
        "addps %%xmm1, %%xmm2 \n"\
        "addps %%xmm3, %%xmm0 \n"\
        "addps %%xmm4, %%xmm2 \n"\
        mono("addps %%xmm2, %%xmm0 \n")\
        "movaps %%xmm0, (%0,%1) \n"\
        stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
        "add $16, %0 \n"\
        "jl 1b \n"\
        :"+&r"(i)\
        :"r"(samples[0]+len), "r"(matrix)\
        :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                      "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
         "memory"\
    );

#define MIX_MISC(stereo)\
    __asm__ volatile(\
        "1: \n"\
        "movaps (%3,%0), %%xmm0 \n"\
        stereo("movaps %%xmm0, %%xmm1 \n")\
        "mulps %%xmm4, %%xmm0 \n"\
        stereo("mulps %%xmm5, %%xmm1 \n")\
        "lea 1024(%3,%0), %1 \n"\
        "mov %5, %2 \n"\
        "2: \n"\
        "movaps (%1), %%xmm2 \n"\
        stereo("movaps %%xmm2, %%xmm3 \n")\
        "mulps (%4,%2), %%xmm2 \n"\
        stereo("mulps 16(%4,%2), %%xmm3 \n")\
        "addps %%xmm2, %%xmm0 \n"\
        stereo("addps %%xmm3, %%xmm1 \n")\
        "add $1024, %1 \n"\
        "add $32, %2 \n"\
        "jl 2b \n"\
        "movaps %%xmm0, (%3,%0) \n"\
        stereo("movaps %%xmm1, 1024(%3,%0) \n")\
        "add $16, %0 \n"\
        "jl 1b \n"\
        :"+&r"(i), "=&r"(j), "=&r"(k)\
        :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
        :"memory"\
    );

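/* AC-3 downmix. MIX5 is a fast path for the common 5-channel -> stereo/mono
 * coefficient patterns; the bit-exact integer compares on matrix_cmp below
 * detect those patterns. MIX_MISC handles arbitrary matrices. */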
static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
{
    int (*matrix_cmp)[2] = (int(*)[2])matrix;
    intptr_t i,j,k;

    i = -len*sizeof(float);
    if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
        MIX5(IF0,IF1);
    } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
        MIX5(IF1,IF0);
    } else {
        DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
        j = 2*in_ch*sizeof(float);
        __asm__ volatile(
            "1: \n"
            "sub $8, %0 \n"
            "movss (%2,%0), %%xmm4 \n"
            "movss 4(%2,%0), %%xmm5 \n"
            "shufps $0, %%xmm4, %%xmm4 \n"
            "shufps $0, %%xmm5, %%xmm5 \n"
            "movaps %%xmm4, (%1,%0,4) \n"
            "movaps %%xmm5, 16(%1,%0,4) \n"
            "jg 1b \n"
            :"+&r"(j)
            :"r"(matrix_simd), "r"(matrix)
            :"memory"
        );
        if(out_ch == 2) {
            MIX_MISC(IF1);
        } else {
            MIX_MISC(IF0);
        }
    }
}

static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
    x86_reg i = (len-4)*4;
    __asm__ volatile(
        "1: \n\t"
        "movq (%2,%0), %%mm0 \n\t"
        "movq 8(%2,%0), %%mm1 \n\t"
        "pfmul (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "movq %%mm0, (%1,%0) \n\t"
        "movq %%mm1, 8(%1,%0) \n\t"
        "sub $16, %0 \n\t"
        "jge 1b \n\t"
        "femms \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src0), "r"(src1)
        :"memory"
    );
}
static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
    x86_reg i = (len-8)*4;
    __asm__ volatile(
        "1: \n\t"
        "movaps (%2,%0), %%xmm0 \n\t"
        "movaps 16(%2,%0), %%xmm1 \n\t"
        "mulps (%3,%0), %%xmm0 \n\t"
        "mulps 16(%3,%0), %%xmm1 \n\t"
        "movaps %%xmm0, (%1,%0) \n\t"
        "movaps %%xmm1, 16(%1,%0) \n\t"
        "sub $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src0), "r"(src1)
        :"memory"
    );
}

static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    x86_reg i = len*4-16;
    __asm__ volatile(
        "1: \n\t"
        "pswapd 8(%1), %%mm0 \n\t"
        "pswapd (%1), %%mm1 \n\t"
        "pfmul (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "movq %%mm0, (%2,%0) \n\t"
        "movq %%mm1, 8(%2,%0) \n\t"
        "add $16, %1 \n\t"
        "sub $16, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
    __asm__ volatile("femms");
}
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
    x86_reg i = len*4-32;
    __asm__ volatile(
        "1: \n\t"
        "movaps 16(%1), %%xmm0 \n\t"
        "movaps (%1), %%xmm1 \n\t"
        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        "mulps (%3,%0), %%xmm0 \n\t"
        "mulps 16(%3,%0), %%xmm1 \n\t"
        "movaps %%xmm0, (%2,%0) \n\t"
        "movaps %%xmm1, 16(%2,%0) \n\t"
        "add $32, %1 \n\t"
        "sub $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
}

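/* dst[i] = src0[i]*src1[i] + src2[i] */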
static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
                                  const float *src2, int len){
    x86_reg i = (len-4)*4;
    __asm__ volatile(
        "1: \n\t"
        "movq (%2,%0), %%mm0 \n\t"
        "movq 8(%2,%0), %%mm1 \n\t"
        "pfmul (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "pfadd (%4,%0), %%mm0 \n\t"
        "pfadd 8(%4,%0), %%mm1 \n\t"
        "movq %%mm0, (%1,%0) \n\t"
        "movq %%mm1, 8(%1,%0) \n\t"
        "sub $16, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
        :"memory"
    );
    __asm__ volatile("femms");
}
static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
                                const float *src2, int len){
    x86_reg i = (len-8)*4;
    __asm__ volatile(
        "1: \n\t"
        "movaps (%2,%0), %%xmm0 \n\t"
        "movaps 16(%2,%0), %%xmm1 \n\t"
        "mulps (%3,%0), %%xmm0 \n\t"
        "mulps 16(%3,%0), %%xmm1 \n\t"
        "addps (%4,%0), %%xmm0 \n\t"
        "addps 16(%4,%0), %%xmm1 \n\t"
        "movaps %%xmm0, (%1,%0) \n\t"
        "movaps %%xmm1, 16(%1,%0) \n\t"
        "sub $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
        :"memory"
    );
}

#if HAVE_6REGS
static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
                                      const float *win, int len){
    x86_reg i = -len*4;
    x86_reg j = len*4-8;
    __asm__ volatile(
        "1: \n"
        "pswapd (%5,%1), %%mm1 \n"
        "movq (%5,%0), %%mm0 \n"
        "pswapd (%4,%1), %%mm5 \n"
        "movq (%3,%0), %%mm4 \n"
        "movq %%mm0, %%mm2 \n"
        "movq %%mm1, %%mm3 \n"
        "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
        "pfmul %%mm5, %%mm3 \n" // src1[    j]*win[len+j]
        "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
        "pfmul %%mm5, %%mm0 \n" // src1[    j]*win[len+i]
        "pfadd %%mm3, %%mm2 \n"
        "pfsub %%mm0, %%mm1 \n"
        "pswapd %%mm2, %%mm2 \n"
        "movq %%mm1, (%2,%0) \n"
        "movq %%mm2, (%2,%1) \n"
        "sub $8, %1 \n"
        "add $8, %0 \n"
        "jl 1b \n"
        "femms \n"
        :"+r"(i), "+r"(j)
        :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
    );
}

static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
                                   const float *win, int len){
    x86_reg i = -len*4;
    x86_reg j = len*4-16;
    __asm__ volatile(
        "1: \n"
        "movaps (%5,%1), %%xmm1 \n"
        "movaps (%5,%0), %%xmm0 \n"
        "movaps (%4,%1), %%xmm5 \n"
        "movaps (%3,%0), %%xmm4 \n"
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "shufps $0x1b, %%xmm5, %%xmm5 \n"
        "movaps %%xmm0, %%xmm2 \n"
        "movaps %%xmm1, %%xmm3 \n"
        "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
        "mulps %%xmm5, %%xmm3 \n" // src1[    j]*win[len+j]
        "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
        "mulps %%xmm5, %%xmm0 \n" // src1[    j]*win[len+i]
        "addps %%xmm3, %%xmm2 \n"
        "subps %%xmm0, %%xmm1 \n"
        "shufps $0x1b, %%xmm2, %%xmm2 \n"
        "movaps %%xmm1, (%2,%0) \n"
        "movaps %%xmm2, (%2,%1) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        :"+r"(i), "+r"(j)
        :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
    );
}
#endif /* HAVE_6REGS */

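/* clip each float in src into [min,max]; 16 floats per loop iteration */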
static void vector_clipf_sse(float *dst, const float *src, float min, float max,
                             int len)
{
    x86_reg i = (len-16)*4;
    __asm__ volatile(
        "movss %3, %%xmm4 \n"
        "movss %4, %%xmm5 \n"
        "shufps $0, %%xmm4, %%xmm4 \n"
        "shufps $0, %%xmm5, %%xmm5 \n"
        "1: \n\t"
        "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel
        "movaps 16(%2,%0), %%xmm1 \n\t"
        "movaps 32(%2,%0), %%xmm2 \n\t"
        "movaps 48(%2,%0), %%xmm3 \n\t"
        "maxps %%xmm4, %%xmm0 \n\t"
        "maxps %%xmm4, %%xmm1 \n\t"
        "maxps %%xmm4, %%xmm2 \n\t"
        "maxps %%xmm4, %%xmm3 \n\t"
        "minps %%xmm5, %%xmm0 \n\t"
        "minps %%xmm5, %%xmm1 \n\t"
        "minps %%xmm5, %%xmm2 \n\t"
        "minps %%xmm5, %%xmm3 \n\t"
        "movaps %%xmm0, (%1,%0) \n\t"
        "movaps %%xmm1, 16(%1,%0) \n\t"
        "movaps %%xmm2, 32(%1,%0) \n\t"
        "movaps %%xmm3, 48(%1,%0) \n\t"
        "sub $64, %0 \n\t"
        "jge 1b \n\t"
        :"+&r"(i)
        :"r"(dst), "r"(src), "m"(min), "m"(max)
        :"memory"
    );
}

void ff_vp3_idct_mmx(int16_t *input_data);
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);

void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);

void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);

void ff_vp3_idct_sse2(int16_t *input_data);
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);

int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);

void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);

float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);

extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
                                                const float *src1, int len);
extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
                                                const float *src1, int len);

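/* runtime dispatch: fill the DSPContext function pointers according to
 * the detected CPU flags (optionally overridden via avctx->dsp_mask) */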
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
    const int bit_depth = avctx->bits_per_raw_sample;

    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
            mm_flags |=  (avctx->dsp_mask & 0xffff);
        else
            mm_flags &= ~(avctx->dsp_mask & 0xffff);
    }

#if 0
    av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
    if (mm_flags & AV_CPU_FLAG_MMX)
        av_log(avctx, AV_LOG_INFO, " mmx");
    if (mm_flags & AV_CPU_FLAG_MMX2)
        av_log(avctx, AV_LOG_INFO, " mmx2");
    if (mm_flags & AV_CPU_FLAG_3DNOW)
        av_log(avctx, AV_LOG_INFO, " 3dnow");
    if (mm_flags & AV_CPU_FLAG_SSE)
        av_log(avctx, AV_LOG_INFO, " sse");
    if (mm_flags & AV_CPU_FLAG_SSE2)
        av_log(avctx, AV_LOG_INFO, " sse2");
    av_log(avctx, AV_LOG_INFO, "\n");
#endif

    if (mm_flags & AV_CPU_FLAG_MMX) {
        const int idct_algo= avctx->idct_algo;

        if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
            if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
                c->idct_put= ff_simple_idct_put_mmx;
                c->idct_add= ff_simple_idct_add_mmx;
                c->idct    = ff_simple_idct_mmx;
                c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
#if CONFIG_GPL
            }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
                if(mm_flags & AV_CPU_FLAG_MMX2){
                    c->idct_put= ff_libmpeg2mmx2_idct_put;
                    c->idct_add= ff_libmpeg2mmx2_idct_add;
                    c->idct    = ff_mmxext_idct;
                }else{
                    c->idct_put= ff_libmpeg2mmx_idct_put;
                    c->idct_add= ff_libmpeg2mmx_idct_add;
                    c->idct    = ff_mmx_idct;
                }
                c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
#endif
            }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
                     idct_algo==FF_IDCT_VP3 && HAVE_YASM){
                if(mm_flags & AV_CPU_FLAG_SSE2){
                    c->idct_put= ff_vp3_idct_put_sse2;
                    c->idct_add= ff_vp3_idct_add_sse2;
                    c->idct    = ff_vp3_idct_sse2;
                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
                }else{
                    c->idct_put= ff_vp3_idct_put_mmx;
                    c->idct_add= ff_vp3_idct_add_mmx;
                    c->idct    = ff_vp3_idct_mmx;
                    c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
                }
            }else if(idct_algo==FF_IDCT_CAVS){
                c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
            }else if(idct_algo==FF_IDCT_XVIDMMX){
                if(mm_flags & AV_CPU_FLAG_SSE2){
                    c->idct_put= ff_idct_xvid_sse2_put;
                    c->idct_add= ff_idct_xvid_sse2_add;
                    c->idct    = ff_idct_xvid_sse2;
                    c->idct_permutation_type= FF_SSE2_IDCT_PERM;
                }else if(mm_flags & AV_CPU_FLAG_MMX2){
                    c->idct_put= ff_idct_xvid_mmx2_put;
                    c->idct_add= ff_idct_xvid_mmx2_add;
                    c->idct    = ff_idct_xvid_mmx2;
                }else{
                    c->idct_put= ff_idct_xvid_mmx_put;
                    c->idct_add= ff_idct_xvid_mmx_add;
                    c->idct    = ff_idct_xvid_mmx;
                }
            }
        }

        c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
        c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
        if (!high_bit_depth) {
            c->clear_block  = clear_block_mmx;
            c->clear_blocks = clear_blocks_mmx;
            if ((mm_flags & AV_CPU_FLAG_SSE) &&
                !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
                /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
                c->clear_block  = clear_block_sse;
                c->clear_blocks = clear_blocks_sse;
            }
        }

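/* fill one row of the 4-entry hpel table; e.g. SET_HPEL_FUNCS(put, 0, 16, mmx)
 * expands to
 *     c->put_pixels_tab[0][0] = put_pixels16_mmx;
 *     c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
 *     c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
 *     c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx; */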
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _    ## CPU; \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU

        if (!high_bit_depth) {
            SET_HPEL_FUNCS(put, 0, 16, mmx);
            SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
            SET_HPEL_FUNCS(avg, 0, 16, mmx);
            SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
            SET_HPEL_FUNCS(put, 1, 8, mmx);
            SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
            SET_HPEL_FUNCS(avg, 1, 8, mmx);
            SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
        }

#if ARCH_X86_32 || !HAVE_YASM
        c->gmc= gmc_mmx;
#endif
#if ARCH_X86_32 && HAVE_YASM
        if (!high_bit_depth)
            c->emulated_edge_mc = emulated_edge_mc_mmx;
#endif

        c->add_bytes= add_bytes_mmx;
        c->add_bytes_l2= add_bytes_l2_mmx;

        if (!high_bit_depth)
            c->draw_edges = draw_edges_mmx;

        if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
            c->h263_v_loop_filter= h263_v_loop_filter_mmx;
            c->h263_h_loop_filter= h263_h_loop_filter_mmx;
        }

#if HAVE_YASM
        if (!high_bit_depth && CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
            c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
        }

        c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif

        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->prefetch = prefetch_mmx2;

            if (!high_bit_depth) {
                c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
                c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;

                c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
                c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
                c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;

                c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
                c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;

                c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
                c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
                c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
            }

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                if (!high_bit_depth) {
                    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
                    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
                    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
                    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
                    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
                    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
                }

                if (CONFIG_VP3_DECODER && HAVE_YASM) {
                    c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
                    c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
                }
            }
            if (CONFIG_VP3_DECODER && HAVE_YASM) {
                c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
            }

            if (CONFIG_VP3_DECODER
                && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
            }

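/* fill one 16-entry qpel table; e.g. SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, )
 * expands to
 *     c->put_qpel_pixels_tab[0][ 0] = put_qpel16_mc00_mmx2;
 *     ...
 *     c->put_qpel_pixels_tab[0][15] = put_qpel16_mc33_mmx2; */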
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU

            SET_QPEL_FUNCS(put_qpel,        0, 16, mmx2, );
            SET_QPEL_FUNCS(put_qpel,        1,  8, mmx2, );
            SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(avg_qpel,        0, 16, mmx2, );
            SET_QPEL_FUNCS(avg_qpel,        1,  8, mmx2, );

            if (!high_bit_depth) {
                SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
                SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmx2, );
                SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmx2, );
                SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
                SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmx2, );
                SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmx2, );
            }
            else if (bit_depth == 10) {
#if HAVE_YASM
#if !ARCH_X86_64
                SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
                SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
                SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
                SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
#endif
                SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 10_mmxext, ff_);
                SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 10_mmxext, ff_);
#endif
            }

            SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, mmx2, );

#if HAVE_YASM
            if (!high_bit_depth && CONFIG_H264CHROMA) {
                c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
                c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
                c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
                c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
            }
            if (bit_depth == 10 && CONFIG_H264CHROMA) {
                c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext;
                c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext;
                c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_mmxext;
                c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_10_mmxext;
            }

            c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
#endif
#if HAVE_7REGS
            if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
                c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif

            c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
        } else if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
            c->prefetch = prefetch_3dnow;

            if (!high_bit_depth) {
                c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
                c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

                c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
                c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
                c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

                c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
                c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

                c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
                c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
                c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

                if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
                    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
                    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
                    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
                    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
                    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
                }
            }

            if (CONFIG_VP3_DECODER
                && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
            }

            SET_QPEL_FUNCS(put_qpel,        0, 16, 3dnow, );
            SET_QPEL_FUNCS(put_qpel,        1,  8, 3dnow, );
            SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(avg_qpel,        0, 16, 3dnow, );
            SET_QPEL_FUNCS(avg_qpel,        1,  8, 3dnow, );

            if (!high_bit_depth) {
                SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
                SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 3dnow, );
                SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 3dnow, );
                SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
                SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 3dnow, );
                SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 3dnow, );
            }

            SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, 3dnow, );

#if HAVE_YASM
            if (!high_bit_depth && CONFIG_H264CHROMA) {
                c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
                c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
            }

#endif
        }


#define H264_QPEL_FUNCS(x, y, CPU)\
    c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
    c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
    c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
    c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
        if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
            // these functions are slower than mmx on AMD, but faster on Intel
            if (!high_bit_depth) {
                c->put_pixels_tab[0][0]        = put_pixels16_sse2;
                c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
                c->avg_pixels_tab[0][0]        = avg_pixels16_sse2;
                H264_QPEL_FUNCS(0, 0, sse2);
            }
        }
        if(mm_flags & AV_CPU_FLAG_SSE2){
            if (!high_bit_depth) {
                H264_QPEL_FUNCS(0, 1, sse2);
                H264_QPEL_FUNCS(0, 2, sse2);
                H264_QPEL_FUNCS(0, 3, sse2);
                H264_QPEL_FUNCS(1, 1, sse2);
                H264_QPEL_FUNCS(1, 2, sse2);
                H264_QPEL_FUNCS(1, 3, sse2);
                H264_QPEL_FUNCS(2, 1, sse2);
                H264_QPEL_FUNCS(2, 2, sse2);
                H264_QPEL_FUNCS(2, 3, sse2);
                H264_QPEL_FUNCS(3, 1, sse2);
                H264_QPEL_FUNCS(3, 2, sse2);
                H264_QPEL_FUNCS(3, 3, sse2);
            }
#if HAVE_YASM
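/* same idea for the ff_-prefixed 10-bit assembly versions, e.g.
 * H264_QPEL_FUNCS_10(1, 0, sse2_cache64) installs
 * ff_put_h264_qpel16_mc10_10_sse2_cache64 and friends */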
#define H264_QPEL_FUNCS_10(x, y, CPU)\
    c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
    c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
    c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
    c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
            if (bit_depth == 10) {
                SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
                SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
                SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
                SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
                H264_QPEL_FUNCS_10(1, 0, sse2_cache64)
                H264_QPEL_FUNCS_10(2, 0, sse2_cache64)
                H264_QPEL_FUNCS_10(3, 0, sse2_cache64)

                if (CONFIG_H264CHROMA) {
                    c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
                    c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
                }
            }
#endif
        }
#if HAVE_SSSE3
        if(mm_flags & AV_CPU_FLAG_SSSE3){
            if (!high_bit_depth) {
                H264_QPEL_FUNCS(1, 0, ssse3);
                H264_QPEL_FUNCS(1, 1, ssse3);
                H264_QPEL_FUNCS(1, 2, ssse3);
                H264_QPEL_FUNCS(1, 3, ssse3);
                H264_QPEL_FUNCS(2, 0, ssse3);
                H264_QPEL_FUNCS(2, 1, ssse3);
                H264_QPEL_FUNCS(2, 2, ssse3);
                H264_QPEL_FUNCS(2, 3, ssse3);
                H264_QPEL_FUNCS(3, 0, ssse3);
                H264_QPEL_FUNCS(3, 1, ssse3);
                H264_QPEL_FUNCS(3, 2, ssse3);
                H264_QPEL_FUNCS(3, 3, ssse3);
            }
#if HAVE_YASM
            else if (bit_depth == 10) {
                H264_QPEL_FUNCS_10(1, 0, ssse3_cache64)
                H264_QPEL_FUNCS_10(2, 0, ssse3_cache64)
                H264_QPEL_FUNCS_10(3, 0, ssse3_cache64)
            }
#endif
            c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
#if HAVE_YASM
            if (!high_bit_depth && CONFIG_H264CHROMA) {
                c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
                c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
                c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
                c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
            }
            c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
            if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
                c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
#endif
        }
#endif

        if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
            c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
            c->vector_fmul = vector_fmul_3dnow;
        }
        if (HAVE_AMD3DNOWEXT && (mm_flags & AV_CPU_FLAG_3DNOWEXT)) {
            c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
#if HAVE_6REGS
            c->vector_fmul_window = vector_fmul_window_3dnow2;
#endif
        }
        if(mm_flags & AV_CPU_FLAG_MMX2){
#if HAVE_YASM
            c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
            if (avctx->flags & CODEC_FLAG_BITEXACT) {
                c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
            } else {
                c->apply_window_int16 = ff_apply_window_int16_mmxext;
            }
#endif
        }
        if(mm_flags & AV_CPU_FLAG_SSE){
            c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
            c->ac3_downmix = ac3_downmix_sse;
            c->vector_fmul = vector_fmul_sse;
            c->vector_fmul_reverse = vector_fmul_reverse_sse;
            c->vector_fmul_add = vector_fmul_add_sse;
#if HAVE_6REGS
            c->vector_fmul_window = vector_fmul_window_sse;
#endif
            c->vector_clipf = vector_clipf_sse;
#if HAVE_YASM
            c->scalarproduct_float          = ff_scalarproduct_float_sse;
            c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;

            if (!high_bit_depth)
                c->emulated_edge_mc = emulated_edge_mc_sse;
            c->gmc = gmc_sse;
#endif
        }
        if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
            c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
        if(mm_flags & AV_CPU_FLAG_SSE2){
#if HAVE_YASM
            c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
            if (mm_flags & AV_CPU_FLAG_ATOM) {
                c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
            } else {
                c->vector_clip_int32 = ff_vector_clip_int32_sse2;
            }
            if (avctx->flags & CODEC_FLAG_BITEXACT) {
                c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
            } else {
                if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
                    c->apply_window_int16 = ff_apply_window_int16_sse2;
                }
            }
#endif
        }
        if (mm_flags & AV_CPU_FLAG_SSSE3) {
#if HAVE_YASM
            if (mm_flags & AV_CPU_FLAG_ATOM) {
                c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
            } else {
                c->apply_window_int16 = ff_apply_window_int16_ssse3;
            }
            if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
                c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
            }
#endif
        }

        if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
#if HAVE_YASM
            c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif
        }

#if HAVE_AVX && HAVE_YASM
        if (mm_flags & AV_CPU_FLAG_AVX) {
            if (bit_depth == 10) {
                // AVX implies !cache64.
                // TODO: Port cache(32|64) detection from x264.
                H264_QPEL_FUNCS_10(1, 0, sse2)
                H264_QPEL_FUNCS_10(2, 0, sse2)
                H264_QPEL_FUNCS_10(3, 0, sse2)

                if (CONFIG_H264CHROMA) {
                    c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
                    c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
                }
            }
            c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
        }
#endif
    }

    if (CONFIG_ENCODERS)
        dsputilenc_init_mmx(c, avctx);
}