1/* 2 * MMX optimized motion estimation 3 * Copyright (c) 2001 Fabrice Bellard 4 * Copyright (c) 2002-2004 Michael Niedermayer 5 * 6 * mostly by Michael Niedermayer <michaelni@gmx.at> 7 * 8 * This file is part of FFmpeg. 9 * 10 * FFmpeg is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU Lesser General Public 12 * License as published by the Free Software Foundation; either 13 * version 2.1 of the License, or (at your option) any later version. 14 * 15 * FFmpeg is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * Lesser General Public License for more details. 19 * 20 * You should have received a copy of the GNU Lesser General Public 21 * License along with FFmpeg; if not, write to the Free Software 22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 23 */ 24 25#include "libavutil/attributes.h" 26#include "libavutil/avassert.h" 27#include "libavutil/mem.h" 28#include "libavutil/x86/asm.h" 29#include "libavutil/x86/cpu.h" 30#include "libavcodec/mpegvideo.h" 31#include "dsputil_x86.h" 32 33#if HAVE_INLINE_ASM 34 35DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = { 36 0x0000000000000000ULL, 37 0x0001000100010001ULL, 38 0x0002000200020002ULL, 39}; 40 41DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL; 42 43static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) 44{ 45 x86_reg len = -(x86_reg)stride * h; 46 __asm__ volatile ( 47 ".p2align 4 \n\t" 48 "1: \n\t" 49 "movq (%1, %%"REG_a"), %%mm0 \n\t" 50 "movq (%2, %%"REG_a"), %%mm2 \n\t" 51 "movq (%2, %%"REG_a"), %%mm4 \n\t" 52 "add %3, %%"REG_a" \n\t" 53 "psubusb %%mm0, %%mm2 \n\t" 54 "psubusb %%mm4, %%mm0 \n\t" 55 "movq (%1, %%"REG_a"), %%mm1 \n\t" 56 "movq (%2, %%"REG_a"), %%mm3 \n\t" 57 "movq (%2, %%"REG_a"), %%mm5 \n\t" 58 "psubusb %%mm1, %%mm3 \n\t" 59 "psubusb %%mm5, %%mm1 \n\t" 60 "por %%mm2, %%mm0 \n\t" 61 "por %%mm1, %%mm3 \n\t" 62 "movq %%mm0, %%mm1 \n\t" 63 "movq %%mm3, %%mm2 \n\t" 64 "punpcklbw %%mm7, %%mm0 \n\t" 65 "punpckhbw %%mm7, %%mm1 \n\t" 66 "punpcklbw %%mm7, %%mm3 \n\t" 67 "punpckhbw %%mm7, %%mm2 \n\t" 68 "paddw %%mm1, %%mm0 \n\t" 69 "paddw %%mm3, %%mm2 \n\t" 70 "paddw %%mm2, %%mm0 \n\t" 71 "paddw %%mm0, %%mm6 \n\t" 72 "add %3, %%"REG_a" \n\t" 73 " js 1b \n\t" 74 : "+a" (len) 75 : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride)); 76} 77 78static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2, 79 int stride, int h) 80{ 81 __asm__ volatile ( 82 ".p2align 4 \n\t" 83 "1: \n\t" 84 "movq (%1), %%mm0 \n\t" 85 "movq (%1, %3), %%mm1 \n\t" 86 "psadbw (%2), %%mm0 \n\t" 87 "psadbw (%2, %3), %%mm1 \n\t" 88 "paddw %%mm0, %%mm6 \n\t" 89 "paddw %%mm1, %%mm6 \n\t" 90 "lea (%1,%3,2), %1 \n\t" 91 "lea (%2,%3,2), %2 \n\t" 92 "sub $2, %0 \n\t" 93 " jg 1b \n\t" 94 : "+r" (h), "+r" (blk1), "+r" (blk2) 95 : "r" ((x86_reg) stride)); 96} 97 98static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1, 99 int stride, int h) 100{ 101 int ret; 102 __asm__ volatile ( 103 "pxor %%xmm2, %%xmm2 \n\t" 104 ".p2align 4 \n\t" 105 "1: \n\t" 106 "movdqu (%1), %%xmm0 \n\t" 107 "movdqu (%1, %4), %%xmm1 \n\t" 108 "psadbw (%2), %%xmm0 \n\t" 109 "psadbw (%2, %4), %%xmm1 \n\t" 110 "paddw %%xmm0, %%xmm2 \n\t" 111 "paddw %%xmm1, %%xmm2 \n\t" 112 "lea (%1,%4,2), %1 \n\t" 113 "lea (%2,%4,2), %2 \n\t" 114 "sub $2, %0 \n\t" 115 " jg 1b \n\t" 116 "movhlps %%xmm2, %%xmm0 \n\t" 117 "paddw %%xmm0, %%xmm2 \n\t" 118 "movd %%xmm2, %3 \n\t" 119 : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret) 120 : "r" ((x86_reg) stride)); 121 return ret; 122} 123 124static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2, 125 int stride, int h) 126{ 127 __asm__ volatile ( 128 ".p2align 4 \n\t" 129 "1: \n\t" 130 "movq (%1), %%mm0 \n\t" 131 "movq (%1, %3), %%mm1 \n\t" 132 "pavgb 1(%1), %%mm0 \n\t" 133 "pavgb 1(%1, %3), %%mm1 \n\t" 134 "psadbw (%2), %%mm0 \n\t" 135 "psadbw (%2, %3), %%mm1 \n\t" 136 "paddw %%mm0, %%mm6 \n\t" 137 "paddw %%mm1, %%mm6 \n\t" 138 "lea (%1,%3,2), %1 \n\t" 139 "lea (%2,%3,2), %2 \n\t" 140 "sub $2, %0 \n\t" 141 " jg 1b \n\t" 142 : "+r" (h), "+r" (blk1), "+r" (blk2) 143 : "r" ((x86_reg) stride)); 144} 145 146static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2, 147 int stride, int h) 148{ 149 __asm__ volatile ( 150 "movq (%1), %%mm0 \n\t" 151 "add %3, %1 \n\t" 152 ".p2align 4 \n\t" 153 "1: \n\t" 154 "movq (%1), %%mm1 \n\t" 155 "movq (%1, %3), %%mm2 \n\t" 156 "pavgb %%mm1, %%mm0 \n\t" 157 "pavgb %%mm2, %%mm1 \n\t" 158 "psadbw (%2), %%mm0 \n\t" 159 "psadbw (%2, %3), %%mm1 \n\t" 160 "paddw %%mm0, %%mm6 \n\t" 161 "paddw %%mm1, %%mm6 \n\t" 162 "movq %%mm2, %%mm0 \n\t" 163 "lea (%1,%3,2), %1 \n\t" 164 "lea (%2,%3,2), %2 \n\t" 165 "sub $2, %0 \n\t" 166 " jg 1b \n\t" 167 : "+r" (h), "+r" (blk1), "+r" (blk2) 168 : "r" ((x86_reg) stride)); 169} 170 171static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2, 172 int stride, int h) 173{ 174 __asm__ volatile ( 175 "movq "MANGLE(bone)", %%mm5 \n\t" 176 "movq (%1), %%mm0 \n\t" 177 "pavgb 1(%1), %%mm0 \n\t" 178 "add %3, %1 \n\t" 179 ".p2align 4 \n\t" 180 "1: \n\t" 181 "movq (%1), %%mm1 \n\t" 182 "movq (%1,%3), %%mm2 \n\t" 183 "pavgb 1(%1), %%mm1 \n\t" 184 "pavgb 1(%1,%3), %%mm2 \n\t" 185 "psubusb %%mm5, %%mm1 \n\t" 186 "pavgb %%mm1, %%mm0 \n\t" 187 "pavgb %%mm2, %%mm1 \n\t" 188 "psadbw (%2), %%mm0 \n\t" 189 "psadbw (%2,%3), %%mm1 \n\t" 190 "paddw %%mm0, %%mm6 \n\t" 191 "paddw %%mm1, %%mm6 \n\t" 192 "movq %%mm2, %%mm0 \n\t" 193 "lea (%1,%3,2), %1 \n\t" 194 "lea (%2,%3,2), %2 \n\t" 195 "sub $2, %0 \n\t" 196 " jg 1b \n\t" 197 : "+r" (h), "+r" (blk1), "+r" (blk2) 198 : "r" ((x86_reg) stride) 199 NAMED_CONSTRAINTS_ADD(bone)); 200} 201 202static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, 203 int stride, int h) 204{ 205 x86_reg len = -(x86_reg)stride * h; 206 __asm__ volatile ( 207 ".p2align 4 \n\t" 208 "1: \n\t" 209 "movq (%1, %%"REG_a"), %%mm0 \n\t" 210 "movq (%2, %%"REG_a"), %%mm1 \n\t" 211 "movq (%1, %%"REG_a"), %%mm2 \n\t" 212 "movq (%2, %%"REG_a"), %%mm3 \n\t" 213 "punpcklbw %%mm7, %%mm0 \n\t" 214 "punpcklbw %%mm7, %%mm1 \n\t" 215 "punpckhbw %%mm7, %%mm2 \n\t" 216 "punpckhbw %%mm7, %%mm3 \n\t" 217 "paddw %%mm0, %%mm1 \n\t" 218 "paddw %%mm2, %%mm3 \n\t" 219 "movq (%3, %%"REG_a"), %%mm4 \n\t" 220 "movq (%3, %%"REG_a"), %%mm2 \n\t" 221 "paddw %%mm5, %%mm1 \n\t" 222 "paddw %%mm5, %%mm3 \n\t" 223 "psrlw $1, %%mm1 \n\t" 224 "psrlw $1, %%mm3 \n\t" 225 "packuswb %%mm3, %%mm1 \n\t" 226 "psubusb %%mm1, %%mm4 \n\t" 227 "psubusb %%mm2, %%mm1 \n\t" 228 "por %%mm4, %%mm1 \n\t" 229 "movq %%mm1, %%mm0 \n\t" 230 "punpcklbw %%mm7, %%mm0 \n\t" 231 "punpckhbw %%mm7, %%mm1 \n\t" 232 "paddw %%mm1, %%mm0 \n\t" 233 "paddw %%mm0, %%mm6 \n\t" 234 "add %4, %%"REG_a" \n\t" 235 " js 1b \n\t" 236 : "+a" (len) 237 : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), 238 "r" ((x86_reg) stride)); 239} 240 241static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) 242{ 243 x86_reg len = -(x86_reg)stride * h; 244 __asm__ volatile ( 245 "movq (%1, %%"REG_a"), %%mm0 \n\t" 246 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" 247 "movq %%mm0, %%mm1 \n\t" 248 "movq %%mm2, %%mm3 \n\t" 249 "punpcklbw %%mm7, %%mm0 \n\t" 250 "punpckhbw %%mm7, %%mm1 \n\t" 251 "punpcklbw %%mm7, %%mm2 \n\t" 252 "punpckhbw %%mm7, %%mm3 \n\t" 253 "paddw %%mm2, %%mm0 \n\t" 254 "paddw %%mm3, %%mm1 \n\t" 255 ".p2align 4 \n\t" 256 "1: \n\t" 257 "movq (%2, %%"REG_a"), %%mm2 \n\t" 258 "movq 1(%2, %%"REG_a"), %%mm4 \n\t" 259 "movq %%mm2, %%mm3 \n\t" 260 "movq %%mm4, %%mm5 \n\t" 261 "punpcklbw %%mm7, %%mm2 \n\t" 262 "punpckhbw %%mm7, %%mm3 \n\t" 263 "punpcklbw %%mm7, %%mm4 \n\t" 264 "punpckhbw %%mm7, %%mm5 \n\t" 265 "paddw %%mm4, %%mm2 \n\t" 266 "paddw %%mm5, %%mm3 \n\t" 267 "movq %5, %%mm5 \n\t" 268 "paddw %%mm2, %%mm0 \n\t" 269 "paddw %%mm3, %%mm1 \n\t" 270 "paddw %%mm5, %%mm0 \n\t" 271 "paddw %%mm5, %%mm1 \n\t" 272 "movq (%3, %%"REG_a"), %%mm4 \n\t" 273 "movq (%3, %%"REG_a"), %%mm5 \n\t" 274 "psrlw $2, %%mm0 \n\t" 275 "psrlw $2, %%mm1 \n\t" 276 "packuswb %%mm1, %%mm0 \n\t" 277 "psubusb %%mm0, %%mm4 \n\t" 278 "psubusb %%mm5, %%mm0 \n\t" 279 "por %%mm4, %%mm0 \n\t" 280 "movq %%mm0, %%mm4 \n\t" 281 "punpcklbw %%mm7, %%mm0 \n\t" 282 "punpckhbw %%mm7, %%mm4 \n\t" 283 "paddw %%mm0, %%mm6 \n\t" 284 "paddw %%mm4, %%mm6 \n\t" 285 "movq %%mm2, %%mm0 \n\t" 286 "movq %%mm3, %%mm1 \n\t" 287 "add %4, %%"REG_a" \n\t" 288 " js 1b \n\t" 289 : "+a" (len) 290 : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), 291 "r" ((x86_reg) stride), "m" (round_tab[2])); 292} 293 294static inline int sum_mmx(void) 295{ 296 int ret; 297 __asm__ volatile ( 298 "movq %%mm6, %%mm0 \n\t" 299 "psrlq $32, %%mm6 \n\t" 300 "paddw %%mm0, %%mm6 \n\t" 301 "movq %%mm6, %%mm0 \n\t" 302 "psrlq $16, %%mm6 \n\t" 303 "paddw %%mm0, %%mm6 \n\t" 304 "movd %%mm6, %0 \n\t" 305 : "=r" (ret)); 306 return ret & 0xFFFF; 307} 308 309static inline int sum_mmxext(void) 310{ 311 int ret; 312 __asm__ volatile ( 313 "movd %%mm6, %0 \n\t" 314 : "=r" (ret)); 315 return ret; 316} 317 318static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) 319{ 320 sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h); 321} 322 323static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) 324{ 325 sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h); 326} 327 328#define PIX_SAD(suf) \ 329static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 330 uint8_t *blk1, int stride, int h) \ 331{ \ 332 av_assert2(h == 8); \ 333 __asm__ volatile ( \ 334 "pxor %%mm7, %%mm7 \n\t" \ 335 "pxor %%mm6, %%mm6 \n\t" \ 336 :); \ 337 \ 338 sad8_1_ ## suf(blk1, blk2, stride, 8); \ 339 \ 340 return sum_ ## suf(); \ 341} \ 342 \ 343static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 344 uint8_t *blk1, int stride, int h) \ 345{ \ 346 av_assert2(h == 8); \ 347 __asm__ volatile ( \ 348 "pxor %%mm7, %%mm7 \n\t" \ 349 "pxor %%mm6, %%mm6 \n\t" \ 350 "movq %0, %%mm5 \n\t" \ 351 :: "m" (round_tab[1])); \ 352 \ 353 sad8_x2a_ ## suf(blk1, blk2, stride, 8); \ 354 \ 355 return sum_ ## suf(); \ 356} \ 357 \ 358static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 359 uint8_t *blk1, int stride, int h) \ 360{ \ 361 av_assert2(h == 8); \ 362 __asm__ volatile ( \ 363 "pxor %%mm7, %%mm7 \n\t" \ 364 "pxor %%mm6, %%mm6 \n\t" \ 365 "movq %0, %%mm5 \n\t" \ 366 :: "m" (round_tab[1])); \ 367 \ 368 sad8_y2a_ ## suf(blk1, blk2, stride, 8); \ 369 \ 370 return sum_ ## suf(); \ 371} \ 372 \ 373static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 374 uint8_t *blk1, int stride, int h) \ 375{ \ 376 av_assert2(h == 8); \ 377 __asm__ volatile ( \ 378 "pxor %%mm7, %%mm7 \n\t" \ 379 "pxor %%mm6, %%mm6 \n\t" \ 380 ::); \ 381 \ 382 sad8_4_ ## suf(blk1, blk2, stride, 8); \ 383 \ 384 return sum_ ## suf(); \ 385} \ 386 \ 387static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 388 uint8_t *blk1, int stride, int h) \ 389{ \ 390 __asm__ volatile ( \ 391 "pxor %%mm7, %%mm7 \n\t" \ 392 "pxor %%mm6, %%mm6 \n\t" \ 393 :); \ 394 \ 395 sad8_1_ ## suf(blk1, blk2, stride, h); \ 396 sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 397 \ 398 return sum_ ## suf(); \ 399} \ 400 \ 401static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 402 uint8_t *blk1, int stride, int h) \ 403{ \ 404 __asm__ volatile ( \ 405 "pxor %%mm7, %%mm7 \n\t" \ 406 "pxor %%mm6, %%mm6 \n\t" \ 407 "movq %0, %%mm5 \n\t" \ 408 :: "m" (round_tab[1])); \ 409 \ 410 sad8_x2a_ ## suf(blk1, blk2, stride, h); \ 411 sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 412 \ 413 return sum_ ## suf(); \ 414} \ 415 \ 416static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 417 uint8_t *blk1, int stride, int h) \ 418{ \ 419 __asm__ volatile ( \ 420 "pxor %%mm7, %%mm7 \n\t" \ 421 "pxor %%mm6, %%mm6 \n\t" \ 422 "movq %0, %%mm5 \n\t" \ 423 :: "m" (round_tab[1])); \ 424 \ 425 sad8_y2a_ ## suf(blk1, blk2, stride, h); \ 426 sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 427 \ 428 return sum_ ## suf(); \ 429} \ 430 \ 431static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 432 uint8_t *blk1, int stride, int h) \ 433{ \ 434 __asm__ volatile ( \ 435 "pxor %%mm7, %%mm7 \n\t" \ 436 "pxor %%mm6, %%mm6 \n\t" \ 437 ::); \ 438 \ 439 sad8_4_ ## suf(blk1, blk2, stride, h); \ 440 sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 441 \ 442 return sum_ ## suf(); \ 443} \ 444 445PIX_SAD(mmx) 446PIX_SAD(mmxext) 447 448#endif /* HAVE_INLINE_ASM */ 449 450av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx) 451{ 452#if HAVE_INLINE_ASM 453 int cpu_flags = av_get_cpu_flags(); 454 455 if (INLINE_MMX(cpu_flags)) { 456 c->pix_abs[0][0] = sad16_mmx; 457 c->pix_abs[0][1] = sad16_x2_mmx; 458 c->pix_abs[0][2] = sad16_y2_mmx; 459 c->pix_abs[0][3] = sad16_xy2_mmx; 460 c->pix_abs[1][0] = sad8_mmx; 461 c->pix_abs[1][1] = sad8_x2_mmx; 462 c->pix_abs[1][2] = sad8_y2_mmx; 463 c->pix_abs[1][3] = sad8_xy2_mmx; 464 465 c->sad[0] = sad16_mmx; 466 c->sad[1] = sad8_mmx; 467 } 468 if (INLINE_MMXEXT(cpu_flags)) { 469 c->pix_abs[0][0] = sad16_mmxext; 470 c->pix_abs[1][0] = sad8_mmxext; 471 472 c->sad[0] = sad16_mmxext; 473 c->sad[1] = sad8_mmxext; 474 475 c->pix_abs[0][1] = sad16_x2_mmxext; 476 c->pix_abs[0][2] = sad16_y2_mmxext; 477 c->pix_abs[1][1] = sad8_x2_mmxext; 478 c->pix_abs[1][2] = sad8_y2_mmxext; 479 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { 480 c->pix_abs[0][3] = sad16_xy2_mmxext; 481 c->pix_abs[1][3] = sad8_xy2_mmxext; 482 } 483 } 484 if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) { 485 c->sad[0] = sad16_sse2; 486 } 487#endif /* HAVE_INLINE_ASM */ 488} 489