1/* 2 * MMX optimized motion estimation 3 * Copyright (c) 2001 Fabrice Bellard 4 * Copyright (c) 2002-2004 Michael Niedermayer 5 * 6 * mostly by Michael Niedermayer <michaelni@gmx.at> 7 * 8 * This file is part of FFmpeg. 9 * 10 * FFmpeg is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU Lesser General Public 12 * License as published by the Free Software Foundation; either 13 * version 2.1 of the License, or (at your option) any later version. 14 * 15 * FFmpeg is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * Lesser General Public License for more details. 19 * 20 * You should have received a copy of the GNU Lesser General Public 21 * License along with FFmpeg; if not, write to the Free Software 22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 23 */ 24 25#include "libavutil/x86_cpu.h" 26#include "libavcodec/dsputil.h" 27 28DECLARE_ASM_CONST(8, uint64_t, round_tab[3])={ 290x0000000000000000ULL, 300x0001000100010001ULL, 310x0002000200020002ULL, 32}; 33 34DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL; 35 36static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) 37{ 38 x86_reg len= -(stride*h); 39 __asm__ volatile( 40 ASMALIGN(4) 41 "1: \n\t" 42 "movq (%1, %%"REG_a"), %%mm0 \n\t" 43 "movq (%2, %%"REG_a"), %%mm2 \n\t" 44 "movq (%2, %%"REG_a"), %%mm4 \n\t" 45 "add %3, %%"REG_a" \n\t" 46 "psubusb %%mm0, %%mm2 \n\t" 47 "psubusb %%mm4, %%mm0 \n\t" 48 "movq (%1, %%"REG_a"), %%mm1 \n\t" 49 "movq (%2, %%"REG_a"), %%mm3 \n\t" 50 "movq (%2, %%"REG_a"), %%mm5 \n\t" 51 "psubusb %%mm1, %%mm3 \n\t" 52 "psubusb %%mm5, %%mm1 \n\t" 53 "por %%mm2, %%mm0 \n\t" 54 "por %%mm1, %%mm3 \n\t" 55 "movq %%mm0, %%mm1 \n\t" 56 "movq %%mm3, %%mm2 \n\t" 57 "punpcklbw %%mm7, %%mm0 \n\t" 58 "punpckhbw %%mm7, %%mm1 \n\t" 59 "punpcklbw %%mm7, %%mm3 \n\t" 60 "punpckhbw %%mm7, %%mm2 \n\t" 61 "paddw %%mm1, %%mm0 \n\t" 62 "paddw %%mm3, %%mm2 \n\t" 63 "paddw %%mm2, %%mm0 \n\t" 64 "paddw %%mm0, %%mm6 \n\t" 65 "add %3, %%"REG_a" \n\t" 66 " js 1b \n\t" 67 : "+a" (len) 68 : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride) 69 ); 70} 71 72static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) 73{ 74 __asm__ volatile( 75 ASMALIGN(4) 76 "1: \n\t" 77 "movq (%1), %%mm0 \n\t" 78 "movq (%1, %3), %%mm1 \n\t" 79 "psadbw (%2), %%mm0 \n\t" 80 "psadbw (%2, %3), %%mm1 \n\t" 81 "paddw %%mm0, %%mm6 \n\t" 82 "paddw %%mm1, %%mm6 \n\t" 83 "lea (%1,%3,2), %1 \n\t" 84 "lea (%2,%3,2), %2 \n\t" 85 "sub $2, %0 \n\t" 86 " jg 1b \n\t" 87 : "+r" (h), "+r" (blk1), "+r" (blk2) 88 : "r" ((x86_reg)stride) 89 ); 90} 91 92static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) 93{ 94 int ret; 95 __asm__ volatile( 96 "pxor %%xmm6, %%xmm6 \n\t" 97 ASMALIGN(4) 98 "1: \n\t" 99 "movdqu (%1), %%xmm0 \n\t" 100 "movdqu (%1, %3), %%xmm1 \n\t" 101 "psadbw (%2), %%xmm0 \n\t" 102 "psadbw (%2, %3), %%xmm1 \n\t" 103 "paddw %%xmm0, %%xmm6 \n\t" 104 "paddw %%xmm1, %%xmm6 \n\t" 105 "lea (%1,%3,2), %1 \n\t" 106 "lea (%2,%3,2), %2 \n\t" 107 "sub $2, %0 \n\t" 108 " jg 1b \n\t" 109 : "+r" (h), "+r" (blk1), "+r" (blk2) 110 : "r" ((x86_reg)stride) 111 ); 112 __asm__ volatile( 113 "movhlps %%xmm6, %%xmm0 \n\t" 114 "paddw %%xmm0, %%xmm6 \n\t" 115 "movd %%xmm6, %0 \n\t" 116 : "=r"(ret) 117 ); 118 return ret; 119} 120 121static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) 122{ 123 __asm__ volatile( 124 ASMALIGN(4) 125 "1: \n\t" 126 "movq (%1), %%mm0 \n\t" 127 "movq (%1, %3), %%mm1 \n\t" 128 "pavgb 1(%1), %%mm0 \n\t" 129 "pavgb 1(%1, %3), %%mm1 \n\t" 130 "psadbw (%2), %%mm0 \n\t" 131 "psadbw (%2, %3), %%mm1 \n\t" 132 "paddw %%mm0, %%mm6 \n\t" 133 "paddw %%mm1, %%mm6 \n\t" 134 "lea (%1,%3,2), %1 \n\t" 135 "lea (%2,%3,2), %2 \n\t" 136 "sub $2, %0 \n\t" 137 " jg 1b \n\t" 138 : "+r" (h), "+r" (blk1), "+r" (blk2) 139 : "r" ((x86_reg)stride) 140 ); 141} 142 143static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) 144{ 145 __asm__ volatile( 146 "movq (%1), %%mm0 \n\t" 147 "add %3, %1 \n\t" 148 ASMALIGN(4) 149 "1: \n\t" 150 "movq (%1), %%mm1 \n\t" 151 "movq (%1, %3), %%mm2 \n\t" 152 "pavgb %%mm1, %%mm0 \n\t" 153 "pavgb %%mm2, %%mm1 \n\t" 154 "psadbw (%2), %%mm0 \n\t" 155 "psadbw (%2, %3), %%mm1 \n\t" 156 "paddw %%mm0, %%mm6 \n\t" 157 "paddw %%mm1, %%mm6 \n\t" 158 "movq %%mm2, %%mm0 \n\t" 159 "lea (%1,%3,2), %1 \n\t" 160 "lea (%2,%3,2), %2 \n\t" 161 "sub $2, %0 \n\t" 162 " jg 1b \n\t" 163 : "+r" (h), "+r" (blk1), "+r" (blk2) 164 : "r" ((x86_reg)stride) 165 ); 166} 167 168static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) 169{ 170 __asm__ volatile( 171 "movq "MANGLE(bone)", %%mm5 \n\t" 172 "movq (%1), %%mm0 \n\t" 173 "pavgb 1(%1), %%mm0 \n\t" 174 "add %3, %1 \n\t" 175 ASMALIGN(4) 176 "1: \n\t" 177 "movq (%1), %%mm1 \n\t" 178 "movq (%1,%3), %%mm2 \n\t" 179 "pavgb 1(%1), %%mm1 \n\t" 180 "pavgb 1(%1,%3), %%mm2 \n\t" 181 "psubusb %%mm5, %%mm1 \n\t" 182 "pavgb %%mm1, %%mm0 \n\t" 183 "pavgb %%mm2, %%mm1 \n\t" 184 "psadbw (%2), %%mm0 \n\t" 185 "psadbw (%2,%3), %%mm1 \n\t" 186 "paddw %%mm0, %%mm6 \n\t" 187 "paddw %%mm1, %%mm6 \n\t" 188 "movq %%mm2, %%mm0 \n\t" 189 "lea (%1,%3,2), %1 \n\t" 190 "lea (%2,%3,2), %2 \n\t" 191 "sub $2, %0 \n\t" 192 " jg 1b \n\t" 193 : "+r" (h), "+r" (blk1), "+r" (blk2) 194 : "r" ((x86_reg)stride) 195 ); 196} 197 198static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) 199{ 200 x86_reg len= -(stride*h); 201 __asm__ volatile( 202 ASMALIGN(4) 203 "1: \n\t" 204 "movq (%1, %%"REG_a"), %%mm0 \n\t" 205 "movq (%2, %%"REG_a"), %%mm1 \n\t" 206 "movq (%1, %%"REG_a"), %%mm2 \n\t" 207 "movq (%2, %%"REG_a"), %%mm3 \n\t" 208 "punpcklbw %%mm7, %%mm0 \n\t" 209 "punpcklbw %%mm7, %%mm1 \n\t" 210 "punpckhbw %%mm7, %%mm2 \n\t" 211 "punpckhbw %%mm7, %%mm3 \n\t" 212 "paddw %%mm0, %%mm1 \n\t" 213 "paddw %%mm2, %%mm3 \n\t" 214 "movq (%3, %%"REG_a"), %%mm4 \n\t" 215 "movq (%3, %%"REG_a"), %%mm2 \n\t" 216 "paddw %%mm5, %%mm1 \n\t" 217 "paddw %%mm5, %%mm3 \n\t" 218 "psrlw $1, %%mm1 \n\t" 219 "psrlw $1, %%mm3 \n\t" 220 "packuswb %%mm3, %%mm1 \n\t" 221 "psubusb %%mm1, %%mm4 \n\t" 222 "psubusb %%mm2, %%mm1 \n\t" 223 "por %%mm4, %%mm1 \n\t" 224 "movq %%mm1, %%mm0 \n\t" 225 "punpcklbw %%mm7, %%mm0 \n\t" 226 "punpckhbw %%mm7, %%mm1 \n\t" 227 "paddw %%mm1, %%mm0 \n\t" 228 "paddw %%mm0, %%mm6 \n\t" 229 "add %4, %%"REG_a" \n\t" 230 " js 1b \n\t" 231 : "+a" (len) 232 : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride) 233 ); 234} 235 236static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) 237{ 238 x86_reg len= -(stride*h); 239 __asm__ volatile( 240 "movq (%1, %%"REG_a"), %%mm0 \n\t" 241 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" 242 "movq %%mm0, %%mm1 \n\t" 243 "movq %%mm2, %%mm3 \n\t" 244 "punpcklbw %%mm7, %%mm0 \n\t" 245 "punpckhbw %%mm7, %%mm1 \n\t" 246 "punpcklbw %%mm7, %%mm2 \n\t" 247 "punpckhbw %%mm7, %%mm3 \n\t" 248 "paddw %%mm2, %%mm0 \n\t" 249 "paddw %%mm3, %%mm1 \n\t" 250 ASMALIGN(4) 251 "1: \n\t" 252 "movq (%2, %%"REG_a"), %%mm2 \n\t" 253 "movq 1(%2, %%"REG_a"), %%mm4 \n\t" 254 "movq %%mm2, %%mm3 \n\t" 255 "movq %%mm4, %%mm5 \n\t" 256 "punpcklbw %%mm7, %%mm2 \n\t" 257 "punpckhbw %%mm7, %%mm3 \n\t" 258 "punpcklbw %%mm7, %%mm4 \n\t" 259 "punpckhbw %%mm7, %%mm5 \n\t" 260 "paddw %%mm4, %%mm2 \n\t" 261 "paddw %%mm5, %%mm3 \n\t" 262 "movq 16+"MANGLE(round_tab)", %%mm5 \n\t" 263 "paddw %%mm2, %%mm0 \n\t" 264 "paddw %%mm3, %%mm1 \n\t" 265 "paddw %%mm5, %%mm0 \n\t" 266 "paddw %%mm5, %%mm1 \n\t" 267 "movq (%3, %%"REG_a"), %%mm4 \n\t" 268 "movq (%3, %%"REG_a"), %%mm5 \n\t" 269 "psrlw $2, %%mm0 \n\t" 270 "psrlw $2, %%mm1 \n\t" 271 "packuswb %%mm1, %%mm0 \n\t" 272 "psubusb %%mm0, %%mm4 \n\t" 273 "psubusb %%mm5, %%mm0 \n\t" 274 "por %%mm4, %%mm0 \n\t" 275 "movq %%mm0, %%mm4 \n\t" 276 "punpcklbw %%mm7, %%mm0 \n\t" 277 "punpckhbw %%mm7, %%mm4 \n\t" 278 "paddw %%mm0, %%mm6 \n\t" 279 "paddw %%mm4, %%mm6 \n\t" 280 "movq %%mm2, %%mm0 \n\t" 281 "movq %%mm3, %%mm1 \n\t" 282 "add %4, %%"REG_a" \n\t" 283 " js 1b \n\t" 284 : "+a" (len) 285 : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride) 286 ); 287} 288 289static inline int sum_mmx(void) 290{ 291 int ret; 292 __asm__ volatile( 293 "movq %%mm6, %%mm0 \n\t" 294 "psrlq $32, %%mm6 \n\t" 295 "paddw %%mm0, %%mm6 \n\t" 296 "movq %%mm6, %%mm0 \n\t" 297 "psrlq $16, %%mm6 \n\t" 298 "paddw %%mm0, %%mm6 \n\t" 299 "movd %%mm6, %0 \n\t" 300 : "=r" (ret) 301 ); 302 return ret&0xFFFF; 303} 304 305static inline int sum_mmx2(void) 306{ 307 int ret; 308 __asm__ volatile( 309 "movd %%mm6, %0 \n\t" 310 : "=r" (ret) 311 ); 312 return ret; 313} 314 315static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) 316{ 317 sad8_2_mmx(blk1, blk1+1, blk2, stride, h); 318} 319static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) 320{ 321 sad8_2_mmx(blk1, blk1+stride, blk2, stride, h); 322} 323 324 325#define PIX_SAD(suf)\ 326static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 327{\ 328 assert(h==8);\ 329 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ 330 "pxor %%mm6, %%mm6 \n\t":);\ 331\ 332 sad8_1_ ## suf(blk1, blk2, stride, 8);\ 333\ 334 return sum_ ## suf();\ 335}\ 336static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 337{\ 338 assert(h==8);\ 339 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ 340 "pxor %%mm6, %%mm6 \n\t"\ 341 "movq %0, %%mm5 \n\t"\ 342 :: "m"(round_tab[1]) \ 343 );\ 344\ 345 sad8_x2a_ ## suf(blk1, blk2, stride, 8);\ 346\ 347 return sum_ ## suf();\ 348}\ 349\ 350static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 351{\ 352 assert(h==8);\ 353 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ 354 "pxor %%mm6, %%mm6 \n\t"\ 355 "movq %0, %%mm5 \n\t"\ 356 :: "m"(round_tab[1]) \ 357 );\ 358\ 359 sad8_y2a_ ## suf(blk1, blk2, stride, 8);\ 360\ 361 return sum_ ## suf();\ 362}\ 363\ 364static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 365{\ 366 assert(h==8);\ 367 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ 368 "pxor %%mm6, %%mm6 \n\t"\ 369 ::);\ 370\ 371 sad8_4_ ## suf(blk1, blk2, stride, 8);\ 372\ 373 return sum_ ## suf();\ 374}\ 375\ 376static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 377{\ 378 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ 379 "pxor %%mm6, %%mm6 \n\t":);\ 380\ 381 sad8_1_ ## suf(blk1 , blk2 , stride, h);\ 382 sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\ 383\ 384 return sum_ ## suf();\ 385}\ 386static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 387{\ 388 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ 389 "pxor %%mm6, %%mm6 \n\t"\ 390 "movq %0, %%mm5 \n\t"\ 391 :: "m"(round_tab[1]) \ 392 );\ 393\ 394 sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\ 395 sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\ 396\ 397 return sum_ ## suf();\ 398}\ 399static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 400{\ 401 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ 402 "pxor %%mm6, %%mm6 \n\t"\ 403 "movq %0, %%mm5 \n\t"\ 404 :: "m"(round_tab[1]) \ 405 );\ 406\ 407 sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\ 408 sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\ 409\ 410 return sum_ ## suf();\ 411}\ 412static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ 413{\ 414 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ 415 "pxor %%mm6, %%mm6 \n\t"\ 416 ::);\ 417\ 418 sad8_4_ ## suf(blk1 , blk2 , stride, h);\ 419 sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\ 420\ 421 return sum_ ## suf();\ 422}\ 423 424PIX_SAD(mmx) 425PIX_SAD(mmx2) 426 427void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx) 428{ 429 if (mm_flags & FF_MM_MMX) { 430 c->pix_abs[0][0] = sad16_mmx; 431 c->pix_abs[0][1] = sad16_x2_mmx; 432 c->pix_abs[0][2] = sad16_y2_mmx; 433 c->pix_abs[0][3] = sad16_xy2_mmx; 434 c->pix_abs[1][0] = sad8_mmx; 435 c->pix_abs[1][1] = sad8_x2_mmx; 436 c->pix_abs[1][2] = sad8_y2_mmx; 437 c->pix_abs[1][3] = sad8_xy2_mmx; 438 439 c->sad[0]= sad16_mmx; 440 c->sad[1]= sad8_mmx; 441 } 442 if (mm_flags & FF_MM_MMXEXT) { 443 c->pix_abs[0][0] = sad16_mmx2; 444 c->pix_abs[1][0] = sad8_mmx2; 445 446 c->sad[0]= sad16_mmx2; 447 c->sad[1]= sad8_mmx2; 448 449 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 450 c->pix_abs[0][1] = sad16_x2_mmx2; 451 c->pix_abs[0][2] = sad16_y2_mmx2; 452 c->pix_abs[0][3] = sad16_xy2_mmx2; 453 c->pix_abs[1][1] = sad8_x2_mmx2; 454 c->pix_abs[1][2] = sad8_y2_mmx2; 455 c->pix_abs[1][3] = sad8_xy2_mmx2; 456 } 457 } 458 if ((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)) { 459 c->sad[0]= sad16_sse2; 460 } 461} 462