/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"


static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}

static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "pxor %%xmm4,      %%xmm4       \n\t"
        "movq (%0),        %%xmm0       \n\t"
        "movq (%0, %2),    %%xmm1       \n\t"
        "movq (%0, %2,2),  %%xmm2       \n\t"
        "movq (%0, %3),    %%xmm3       \n\t"
        "lea (%0,%2,4), %0              \n\t"
        "punpcklbw %%xmm4, %%xmm0       \n\t"
        "punpcklbw %%xmm4, %%xmm1       \n\t"
        "punpcklbw %%xmm4, %%xmm2       \n\t"
        "punpcklbw %%xmm4, %%xmm3       \n\t"
        "movdqa %%xmm0,      (%1)       \n\t"
        "movdqa %%xmm1,    16(%1)       \n\t"
        "movdqa %%xmm2,    32(%1)       \n\t"
        "movdqa %%xmm3,    48(%1)       \n\t"
        "movq (%0),        %%xmm0       \n\t"
        "movq (%0, %2),    %%xmm1       \n\t"
        "movq (%0, %2,2),  %%xmm2       \n\t"
        "movq (%0, %3),    %%xmm3       \n\t"
        "punpcklbw %%xmm4, %%xmm0       \n\t"
        "punpcklbw %%xmm4, %%xmm1       \n\t"
        "punpcklbw %%xmm4, %%xmm2       \n\t"
        "punpcklbw %%xmm4, %%xmm3       \n\t"
        "movdqa %%xmm0,    64(%1)       \n\t"
        "movdqa %%xmm1,    80(%1)       \n\t"
        "movdqa %%xmm2,    96(%1)       \n\t"
        "movdqa %%xmm3,   112(%1)       \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
    );
}
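/* For reference, both get_pixels variants above implement the same operation
 * as this plain-C sketch: read an 8x8 block of unsigned bytes and widen it to
 * 16-bit DCT coefficients (punpck{l,h}bw against a zeroed register does the
 * widening in the asm).  Illustrative only, not compiled into the build:
 */
#if 0
static void get_pixels_c_ref(DCTELEM *block, const uint8_t *pixels,
                             int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[8 * i + j] = pixels[j]; /* zero-extend byte to DCTELEM */
        pixels += line_size;
    }
}
#endif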
"psubw %%mm3, %%mm1 \n\t" 112 "movq %%mm0, (%2, %%"REG_a") \n\t" 113 "movq %%mm1, 8(%2, %%"REG_a") \n\t" 114 "add %3, %0 \n\t" 115 "add %3, %1 \n\t" 116 "add $16, %%"REG_a" \n\t" 117 "jnz 1b \n\t" 118 : "+r" (s1), "+r" (s2) 119 : "r" (block+64), "r" ((x86_reg)stride) 120 : "%"REG_a 121 ); 122} 123 124static int pix_sum16_mmx(uint8_t * pix, int line_size){ 125 const int h=16; 126 int sum; 127 x86_reg index= -line_size*h; 128 129 __asm__ volatile( 130 "pxor %%mm7, %%mm7 \n\t" 131 "pxor %%mm6, %%mm6 \n\t" 132 "1: \n\t" 133 "movq (%2, %1), %%mm0 \n\t" 134 "movq (%2, %1), %%mm1 \n\t" 135 "movq 8(%2, %1), %%mm2 \n\t" 136 "movq 8(%2, %1), %%mm3 \n\t" 137 "punpcklbw %%mm7, %%mm0 \n\t" 138 "punpckhbw %%mm7, %%mm1 \n\t" 139 "punpcklbw %%mm7, %%mm2 \n\t" 140 "punpckhbw %%mm7, %%mm3 \n\t" 141 "paddw %%mm0, %%mm1 \n\t" 142 "paddw %%mm2, %%mm3 \n\t" 143 "paddw %%mm1, %%mm3 \n\t" 144 "paddw %%mm3, %%mm6 \n\t" 145 "add %3, %1 \n\t" 146 " js 1b \n\t" 147 "movq %%mm6, %%mm5 \n\t" 148 "psrlq $32, %%mm6 \n\t" 149 "paddw %%mm5, %%mm6 \n\t" 150 "movq %%mm6, %%mm5 \n\t" 151 "psrlq $16, %%mm6 \n\t" 152 "paddw %%mm5, %%mm6 \n\t" 153 "movd %%mm6, %0 \n\t" 154 "andl $0xFFFF, %0 \n\t" 155 : "=&r" (sum), "+r" (index) 156 : "r" (pix - index), "r" ((x86_reg)line_size) 157 ); 158 159 return sum; 160} 161 162static int pix_norm1_mmx(uint8_t *pix, int line_size) { 163 int tmp; 164 __asm__ volatile ( 165 "movl $16,%%ecx\n" 166 "pxor %%mm0,%%mm0\n" 167 "pxor %%mm7,%%mm7\n" 168 "1:\n" 169 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */ 170 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */ 171 172 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */ 173 174 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */ 175 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */ 176 177 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */ 178 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */ 179 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */ 180 181 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */ 182 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */ 183 184 "pmaddwd %%mm3,%%mm3\n" 185 "pmaddwd %%mm4,%%mm4\n" 186 187 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, 188 pix2^2+pix3^2+pix6^2+pix7^2) */ 189 "paddd %%mm3,%%mm4\n" 190 "paddd %%mm2,%%mm7\n" 191 192 "add %2, %0\n" 193 "paddd %%mm4,%%mm7\n" 194 "dec %%ecx\n" 195 "jnz 1b\n" 196 197 "movq %%mm7,%%mm1\n" 198 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 199 "paddd %%mm7,%%mm1\n" 200 "movd %%mm1,%1\n" 201 : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" ); 202 return tmp; 203} 204 205static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 206 int tmp; 207 __asm__ volatile ( 208 "movl %4,%%ecx\n" 209 "shr $1,%%ecx\n" 210 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 211 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 212 "1:\n" 213 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */ 214 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */ 215 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */ 216 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */ 217 218 /* todo: mm1-mm2, mm3-mm4 */ 219 /* algo: subtract mm1 from mm2 with saturation and vice versa */ 220 /* OR the results to get absolute difference */ 221 "movq %%mm1,%%mm5\n" 222 "movq %%mm3,%%mm6\n" 223 "psubusb %%mm2,%%mm1\n" 224 "psubusb %%mm4,%%mm3\n" 225 "psubusb %%mm5,%%mm2\n" 226 "psubusb %%mm6,%%mm4\n" 227 228 "por %%mm1,%%mm2\n" 229 "por %%mm3,%%mm4\n" 230 231 /* now convert to 16-bit vectors so we can square them */ 232 "movq %%mm2,%%mm1\n" 233 "movq %%mm4,%%mm3\n" 234 235 "punpckhbw %%mm0,%%mm2\n" 236 
"punpckhbw %%mm0,%%mm4\n" 237 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 238 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ 239 240 "pmaddwd %%mm2,%%mm2\n" 241 "pmaddwd %%mm4,%%mm4\n" 242 "pmaddwd %%mm1,%%mm1\n" 243 "pmaddwd %%mm3,%%mm3\n" 244 245 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */ 246 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */ 247 248 "paddd %%mm2,%%mm1\n" 249 "paddd %%mm4,%%mm3\n" 250 "paddd %%mm1,%%mm7\n" 251 "paddd %%mm3,%%mm7\n" 252 253 "decl %%ecx\n" 254 "jnz 1b\n" 255 256 "movq %%mm7,%%mm1\n" 257 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 258 "paddd %%mm7,%%mm1\n" 259 "movd %%mm1,%2\n" 260 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 261 : "r" ((x86_reg)line_size) , "m" (h) 262 : "%ecx"); 263 return tmp; 264} 265 266static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 267 int tmp; 268 __asm__ volatile ( 269 "movl %4,%%ecx\n" 270 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 271 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 272 "1:\n" 273 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ 274 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ 275 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */ 276 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */ 277 278 /* todo: mm1-mm2, mm3-mm4 */ 279 /* algo: subtract mm1 from mm2 with saturation and vice versa */ 280 /* OR the results to get absolute difference */ 281 "movq %%mm1,%%mm5\n" 282 "movq %%mm3,%%mm6\n" 283 "psubusb %%mm2,%%mm1\n" 284 "psubusb %%mm4,%%mm3\n" 285 "psubusb %%mm5,%%mm2\n" 286 "psubusb %%mm6,%%mm4\n" 287 288 "por %%mm1,%%mm2\n" 289 "por %%mm3,%%mm4\n" 290 291 /* now convert to 16-bit vectors so we can square them */ 292 "movq %%mm2,%%mm1\n" 293 "movq %%mm4,%%mm3\n" 294 295 "punpckhbw %%mm0,%%mm2\n" 296 "punpckhbw %%mm0,%%mm4\n" 297 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 298 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ 299 300 "pmaddwd %%mm2,%%mm2\n" 301 "pmaddwd %%mm4,%%mm4\n" 302 "pmaddwd %%mm1,%%mm1\n" 303 "pmaddwd %%mm3,%%mm3\n" 304 305 "add %3,%0\n" 306 "add %3,%1\n" 307 308 "paddd %%mm2,%%mm1\n" 309 "paddd %%mm4,%%mm3\n" 310 "paddd %%mm1,%%mm7\n" 311 "paddd %%mm3,%%mm7\n" 312 313 "decl %%ecx\n" 314 "jnz 1b\n" 315 316 "movq %%mm7,%%mm1\n" 317 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 318 "paddd %%mm7,%%mm1\n" 319 "movd %%mm1,%2\n" 320 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 321 : "r" ((x86_reg)line_size) , "m" (h) 322 : "%ecx"); 323 return tmp; 324} 325 326int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h); 327 328static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { 329 int tmp; 330 __asm__ volatile ( 331 "movl %3,%%ecx\n" 332 "pxor %%mm7,%%mm7\n" 333 "pxor %%mm6,%%mm6\n" 334 335 "movq (%0),%%mm0\n" 336 "movq %%mm0, %%mm1\n" 337 "psllq $8, %%mm0\n" 338 "psrlq $8, %%mm1\n" 339 "psrlq $8, %%mm0\n" 340 "movq %%mm0, %%mm2\n" 341 "movq %%mm1, %%mm3\n" 342 "punpcklbw %%mm7,%%mm0\n" 343 "punpcklbw %%mm7,%%mm1\n" 344 "punpckhbw %%mm7,%%mm2\n" 345 "punpckhbw %%mm7,%%mm3\n" 346 "psubw %%mm1, %%mm0\n" 347 "psubw %%mm3, %%mm2\n" 348 349 "add %2,%0\n" 350 351 "movq (%0),%%mm4\n" 352 "movq %%mm4, %%mm1\n" 353 "psllq $8, %%mm4\n" 354 "psrlq $8, %%mm1\n" 355 "psrlq $8, %%mm4\n" 356 "movq %%mm4, %%mm5\n" 357 "movq %%mm1, %%mm3\n" 358 "punpcklbw %%mm7,%%mm4\n" 359 "punpcklbw %%mm7,%%mm1\n" 360 "punpckhbw %%mm7,%%mm5\n" 361 "punpckhbw %%mm7,%%mm3\n" 362 "psubw %%mm1, %%mm4\n" 363 "psubw %%mm3, %%mm5\n" 364 "psubw %%mm4, %%mm0\n" 365 "psubw %%mm5, %%mm2\n" 366 "pxor %%mm3, %%mm3\n" 367 "pxor %%mm1, %%mm1\n" 368 
"pcmpgtw %%mm0, %%mm3\n\t" 369 "pcmpgtw %%mm2, %%mm1\n\t" 370 "pxor %%mm3, %%mm0\n" 371 "pxor %%mm1, %%mm2\n" 372 "psubw %%mm3, %%mm0\n" 373 "psubw %%mm1, %%mm2\n" 374 "paddw %%mm0, %%mm2\n" 375 "paddw %%mm2, %%mm6\n" 376 377 "add %2,%0\n" 378 "1:\n" 379 380 "movq (%0),%%mm0\n" 381 "movq %%mm0, %%mm1\n" 382 "psllq $8, %%mm0\n" 383 "psrlq $8, %%mm1\n" 384 "psrlq $8, %%mm0\n" 385 "movq %%mm0, %%mm2\n" 386 "movq %%mm1, %%mm3\n" 387 "punpcklbw %%mm7,%%mm0\n" 388 "punpcklbw %%mm7,%%mm1\n" 389 "punpckhbw %%mm7,%%mm2\n" 390 "punpckhbw %%mm7,%%mm3\n" 391 "psubw %%mm1, %%mm0\n" 392 "psubw %%mm3, %%mm2\n" 393 "psubw %%mm0, %%mm4\n" 394 "psubw %%mm2, %%mm5\n" 395 "pxor %%mm3, %%mm3\n" 396 "pxor %%mm1, %%mm1\n" 397 "pcmpgtw %%mm4, %%mm3\n\t" 398 "pcmpgtw %%mm5, %%mm1\n\t" 399 "pxor %%mm3, %%mm4\n" 400 "pxor %%mm1, %%mm5\n" 401 "psubw %%mm3, %%mm4\n" 402 "psubw %%mm1, %%mm5\n" 403 "paddw %%mm4, %%mm5\n" 404 "paddw %%mm5, %%mm6\n" 405 406 "add %2,%0\n" 407 408 "movq (%0),%%mm4\n" 409 "movq %%mm4, %%mm1\n" 410 "psllq $8, %%mm4\n" 411 "psrlq $8, %%mm1\n" 412 "psrlq $8, %%mm4\n" 413 "movq %%mm4, %%mm5\n" 414 "movq %%mm1, %%mm3\n" 415 "punpcklbw %%mm7,%%mm4\n" 416 "punpcklbw %%mm7,%%mm1\n" 417 "punpckhbw %%mm7,%%mm5\n" 418 "punpckhbw %%mm7,%%mm3\n" 419 "psubw %%mm1, %%mm4\n" 420 "psubw %%mm3, %%mm5\n" 421 "psubw %%mm4, %%mm0\n" 422 "psubw %%mm5, %%mm2\n" 423 "pxor %%mm3, %%mm3\n" 424 "pxor %%mm1, %%mm1\n" 425 "pcmpgtw %%mm0, %%mm3\n\t" 426 "pcmpgtw %%mm2, %%mm1\n\t" 427 "pxor %%mm3, %%mm0\n" 428 "pxor %%mm1, %%mm2\n" 429 "psubw %%mm3, %%mm0\n" 430 "psubw %%mm1, %%mm2\n" 431 "paddw %%mm0, %%mm2\n" 432 "paddw %%mm2, %%mm6\n" 433 434 "add %2,%0\n" 435 "subl $2, %%ecx\n" 436 " jnz 1b\n" 437 438 "movq %%mm6, %%mm0\n" 439 "punpcklwd %%mm7,%%mm0\n" 440 "punpckhwd %%mm7,%%mm6\n" 441 "paddd %%mm0, %%mm6\n" 442 443 "movq %%mm6,%%mm0\n" 444 "psrlq $32, %%mm6\n" 445 "paddd %%mm6,%%mm0\n" 446 "movd %%mm0,%1\n" 447 : "+r" (pix1), "=r"(tmp) 448 : "r" ((x86_reg)line_size) , "g" (h-2) 449 : "%ecx"); 450 return tmp; 451} 452 453static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { 454 int tmp; 455 uint8_t * pix= pix1; 456 __asm__ volatile ( 457 "movl %3,%%ecx\n" 458 "pxor %%mm7,%%mm7\n" 459 "pxor %%mm6,%%mm6\n" 460 461 "movq (%0),%%mm0\n" 462 "movq 1(%0),%%mm1\n" 463 "movq %%mm0, %%mm2\n" 464 "movq %%mm1, %%mm3\n" 465 "punpcklbw %%mm7,%%mm0\n" 466 "punpcklbw %%mm7,%%mm1\n" 467 "punpckhbw %%mm7,%%mm2\n" 468 "punpckhbw %%mm7,%%mm3\n" 469 "psubw %%mm1, %%mm0\n" 470 "psubw %%mm3, %%mm2\n" 471 472 "add %2,%0\n" 473 474 "movq (%0),%%mm4\n" 475 "movq 1(%0),%%mm1\n" 476 "movq %%mm4, %%mm5\n" 477 "movq %%mm1, %%mm3\n" 478 "punpcklbw %%mm7,%%mm4\n" 479 "punpcklbw %%mm7,%%mm1\n" 480 "punpckhbw %%mm7,%%mm5\n" 481 "punpckhbw %%mm7,%%mm3\n" 482 "psubw %%mm1, %%mm4\n" 483 "psubw %%mm3, %%mm5\n" 484 "psubw %%mm4, %%mm0\n" 485 "psubw %%mm5, %%mm2\n" 486 "pxor %%mm3, %%mm3\n" 487 "pxor %%mm1, %%mm1\n" 488 "pcmpgtw %%mm0, %%mm3\n\t" 489 "pcmpgtw %%mm2, %%mm1\n\t" 490 "pxor %%mm3, %%mm0\n" 491 "pxor %%mm1, %%mm2\n" 492 "psubw %%mm3, %%mm0\n" 493 "psubw %%mm1, %%mm2\n" 494 "paddw %%mm0, %%mm2\n" 495 "paddw %%mm2, %%mm6\n" 496 497 "add %2,%0\n" 498 "1:\n" 499 500 "movq (%0),%%mm0\n" 501 "movq 1(%0),%%mm1\n" 502 "movq %%mm0, %%mm2\n" 503 "movq %%mm1, %%mm3\n" 504 "punpcklbw %%mm7,%%mm0\n" 505 "punpcklbw %%mm7,%%mm1\n" 506 "punpckhbw %%mm7,%%mm2\n" 507 "punpckhbw %%mm7,%%mm3\n" 508 "psubw %%mm1, %%mm0\n" 509 "psubw %%mm3, %%mm2\n" 510 "psubw %%mm0, %%mm4\n" 511 "psubw %%mm2, %%mm5\n" 512 "pxor %%mm3, %%mm3\n" 513 "pxor %%mm1, %%mm1\n" 
514 "pcmpgtw %%mm4, %%mm3\n\t" 515 "pcmpgtw %%mm5, %%mm1\n\t" 516 "pxor %%mm3, %%mm4\n" 517 "pxor %%mm1, %%mm5\n" 518 "psubw %%mm3, %%mm4\n" 519 "psubw %%mm1, %%mm5\n" 520 "paddw %%mm4, %%mm5\n" 521 "paddw %%mm5, %%mm6\n" 522 523 "add %2,%0\n" 524 525 "movq (%0),%%mm4\n" 526 "movq 1(%0),%%mm1\n" 527 "movq %%mm4, %%mm5\n" 528 "movq %%mm1, %%mm3\n" 529 "punpcklbw %%mm7,%%mm4\n" 530 "punpcklbw %%mm7,%%mm1\n" 531 "punpckhbw %%mm7,%%mm5\n" 532 "punpckhbw %%mm7,%%mm3\n" 533 "psubw %%mm1, %%mm4\n" 534 "psubw %%mm3, %%mm5\n" 535 "psubw %%mm4, %%mm0\n" 536 "psubw %%mm5, %%mm2\n" 537 "pxor %%mm3, %%mm3\n" 538 "pxor %%mm1, %%mm1\n" 539 "pcmpgtw %%mm0, %%mm3\n\t" 540 "pcmpgtw %%mm2, %%mm1\n\t" 541 "pxor %%mm3, %%mm0\n" 542 "pxor %%mm1, %%mm2\n" 543 "psubw %%mm3, %%mm0\n" 544 "psubw %%mm1, %%mm2\n" 545 "paddw %%mm0, %%mm2\n" 546 "paddw %%mm2, %%mm6\n" 547 548 "add %2,%0\n" 549 "subl $2, %%ecx\n" 550 " jnz 1b\n" 551 552 "movq %%mm6, %%mm0\n" 553 "punpcklwd %%mm7,%%mm0\n" 554 "punpckhwd %%mm7,%%mm6\n" 555 "paddd %%mm0, %%mm6\n" 556 557 "movq %%mm6,%%mm0\n" 558 "psrlq $32, %%mm6\n" 559 "paddd %%mm6,%%mm0\n" 560 "movd %%mm0,%1\n" 561 : "+r" (pix1), "=r"(tmp) 562 : "r" ((x86_reg)line_size) , "g" (h-2) 563 : "%ecx"); 564 return tmp + hf_noise8_mmx(pix+8, line_size, h); 565} 566 567static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 568 MpegEncContext *c = p; 569 int score1, score2; 570 571 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); 572 else score1 = sse16_mmx(c, pix1, pix2, line_size, h); 573 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h); 574 575 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; 576 else return score1 + FFABS(score2)*8; 577} 578 579static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 580 MpegEncContext *c = p; 581 int score1= sse8_mmx(c, pix1, pix2, line_size, h); 582 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h); 583 584 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; 585 else return score1 + FFABS(score2)*8; 586} 587 588static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { 589 int tmp; 590 591 assert( (((int)pix) & 7) == 0); 592 assert((line_size &7) ==0); 593 594#define SUM(in0, in1, out0, out1) \ 595 "movq (%0), %%mm2\n"\ 596 "movq 8(%0), %%mm3\n"\ 597 "add %2,%0\n"\ 598 "movq %%mm2, " #out0 "\n"\ 599 "movq %%mm3, " #out1 "\n"\ 600 "psubusb " #in0 ", %%mm2\n"\ 601 "psubusb " #in1 ", %%mm3\n"\ 602 "psubusb " #out0 ", " #in0 "\n"\ 603 "psubusb " #out1 ", " #in1 "\n"\ 604 "por %%mm2, " #in0 "\n"\ 605 "por %%mm3, " #in1 "\n"\ 606 "movq " #in0 ", %%mm2\n"\ 607 "movq " #in1 ", %%mm3\n"\ 608 "punpcklbw %%mm7, " #in0 "\n"\ 609 "punpcklbw %%mm7, " #in1 "\n"\ 610 "punpckhbw %%mm7, %%mm2\n"\ 611 "punpckhbw %%mm7, %%mm3\n"\ 612 "paddw " #in1 ", " #in0 "\n"\ 613 "paddw %%mm3, %%mm2\n"\ 614 "paddw %%mm2, " #in0 "\n"\ 615 "paddw " #in0 ", %%mm6\n" 616 617 618 __asm__ volatile ( 619 "movl %3,%%ecx\n" 620 "pxor %%mm6,%%mm6\n" 621 "pxor %%mm7,%%mm7\n" 622 "movq (%0),%%mm0\n" 623 "movq 8(%0),%%mm1\n" 624 "add %2,%0\n" 625 "jmp 2f\n" 626 "1:\n" 627 628 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 629 "2:\n" 630 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 631 632 "subl $2, %%ecx\n" 633 "jnz 1b\n" 634 635 "movq %%mm6,%%mm0\n" 636 "psrlq $32, %%mm6\n" 637 "paddw %%mm6,%%mm0\n" 638 "movq %%mm0,%%mm6\n" 639 "psrlq $16, %%mm0\n" 640 "paddw %%mm6,%%mm0\n" 641 "movd %%mm0,%1\n" 642 : "+r" (pix), "=r"(tmp) 643 : "r" 
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM
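/* Note on the inter vsad variants above and below: psubb wraps, so the byte
 * difference pix1 - pix2 is signed.  mm7 is built as 0x80 in every byte
 * (pcmpeqw gives 0xFFFF words, psllw $15 gives 0x8000, packsswb saturates
 * that to 0x80 bytes) and XORed in to bias the signed difference into
 * offset-binary, after which the unsigned psubusb/por absolute-value trick
 * (or psadbw in the MMX2 version) applies. */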
"psadbw " #out1 ", " #in1 "\n"\ 787 "paddw " #in1 ", " #in0 "\n"\ 788 "paddw " #in0 ", %%mm6\n" 789 790 __asm__ volatile ( 791 "movl %4,%%ecx\n" 792 "pxor %%mm6,%%mm6\n" 793 "pcmpeqw %%mm7,%%mm7\n" 794 "psllw $15, %%mm7\n" 795 "packsswb %%mm7, %%mm7\n" 796 "movq (%0),%%mm0\n" 797 "movq (%1),%%mm2\n" 798 "movq 8(%0),%%mm1\n" 799 "movq 8(%1),%%mm3\n" 800 "add %3,%0\n" 801 "add %3,%1\n" 802 "psubb %%mm2, %%mm0\n" 803 "psubb %%mm3, %%mm1\n" 804 "pxor %%mm7, %%mm0\n" 805 "pxor %%mm7, %%mm1\n" 806 "jmp 2f\n" 807 "1:\n" 808 809 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 810 "2:\n" 811 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 812 813 "subl $2, %%ecx\n" 814 "jnz 1b\n" 815 816 "movd %%mm6,%2\n" 817 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 818 : "r" ((x86_reg)line_size) , "m" (h) 819 : "%ecx"); 820 return tmp; 821} 822#undef SUM 823 824static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ 825 x86_reg i=0; 826 __asm__ volatile( 827 "1: \n\t" 828 "movq (%2, %0), %%mm0 \n\t" 829 "movq (%1, %0), %%mm1 \n\t" 830 "psubb %%mm0, %%mm1 \n\t" 831 "movq %%mm1, (%3, %0) \n\t" 832 "movq 8(%2, %0), %%mm0 \n\t" 833 "movq 8(%1, %0), %%mm1 \n\t" 834 "psubb %%mm0, %%mm1 \n\t" 835 "movq %%mm1, 8(%3, %0) \n\t" 836 "add $16, %0 \n\t" 837 "cmp %4, %0 \n\t" 838 " jb 1b \n\t" 839 : "+r" (i) 840 : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15) 841 ); 842 for(; i<w; i++) 843 dst[i+0] = src1[i+0]-src2[i+0]; 844} 845 846static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){ 847 x86_reg i=0; 848 uint8_t l, lt; 849 850 __asm__ volatile( 851 "1: \n\t" 852 "movq -1(%1, %0), %%mm0 \n\t" // LT 853 "movq (%1, %0), %%mm1 \n\t" // T 854 "movq -1(%2, %0), %%mm2 \n\t" // L 855 "movq (%2, %0), %%mm3 \n\t" // X 856 "movq %%mm2, %%mm4 \n\t" // L 857 "psubb %%mm0, %%mm2 \n\t" 858 "paddb %%mm1, %%mm2 \n\t" // L + T - LT 859 "movq %%mm4, %%mm5 \n\t" // L 860 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L) 861 "pminub %%mm5, %%mm1 \n\t" // min(T, L) 862 "pminub %%mm2, %%mm4 \n\t" 863 "pmaxub %%mm1, %%mm4 \n\t" 864 "psubb %%mm4, %%mm3 \n\t" // dst - pred 865 "movq %%mm3, (%3, %0) \n\t" 866 "add $8, %0 \n\t" 867 "cmp %4, %0 \n\t" 868 " jb 1b \n\t" 869 : "+r" (i) 870 : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w) 871 ); 872 873 l= *left; 874 lt= *left_top; 875 876 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF); 877 878 *left_top= src1[w-1]; 879 *left = src2[w-1]; 880} 881 882#define MMABS_MMX(a,z)\ 883 "pxor " #z ", " #z " \n\t"\ 884 "pcmpgtw " #a ", " #z " \n\t"\ 885 "pxor " #z ", " #a " \n\t"\ 886 "psubw " #z ", " #a " \n\t" 887 888#define MMABS_MMX2(a,z)\ 889 "pxor " #z ", " #z " \n\t"\ 890 "psubw " #a ", " #z " \n\t"\ 891 "pmaxsw " #z ", " #a " \n\t" 892 893#define MMABS_SSSE3(a,z)\ 894 "pabsw " #a ", " #a " \n\t" 895 896#define MMABS_SUM(a,z, sum)\ 897 MMABS(a,z)\ 898 "paddusw " #a ", " #sum " \n\t" 899 900/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to 901 * about 100k on extreme inputs. But that's very unlikely to occur in natural video, 902 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. 
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
 * up to about 100k on extreme inputs.  But that's very unlikely to occur in
 * natural video, and it's even more unlikely to not have any alternative
 * mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define hadamard_func(cpu) \
int ff_hadamard8_diff_##cpu  (void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h); \
int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h);

hadamard_func(mmx)
hadamard_func(mmx2)
hadamard_func(sse2)
hadamard_func(ssse3)

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4              \n"
        "1:                             \n"
        "sub $8, %0                     \n"
        "movq (%2,%0), %%mm2            \n"
        "movq (%3,%0,2), %%mm0          \n"
        "movq 8(%3,%0,2), %%mm1         \n"
        "punpckhbw %%mm2, %%mm3         \n"
        "punpcklbw %%mm2, %%mm2         \n"
        "psraw $8, %%mm3                \n"
        "psraw $8, %%mm2                \n"
        "psubw %%mm3, %%mm1             \n"
        "psubw %%mm2, %%mm0             \n"
        "pmaddwd %%mm1, %%mm1           \n"
        "pmaddwd %%mm0, %%mm0           \n"
        "paddd %%mm1, %%mm4             \n"
        "paddd %%mm0, %%mm4             \n"
        "jg 1b                          \n"
        "movq %%mm4, %%mm3              \n"
        "psrlq $32, %%mm3               \n"
        "paddd %%mm3, %%mm4             \n"
        "movd %%mm4, %1                 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
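/* Plain-C equivalent of ssd_int8_vs_int16_mmx (illustrative only, not part
 * of the build): the sum of squared differences between a signed 8-bit array
 * and a 16-bit array.
 */
#if 0
static int ssd_int8_vs_int16_c_ref(const int8_t *pix1, const int16_t *pix2,
                                   int size)
{
    int s = 0, i;
    for (i = 0; i < size; i++) {
        int d = pix1[i] - pix2[i];
        s += d * d;
    }
    return s;
}
#endif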
\n\t" 1039/* 1040 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31] 1041 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31] 1042 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30] 1043 */ 1044#define PMULHRW(x, y, s, o)\ 1045 "pmulhw " #s ", "#x " \n\t"\ 1046 "pmulhw " #s ", "#y " \n\t"\ 1047 "paddw " #o ", "#x " \n\t"\ 1048 "paddw " #o ", "#y " \n\t"\ 1049 "psraw $1, "#x " \n\t"\ 1050 "psraw $1, "#y " \n\t" 1051#define DEF(x) x ## _mmx 1052#define SET_RND MOVQ_WONE 1053#define SCALE_OFFSET 1 1054 1055#include "dsputil_mmx_qns_template.c" 1056 1057#undef DEF 1058#undef SET_RND 1059#undef SCALE_OFFSET 1060#undef PMULHRW 1061 1062#define DEF(x) x ## _3dnow 1063#define SET_RND(x) 1064#define SCALE_OFFSET 0 1065#define PMULHRW(x, y, s, o)\ 1066 "pmulhrw " #s ", "#x " \n\t"\ 1067 "pmulhrw " #s ", "#y " \n\t" 1068 1069#include "dsputil_mmx_qns_template.c" 1070 1071#undef DEF 1072#undef SET_RND 1073#undef SCALE_OFFSET 1074#undef PMULHRW 1075 1076#if HAVE_SSSE3 1077#undef PHADDD 1078#define DEF(x) x ## _ssse3 1079#define SET_RND(x) 1080#define SCALE_OFFSET -1 1081#define PHADDD(a, t)\ 1082 "pshufw $0x0E, "#a", "#t" \n\t"\ 1083 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */ 1084#define PMULHRW(x, y, s, o)\ 1085 "pmulhrsw " #s ", "#x " \n\t"\ 1086 "pmulhrsw " #s ", "#y " \n\t" 1087 1088#include "dsputil_mmx_qns_template.c" 1089 1090#undef DEF 1091#undef SET_RND 1092#undef SCALE_OFFSET 1093#undef PMULHRW 1094#undef PHADDD 1095#endif //HAVE_SSSE3 1096 1097 1098void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) 1099{ 1100 int mm_flags = av_get_cpu_flags(); 1101 int bit_depth = avctx->bits_per_raw_sample; 1102 1103 if (mm_flags & AV_CPU_FLAG_MMX) { 1104 const int dct_algo = avctx->dct_algo; 1105 if (avctx->bits_per_raw_sample <= 8 && 1106 (dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)) { 1107 if(mm_flags & AV_CPU_FLAG_SSE2){ 1108 c->fdct = ff_fdct_sse2; 1109 }else if(mm_flags & AV_CPU_FLAG_MMX2){ 1110 c->fdct = ff_fdct_mmx2; 1111 }else{ 1112 c->fdct = ff_fdct_mmx; 1113 } 1114 } 1115 1116 if (bit_depth <= 8) 1117 c->get_pixels = get_pixels_mmx; 1118 c->diff_pixels = diff_pixels_mmx; 1119 c->pix_sum = pix_sum16_mmx; 1120 1121 c->diff_bytes= diff_bytes_mmx; 1122 c->sum_abs_dctelem= sum_abs_dctelem_mmx; 1123 1124#if HAVE_YASM 1125 c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx; 1126 c->hadamard8_diff[1]= ff_hadamard8_diff_mmx; 1127#endif 1128 1129 c->pix_norm1 = pix_norm1_mmx; 1130 c->sse[0] = (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE2) ? 
void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();
    int bit_depth = avctx->bits_per_raw_sample;

    if (mm_flags & AV_CPU_FLAG_MMX) {
        const int dct_algo = avctx->dct_algo;
        if (avctx->bits_per_raw_sample <= 8 &&
            (dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)) {
            if(mm_flags & AV_CPU_FLAG_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & AV_CPU_FLAG_MMX2){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        if (bit_depth <= 8)
            c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

#if HAVE_YASM
        c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= ff_hadamard8_diff_mmx;
#endif

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE2) ?
                    ff_sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;


        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
#if HAVE_YASM
            c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= ff_hadamard8_diff_mmx2;
#endif
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & AV_CPU_FLAG_SSE2){
            if (bit_depth <= 8)
                c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
#if HAVE_YASM && HAVE_ALIGNED_STACK
            c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= ff_hadamard8_diff_sse2;
#endif
        }

#if HAVE_SSSE3
        if(mm_flags & AV_CPU_FLAG_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
#if HAVE_YASM && HAVE_ALIGNED_STACK
            c->hadamard8_diff[0]= ff_hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= ff_hadamard8_diff_ssse3;
#endif
        }
#endif

        if(mm_flags & AV_CPU_FLAG_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}