1/* 2 * MMX optimized DSP utils 3 * Copyright (c) 2000, 2001 Fabrice Bellard 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> 5 * 6 * This file is part of FFmpeg. 7 * 8 * FFmpeg is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU Lesser General Public 10 * License as published by the Free Software Foundation; either 11 * version 2.1 of the License, or (at your option) any later version. 12 * 13 * FFmpeg is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public 19 * License along with FFmpeg; if not, write to the Free Software 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21 * 22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> 23 */ 24 25#include "libavutil/x86_cpu.h" 26#include "libavcodec/dsputil.h" 27#include "libavcodec/mpegvideo.h" 28#include "libavcodec/mathops.h" 29#include "dsputil_mmx.h" 30 31 32static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) 33{ 34 __asm__ volatile( 35 "mov $-128, %%"REG_a" \n\t" 36 "pxor %%mm7, %%mm7 \n\t" 37 ASMALIGN(4) 38 "1: \n\t" 39 "movq (%0), %%mm0 \n\t" 40 "movq (%0, %2), %%mm2 \n\t" 41 "movq %%mm0, %%mm1 \n\t" 42 "movq %%mm2, %%mm3 \n\t" 43 "punpcklbw %%mm7, %%mm0 \n\t" 44 "punpckhbw %%mm7, %%mm1 \n\t" 45 "punpcklbw %%mm7, %%mm2 \n\t" 46 "punpckhbw %%mm7, %%mm3 \n\t" 47 "movq %%mm0, (%1, %%"REG_a") \n\t" 48 "movq %%mm1, 8(%1, %%"REG_a") \n\t" 49 "movq %%mm2, 16(%1, %%"REG_a") \n\t" 50 "movq %%mm3, 24(%1, %%"REG_a") \n\t" 51 "add %3, %0 \n\t" 52 "add $32, %%"REG_a" \n\t" 53 "js 1b \n\t" 54 : "+r" (pixels) 55 : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2) 56 : "%"REG_a 57 ); 58} 59 60static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size) 61{ 62 __asm__ volatile( 63 "pxor %%xmm7, %%xmm7 \n\t" 64 "movq (%0), %%xmm0 \n\t" 65 "movq (%0, %2), %%xmm1 \n\t" 66 "movq (%0, %2,2), %%xmm2 \n\t" 67 "movq (%0, %3), %%xmm3 \n\t" 68 "lea (%0,%2,4), %0 \n\t" 69 "punpcklbw %%xmm7, %%xmm0 \n\t" 70 "punpcklbw %%xmm7, %%xmm1 \n\t" 71 "punpcklbw %%xmm7, %%xmm2 \n\t" 72 "punpcklbw %%xmm7, %%xmm3 \n\t" 73 "movdqa %%xmm0, (%1) \n\t" 74 "movdqa %%xmm1, 16(%1) \n\t" 75 "movdqa %%xmm2, 32(%1) \n\t" 76 "movdqa %%xmm3, 48(%1) \n\t" 77 "movq (%0), %%xmm0 \n\t" 78 "movq (%0, %2), %%xmm1 \n\t" 79 "movq (%0, %2,2), %%xmm2 \n\t" 80 "movq (%0, %3), %%xmm3 \n\t" 81 "punpcklbw %%xmm7, %%xmm0 \n\t" 82 "punpcklbw %%xmm7, %%xmm1 \n\t" 83 "punpcklbw %%xmm7, %%xmm2 \n\t" 84 "punpcklbw %%xmm7, %%xmm3 \n\t" 85 "movdqa %%xmm0, 64(%1) \n\t" 86 "movdqa %%xmm1, 80(%1) \n\t" 87 "movdqa %%xmm2, 96(%1) \n\t" 88 "movdqa %%xmm3, 112(%1) \n\t" 89 : "+r" (pixels) 90 : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3) 91 ); 92} 93 94static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride) 95{ 96 __asm__ volatile( 97 "pxor %%mm7, %%mm7 \n\t" 98 "mov $-128, %%"REG_a" \n\t" 99 ASMALIGN(4) 100 "1: \n\t" 101 "movq (%0), %%mm0 \n\t" 102 "movq (%1), %%mm2 \n\t" 103 "movq %%mm0, %%mm1 \n\t" 104 "movq %%mm2, %%mm3 \n\t" 105 "punpcklbw %%mm7, %%mm0 \n\t" 106 "punpckhbw %%mm7, %%mm1 \n\t" 107 "punpcklbw %%mm7, %%mm2 \n\t" 108 "punpckhbw %%mm7, %%mm3 \n\t" 109 "psubw %%mm2, %%mm0 \n\t" 110 "psubw %%mm3, %%mm1 \n\t" 111 "movq %%mm0, (%2, %%"REG_a") \n\t" 112 "movq %%mm1, 8(%2, %%"REG_a") \n\t" 113 "add %3, %0 \n\t" 114 "add %3, %1 \n\t" 115 "add $16, %%"REG_a" \n\t" 116 "jnz 1b \n\t" 117 : "+r" (s1), "+r" (s2) 118 : "r" (block+64), "r" ((x86_reg)stride) 119 : "%"REG_a 120 ); 121} 122 123static int pix_sum16_mmx(uint8_t * pix, int line_size){ 124 const int h=16; 125 int sum; 126 x86_reg index= -line_size*h; 127 128 __asm__ volatile( 129 "pxor %%mm7, %%mm7 \n\t" 130 "pxor %%mm6, %%mm6 \n\t" 131 "1: \n\t" 132 "movq (%2, %1), %%mm0 \n\t" 133 "movq (%2, %1), %%mm1 \n\t" 134 "movq 8(%2, %1), %%mm2 \n\t" 135 "movq 8(%2, %1), %%mm3 \n\t" 136 "punpcklbw %%mm7, %%mm0 \n\t" 137 "punpckhbw %%mm7, %%mm1 \n\t" 138 "punpcklbw %%mm7, %%mm2 \n\t" 139 "punpckhbw %%mm7, %%mm3 \n\t" 140 "paddw %%mm0, %%mm1 \n\t" 141 "paddw %%mm2, %%mm3 \n\t" 142 "paddw %%mm1, %%mm3 \n\t" 143 "paddw %%mm3, %%mm6 \n\t" 144 "add %3, %1 \n\t" 145 " js 1b \n\t" 146 "movq %%mm6, %%mm5 \n\t" 147 "psrlq $32, %%mm6 \n\t" 148 "paddw %%mm5, %%mm6 \n\t" 149 "movq %%mm6, %%mm5 \n\t" 150 "psrlq $16, %%mm6 \n\t" 151 "paddw %%mm5, %%mm6 \n\t" 152 "movd %%mm6, %0 \n\t" 153 "andl $0xFFFF, %0 \n\t" 154 : "=&r" (sum), "+r" (index) 155 : "r" (pix - index), "r" ((x86_reg)line_size) 156 ); 157 158 return sum; 159} 160 161static int pix_norm1_mmx(uint8_t *pix, int line_size) { 162 int tmp; 163 __asm__ volatile ( 164 "movl $16,%%ecx\n" 165 "pxor %%mm0,%%mm0\n" 166 "pxor %%mm7,%%mm7\n" 167 "1:\n" 168 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */ 169 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */ 170 171 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */ 172 173 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */ 174 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */ 175 176 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */ 177 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */ 178 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */ 179 180 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */ 181 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */ 182 183 "pmaddwd %%mm3,%%mm3\n" 184 "pmaddwd %%mm4,%%mm4\n" 185 186 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, 187 pix2^2+pix3^2+pix6^2+pix7^2) */ 188 "paddd %%mm3,%%mm4\n" 189 "paddd %%mm2,%%mm7\n" 190 191 "add %2, %0\n" 192 "paddd %%mm4,%%mm7\n" 193 "dec %%ecx\n" 194 "jnz 1b\n" 195 196 "movq %%mm7,%%mm1\n" 197 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 198 "paddd %%mm7,%%mm1\n" 199 "movd %%mm1,%1\n" 200 : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" ); 201 return tmp; 202} 203 204static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 205 int tmp; 206 __asm__ volatile ( 207 "movl %4,%%ecx\n" 208 "shr $1,%%ecx\n" 209 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 210 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 211 "1:\n" 212 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */ 213 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */ 214 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */ 215 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */ 216 217 /* todo: mm1-mm2, mm3-mm4 */ 218 /* algo: subtract mm1 from mm2 with saturation and vice versa */ 219 /* OR the results to get absolute difference */ 220 "movq %%mm1,%%mm5\n" 221 "movq %%mm3,%%mm6\n" 222 "psubusb %%mm2,%%mm1\n" 223 "psubusb %%mm4,%%mm3\n" 224 "psubusb %%mm5,%%mm2\n" 225 "psubusb %%mm6,%%mm4\n" 226 227 "por %%mm1,%%mm2\n" 228 "por %%mm3,%%mm4\n" 229 230 /* now convert to 16-bit vectors so we can square them */ 231 "movq %%mm2,%%mm1\n" 232 "movq %%mm4,%%mm3\n" 233 234 "punpckhbw %%mm0,%%mm2\n" 235 "punpckhbw %%mm0,%%mm4\n" 236 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 237 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ 238 239 "pmaddwd %%mm2,%%mm2\n" 240 "pmaddwd %%mm4,%%mm4\n" 241 "pmaddwd %%mm1,%%mm1\n" 242 "pmaddwd %%mm3,%%mm3\n" 243 244 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */ 245 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */ 246 247 "paddd %%mm2,%%mm1\n" 248 "paddd %%mm4,%%mm3\n" 249 "paddd %%mm1,%%mm7\n" 250 "paddd %%mm3,%%mm7\n" 251 252 "decl %%ecx\n" 253 "jnz 1b\n" 254 255 "movq %%mm7,%%mm1\n" 256 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 257 "paddd %%mm7,%%mm1\n" 258 "movd %%mm1,%2\n" 259 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 260 : "r" ((x86_reg)line_size) , "m" (h) 261 : "%ecx"); 262 return tmp; 263} 264 265static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 266 int tmp; 267 __asm__ volatile ( 268 "movl %4,%%ecx\n" 269 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 270 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 271 "1:\n" 272 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ 273 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ 274 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */ 275 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */ 276 277 /* todo: mm1-mm2, mm3-mm4 */ 278 /* algo: subtract mm1 from mm2 with saturation and vice versa */ 279 /* OR the results to get absolute difference */ 280 "movq %%mm1,%%mm5\n" 281 "movq %%mm3,%%mm6\n" 282 "psubusb %%mm2,%%mm1\n" 283 "psubusb %%mm4,%%mm3\n" 284 "psubusb %%mm5,%%mm2\n" 285 "psubusb %%mm6,%%mm4\n" 286 287 "por %%mm1,%%mm2\n" 288 "por %%mm3,%%mm4\n" 289 290 /* now convert to 16-bit vectors so we can square them */ 291 "movq %%mm2,%%mm1\n" 292 "movq %%mm4,%%mm3\n" 293 294 "punpckhbw %%mm0,%%mm2\n" 295 "punpckhbw %%mm0,%%mm4\n" 296 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 297 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ 298 299 "pmaddwd %%mm2,%%mm2\n" 300 "pmaddwd %%mm4,%%mm4\n" 301 "pmaddwd %%mm1,%%mm1\n" 302 "pmaddwd %%mm3,%%mm3\n" 303 304 "add %3,%0\n" 305 "add %3,%1\n" 306 307 "paddd %%mm2,%%mm1\n" 308 "paddd %%mm4,%%mm3\n" 309 "paddd %%mm1,%%mm7\n" 310 "paddd %%mm3,%%mm7\n" 311 312 "decl %%ecx\n" 313 "jnz 1b\n" 314 315 "movq %%mm7,%%mm1\n" 316 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 317 "paddd %%mm7,%%mm1\n" 318 "movd %%mm1,%2\n" 319 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 320 : "r" ((x86_reg)line_size) , "m" (h) 321 : "%ecx"); 322 return tmp; 323} 324 325static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 326 int tmp; 327 __asm__ volatile ( 328 "shr $1,%2\n" 329 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */ 330 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */ 331 "1:\n" 332 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */ 333 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */ 334 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */ 335 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */ 336 337 /* todo: mm1-mm2, mm3-mm4 */ 338 /* algo: subtract mm1 from mm2 with saturation and vice versa */ 339 /* OR the results to get absolute difference */ 340 "movdqa %%xmm1,%%xmm5\n" 341 "movdqa %%xmm3,%%xmm6\n" 342 "psubusb %%xmm2,%%xmm1\n" 343 "psubusb %%xmm4,%%xmm3\n" 344 "psubusb %%xmm5,%%xmm2\n" 345 "psubusb %%xmm6,%%xmm4\n" 346 347 "por %%xmm1,%%xmm2\n" 348 "por %%xmm3,%%xmm4\n" 349 350 /* now convert to 16-bit vectors so we can square them */ 351 "movdqa %%xmm2,%%xmm1\n" 352 "movdqa %%xmm4,%%xmm3\n" 353 354 "punpckhbw %%xmm0,%%xmm2\n" 355 "punpckhbw %%xmm0,%%xmm4\n" 356 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */ 357 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */ 358 359 "pmaddwd %%xmm2,%%xmm2\n" 360 "pmaddwd %%xmm4,%%xmm4\n" 361 "pmaddwd %%xmm1,%%xmm1\n" 362 "pmaddwd %%xmm3,%%xmm3\n" 363 364 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */ 365 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */ 366 367 "paddd %%xmm2,%%xmm1\n" 368 "paddd %%xmm4,%%xmm3\n" 369 "paddd %%xmm1,%%xmm7\n" 370 "paddd %%xmm3,%%xmm7\n" 371 372 "decl %2\n" 373 "jnz 1b\n" 374 375 "movdqa %%xmm7,%%xmm1\n" 376 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */ 377 "paddd %%xmm1,%%xmm7\n" 378 "movdqa %%xmm7,%%xmm1\n" 379 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */ 380 "paddd %%xmm1,%%xmm7\n" 381 "movd %%xmm7,%3\n" 382 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp) 383 : "r" ((x86_reg)line_size)); 384 return tmp; 385} 386 387static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { 388 int tmp; 389 __asm__ volatile ( 390 "movl %3,%%ecx\n" 391 "pxor %%mm7,%%mm7\n" 392 "pxor %%mm6,%%mm6\n" 393 394 "movq (%0),%%mm0\n" 395 "movq %%mm0, %%mm1\n" 396 "psllq $8, %%mm0\n" 397 "psrlq $8, %%mm1\n" 398 "psrlq $8, %%mm0\n" 399 "movq %%mm0, %%mm2\n" 400 "movq %%mm1, %%mm3\n" 401 "punpcklbw %%mm7,%%mm0\n" 402 "punpcklbw %%mm7,%%mm1\n" 403 "punpckhbw %%mm7,%%mm2\n" 404 "punpckhbw %%mm7,%%mm3\n" 405 "psubw %%mm1, %%mm0\n" 406 "psubw %%mm3, %%mm2\n" 407 408 "add %2,%0\n" 409 410 "movq (%0),%%mm4\n" 411 "movq %%mm4, %%mm1\n" 412 "psllq $8, %%mm4\n" 413 "psrlq $8, %%mm1\n" 414 "psrlq $8, %%mm4\n" 415 "movq %%mm4, %%mm5\n" 416 "movq %%mm1, %%mm3\n" 417 "punpcklbw %%mm7,%%mm4\n" 418 "punpcklbw %%mm7,%%mm1\n" 419 "punpckhbw %%mm7,%%mm5\n" 420 "punpckhbw %%mm7,%%mm3\n" 421 "psubw %%mm1, %%mm4\n" 422 "psubw %%mm3, %%mm5\n" 423 "psubw %%mm4, %%mm0\n" 424 "psubw %%mm5, %%mm2\n" 425 "pxor %%mm3, %%mm3\n" 426 "pxor %%mm1, %%mm1\n" 427 "pcmpgtw %%mm0, %%mm3\n\t" 428 "pcmpgtw %%mm2, %%mm1\n\t" 429 "pxor %%mm3, %%mm0\n" 430 "pxor %%mm1, %%mm2\n" 431 "psubw %%mm3, %%mm0\n" 432 "psubw %%mm1, %%mm2\n" 433 "paddw %%mm0, %%mm2\n" 434 "paddw %%mm2, %%mm6\n" 435 436 "add %2,%0\n" 437 "1:\n" 438 439 "movq (%0),%%mm0\n" 440 "movq %%mm0, %%mm1\n" 441 "psllq $8, %%mm0\n" 442 "psrlq $8, %%mm1\n" 443 "psrlq $8, %%mm0\n" 444 "movq %%mm0, %%mm2\n" 445 "movq %%mm1, %%mm3\n" 446 "punpcklbw %%mm7,%%mm0\n" 447 "punpcklbw %%mm7,%%mm1\n" 448 "punpckhbw %%mm7,%%mm2\n" 449 "punpckhbw %%mm7,%%mm3\n" 450 "psubw %%mm1, %%mm0\n" 451 "psubw %%mm3, %%mm2\n" 452 "psubw %%mm0, %%mm4\n" 453 "psubw %%mm2, %%mm5\n" 454 "pxor %%mm3, %%mm3\n" 455 "pxor %%mm1, %%mm1\n" 456 "pcmpgtw %%mm4, %%mm3\n\t" 457 "pcmpgtw %%mm5, %%mm1\n\t" 458 "pxor %%mm3, %%mm4\n" 459 "pxor %%mm1, %%mm5\n" 460 "psubw %%mm3, %%mm4\n" 461 "psubw %%mm1, %%mm5\n" 462 "paddw %%mm4, %%mm5\n" 463 "paddw %%mm5, %%mm6\n" 464 465 "add %2,%0\n" 466 467 "movq (%0),%%mm4\n" 468 "movq %%mm4, %%mm1\n" 469 "psllq $8, %%mm4\n" 470 "psrlq $8, %%mm1\n" 471 "psrlq $8, %%mm4\n" 472 "movq %%mm4, %%mm5\n" 473 "movq %%mm1, %%mm3\n" 474 "punpcklbw %%mm7,%%mm4\n" 475 "punpcklbw %%mm7,%%mm1\n" 476 "punpckhbw %%mm7,%%mm5\n" 477 "punpckhbw %%mm7,%%mm3\n" 478 "psubw %%mm1, %%mm4\n" 479 "psubw %%mm3, %%mm5\n" 480 "psubw %%mm4, %%mm0\n" 481 "psubw %%mm5, %%mm2\n" 482 "pxor %%mm3, %%mm3\n" 483 "pxor %%mm1, %%mm1\n" 484 "pcmpgtw %%mm0, %%mm3\n\t" 485 "pcmpgtw %%mm2, %%mm1\n\t" 486 "pxor %%mm3, %%mm0\n" 487 "pxor %%mm1, %%mm2\n" 488 "psubw %%mm3, %%mm0\n" 489 "psubw %%mm1, %%mm2\n" 490 "paddw %%mm0, %%mm2\n" 491 "paddw %%mm2, %%mm6\n" 492 493 "add %2,%0\n" 494 "subl $2, %%ecx\n" 495 " jnz 1b\n" 496 497 "movq %%mm6, %%mm0\n" 498 "punpcklwd %%mm7,%%mm0\n" 499 "punpckhwd %%mm7,%%mm6\n" 500 "paddd %%mm0, %%mm6\n" 501 502 "movq %%mm6,%%mm0\n" 503 "psrlq $32, %%mm6\n" 504 "paddd %%mm6,%%mm0\n" 505 "movd %%mm0,%1\n" 506 : "+r" (pix1), "=r"(tmp) 507 : "r" ((x86_reg)line_size) , "g" (h-2) 508 : "%ecx"); 509 return tmp; 510} 511 512static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { 513 int tmp; 514 uint8_t * pix= pix1; 515 __asm__ volatile ( 516 "movl %3,%%ecx\n" 517 "pxor %%mm7,%%mm7\n" 518 "pxor %%mm6,%%mm6\n" 519 520 "movq (%0),%%mm0\n" 521 "movq 1(%0),%%mm1\n" 522 "movq %%mm0, %%mm2\n" 523 "movq %%mm1, %%mm3\n" 524 "punpcklbw %%mm7,%%mm0\n" 525 "punpcklbw %%mm7,%%mm1\n" 526 "punpckhbw %%mm7,%%mm2\n" 527 "punpckhbw %%mm7,%%mm3\n" 528 "psubw %%mm1, %%mm0\n" 529 "psubw %%mm3, %%mm2\n" 530 531 "add %2,%0\n" 532 533 "movq (%0),%%mm4\n" 534 "movq 1(%0),%%mm1\n" 535 "movq %%mm4, %%mm5\n" 536 "movq %%mm1, %%mm3\n" 537 "punpcklbw %%mm7,%%mm4\n" 538 "punpcklbw %%mm7,%%mm1\n" 539 "punpckhbw %%mm7,%%mm5\n" 540 "punpckhbw %%mm7,%%mm3\n" 541 "psubw %%mm1, %%mm4\n" 542 "psubw %%mm3, %%mm5\n" 543 "psubw %%mm4, %%mm0\n" 544 "psubw %%mm5, %%mm2\n" 545 "pxor %%mm3, %%mm3\n" 546 "pxor %%mm1, %%mm1\n" 547 "pcmpgtw %%mm0, %%mm3\n\t" 548 "pcmpgtw %%mm2, %%mm1\n\t" 549 "pxor %%mm3, %%mm0\n" 550 "pxor %%mm1, %%mm2\n" 551 "psubw %%mm3, %%mm0\n" 552 "psubw %%mm1, %%mm2\n" 553 "paddw %%mm0, %%mm2\n" 554 "paddw %%mm2, %%mm6\n" 555 556 "add %2,%0\n" 557 "1:\n" 558 559 "movq (%0),%%mm0\n" 560 "movq 1(%0),%%mm1\n" 561 "movq %%mm0, %%mm2\n" 562 "movq %%mm1, %%mm3\n" 563 "punpcklbw %%mm7,%%mm0\n" 564 "punpcklbw %%mm7,%%mm1\n" 565 "punpckhbw %%mm7,%%mm2\n" 566 "punpckhbw %%mm7,%%mm3\n" 567 "psubw %%mm1, %%mm0\n" 568 "psubw %%mm3, %%mm2\n" 569 "psubw %%mm0, %%mm4\n" 570 "psubw %%mm2, %%mm5\n" 571 "pxor %%mm3, %%mm3\n" 572 "pxor %%mm1, %%mm1\n" 573 "pcmpgtw %%mm4, %%mm3\n\t" 574 "pcmpgtw %%mm5, %%mm1\n\t" 575 "pxor %%mm3, %%mm4\n" 576 "pxor %%mm1, %%mm5\n" 577 "psubw %%mm3, %%mm4\n" 578 "psubw %%mm1, %%mm5\n" 579 "paddw %%mm4, %%mm5\n" 580 "paddw %%mm5, %%mm6\n" 581 582 "add %2,%0\n" 583 584 "movq (%0),%%mm4\n" 585 "movq 1(%0),%%mm1\n" 586 "movq %%mm4, %%mm5\n" 587 "movq %%mm1, %%mm3\n" 588 "punpcklbw %%mm7,%%mm4\n" 589 "punpcklbw %%mm7,%%mm1\n" 590 "punpckhbw %%mm7,%%mm5\n" 591 "punpckhbw %%mm7,%%mm3\n" 592 "psubw %%mm1, %%mm4\n" 593 "psubw %%mm3, %%mm5\n" 594 "psubw %%mm4, %%mm0\n" 595 "psubw %%mm5, %%mm2\n" 596 "pxor %%mm3, %%mm3\n" 597 "pxor %%mm1, %%mm1\n" 598 "pcmpgtw %%mm0, %%mm3\n\t" 599 "pcmpgtw %%mm2, %%mm1\n\t" 600 "pxor %%mm3, %%mm0\n" 601 "pxor %%mm1, %%mm2\n" 602 "psubw %%mm3, %%mm0\n" 603 "psubw %%mm1, %%mm2\n" 604 "paddw %%mm0, %%mm2\n" 605 "paddw %%mm2, %%mm6\n" 606 607 "add %2,%0\n" 608 "subl $2, %%ecx\n" 609 " jnz 1b\n" 610 611 "movq %%mm6, %%mm0\n" 612 "punpcklwd %%mm7,%%mm0\n" 613 "punpckhwd %%mm7,%%mm6\n" 614 "paddd %%mm0, %%mm6\n" 615 616 "movq %%mm6,%%mm0\n" 617 "psrlq $32, %%mm6\n" 618 "paddd %%mm6,%%mm0\n" 619 "movd %%mm0,%1\n" 620 : "+r" (pix1), "=r"(tmp) 621 : "r" ((x86_reg)line_size) , "g" (h-2) 622 : "%ecx"); 623 return tmp + hf_noise8_mmx(pix+8, line_size, h); 624} 625 626static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 627 MpegEncContext *c = p; 628 int score1, score2; 629 630 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); 631 else score1 = sse16_mmx(c, pix1, pix2, line_size, h); 632 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h); 633 634 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; 635 else return score1 + FFABS(score2)*8; 636} 637 638static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 639 MpegEncContext *c = p; 640 int score1= sse8_mmx(c, pix1, pix2, line_size, h); 641 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h); 642 643 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; 644 else return score1 + FFABS(score2)*8; 645} 646 647static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { 648 int tmp; 649 650 assert( (((int)pix) & 7) == 0); 651 assert((line_size &7) ==0); 652 653#define SUM(in0, in1, out0, out1) \ 654 "movq (%0), %%mm2\n"\ 655 "movq 8(%0), %%mm3\n"\ 656 "add %2,%0\n"\ 657 "movq %%mm2, " #out0 "\n"\ 658 "movq %%mm3, " #out1 "\n"\ 659 "psubusb " #in0 ", %%mm2\n"\ 660 "psubusb " #in1 ", %%mm3\n"\ 661 "psubusb " #out0 ", " #in0 "\n"\ 662 "psubusb " #out1 ", " #in1 "\n"\ 663 "por %%mm2, " #in0 "\n"\ 664 "por %%mm3, " #in1 "\n"\ 665 "movq " #in0 ", %%mm2\n"\ 666 "movq " #in1 ", %%mm3\n"\ 667 "punpcklbw %%mm7, " #in0 "\n"\ 668 "punpcklbw %%mm7, " #in1 "\n"\ 669 "punpckhbw %%mm7, %%mm2\n"\ 670 "punpckhbw %%mm7, %%mm3\n"\ 671 "paddw " #in1 ", " #in0 "\n"\ 672 "paddw %%mm3, %%mm2\n"\ 673 "paddw %%mm2, " #in0 "\n"\ 674 "paddw " #in0 ", %%mm6\n" 675 676 677 __asm__ volatile ( 678 "movl %3,%%ecx\n" 679 "pxor %%mm6,%%mm6\n" 680 "pxor %%mm7,%%mm7\n" 681 "movq (%0),%%mm0\n" 682 "movq 8(%0),%%mm1\n" 683 "add %2,%0\n" 684 "jmp 2f\n" 685 "1:\n" 686 687 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 688 "2:\n" 689 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 690 691 "subl $2, %%ecx\n" 692 "jnz 1b\n" 693 694 "movq %%mm6,%%mm0\n" 695 "psrlq $32, %%mm6\n" 696 "paddw %%mm6,%%mm0\n" 697 "movq %%mm0,%%mm6\n" 698 "psrlq $16, %%mm0\n" 699 "paddw %%mm6,%%mm0\n" 700 "movd %%mm0,%1\n" 701 : "+r" (pix), "=r"(tmp) 702 : "r" ((x86_reg)line_size) , "m" (h) 703 : "%ecx"); 704 return tmp & 0xFFFF; 705} 706#undef SUM 707 708static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { 709 int tmp; 710 711 assert( (((int)pix) & 7) == 0); 712 assert((line_size &7) ==0); 713 714#define SUM(in0, in1, out0, out1) \ 715 "movq (%0), " #out0 "\n"\ 716 "movq 8(%0), " #out1 "\n"\ 717 "add %2,%0\n"\ 718 "psadbw " #out0 ", " #in0 "\n"\ 719 "psadbw " #out1 ", " #in1 "\n"\ 720 "paddw " #in1 ", " #in0 "\n"\ 721 "paddw " #in0 ", %%mm6\n" 722 723 __asm__ volatile ( 724 "movl %3,%%ecx\n" 725 "pxor %%mm6,%%mm6\n" 726 "pxor %%mm7,%%mm7\n" 727 "movq (%0),%%mm0\n" 728 "movq 8(%0),%%mm1\n" 729 "add %2,%0\n" 730 "jmp 2f\n" 731 "1:\n" 732 733 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 734 "2:\n" 735 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 736 737 "subl $2, %%ecx\n" 738 "jnz 1b\n" 739 740 "movd %%mm6,%1\n" 741 : "+r" (pix), "=r"(tmp) 742 : "r" ((x86_reg)line_size) , "m" (h) 743 : "%ecx"); 744 return tmp; 745} 746#undef SUM 747 748static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 749 int tmp; 750 751 assert( (((int)pix1) & 7) == 0); 752 assert( (((int)pix2) & 7) == 0); 753 assert((line_size &7) ==0); 754 755#define SUM(in0, in1, out0, out1) \ 756 "movq (%0),%%mm2\n"\ 757 "movq (%1)," #out0 "\n"\ 758 "movq 8(%0),%%mm3\n"\ 759 "movq 8(%1)," #out1 "\n"\ 760 "add %3,%0\n"\ 761 "add %3,%1\n"\ 762 "psubb " #out0 ", %%mm2\n"\ 763 "psubb " #out1 ", %%mm3\n"\ 764 "pxor %%mm7, %%mm2\n"\ 765 "pxor %%mm7, %%mm3\n"\ 766 "movq %%mm2, " #out0 "\n"\ 767 "movq %%mm3, " #out1 "\n"\ 768 "psubusb " #in0 ", %%mm2\n"\ 769 "psubusb " #in1 ", %%mm3\n"\ 770 "psubusb " #out0 ", " #in0 "\n"\ 771 "psubusb " #out1 ", " #in1 "\n"\ 772 "por %%mm2, " #in0 "\n"\ 773 "por %%mm3, " #in1 "\n"\ 774 "movq " #in0 ", %%mm2\n"\ 775 "movq " #in1 ", %%mm3\n"\ 776 "punpcklbw %%mm7, " #in0 "\n"\ 777 "punpcklbw %%mm7, " #in1 "\n"\ 778 "punpckhbw %%mm7, %%mm2\n"\ 779 "punpckhbw %%mm7, %%mm3\n"\ 780 "paddw " #in1 ", " #in0 "\n"\ 781 "paddw %%mm3, %%mm2\n"\ 782 "paddw %%mm2, " #in0 "\n"\ 783 "paddw " #in0 ", %%mm6\n" 784 785 786 __asm__ volatile ( 787 "movl %4,%%ecx\n" 788 "pxor %%mm6,%%mm6\n" 789 "pcmpeqw %%mm7,%%mm7\n" 790 "psllw $15, %%mm7\n" 791 "packsswb %%mm7, %%mm7\n" 792 "movq (%0),%%mm0\n" 793 "movq (%1),%%mm2\n" 794 "movq 8(%0),%%mm1\n" 795 "movq 8(%1),%%mm3\n" 796 "add %3,%0\n" 797 "add %3,%1\n" 798 "psubb %%mm2, %%mm0\n" 799 "psubb %%mm3, %%mm1\n" 800 "pxor %%mm7, %%mm0\n" 801 "pxor %%mm7, %%mm1\n" 802 "jmp 2f\n" 803 "1:\n" 804 805 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 806 "2:\n" 807 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 808 809 "subl $2, %%ecx\n" 810 "jnz 1b\n" 811 812 "movq %%mm6,%%mm0\n" 813 "psrlq $32, %%mm6\n" 814 "paddw %%mm6,%%mm0\n" 815 "movq %%mm0,%%mm6\n" 816 "psrlq $16, %%mm0\n" 817 "paddw %%mm6,%%mm0\n" 818 "movd %%mm0,%2\n" 819 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 820 : "r" ((x86_reg)line_size) , "m" (h) 821 : "%ecx"); 822 return tmp & 0x7FFF; 823} 824#undef SUM 825 826static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 827 int tmp; 828 829 assert( (((int)pix1) & 7) == 0); 830 assert( (((int)pix2) & 7) == 0); 831 assert((line_size &7) ==0); 832 833#define SUM(in0, in1, out0, out1) \ 834 "movq (%0)," #out0 "\n"\ 835 "movq (%1),%%mm2\n"\ 836 "movq 8(%0)," #out1 "\n"\ 837 "movq 8(%1),%%mm3\n"\ 838 "add %3,%0\n"\ 839 "add %3,%1\n"\ 840 "psubb %%mm2, " #out0 "\n"\ 841 "psubb %%mm3, " #out1 "\n"\ 842 "pxor %%mm7, " #out0 "\n"\ 843 "pxor %%mm7, " #out1 "\n"\ 844 "psadbw " #out0 ", " #in0 "\n"\ 845 "psadbw " #out1 ", " #in1 "\n"\ 846 "paddw " #in1 ", " #in0 "\n"\ 847 "paddw " #in0 ", %%mm6\n" 848 849 __asm__ volatile ( 850 "movl %4,%%ecx\n" 851 "pxor %%mm6,%%mm6\n" 852 "pcmpeqw %%mm7,%%mm7\n" 853 "psllw $15, %%mm7\n" 854 "packsswb %%mm7, %%mm7\n" 855 "movq (%0),%%mm0\n" 856 "movq (%1),%%mm2\n" 857 "movq 8(%0),%%mm1\n" 858 "movq 8(%1),%%mm3\n" 859 "add %3,%0\n" 860 "add %3,%1\n" 861 "psubb %%mm2, %%mm0\n" 862 "psubb %%mm3, %%mm1\n" 863 "pxor %%mm7, %%mm0\n" 864 "pxor %%mm7, %%mm1\n" 865 "jmp 2f\n" 866 "1:\n" 867 868 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 869 "2:\n" 870 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 871 872 "subl $2, %%ecx\n" 873 "jnz 1b\n" 874 875 "movd %%mm6,%2\n" 876 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 877 : "r" ((x86_reg)line_size) , "m" (h) 878 : "%ecx"); 879 return tmp; 880} 881#undef SUM 882 883static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ 884 x86_reg i=0; 885 __asm__ volatile( 886 "1: \n\t" 887 "movq (%2, %0), %%mm0 \n\t" 888 "movq (%1, %0), %%mm1 \n\t" 889 "psubb %%mm0, %%mm1 \n\t" 890 "movq %%mm1, (%3, %0) \n\t" 891 "movq 8(%2, %0), %%mm0 \n\t" 892 "movq 8(%1, %0), %%mm1 \n\t" 893 "psubb %%mm0, %%mm1 \n\t" 894 "movq %%mm1, 8(%3, %0) \n\t" 895 "add $16, %0 \n\t" 896 "cmp %4, %0 \n\t" 897 " jb 1b \n\t" 898 : "+r" (i) 899 : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15) 900 ); 901 for(; i<w; i++) 902 dst[i+0] = src1[i+0]-src2[i+0]; 903} 904 905static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){ 906 x86_reg i=0; 907 uint8_t l, lt; 908 909 __asm__ volatile( 910 "1: \n\t" 911 "movq -1(%1, %0), %%mm0 \n\t" // LT 912 "movq (%1, %0), %%mm1 \n\t" // T 913 "movq -1(%2, %0), %%mm2 \n\t" // L 914 "movq (%2, %0), %%mm3 \n\t" // X 915 "movq %%mm2, %%mm4 \n\t" // L 916 "psubb %%mm0, %%mm2 \n\t" 917 "paddb %%mm1, %%mm2 \n\t" // L + T - LT 918 "movq %%mm4, %%mm5 \n\t" // L 919 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L) 920 "pminub %%mm5, %%mm1 \n\t" // min(T, L) 921 "pminub %%mm2, %%mm4 \n\t" 922 "pmaxub %%mm1, %%mm4 \n\t" 923 "psubb %%mm4, %%mm3 \n\t" // dst - pred 924 "movq %%mm3, (%3, %0) \n\t" 925 "add $8, %0 \n\t" 926 "cmp %4, %0 \n\t" 927 " jb 1b \n\t" 928 : "+r" (i) 929 : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w) 930 ); 931 932 l= *left; 933 lt= *left_top; 934 935 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF); 936 937 *left_top= src1[w-1]; 938 *left = src2[w-1]; 939} 940 941#define DIFF_PIXELS_1(m,a,t,p1,p2)\ 942 "mov"#m" "#p1", "#a" \n\t"\ 943 "mov"#m" "#p2", "#t" \n\t"\ 944 "punpcklbw "#a", "#t" \n\t"\ 945 "punpcklbw "#a", "#a" \n\t"\ 946 "psubw "#t", "#a" \n\t"\ 947 948#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\ 949 uint8_t *p1b=p1, *p2b=p2;\ 950 __asm__ volatile(\ 951 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\ 952 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\ 953 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\ 954 "add %4, %1 \n\t"\ 955 "add %4, %2 \n\t"\ 956 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\ 957 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\ 958 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\ 959 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\ 960 "mov"#m1" "#mm"0, %0 \n\t"\ 961 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\ 962 "mov"#m1" %0, "#mm"0 \n\t"\ 963 : "+m"(temp), "+r"(p1b), "+r"(p2b)\ 964 : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\ 965 );\ 966} 967 //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp) 968 969#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp) 970#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp) 971 972#define LBUTTERFLY2(a1,b1,a2,b2)\ 973 "paddw " #b1 ", " #a1 " \n\t"\ 974 "paddw " #b2 ", " #a2 " \n\t"\ 975 "paddw " #b1 ", " #b1 " \n\t"\ 976 "paddw " #b2 ", " #b2 " \n\t"\ 977 "psubw " #a1 ", " #b1 " \n\t"\ 978 "psubw " #a2 ", " #b2 " \n\t" 979 980#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\ 981 LBUTTERFLY2(m0, m1, m2, m3)\ 982 LBUTTERFLY2(m4, m5, m6, m7)\ 983 LBUTTERFLY2(m0, m2, m1, m3)\ 984 LBUTTERFLY2(m4, m6, m5, m7)\ 985 LBUTTERFLY2(m0, m4, m1, m5)\ 986 LBUTTERFLY2(m2, m6, m3, m7)\ 987 988#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7) 989 990#define MMABS_MMX(a,z)\ 991 "pxor " #z ", " #z " \n\t"\ 992 "pcmpgtw " #a ", " #z " \n\t"\ 993 "pxor " #z ", " #a " \n\t"\ 994 "psubw " #z ", " #a " \n\t" 995 996#define MMABS_MMX2(a,z)\ 997 "pxor " #z ", " #z " \n\t"\ 998 "psubw " #a ", " #z " \n\t"\ 999 "pmaxsw " #z ", " #a " \n\t" 1000 1001#define MMABS_SSSE3(a,z)\ 1002 "pabsw " #a ", " #a " \n\t" 1003 1004#define MMABS_SUM(a,z, sum)\ 1005 MMABS(a,z)\ 1006 "paddusw " #a ", " #sum " \n\t" 1007 1008#define MMABS_SUM_8x8_NOSPILL\ 1009 MMABS(%%xmm0, %%xmm8)\ 1010 MMABS(%%xmm1, %%xmm9)\ 1011 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\ 1012 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\ 1013 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\ 1014 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\ 1015 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\ 1016 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\ 1017 "paddusw %%xmm1, %%xmm0 \n\t" 1018 1019#if ARCH_X86_64 1020#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL 1021#else 1022#define MMABS_SUM_8x8_SSE2\ 1023 "movdqa %%xmm7, (%1) \n\t"\ 1024 MMABS(%%xmm0, %%xmm7)\ 1025 MMABS(%%xmm1, %%xmm7)\ 1026 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\ 1027 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\ 1028 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\ 1029 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\ 1030 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\ 1031 "movdqa (%1), %%xmm2 \n\t"\ 1032 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\ 1033 "paddusw %%xmm1, %%xmm0 \n\t" 1034#endif 1035 1036/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to 1037 * about 100k on extreme inputs. But that's very unlikely to occur in natural video, 1038 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */ 1039#define HSUM_MMX(a, t, dst)\ 1040 "movq "#a", "#t" \n\t"\ 1041 "psrlq $32, "#a" \n\t"\ 1042 "paddusw "#t", "#a" \n\t"\ 1043 "movq "#a", "#t" \n\t"\ 1044 "psrlq $16, "#a" \n\t"\ 1045 "paddusw "#t", "#a" \n\t"\ 1046 "movd "#a", "#dst" \n\t"\ 1047 1048#define HSUM_MMX2(a, t, dst)\ 1049 "pshufw $0x0E, "#a", "#t" \n\t"\ 1050 "paddusw "#t", "#a" \n\t"\ 1051 "pshufw $0x01, "#a", "#t" \n\t"\ 1052 "paddusw "#t", "#a" \n\t"\ 1053 "movd "#a", "#dst" \n\t"\ 1054 1055#define HSUM_SSE2(a, t, dst)\ 1056 "movhlps "#a", "#t" \n\t"\ 1057 "paddusw "#t", "#a" \n\t"\ 1058 "pshuflw $0x0E, "#a", "#t" \n\t"\ 1059 "paddusw "#t", "#a" \n\t"\ 1060 "pshuflw $0x01, "#a", "#t" \n\t"\ 1061 "paddusw "#t", "#a" \n\t"\ 1062 "movd "#a", "#dst" \n\t"\ 1063 1064#define HADAMARD8_DIFF_MMX(cpu) \ 1065static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ 1066 DECLARE_ALIGNED_8(uint64_t, temp[13]);\ 1067 int sum;\ 1068\ 1069 assert(h==8);\ 1070\ 1071 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\ 1072\ 1073 __asm__ volatile(\ 1074 HADAMARD48\ 1075\ 1076 "movq %%mm7, 96(%1) \n\t"\ 1077\ 1078 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ 1079 STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\ 1080\ 1081 "movq 96(%1), %%mm7 \n\t"\ 1082 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ 1083 STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\ 1084\ 1085 : "=r" (sum)\ 1086 : "r"(temp)\ 1087 );\ 1088\ 1089 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\ 1090\ 1091 __asm__ volatile(\ 1092 HADAMARD48\ 1093\ 1094 "movq %%mm7, 96(%1) \n\t"\ 1095\ 1096 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ 1097 STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\ 1098\ 1099 "movq 96(%1), %%mm7 \n\t"\ 1100 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ 1101 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\ 1102 "movq %%mm6, %%mm7 \n\t"\ 1103 "movq %%mm0, %%mm6 \n\t"\ 1104\ 1105 LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\ 1106\ 1107 HADAMARD48\ 1108 "movq %%mm7, 64(%1) \n\t"\ 1109 MMABS(%%mm0, %%mm7)\ 1110 MMABS(%%mm1, %%mm7)\ 1111 MMABS_SUM(%%mm2, %%mm7, %%mm0)\ 1112 MMABS_SUM(%%mm3, %%mm7, %%mm1)\ 1113 MMABS_SUM(%%mm4, %%mm7, %%mm0)\ 1114 MMABS_SUM(%%mm5, %%mm7, %%mm1)\ 1115 MMABS_SUM(%%mm6, %%mm7, %%mm0)\ 1116 "movq 64(%1), %%mm2 \n\t"\ 1117 MMABS_SUM(%%mm2, %%mm7, %%mm1)\ 1118 "paddusw %%mm1, %%mm0 \n\t"\ 1119 "movq %%mm0, 64(%1) \n\t"\ 1120\ 1121 LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\ 1122 LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\ 1123\ 1124 HADAMARD48\ 1125 "movq %%mm7, (%1) \n\t"\ 1126 MMABS(%%mm0, %%mm7)\ 1127 MMABS(%%mm1, %%mm7)\ 1128 MMABS_SUM(%%mm2, %%mm7, %%mm0)\ 1129 MMABS_SUM(%%mm3, %%mm7, %%mm1)\ 1130 MMABS_SUM(%%mm4, %%mm7, %%mm0)\ 1131 MMABS_SUM(%%mm5, %%mm7, %%mm1)\ 1132 MMABS_SUM(%%mm6, %%mm7, %%mm0)\ 1133 "movq (%1), %%mm2 \n\t"\ 1134 MMABS_SUM(%%mm2, %%mm7, %%mm1)\ 1135 "paddusw 64(%1), %%mm0 \n\t"\ 1136 "paddusw %%mm1, %%mm0 \n\t"\ 1137\ 1138 HSUM(%%mm0, %%mm1, %0)\ 1139\ 1140 : "=r" (sum)\ 1141 : "r"(temp)\ 1142 );\ 1143 return sum&0xFFFF;\ 1144}\ 1145WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) 1146 1147#define HADAMARD8_DIFF_SSE2(cpu) \ 1148static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ 1149 DECLARE_ALIGNED_16(uint64_t, temp[4]);\ 1150 int sum;\ 1151\ 1152 assert(h==8);\ 1153\ 1154 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\ 1155\ 1156 __asm__ volatile(\ 1157 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\ 1158 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\ 1159 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\ 1160 MMABS_SUM_8x8\ 1161 HSUM_SSE2(%%xmm0, %%xmm1, %0)\ 1162 : "=r" (sum)\ 1163 : "r"(temp)\ 1164 );\ 1165 return sum&0xFFFF;\ 1166}\ 1167WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) 1168 1169#define MMABS(a,z) MMABS_MMX(a,z) 1170#define HSUM(a,t,dst) HSUM_MMX(a,t,dst) 1171HADAMARD8_DIFF_MMX(mmx) 1172#undef MMABS 1173#undef HSUM 1174 1175#define MMABS(a,z) MMABS_MMX2(a,z) 1176#define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2 1177#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) 1178HADAMARD8_DIFF_MMX(mmx2) 1179HADAMARD8_DIFF_SSE2(sse2) 1180#undef MMABS 1181#undef MMABS_SUM_8x8 1182#undef HSUM 1183 1184#if HAVE_SSSE3 1185#define MMABS(a,z) MMABS_SSSE3(a,z) 1186#define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL 1187HADAMARD8_DIFF_SSE2(ssse3) 1188#undef MMABS 1189#undef MMABS_SUM_8x8 1190#endif 1191 1192#define DCT_SAD4(m,mm,o)\ 1193 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\ 1194 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\ 1195 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\ 1196 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\ 1197 MMABS_SUM(mm##2, mm##6, mm##0)\ 1198 MMABS_SUM(mm##3, mm##7, mm##1)\ 1199 MMABS_SUM(mm##4, mm##6, mm##0)\ 1200 MMABS_SUM(mm##5, mm##7, mm##1)\ 1201 1202#define DCT_SAD_MMX\ 1203 "pxor %%mm0, %%mm0 \n\t"\ 1204 "pxor %%mm1, %%mm1 \n\t"\ 1205 DCT_SAD4(q, %%mm, 0)\ 1206 DCT_SAD4(q, %%mm, 8)\ 1207 DCT_SAD4(q, %%mm, 64)\ 1208 DCT_SAD4(q, %%mm, 72)\ 1209 "paddusw %%mm1, %%mm0 \n\t"\ 1210 HSUM(%%mm0, %%mm1, %0) 1211 1212#define DCT_SAD_SSE2\ 1213 "pxor %%xmm0, %%xmm0 \n\t"\ 1214 "pxor %%xmm1, %%xmm1 \n\t"\ 1215 DCT_SAD4(dqa, %%xmm, 0)\ 1216 DCT_SAD4(dqa, %%xmm, 64)\ 1217 "paddusw %%xmm1, %%xmm0 \n\t"\ 1218 HSUM(%%xmm0, %%xmm1, %0) 1219 1220#define DCT_SAD_FUNC(cpu) \ 1221static int sum_abs_dctelem_##cpu(DCTELEM *block){\ 1222 int sum;\ 1223 __asm__ volatile(\ 1224 DCT_SAD\ 1225 :"=r"(sum)\ 1226 :"r"(block)\ 1227 );\ 1228 return sum&0xFFFF;\ 1229} 1230 1231#define DCT_SAD DCT_SAD_MMX 1232#define HSUM(a,t,dst) HSUM_MMX(a,t,dst) 1233#define MMABS(a,z) MMABS_MMX(a,z) 1234DCT_SAD_FUNC(mmx) 1235#undef MMABS 1236#undef HSUM 1237 1238#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) 1239#define MMABS(a,z) MMABS_MMX2(a,z) 1240DCT_SAD_FUNC(mmx2) 1241#undef HSUM 1242#undef DCT_SAD 1243 1244#define DCT_SAD DCT_SAD_SSE2 1245#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst) 1246DCT_SAD_FUNC(sse2) 1247#undef MMABS 1248 1249#if HAVE_SSSE3 1250#define MMABS(a,z) MMABS_SSSE3(a,z) 1251DCT_SAD_FUNC(ssse3) 1252#undef MMABS 1253#endif 1254#undef HSUM 1255#undef DCT_SAD 1256 1257static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){ 1258 int sum; 1259 x86_reg i=size; 1260 __asm__ volatile( 1261 "pxor %%mm4, %%mm4 \n" 1262 "1: \n" 1263 "sub $8, %0 \n" 1264 "movq (%2,%0), %%mm2 \n" 1265 "movq (%3,%0,2), %%mm0 \n" 1266 "movq 8(%3,%0,2), %%mm1 \n" 1267 "punpckhbw %%mm2, %%mm3 \n" 1268 "punpcklbw %%mm2, %%mm2 \n" 1269 "psraw $8, %%mm3 \n" 1270 "psraw $8, %%mm2 \n" 1271 "psubw %%mm3, %%mm1 \n" 1272 "psubw %%mm2, %%mm0 \n" 1273 "pmaddwd %%mm1, %%mm1 \n" 1274 "pmaddwd %%mm0, %%mm0 \n" 1275 "paddd %%mm1, %%mm4 \n" 1276 "paddd %%mm0, %%mm4 \n" 1277 "jg 1b \n" 1278 "movq %%mm4, %%mm3 \n" 1279 "psrlq $32, %%mm3 \n" 1280 "paddd %%mm3, %%mm4 \n" 1281 "movd %%mm4, %1 \n" 1282 :"+r"(i), "=r"(sum) 1283 :"r"(pix1), "r"(pix2) 1284 ); 1285 return sum; 1286} 1287 1288#define PHADDD(a, t)\ 1289 "movq "#a", "#t" \n\t"\ 1290 "psrlq $32, "#a" \n\t"\ 1291 "paddd "#t", "#a" \n\t" 1292/* 1293 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31] 1294 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31] 1295 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30] 1296 */ 1297#define PMULHRW(x, y, s, o)\ 1298 "pmulhw " #s ", "#x " \n\t"\ 1299 "pmulhw " #s ", "#y " \n\t"\ 1300 "paddw " #o ", "#x " \n\t"\ 1301 "paddw " #o ", "#y " \n\t"\ 1302 "psraw $1, "#x " \n\t"\ 1303 "psraw $1, "#y " \n\t" 1304#define DEF(x) x ## _mmx 1305#define SET_RND MOVQ_WONE 1306#define SCALE_OFFSET 1 1307 1308#include "dsputil_mmx_qns_template.c" 1309 1310#undef DEF 1311#undef SET_RND 1312#undef SCALE_OFFSET 1313#undef PMULHRW 1314 1315#define DEF(x) x ## _3dnow 1316#define SET_RND(x) 1317#define SCALE_OFFSET 0 1318#define PMULHRW(x, y, s, o)\ 1319 "pmulhrw " #s ", "#x " \n\t"\ 1320 "pmulhrw " #s ", "#y " \n\t" 1321 1322#include "dsputil_mmx_qns_template.c" 1323 1324#undef DEF 1325#undef SET_RND 1326#undef SCALE_OFFSET 1327#undef PMULHRW 1328 1329#if HAVE_SSSE3 1330#undef PHADDD 1331#define DEF(x) x ## _ssse3 1332#define SET_RND(x) 1333#define SCALE_OFFSET -1 1334#define PHADDD(a, t)\ 1335 "pshufw $0x0E, "#a", "#t" \n\t"\ 1336 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */ 1337#define PMULHRW(x, y, s, o)\ 1338 "pmulhrsw " #s ", "#x " \n\t"\ 1339 "pmulhrsw " #s ", "#y " \n\t" 1340 1341#include "dsputil_mmx_qns_template.c" 1342 1343#undef DEF 1344#undef SET_RND 1345#undef SCALE_OFFSET 1346#undef PMULHRW 1347#undef PHADDD 1348#endif //HAVE_SSSE3 1349 1350 1351/* FLAC specific */ 1352void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag, 1353 double *autoc); 1354 1355 1356void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) 1357{ 1358 if (mm_flags & FF_MM_MMX) { 1359 const int dct_algo = avctx->dct_algo; 1360 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ 1361 if(mm_flags & FF_MM_SSE2){ 1362 c->fdct = ff_fdct_sse2; 1363 }else if(mm_flags & FF_MM_MMXEXT){ 1364 c->fdct = ff_fdct_mmx2; 1365 }else{ 1366 c->fdct = ff_fdct_mmx; 1367 } 1368 } 1369 1370 c->get_pixels = get_pixels_mmx; 1371 c->diff_pixels = diff_pixels_mmx; 1372 c->pix_sum = pix_sum16_mmx; 1373 1374 c->diff_bytes= diff_bytes_mmx; 1375 c->sum_abs_dctelem= sum_abs_dctelem_mmx; 1376 1377 c->hadamard8_diff[0]= hadamard8_diff16_mmx; 1378 c->hadamard8_diff[1]= hadamard8_diff_mmx; 1379 1380 c->pix_norm1 = pix_norm1_mmx; 1381 c->sse[0] = (mm_flags & FF_MM_SSE2) ? sse16_sse2 : sse16_mmx; 1382 c->sse[1] = sse8_mmx; 1383 c->vsad[4]= vsad_intra16_mmx; 1384 1385 c->nsse[0] = nsse16_mmx; 1386 c->nsse[1] = nsse8_mmx; 1387 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 1388 c->vsad[0] = vsad16_mmx; 1389 } 1390 1391 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 1392 c->try_8x8basis= try_8x8basis_mmx; 1393 } 1394 c->add_8x8basis= add_8x8basis_mmx; 1395 1396 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx; 1397 1398 1399 if (mm_flags & FF_MM_MMXEXT) { 1400 c->sum_abs_dctelem= sum_abs_dctelem_mmx2; 1401 c->hadamard8_diff[0]= hadamard8_diff16_mmx2; 1402 c->hadamard8_diff[1]= hadamard8_diff_mmx2; 1403 c->vsad[4]= vsad_intra16_mmx2; 1404 1405 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 1406 c->vsad[0] = vsad16_mmx2; 1407 } 1408 1409 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; 1410 } 1411 1412 if(mm_flags & FF_MM_SSE2){ 1413 c->get_pixels = get_pixels_sse2; 1414 c->sum_abs_dctelem= sum_abs_dctelem_sse2; 1415 c->hadamard8_diff[0]= hadamard8_diff16_sse2; 1416 c->hadamard8_diff[1]= hadamard8_diff_sse2; 1417 if (CONFIG_FLAC_ENCODER) 1418 c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2; 1419 } 1420 1421#if HAVE_SSSE3 1422 if(mm_flags & FF_MM_SSSE3){ 1423 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 1424 c->try_8x8basis= try_8x8basis_ssse3; 1425 } 1426 c->add_8x8basis= add_8x8basis_ssse3; 1427 c->sum_abs_dctelem= sum_abs_dctelem_ssse3; 1428 c->hadamard8_diff[0]= hadamard8_diff16_ssse3; 1429 c->hadamard8_diff[1]= hadamard8_diff_ssse3; 1430 } 1431#endif 1432 1433 if(mm_flags & FF_MM_3DNOW){ 1434 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 1435 c->try_8x8basis= try_8x8basis_3dnow; 1436 } 1437 c->add_8x8basis= add_8x8basis_3dnow; 1438 } 1439 } 1440 1441 dsputil_init_pix_mmx(c, avctx); 1442} 1443