1/* 2 * MMX optimized DSP utils 3 * Copyright (c) 2000, 2001 Fabrice Bellard 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> 5 * 6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> 7 * 8 * This file is part of FFmpeg. 9 * 10 * FFmpeg is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU Lesser General Public 12 * License as published by the Free Software Foundation; either 13 * version 2.1 of the License, or (at your option) any later version. 14 * 15 * FFmpeg is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * Lesser General Public License for more details. 19 * 20 * You should have received a copy of the GNU Lesser General Public 21 * License along with FFmpeg; if not, write to the Free Software 22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 23 */ 24 25#include "libavutil/attributes.h" 26#include "libavutil/cpu.h" 27#include "libavutil/x86/asm.h" 28#include "libavutil/x86/cpu.h" 29#include "libavcodec/dsputil.h" 30#include "libavcodec/mpegvideo.h" 31#include "dsputil_x86.h" 32 33int ff_sum_abs_dctelem_mmx(int16_t *block); 34int ff_sum_abs_dctelem_mmxext(int16_t *block); 35int ff_sum_abs_dctelem_sse2(int16_t *block); 36int ff_sum_abs_dctelem_ssse3(int16_t *block); 37int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 38 int line_size, int h); 39int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 40 int line_size, int h); 41int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 42 int line_size, int h); 43int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h); 44int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h); 45 46#define hadamard_func(cpu) \ 47 int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \ 48 uint8_t *src2, int stride, int h); \ 49 int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \ 50 uint8_t *src2, int stride, int h); 51 52hadamard_func(mmx) 53hadamard_func(mmxext) 54hadamard_func(sse2) 55hadamard_func(ssse3) 56 57#if HAVE_YASM 58static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, 59 int line_size, int h) 60{ 61 int score1, score2; 62 63 if (c) 64 score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); 65 else 66 score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h); 67 score2 = ff_hf_noise16_mmx(pix1, line_size, h) + ff_hf_noise8_mmx(pix1+8, line_size, h) 68 - ff_hf_noise16_mmx(pix2, line_size, h) - ff_hf_noise8_mmx(pix2+8, line_size, h); 69 70 if (c) 71 return score1 + FFABS(score2) * c->avctx->nsse_weight; 72 else 73 return score1 + FFABS(score2) * 8; 74} 75 76static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, 77 int line_size, int h) 78{ 79 int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h); 80 int score2 = ff_hf_noise8_mmx(pix1, line_size, h) - 81 ff_hf_noise8_mmx(pix2, line_size, h); 82 83 if (c) 84 return score1 + FFABS(score2) * c->avctx->nsse_weight; 85 else 86 return score1 + FFABS(score2) * 8; 87} 88 89#endif /* HAVE_YASM */ 90 91#if HAVE_INLINE_ASM 92 93static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, 94 int line_size, int h) 95{ 96 int tmp; 97 98 av_assert2((((int) pix) & 7) == 0); 99 av_assert2((line_size & 7) == 0); 100 101#define SUM(in0, in1, out0, out1) \ 102 "movq (%0), %%mm2\n" \ 103 "movq 8(%0), %%mm3\n" \ 104 "add %2,%0\n" \ 105 "movq %%mm2, " #out0 "\n" \ 106 "movq %%mm3, " #out1 "\n" \ 107 "psubusb " #in0 ", %%mm2\n" \ 108 "psubusb " #in1 ", %%mm3\n" \ 109 "psubusb " #out0 ", " #in0 "\n" \ 110 "psubusb " #out1 ", " #in1 "\n" \ 111 "por %%mm2, " #in0 "\n" \ 112 "por %%mm3, " #in1 "\n" \ 113 "movq " #in0 ", %%mm2\n" \ 114 "movq " #in1 ", %%mm3\n" \ 115 "punpcklbw %%mm7, " #in0 "\n" \ 116 "punpcklbw %%mm7, " #in1 "\n" \ 117 "punpckhbw %%mm7, %%mm2\n" \ 118 "punpckhbw %%mm7, %%mm3\n" \ 119 "paddw " #in1 ", " #in0 "\n" \ 120 "paddw %%mm3, %%mm2\n" \ 121 "paddw %%mm2, " #in0 "\n" \ 122 "paddw " #in0 ", %%mm6\n" 123 124 125 __asm__ volatile ( 126 "movl %3, %%ecx\n" 127 "pxor %%mm6, %%mm6\n" 128 "pxor %%mm7, %%mm7\n" 129 "movq (%0), %%mm0\n" 130 "movq 8(%0), %%mm1\n" 131 "add %2, %0\n" 132 "jmp 2f\n" 133 "1:\n" 134 135 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 136 "2:\n" 137 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 138 139 "subl $2, %%ecx\n" 140 "jnz 1b\n" 141 142 "movq %%mm6, %%mm0\n" 143 "psrlq $32, %%mm6\n" 144 "paddw %%mm6, %%mm0\n" 145 "movq %%mm0, %%mm6\n" 146 "psrlq $16, %%mm0\n" 147 "paddw %%mm6, %%mm0\n" 148 "movd %%mm0, %1\n" 149 : "+r" (pix), "=r" (tmp) 150 : "r" ((x86_reg) line_size), "m" (h) 151 : "%ecx"); 152 153 return tmp & 0xFFFF; 154} 155#undef SUM 156 157static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, 158 int line_size, int h) 159{ 160 int tmp; 161 162 av_assert2((((int) pix) & 7) == 0); 163 av_assert2((line_size & 7) == 0); 164 165#define SUM(in0, in1, out0, out1) \ 166 "movq (%0), " #out0 "\n" \ 167 "movq 8(%0), " #out1 "\n" \ 168 "add %2, %0\n" \ 169 "psadbw " #out0 ", " #in0 "\n" \ 170 "psadbw " #out1 ", " #in1 "\n" \ 171 "paddw " #in1 ", " #in0 "\n" \ 172 "paddw " #in0 ", %%mm6\n" 173 174 __asm__ volatile ( 175 "movl %3, %%ecx\n" 176 "pxor %%mm6, %%mm6\n" 177 "pxor %%mm7, %%mm7\n" 178 "movq (%0), %%mm0\n" 179 "movq 8(%0), %%mm1\n" 180 "add %2, %0\n" 181 "jmp 2f\n" 182 "1:\n" 183 184 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 185 "2:\n" 186 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 187 188 "subl $2, %%ecx\n" 189 "jnz 1b\n" 190 191 "movd %%mm6, %1\n" 192 : "+r" (pix), "=r" (tmp) 193 : "r" ((x86_reg) line_size), "m" (h) 194 : "%ecx"); 195 196 return tmp; 197} 198#undef SUM 199 200static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 201 int line_size, int h) 202{ 203 int tmp; 204 205 av_assert2((((int) pix1) & 7) == 0); 206 av_assert2((((int) pix2) & 7) == 0); 207 av_assert2((line_size & 7) == 0); 208 209#define SUM(in0, in1, out0, out1) \ 210 "movq (%0), %%mm2\n" \ 211 "movq (%1), " #out0 "\n" \ 212 "movq 8(%0), %%mm3\n" \ 213 "movq 8(%1), " #out1 "\n" \ 214 "add %3, %0\n" \ 215 "add %3, %1\n" \ 216 "psubb " #out0 ", %%mm2\n" \ 217 "psubb " #out1 ", %%mm3\n" \ 218 "pxor %%mm7, %%mm2\n" \ 219 "pxor %%mm7, %%mm3\n" \ 220 "movq %%mm2, " #out0 "\n" \ 221 "movq %%mm3, " #out1 "\n" \ 222 "psubusb " #in0 ", %%mm2\n" \ 223 "psubusb " #in1 ", %%mm3\n" \ 224 "psubusb " #out0 ", " #in0 "\n" \ 225 "psubusb " #out1 ", " #in1 "\n" \ 226 "por %%mm2, " #in0 "\n" \ 227 "por %%mm3, " #in1 "\n" \ 228 "movq " #in0 ", %%mm2\n" \ 229 "movq " #in1 ", %%mm3\n" \ 230 "punpcklbw %%mm7, " #in0 "\n" \ 231 "punpcklbw %%mm7, " #in1 "\n" \ 232 "punpckhbw %%mm7, %%mm2\n" \ 233 "punpckhbw %%mm7, %%mm3\n" \ 234 "paddw " #in1 ", " #in0 "\n" \ 235 "paddw %%mm3, %%mm2\n" \ 236 "paddw %%mm2, " #in0 "\n" \ 237 "paddw " #in0 ", %%mm6\n" 238 239 240 __asm__ volatile ( 241 "movl %4, %%ecx\n" 242 "pxor %%mm6, %%mm6\n" 243 "pcmpeqw %%mm7, %%mm7\n" 244 "psllw $15, %%mm7\n" 245 "packsswb %%mm7, %%mm7\n" 246 "movq (%0), %%mm0\n" 247 "movq (%1), %%mm2\n" 248 "movq 8(%0), %%mm1\n" 249 "movq 8(%1), %%mm3\n" 250 "add %3, %0\n" 251 "add %3, %1\n" 252 "psubb %%mm2, %%mm0\n" 253 "psubb %%mm3, %%mm1\n" 254 "pxor %%mm7, %%mm0\n" 255 "pxor %%mm7, %%mm1\n" 256 "jmp 2f\n" 257 "1:\n" 258 259 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 260 "2:\n" 261 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 262 263 "subl $2, %%ecx\n" 264 "jnz 1b\n" 265 266 "movq %%mm6, %%mm0\n" 267 "psrlq $32, %%mm6\n" 268 "paddw %%mm6, %%mm0\n" 269 "movq %%mm0, %%mm6\n" 270 "psrlq $16, %%mm0\n" 271 "paddw %%mm6, %%mm0\n" 272 "movd %%mm0, %2\n" 273 : "+r" (pix1), "+r" (pix2), "=r" (tmp) 274 : "r" ((x86_reg) line_size), "m" (h) 275 : "%ecx"); 276 277 return tmp & 0x7FFF; 278} 279#undef SUM 280 281static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 282 int line_size, int h) 283{ 284 int tmp; 285 286 av_assert2((((int) pix1) & 7) == 0); 287 av_assert2((((int) pix2) & 7) == 0); 288 av_assert2((line_size & 7) == 0); 289 290#define SUM(in0, in1, out0, out1) \ 291 "movq (%0), " #out0 "\n" \ 292 "movq (%1), %%mm2\n" \ 293 "movq 8(%0), " #out1 "\n" \ 294 "movq 8(%1), %%mm3\n" \ 295 "add %3, %0\n" \ 296 "add %3, %1\n" \ 297 "psubb %%mm2, " #out0 "\n" \ 298 "psubb %%mm3, " #out1 "\n" \ 299 "pxor %%mm7, " #out0 "\n" \ 300 "pxor %%mm7, " #out1 "\n" \ 301 "psadbw " #out0 ", " #in0 "\n" \ 302 "psadbw " #out1 ", " #in1 "\n" \ 303 "paddw " #in1 ", " #in0 "\n" \ 304 "paddw " #in0 ", %%mm6\n " 305 306 __asm__ volatile ( 307 "movl %4, %%ecx\n" 308 "pxor %%mm6, %%mm6\n" 309 "pcmpeqw %%mm7, %%mm7\n" 310 "psllw $15, %%mm7\n" 311 "packsswb %%mm7, %%mm7\n" 312 "movq (%0), %%mm0\n" 313 "movq (%1), %%mm2\n" 314 "movq 8(%0), %%mm1\n" 315 "movq 8(%1), %%mm3\n" 316 "add %3, %0\n" 317 "add %3, %1\n" 318 "psubb %%mm2, %%mm0\n" 319 "psubb %%mm3, %%mm1\n" 320 "pxor %%mm7, %%mm0\n" 321 "pxor %%mm7, %%mm1\n" 322 "jmp 2f\n" 323 "1:\n" 324 325 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 326 "2:\n" 327 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 328 329 "subl $2, %%ecx\n" 330 "jnz 1b\n" 331 332 "movd %%mm6, %2\n" 333 : "+r" (pix1), "+r" (pix2), "=r" (tmp) 334 : "r" ((x86_reg) line_size), "m" (h) 335 : "%ecx"); 336 337 return tmp; 338} 339#undef SUM 340 341 342#endif /* HAVE_INLINE_ASM */ 343 344av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx) 345{ 346 int cpu_flags = av_get_cpu_flags(); 347 348#if HAVE_INLINE_ASM 349 if (INLINE_MMX(cpu_flags)) { 350 c->vsad[4] = vsad_intra16_mmx; 351 352 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { 353 c->vsad[0] = vsad16_mmx; 354 } 355 } 356 357 if (INLINE_MMXEXT(cpu_flags)) { 358 c->vsad[4] = vsad_intra16_mmxext; 359 360 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { 361 c->vsad[0] = vsad16_mmxext; 362 } 363 } 364#endif /* HAVE_INLINE_ASM */ 365 366 if (EXTERNAL_MMX(cpu_flags)) { 367 c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx; 368 c->hadamard8_diff[1] = ff_hadamard8_diff_mmx; 369 c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx; 370 c->sse[0] = ff_sse16_mmx; 371 c->sse[1] = ff_sse8_mmx; 372#if HAVE_YASM 373 c->nsse[0] = nsse16_mmx; 374 c->nsse[1] = nsse8_mmx; 375#endif 376 } 377 378 if (EXTERNAL_MMXEXT(cpu_flags)) { 379 c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext; 380 c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext; 381 c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext; 382 } 383 384 if (EXTERNAL_SSE2(cpu_flags)) { 385 c->sse[0] = ff_sse16_sse2; 386 c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2; 387 388#if HAVE_ALIGNED_STACK 389 c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2; 390 c->hadamard8_diff[1] = ff_hadamard8_diff_sse2; 391#endif 392 } 393 394 if (EXTERNAL_SSSE3(cpu_flags)) { 395 c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3; 396#if HAVE_ALIGNED_STACK 397 c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3; 398 c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3; 399#endif 400 } 401 402 ff_dsputil_init_pix_mmx(c, avctx); 403} 404