;*****************************************************************************
;* MMX optimized DSP utils
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%macro DIFF_PIXELS_1 4
    movh            %1, %3
    movh            %2, %4
    punpcklbw       %2, %1
    punpcklbw       %1, %1
    psubw           %1, %2
%endmacro

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires mmsize stack space (aligned) at %6 (except on SSE+x86-64)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova          [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro

%macro HADAMARD8 0
    SUMSUB_BADC w, 0, 1, 2, 3
    SUMSUB_BADC w, 4, 5, 6, 7
    SUMSUB_BADC w, 0, 2, 1, 3
    SUMSUB_BADC w, 4, 6, 5, 7
    SUMSUB_BADC w, 0, 4, 1, 5
    SUMSUB_BADC w, 2, 6, 3, 7
%endmacro

%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %5, %1
    paddusw         %6, %2
%endmacro

%macro ABS_SUM_8x8_64 1
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova          [%1], m7
    ABS1            m0, m7
    ABS1            m1, m7
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m6, m7, m0
    mova            m2, [%1]
    ABS1_SUM        m2, m7, m1
    paddusw         m0, m1
%endmacro

; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get
; up to about 100k on extreme inputs. But that's very unlikely to occur in
; natural video, and it's even more unlikely to not have any alternative
; mvs/modes with lower cost.
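
; HSUM %1 (src/dst), %2 (scratch), %3 (gpr dst): horizontal sum of the
; unsigned 16-bit words in %1, written to %3. All three code paths below
; compute roughly the following scalar model (sketch for reference only,
; not part of the build; n = mmsize/2, so 4 words for mmx/mmxext and 8
; for sse2):
;
;     uint32_t hsum(const uint16_t *v, int n)
;     {
;         uint32_t sum = 0;
;         for (int i = 0; i < n; i++)
;             sum = FFMIN(sum + v[i], 0xFFFF); // paddusw saturates at 64k
;         return sum;
;     }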
%macro HSUM 3
%if cpuflag(sse2)
    movhlps         %2, %1
    paddusw         %1, %2
    pshuflw         %2, %1, 0xE
    paddusw         %1, %2
    pshuflw         %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmxext)
    pshufw          %2, %1, 0xE
    paddusw         %1, %2
    pshufw          %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmx)
    mova            %2, %1
    psrlq           %1, 32
    paddusw         %1, %2
    mova            %2, %1
    psrlq           %1, 16
    paddusw         %1, %2
    movd            %3, %1
%endif
%endmacro

%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova            %2, [%1+mmsize*0]
    mova            %3, [%1+mmsize*1]
    mova            %4, [%1+mmsize*2]
    mova            %5, [%1+mmsize*3]
%endmacro

%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB             rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD             rsp, pad
%endif
    RET

cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB             rsp, pad
%endif

    call hadamard8x8_diff %+ SUFFIX
    mov             r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add             r5d, eax

    cmp             r4d, 16
    jne .done

    lea             r1, [r1+r3*8-8]
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add             r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add             r5d, eax

.done:
    mov             eax, r5d
%ifndef m8
    ADD             rsp, pad
%endif
    RET
%endmacro

%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea             r0, [r3*3]
    DIFF_PIXELS_8   r1, r2, 0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W    0,  1,  2,  3,  4,  5,  6,  7,  8
%else
    TRANSPOSE8x8W    0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8     rsp+gprsize
    HSUM            m0, m1, eax
    and             eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
;                               uint8_t *src2, int stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; Note that r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call this 2x2 times (and that's why we access rsp+gprsize
; everywhere: it is the rsp of the calling function).
hadamard8x8_diff %+ SUFFIX:
    lea             r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8   r1, r2,  0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W    0,  1,  2,  3,  7
    STORE4          rsp+gprsize, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W    4,  5,  6,  7,  0
    STORE4          rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8   r1, r2,  4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W    0,  1,  2,  3,  7
    STORE4          rsp+gprsize+0x20, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W    4,  5,  6,  7,  0

    LOAD4           rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32  rsp+gprsize+0x60
    mova         [rsp+gprsize+0x60], m0

    LOAD4           rsp+gprsize,      m0, m1, m2, m3
    LOAD4           rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32  rsp+gprsize
    paddusw         m0, [rsp+gprsize+0x60]

    HSUM            m0, m1, eax
    and             rax, 0xFFFF
    ret

hadamard8_16_wrapper 0, 14
%endif
%endmacro

INIT_MMX mmx
HADAMARD8_DIFF

INIT_MMX mmxext
HADAMARD8_DIFF
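
; What hadamard8x8_diff computes, as a scalar sketch (reference only, not
; part of the build): HADAMARD8 runs butterfly stages across the eight
; registers, i.e. one 1-D 8-point Hadamard transform; doing that once,
; transposing, then doing it again yields the 2-D transform of the
; difference block, whose absolute coefficients are then summed (a SATD
; metric). One 1-D pass on a vector d[0..7], up to coefficient signs:
;
;     for (int stage = 1; stage < 8; stage <<= 1)
;         for (int i = 0; i < 8; i++)
;             if (!(i & stage)) {
;                 int a = d[i], b = d[i + stage];
;                 d[i]         = a + b;
;                 d[i + stage] = a - b;
;             }
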
INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9

; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;               int line_size, int h)

%macro SUM_SQUARED_ERRORS 1
cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
%if %1 == mmsize
    shr             hd, 1
%endif
    pxor            m0, m0                  ; mm0 = 0
    pxor            m7, m7                  ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu            m1, [pix1q]             ; m1 = pix1[0][0-15], [0-7] for mmx
    movu            m2, [pix2q]             ; m2 = pix2[0][0-15], [0-7] for mmx
%if %1 == mmsize
    movu            m3, [pix1q+lsizeq]      ; m3 = pix1[1][0-15], [0-7] for mmx
    movu            m4, [pix2q+lsizeq]      ; m4 = pix2[1][0-15], [0-7] for mmx
%else ; %1 / 2 == mmsize; mmx only
    mova            m3, [pix1q+8]           ; m3 = pix1[0][8-15]
    mova            m4, [pix2q+8]           ; m4 = pix2[0][8-15]
%endif

    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa,
    ;       then OR the results to get the absolute difference
    mova            m5, m1
    mova            m6, m3
    psubusb         m1, m2
    psubusb         m3, m4
    psubusb         m2, m5
    psubusb         m4, m6

    por             m2, m1
    por             m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova            m1, m2
    mova            m3, m4

    punpckhbw       m2, m0
    punpckhbw       m4, m0
    punpcklbw       m1, m0                  ; mm1 now spread over (mm1, mm2)
    punpcklbw       m3, m0                  ; mm4 now spread over (mm3, mm4)

    pmaddwd         m2, m2
    pmaddwd         m4, m4
    pmaddwd         m1, m1
    pmaddwd         m3, m3

    paddd           m1, m2
    paddd           m3, m4
    paddd           m7, m1
    paddd           m7, m3

%if %1 == mmsize
    lea             pix1q, [pix1q + 2*lsizeq]
    lea             pix2q, [pix2q + 2*lsizeq]
%else
    add             pix1q, lsizeq
    add             pix2q, lsizeq
%endif
    dec             hd
    jnz .next2lines

    HADDD           m7, m1
    movd            eax, m7                 ; return value
    RET
%endmacro

INIT_MMX mmx
SUM_SQUARED_ERRORS 8

INIT_MMX mmx
SUM_SQUARED_ERRORS 16

INIT_XMM sse2
SUM_SQUARED_ERRORS 16

;-----------------------------------------------
; int ff_sum_abs_dctelem(int16_t *block)
;-----------------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline loops

%macro SUM_ABS_DCTELEM 2
cglobal sum_abs_dctelem, 1, 1, %1, block
    pxor            m0, m0
    pxor            m1, m1
%assign %%i 0
%rep %2
    mova            m2, [blockq+mmsize*(0+%%i)]
    mova            m3, [blockq+mmsize*(1+%%i)]
    mova            m4, [blockq+mmsize*(2+%%i)]
    mova            m5, [blockq+mmsize*(3+%%i)]
    ABS1_SUM        m2, m6, m0
    ABS1_SUM        m3, m6, m1
    ABS1_SUM        m4, m6, m0
    ABS1_SUM        m5, m6, m1
%assign %%i %%i+4
%endrep
    paddusw         m0, m1
    HSUM            m0, m1, eax
    and             eax, 0xFFFF
    RET
%endmacro

INIT_MMX mmx
SUM_ABS_DCTELEM 0, 4
INIT_MMX mmxext
SUM_ABS_DCTELEM 0, 4
INIT_XMM sse2
SUM_ABS_DCTELEM 7, 2
INIT_XMM ssse3
SUM_ABS_DCTELEM 6, 2
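
; Scalar model of sum_abs_dctelem (sketch for reference only, not part of
; the build): sum |block[i]| over all 64 coefficients; the SIMD versions
; additionally saturate partial sums at 0xFFFF (see the HSUM note above):
;
;     int sum_abs_dctelem(const int16_t *block)
;     {
;         int sum = 0;
;         for (int i = 0; i < 64; i++)
;             sum += FFABS(block[i]);
;         return sum & 0xFFFF;
;     }
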
;------------------------------------------------------------------------------
; int ff_hf_noise*_mmx(uint8_t *pix1, int lsize, int h)
;------------------------------------------------------------------------------
; %1 = 8/16, %2-5 = m#
%macro HF_NOISE_PART1 5
    mova            m%2, [pix1q]
%if %1 == 8
    mova            m%3, m%2
    psllq           m%2, 8
    psrlq           m%3, 8
    psrlq           m%2, 8
%else
    mova            m%3, [pix1q+1]
%endif
    mova            m%4, m%2
    mova            m%5, m%3
    punpcklbw       m%2, m7
    punpcklbw       m%3, m7
    punpckhbw       m%4, m7
    punpckhbw       m%5, m7
    psubw           m%2, m%3
    psubw           m%4, m%5
%endmacro

; %1-4 = m#
%macro HF_NOISE_PART2 4
    psubw           m%1, m%3
    psubw           m%2, m%4
    pxor            m3, m3
    pxor            m1, m1
    pcmpgtw         m3, m%1
    pcmpgtw         m1, m%2
    pxor            m%1, m3
    pxor            m%2, m1
    psubw           m%1, m3
    psubw           m%2, m1
    paddw           m%2, m%1
    paddw           m6, m%2
%endmacro

; %1 = 8/16
%macro HF_NOISE 1
cglobal hf_noise%1, 3,3,0, pix1, lsize, h
    movsxdifnidn    lsizeq, lsized
    sub             hd, 2
    pxor            m7, m7
    pxor            m6, m6
    HF_NOISE_PART1  %1, 0, 1, 2, 3
    add             pix1q, lsizeq
    HF_NOISE_PART1  %1, 4, 1, 5, 3
    HF_NOISE_PART2      0, 2, 4, 5
    add             pix1q, lsizeq
.loop:
    HF_NOISE_PART1  %1, 0, 1, 2, 3
    HF_NOISE_PART2      4, 5, 0, 2
    add             pix1q, lsizeq
    HF_NOISE_PART1  %1, 4, 1, 5, 3
    HF_NOISE_PART2      0, 2, 4, 5
    add             pix1q, lsizeq
    sub             hd, 2
    jne .loop

    mova            m0, m6
    punpcklwd       m0, m7
    punpckhwd       m6, m7
    paddd           m6, m0
    mova            m0, m6
    psrlq           m6, 32
    paddd           m0, m6
    movd            eax, m0                 ; eax = result of hf_noise%1
    REP_RET                                 ; return eax
%endmacro

INIT_MMX mmx
HF_NOISE 8
HF_NOISE 16
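
; What the hf_noise routines measure, as a scalar sketch (reference only,
; not part of the build): the sum of absolute second-order cross
; differences between horizontally and vertically adjacent pixels, a
; high-frequency-noise estimate used by the nsse comparison functions.
; Per pixel pair:
;
;     sum += FFABS((pix[y][x]     - pix[y][x + 1]) -
;                  (pix[y + 1][x] - pix[y + 1][x + 1]));
;
; Each HF_NOISE_PART1 call produces one row of horizontal differences;
; HF_NOISE_PART2 then accumulates |row[y] - row[y+1]| into m6.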