;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1

; plane-prediction weights -3..4 (per-column factors for the H gradient)
pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3:        times 8 dw -3
pw_pixel_max: times 8 dw ((1 << 10)-1)  ; 1023, clip ceiling for 10-bit samples
pw_512:       times 8 dw 512            ; 1 << (BIT_DEPTH-1), the "grey" DC value
pd_17:        times 4 dd 17
pd_16:        times 4 dd 16

SECTION .text

; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; Implemented as pavgw(src, (left+right)>>1): two rounding averages give the
; same 2-bit-shift lowpass without overflowing 16-bit lanes (10-bit samples).
; NOTE: clobbers %2.
%macro PRED4x4_LOWPASS 4
    paddw       %2, %3
    psrlw       %2, 1
    pavgw       %1, %4, %2
%endmacro

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_right(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
; r0 = src (16-bit pixels), r2 = stride, used directly as a byte offset.
; Gathers left column + top-left + top row into one register, lowpasses it,
; then emits the four rows as successive 2-byte right shifts of the result.
%macro PRED4x4_DR 0
cglobal pred4x4_down_right_10, 3, 3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movhps    m1, [r1-8]
    movhps    m2, [r0+r2*1-8]
    movhps    m4, [r0-8]
    punpckhwd m2, m4
    movq      m3, [r0]
    punpckhdq m1, m2
    PALIGNR   m3, m1, 10, m1      ; t3 t2 t1 t0 lt l0 l1 l2
    movhps    m4, [r1+r2*1-8]
    PALIGNR   m0, m3, m4, 14, m4
    movhps    m4, [r1+r2*2-8]
    PALIGNR   m2, m0, m4, 14, m4
    PRED4x4_LOWPASS m0, m2, m3, m0
    movq      [r1+r2*2], m0
    psrldq    m0, 2
    movq      [r1+r2*1], m0
    psrldq    m0, 2
    movq      [r0+r2*2], m0
    psrldq    m0, 2
    movq      [r0+r2*1], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DR
%endif

;------------------------------------------------------------------------------
; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
;------------------------------------------------------------------------------
%macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
    sub     r0, r2
    lea     r1, [r0+r2*2]
    movq    m5, [r0]            ; ........t3t2t1t0
    movhps  m1, [r0-8]
    PALIGNR m0, m5, m1, 14, m1  ; ......t3t2t1t0lt
    pavgw   m5, m0              ; even rows: avg(top, top shifted by one)
    movhps  m1, [r0+r2*1-8]
    PALIGNR m0, m1, 14, m1      ; ....t3t2t1t0ltl0
    movhps  m2, [r0+r2*2-8]
    PALIGNR m1, m0, m2, 14, m2  ; ..t3t2t1t0ltl0l1
    movhps  m3, [r1+r2*1-8]
    PALIGNR m2, m1, m3, 14, m3  ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m1, m0, m2, m1
    pslldq  m0, m1, 12          ; keep lowpassed left pixels for odd rows
    psrldq  m1, 4
    movq    [r0+r2*1], m5
    movq    [r0+r2*2], m1
    PALIGNR m5, m0, 14, m2
    pslldq  m0, 2
    movq    [r1+r2*1], m5
    PALIGNR m1, m0, 14, m0
    movq    [r1+r2*2], m1
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VR
%endif

;-------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
;-------------------------------------------------------------------------------
%macro PRED4x4_HD 0
cglobal pred4x4_horizontal_down_10, 3, 3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movq       m0, [r0-8]      ; lt ..
    movhps     m0, [r0]
    pslldq     m0, 2           ; t2 t1 t0 lt .. .. .. ..
    movq       m1, [r1+r2*2-8] ; l3
    movq       m3, [r1+r2*1-8]
    punpcklwd  m1, m3          ; l2 l3
    movq       m2, [r0+r2*2-8] ; l1
    movq       m3, [r0+r2*1-8]
    punpcklwd  m2, m3          ; l0 l1
    punpckhdq  m1, m2          ; l0 l1 l2 l3
    punpckhqdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
    psrldq     m0, m1, 4       ; .. .. t2 t1 t0 lt l0 l1
    psrldq     m3, m1, 2       ; .. t2 t1 t0 lt l0 l1 l2
    pavgw      m5, m1, m3      ; 2-tap averages (interleaved into output)
    PRED4x4_LOWPASS m3, m1, m0, m3
    punpcklwd  m5, m3
    psrldq     m3, 8
    PALIGNR    m3, m5, 12, m4
    movq       [r1+r2*2], m5
    movhps     [r0+r2*2], m5
    psrldq     m5, 4
    movq       [r1+r2*1], m5
    movq       [r0+r2*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_HD
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
; dc = (l0+l1+l2+l3 + t0+t1+t2+t3 + 4) >> 3, broadcast to all 16 pixels.
; Each left-column load puts l_i in the top word, so summing the vectors and
; shifting right by 48 isolates sum(l); 4 x 10-bit values fit in one word.
INIT_MMX mmxext
cglobal pred4x4_dc_10, 3, 3
    sub    r0, r2
    lea    r1, [r0+r2*2]
    movq   m2, [r0+r2*1-8]
    paddw  m2, [r0+r2*2-8]
    paddw  m2, [r1+r2*1-8]
    paddw  m2, [r1+r2*2-8]
    psrlq  m2, 48
    movq   m0, [r0]
    HADDW  m0, m1
    paddw  m0, [pw_4]
    paddw  m0, m2
    psrlw  m0, 3
    SPLATW m0, m0, 0
    movq   [r0+r2*1], m0
    movq   [r0+r2*2], m0
    movq   [r1+r2*1], m0
    movq   [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
; r1 = topright pointer (4 extra pixels above-right of the block).
%macro PRED4x4_DL 0
cglobal pred4x4_down_left_10, 3, 3
    sub      r0, r2
    movq     m0, [r0]
    movhps   m0, [r1]
    psrldq   m2, m0, 2
    pslldq   m3, m0, 2
    pshufhw  m2, m2, 10100100b ; duplicate last topright pixel into the hole
    PRED4x4_LOWPASS m0, m3, m2, m0
    lea      r1, [r0+r2*2]
    movhps   [r1+r2*2], m0
    psrldq   m0, 2
    movq     [r0+r2*1], m0
    psrldq   m0, 2
    movq     [r0+r2*2], m0
    psrldq   m0, 2
    movq     [r1+r2*1], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DL
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_VL 0
cglobal pred4x4_vertical_left_10, 3, 3
    sub      r0, r2
    movu     m1, [r0]
    movhps   m1, [r1]
    psrldq   m0, m1, 2
    psrldq   m2, m1, 4
    pavgw    m4, m0, m1        ; even rows: 2-tap average
    PRED4x4_LOWPASS m0, m1, m2, m0 ; odd rows: 3-tap lowpass
    lea      r1, [r0+r2*2]
    movq     [r0+r2*1], m4
    movq     [r0+r2*2], m0
    psrldq   m4, 2
    psrldq   m0, 2
    movq     [r1+r2*1], m4
    movq     [r1+r2*2], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VL
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_10, 3, 3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movq      m0, [r0+r2*1-8]
    punpckhwd m0, [r0+r2*2-8]
    movq      m1, [r1+r2*1-8]
    punpckhwd m1, [r1+r2*2-8]
    punpckhdq m0, m1           ; l0 l1 l2 l3
    pshufw    m1, m1, 0xFF     ; broadcast l3 (flat tail of the prediction)
    movq      [r1+r2*2], m1
    movd      [r1+r2*1+4], m1
    pshufw    m2, m0, 11111001b
    movq      m1, m2
    pavgw     m2, m0

    pshufw    m5, m0, 11111110b
    PRED4x4_LOWPASS m1, m0, m5, m1
    movq      m6, m2
    punpcklwd m6, m1           ; interleave averages and lowpass values
    movq      [r0+r2*1], m6
    psrlq     m2, 16
    psrlq     m1, 16
    punpcklwd m2, m1
    movq      [r0+r2*2], m2
    psrlq     m2, 32
    movd      [r1+r2*1], m2
    RET


;-----------------------------------------------------------------------------
; void ff_pred8x8_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
; Replicates the row above the block into all 8 rows (one 16-byte store/row).
INIT_XMM sse2
cglobal pred8x8_vertical_10, 2, 2
    sub  r0, r1
    mova m0, [r0]
%rep 3
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    lea  r0, [r0+r1*2]
%endrep
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
; Each row is the pixel immediately to its left, broadcast across the row.
INIT_XMM sse2
cglobal pred8x8_horizontal_10, 2, 3
    mov        r2d, 4
.loop:
    movq       m0, [r0+r1*0-8]
    movq       m1, [r0+r1*1-8]
    pshuflw    m0, m0, 0xff
    pshuflw    m1, m1, 0xff
    punpcklqdq m0, m0
    punpcklqdq m1, m1
    mova       [r0+r1*0], m0
    mova       [r0+r1*1], m1
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_predict_8x8_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
; Store one 8-pixel (16-byte) row: two mmx stores or one xmm store.
%macro MOV8 2-3
; sort of a hack, but it works
%if mmsize==8
    movq   [%1+0], %2
    movq   [%1+8], %3
%else
    movdqa [%1], %2
%endif
%endmacro

; %1 = pshufw (mmxext) or pshuflw (sse2): low-quadword word shuffle.
; Computes the four per-quadrant DC values s0..s3 of the 8x8 chroma-style DC
; prediction, then broadcasts them into the four 4x4 quadrants.
%macro PRED8x8_DC 1
cglobal pred8x8_dc_10, 2, 6
    sub         r0, r1
    pxor        m4, m4
    movq        m0, [r0+0]
    movq        m1, [r0+8]
%if mmsize==16
    punpcklwd   m0, m1
    movhlps     m1, m0
    paddw       m0, m1
%else
    pshufw      m2, m0, 00001110b
    pshufw      m3, m1, 00001110b
    paddw       m0, m2
    paddw       m1, m3
    punpcklwd   m0, m1
%endif
    %1          m2, m0, 00001110b
    paddw       m0, m2             ; word 0/1 = sum of left/right top half

    lea         r5, [r1*3]
    lea         r4, [r0+r1*4]
    movzx       r2d, word [r0+r1*1-2]
    movzx       r3d, word [r0+r1*2-2]
    add         r2d, r3d
    movzx       r3d, word [r0+r5*1-2]
    add         r2d, r3d
    movzx       r3d, word [r4-2]
    add         r2d, r3d
    movd        m2, r2d            ; s2

    movzx       r2d, word [r4+r1*1-2]
    movzx       r3d, word [r4+r1*2-2]
    add         r2d, r3d
    movzx       r3d, word [r4+r5*1-2]
    add         r2d, r3d
    movzx       r3d, word [r4+r1*4-2]
    add         r2d, r3d
    movd        m3, r2d            ; s3

    punpcklwd   m2, m3
    punpckldq   m0, m2             ; s0, s1, s2, s3
    %1          m3, m0, 11110110b  ; s2, s1, s3, s3
    %1          m0, m0, 01110100b  ; s0, s1, s3, s1
    paddw       m0, m3
    psrlw       m0, 2
    pavgw       m0, m4             ; s0+s2, s1, s3, s1+s3
%if mmsize==16
    punpcklwd   m0, m0
    pshufd      m3, m0, 11111010b
    punpckldq   m0, m0
    SWAP        0,1
%else
    pshufw      m1, m0, 0x00
    pshufw      m2, m0, 0x55
    pshufw      m3, m0, 0xaa
    pshufw      m4, m0, 0xff
%endif
    MOV8        r0+r1*1, m1, m2
    MOV8        r0+r1*2, m1, m2
    MOV8        r0+r5*1, m1, m2
    MOV8        r0+r1*4, m1, m2
    MOV8        r4+r1*1, m3, m4
    MOV8        r4+r1*2, m3, m4
    MOV8        r4+r5*1, m3, m4
    MOV8        r4+r1*4, m3, m4
    RET
%endmacro

INIT_MMX mmxext
PRED8x8_DC pshufw
INIT_XMM sse2
PRED8x8_DC pshuflw

;-----------------------------------------------------------------------------
; void ff_pred8x8_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
; Two per-half DC values computed from the top row only; log-time horizontal
; add via pairwise shuffles keeps each half's sum within its own quadword.
INIT_XMM sse2
cglobal pred8x8_top_dc_10, 2, 4
    sub     r0, r1
    mova    m0, [r0]
    pshuflw m1, m0, 0x4e
    pshufhw m1, m1, 0x4e
    paddw   m0, m1
    pshuflw m1, m0, 0xb1
    pshufhw m1, m1, 0xb1
    paddw   m0, m1
    lea     r2, [r1*3]
    lea     r3, [r0+r1*4]
    paddw   m0, [pw_2]
    psrlw   m0, 2
    mova    [r0+r1*1], m0
    mova    [r0+r1*2], m0
    mova    [r0+r2*1], m0
    mova    [r0+r1*4], m0
    mova    [r3+r1*1], m0
    mova    [r3+r1*2], m0
    mova    [r3+r2*1], m0
    mova    [r3+r1*4], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_plane(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_plane_10, 2, 7, 7
    sub       r0, r1
    lea       r2, [r1*3]
    lea       r3, [r0+r1*4]
    mova      m2, [r0]
    pmaddwd   m2, [pw_m32101234]
    HADDD     m2, m1
    movd      m0, [r0-4]
    psrld     m0, 14              ; = 4*src[-stride-1] (10-bit low word shifts out)
    psubw     m2, m0              ; H
    movd      m0, [r3+r1*4-4]
    movd      m1, [r0+12]
    paddw     m0, m1
    psllw     m0, 4               ; 16*(src[7*stride-1] + src[-stride+7])
    movzx     r4d, word [r3+r1*1-2] ; src[4*stride-1]
    movzx     r5d, word [r0+r2*1-2] ; src[2*stride-1]
    sub       r4d, r5d
    movzx     r6d, word [r3+r1*2-2] ; src[5*stride-1]
    movzx     r5d, word [r0+r1*2-2] ; src[1*stride-1]
    sub       r6d, r5d
    lea       r4d, [r4+r6*2]
    movzx     r5d, word [r3+r2*1-2] ; src[6*stride-1]
    movzx     r6d, word [r0+r1*1-2] ; src[0*stride-1]
    sub       r5d, r6d
    lea       r5d, [r5*3]
    add       r4d, r5d
    movzx     r6d, word [r3+r1*4-2] ; src[7*stride-1]
    movzx     r5d, word [r0+r1*0-2] ; src[ -stride-1]
    sub       r6d, r5d
    lea       r4d, [r4+r6*4]
    movd      m3, r4d             ; V
    punpckldq m2, m3
    pmaddwd   m2, [pd_17]
    paddd     m2, [pd_16]
    psrad     m2, 5               ; b, c

    mova      m3, [pw_pixel_max]
    pxor      m1, m1
    SPLATW    m0, m0, 1
    SPLATW    m4, m2, 2
    SPLATW    m2, m2, 0
    pmullw    m2, [pw_m32101234]  ; b
    pmullw    m5, m4, [pw_m3]     ; c
    paddw     m5, [pw_16]
    mov       r2d, 8
    add       r0, r1
.loop:
    paddsw    m6, m2, m5
    paddsw    m6, m0
    psraw     m6, 5
    CLIPW     m6, m1, m3          ; clamp to [0, 1023]
    mova      [r0], m6
    paddw     m5, m4              ; advance the per-row (c) term
    add       r0, r1
    dec       r2d
    jg .loop
    REP_RET


;-----------------------------------------------------------------------------
; void ff_pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright,
;                         int stride)
;-----------------------------------------------------------------------------
; Fills the block with the mid-grey value; availability flags are ignored.
%macro PRED8x8L_128_DC 0
cglobal pred8x8l_128_dc_10, 4, 4
    mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
    lea  r1, [r3*3]
    lea  r2, [r0+r3*4]
    MOV8 r0+r3*0, m0, m0
    MOV8 r0+r3*1, m0, m0
    MOV8 r0+r3*2, m0, m0
    MOV8 r0+r1*1, m0, m0
    MOV8 r2+r3*0, m0, m0
    MOV8 r2+r3*1, m0, m0
    MOV8 r2+r3*2, m0, m0
    MOV8 r2+r1*1, m0, m0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_128_DC
INIT_XMM sse2
PRED8x8L_128_DC

;-----------------------------------------------------------------------------
; void ff_pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright,
;                         int stride)
;-----------------------------------------------------------------------------
; NOTE(review): has_topleft/has_topright appear to arrive as bit masks
; (0 or 0x8000 / 0x4000) rather than 0/1: shr 14 / shr 13 turn them into a
; 0-or-2 byte offset selecting the real neighbour vs. an edge replica —
; confirm against the callers in h264pred.
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_10, 4, 4, 6
    sub         r0, r3
    mova        m0, [r0]
    shr         r1d, 14
    shr         r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0    ; topleft if available, else replicate t0
    pinsrw      m2, [r0+r2+14], 7 ; topright if available, else replicate t7
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    HADDW       m0, m1
    paddw       m0, [pw_4]
    psrlw       m0, 3
    SPLATW      m0, m0, 0
    mova        [r0+r3*1], m0
    mova        [r0+r3*2], m0
    mova        [r0+r1*1], m0
    mova        [r0+r3*4], m0
    mova        [r2+r3*1], m0
    mova        [r2+r3*2], m0
    mova        [r2+r1*1], m0
    mova        [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_TOP_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_TOP_DC
%endif

;-------------------------------------------------------------------------------
; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-------------------------------------------------------------------------------
;TODO: see if scalar is faster
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_10, 4, 6, 6
    sub         r0, r3
    lea         r4, [r0+r3*4]
    lea         r5, [r3*3]
    ; gather the 8 left-column pixels into m3 (l0..l7)
    mova        m0, [r0+r3*2-16]
    punpckhwd   m0, [r0+r3*1-16]
    mova        m1, [r4+r3*0-16]
    punpckhwd   m1, [r0+r5*1-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*2-16]
    punpckhwd   m2, [r4+r3*1-16]
    mova        m3, [r4+r3*4-16]
    punpckhwd   m3, [r4+r5*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r0]
    shr         r1d, 14           ; see PRED8x8L_TOP_DC note on flag masks
    shr         r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    not         r1
    and         r1, r3
    pslldq      m4, m3, 2
    psrldq      m5, m3, 2
    pshuflw     m4, m4, 11100101b
    pinsrw      m5, [r0+r1-2], 7
    PRED4x4_LOWPASS m3, m4, m5, m3 ; filtered left column
    PRED4x4_LOWPASS m0, m2, m1, m0 ; filtered top row
    paddw       m0, m3
    HADDW       m0, m1
    paddw       m0, [pw_8]
    psrlw       m0, 4             ; dc = (sum(top')+sum(left')+8) >> 4
    SPLATW      m0, m0
    mova        [r0+r3*1], m0
    mova        [r0+r3*2], m0
    mova        [r0+r5*1], m0
    mova        [r0+r3*4], m0
    mova        [r4+r3*1], m0
    mova        [r4+r3*2], m0
    mova        [r4+r5*1], m0
    mova        [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DC
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright,
;                           int stride)
;-----------------------------------------------------------------------------
; Lowpass-filters the top row (with edge handling as in PRED8x8L_TOP_DC) and
; copies it into all 8 rows.
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_10, 4, 4, 6
    sub         r0, r3
    mova        m0, [r0]
    shr         r1d, 14
    shr         r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    mova        [r0+r3*1], m0
    mova        [r0+r3*2], m0
    mova        [r0+r1*1], m0
    mova        [r0+r3*4], m0
    mova        [r2+r3*1], m0
    mova        [r2+r3*2], m0
    mova        [r2+r1*1], m0
    mova        [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright,
;                             int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_10, 4, 4, 5
    mova        m0, [r0-16]
    ; r1 becomes -stride when topleft is available, 0 otherwise
    ; (mask trick; see PRED8x8L_TOP_DC note)
    shr         r1d, 14
    dec         r1
    and         r1, r3
    sub         r1, r3
    punpckhwd   m0, [r0+r1-16]
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r3*1-16]
    lea         r2, [r0+r3*4]
    lea         r1, [r3*3]
    punpckhdq   m1, m0
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r0+r1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r3*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1            ; left column (plus topleft), packed
    PALIGNR     m4, m3, [r2+r1-16], 14, m0
    pslldq      m0, m4, 2
    pshuflw     m0, m0, 11100101b
    PRED4x4_LOWPASS m4, m3, m0, m4
    punpckhwd   m3, m4, m4        ; duplicate each filtered pixel pairwise
    punpcklwd   m4, m4
    pshufd      m0, m3, 0xff      ; broadcast one pixel per row
    pshufd      m1, m3, 0xaa
    pshufd      m2, m3, 0x55
    pshufd      m3, m3, 0x00
    mova        [r0+r3*0], m0
    mova        [r0+r3*1], m1
    mova        [r0+r3*2], m2
    mova        [r0+r1*1], m3
    pshufd      m0, m4, 0xff
    pshufd      m1, m4, 0xaa
    pshufd      m2, m4, 0x55
    pshufd      m3, m4, 0x00
    mova        [r2+r3*0], m0
    mova        [r2+r3*1], m1
    mova        [r2+r3*2], m2
    mova        [r2+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright,
;                            int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_10, 4, 4, 7
    sub         r0, r3
    mova        m3, [r0]
    shr         r1d, 14
    neg         r1
    shr         r2d, 13
    pslldq      m1, m3, 2
    psrldq      m2, m3, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m6, m2, m1, m3
    jz .fix_tr ; flags from shr r2d
    mova        m1, [r0+16]
    psrldq      m5, m1, 2
    PALIGNR     m2, m1, m3, 14, m3
    pshufhw     m5, m5, 10100100b
    PRED4x4_LOWPASS m1, m2, m5, m1
.do_topright:
    lea         r1, [r3*3]
    psrldq      m5, m1, 14
    lea         r2, [r0+r3*4]
    PALIGNR     m2, m1, m6, 2, m0
    PALIGNR     m3, m1, m6, 14, m0
    PALIGNR     m5, m1, 2, m0
    pslldq      m4, m6, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m1, m3, m5, m1
    ; emit the diagonal: each row is the previous one shifted left one pixel
    mova        [r2+r3*4], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova        [r2+r1*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova        [r2+r3*2], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova        [r2+r3*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova        [r0+r3*4], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova        [r0+r1*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova        [r0+r3*2], m1
    PALIGNR     m1, m6, 14, m6
    mova        [r0+r3*1], m1
    RET
.fix_tr:
    ; no topright: extend the last top pixel across the whole topright vector
    punpckhwd   m3, m3
    pshufd      m1, m3, 0xFF
    jmp .do_topright
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
PRED8x8L_DOWN_LEFT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_LEFT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright,
;                             int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_RIGHT 0
; standard forbids this when has_topleft is false
; no need to check
cglobal pred8x8l_down_right_10, 4, 5, 8
    sub         r0, r3
    lea         r4, [r0+r3*4]
    lea         r1, [r3*3]
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r0+r1*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*1-16]
    punpckhwd   m2, [r4+r3*0-16]
    mova        m3, [r4+r1*1-16]
    punpckhwd   m3, [r4+r3*2-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1            ; left column, topmost pixel in high word
    mova        m0, [r4+r3*4-16]
    mova        m1, [r0]
    PALIGNR     m4, m3, m0, 14, m0
    PALIGNR     m1, m3, 2, m2
    pslldq      m0, m4, 2
    pshuflw     m0, m0, 11100101b
    PRED4x4_LOWPASS m6, m1, m4, m3
    PRED4x4_LOWPASS m4, m3, m0, m4
    mova        m3, [r0]
    shr         r2d, 13
    pslldq      m1, m3, 2
    psrldq      m2, m3, 2
    pinsrw      m1, [r0-2], 0
    pinsrw      m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m3, m2, m1, m3
    PALIGNR     m2, m3, m6, 2, m0
    PALIGNR     m5, m3, m6, 14, m0
    psrldq      m7, m3, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m3, m5, m7, m3
    mova        [r4+r3*4], m6
    ; remaining rows: shift one pixel in from the left-edge vector per row
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova        [r0+r3*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova        [r0+r3*2], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova        [r0+r1*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova        [r0+r3*4], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova        [r4+r3*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova        [r4+r3*2], m3
    PALIGNR     m3, m6, 14, m6
    mova        [r4+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft,
;                                 int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_RIGHT 0
; likewise with 8x8l_down_right
cglobal pred8x8l_vertical_right_10, 4, 5, 7
    sub         r0, r3
    lea         r4, [r0+r3*4]
    lea         r1, [r3*3]
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r0+r1*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*1-16]
    punpckhwd   m2, [r4+r3*0-16]
    mova        m3, [r4+r1*1-16]
    punpckhwd   m3, [r4+r3*2-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r4+r3*4-16]
    mova        m1, [r0]
    PALIGNR     m4, m3, m0, 14, m0
    PALIGNR     m1, m3, 2, m2
    PRED4x4_LOWPASS m3, m1, m4, m3 ; filtered left column
    mova        m2, [r0]
    shr         r2d, 13
    pslldq      m1, m2, 2
    psrldq      m5, m2, 2
    pinsrw      m1, [r0-2], 0
    pinsrw      m5, [r0+r2+14], 7
    PRED4x4_LOWPASS m2, m5, m1, m2 ; filtered top row
    PALIGNR     m6, m2, m3, 12, m1
    PALIGNR     m5, m2, m3, 14, m0
    PRED4x4_LOWPASS m0, m6, m2, m5 ; odd rows
    pavgw       m2, m5             ; even rows
    mova        [r0+r3*2], m0
    mova        [r0+r3*1], m2
    pslldq      m6, m3, 4
    pslldq      m1, m3, 2
    PRED4x4_LOWPASS m1, m3, m6, m1
    PALIGNR     m2, m1, 14, m4
    mova        [r0+r1*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova        [r0+r3*4], m0
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova        [r4+r3*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova        [r4+r3*2], m0
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova        [r4+r1*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m1
    mova        [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft,
;                                int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_10, 4, 4, 6
    mova        m0, [r0+r3*0-16]
    punpckhwd   m0, [r0+r3*1-16]
    ; r1 = -stride when topleft is available, else 0 (mask trick)
    shr         r1d, 14
    dec         r1
    and         r1, r3
    sub         r1, r3
    mova        m4, [r0+r1*1-16]
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r1*1-16]
    punpckhdq   m0, m1
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r2+r3*1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r1*1-16]
    punpckhdq   m2, m3
    punpckhqdq  m0, m2             ; l0..l7
    PALIGNR     m1, m0, m4, 14, m4
    psrldq      m2, m0, 2
    pshufhw     m2, m2, 10100100b  ; replicate l7 into the vacated word
    PRED4x4_LOWPASS m0, m1, m2, m0
    psrldq      m1, m0, 2
    psrldq      m2, m0, 4
    pshufhw     m1, m1, 10100100b
    pshufhw     m2, m2, 01010100b
    pavgw       m4, m0, m1
    PRED4x4_LOWPASS m1, m2, m0, m1
    punpckhwd   m5, m4, m1         ; interleave averages with lowpass values
    punpcklwd   m4, m1
    mova        [r2+r3*0], m5
    mova        [r0+r3*0], m4
    pshufd      m0, m5, 11111001b
    pshufd      m1, m5, 11111110b
    pshufd      m2, m5, 11111111b
    mova        [r2+r3*1], m0
    mova        [r2+r3*2], m1
    mova        [r2+r1*1], m2
    PALIGNR     m2, m5, m4, 4, m0
    PALIGNR     m3, m5, m4, 8, m1
    PALIGNR     m5, m5, m4, 12, m4
    mova        [r0+r3*1], m2
    mova        [r0+r3*2], m3
    mova        [r0+r1*1], m5
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
PRED8x8L_HORIZONTAL_UP
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP
%endif

;-----------------------------------------------------------------------------
; void ff_pred16x16_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
; Store one 16-pixel (32-byte) row: four mmx stores or two xmm stores.
%macro MOV16 3-5
    mova [%1+     0], %2
    mova [%1+mmsize], %3
%if mmsize==8
    mova [%1+    16], %4
    mova [%1+    24], %5
%endif
%endmacro

%macro PRED16x16_VERTICAL 0
cglobal pred16x16_vertical_10, 2, 3
    sub   r0, r1
    mov   r2d, 8
    mova  m0, [r0+     0]
    mova  m1, [r0+mmsize]
%if mmsize==8
    mova  m2, [r0+16]
    mova  m3, [r0+24]
%endif
.loop:
    MOV16 r0+r1*1, m0, m1, m2, m3
    MOV16 r0+r1*2, m0, m1, m2, m3
    lea   r0, [r0+r1*2]
    dec   r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_VERTICAL
INIT_XMM sse2
PRED16x16_VERTICAL

;-----------------------------------------------------------------------------
; void ff_pred16x16_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
; Each row is its left neighbour broadcast across the row.
%macro PRED16x16_HORIZONTAL 0
cglobal pred16x16_horizontal_10, 2, 3
    mov    r2d, 8
.vloop:
    movd   m0, [r0+r1*0-4]
    movd   m1, [r0+r1*1-4]
    SPLATW m0, m0, 1
    SPLATW m1, m1, 1
    MOV16  r0+r1*0, m0, m0, m0, m0
    MOV16  r0+r1*1, m1, m1, m1, m1
    lea    r0, [r0+r1*2]
    dec    r2d
    jg .vloop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_HORIZONTAL
INIT_XMM sse2
PRED16x16_HORIZONTAL

;-----------------------------------------------------------------------------
; void ff_pred16x16_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
; dc = (sum of 16 top + 16 left pixels + 16) >> 5; 32 x 10-bit values
; (max 32736) still fit the 16-bit word the scalar sum is folded into.
%macro PRED16x16_DC 0
cglobal pred16x16_dc_10, 2, 6
    mov        r5, r0
    sub        r0, r1
    mova       m0, [r0+0]
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2

    lea        r0, [r0+r1-2]
    movzx      r3d, word [r0]
    movzx      r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]
    movzx      r2d, word [r0]
    add        r3d, r2d
    movzx      r2d, word [r0+r1]
    add        r4d, r2d
%endrep
    lea        r3d, [r3+r4+16]

    movd       m1, r3d
    paddw      m0, m1
    psrlw      m0, 5
    SPLATW     m0, m0
    mov        r3d, 8
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec        r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_TOP_DC 0
cglobal pred16x16_top_dc_10, 2, 3
    sub    r0, r1
    mova   m0, [r0+0]
    paddw  m0, [r0+mmsize]
%if mmsize==8
    paddw  m0, [r0+16]
    paddw  m0, [r0+24]
%endif
    HADDW  m0, m2

    SPLATW m0, m0
    paddw  m0, [pw_8]
    psrlw  m0, 4       ; per-lane (sum+8)>>4 == broadcast dc
    mov    r2d, 8
.loop:
    MOV16 r0+r1*1, m0, m0, m0, m0
    MOV16 r0+r1*2, m0, m0, m0, m0
    lea    r0, [r0+r1*2]
    dec    r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_TOP_DC
INIT_XMM sse2
PRED16x16_TOP_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_left_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
; Sums the 16 left-column pixels with scalar loads (two accumulators to
; shorten the dependency chain), then broadcasts (sum+8)>>4.
%macro PRED16x16_LEFT_DC 0
cglobal pred16x16_left_dc_10, 2, 6
    mov    r5, r0

    sub    r0, 2
    movzx  r3d, word [r0]
    movzx  r4d, word [r0+r1]
%rep 7
    lea    r0, [r0+r1*2]
    movzx  r2d, word [r0]
    add    r3d, r2d
    movzx  r2d, word [r0+r1]
    add    r4d, r2d
%endrep
    lea    r3d, [r3+r4+8]
    shr    r3d, 4

    movd   m0, r3d
    SPLATW m0, m0
    mov    r3d, 8
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea    r5, [r5+r1*2]
    dec    r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_LEFT_DC
INIT_XMM sse2
PRED16x16_LEFT_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_128_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
; Fills the block with mid-grey (512 = 1<<(BIT_DEPTH-1)).
%macro PRED16x16_128_DC 0
cglobal pred16x16_128_dc_10, 2,3
    mova  m0, [pw_512]
    mov   r2d, 8
.loop:
    MOV16 r0+r1*0, m0, m0, m0, m0
    MOV16 r0+r1*1, m0, m0, m0, m0
    lea   r0, [r0+r1*2]
    dec   r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_128_DC
INIT_XMM sse2
PRED16x16_128_DC