;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1

; Weights used by pred8x8_plane for the horizontal-gradient dot product.
pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3:        times 8 dw -3
pw_pixel_max: times 8 dw ((1 << 10)-1)   ; 1023 = largest 10-bit sample value
pw_512:       times 8 dw 512             ; mid-grey for 10-bit, 1<<(BIT_DEPTH-1)
pd_17:        times 4 dd 17
pd_16:        times 4 dd 16

SECTION .text

; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; 3-tap H.264 lowpass filter built from pavgw; %2 (left) is clobbered as
; scratch: %2 = (left+right)>>1, then %1 = avg(src, that) with rounding.
%macro PRED4x4_LOWPASS 4
    paddw       %2, %3
    psrlw       %2, 1
    pavgw       %1, %4, %2
%endmacro

;-----------------------------------------------------------------------------
; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
; In:  r0 = src, r1 = topright (not read; immediately repurposed as a row
;      pointer), r2 = row stride (used directly as a byte offset).
; Gathers left column + top-left + top row into one xmm register, lowpass
; filters it, then stores four copies shifted right by one pixel per row.
%macro PRED4x4_DR 1
cglobal pred4x4_down_right_10_%1, 3,3
    sub       r0, r2                ; r0 -> row above the block (top neighbours)
    lea       r1, [r0+r2*2]         ; r1 -> src + 1 row (r1 no longer topright)
    movhps    m1, [r1-8]            ; left neighbour, row 1
    movhps    m2, [r0+r2*1-8]       ; left neighbour, row 0
    movhps    m4, [r0-8]            ; top-left
    punpckhwd m2, m4
    movq      m3, [r0]              ; four top pixels
    punpckhdq m1, m2                ; pack l1 l0 lt into high words
    PALIGNR   m3, m1, 10, m1        ; t3 t2 t1 t0 lt l0 l1 ..
    movhps    m4, [r1+r2*1-8]       ; left neighbour, row 2
    PALIGNR   m0, m3, m4, 14, m4    ; shift in l2
    movhps    m4, [r1+r2*2-8]       ; left neighbour, row 3
    PALIGNR   m2, m0, m4, 14, m4    ; shift in l3
    PRED4x4_LOWPASS m0, m2, m3, m0
    movq      [r1+r2*2], m0         ; each row is the previous one shifted
    psrldq    m0, 2                 ; right by one 16-bit pixel
    movq      [r1+r2*1], m0
    psrldq    m0, 2
    movq      [r0+r2*2], m0
    psrldq    m0, 2
    movq      [r0+r2*1], m0
    RET
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED4x4_DR sse2
%define PALIGNR PALIGNR_SSSE3
PRED4x4_DR ssse3
%ifdef HAVE_AVX
INIT_AVX
PRED4x4_DR avx
%endif

;-----------------------------------------------------------------------------
; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
; In:  r0 = src, r1 = topright (unused, repurposed), r2 = byte stride.
; Even rows use avg(top, top-shifted); odd rows use the lowpass-filtered
; neighbours; lower rows shift in left-column samples via PALIGNR.
%macro PRED4x4_VR 1
cglobal pred4x4_vertical_right_10_%1, 3,3,6
    sub     r0, r2                  ; r0 -> row of top neighbours
    lea     r1, [r0+r2*2]
    movq    m5, [r0]                ; ........t3t2t1t0
    movhps  m1, [r0-8]
    PALIGNR m0, m5, m1, 14, m1      ; ......t3t2t1t0lt
    pavgw   m5, m0                  ; rows 0/2: average of top and top<<1
    movhps  m1, [r0+r2*1-8]
    PALIGNR m0, m1, 14, m1          ; ....t3t2t1t0ltl0
    movhps  m2, [r0+r2*2-8]
    PALIGNR m1, m0, m2, 14, m2      ; ..t3t2t1t0ltl0l1
    movhps  m3, [r1+r2*1-8]
    PALIGNR m2, m1, m3, 14, m3      ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m1, m0, m2, m1
    pslldq  m0, m1, 12              ; keep the left-column taps for rows 2/3
    psrldq  m1, 4
    movq    [r0+r2*1], m5
    movq    [r0+r2*2], m1
    PALIGNR m5, m0, 14, m2          ; shift a left sample into row 2
    pslldq  m0, 2
    movq    [r1+r2*1], m5
    PALIGNR m1, m0, 14, m0          ; shift a left sample into row 3
    movq    [r1+r2*2], m1
    RET
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED4x4_VR sse2
%define PALIGNR PALIGNR_SSSE3
PRED4x4_VR ssse3
%ifdef HAVE_AVX
INIT_AVX
PRED4x4_VR avx
%endif

;-----------------------------------------------------------------------------
; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
; In:  r0 = src, r1 = topright (unused, repurposed), r2 = byte stride.
; Packs the left column, top-left and top row into one register, builds the
; averaged and lowpass-filtered diagonals, and interleaves them per row.
%macro PRED4x4_HD 1
cglobal pred4x4_horizontal_down_10_%1, 3,3
    sub        r0, r2               ; r0 -> row of top neighbours
    lea        r1, [r0+r2*2]
    movq       m0, [r0-8]           ; lt ..
    movhps     m0, [r0]
    pslldq     m0, 2                ; t2 t1 t0 lt .. .. .. ..
    movq       m1, [r1+r2*2-8]      ; l3
    movq       m3, [r1+r2*1-8]
    punpcklwd  m1, m3               ; l2 l3
    movq       m2, [r0+r2*2-8]      ; l1
    movq       m3, [r0+r2*1-8]
    punpcklwd  m2, m3               ; l0 l1
    punpckhdq  m1, m2               ; l0 l1 l2 l3
    punpckhqdq m1, m0               ; t2 t1 t0 lt l0 l1 l2 l3
    psrldq     m0, m1, 4            ; .. .. t2 t1 t0 lt l0 l1
    psrldq     m3, m1, 2            ; .. t2 t1 t0 lt l0 l1 l2
    pavgw      m5, m1, m3           ; 2-tap averages (horizontal pairs)
    PRED4x4_LOWPASS m3, m1, m0, m3  ; 3-tap filtered diagonals
    punpcklwd  m5, m3               ; interleave avg/filtered per column
    psrldq     m3, 8
    PALIGNR    m3, m5, 12, m4
    movq       [r1+r2*2], m5        ; each row steps two words up the pack
    movhps     [r0+r2*2], m5
    psrldq     m5, 4
    movq       [r1+r2*1], m5
    movq       [r0+r2*1], m3
    RET
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED4x4_HD sse2
%define PALIGNR PALIGNR_SSSE3
PRED4x4_HD ssse3
%ifdef HAVE_AVX
INIT_AVX
PRED4x4_HD avx
%endif

;-----------------------------------------------------------------------------
; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
; Horizontal add of the dwords of %1 into its low dword; %2 is scratch.
%macro HADDD 2 ; sum junk
%if mmsize == 16
    movhlps %2, %1
    paddd   %1, %2
    pshuflw %2, %1, 0xE
    paddd   %1, %2
%else
    pshufw  %2, %1, 0xE
    paddd   %1, %2
%endif
%endmacro

; Horizontal add of the words of %1 (sum ends up in the low dword of %1);
; %2 is scratch. Uses pmaddwd by 1s to pair-sum, then HADDD.
%macro HADDW 2
    pmaddwd %1, [pw_1]
    HADDD   %1, %2
%endmacro

; DC prediction: fill the 4x4 block with (sum of 4 left + 4 top + 4) >> 3.
; In: r0 = src, r1 = topright (unused, repurposed), r2 = byte stride.
INIT_MMX
cglobal pred4x4_dc_10_mmxext, 3,3
    sub    r0, r2                   ; r0 -> row of top neighbours
    lea    r1, [r0+r2*2]
    movq   m2, [r0+r2*1-8]          ; accumulate the 4 left-column samples;
    paddw  m2, [r0+r2*2-8]          ; only the top word of each load is a
    paddw  m2, [r1+r2*1-8]          ; neighbour, hence the psrlq below
    paddw  m2, [r1+r2*2-8]
    psrlq  m2, 48                   ; keep just the summed left column word
    movq   m0, [r0]                 ; 4 top pixels
    HADDW  m0, m1
    paddw  m0, [pw_4]               ; rounding: (top+left+4)>>3
    paddw  m0, m2
    psrlw  m0, 3
    SPLATW m0, m0, 0                ; broadcast DC to all 4 lanes
    movq   [r0+r2*1], m0
    movq   [r0+r2*2], m0
    movq   [r1+r2*1], m0
    movq   [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
; In:  r0 = src, r1 = topright (4 pixels above-right), r2 = byte stride.
; Lowpass-filters top+topright; each output row is the previous shifted left.
%macro PRED4x4_DL 1
cglobal pred4x4_down_left_10_%1, 3,3
    sub        r0, r2               ; r0 -> row of top neighbours
    movq       m0, [r0]             ; 4 top pixels in the low half
    movhps     m0, [r1]             ; 4 topright pixels in the high half
    psrldq     m2, m0, 2
    pslldq     m3, m0, 2
    pshufhw    m2, m2, 10100100b    ; duplicate the last sample (t7 edge)
    PRED4x4_LOWPASS m0, m3, m2, m0
    lea        r1, [r0+r2*2]        ; r1 repurposed as a row pointer
    movhps     [r1+r2*2], m0
    psrldq     m0, 2                ; advance the diagonal one pixel per row
    movq       [r0+r2*1], m0
    psrldq     m0, 2
    movq       [r0+r2*2], m0
    psrldq     m0, 2
    movq       [r1+r2*1], m0
    RET
%endmacro

INIT_XMM
PRED4x4_DL sse2
%ifdef HAVE_AVX
INIT_AVX
PRED4x4_DL avx
%endif

;-----------------------------------------------------------------------------
; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
; In:  r0 = src, r1 = topright, r2 = byte stride.
; Even rows: 2-tap averages of top/topright; odd rows: 3-tap lowpass.
%macro PRED4x4_VL 1
cglobal pred4x4_vertical_left_10_%1, 3,3
    sub        r0, r2               ; r0 -> row of top neighbours
    movu       m1, [r0]
    movhps     m1, [r1]             ; top + topright in one register
    psrldq     m0, m1, 2
    psrldq     m2, m1, 4
    pavgw      m4, m0, m1           ; rows 0/2
    PRED4x4_LOWPASS m0, m1, m2, m0  ; rows 1/3
    lea        r1, [r0+r2*2]
    movq       [r0+r2*1], m4
    movq       [r0+r2*2], m0
    psrldq     m4, 2                ; lower rows advance by one pixel
    psrldq     m0, 2
    movq       [r1+r2*1], m4
    movq       [r1+r2*2], m0
    RET
%endmacro

INIT_XMM
PRED4x4_VL sse2
%ifdef HAVE_AVX
INIT_AVX
PRED4x4_VL avx
%endif

;-----------------------------------------------------------------------------
; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
; In:  r0 = src, r1 = topright (unused, repurposed), r2 = byte stride.
; MMX version: builds the averaged/filtered left-column sequence and fills
; the tail rows with the replicated last left sample (l3).
INIT_MMX
cglobal pred4x4_horizontal_up_10_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movq      m0, [r0+r2*1-8]       ; gather l0..l3 into one register
    punpckhwd m0, [r0+r2*2-8]
    movq      m1, [r1+r2*1-8]
    punpckhwd m1, [r1+r2*2-8]
    punpckhdq m0, m1                ; l0 l1 l2 l3
    pshufw    m1, m1, 0xFF          ; broadcast l3
    movq      [r1+r2*2], m1         ; bottom row is all l3
    movd      [r1+r2*1+4], m1       ; right half of row 2 is also l3
    pshufw    m2, m0, 11111001b
    movq      m1, m2
    pavgw     m2, m0                ; 2-tap averages

    pshufw    m5, m0, 11111110b
    PRED4x4_LOWPASS m1, m0, m5, m1  ; 3-tap filtered values
    movq      m6, m2
    punpcklwd m6, m1                ; interleave avg/filtered
    movq      [r0+r2*1], m6
    psrlq     m2, 16
    psrlq     m1, 16
    punpcklwd m2, m1
    movq      [r0+r2*2], m2
    psrlq     m2, 32
    movd      [r1+r2*1], m2         ; left half of row 2
    RET


;-----------------------------------------------------------------------------
; void pred8x8_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
; Copy the row above the block into all 8 rows. r0 = src, r1 = byte stride.
INIT_XMM
cglobal pred8x8_vertical_10_sse2, 2,2
    sub  r0, r1                     ; r0 -> row of top neighbours
    mova m0, [r0]                   ; 8 pixels = one full xmm row
%rep 3
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    lea  r0, [r0+r1*2]
%endrep
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
; Broadcast each row's left neighbour across that row, two rows per
; iteration. r0 = src, r1 = byte stride, r2 = loop counter.
INIT_XMM
cglobal pred8x8_horizontal_10_sse2, 2,3
    mov        r2d, 4
.loop:
    movq        m0, [r0+r1*0-8]     ; left neighbour in the top word
    movq        m1, [r0+r1*1-8]
    pshuflw     m0, m0, 0xff        ; splat it across the low 4 words
    pshuflw     m1, m1, 0xff
    punpcklqdq  m0, m0              ; then across the full register
    punpcklqdq  m1, m1
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea         r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void predict_8x8_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
; Store one 8-pixel (16-byte) row from %2/%3 regardless of register width.
%macro MOV8 2-3
; sort of a hack, but it works
%if mmsize==8
    movq [%1+0], %2
    movq [%1+8], %3
%else
    movdqa [%1], %2
%endif
%endmacro

; %1 = cpu suffix, %2 = word-shuffle instruction (pshufw / pshuflw).
; Computes the four 4x4 DC values (top halves s0,s1; left halves s2,s3)
; and fills each 4x4 quadrant per the H.264 8x8 chroma DC rule.
%macro PRED8x8_DC 2
cglobal pred8x8_dc_10_%1, 2,6
    sub         r0, r1              ; r0 -> row of top neighbours
    pxor        m4, m4              ; zero, used by pavgw for rounding >>1
    movq        m0, [r0+0]          ; left 4 top pixels
    movq        m1, [r0+8]          ; right 4 top pixels
%if mmsize==16
    punpcklwd   m0, m1
    movhlps     m1, m0
    paddw       m0, m1              ; pairwise sums of both halves
%else
    pshufw      m2, m0, 00001110b
    pshufw      m3, m1, 00001110b
    paddw       m0, m2
    paddw       m1, m3
    punpcklwd   m0, m1
%endif
    %2          m2, m0, 00001110b
    paddw       m0, m2              ; s0 (top-left sum), s1 (top-right sum)

    lea         r5, [r1*3]
    lea         r4, [r0+r1*4]       ; r4 -> lower half of the block
    movzx      r2d, word [r0+r1*1-2] ; sum the upper 4 left neighbours
    movzx      r3d, word [r0+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r0+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4-2]
    add        r2d, r3d
    movd        m2, r2d             ; s2

    movzx      r2d, word [r4+r1*1-2] ; sum the lower 4 left neighbours
    movzx      r3d, word [r4+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r1*4-2]
    add        r2d, r3d
    movd        m3, r2d             ; s3

    punpcklwd   m2, m3
    punpckldq   m0, m2              ; s0, s1, s2, s3
    %2          m3, m0, 11110110b   ; s2, s1, s3, s3
    %2          m0, m0, 01110100b   ; s0, s1, s3, s1
    paddw       m0, m3
    psrlw       m0, 2
    pavgw       m0, m4              ; s0+s2, s1, s3, s1+s3 -- (x+4)>>3 rounding
%if mmsize==16
    punpcklwd   m0, m0              ; expand the 4 DC values to 4x4 quadrants
    pshufd      m3, m0, 11111010b
    punpckldq   m0, m0
    SWAP         0,1
%else
    pshufw      m1, m0, 0x00
    pshufw      m2, m0, 0x55
    pshufw      m3, m0, 0xaa
    pshufw      m4, m0, 0xff
%endif
    MOV8  r0+r1*1, m1, m2           ; upper 4 rows: quadrants s0+s2 | s1
    MOV8  r0+r1*2, m1, m2
    MOV8  r0+r5*1, m1, m2
    MOV8  r0+r1*4, m1, m2
    MOV8  r4+r1*1, m3, m4           ; lower 4 rows: quadrants s3 | s1+s3
    MOV8  r4+r1*2, m3, m4
    MOV8  r4+r5*1, m3, m4
    MOV8  r4+r1*4, m3, m4
    RET
%endmacro

INIT_MMX
PRED8x8_DC mmxext, pshufw
INIT_XMM
PRED8x8_DC sse2  , pshuflw

;-----------------------------------------------------------------------------
; void pred8x8_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
; Each 4-wide half of the block gets the DC of its own 4 top neighbours:
; (sum+2)>>2, computed for both halves in parallel via shuffles.
INIT_XMM
cglobal pred8x8_top_dc_10_sse2, 2,4
    sub         r0, r1              ; r0 -> row of top neighbours
    mova        m0, [r0]
    pshuflw     m1, m0, 0x4e        ; swap word pairs and add twice to get
    pshufhw     m1, m1, 0x4e        ; each half's 4-pixel sum in every lane
    paddw       m0, m1
    pshuflw     m1, m0, 0xb1
    pshufhw     m1, m1, 0xb1
    paddw       m0, m1
    lea         r2, [r1*3]
    lea         r3, [r0+r1*4]
    paddw       m0, [pw_2]          ; (sum+2)>>2
    psrlw       m0, 2
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    mova [r0+r2*1], m0
    mova [r0+r1*4], m0
    mova [r3+r1*1], m0
    mova [r3+r1*2], m0
    mova [r3+r2*1], m0
    mova [r3+r1*4], m0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_plane(pixel *src, int stride)
;-----------------------------------------------------------------------------
; H.264 plane (gradient) prediction for 8x8 chroma: fits a plane through the
; top/left borders (H and V gradients, scaled by 17, per the spec) and fills
; the block with a+b*(x-3)+c*(y-3), clipped to [0, pixel_max].
; In: r0 = src, r1 = byte stride. r2-r6 scratch.
INIT_XMM
cglobal pred8x8_plane_10_sse2, 2,7,7
    sub       r0, r1                ; r0 -> row of top neighbours
    lea       r2, [r1*3]
    lea       r3, [r0+r1*4]
    mova      m2, [r0]
    pmaddwd   m2, [pw_m32101234]    ; weighted top row
    HADDD     m2, m1
    movd      m0, [r0-4]
    psrld     m0, 14                ; top-left sample * 4 (weight -4 term)
    psubw     m2, m0                ; H
    movd      m0, [r3+r1*4-4]
    movd      m1, [r0+12]
    paddw     m0, m1
    psllw     m0, 4                 ; 16*(src[7*stride-1] + src[-stride+7])
    movzx    r4d, word [r3+r1*1-2]  ; src[4*stride-1]
    movzx    r5d, word [r0+r2*1-2]  ; src[2*stride-1]
    sub      r4d, r5d
    movzx    r6d, word [r3+r1*2-2]  ; src[5*stride-1]
    movzx    r5d, word [r0+r1*2-2]  ; src[1*stride-1]
    sub      r6d, r5d
    lea      r4d, [r4+r6*2]
    movzx    r5d, word [r3+r2*1-2]  ; src[6*stride-1]
    movzx    r6d, word [r0+r1*1-2]  ; src[0*stride-1]
    sub      r5d, r6d
    lea      r5d, [r5*3]
    add      r4d, r5d
    movzx    r6d, word [r3+r1*4-2]  ; src[7*stride-1]
    movzx    r5d, word [r0+r1*0-2]  ; src[ -stride-1]
    sub      r6d, r5d
    lea      r4d, [r4+r6*4]
    movd      m3, r4d               ; V
    punpckldq m2, m3
    pmaddwd   m2, [pd_17]
    paddd     m2, [pd_16]
    psrad     m2, 5                 ; b, c

    mova      m3, [pw_pixel_max]    ; clip bounds for the output
    pxor      m1, m1
    SPLATW    m0, m0, 1             ; a term, broadcast
    SPLATW    m4, m2, 2
    SPLATW    m2, m2, 0
    pmullw    m2, [pw_m32101234]    ; b
    pmullw    m5, m4, [pw_m3]       ; c
    paddw     m5, [pw_16]           ; +16 for the final >>5 rounding
    mov      r2d, 8
    add       r0, r1
.loop:
    paddsw    m6, m2, m5            ; a + b*(x-3) + c*(y-3), saturating
    paddsw    m6, m0
    psraw     m6, 5
    CLIPW     m6, m1, m3            ; clamp to [0, 1023]
    mova      [r0], m6
    paddw     m5, m4                ; advance the c*(y-3) row term
    add       r0, r1
    dec      r2d
    jg .loop
    REP_RET


;-----------------------------------------------------------------------------
; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; Fill the 8x8 block with mid-grey (512). Availability flags are unused.
; In: r0 = src, r3 = byte stride.
%macro PRED8x8L_128_DC 1
cglobal pred8x8l_128_dc_10_%1, 4,4
    mova   m0, [pw_512] ; (1<<(BIT_DEPTH-1))
    lea    r1, [r3*3]
    lea    r2, [r0+r3*4]
    MOV8   r0+r3*0, m0, m0
    MOV8   r0+r3*1, m0, m0
    MOV8   r0+r3*2, m0, m0
    MOV8   r0+r1*1, m0, m0
    MOV8   r2+r3*0, m0, m0
    MOV8   r2+r3*1, m0, m0
    MOV8   r2+r3*2, m0, m0
    MOV8   r2+r1*1, m0, m0
    RET
%endmacro

INIT_MMX
PRED8x8L_128_DC mmxext
INIT_XMM
PRED8x8L_128_DC sse2

;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; DC from the lowpass-filtered top row only: (sum+4)>>3 broadcast to 8 rows.
; NOTE(review): has_topleft (r1) / has_topright (r2) arrive with their flag
; in a high bit; shr 14 / shr 13 turns them into small byte offsets that pick
; the real neighbour or duplicate the edge sample -- depends on the caller's
; encoding, confirm against the C init code.
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_10_%1, 4,4,6
    sub         r0, r3              ; r0 -> row of top neighbours
    mova        m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg         r1                  ; r1 = 0 or -2: topleft vs. edge duplicate
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7   ; topright sample or edge duplicate
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    HADDW       m0, m1
    paddw       m0, [pw_4]
    psrlw       m0, 3
    SPLATW      m0, m0, 0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r1*1], m0
    mova [r0+r3*4], m0
    mova [r2+r3*1], m0
    mova [r2+r3*2], m0
    mova [r2+r1*1], m0
    mova [r2+r3*4], m0
    RET
%endmacro

INIT_XMM
PRED8x8L_TOP_DC sse2
%ifdef HAVE_AVX
INIT_AVX
PRED8x8L_TOP_DC avx
%endif

;-----------------------------------------------------------------------------
;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
;TODO: see if scalar is faster
; DC from both the filtered left column and the filtered top row:
; (sum of 16 filtered neighbours + 8) >> 4, broadcast to all 8 rows.
; Same high-bit flag encoding caveat as pred8x8l_top_dc above.
%macro PRED8x8L_DC 1
cglobal pred8x8l_dc_10_%1, 4,6,6
    sub         r0, r3              ; r0 -> row of top neighbours
    lea         r4, [r0+r3*4]
    lea         r5, [r3*3]
    mova        m0, [r0+r3*2-16]    ; gather the 8 left-column samples
    punpckhwd   m0, [r0+r3*1-16]
    mova        m1, [r4+r3*0-16]
    punpckhwd   m1, [r0+r5*1-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*2-16]
    punpckhwd   m2, [r4+r3*1-16]
    mova        m3, [r4+r3*4-16]
    punpckhwd   m3, [r4+r5*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1              ; m3 = packed left column
    mova        m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0      ; topleft or duplicated edge
    pinsrw      m2, [r0+r2+14], 7   ; topright or duplicated edge
    not         r1                  ; r1 & r3 -> offset of the sample below
    and         r1, r3              ; the top-left, for the left-column filter
    pslldq      m4, m3, 2
    psrldq      m5, m3, 2
    pshuflw     m4, m4, 11100101b
    pinsrw      m5, [r0+r1-2], 7
    PRED4x4_LOWPASS m3, m4, m5, m3  ; filtered left column
    PRED4x4_LOWPASS m0, m2, m1, m0  ; filtered top row
    paddw       m0, m3
    HADDW       m0, m1
    paddw       m0, [pw_8]          ; (sum+8)>>4
    psrlw       m0, 4
    SPLATW      m0, m0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r5*1], m0
    mova [r0+r3*4], m0
    mova [r4+r3*1], m0
    mova [r4+r3*2], m0
    mova [r4+r5*1], m0
    mova [r4+r3*4], m0
    RET
%endmacro

INIT_XMM
PRED8x8L_DC sse2
%ifdef HAVE_AVX
INIT_AVX
PRED8x8L_DC avx
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; Lowpass-filter the top row (with topleft/topright handling as above) and
; copy it to all 8 rows.
%macro PRED8x8L_VERTICAL 1
cglobal pred8x8l_vertical_10_%1, 4,4,6
    sub         r0, r3              ; r0 -> row of top neighbours
    mova        m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r1*1], m0
    mova [r0+r3*4], m0
    mova [r2+r3*1], m0
    mova [r2+r3*2], m0
    mova [r2+r1*1], m0
    mova [r2+r3*4], m0
    RET
%endmacro

INIT_XMM
PRED8x8L_VERTICAL sse2
%ifdef HAVE_AVX
INIT_AVX
PRED8x8L_VERTICAL avx
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; Lowpass-filter the left column and broadcast each filtered sample across
; its row. The shr/dec/and/sub sequence turns the has_topleft flag into
; either -stride (use the row above for the top-left tap) or 0 (duplicate) --
; NOTE(review): relies on the caller's flag encoding, confirm.
%macro PRED8x8L_HORIZONTAL 1
cglobal pred8x8l_horizontal_10_%1, 4,4,5
    mova        m0, [r0-16]
    shr        r1d, 14
    dec         r1
    and         r1, r3
    sub         r1, r3              ; r1 = -stride if topleft available, else 0
    punpckhwd   m0, [r0+r1-16]
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r3*1-16]
    lea         r2, [r0+r3*4]
    lea         r1, [r3*3]
    punpckhdq   m1, m0
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r0+r1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r3*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1              ; packed left column (+topleft)
    PALIGNR     m4, m3, [r2+r1-16], 14, m0
    pslldq      m0, m4, 2
    pshuflw     m0, m0, 11100101b   ; duplicate the bottom edge sample
    PRED4x4_LOWPASS m4, m3, m0, m4
    punpckhwd   m3, m4, m4          ; expand each filtered sample to pairs
    punpcklwd   m4, m4
    pshufd      m0, m3, 0xff        ; then to full 8-pixel rows
    pshufd      m1, m3, 0xaa
    pshufd      m2, m3, 0x55
    pshufd      m3, m3, 0x00
    mova [r0+r3*0], m0
    mova [r0+r3*1], m1
    mova [r0+r3*2], m2
    mova [r0+r1*1], m3
    pshufd      m0, m4, 0xff
    pshufd      m1, m4, 0xaa
    pshufd      m2, m4, 0x55
    pshufd      m3, m4, 0x00
    mova [r2+r3*0], m0
    mova [r2+r3*1], m1
    mova [r2+r3*2], m2
    mova [r2+r1*1], m3
    RET
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL sse2
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3
%ifdef HAVE_AVX
INIT_AVX
PRED8x8L_HORIZONTAL avx
%endif

;-----------------------------------------------------------------------------
;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; Filters top (m6) and topright (m1) rows, then emits 8 rows, each shifted
; one pixel further down the 16-pixel diagonal. If topright is missing,
; .fix_tr replicates the last top sample instead.
%macro PRED8x8L_DOWN_LEFT 1
cglobal pred8x8l_down_left_10_%1, 4,4,7
    sub         r0, r3              ; r0 -> row of top neighbours
    mova        m3, [r0]
    shr        r1d, 14
    neg         r1
    shr        r2d, 13
    pslldq      m1, m3, 2
    psrldq      m2, m3, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m6, m2, m1, m3  ; filtered top row
    jz .fix_tr ; flags from shr r2d
    mova        m1, [r0+16]         ; real topright row
    psrldq      m5, m1, 2
    PALIGNR     m2, m1, m3, 14, m3
    pshufhw     m5, m5, 10100100b   ; duplicate the far edge sample
    PRED4x4_LOWPASS m1, m2, m5, m1  ; filtered topright row
.do_topright:
    lea         r1, [r3*3]
    psrldq      m5, m1, 14
    lea         r2, [r0+r3*4]
    PALIGNR     m2, m1, m6, 2, m0
    PALIGNR     m3, m1, m6, 14, m0
    PALIGNR     m5, m1, 2, m0
    pslldq      m4, m6, 2
    PRED4x4_LOWPASS m6, m4, m2, m6  ; second filtering pass over the seam
    PRED4x4_LOWPASS m1, m3, m5, m1
    mova [r2+r3*4], m1              ; bottom row first, then walk the
    PALIGNR     m1, m6, 14, m2      ; diagonal up one pixel per row
    pslldq      m6, 2
    mova [r2+r1*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r2+r3*2], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r2+r3*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*4], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r1*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*2], m1
    PALIGNR     m1, m6, 14, m6
    mova [r0+r3*1], m1
    RET
.fix_tr:
    punpckhwd   m3, m3              ; no topright: replicate last top sample
    pshufd      m1, m3, 0xFF
    jmp .do_topright
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT sse2
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT ssse3
%ifdef HAVE_AVX
INIT_AVX
PRED8x8L_DOWN_LEFT avx
%endif

;-----------------------------------------------------------------------------
;void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; Filters the left column (m6), top-left seam and top row (m3), then emits
; the 8 rows of the down-right diagonal, shifting left-column samples in
; from the low end one per row.
%macro PRED8x8L_DOWN_RIGHT 1
; standard forbids this when has_topleft is false
; no need to check
cglobal pred8x8l_down_right_10_%1, 4,5,8
    sub         r0, r3              ; r0 -> row of top neighbours
    lea         r4, [r0+r3*4]
    lea         r1, [r3*3]
    mova        m0, [r0+r3*1-16]    ; gather the 8 left-column samples
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r0+r1*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*1-16]
    punpckhwd   m2, [r4+r3*0-16]
    mova        m3, [r4+r1*1-16]
    punpckhwd   m3, [r4+r3*2-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1              ; packed left column (+topleft)
    mova        m0, [r4+r3*4-16]
    mova        m1, [r0]
    PALIGNR     m4, m3, m0, 14, m0
    PALIGNR     m1, m3, 2, m2
    pslldq      m0, m4, 2
    pshuflw     m0, m0, 11100101b   ; duplicate the bottom edge sample
    PRED4x4_LOWPASS m6, m1, m4, m3  ; filtered left column
    PRED4x4_LOWPASS m4, m3, m0, m4
    mova        m3, [r0]
    shr        r2d, 13
    pslldq      m1, m3, 2
    psrldq      m2, m3, 2
    pinsrw      m1, [r0-2], 0       ; topleft (guaranteed present, see above)
    pinsrw      m2, [r0+r2+14], 7   ; topright or duplicated edge
    PRED4x4_LOWPASS m3, m2, m1, m3  ; filtered top row
    PALIGNR     m2, m3, m6, 2, m0
    PALIGNR     m5, m3, m6, 14, m0
    psrldq      m7, m3, 2
    PRED4x4_LOWPASS m6, m4, m2, m6  ; second pass across the corner seam
    PRED4x4_LOWPASS m3, m5, m7, m3
    mova [r4+r3*4], m6              ; bottom row, then shift the diagonal
    PALIGNR     m3, m6, 14, m2      ; right by one pixel per row upwards
    pslldq      m6, 2
    mova [r0+r3*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*2], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r1*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*4], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r4+r3*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r4+r3*2], m3
    PALIGNR     m3, m6, 14, m6
    mova [r4+r1*1], m3
    RET
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT sse2
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT ssse3
%ifdef HAVE_AVX
INIT_AVX
PRED8x8L_DOWN_RIGHT avx
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; Rows alternate between avg(top, shifted-top) and the lowpass diagonal;
; lower rows shift filtered left-column samples in via PALIGNR.
%macro PRED8x8L_VERTICAL_RIGHT 1
; likewise with 8x8l_down_right
cglobal pred8x8l_vertical_right_10_%1, 4,5,7
    sub         r0, r3              ; r0 -> row of top neighbours
    lea         r4, [r0+r3*4]
    lea         r1, [r3*3]
    mova        m0, [r0+r3*1-16]    ; gather the 8 left-column samples
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r0+r1*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*1-16]
    punpckhwd   m2, [r4+r3*0-16]
    mova        m3, [r4+r1*1-16]
    punpckhwd   m3, [r4+r3*2-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r4+r3*4-16]
    mova        m1, [r0]
    PALIGNR     m4, m3, m0, 14, m0
    PALIGNR     m1, m3, 2, m2
    PRED4x4_LOWPASS m3, m1, m4, m3  ; filtered left column
    mova        m2, [r0]
    shr        r2d, 13
    pslldq      m1, m2, 2
    psrldq      m5, m2, 2
    pinsrw      m1, [r0-2], 0
    pinsrw      m5, [r0+r2+14], 7
    PRED4x4_LOWPASS m2, m5, m1, m2  ; filtered top row
    PALIGNR     m6, m2, m3, 12, m1
    PALIGNR     m5, m2, m3, 14, m0
    PRED4x4_LOWPASS m0, m6, m2, m5  ; odd rows (lowpass)
    pavgw       m2, m5              ; even rows (average)
    mova [r0+r3*2], m0
    mova [r0+r3*1], m2
    pslldq      m6, m3, 4
    pslldq      m1, m3, 2
    PRED4x4_LOWPASS m1, m3, m6, m1  ; filtered left samples for lower rows
    PALIGNR     m2, m1, 14, m4
    mova [r0+r1*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova [r0+r3*4], m0
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova [r4+r3*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova [r4+r3*2], m0
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova [r4+r1*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m1
    mova [r4+r3*4], m0
    RET
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_RIGHT sse2
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT ssse3
%ifdef HAVE_AVX
INIT_AVX
PRED8x8L_VERTICAL_RIGHT avx
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; Builds the interleaved avg/lowpass sequence from the left column (top-left
; selected by the flag trick, as in pred8x8l_horizontal) and writes the 8
; rows as progressively shifted windows of it; the tail repeats l7.
%macro PRED8x8L_HORIZONTAL_UP 1
cglobal pred8x8l_horizontal_up_10_%1, 4,4,6
    mova        m0, [r0+r3*0-16]
    punpckhwd   m0, [r0+r3*1-16]
    shr        r1d, 14
    dec         r1
    and         r1, r3
    sub         r1, r3              ; r1 = -stride if topleft available, else 0
    mova        m4, [r0+r1*1-16]
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r1*1-16]
    punpckhdq   m0, m1
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r2+r3*1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r1*1-16]
    punpckhdq   m2, m3
    punpckhqdq  m0, m2              ; packed left column l0..l7
    PALIGNR     m1, m0, m4, 14, m4
    psrldq      m2, m0, 2
    pshufhw     m2, m2, 10100100b   ; duplicate the bottom edge sample
    PRED4x4_LOWPASS m0, m1, m2, m0
    psrldq      m1, m0, 2
    psrldq      m2, m0, 4
    pshufhw     m1, m1, 10100100b
    pshufhw     m2, m2, 01010100b
    pavgw       m4, m0, m1          ; 2-tap averages
    PRED4x4_LOWPASS m1, m2, m0, m1  ; 3-tap filtered values
    punpckhwd   m5, m4, m1          ; interleave avg/filtered
    punpcklwd   m4, m1
    mova [r2+r3*0], m5
    mova [r0+r3*0], m4
    pshufd      m0, m5, 11111001b   ; tail rows repeat the final samples
    pshufd      m1, m5, 11111110b
    pshufd      m2, m5, 11111111b
    mova [r2+r3*1], m0
    mova [r2+r3*2], m1
    mova [r2+r1*1], m2
    PALIGNR     m2, m5, m4, 4, m0   ; middle rows: shifted windows
    PALIGNR     m3, m5, m4, 8, m1
    PALIGNR     m5, m5, m4, 12, m4
    mova [r0+r3*1], m2
    mova [r0+r3*2], m3
    mova [r0+r1*1], m5
    RET
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_UP sse2
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP ssse3
%ifdef HAVE_AVX
INIT_AVX
PRED8x8L_HORIZONTAL_UP avx
%endif


;-----------------------------------------------------------------------------
; void pred16x16_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
; Store one 16-pixel (32-byte) row from %2..%5 regardless of register width.
%macro MOV16 3-5
    mova [%1+ 0], %2
    mova [%1+mmsize], %3
%if mmsize==8
    mova [%1+ 16], %4
    mova [%1+ 24], %5
%endif
%endmacro

; Copy the row above the block into all 16 rows. r0 = src, r1 = byte stride.
%macro PRED16x16_VERTICAL 1
cglobal pred16x16_vertical_10_%1, 2,3
    sub   r0, r1                    ; r0 -> row of top neighbours
    mov  r2d, 8                     ; 8 iterations x 2 rows
    mova  m0, [r0+ 0]
    mova  m1, [r0+mmsize]
%if mmsize==8
    mova  m2, [r0+16]
    mova  m3, [r0+24]
%endif
.loop:
    MOV16 r0+r1*1, m0, m1, m2, m3
    MOV16 r0+r1*2, m0, m1, m2, m3
    lea   r0, [r0+r1*2]
    dec  r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_VERTICAL mmxext
INIT_XMM
PRED16x16_VERTICAL sse2

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
; Broadcast each row's left neighbour across the whole row, two rows per
; iteration. r0 = src, r1 = byte stride.
%macro PRED16x16_HORIZONTAL 1
cglobal pred16x16_horizontal_10_%1, 2,3
    mov  r2d, 8
.vloop:
    movd   m0, [r0+r1*0-4]          ; left neighbour is the high word
    movd   m1, [r0+r1*1-4]
    SPLATW m0, m0, 1
    SPLATW m1, m1, 1
    MOV16  r0+r1*0, m0, m0, m0, m0
    MOV16  r0+r1*1, m1, m1, m1, m1
    lea    r0, [r0+r1*2]
    dec   r2d
    jg .vloop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_HORIZONTAL mmxext
INIT_XMM
PRED16x16_HORIZONTAL sse2

;-----------------------------------------------------------------------------
; void pred16x16_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
; Full DC: (sum of 16 top + 16 left neighbours + 16) >> 5, broadcast to all
; 256 pixels. In: r0 = src, r1 = byte stride; r5 keeps the original src.
%macro PRED16x16_DC 1
cglobal pred16x16_dc_10_%1, 2,6
    mov        r5, r0               ; save src for the store loop
    sub        r0, r1               ; r0 -> row of top neighbours
    mova       m0, [r0+0]           ; vector-sum the 16 top pixels
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2

    lea        r0, [r0+r1-2]        ; scalar-sum the 16 left pixels,
    movzx     r3d, word [r0]        ; two per iteration in r3d/r4d
    movzx     r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]
    movzx     r2d, word [r0]
    add       r3d, r2d
    movzx     r2d, word [r0+r1]
    add       r4d, r2d
%endrep
    lea       r3d, [r3+r4+16]       ; +16 for the final >>5 rounding

    movd       m1, r3d
    paddw      m0, m1
    psrlw      m0, 5
    SPLATW     m0, m0
    mov       r3d, 8
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec       r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC sse2

;-----------------------------------------------------------------------------
; void pred16x16_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
; DC from the top row only: (sum of 16 top neighbours + 8) >> 4.
%macro PRED16x16_TOP_DC 1
cglobal pred16x16_top_dc_10_%1, 2,3
    sub        r0, r1               ; r0 -> row of top neighbours
    mova       m0, [r0+0]
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2

    SPLATW     m0, m0               ; broadcast, then round per lane
    paddw      m0, [pw_8]
    psrlw      m0, 4
    mov       r2d, 8
.loop:
    MOV16 r0+r1*1, m0, m0, m0, m0
    MOV16 r0+r1*2, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_TOP_DC mmxext
INIT_XMM
PRED16x16_TOP_DC sse2

;-----------------------------------------------------------------------------
; void pred16x16_left_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
; DC from the left column only: (sum of 16 left neighbours + 8) >> 4,
; broadcast to the whole block. In: r0 = src, r1 = byte stride.
%macro PRED16x16_LEFT_DC 1
cglobal pred16x16_left_dc_10_%1, 2,6
    mov        r5, r0               ; save src for the store loop

    sub        r0, 2                ; r0 -> left neighbour of row 0
    movzx     r3d, word [r0]        ; scalar-sum the 16 left pixels,
    movzx     r4d, word [r0+r1]     ; two accumulators in parallel
%rep 7
    lea        r0, [r0+r1*2]
    movzx     r2d, word [r0]
    add       r3d, r2d
    movzx     r2d, word [r0+r1]
    add       r4d, r2d
%endrep
    lea       r3d, [r3+r4+8]        ; (sum+8)>>4
    shr       r3d, 4

    movd       m0, r3d
    SPLATW     m0, m0
    mov       r3d, 8
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec       r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_LEFT_DC mmxext
INIT_XMM
PRED16x16_LEFT_DC sse2

;-----------------------------------------------------------------------------
; void pred16x16_128_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
; No neighbours available: fill the 16x16 block with mid-grey (512).
%macro PRED16x16_128_DC 1
cglobal pred16x16_128_dc_10_%1, 2,3
    mova       m0, [pw_512]         ; 1<<(BIT_DEPTH-1)
    mov       r2d, 8
.loop:
    MOV16 r0+r1*0, m0, m0, m0, m0
    MOV16 r0+r1*1, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_128_DC mmxext
INIT_XMM
PRED16x16_128_DC sse2