;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; pshufb control: broadcast byte 3 into even lanes, 0x80 zeroes odd lanes
tm_shuf:     times 8 db 0x03, 0x80
pw_ff00:     times 8 dw 0xff00
; pmaddubsw coefficients for the 16x16 plane H sum: -8..-1 and 1..8
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
; pmaddubsw coefficients for the 8x8 plane H sum: -4..-1 and 1..4 (rest zero)
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_8
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; Copy the row above the block into all 16 rows.
; r0 = src, r1 = stride, r2 = row-pair counter.

cglobal pred16x16_vertical_mmx, 2,3
    sub   r0, r1                  ; point at the row above the block
    mov   r2, 8                   ; 8 iterations x 2 rows = 16 rows
    movq  mm0, [r0+0]
    movq  mm1, [r0+8]
.loop:
    movq  [r0+r1*1+0], mm0
    movq  [r0+r1*1+8], mm1
    movq  [r0+r1*2+0], mm0
    movq  [r0+r1*2+8], mm1
    lea   r0, [r0+r1*2]
    dec   r2
    jg    .loop
    REP_RET

; NOTE(review): movaps assumes src is 16-byte aligned — presumably guaranteed
; by the caller's edge-pixel buffer layout; confirm before reuse elsewhere.
cglobal pred16x16_vertical_sse, 2,3
    sub   r0, r1
    mov   r2, 4                   ; 4 iterations x 4 rows = 16 rows
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    dec   r2
    jg    .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; Broadcast the left-neighbour byte of each row across that row.
; %1 selects the instruction set variant (mmx / mmxext / ssse3).

%macro PRED16x16_H 1
cglobal pred16x16_horizontal_%1, 2,3
    mov   r2, 8                   ; 8 iterations x 2 rows
%ifidn %1, ssse3
    mova  m2, [pb_3]              ; pshufb mask: splat byte 3 of the dword load
%endif
.loop:
    movd  m0, [r0+r1*0-4]         ; left neighbour is the top byte of the dword
    movd  m1, [r0+r1*1-4]

%ifidn %1, ssse3
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw m0, m0, 0xff           ; splat high word
    pshufw m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
    mova [r0+r1*0+8], m0          ; mmx regs are 8 bytes: store second half too
    mova [r0+r1*1+8], m1
%endif

    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea  r0, [r0+r1*2]
    dec  r2
    jg   .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_H mmx
PRED16x16_H mmxext
INIT_XMM
PRED16x16_H ssse3

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; dc = (sum of 16 top + 16 left neighbours + 16) >> 5, splatted to all pixels.
; r4 = saved src, r5d/r6d = partial left sums, r2d = final dc.

%macro PRED16x16_DC 1
cglobal pred16x16_dc_%1, 2,7
    mov   r4, r0
    sub   r0, r1
    pxor  mm0, mm0
    pxor  mm1, mm1
    psadbw mm0, [r0+0]            ; sum of top 8 bytes
    psadbw mm1, [r0+8]            ; sum of next 8 top bytes
    dec   r0                      ; r0 now points at src-stride-1 (left column)
    movzx r5d, byte [r0+r1*1]
    paddw mm0, mm1
    movd  r6d, mm0                ; r6d = sum of top row
    lea   r0, [r0+r1*2]
%rep 7
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add   r5d, r2d
    add   r6d, r3d
    lea   r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]     ; 16th left pixel
    add   r5d, r6d
    lea   r2d, [r2+r5+16]         ; +16 = rounding before >>5
    shr   r2d, 5
%ifidn %1, mmxext
    movd  m0, r2d
    punpcklbw m0, m0
    pshufw m0, m0, 0
%elifidn %1, sse2
    movd  m0, r2d
    punpcklbw m0, m0
    pshuflw m0, m0, 0
    punpcklqdq m0, m0
%elifidn %1, ssse3
    pxor  m1, m1
    movd  m0, r2d
    pshufb m0, m1                 ; broadcast low byte to all 16 lanes
%endif

%if mmsize==8
    mov   r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov   r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea   r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea   r4, [r4+r1*2]
    dec   r3d
    jg    .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC sse2
PRED16x16_DC ssse3

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; VP8 TrueMotion: pixel = top[x] + left[y] - topleft, per row.
; mm0-mm3 = unpacked top row (words), r3d = topleft, r2d = left[y]-topleft.

%macro PRED16x16_TM_MMX 1
cglobal pred16x16_tm_vp8_%1, 2,5
    sub   r0, r1
    pxor  mm7, mm7
    movq  mm0, [r0+0]
    movq  mm2, [r0+8]
    movq  mm1, mm0
    movq  mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx r3d, byte [r0-1]        ; topleft
    mov   r4d, 16
.loop:
    movzx r2d, byte [r0+r1-1]     ; left neighbour of the row being written
    sub   r2d, r3d
    movd  mm4, r2d
%ifidn %1, mmx
    punpcklwd mm4, mm4
    punpckldq mm4, mm4
%else
    pshufw mm4, mm4, 0
%endif
    movq  mm5, mm4
    movq  mm6, mm4
    movq  mm7, mm4
    paddw mm4, mm0
    paddw mm5, mm1
    paddw mm6, mm2
    paddw mm7, mm3
    packuswb mm4, mm5             ; packuswb also clamps to 0..255
    packuswb mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add   r0, r1
    dec   r4d
    jg    .loop
    REP_RET
%endmacro

PRED16x16_TM_MMX mmx
PRED16x16_TM_MMX mmxext

; Same algorithm, two rows per iteration with 16-wide xmm registers.
cglobal pred16x16_tm_vp8_sse2, 2,6,6
    sub   r0, r1
    pxor  xmm2, xmm2
    movdqa xmm0, [r0]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2          ; top row as words, low half
    punpckhbw xmm1, xmm2          ; top row as words, high half
    movzx r4d, byte [r0-1]        ; topleft
    mov   r5d, 8
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub   r2d, r4d
    sub   r3d, r4d
    movd  xmm2, r2d
    movd  xmm4, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa xmm3, xmm2
    movdqa xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm3, xmm1
    paddw xmm4, xmm0
    paddw xmm5, xmm1
    packuswb xmm2, xmm3
    packuswb xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea   r0, [r0+r1*2]
    dec   r5d
    jg    .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; Plane prediction: p[x,y] = clip((a + x*H + y*V + 16) >> 5).
; %1 = isa variant, %2 = xmm reg count for cglobal, %3 = codec flavour
; (h264 / rv40 / svq3 — they round H and V differently).
; r2 = +stride, r1 = -stride, r5 = V sum, m0/m1/m3 hold splatted H/V/a.

%macro H264_PRED16x16_PLANE 3
cglobal pred16x16_plane_%3_%1, 2, 7, %2
    mov    r2, r1                 ; +stride
    neg    r1                     ; -stride

    ; ---- horizontal gradient: sum of i*(top[7+i]-top[7-i]), i=1..8 ----
    movh   m0, [r0+r1  -1]
%if mmsize == 8
    pxor   m4, m4
    movh   m1, [r0+r1  +3 ]
    movh   m2, [r0+r1  +8 ]
    movh   m3, [r0+r1  +12]
    punpcklbw m0, m4
    punpcklbw m1, m4
    punpcklbw m2, m4
    punpcklbw m3, m4
    pmullw m0, [pw_m8tom1  ]
    pmullw m1, [pw_m8tom1+8]
    pmullw m2, [pw_1to8    ]
    pmullw m3, [pw_1to8  +8]
    paddw  m0, m2
    paddw  m1, m3
%else ; mmsize == 16
%ifidn %1, sse2
    pxor   m2, m2
    movh   m1, [r0+r1  +8]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m8tom1]
    pmullw m1, [pw_1to8]
    paddw  m0, m1
%else ; ssse3
    movhps m0, [r0+r1  +8]
    pmaddubsw m0, [plane_shuf] ; H coefficients
%endif
    movhlps m1, m0
%endif
    paddw  m0, m1
    ; horizontal add of the 4 (or 8) partial words into m0's low word
%ifidn %1, mmx
    mova   m1, m0
    psrlq  m1, 32
%elifidn %1, mmx2
    pshufw m1, m0, 0xE
%else ; mmsize == 16
    pshuflw m1, m0, 0xE
%endif
    paddw  m0, m1
%ifidn %1, mmx
    mova   m1, m0
    psrlq  m1, 16
%elifidn %1, mmx2
    pshufw m1, m0, 0x1
%else
    pshuflw m1, m0, 0x1
%endif
    paddw  m0, m1                 ; sum of H coefficients

    ; ---- vertical gradient from the left column, accumulated in r5 ----
    lea    r4, [r0+r2*8-1]
    lea    r3, [r0+r2*4-1]
    add    r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0              ; on x86_32, reuse r0 and reload it from r0m later
%endif

    movzx  e_reg, byte [r3+r2*2 ]
    movzx  r5, byte [r4+r1    ]
    sub    r5, e_reg

    movzx  e_reg, byte [r3+r2 ]
    movzx  r6, byte [r4       ]
    sub    r6, e_reg
    lea    r5, [r5+r6*2]

    movzx  e_reg, byte [r3+r1 ]
    movzx  r6, byte [r4+r2*2 ]
    sub    r6, e_reg
    lea    r5, [r5+r6*4]

    movzx  e_reg, byte [r3 ]
%ifdef ARCH_X86_64
    movzx  r10, byte [r4+r2 ]
    sub    r10, e_reg
%else
    movzx  r6, byte [r4+r2 ]
    sub    r6, e_reg
    lea    r5, [r5+r6*4]
    sub    r5, r6
%endif

    lea    e_reg, [r3+r1*4]
    lea    r3, [r4+r2*4]

    movzx  r4, byte [e_reg+r2 ]
    movzx  r6, byte [r3       ]
    sub    r6, r4
%ifdef ARCH_X86_64
    lea    r6, [r10+r6*2]
    lea    r5, [r5+r6*2]
    add    r5, r6
%else
    lea    r5, [r5+r6*4]
    lea    r5, [r5+r6*2]
%endif

    movzx  r4, byte [e_reg ]
%ifdef ARCH_X86_64
    movzx  r10, byte [r3   +r2 ]
    sub    r10, r4
    sub    r5, r10
%else
    movzx  r6, byte [r3   +r2 ]
    sub    r6, r4
    lea    r5, [r5+r6*8]
    sub    r5, r6
%endif

    movzx  r4, byte [e_reg+r1 ]
    movzx  r6, byte [r3   +r2*2]
    sub    r6, r4
%ifdef ARCH_X86_64
    add    r6, r10
%endif
    lea    r5, [r5+r6*8]

    movzx  r4, byte [e_reg+r2*2]
    movzx  r6, byte [r3   +r1 ]
    sub    r6, r4
    lea    r5, [r5+r6*4]
    add    r5, r6                 ; sum of V coefficients

%ifndef ARCH_X86_64
    mov    r0, r0m                ; restore src (r0 was used as e_reg)
%endif

    ; ---- per-codec rounding of V ----
%ifidn %3, h264
    lea    r5, [r5*5+32]
    sar    r5, 6
%elifidn %3, rv40
    lea    r5, [r5*5]
    sar    r5, 6
%elifidn %3, svq3
    test   r5, r5
    lea    r6, [r5+3]
    cmovs  r5, r6                 ; round toward zero for negative values
    sar    r5, 2                  ; V/4
    lea    r5, [r5*5]             ; 5*(V/4)
    test   r5, r5
    lea    r6, [r5+15]
    cmovs  r5, r6
    sar    r5, 4                  ; (5*(V/4))/16
%endif

    movzx  r4, byte [r0+r1  +15]
    movzx  r3, byte [r3+r2*2   ]
    lea    r3, [r3+r4+1]
    shl    r3, 4                  ; 16*(src[15,-1]+src[-1,15]+1)

    ; ---- per-codec rounding of H (mirrors the V handling above) ----
    movd   r1d, m0
    movsx  r1d, r1w
%ifnidn %3, svq3
%ifidn %3, h264
    lea    r1d, [r1d*5+32]
%else ; rv40
    lea    r1d, [r1d*5]
%endif
    sar    r1d, 6
%else ; svq3
    test   r1d, r1d
    lea    r4d, [r1d+3]
    cmovs  r1d, r4d
    sar    r1d, 2                 ; H/4
    lea    r1d, [r1d*5]           ; 5*(H/4)
    test   r1d, r1d
    lea    r4d, [r1d+15]
    cmovs  r1d, r4d
    sar    r1d, 4                 ; (5*(H/4))/16
%endif
    movd   m0, r1d

    add    r1d, r5d
    add    r3d, r1d
    shl    r1d, 3
    sub    r3d, r1d               ; a = base - 7*(H+V)

    movd   m1, r5d
    movd   m3, r3d
%ifidn %1, mmx
    punpcklwd m0, m0
    punpcklwd m1, m1
    punpcklwd m3, m3
    punpckldq m0, m0
    punpckldq m1, m1
    punpckldq m3, m3
%elifidn %1, mmx2
    pshufw m0, m0, 0x0
    pshufw m1, m1, 0x0
    pshufw m3, m3, 0x0
%else
    pshuflw m0, m0, 0x0
    pshuflw m1, m1, 0x0
    pshuflw m3, m3, 0x0
    punpcklqdq m0, m0             ; splat H (words)
    punpcklqdq m1, m1             ; splat V (words)
    punpcklqdq m3, m3             ; splat a (words)
%endif
%ifidn %3, svq3
    SWAP 0, 1                     ; svq3 swaps the roles of H and V
%endif
    mova   m2, m0
%if mmsize == 8
    mova   m5, m0
%endif
    pmullw m0, [pw_0to7]          ; 0*H, 1*H, ..., 7*H (words)
%if mmsize == 16
    psllw  m2, 3
%else
    psllw  m5, 3
    psllw  m2, 2
    mova   m6, m5
    paddw  m6, m2
%endif
    paddw  m0, m3                 ; a + {0,1,2,3,4,5,6,7}*H
    paddw  m2, m0                 ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw  m5, m0                 ; a + {8,9,10,11}*H
    paddw  m6, m0                 ; a + {12,13,14,15}*H
%endif

    mov    r4, 8
.loop:
    mova   m3, m0                 ; b[0..7]
    mova   m4, m2                 ; b[8..15]
    psraw  m3, 5
    psraw  m4, 5
    packuswb m3, m4
    mova   [r0], m3
%if mmsize == 8
    mova   m3, m5                 ; b[8..11]
    mova   m4, m6                 ; b[12..15]
    psraw  m3, 5
    psraw  m4, 5
    packuswb m3, m4
    mova   [r0+8], m3
%endif
    paddw  m0, m1                 ; advance one row: b += V
    paddw  m2, m1
%if mmsize == 8
    paddw  m5, m1
    paddw  m6, m1
%endif

    mova   m3, m0                 ; b[0..7]
    mova   m4, m2                 ; b[8..15]
    psraw  m3, 5
    psraw  m4, 5
    packuswb m3, m4
    mova   [r0+r2], m3
%if mmsize == 8
    mova   m3, m5                 ; b[8..11]
    mova   m4, m6                 ; b[12..15]
    psraw  m3, 5
    psraw  m4, 5
    packuswb m3, m4
    mova   [r0+r2+8], m3
%endif
    paddw  m0, m1
    paddw  m2, m1
%if mmsize == 8
    paddw  m5, m1
    paddw  m6, m1
%endif

    lea    r0, [r0+r2*2]
    dec    r4
    jg     .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED16x16_PLANE mmx,   0, h264
H264_PRED16x16_PLANE mmx,   0, rv40
H264_PRED16x16_PLANE mmx,   0, svq3
H264_PRED16x16_PLANE mmx2,  0, h264
H264_PRED16x16_PLANE mmx2,  0, rv40
H264_PRED16x16_PLANE mmx2,  0, svq3
INIT_XMM
H264_PRED16x16_PLANE sse2,  8, h264
H264_PRED16x16_PLANE sse2,  8, rv40
H264_PRED16x16_PLANE sse2,  8, svq3
H264_PRED16x16_PLANE ssse3, 8, h264
H264_PRED16x16_PLANE ssse3, 8, rv40
H264_PRED16x16_PLANE ssse3, 8, svq3

;-----------------------------------------------------------------------------
; void pred8x8_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; 8x8 plane prediction, same scheme as the 16x16 version but with 4-tap
; gradients and H.264-only rounding (b = 17*H, c = 17*V, both (x+16)>>5).

%macro H264_PRED8x8_PLANE 2
cglobal pred8x8_plane_%1, 2, 7, %2
    mov    r2, r1                 ; +stride
    neg    r1                     ; -stride

    movd   m0, [r0+r1  -1]
%if mmsize == 8
    pxor   m2, m2
    movh   m1, [r0+r1  +4 ]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m4to4]
    pmullw m1, [pw_m4to4+8]
%else ; mmsize == 16
%ifidn %1, sse2
    pxor   m2, m2
    movd   m1, [r0+r1  +4]
    punpckldq m0, m1
    punpcklbw m0, m2
    pmullw m0, [pw_m4to4]
%else ; ssse3
    movhps m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
    pmaddubsw m0, [plane8_shuf] ; H coefficients
%endif
    movhlps m1, m0
%endif
    paddw  m0, m1

%ifnidn %1, ssse3
%ifidn %1, mmx
    mova   m1, m0
    psrlq  m1, 32
%elifidn %1, mmx2
    pshufw m1, m0, 0xE
%else ; mmsize == 16
    pshuflw m1, m0, 0xE
%endif
    paddw  m0, m1
%endif ; !ssse3

%ifidn %1, mmx
    mova   m1, m0
    psrlq  m1, 16
%elifidn %1, mmx2
    pshufw m1, m0, 0x1
%else
    pshuflw m1, m0, 0x1
%endif
    paddw  m0, m1                 ; sum of H coefficients

    ; ---- vertical gradient from the left column ----
    lea    r4, [r0+r2*4-1]
    lea    r3, [r0     -1]
    add    r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0              ; reuse r0 on x86_32, reloaded from r0m below
%endif

    movzx  e_reg, byte [r3+r2*2 ]
    movzx  r5, byte [r4+r1    ]
    sub    r5, e_reg

    movzx  e_reg, byte [r3 ]
%ifdef ARCH_X86_64
    movzx  r10, byte [r4+r2 ]
    sub    r10, e_reg
    sub    r5, r10
%else
    movzx  r6, byte [r4+r2 ]
    sub    r6, e_reg
    lea    r5, [r5+r6*4]
    sub    r5, r6
%endif

    movzx  e_reg, byte [r3+r1 ]
    movzx  r6, byte [r4+r2*2 ]
    sub    r6, e_reg
%ifdef ARCH_X86_64
    add    r6, r10
%endif
    lea    r5, [r5+r6*4]

    movzx  e_reg, byte [r3+r2 ]
    movzx  r6, byte [r4       ]
    sub    r6, e_reg
    lea    r6, [r5+r6*2]

    lea    r5, [r6*9+16]          ; 17*V + 16, built as 9*V + 8*V + 16
    lea    r5, [r5+r6*8]
    sar    r5, 5

%ifndef ARCH_X86_64
    mov    r0, r0m
%endif

    movzx  r3, byte [r4+r2*2 ]
    movzx  r4, byte [r0+r1  +7]
    lea    r3, [r3+r4+1]
    shl    r3, 4                  ; 16*(src[7,-1]+src[-1,7]+1)
    movd   r1d, m0
    movsx  r1d, r1w
    imul   r1d, 17                ; b = (17*H + 16) >> 5
    add    r1d, 16
    sar    r1d, 5
    movd   m0, r1d
    add    r1d, r5d
    sub    r3d, r1d
    add    r1d, r1d
    sub    r3d, r1d               ; a = base - 3*(b+c)

    movd   m1, r5d
    movd   m3, r3d
%ifidn %1, mmx
    punpcklwd m0, m0
    punpcklwd m1, m1
    punpcklwd m3, m3
    punpckldq m0, m0
    punpckldq m1, m1
    punpckldq m3, m3
%elifidn %1, mmx2
    pshufw m0, m0, 0x0
    pshufw m1, m1, 0x0
    pshufw m3, m3, 0x0
%else
    pshuflw m0, m0, 0x0
    pshuflw m1, m1, 0x0
    pshuflw m3, m3, 0x0
    punpcklqdq m0, m0             ; splat H (words)
    punpcklqdq m1, m1             ; splat V (words)
    punpcklqdq m3, m3             ; splat a (words)
%endif
%if mmsize == 8
    mova   m2, m0
%endif
    pmullw m0, [pw_0to7]          ; 0*H, 1*H, ..., 7*H (words)
    paddw  m0, m3                 ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw  m2, 2
    paddw  m2, m0                 ; a + {4,5,6,7}*H
%endif

    mov    r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova   m3, m0                 ; b[0..7]
    paddw  m0, m1
    psraw  m3, 5
    mova   m4, m0                 ; V+b[0..7]
    paddw  m0, m1
    psraw  m4, 5
    packuswb m3, m4
    movh   [r0], m3
    movhps [r0+r2], m3
%else ; mmsize == 8
    mova   m3, m0                 ; b[0..3]
    mova   m4, m2                 ; b[4..7]
    paddw  m0, m1
    paddw  m2, m1
    psraw  m3, 5
    psraw  m4, 5
    mova   m5, m0                 ; V+b[0..3]
    mova   m6, m2                 ; V+b[4..7]
    paddw  m0, m1
    paddw  m2, m1
    psraw  m5, 5
    psraw  m6, 5
    packuswb m3, m4
    packuswb m5, m6
    mova   [r0], m3
    mova   [r0+r2], m5
%endif

    lea    r0, [r0+r2*2]
    dec    r4
    jg     .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED8x8_PLANE mmx,   0
H264_PRED8x8_PLANE mmx2,  0
INIT_XMM
H264_PRED8x8_PLANE sse2,  8
H264_PRED8x8_PLANE ssse3, 8

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; Copy the row above into all 8 rows.

cglobal pred8x8_vertical_mmx, 2,2
    sub    r0, r1
    movq   mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea    r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; Broadcast each row's left neighbour across the row (8 rows).

%macro PRED8x8_H 1
cglobal pred8x8_horizontal_%1, 2,3
    mov   r2, 4
%ifidn %1, ssse3
    mova  m2, [pb_3]              ; pshufb mask: splat byte 3 of the dword load
%endif
.loop:
    movd  m0, [r0+r1*0-4]
    movd  m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw m0, m0, 0xff
    pshufw m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea  r0, [r0+r1*2]
    dec  r2
    jg   .loop
    REP_RET
%endmacro

INIT_MMX
PRED8x8_H mmx
PRED8x8_H mmxext
PRED8x8_H ssse3

;-----------------------------------------------------------------------------
; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; Two 4x8 DC halves from the top row only: left half = avg(top[0..3]),
; right half = avg(top[4..7]).
cglobal pred8x8_top_dc_mmxext, 2,5
    sub         r0, r1
    movq       mm0, [r0]
    pxor       mm1, mm1
    pxor       mm2, mm2
    lea         r2, [r0+r1*2]
    punpckhbw  mm1, mm0
    punpcklbw  mm0, mm2
    psadbw     mm1, mm2           ; s1 = sum of top[4..7] (bytes pre-shifted up)
    lea         r3, [r2+r1*2]
    psadbw     mm0, mm2           ; s0 = sum of top[0..3]
    psrlw      mm1, 1
    psrlw      mm0, 1
    pavgw      mm1, mm2           ; (s1/2 + 1) >> 1 == (s1+2) >> 2, rounded avg
    lea         r4, [r3+r1*2]
    pavgw      mm0, mm2
    pshufw     mm1, mm1, 0
    pshufw     mm0, mm0, 0        ; dc0 (w)
    packuswb   mm0, mm1           ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea         r0, [r3+r1*2]
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; Four 4x4 DC quadrants: s0/s1 = top halves, s2/s3 = left halves.

INIT_MMX
cglobal pred8x8_dc_mmxext, 2,5
    sub    r0, r1
    pxor   m7, m7
    movd   m0, [r0+0]
    movd   m1, [r0+4]
    psadbw m0, m7                 ; s0 = sum top[0..3]
    mov    r4, r0
    psadbw m1, m7                 ; s1 = sum top[4..7]
    movzx  r2d, byte [r0+r1*1-1]
    movzx  r3d, byte [r0+r1*2-1]
    lea    r0, [r0+r1*2]
    add    r2d, r3d
    movzx  r3d, byte [r0+r1*1-1]
    add    r2d, r3d
    movzx  r3d, byte [r0+r1*2-1]
    add    r2d, r3d
    lea    r0, [r0+r1*2]
    movd   m2, r2d                ; s2 = sum left[0..3]
    movzx  r2d, byte [r0+r1*1-1]
    movzx  r3d, byte [r0+r1*2-1]
    lea    r0, [r0+r1*2]
    add    r2d, r3d
    movzx  r3d, byte [r0+r1*1-1]
    add    r2d, r3d
    movzx  r3d, byte [r0+r1*2-1]
    add    r2d, r3d
    movd   m3, r2d                ; s3 = sum left[4..7]

    punpcklwd m0, m1
    mov    r0, r4
    punpcklwd m2, m3
    punpckldq m0, m2              ; s0, s1, s2, s3
    pshufw m3, m0, 11110110b      ; s2, s1, s3, s3
    lea    r2, [r0+r1*2]
    pshufw m0, m0, 01110100b      ; s0, s1, s3, s1
    paddw  m0, m3
    lea    r3, [r2+r1*2]
    psrlw  m0, 2
    pavgw  m0, m7                 ; s0+s2, s1, s3, s1+s3: the 4 quadrant DCs
    lea    r4, [r3+r1*2]
    packuswb m0, m0
    punpcklbw m0, m0
    movq   m1, m0
    punpcklbw m0, m0              ; m0 = top-row pattern (dc00 x4, dc01 x4)
    punpckhbw m1, m1              ; m1 = bottom-row pattern (dc10 x4, dc11 x4)
    movq [r0+r1*1], m0
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; Single DC over 8 top + 8 left neighbours: (sum + 8) >> 4.

cglobal pred8x8_dc_rv40_mmxext, 2,7
    mov    r4, r0
    sub    r0, r1
    pxor   mm0, mm0
    psadbw mm0, [r0]              ; sum of top row
    dec    r0
    movzx  r5d, byte [r0+r1*1]
    movd   r6d, mm0
    lea    r0, [r0+r1*2]
%rep 3
    movzx  r2d, byte [r0+r1*0]
    movzx  r3d, byte [r0+r1*1]
    add    r5d, r2d
    add    r6d, r3d
    lea    r0, [r0+r1*2]
%endrep
    movzx  r2d, byte [r0+r1*0]    ; 8th left pixel
    add    r5d, r6d
    lea    r2d, [r2+r5+8]         ; +8 = rounding before >>4
    shr    r2d, 4
    movd   mm0, r2d
    punpcklbw mm0, mm0
    pshufw mm0, mm0, 0
    mov    r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea    r4, [r4+r1*2]
    dec    r3d
    jg     .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; VP8 TrueMotion 8x8: pixel = top[x] + left[y] - topleft.

%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub    r0, r1
    pxor   mm7, mm7
    movq   mm0, [r0]
    movq   mm1, mm0
    punpcklbw mm0, mm7            ; top[0..3] as words
    punpckhbw mm1, mm7            ; top[4..7] as words
    movzx  r4d, byte [r0-1]       ; topleft
    mov    r5d, 4
.loop:
    movzx  r2d, byte [r0+r1*1-1]
    movzx  r3d, byte [r0+r1*2-1]
    sub    r2d, r4d
    sub    r3d, r4d
    movd   mm2, r2d
    movd   mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw mm2, mm2, 0
    pshufw mm4, mm4, 0
%endif
    movq   mm3, mm2
    movq   mm5, mm4
    paddw  mm2, mm0
    paddw  mm3, mm1
    paddw  mm4, mm0
    paddw  mm5, mm1
    packuswb mm2, mm3             ; packuswb also clamps to 0..255
    packuswb mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea    r0, [r0+r1*2]
    dec    r5d
    jg     .loop
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext

cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub    r0, r1
    pxor   xmm1, xmm1
    movq   xmm0, [r0]
    punpcklbw xmm0, xmm1          ; whole top row as 8 words
    movzx  r4d, byte [r0-1]
    mov    r5d, 4
.loop:
    movzx  r2d, byte [r0+r1*1-1]
    movzx  r3d, byte [r0+r1*2-1]
    sub    r2d, r4d
    sub    r3d, r4d
    movd   xmm2, r2d
    movd   xmm3, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw  xmm2, xmm0
    paddw  xmm3, xmm0
    packuswb xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea    r0, [r0+r1*2]
    dec    r5d
    jg     .loop
    REP_RET

cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub    r0, r1
    movdqa xmm4, [tm_shuf]        ; byte 3 -> even lanes, zero odd: word splat
    pxor   xmm1, xmm1
    movq   xmm0, [r0]
    punpcklbw xmm0, xmm1
    movd   xmm5, [r0-4]
    pshufb xmm5, xmm4             ; topleft splatted as words
    mov    r2d, 4
.loop:
    movd   xmm2, [r0+r1*1-4]
    movd   xmm3, [r0+r1*2-4]
    pshufb xmm2, xmm4
    pshufb xmm3, xmm4
    psubw  xmm2, xmm5
    psubw  xmm3, xmm5
    paddw  xmm2, xmm0
    paddw  xmm3, xmm0
    packuswb xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea    r0, [r0+r1*2]
    dec    r2d
    jg     .loop
    REP_RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; Implemented branch-free: avg(l,r) minus the lost-carry bit (l^r)&1,
; then averaged with the centre sample.
%macro PRED4x4_LOWPASS 5
    mova    %5, %2
    pavgb   %2, %3
    pxor    %3, %5
    mova    %1, %4
    pand    %3, [pb_1]
    psubusb %2, %3
    pavgb   %1, %2
%endmacro

;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; DC from the low-pass-filtered top row. r0=src, r1=has_topleft,
; r2=has_topright, r3=stride. The .fix_* paths patch the edge bytes when a
; neighbour is unavailable (replicate from the nearest available sample).
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_%1, 4,4
    sub          r0, r3
    pxor        mm7, mm7
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0  ; top row shifted right (topleft in byte 0)
    PALIGNR     mm1, mm4, 1, mm4  ; top row shifted left (topright in byte 7)
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5          ; replace byte 0 of mm2 with byte 0 of mm3
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5          ; replace byte 7 of mm1 with byte 7 of mm3
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw      mm7, mm0
    paddw       mm7, [pw_4]       ; (sum + 4) >> 3
    psrlw       mm7, 3
    pshufw      mm7, mm7, 0
    packuswb    mm7, mm7
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC ssse3

;-----------------------------------------------------------------------------
;void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; DC from both the filtered left column (gathered into mm3 via the punpck
; ladder below) and the filtered top row: (sumL + sumT + 8) >> 4.

%macro PRED8x8L_DC 1
cglobal pred8x8l_dc_%1, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1          ; mm3 = left column, one byte per row
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test         r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .body
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3  ; mm7 = filtered left column
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.body:
    lea          r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5  ; mm6 = filtered top row
    pxor        mm0, mm0
    pxor        mm1, mm1
    lea          r2, [r1+r3*2]
    psadbw      mm0, mm7
    psadbw      mm1, mm6
    paddw       mm0, [pw_8]       ; (sumL + sumT + 8) >> 4
    paddw       mm0, mm1
    lea          r4, [r2+r3*2]
    psrlw       mm0, 4
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    movq [r1+r3*1], mm0
    movq [r1+r3*2], mm0
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm0
    movq [r4+r3*1], mm0
    movq [r4+r3*2], mm0
    RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DC ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; Each row filled with the low-pass-filtered left-column sample for that row.

%macro PRED8x8L_HORIZONTAL 1
cglobal pred8x8l_horizontal_%1, 4,4
    sub          r0, r3
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    test         r1, r1
    lea          r1, [r0+r3]
    cmovnz       r1, r0           ; r1 = row -1 if topleft available, else row 0
    punpckhbw   mm0, [r1+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1          ; mm3 = left column gathered into one qword
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r1+r3*0-8]
    mov          r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3  ; mm7 = filtered left column
    movq        mm3, mm7
    lea          r1, [r0+r3*2]
    movq        mm7, mm3
    punpckhbw   mm3, mm3
    punpcklbw   mm7, mm7
    pshufw      mm0, mm3, 0xff    ; splat one left sample per output row
    pshufw      mm1, mm3, 0xaa
    lea          r2, [r1+r3*2]
    pshufw      mm2, mm3, 0x55
    pshufw      mm3, mm3, 0x00
    pshufw      mm4, mm7, 0xff
    pshufw      mm5, mm7, 0xaa
    pshufw      mm6, mm7, 0x55
    pshufw      mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea          r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; Low-pass-filter the top row, then copy it into all 8 rows.

%macro PRED8x8L_VERTICAL 1
cglobal pred8x8l_vertical_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL ssse3

;-----------------------------------------------------------------------------
;void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; Diagonal-down-left: filtered top + topright samples, shifted one byte per
; output row. Pure-MMX version builds each row with 64-bit shifts and ors.

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_left_mmxext, 4,5
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF    ; no topright: replicate last top sample
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm7, mm4          ; mm7 = filtered top row
    test         r2, r2
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4  ; mm1 = filtered topright
.do_topright:
    lea          r1, [r0+r3*2]
    movq        mm6, mm1
    psrlq       mm1, 56
    movq        mm4, mm1
    lea          r2, [r1+r3*2]
    movq        mm2, mm6
    PALIGNR     mm2, mm7, 1, mm0
    movq        mm3, mm6
    PALIGNR     mm3, mm7, 7, mm0
    PALIGNR     mm4, mm6, 1, mm0
    movq        mm5, mm7
    movq        mm1, mm7
    movq        mm7, mm6
    lea          r4, [r2+r3*2]
    psllq       mm1, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6  ; second-stage filter, low half
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6  ; second-stage filter, high half
    movq [r4+r3*2], mm1           ; bottom row first, then shift in one byte
    movq        mm2, mm0          ; per row from the low half (mm0)
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq [r4+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq [r2+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq [r2+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq [r1+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq [r1+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq [r0+r3*2], mm1
    psllq       mm1, 8
    psrlq       mm0, 56
    por         mm1, mm0
    movq [r0+r3*1], mm1
    RET

; SSE variant: the 16 filtered samples are merged into one xmm register, so
; each output row is just a 1-byte psrldq away.
%macro PRED8x8L_DOWN_LEFT 1
cglobal pred8x8l_down_left_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm3, mm4
    test         r2, r2 ; top_right
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq    xmm4, mm1
    psrlq       mm1, 56
    movq2dq    xmm5, mm1
    lea          r1, [r0+r3*2]
    pslldq     xmm4, 8
    por        xmm3, xmm4         ; xmm3 = top | topright (16 samples)
    movdqa     xmm2, xmm3
    psrldq     xmm2, 1
    pslldq     xmm5, 15
    por        xmm2, xmm5
    lea          r2, [r1+r3*2]
    movdqa     xmm1, xmm3
    pslldq     xmm1, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    psrldq     xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq     xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq     xmm0, 1
    lea          r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq     xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq     xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq     xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq     xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq     xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT ssse3

;-----------------------------------------------------------------------------
;void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_right_mmxext, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test         r1, r1 ; top_left
    jz .fix_lt_1
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq        mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm5, mm4
    jmp .body
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.body:
    lea          r1, [r0+r3*2]
    movq        mm1, mm7
    movq        mm7, mm5
    movq        mm5, mm6
    movq        mm2, mm7
    lea          r2, [r1+r3*2]
    PALIGNR     mm2, mm6, 1, mm0
    movq        mm3, mm7
    PALIGNR     mm3, mm6, 7, mm0
    movq        mm4, mm7
    lea          r4, [r2+r3*2]
    psrlq       mm4, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq [r4+r3*2], mm0
    movq        mm2, mm1
    psrlq       mm0, 8
    psllq       mm2, 56
    psrlq       mm1, 8
    por         mm0, mm2
    movq [r4+r3*1], mm0
    movq        mm2, mm1
    psrlq       mm0, 8
    psllq       mm2, 56
    psrlq       mm1, 8
    por         mm0, mm2
    movq [r2+r3*2], mm0
    movq        mm2, mm1
    ; continue emitting rows: shift the diagonal right one byte per row,
    ; refilling the top byte from mm1
    psrlq       mm0, 8
    psllq       mm2, 56
    psrlq       mm1, 8
    por         mm0, mm2
    movq        [r2+r3*1], mm0
    movq        mm2, mm1
    psrlq       mm0, 8
    psllq       mm2, 56
    psrlq       mm1, 8
    por         mm0, mm2
    movq        [r1+r3*2], mm0
    movq        mm2, mm1
    psrlq       mm0, 8
    psllq       mm2, 56
    psrlq       mm1, 8
    por         mm0, mm2
    movq        [r1+r3*1], mm0
    movq        mm2, mm1
    psrlq       mm0, 8
    psllq       mm2, 56
    psrlq       mm1, 8
    por         mm0, mm2
    movq        [r0+r3*2], mm0
    psrlq       mm0, 8
    psllq       mm1, 56
    por         mm0, mm1
    movq        [r0+r3*1], mm0
    RET

; SSE2/SSSE3 version: left column, top-left and top row are combined in xmm
; registers so the 8 output rows come from 2-byte shifts of one filtered vector.
%macro PRED8x8L_DOWN_RIGHT 1
cglobal pred8x8l_down_right_%1, 4,5
    sub         r0, r3
    lea         r4, [r0+r3*2]
    ; gather the 8 left-edge pixels into mm3 (same interleave scheme as the
    ; mmxext version above)
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov         r4, r0
    punpckhwd   mm1, mm0
    lea         r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea         r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea         r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov         r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1                  ; top_left present?
    jz          .fix_lt_1
    jmp         .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp         .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2, r2                  ; top_right present?
    jnz         .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp         .do_top
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq2dq     xmm3, mm2               ; filtered left column -> xmm3
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq2dq     xmm1, mm7
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test        r1, r1                  ; top_left present?
    jz          .fix_lt_2
    test        r2, r2                  ; top_right present?
    jz          .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq     xmm4, mm4               ; filtered top row
    lea         r1, [r0+r3*2]
    movdqa      xmm0, xmm3
    pslldq      xmm4, 8
    por         xmm3, xmm4              ; xmm3 = left | top
    lea         r2, [r1+r3*2]
    pslldq      xmm4, 1
    por         xmm1, xmm4
    ; isolate the top-left corner byte and merge it into xmm1
    psrldq      xmm0, 7
    pslldq      xmm0, 15
    psrldq      xmm0, 7
    por         xmm1, xmm0
    lea         r0, [r2+r3*2]
    movdqa      xmm2, xmm3
    psrldq      xmm2, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    ; write the 8 rows bottom-up, two at a time, shifting 2 bytes per pair
    movq        [r0+r3*2], xmm0
    movq        [r0+r3*1], xmm1
    psrldq      xmm0, 2
    psrldq      xmm1, 2
    movq        [r2+r3*2], xmm0
    movq        [r2+r3*1], xmm1
    psrldq      xmm0, 2
    psrldq      xmm1, 2
    movq        [r1+r3*2], xmm0
    movq        [r1+r3*1], xmm1
    psrldq      xmm0, 2
    psrldq      xmm1, 2
    movq        [r4+r3*2], xmm0
    movq        [r4+r3*1], xmm1
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

INIT_MMX
%define PALIGNR PALIGNR_MMX
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
cglobal pred8x8l_vertical_right_mmxext, 4,5
    sub         r0, r3
    lea         r4, [r0+r3*2]
    ; gather the 8 left-edge pixels into mm3 (interleave of last-column bytes)
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov         r4, r0
    punpckhwd   mm1, mm0
    lea         r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea         r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea         r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov         r0, r4
1847 movq mm4, mm3 1848 movq mm2, mm3 1849 PALIGNR mm4, mm0, 7, mm0 1850 PALIGNR mm1, mm2, 1, mm2 1851 test r1, r1 1852 jz .fix_lt_1 1853 jmp .do_left 1854.fix_lt_1: 1855 movq mm5, mm3 1856 pxor mm5, mm4 1857 psrlq mm5, 56 1858 psllq mm5, 48 1859 pxor mm1, mm5 1860 jmp .do_left 1861.fix_lt_2: 1862 movq mm5, mm3 1863 pxor mm5, mm2 1864 psllq mm5, 56 1865 psrlq mm5, 56 1866 pxor mm2, mm5 1867 test r2, r2 1868 jnz .do_top 1869.fix_tr_1: 1870 movq mm5, mm3 1871 pxor mm5, mm1 1872 psrlq mm5, 56 1873 psllq mm5, 56 1874 pxor mm1, mm5 1875 jmp .do_top 1876.do_left: 1877 movq mm0, mm4 1878 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1879 movq mm7, mm2 1880 movq mm0, [r0-8] 1881 movq mm3, [r0] 1882 movq mm1, [r0+8] 1883 movq mm2, mm3 1884 movq mm4, mm3 1885 PALIGNR mm2, mm0, 7, mm0 1886 PALIGNR mm1, mm4, 1, mm4 1887 test r1, r1 1888 jz .fix_lt_2 1889 test r2, r2 1890 jz .fix_tr_1 1891.do_top 1892 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 1893 lea r1, [r0+r3*2] 1894 movq mm2, mm6 1895 movq mm3, mm6 1896 PALIGNR mm3, mm7, 7, mm0 1897 PALIGNR mm6, mm7, 6, mm1 1898 movq mm4, mm3 1899 pavgb mm3, mm2 1900 lea r2, [r1+r3*2] 1901 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5 1902 movq [r0+r3*1], mm3 1903 movq [r0+r3*2], mm0 1904 movq mm5, mm0 1905 movq mm6, mm3 1906 movq mm1, mm7 1907 movq mm2, mm1 1908 psllq mm2, 8 1909 movq mm3, mm1 1910 psllq mm3, 16 1911 lea r4, [r2+r3*2] 1912 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4 1913 PALIGNR mm6, mm0, 7, mm2 1914 movq [r1+r3*1], mm6 1915 psllq mm0, 8 1916 PALIGNR mm5, mm0, 7, mm1 1917 movq [r1+r3*2], mm5 1918 psllq mm0, 8 1919 PALIGNR mm6, mm0, 7, mm2 1920 movq [r2+r3*1], mm6 1921 psllq mm0, 8 1922 PALIGNR mm5, mm0, 7, mm1 1923 movq [r2+r3*2], mm5 1924 psllq mm0, 8 1925 PALIGNR mm6, mm0, 7, mm2 1926 movq [r4+r3*1], mm6 1927 psllq mm0, 8 1928 PALIGNR mm5, mm0, 7, mm1 1929 movq [r4+r3*2], mm5 1930 RET 1931 1932%macro PRED8x8L_VERTICAL_RIGHT 1 1933cglobal pred8x8l_vertical_right_%1, 4,5,7 1934 sub r0, r3 1935 lea r4, [r0+r3*2] 1936 movq mm0, [r0+r3*1-8] 
1937 punpckhbw mm0, [r0+r3*0-8] 1938 movq mm1, [r4+r3*1-8] 1939 punpckhbw mm1, [r0+r3*2-8] 1940 mov r4, r0 1941 punpckhwd mm1, mm0 1942 lea r0, [r0+r3*4] 1943 movq mm2, [r0+r3*1-8] 1944 punpckhbw mm2, [r0+r3*0-8] 1945 lea r0, [r0+r3*2] 1946 movq mm3, [r0+r3*1-8] 1947 punpckhbw mm3, [r0+r3*0-8] 1948 punpckhwd mm3, mm2 1949 punpckhdq mm3, mm1 1950 lea r0, [r0+r3*2] 1951 movq mm0, [r0+r3*0-8] 1952 movq mm1, [r4] 1953 mov r0, r4 1954 movq mm4, mm3 1955 movq mm2, mm3 1956 PALIGNR mm4, mm0, 7, mm0 1957 PALIGNR mm1, mm2, 1, mm2 1958 test r1, r1 1959 jnz .do_left 1960.fix_lt_1: 1961 movq mm5, mm3 1962 pxor mm5, mm4 1963 psrlq mm5, 56 1964 psllq mm5, 48 1965 pxor mm1, mm5 1966 jmp .do_left 1967.fix_lt_2: 1968 movq mm5, mm3 1969 pxor mm5, mm2 1970 psllq mm5, 56 1971 psrlq mm5, 56 1972 pxor mm2, mm5 1973 test r2, r2 1974 jnz .do_top 1975.fix_tr_1: 1976 movq mm5, mm3 1977 pxor mm5, mm1 1978 psrlq mm5, 56 1979 psllq mm5, 56 1980 pxor mm1, mm5 1981 jmp .do_top 1982.do_left: 1983 movq mm0, mm4 1984 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1985 movq2dq xmm0, mm2 1986 movq mm0, [r0-8] 1987 movq mm3, [r0] 1988 movq mm1, [r0+8] 1989 movq mm2, mm3 1990 movq mm4, mm3 1991 PALIGNR mm2, mm0, 7, mm0 1992 PALIGNR mm1, mm4, 1, mm4 1993 test r1, r1 1994 jz .fix_lt_2 1995 test r2, r2 1996 jz .fix_tr_1 1997.do_top 1998 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 1999 lea r1, [r0+r3*2] 2000 movq2dq xmm4, mm6 2001 pslldq xmm4, 8 2002 por xmm0, xmm4 2003 movdqa xmm6, [pw_ff00] 2004 movdqa xmm1, xmm0 2005 lea r2, [r1+r3*2] 2006 movdqa xmm2, xmm0 2007 movdqa xmm3, xmm0 2008 pslldq xmm0, 1 2009 pslldq xmm1, 2 2010 pavgb xmm2, xmm0 2011INIT_XMM 2012 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5 2013 pandn xmm6, xmm4 2014 movdqa xmm5, xmm4 2015 psrlw xmm4, 8 2016 packuswb xmm6, xmm4 2017 movhlps xmm4, xmm6 2018 movhps [r0+r3*2], xmm5 2019 movhps [r0+r3*1], xmm2 2020 psrldq xmm5, 4 2021 movss xmm5, xmm6 2022 psrldq xmm2, 4 2023 movss xmm2, xmm4 2024 lea r0, [r2+r3*2] 2025 psrldq xmm5, 1 2026 psrldq xmm2, 1 
2027 movq [r0+r3*2], xmm5 2028 movq [r0+r3*1], xmm2 2029 psrldq xmm5, 1 2030 psrldq xmm2, 1 2031 movq [r2+r3*2], xmm5 2032 movq [r2+r3*1], xmm2 2033 psrldq xmm5, 1 2034 psrldq xmm2, 1 2035 movq [r1+r3*2], xmm5 2036 movq [r1+r3*1], xmm2 2037 RET 2038%endmacro 2039 2040INIT_MMX 2041%define PALIGNR PALIGNR_MMX 2042PRED8x8L_VERTICAL_RIGHT sse2 2043INIT_MMX 2044%define PALIGNR PALIGNR_SSSE3 2045PRED8x8L_VERTICAL_RIGHT ssse3 2046 2047;----------------------------------------------------------------------------- 2048;void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride) 2049;----------------------------------------------------------------------------- 2050 2051%macro PRED8x8L_VERTICAL_LEFT 1 2052cglobal pred8x8l_vertical_left_%1, 4,4 2053 sub r0, r3 2054 movq mm0, [r0-8] 2055 movq mm3, [r0] 2056 movq mm1, [r0+8] 2057 movq mm2, mm3 2058 movq mm4, mm3 2059 PALIGNR mm2, mm0, 7, mm0 2060 PALIGNR mm1, mm4, 1, mm4 2061 test r1, r1 2062 jz .fix_lt_2 2063 test r2, r2 2064 jz .fix_tr_1 2065 jmp .do_top 2066.fix_lt_2: 2067 movq mm5, mm3 2068 pxor mm5, mm2 2069 psllq mm5, 56 2070 psrlq mm5, 56 2071 pxor mm2, mm5 2072 test r2, r2 2073 jnz .do_top 2074.fix_tr_1: 2075 movq mm5, mm3 2076 pxor mm5, mm1 2077 psrlq mm5, 56 2078 psllq mm5, 56 2079 pxor mm1, mm5 2080 jmp .do_top 2081.fix_tr_2: 2082 punpckhbw mm3, mm3 2083 pshufw mm1, mm3, 0xFF 2084 jmp .do_topright 2085.do_top: 2086 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 2087 movq2dq xmm4, mm4 2088 test r2, r2 2089 jz .fix_tr_2 2090 movq mm0, [r0+8] 2091 movq mm5, mm0 2092 movq mm2, mm0 2093 movq mm4, mm0 2094 psrlq mm5, 56 2095 PALIGNR mm2, mm3, 7, mm3 2096 PALIGNR mm5, mm4, 1, mm4 2097 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 2098.do_topright: 2099 movq2dq xmm3, mm1 2100 lea r1, [r0+r3*2] 2101 pslldq xmm3, 8 2102 por xmm4, xmm3 2103 movdqa xmm2, xmm4 2104 movdqa xmm1, xmm4 2105 movdqa xmm3, xmm4 2106 psrldq xmm2, 1 2107 pslldq xmm1, 1 2108 pavgb xmm3, xmm2 2109 lea r2, [r1+r3*2] 2110INIT_XMM 2111 
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5 ; odd rows: 3-tap lowpass
    psrldq      xmm0, 1
    ; interleave the two row patterns, shifting one byte every row pair
    movq        [r0+r3*1], xmm3
    movq        [r0+r3*2], xmm0
    lea         r0, [r2+r3*2]
    psrldq      xmm3, 1
    psrldq      xmm0, 1
    movq        [r1+r3*1], xmm3
    movq        [r1+r3*2], xmm0
    psrldq      xmm3, 1
    psrldq      xmm0, 1
    movq        [r2+r3*1], xmm3
    movq        [r2+r3*2], xmm0
    psrldq      xmm3, 1
    psrldq      xmm0, 1
    movq        [r0+r3*1], xmm3
    movq        [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_LEFT sse2
%define PALIGNR PALIGNR_SSSE3
INIT_MMX
PRED8x8L_VERTICAL_LEFT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8L_HORIZONTAL_UP 1
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
cglobal pred8x8l_horizontal_up_%1, 4,4
    sub         r0, r3
    lea         r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    test        r1, r1                  ; top_left present?
    lea         r1, [r0+r3]
    cmovnz      r1, r0                  ; pick row 0 or -1 depending on has_topleft
    punpckhbw   mm0, [r1+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov         r2, r0
    punpckhwd   mm1, mm0
    lea         r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea         r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1                ; mm3 = left column
    lea         r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r1+r3*0-8]
    mov         r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2                ; mm7 = filtered left column
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    lea         r1, [r0+r3*2]
    pshufw      mm0, mm7, 00011011b     ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq       mm7, 56                 ; l7 .. .. .. .. .. .. ..
    ; finish reversing the left column into ascending order
    movq        mm2, mm0
    psllw       mm0, 8
    psrlw       mm2, 8
    por         mm2, mm0                ; l7 l6 l5 l4 l3 l2 l1 l0
    movq        mm3, mm2
    movq        mm4, mm2
    movq        mm5, mm2
    psrlq       mm2, 8
    psrlq       mm3, 16
    lea         r2, [r1+r3*2]
    por         mm2, mm7                ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw   mm7, mm7
    por         mm3, mm7                ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb       mm4, mm2                ; averages of vertically adjacent pixels
    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6 ; 3-tap lowpass of the same
    movq        mm5, mm4
    punpcklbw   mm4, mm1                ; p4 p3 p2 p1
    punpckhbw   mm5, mm1                ; p8 p7 p6 p5
    movq        mm6, mm5
    movq        mm7, mm5
    movq        mm0, mm5
    ; each row slides the pair sequence two bytes; the tail rows replicate l7
    PALIGNR     mm5, mm4, 2, mm1
    pshufw      mm1, mm6, 11111001b
    PALIGNR     mm6, mm4, 4, mm2
    pshufw      mm2, mm7, 11111110b
    PALIGNR     mm7, mm4, 6, mm3
    pshufw      mm3, mm0, 11111111b
    movq        [r0+r3*1], mm4
    movq        [r0+r3*2], mm5
    lea         r0, [r2+r3*2]
    movq        [r1+r3*1], mm6
    movq        [r1+r3*2], mm7
    movq        [r2+r3*1], mm0
    movq        [r2+r3*2], mm1
    movq        [r0+r3*1], mm2
    movq        [r0+r3*2], mm3
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_UP mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP ssse3

;-----------------------------------------------------------------------------
;void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

INIT_MMX
%define PALIGNR PALIGNR_MMX
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
cglobal pred8x8l_horizontal_down_mmxext, 4,5
    sub         r0, r3
    lea         r4, [r0+r3*2]
    ; gather the 8 left-edge pixels into mm3
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov         r4, r0
    punpckhwd   mm1, mm0
    lea         r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea         r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea         r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov         r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1                  ; top_left present?
    jnz         .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp         .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2, r2                  ; top_right present?
    jnz         .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp         .do_top
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq        mm6, mm2                ; filtered left column
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test        r1, r1                  ; top_left present?
    jz          .fix_lt_2
    test        r2, r2                  ; top_right present?
    jz          .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm5, mm4                ; filtered top row
    lea         r1, [r0+r3*2]
    psllq       mm7, 56
    movq        mm2, mm5
    movq        mm3, mm6
    movq        mm4, mm2
    PALIGNR     mm2, mm6, 7, mm5
    PALIGNR     mm6, mm7, 7, mm0
    lea         r2, [r1+r3*2]
    PALIGNR     mm4, mm3, 1, mm7
    movq        mm5, mm3
    pavgb       mm3, mm6                ; averaged left pixels
    PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
    movq        mm4, mm2
    movq        mm1, mm2
    lea         r4, [r2+r3*2]
    psrlq       mm4, 16
    psrlq       mm1, 8
    PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5 ; filtered top/corner pixels
    movq        mm7, mm3
    punpcklbw   mm3, mm0                ; interleave avg/lowpass pairs
    punpckhbw   mm7, mm0
    movq        mm1, mm7
    movq        mm0, mm7
    movq        mm4, mm7
    movq        [r4+r3*2], mm3
    PALIGNR     mm7, mm3, 2, mm5
    movq        [r4+r3*1], mm7
    PALIGNR     mm1, mm3, 4, mm5
    movq        [r2+r3*2], mm1
    PALIGNR     mm0, mm3, 6, mm3
    movq        [r2+r3*1], mm0
    movq        mm2, mm6
    movq        mm3, mm6
    movq        [r1+r3*2], mm4
    PALIGNR     mm6, mm4, 2, mm5
    movq        [r1+r3*1], mm6
    PALIGNR     mm2, mm4, 4, mm5
    movq        [r0+r3*2], mm2
    PALIGNR     mm3, mm4, 6, mm4
    movq        [r0+r3*1], mm3
    RET

; SSE2/SSSE3 version: left/top/corner are merged into xmm regs, the whole
; prediction is produced with two 16-byte ops and 2-byte shifts per row.
%macro PRED8x8L_HORIZONTAL_DOWN 1
cglobal pred8x8l_horizontal_down_%1, 4,5
    sub         r0, r3
    lea         r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov         r4, r0
    punpckhwd   mm1, mm0
    lea         r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea         r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea         r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov         r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1                  ; top_left present?
    jnz         .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp         .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2, r2                  ; top_right present?
    jnz         .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp         .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp         .do_topright
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq     xmm0, mm2
    pslldq      xmm0, 8
    movq        mm4, mm0
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    movq2dq     xmm2, mm1
    pslldq      xmm2, 15
    psrldq      xmm2, 8
    por         xmm0, xmm2              ; xmm0 = filtered left column (+corner)
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test        r1, r1                  ; top_left present?
    jz          .fix_lt_2
    test        r2, r2                  ; top_right present?
    jz          .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq     xmm1, mm4               ; filtered top row
    test        r2, r2                  ; top_right present?
    jz          .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 ; filtered top-right pixels
.do_topright:
    movq2dq     xmm5, mm1
    pslldq      xmm5, 8
    por         xmm1, xmm5              ; xmm1 = top | top-right
INIT_XMM
    lea         r2, [r4+r3*2]
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm1
    PALIGNR     xmm1, xmm0, 7, xmm4
    PALIGNR     xmm2, xmm0, 9, xmm5
    lea         r1, [r2+r3*2]
    PALIGNR     xmm3, xmm0, 8, xmm0
    movdqa      xmm4, xmm1
    pavgb       xmm4, xmm3              ; averaged pixel pattern
    lea         r0, [r1+r3*2]
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5 ; lowpass pixel pattern
    punpcklbw   xmm4, xmm0              ; interleave avg/lowpass pairs
    movhlps     xmm0, xmm4
    ; emit rows bottom-up, sliding two bytes per row
    movq        [r0+r3*2], xmm4
    movq        [r2+r3*2], xmm0
    psrldq      xmm4, 2
    psrldq      xmm0, 2
    movq        [r0+r3*1], xmm4
    movq        [r2+r3*1], xmm0
    psrldq      xmm4, 2
    psrldq      xmm0, 2
    movq        [r1+r3*2], xmm4
    movq        [r4+r3*2], xmm0
    psrldq      xmm4, 2
    psrldq      xmm0, 2
    movq        [r1+r3*1], xmm4
    movq        [r4+r3*1], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_DOWN sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_DOWN ssse3

;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

; DC prediction: fill the 4x4 block with (sum of 4 top + 4 left pixels + 4) >> 3,
; replicated to all bytes with the 0x01010101 multiply.
cglobal pred4x4_dc_mmxext, 3,5
    pxor        mm7, mm7
    mov         r4, r0
    sub         r0, r2                  ; r0 -> row above the block
    movd        mm0, [r0]
    psadbw      mm0, mm7                ; sum of the 4 top pixels
    movzx       r1d, byte [r0+r2*1-1]
    movd        r3d, mm0
    add         r3d, r1d
    movzx       r1d, byte [r0+r2*2-1]
    lea         r0, [r0+r2*2]
    add         r3d, r1d
    movzx       r1d, byte [r0+r2*1-1]
    add         r3d, r1d
    movzx       r1d, byte [r0+r2*2-1]
    add         r3d, r1d                ; + the 4 left pixels
    add         r3d, 4
    shr         r3d, 3
    imul        r3d, 0x01010101         ; broadcast DC value to 4 bytes
    mov         [r4+r2*0], r3d
    mov         [r0+r2*0], r3d
    mov         [r0+r2*1], r3d
    mov         [r0+r2*2], r3d
    RET

;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

; VP8 TrueMotion: pred[x][y] = clip(top[x] + left[y] - topleft)
%macro PRED4x4_TM_MMX 1
cglobal pred4x4_tm_vp8_%1, 3,6
    sub         r0, r2
    pxor        mm7, mm7
    movd        mm0, [r0]
    punpcklbw   mm0, mm7                ; top row as words
    movzx       r4d, byte [r0-1]        ; topleft pixel
    mov         r5d, 2
.loop:
    movzx       r1d, byte [r0+r2*1-1]
    movzx       r3d, byte [r0+r2*2-1]
    sub         r1d, r4d                ; left - topleft
    sub         r3d, r4d
    movd        mm2, r1d
    movd        mm4, r3d
%ifidn %1, mmx
    ; plain MMX has no pshufw; broadcast the word manually
    punpcklwd   mm2, mm2
    punpcklwd   mm4, mm4
    punpckldq   mm2, mm2
    punpckldq   mm4, mm4
%else
    pshufw      mm2, mm2, 0
    pshufw      mm4, mm4, 0
%endif
    paddw       mm2, mm0
    paddw       mm4, mm0
    packuswb    mm2, mm2                ; saturating pack = clip to [0,255]
    packuswb    mm4, mm4
    movd        [r0+r2*1], mm2
    movd        [r0+r2*2], mm4
    lea         r0, [r0+r2*2]
    dec         r5d
    jg          .loop
    REP_RET
%endmacro

PRED4x4_TM_MMX mmx
PRED4x4_TM_MMX mmxext

; SSSE3 TrueMotion: pshufb expands left pixels to words directly, no loop
cglobal pred4x4_tm_vp8_ssse3, 3,3
    sub         r0, r2
    movq        mm6, [tm_shuf]
    pxor        mm1, mm1
    movd        mm0, [r0]
    punpcklbw   mm0, mm1                ; top row as words
    movd        mm7, [r0-4]
    pshufb      mm7, mm6                ; broadcast topleft as words
    lea         r1, [r0+r2*2]
    movd        mm2, [r0+r2*1-4]
    movd        mm3, [r0+r2*2-4]
    movd        mm4, [r1+r2*1-4]
    movd        mm5, [r1+r2*2-4]
    pshufb      mm2, mm6
    pshufb      mm3, mm6
    pshufb      mm4, mm6
    pshufb      mm5, mm6
    psubw       mm2, mm7                ; left - topleft
    psubw       mm3, mm7
    psubw       mm4, mm7
    psubw       mm5, mm7
    paddw       mm2, mm0                ; + top
    paddw       mm3, mm0
    paddw       mm4, mm0
    paddw       mm5, mm0
    packuswb    mm2, mm2
    packuswb    mm3, mm3
    packuswb    mm4, mm4
    packuswb    mm5, mm5
    movd        [r0+r2*1], mm2
    movd        [r0+r2*2], mm3
    movd        [r1+r2*1], mm4
    movd        [r1+r2*2], mm5
    RET

;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

INIT_MMX
cglobal pred4x4_vertical_vp8_mmxext, 3,3
    sub         r0, r2
    ; VP8 vertical: lowpass(topleft, top, top+1) replicated to all 4 rows
    movd        m1, [r0-1]
    movd        m0, [r0]
    mova        m2, m0 ;t0 t1 t2 t3
    punpckldq   m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
    lea         r1, [r0+r2*2]
    psrlq       m0, 8 ;t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd        [r0+r2*1], m3
    movd        [r0+r2*2], m3
    movd        [r1+r2*1], m3
    movd        [r1+r2*2], m3
    RET

;-----------------------------------------------------------------------------
; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
INIT_MMX
; r0 = src, r1 = topright pointer, r2 = stride
cglobal pred4x4_down_left_mmxext, 3,3
    sub         r0, r2
    movq        m1, [r0]
    punpckldq   m1, [r1]                ; top | topright (8 pixels)
    movq        m2, m1
    movq        m3, m1
    psllq       m1, 8
    ; build the "shifted right, last pixel repeated" operand in m2
    pxor        m2, m1
    psrlq       m2, 8
    pxor        m2, m3
    PRED4x4_LOWPASS m0, m1, m2, m3, m4
    lea         r1, [r0+r2*2]
    ; each row is the diagonal shifted one more byte
    psrlq       m0, 8
    movd        [r0+r2*1], m0
    psrlq       m0, 8
    movd        [r0+r2*2], m0
    psrlq       m0, 8
    movd        [r1+r2*1], m0
    psrlq       m0, 8
    movd        [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

INIT_MMX
cglobal pred4x4_vertical_left_mmxext, 3,3
    sub         r0, r2
    movq        m1, [r0]
    punpckldq   m1, [r1]                ; top | topright
    movq        m3, m1
    movq        m2, m1
    psrlq       m3, 8
    psrlq       m2, 16
    movq        m4, m3
    pavgb       m4, m1                  ; even rows: average of adjacent pixels
    PRED4x4_LOWPASS m0, m1, m2, m3, m5  ; odd rows: 3-tap lowpass
    lea         r1, [r0+r2*2]
    movh        [r0+r2*1], m4
    movh        [r0+r2*2], m0
    psrlq       m4, 8
    psrlq       m0, 8
    movh        [r1+r2*1], m4
    movh        [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

INIT_MMX
cglobal pred4x4_horizontal_up_mmxext, 3,3
    sub         r0, r2
    lea         r1, [r0+r2*2]
    ; gather the 4 left pixels into the high dword of m0
    movd        m0, [r0+r2*1-4]
    punpcklbw   m0, [r0+r2*2-4]
    movd        m1, [r1+r2*1-4]
    punpcklbw   m1, [r1+r2*2-4]
    punpckhwd   m0, m1
    movq        m1, m0
    punpckhbw   m1, m1
    pshufw      m1, m1, 0xFF            ; broadcast the last left pixel
    punpckhdq   m0, m1
    movq        m2, m0
    movq        m3, m0
    movq        m7, m0
    psrlq       m2, 16
    psrlq       m3, 8
    pavgb       m7, m3                  ; averages of adjacent left pixels
    PRED4x4_LOWPASS m4, m0, m2, m3, m5  ; lowpass of the same
    punpcklbw   m7, m4                  ; interleave avg/lowpass pairs
    movd        [r0+r2*1], m7
    psrlq       m7, 16
    movd        [r0+r2*2], m7
    psrlq       m7, 16
    movd        [r1+r2*1], m7
    movd        [r1+r2*2], m1           ; last row: replicated last pixel
    RET

;-----------------------------------------------------------------------------
; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_horizontal_down_mmxext, 3,3
    sub         r0, r2
    lea         r1, [r0+r2*2]
    movh        m0, [r0-4] ; lt ..
    punpckldq   m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
    psllq       m0, 8 ; t2 t1 t0 lt .. .. .. ..
    movd        m1, [r1+r2*2-4] ; l3
    punpcklbw   m1, [r1+r2*1-4] ; l2 l3
    movd        m2, [r0+r2*2-4] ; l1
    punpcklbw   m2, [r0+r2*1-4] ; l0 l1
    punpckhwd   m1, m2 ; l0 l1 l2 l3
    punpckhdq   m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
    movq        m0, m1
    movq        m2, m1
    movq        m5, m1
    psrlq       m0, 16 ; .. .. t2 t1 t0 lt l0 l1
    psrlq       m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
    pavgb       m5, m2                  ; averaged pairs
    PRED4x4_LOWPASS m3, m1, m0, m2, m4  ; lowpassed pixels
    punpcklbw   m5, m3                  ; interleave avg/lowpass pairs
    psrlq       m3, 32
    PALIGNR     m3, m5, 6, m4           ; top row needs the lowpass tail
    movh        [r1+r2*2], m5
    psrlq       m5, 16
    movh        [r1+r2*1], m5
    psrlq       m5, 16
    movh        [r0+r2*2], m5
    movh        [r0+r2*1], m3
    RET

;-----------------------------------------------------------------------------
; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_vertical_right_mmxext, 3,3
    sub         r0, r2
    lea         r1, [r0+r2*2]
    movh        m0, [r0] ; ........t3t2t1t0
    movq        m5, m0
    ; shift in topleft and the left pixels one at a time
    PALIGNR     m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
    pavgb       m5, m0                  ; even rows: average of top pair
    PALIGNR     m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
    movq        m1, m0
    PALIGNR     m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
    movq        m2, m0
    PALIGNR     m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4  ; odd rows: lowpass
    movq        m1, m3
    psrlq       m3, 16
    psllq       m1, 48                  ; left pixels to prepend on lower rows
    movh        [r0+r2*1], m5
    movh        [r0+r2*2], m3
    PALIGNR     m5, m1, 7, m2
    psllq       m1, 8
    movh        [r1+r2*1], m5
    PALIGNR     m3, m1, 7, m1
    movh        [r1+r2*2], m3
    RET

;-----------------------------------------------------------------------------
; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_down_right_mmxext, 3,3
    sub         r0, r2
    lea         r1, [r0+r2*2]
    ; assemble left column + topleft + top row into one 8-byte diagonal source
    movq        m1, [r1-8]
    movq        m2, [r0+r2*1-8]
    punpckhbw   m2, [r0-8]
    movh        m3, [r0]
    punpckhwd   m1, m2
    PALIGNR     m3, m1, 5, m1
    movq        m1, m3
    PALIGNR     m3, [r1+r2*1-8], 7, m4
    movq        m2, m3
    PALIGNR     m3, [r1+r2*2-8], 7, m4
    PRED4x4_LOWPASS m0, m3, m1, m2, m4
    ; each row going up is the diagonal shifted one byte
    movh        [r1+r2*2], m0
    psrlq       m0, 8
    movh        [r1+r2*1], m0
    psrlq       m0, 8
    movh        [r0+r2*2], m0
    psrlq       m0, 8
    movh        [r0+r2*1], m0
    RET