1;****************************************************************************** 2;* H.264 intra prediction asm optimizations 3;* Copyright (c) 2010 Fiona Glaser 4;* Copyright (c) 2010 Holger Lubitz 5;* Copyright (c) 2010 Loren Merritt 6;* Copyright (c) 2010 Ronald S. Bultje 7;* 8;* This file is part of FFmpeg. 9;* 10;* FFmpeg is free software; you can redistribute it and/or 11;* modify it under the terms of the GNU Lesser General Public 12;* License as published by the Free Software Foundation; either 13;* version 2.1 of the License, or (at your option) any later version. 14;* 15;* FFmpeg is distributed in the hope that it will be useful, 16;* but WITHOUT ANY WARRANTY; without even the implied warranty of 17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18;* Lesser General Public License for more details. 19;* 20;* You should have received a copy of the GNU Lesser General Public 21;* License along with FFmpeg; if not, write to the Free Software 22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 23;****************************************************************************** 24 25%include "libavutil/x86/x86util.asm" 26 27SECTION_RODATA 28 29tm_shuf: times 8 db 0x03, 0x80 30pw_ff00: times 8 dw 0xff00 31plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1 32 db 1, 2, 3, 4, 5, 6, 7, 8 33plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0 34 db 1, 2, 3, 4, 0, 0, 0, 0 35pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7 36pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8 37pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1 38pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4 39 40SECTION .text 41 42cextern pb_1 43cextern pb_3 44cextern pw_4 45cextern pw_5 46cextern pw_8 47cextern pw_16 48cextern pw_17 49cextern pw_32 50 51;----------------------------------------------------------------------------- 52; void ff_pred16x16_vertical_8(uint8_t *src, int stride) 53;----------------------------------------------------------------------------- 54 55INIT_MMX mmx 56cglobal pred16x16_vertical_8, 2,3 57 sub r0, r1 58 mov r2, 8 59 movq mm0, [r0+0] 60 movq mm1, [r0+8] 61.loop: 62 movq [r0+r1*1+0], mm0 63 movq [r0+r1*1+8], mm1 64 movq [r0+r1*2+0], mm0 65 movq [r0+r1*2+8], mm1 66 lea r0, [r0+r1*2] 67 dec r2 68 jg .loop 69 REP_RET 70 71INIT_XMM sse 72cglobal pred16x16_vertical_8, 2,3 73 sub r0, r1 74 mov r2, 4 75 movaps xmm0, [r0] 76.loop: 77 movaps [r0+r1*1], xmm0 78 movaps [r0+r1*2], xmm0 79 lea r0, [r0+r1*2] 80 movaps [r0+r1*1], xmm0 81 movaps [r0+r1*2], xmm0 82 lea r0, [r0+r1*2] 83 dec r2 84 jg .loop 85 REP_RET 86 87;----------------------------------------------------------------------------- 88; void ff_pred16x16_horizontal_8(uint8_t *src, int stride) 89;----------------------------------------------------------------------------- 90 91%macro PRED16x16_H 0 92cglobal pred16x16_horizontal_8, 2,3 93 mov r2, 8 94%if cpuflag(ssse3) 95 mova m2, [pb_3] 96%endif 97.loop: 98 movd m0, [r0+r1*0-4] 99 movd m1, [r0+r1*1-4] 100 101%if cpuflag(ssse3) 102 pshufb m0, m2 103 pshufb m1, m2 104%else 105 punpcklbw m0, m0 106 punpcklbw m1, m1 107 SPLATW m0, m0, 3 108 SPLATW m1, m1, 3 109 mova [r0+r1*0+8], m0 110 mova [r0+r1*1+8], m1 111%endif 112 113 mova [r0+r1*0], m0 114 mova [r0+r1*1], m1 115 lea r0, [r0+r1*2] 116 dec r2 117 jg .loop 118 REP_RET 119%endmacro 120 121INIT_MMX mmx 122PRED16x16_H 123INIT_MMX mmxext 124PRED16x16_H 125INIT_XMM ssse3 126PRED16x16_H 127 128;----------------------------------------------------------------------------- 129; void ff_pred16x16_dc_8(uint8_t *src, int stride) 130;----------------------------------------------------------------------------- 131 132%macro PRED16x16_DC 0 133cglobal pred16x16_dc_8, 2,7 134 mov r4, r0 135 sub r0, r1 136 pxor mm0, mm0 137 pxor mm1, mm1 138 psadbw mm0, [r0+0] 139 psadbw mm1, [r0+8] 140 dec r0 141 movzx r5d, byte [r0+r1*1] 142 paddw mm0, mm1 143 movd r6d, mm0 144 lea r0, [r0+r1*2] 145%rep 7 146 movzx r2d, byte [r0+r1*0] 147 movzx r3d, byte [r0+r1*1] 148 add r5d, r2d 149 add r6d, r3d 150 lea r0, [r0+r1*2] 151%endrep 152 movzx r2d, byte [r0+r1*0] 153 add r5d, r6d 154 lea r2d, [r2+r5+16] 155 shr r2d, 5 156%if cpuflag(ssse3) 157 pxor m1, m1 158%endif 159 SPLATB_REG m0, r2, m1 160 161%if mmsize==8 162 mov r3d, 8 163.loop: 164 mova [r4+r1*0+0], m0 165 mova [r4+r1*0+8], m0 166 mova [r4+r1*1+0], m0 167 mova [r4+r1*1+8], m0 168%else 169 mov r3d, 4 170.loop: 171 mova [r4+r1*0], m0 172 mova [r4+r1*1], m0 173 lea r4, [r4+r1*2] 174 mova [r4+r1*0], m0 175 mova [r4+r1*1], m0 176%endif 177 lea r4, [r4+r1*2] 178 dec r3d 179 jg .loop 180 REP_RET 181%endmacro 182 183INIT_MMX mmxext 184PRED16x16_DC 185INIT_XMM sse2 186PRED16x16_DC 187INIT_XMM ssse3 188PRED16x16_DC 189 190;----------------------------------------------------------------------------- 191; void ff_pred16x16_tm_vp8_8(uint8_t *src, int stride) 192;----------------------------------------------------------------------------- 193 194%macro PRED16x16_TM 0 195cglobal pred16x16_tm_vp8_8, 2,5 196 sub r0, r1 197 pxor mm7, mm7 198 movq mm0, [r0+0] 199 movq mm2, [r0+8] 200 movq mm1, mm0 201 movq mm3, mm2 202 punpcklbw mm0, mm7 203 punpckhbw mm1, mm7 204 punpcklbw mm2, mm7 205 punpckhbw mm3, mm7 206 movzx r3d, byte [r0-1] 207 mov r4d, 16 208.loop: 209 movzx r2d, byte [r0+r1-1] 210 sub r2d, r3d 211 movd mm4, r2d 212 SPLATW mm4, mm4, 0 213 movq mm5, mm4 214 movq mm6, mm4 215 movq mm7, mm4 216 paddw mm4, mm0 217 paddw mm5, mm1 218 paddw mm6, mm2 219 paddw mm7, mm3 220 packuswb mm4, mm5 221 packuswb mm6, mm7 222 movq [r0+r1+0], mm4 223 movq [r0+r1+8], mm6 224 add r0, r1 225 dec r4d 226 jg .loop 227 REP_RET 228%endmacro 229 230INIT_MMX mmx 231PRED16x16_TM 232INIT_MMX mmxext 233PRED16x16_TM 234 235INIT_XMM sse2 236cglobal pred16x16_tm_vp8_8, 2,6,6 237 sub r0, r1 238 pxor xmm2, xmm2 239 movdqa xmm0, [r0] 240 movdqa xmm1, xmm0 241 punpcklbw xmm0, xmm2 242 punpckhbw xmm1, xmm2 243 movzx r4d, byte [r0-1] 244 mov r5d, 8 245.loop: 246 movzx r2d, byte [r0+r1*1-1] 247 movzx r3d, byte [r0+r1*2-1] 248 sub r2d, r4d 249 sub r3d, r4d 250 movd xmm2, r2d 251 movd xmm4, r3d 252 pshuflw xmm2, xmm2, 0 253 pshuflw xmm4, xmm4, 0 254 punpcklqdq xmm2, xmm2 255 punpcklqdq xmm4, xmm4 256 movdqa xmm3, xmm2 257 movdqa xmm5, xmm4 258 paddw xmm2, xmm0 259 paddw xmm3, xmm1 260 paddw xmm4, xmm0 261 paddw xmm5, xmm1 262 packuswb xmm2, xmm3 263 packuswb xmm4, xmm5 264 movdqa [r0+r1*1], xmm2 265 movdqa [r0+r1*2], xmm4 266 lea r0, [r0+r1*2] 267 dec r5d 268 jg .loop 269 REP_RET 270 271;----------------------------------------------------------------------------- 272; void ff_pred16x16_plane_*_8(uint8_t *src, int stride) 273;----------------------------------------------------------------------------- 274 275%macro H264_PRED16x16_PLANE 1 276cglobal pred16x16_plane_%1_8, 2,9,7 277 mov r2, r1 ; +stride 278 neg r1 ; -stride 279 280 movh m0, [r0+r1 -1] 281%if mmsize == 8 282 pxor m4, m4 283 movh m1, [r0+r1 +3 ] 284 movh m2, [r0+r1 +8 ] 285 movh m3, [r0+r1 +12] 286 punpcklbw m0, m4 287 punpcklbw m1, m4 288 punpcklbw m2, m4 289 punpcklbw m3, m4 290 pmullw m0, [pw_m8tom1 ] 291 pmullw m1, [pw_m8tom1+8] 292 pmullw m2, [pw_1to8 ] 293 pmullw m3, [pw_1to8 +8] 294 paddw m0, m2 295 paddw m1, m3 296%else ; mmsize == 16 297%if cpuflag(ssse3) 298 movhps m0, [r0+r1 +8] 299 pmaddubsw m0, [plane_shuf] ; H coefficients 300%else ; sse2 301 pxor m2, m2 302 movh m1, [r0+r1 +8] 303 punpcklbw m0, m2 304 punpcklbw m1, m2 305 pmullw m0, [pw_m8tom1] 306 pmullw m1, [pw_1to8] 307 paddw m0, m1 308%endif 309 movhlps m1, m0 310%endif 311 paddw m0, m1 312%if cpuflag(mmxext) 313 PSHUFLW m1, m0, 0xE 314%elif cpuflag(mmx) 315 mova m1, m0 316 psrlq m1, 32 317%endif 318 paddw m0, m1 319%if cpuflag(mmxext) 320 PSHUFLW m1, m0, 0x1 321%elif cpuflag(mmx) 322 mova m1, m0 323 psrlq m1, 16 324%endif 325 paddw m0, m1 ; sum of H coefficients 326 327 lea r4, [r0+r2*8-1] 328 lea r3, [r0+r2*4-1] 329 add r4, r2 330 331%if ARCH_X86_64 332%define e_reg r8 333%else 334%define e_reg r0 335%endif 336 337 movzx e_reg, byte [r3+r2*2 ] 338 movzx r5, byte [r4+r1 ] 339 sub r5, e_reg 340 341 movzx e_reg, byte [r3+r2 ] 342 movzx r6, byte [r4 ] 343 sub r6, e_reg 344 lea r5, [r5+r6*2] 345 346 movzx e_reg, byte [r3+r1 ] 347 movzx r6, byte [r4+r2*2 ] 348 sub r6, e_reg 349 lea r5, [r5+r6*4] 350 351 movzx e_reg, byte [r3 ] 352%if ARCH_X86_64 353 movzx r7, byte [r4+r2 ] 354 sub r7, e_reg 355%else 356 movzx r6, byte [r4+r2 ] 357 sub r6, e_reg 358 lea r5, [r5+r6*4] 359 sub r5, r6 360%endif 361 362 lea e_reg, [r3+r1*4] 363 lea r3, [r4+r2*4] 364 365 movzx r4, byte [e_reg+r2 ] 366 movzx r6, byte [r3 ] 367 sub r6, r4 368%if ARCH_X86_64 369 lea r6, [r7+r6*2] 370 lea r5, [r5+r6*2] 371 add r5, r6 372%else 373 lea r5, [r5+r6*4] 374 lea r5, [r5+r6*2] 375%endif 376 377 movzx r4, byte [e_reg ] 378%if ARCH_X86_64 379 movzx r7, byte [r3 +r2 ] 380 sub r7, r4 381 sub r5, r7 382%else 383 movzx r6, byte [r3 +r2 ] 384 sub r6, r4 385 lea r5, [r5+r6*8] 386 sub r5, r6 387%endif 388 389 movzx r4, byte [e_reg+r1 ] 390 movzx r6, byte [r3 +r2*2] 391 sub r6, r4 392%if ARCH_X86_64 393 add r6, r7 394%endif 395 lea r5, [r5+r6*8] 396 397 movzx r4, byte [e_reg+r2*2] 398 movzx r6, byte [r3 +r1 ] 399 sub r6, r4 400 lea r5, [r5+r6*4] 401 add r5, r6 ; sum of V coefficients 402 403%if ARCH_X86_64 == 0 404 mov r0, r0m 405%endif 406 407%ifidn %1, h264 408 lea r5, [r5*5+32] 409 sar r5, 6 410%elifidn %1, rv40 411 lea r5, [r5*5] 412 sar r5, 6 413%elifidn %1, svq3 414 test r5, r5 415 lea r6, [r5+3] 416 cmovs r5, r6 417 sar r5, 2 ; V/4 418 lea r5, [r5*5] ; 5*(V/4) 419 test r5, r5 420 lea r6, [r5+15] 421 cmovs r5, r6 422 sar r5, 4 ; (5*(V/4))/16 423%endif 424 425 movzx r4, byte [r0+r1 +15] 426 movzx r3, byte [r3+r2*2 ] 427 lea r3, [r3+r4+1] 428 shl r3, 4 429 430 movd r1d, m0 431 movsx r1d, r1w 432%ifnidn %1, svq3 433%ifidn %1, h264 434 lea r1d, [r1d*5+32] 435%else ; rv40 436 lea r1d, [r1d*5] 437%endif 438 sar r1d, 6 439%else ; svq3 440 test r1d, r1d 441 lea r4d, [r1d+3] 442 cmovs r1d, r4d 443 sar r1d, 2 ; H/4 444 lea r1d, [r1d*5] ; 5*(H/4) 445 test r1d, r1d 446 lea r4d, [r1d+15] 447 cmovs r1d, r4d 448 sar r1d, 4 ; (5*(H/4))/16 449%endif 450 movd m0, r1d 451 452 add r1d, r5d 453 add r3d, r1d 454 shl r1d, 3 455 sub r3d, r1d ; a 456 457 movd m1, r5d 458 movd m3, r3d 459 SPLATW m0, m0, 0 ; H 460 SPLATW m1, m1, 0 ; V 461 SPLATW m3, m3, 0 ; a 462%ifidn %1, svq3 463 SWAP 0, 1 464%endif 465 mova m2, m0 466%if mmsize == 8 467 mova m5, m0 468%endif 469 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) 470%if mmsize == 16 471 psllw m2, 3 472%else 473 psllw m5, 3 474 psllw m2, 2 475 mova m6, m5 476 paddw m6, m2 477%endif 478 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H 479 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H 480%if mmsize == 8 481 paddw m5, m0 ; a + {8,9,10,11}*H 482 paddw m6, m0 ; a + {12,13,14,15}*H 483%endif 484 485 mov r4, 8 486.loop: 487 mova m3, m0 ; b[0..7] 488 mova m4, m2 ; b[8..15] 489 psraw m3, 5 490 psraw m4, 5 491 packuswb m3, m4 492 mova [r0], m3 493%if mmsize == 8 494 mova m3, m5 ; b[8..11] 495 mova m4, m6 ; b[12..15] 496 psraw m3, 5 497 psraw m4, 5 498 packuswb m3, m4 499 mova [r0+8], m3 500%endif 501 paddw m0, m1 502 paddw m2, m1 503%if mmsize == 8 504 paddw m5, m1 505 paddw m6, m1 506%endif 507 508 mova m3, m0 ; b[0..7] 509 mova m4, m2 ; b[8..15] 510 psraw m3, 5 511 psraw m4, 5 512 packuswb m3, m4 513 mova [r0+r2], m3 514%if mmsize == 8 515 mova m3, m5 ; b[8..11] 516 mova m4, m6 ; b[12..15] 517 psraw m3, 5 518 psraw m4, 5 519 packuswb m3, m4 520 mova [r0+r2+8], m3 521%endif 522 paddw m0, m1 523 paddw m2, m1 524%if mmsize == 8 525 paddw m5, m1 526 paddw m6, m1 527%endif 528 529 lea r0, [r0+r2*2] 530 dec r4 531 jg .loop 532 REP_RET 533%endmacro 534 535INIT_MMX mmx 536H264_PRED16x16_PLANE h264 537H264_PRED16x16_PLANE rv40 538H264_PRED16x16_PLANE svq3 539INIT_MMX mmxext 540H264_PRED16x16_PLANE h264 541H264_PRED16x16_PLANE rv40 542H264_PRED16x16_PLANE svq3 543INIT_XMM sse2 544H264_PRED16x16_PLANE h264 545H264_PRED16x16_PLANE rv40 546H264_PRED16x16_PLANE svq3 547INIT_XMM ssse3 548H264_PRED16x16_PLANE h264 549H264_PRED16x16_PLANE rv40 550H264_PRED16x16_PLANE svq3 551 552;----------------------------------------------------------------------------- 553; void ff_pred8x8_plane_8(uint8_t *src, int stride) 554;----------------------------------------------------------------------------- 555 556%macro H264_PRED8x8_PLANE 0 557cglobal pred8x8_plane_8, 2,9,7 558 mov r2, r1 ; +stride 559 neg r1 ; -stride 560 561 movd m0, [r0+r1 -1] 562%if mmsize == 8 563 pxor m2, m2 564 movh m1, [r0+r1 +4 ] 565 punpcklbw m0, m2 566 punpcklbw m1, m2 567 pmullw m0, [pw_m4to4] 568 pmullw m1, [pw_m4to4+8] 569%else ; mmsize == 16 570%if cpuflag(ssse3) 571 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary 572 pmaddubsw m0, [plane8_shuf] ; H coefficients 573%else ; sse2 574 pxor m2, m2 575 movd m1, [r0+r1 +4] 576 punpckldq m0, m1 577 punpcklbw m0, m2 578 pmullw m0, [pw_m4to4] 579%endif 580 movhlps m1, m0 581%endif 582 paddw m0, m1 583 584%if notcpuflag(ssse3) 585%if cpuflag(mmxext) 586 PSHUFLW m1, m0, 0xE 587%elif cpuflag(mmx) 588 mova m1, m0 589 psrlq m1, 32 590%endif 591 paddw m0, m1 592%endif ; !ssse3 593 594%if cpuflag(mmxext) 595 PSHUFLW m1, m0, 0x1 596%elif cpuflag(mmx) 597 mova m1, m0 598 psrlq m1, 16 599%endif 600 paddw m0, m1 ; sum of H coefficients 601 602 lea r4, [r0+r2*4-1] 603 lea r3, [r0 -1] 604 add r4, r2 605 606%if ARCH_X86_64 607%define e_reg r8 608%else 609%define e_reg r0 610%endif 611 612 movzx e_reg, byte [r3+r2*2 ] 613 movzx r5, byte [r4+r1 ] 614 sub r5, e_reg 615 616 movzx e_reg, byte [r3 ] 617%if ARCH_X86_64 618 movzx r7, byte [r4+r2 ] 619 sub r7, e_reg 620 sub r5, r7 621%else 622 movzx r6, byte [r4+r2 ] 623 sub r6, e_reg 624 lea r5, [r5+r6*4] 625 sub r5, r6 626%endif 627 628 movzx e_reg, byte [r3+r1 ] 629 movzx r6, byte [r4+r2*2 ] 630 sub r6, e_reg 631%if ARCH_X86_64 632 add r6, r7 633%endif 634 lea r5, [r5+r6*4] 635 636 movzx e_reg, byte [r3+r2 ] 637 movzx r6, byte [r4 ] 638 sub r6, e_reg 639 lea r6, [r5+r6*2] 640 641 lea r5, [r6*9+16] 642 lea r5, [r5+r6*8] 643 sar r5, 5 644 645%if ARCH_X86_64 == 0 646 mov r0, r0m 647%endif 648 649 movzx r3, byte [r4+r2*2 ] 650 movzx r4, byte [r0+r1 +7] 651 lea r3, [r3+r4+1] 652 shl r3, 4 653 movd r1d, m0 654 movsx r1d, r1w 655 imul r1d, 17 656 add r1d, 16 657 sar r1d, 5 658 movd m0, r1d 659 add r1d, r5d 660 sub r3d, r1d 661 add r1d, r1d 662 sub r3d, r1d ; a 663 664 movd m1, r5d 665 movd m3, r3d 666 SPLATW m0, m0, 0 ; H 667 SPLATW m1, m1, 0 ; V 668 SPLATW m3, m3, 0 ; a 669%if mmsize == 8 670 mova m2, m0 671%endif 672 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) 673 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H 674%if mmsize == 8 675 psllw m2, 2 676 paddw m2, m0 ; a + {4,5,6,7}*H 677%endif 678 679 mov r4, 4 680ALIGN 16 681.loop: 682%if mmsize == 16 683 mova m3, m0 ; b[0..7] 684 paddw m0, m1 685 psraw m3, 5 686 mova m4, m0 ; V+b[0..7] 687 paddw m0, m1 688 psraw m4, 5 689 packuswb m3, m4 690 movh [r0], m3 691 movhps [r0+r2], m3 692%else ; mmsize == 8 693 mova m3, m0 ; b[0..3] 694 mova m4, m2 ; b[4..7] 695 paddw m0, m1 696 paddw m2, m1 697 psraw m3, 5 698 psraw m4, 5 699 mova m5, m0 ; V+b[0..3] 700 mova m6, m2 ; V+b[4..7] 701 paddw m0, m1 702 paddw m2, m1 703 psraw m5, 5 704 psraw m6, 5 705 packuswb m3, m4 706 packuswb m5, m6 707 mova [r0], m3 708 mova [r0+r2], m5 709%endif 710 711 lea r0, [r0+r2*2] 712 dec r4 713 jg .loop 714 REP_RET 715%endmacro 716 717INIT_MMX mmx 718H264_PRED8x8_PLANE 719INIT_MMX mmxext 720H264_PRED8x8_PLANE 721INIT_XMM sse2 722H264_PRED8x8_PLANE 723INIT_XMM ssse3 724H264_PRED8x8_PLANE 725 726;----------------------------------------------------------------------------- 727; void ff_pred8x8_vertical_8(uint8_t *src, int stride) 728;----------------------------------------------------------------------------- 729 730INIT_MMX mmx 731cglobal pred8x8_vertical_8, 2,2 732 sub r0, r1 733 movq mm0, [r0] 734%rep 3 735 movq [r0+r1*1], mm0 736 movq [r0+r1*2], mm0 737 lea r0, [r0+r1*2] 738%endrep 739 movq [r0+r1*1], mm0 740 movq [r0+r1*2], mm0 741 RET 742 743;----------------------------------------------------------------------------- 744; void ff_pred8x8_horizontal_8(uint8_t *src, int stride) 745;----------------------------------------------------------------------------- 746 747%macro PRED8x8_H 0 748cglobal pred8x8_horizontal_8, 2,3 749 mov r2, 4 750%if cpuflag(ssse3) 751 mova m2, [pb_3] 752%endif 753.loop: 754 SPLATB_LOAD m0, r0+r1*0-1, m2 755 SPLATB_LOAD m1, r0+r1*1-1, m2 756 mova [r0+r1*0], m0 757 mova [r0+r1*1], m1 758 lea r0, [r0+r1*2] 759 dec r2 760 jg .loop 761 REP_RET 762%endmacro 763 764INIT_MMX mmx 765PRED8x8_H 766INIT_MMX mmxext 767PRED8x8_H 768INIT_MMX ssse3 769PRED8x8_H 770 771;----------------------------------------------------------------------------- 772; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, int stride) 773;----------------------------------------------------------------------------- 774INIT_MMX mmxext 775cglobal pred8x8_top_dc_8, 2,5 776 sub r0, r1 777 movq mm0, [r0] 778 pxor mm1, mm1 779 pxor mm2, mm2 780 lea r2, [r0+r1*2] 781 punpckhbw mm1, mm0 782 punpcklbw mm0, mm2 783 psadbw mm1, mm2 ; s1 784 lea r3, [r2+r1*2] 785 psadbw mm0, mm2 ; s0 786 psrlw mm1, 1 787 psrlw mm0, 1 788 pavgw mm1, mm2 789 lea r4, [r3+r1*2] 790 pavgw mm0, mm2 791 pshufw mm1, mm1, 0 792 pshufw mm0, mm0, 0 ; dc0 (w) 793 packuswb mm0, mm1 ; dc0,dc1 (b) 794 movq [r0+r1*1], mm0 795 movq [r0+r1*2], mm0 796 lea r0, [r3+r1*2] 797 movq [r2+r1*1], mm0 798 movq [r2+r1*2], mm0 799 movq [r3+r1*1], mm0 800 movq [r3+r1*2], mm0 801 movq [r0+r1*1], mm0 802 movq [r0+r1*2], mm0 803 RET 804 805;----------------------------------------------------------------------------- 806; void ff_pred8x8_dc_8_mmxext(uint8_t *src, int stride) 807;----------------------------------------------------------------------------- 808 809INIT_MMX mmxext 810cglobal pred8x8_dc_8, 2,5 811 sub r0, r1 812 pxor m7, m7 813 movd m0, [r0+0] 814 movd m1, [r0+4] 815 psadbw m0, m7 ; s0 816 mov r4, r0 817 psadbw m1, m7 ; s1 818 819 movzx r2d, byte [r0+r1*1-1] 820 movzx r3d, byte [r0+r1*2-1] 821 lea r0, [r0+r1*2] 822 add r2d, r3d 823 movzx r3d, byte [r0+r1*1-1] 824 add r2d, r3d 825 movzx r3d, byte [r0+r1*2-1] 826 add r2d, r3d 827 lea r0, [r0+r1*2] 828 movd m2, r2d ; s2 829 movzx r2d, byte [r0+r1*1-1] 830 movzx r3d, byte [r0+r1*2-1] 831 lea r0, [r0+r1*2] 832 add r2d, r3d 833 movzx r3d, byte [r0+r1*1-1] 834 add r2d, r3d 835 movzx r3d, byte [r0+r1*2-1] 836 add r2d, r3d 837 movd m3, r2d ; s3 838 839 punpcklwd m0, m1 840 mov r0, r4 841 punpcklwd m2, m3 842 punpckldq m0, m2 ; s0, s1, s2, s3 843 pshufw m3, m0, 11110110b ; s2, s1, s3, s3 844 lea r2, [r0+r1*2] 845 pshufw m0, m0, 01110100b ; s0, s1, s3, s1 846 paddw m0, m3 847 lea r3, [r2+r1*2] 848 psrlw m0, 2 849 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3 850 lea r4, [r3+r1*2] 851 packuswb m0, m0 852 punpcklbw m0, m0 853 movq m1, m0 854 punpcklbw m0, m0 855 punpckhbw m1, m1 856 movq [r0+r1*1], m0 857 movq [r0+r1*2], m0 858 movq [r2+r1*1], m0 859 movq [r2+r1*2], m0 860 movq [r3+r1*1], m1 861 movq [r3+r1*2], m1 862 movq [r4+r1*1], m1 863 movq [r4+r1*2], m1 864 RET 865 866;----------------------------------------------------------------------------- 867; void ff_pred8x8_dc_rv40_8(uint8_t *src, int stride) 868;----------------------------------------------------------------------------- 869 870INIT_MMX mmxext 871cglobal pred8x8_dc_rv40_8, 2,7 872 mov r4, r0 873 sub r0, r1 874 pxor mm0, mm0 875 psadbw mm0, [r0] 876 dec r0 877 movzx r5d, byte [r0+r1*1] 878 movd r6d, mm0 879 lea r0, [r0+r1*2] 880%rep 3 881 movzx r2d, byte [r0+r1*0] 882 movzx r3d, byte [r0+r1*1] 883 add r5d, r2d 884 add r6d, r3d 885 lea r0, [r0+r1*2] 886%endrep 887 movzx r2d, byte [r0+r1*0] 888 add r5d, r6d 889 lea r2d, [r2+r5+8] 890 shr r2d, 4 891 movd mm0, r2d 892 punpcklbw mm0, mm0 893 pshufw mm0, mm0, 0 894 mov r3d, 4 895.loop: 896 movq [r4+r1*0], mm0 897 movq [r4+r1*1], mm0 898 lea r4, [r4+r1*2] 899 dec r3d 900 jg .loop 901 REP_RET 902 903;----------------------------------------------------------------------------- 904; void ff_pred8x8_tm_vp8_8(uint8_t *src, int stride) 905;----------------------------------------------------------------------------- 906 907%macro PRED8x8_TM 0 908cglobal pred8x8_tm_vp8_8, 2,6 909 sub r0, r1 910 pxor mm7, mm7 911 movq mm0, [r0] 912 movq mm1, mm0 913 punpcklbw mm0, mm7 914 punpckhbw mm1, mm7 915 movzx r4d, byte [r0-1] 916 mov r5d, 4 917.loop: 918 movzx r2d, byte [r0+r1*1-1] 919 movzx r3d, byte [r0+r1*2-1] 920 sub r2d, r4d 921 sub r3d, r4d 922 movd mm2, r2d 923 movd mm4, r3d 924 SPLATW mm2, mm2, 0 925 SPLATW mm4, mm4, 0 926 movq mm3, mm2 927 movq mm5, mm4 928 paddw mm2, mm0 929 paddw mm3, mm1 930 paddw mm4, mm0 931 paddw mm5, mm1 932 packuswb mm2, mm3 933 packuswb mm4, mm5 934 movq [r0+r1*1], mm2 935 movq [r0+r1*2], mm4 936 lea r0, [r0+r1*2] 937 dec r5d 938 jg .loop 939 REP_RET 940%endmacro 941 942INIT_MMX mmx 943PRED8x8_TM 944INIT_MMX mmxext 945PRED8x8_TM 946 947INIT_XMM sse2 948cglobal pred8x8_tm_vp8_8, 2,6,4 949 sub r0, r1 950 pxor xmm1, xmm1 951 movq xmm0, [r0] 952 punpcklbw xmm0, xmm1 953 movzx r4d, byte [r0-1] 954 mov r5d, 4 955.loop: 956 movzx r2d, byte [r0+r1*1-1] 957 movzx r3d, byte [r0+r1*2-1] 958 sub r2d, r4d 959 sub r3d, r4d 960 movd xmm2, r2d 961 movd xmm3, r3d 962 pshuflw xmm2, xmm2, 0 963 pshuflw xmm3, xmm3, 0 964 punpcklqdq xmm2, xmm2 965 punpcklqdq xmm3, xmm3 966 paddw xmm2, xmm0 967 paddw xmm3, xmm0 968 packuswb xmm2, xmm3 969 movq [r0+r1*1], xmm2 970 movhps [r0+r1*2], xmm2 971 lea r0, [r0+r1*2] 972 dec r5d 973 jg .loop 974 REP_RET 975 976INIT_XMM ssse3 977cglobal pred8x8_tm_vp8_8, 2,3,6 978 sub r0, r1 979 movdqa xmm4, [tm_shuf] 980 pxor xmm1, xmm1 981 movq xmm0, [r0] 982 punpcklbw xmm0, xmm1 983 movd xmm5, [r0-4] 984 pshufb xmm5, xmm4 985 mov r2d, 4 986.loop: 987 movd xmm2, [r0+r1*1-4] 988 movd xmm3, [r0+r1*2-4] 989 pshufb xmm2, xmm4 990 pshufb xmm3, xmm4 991 psubw xmm2, xmm5 992 psubw xmm3, xmm5 993 paddw xmm2, xmm0 994 paddw xmm3, xmm0 995 packuswb xmm2, xmm3 996 movq [r0+r1*1], xmm2 997 movhps [r0+r1*2], xmm2 998 lea r0, [r0+r1*2] 999 dec r2d 1000 jg .loop 1001 REP_RET 1002 1003; dest, left, right, src, tmp 1004; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 1005%macro PRED4x4_LOWPASS 5 1006 mova %5, %2 1007 pavgb %2, %3 1008 pxor %3, %5 1009 mova %1, %4 1010 pand %3, [pb_1] 1011 psubusb %2, %3 1012 pavgb %1, %2 1013%endmacro 1014 1015;----------------------------------------------------------------------------- 1016; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright, 1017; int stride) 1018;----------------------------------------------------------------------------- 1019%macro PRED8x8L_TOP_DC 0 1020cglobal pred8x8l_top_dc_8, 4,4 1021 sub r0, r3 1022 pxor mm7, mm7 1023 movq mm0, [r0-8] 1024 movq mm3, [r0] 1025 movq mm1, [r0+8] 1026 movq mm2, mm3 1027 movq mm4, mm3 1028 PALIGNR mm2, mm0, 7, mm0 1029 PALIGNR mm1, mm4, 1, mm4 1030 test r1, r1 ; top_left 1031 jz .fix_lt_2 1032 test r2, r2 ; top_right 1033 jz .fix_tr_1 1034 jmp .body 1035.fix_lt_2: 1036 movq mm5, mm3 1037 pxor mm5, mm2 1038 psllq mm5, 56 1039 psrlq mm5, 56 1040 pxor mm2, mm5 1041 test r2, r2 ; top_right 1042 jnz .body 1043.fix_tr_1: 1044 movq mm5, mm3 1045 pxor mm5, mm1 1046 psrlq mm5, 56 1047 psllq mm5, 56 1048 pxor mm1, mm5 1049.body: 1050 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 1051 psadbw mm7, mm0 1052 paddw mm7, [pw_4] 1053 psrlw mm7, 3 1054 pshufw mm7, mm7, 0 1055 packuswb mm7, mm7 1056%rep 3 1057 movq [r0+r3*1], mm7 1058 movq [r0+r3*2], mm7 1059 lea r0, [r0+r3*2] 1060%endrep 1061 movq [r0+r3*1], mm7 1062 movq [r0+r3*2], mm7 1063 RET 1064%endmacro 1065 1066INIT_MMX mmxext 1067PRED8x8L_TOP_DC 1068INIT_MMX ssse3 1069PRED8x8L_TOP_DC 1070 1071;----------------------------------------------------------------------------- 1072; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright, 1073; int stride) 1074;----------------------------------------------------------------------------- 1075 1076%macro PRED8x8L_DC 0 1077cglobal pred8x8l_dc_8, 4,5 1078 sub r0, r3 1079 lea r4, [r0+r3*2] 1080 movq mm0, [r0+r3*1-8] 1081 punpckhbw mm0, [r0+r3*0-8] 1082 movq mm1, [r4+r3*1-8] 1083 punpckhbw mm1, [r0+r3*2-8] 1084 mov r4, r0 1085 punpckhwd mm1, mm0 1086 lea r0, [r0+r3*4] 1087 movq mm2, [r0+r3*1-8] 1088 punpckhbw mm2, [r0+r3*0-8] 1089 lea r0, [r0+r3*2] 1090 movq mm3, [r0+r3*1-8] 1091 punpckhbw mm3, [r0+r3*0-8] 1092 punpckhwd mm3, mm2 1093 punpckhdq mm3, mm1 1094 lea r0, [r0+r3*2] 1095 movq mm0, [r0+r3*0-8] 1096 movq mm1, [r4] 1097 mov r0, r4 1098 movq mm4, mm3 1099 movq mm2, mm3 1100 PALIGNR mm4, mm0, 7, mm0 1101 PALIGNR mm1, mm2, 1, mm2 1102 test r1, r1 1103 jnz .do_left 1104.fix_lt_1: 1105 movq mm5, mm3 1106 pxor mm5, mm4 1107 psrlq mm5, 56 1108 psllq mm5, 48 1109 pxor mm1, mm5 1110 jmp .do_left 1111.fix_lt_2: 1112 movq mm5, mm3 1113 pxor mm5, mm2 1114 psllq mm5, 56 1115 psrlq mm5, 56 1116 pxor mm2, mm5 1117 test r2, r2 1118 jnz .body 1119.fix_tr_1: 1120 movq mm5, mm3 1121 pxor mm5, mm1 1122 psrlq mm5, 56 1123 psllq mm5, 56 1124 pxor mm1, mm5 1125 jmp .body 1126.do_left: 1127 movq mm0, mm4 1128 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1129 movq mm4, mm0 1130 movq mm7, mm2 1131 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 1132 psllq mm1, 56 1133 PALIGNR mm7, mm1, 7, mm3 1134 movq mm0, [r0-8] 1135 movq mm3, [r0] 1136 movq mm1, [r0+8] 1137 movq mm2, mm3 1138 movq mm4, mm3 1139 PALIGNR mm2, mm0, 7, mm0 1140 PALIGNR mm1, mm4, 1, mm4 1141 test r1, r1 1142 jz .fix_lt_2 1143 test r2, r2 1144 jz .fix_tr_1 1145.body: 1146 lea r1, [r0+r3*2] 1147 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 1148 pxor mm0, mm0 1149 pxor mm1, mm1 1150 lea r2, [r1+r3*2] 1151 psadbw mm0, mm7 1152 psadbw mm1, mm6 1153 paddw mm0, [pw_8] 1154 paddw mm0, mm1 1155 lea r4, [r2+r3*2] 1156 psrlw mm0, 4 1157 pshufw mm0, mm0, 0 1158 packuswb mm0, mm0 1159 movq [r0+r3*1], mm0 1160 movq [r0+r3*2], mm0 1161 movq [r1+r3*1], mm0 1162 movq [r1+r3*2], mm0 1163 movq [r2+r3*1], mm0 1164 movq [r2+r3*2], mm0 1165 movq [r4+r3*1], mm0 1166 movq [r4+r3*2], mm0 1167 RET 1168%endmacro 1169 1170INIT_MMX mmxext 1171PRED8x8L_DC 1172INIT_MMX ssse3 1173PRED8x8L_DC 1174 1175;----------------------------------------------------------------------------- 1176; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft, 1177; int has_topright, int stride) 1178;----------------------------------------------------------------------------- 1179 1180%macro PRED8x8L_HORIZONTAL 0 1181cglobal pred8x8l_horizontal_8, 4,4 1182 sub r0, r3 1183 lea r2, [r0+r3*2] 1184 movq mm0, [r0+r3*1-8] 1185 test r1, r1 1186 lea r1, [r0+r3] 1187 cmovnz r1, r0 1188 punpckhbw mm0, [r1+r3*0-8] 1189 movq mm1, [r2+r3*1-8] 1190 punpckhbw mm1, [r0+r3*2-8] 1191 mov r2, r0 1192 punpckhwd mm1, mm0 1193 lea r0, [r0+r3*4] 1194 movq mm2, [r0+r3*1-8] 1195 punpckhbw mm2, [r0+r3*0-8] 1196 lea r0, [r0+r3*2] 1197 movq mm3, [r0+r3*1-8] 1198 punpckhbw mm3, [r0+r3*0-8] 1199 punpckhwd mm3, mm2 1200 punpckhdq mm3, mm1 1201 lea r0, [r0+r3*2] 1202 movq mm0, [r0+r3*0-8] 1203 movq mm1, [r1+r3*0-8] 1204 mov r0, r2 1205 movq mm4, mm3 1206 movq mm2, mm3 1207 PALIGNR mm4, mm0, 7, mm0 1208 PALIGNR mm1, mm2, 1, mm2 1209 movq mm0, mm4 1210 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1211 movq mm4, mm0 1212 movq mm7, mm2 1213 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 1214 psllq mm1, 56 1215 PALIGNR mm7, mm1, 7, mm3 1216 movq mm3, mm7 1217 lea r1, [r0+r3*2] 1218 movq mm7, mm3 1219 punpckhbw mm3, mm3 1220 punpcklbw mm7, mm7 1221 pshufw mm0, mm3, 0xff 1222 pshufw mm1, mm3, 0xaa 1223 lea r2, [r1+r3*2] 1224 pshufw mm2, mm3, 0x55 1225 pshufw mm3, mm3, 0x00 1226 pshufw mm4, mm7, 0xff 1227 pshufw mm5, mm7, 0xaa 1228 pshufw mm6, mm7, 0x55 1229 pshufw mm7, mm7, 0x00 1230 movq [r0+r3*1], mm0 1231 movq [r0+r3*2], mm1 1232 movq [r1+r3*1], mm2 1233 movq [r1+r3*2], mm3 1234 movq [r2+r3*1], mm4 1235 movq [r2+r3*2], mm5 1236 lea r0, [r2+r3*2] 1237 movq [r0+r3*1], mm6 1238 movq [r0+r3*2], mm7 1239 RET 1240%endmacro 1241 1242INIT_MMX mmxext 1243PRED8x8L_HORIZONTAL 1244INIT_MMX ssse3 1245PRED8x8L_HORIZONTAL 1246 1247;----------------------------------------------------------------------------- 1248; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright, 1249; int stride) 1250;----------------------------------------------------------------------------- 1251 1252%macro PRED8x8L_VERTICAL 0 1253cglobal pred8x8l_vertical_8, 4,4 1254 sub r0, r3 1255 movq mm0, [r0-8] 1256 movq mm3, [r0] 1257 movq mm1, [r0+8] 1258 movq mm2, mm3 1259 movq mm4, mm3 1260 PALIGNR mm2, mm0, 7, mm0 1261 PALIGNR mm1, mm4, 1, mm4 1262 test r1, r1 ; top_left 1263 jz .fix_lt_2 1264 test r2, r2 ; top_right 1265 jz .fix_tr_1 1266 jmp .body 1267.fix_lt_2: 1268 movq mm5, mm3 1269 pxor mm5, mm2 1270 psllq mm5, 56 1271 psrlq mm5, 56 1272 pxor mm2, mm5 1273 test r2, r2 ; top_right 1274 jnz .body 1275.fix_tr_1: 1276 movq mm5, mm3 1277 pxor mm5, mm1 1278 psrlq mm5, 56 1279 psllq mm5, 56 1280 pxor mm1, mm5 1281.body: 1282 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 1283%rep 3 1284 movq [r0+r3*1], mm0 1285 movq [r0+r3*2], mm0 1286 lea r0, [r0+r3*2] 1287%endrep 1288 movq [r0+r3*1], mm0 1289 movq [r0+r3*2], mm0 1290 RET 1291%endmacro 1292 1293INIT_MMX mmxext 1294PRED8x8L_VERTICAL 1295INIT_MMX ssse3 1296PRED8x8L_VERTICAL 1297 1298;----------------------------------------------------------------------------- 1299; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft, 1300; int has_topright, int stride) 1301;----------------------------------------------------------------------------- 1302 1303INIT_MMX mmxext 1304cglobal pred8x8l_down_left_8, 4,5 1305 sub r0, r3 1306 movq mm0, [r0-8] 1307 movq mm3, [r0] 1308 movq mm1, [r0+8] 1309 movq mm2, mm3 1310 movq mm4, mm3 1311 PALIGNR mm2, mm0, 7, mm0 1312 PALIGNR mm1, mm4, 1, mm4 1313 test r1, r1 1314 jz .fix_lt_2 1315 test r2, r2 1316 jz .fix_tr_1 1317 jmp .do_top 1318.fix_lt_2: 1319 movq mm5, mm3 1320 pxor mm5, mm2 1321 psllq mm5, 56 1322 psrlq mm5, 56 1323 pxor mm2, mm5 1324 test r2, r2 1325 jnz .do_top 1326.fix_tr_1: 1327 movq mm5, mm3 1328 pxor mm5, mm1 1329 psrlq mm5, 56 1330 psllq mm5, 56 1331 pxor mm1, mm5 1332 jmp .do_top 1333.fix_tr_2: 1334 punpckhbw mm3, mm3 1335 pshufw mm1, mm3, 0xFF 1336 jmp .do_topright 1337.do_top: 1338 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 1339 movq mm7, mm4 1340 test r2, r2 1341 jz .fix_tr_2 1342 movq mm0, [r0+8] 1343 movq mm5, mm0 1344 movq mm2, mm0 1345 movq mm4, mm0 1346 psrlq mm5, 56 1347 PALIGNR mm2, mm3, 7, mm3 1348 PALIGNR mm5, mm4, 1, mm4 1349 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 1350.do_topright: 1351 lea r1, [r0+r3*2] 1352 movq mm6, mm1 1353 psrlq mm1, 56 1354 movq mm4, mm1 1355 lea r2, [r1+r3*2] 1356 movq mm2, mm6 1357 PALIGNR mm2, mm7, 1, mm0 1358 movq mm3, mm6 1359 PALIGNR mm3, mm7, 7, mm0 1360 PALIGNR mm4, mm6, 1, mm0 1361 movq mm5, mm7 1362 movq mm1, mm7 1363 movq mm7, mm6 1364 lea r4, [r2+r3*2] 1365 psllq mm1, 8 1366 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6 1367 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6 1368 movq [r4+r3*2], mm1 1369 movq mm2, mm0 1370 psllq mm1, 8 1371 psrlq mm2, 56 1372 psllq mm0, 8 1373 por mm1, mm2 1374 movq [r4+r3*1], mm1 1375 movq mm2, mm0 1376 psllq mm1, 8 1377 psrlq mm2, 56 1378 psllq mm0, 8 1379 por mm1, mm2 1380 movq [r2+r3*2], mm1 1381 movq mm2, mm0 1382 psllq mm1, 8 1383 psrlq mm2, 56 1384 psllq mm0, 8 1385 por mm1, mm2 1386 movq [r2+r3*1], mm1 1387 movq mm2, mm0 1388 psllq mm1, 8 1389 psrlq mm2, 56 1390 psllq mm0, 8 1391 por mm1, mm2 1392 movq [r1+r3*2], mm1 1393 movq mm2, mm0 1394 psllq mm1, 8 1395 psrlq mm2, 56 1396 psllq mm0, 8 1397 por mm1, mm2 1398 movq [r1+r3*1], mm1 1399 movq mm2, mm0 1400 psllq mm1, 8 1401 psrlq mm2, 56 1402 psllq mm0, 8 1403 por mm1, mm2 1404 movq [r0+r3*2], mm1 1405 psllq mm1, 8 1406 psrlq mm0, 56 1407 por mm1, mm0 1408 movq [r0+r3*1], mm1 1409 RET 1410 1411%macro PRED8x8L_DOWN_LEFT 0 1412cglobal pred8x8l_down_left_8, 4,4 1413 sub r0, r3 1414 movq mm0, [r0-8] 1415 movq mm3, [r0] 1416 movq mm1, [r0+8] 1417 movq mm2, mm3 1418 movq mm4, mm3 1419 PALIGNR mm2, mm0, 7, mm0 1420 PALIGNR mm1, mm4, 1, mm4 1421 test r1, r1 ; top_left 1422 jz .fix_lt_2 1423 test r2, r2 ; top_right 1424 jz .fix_tr_1 1425 jmp .do_top 1426.fix_lt_2: 1427 movq mm5, mm3 1428 pxor mm5, mm2 1429 psllq mm5, 56 1430 psrlq mm5, 56 1431 pxor mm2, mm5 1432 test r2, r2 ; top_right 1433 jnz .do_top 1434.fix_tr_1: 1435 movq mm5, mm3 1436 pxor mm5, mm1 1437 psrlq mm5, 56 1438 psllq mm5, 56 1439 pxor mm1, mm5 1440 jmp .do_top 1441.fix_tr_2: 1442 punpckhbw mm3, mm3 1443 pshufw mm1, mm3, 0xFF 1444 jmp .do_topright 1445.do_top: 1446 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 1447 movq2dq xmm3, mm4 1448 test r2, r2 ; top_right 1449 jz .fix_tr_2 1450 movq mm0, [r0+8] 1451 movq mm5, mm0 1452 movq mm2, mm0 1453 movq mm4, mm0 1454 psrlq mm5, 56 1455 PALIGNR mm2, mm3, 7, mm3 1456 PALIGNR mm5, mm4, 1, mm4 1457 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 1458.do_topright: 1459 movq2dq xmm4, mm1 1460 psrlq mm1, 56 1461 movq2dq xmm5, mm1 1462 lea r1, [r0+r3*2] 1463 pslldq xmm4, 8 1464 por xmm3, xmm4 1465 movdqa xmm2, xmm3 1466 psrldq xmm2, 1 1467 pslldq xmm5, 15 1468 por xmm2, xmm5 1469 lea r2, [r1+r3*2] 1470 movdqa xmm1, xmm3 1471 pslldq xmm1, 1 1472INIT_XMM cpuname 1473 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 1474 psrldq xmm0, 1 1475 movq [r0+r3*1], xmm0 1476 psrldq xmm0, 1 1477 movq [r0+r3*2], xmm0 1478 psrldq xmm0, 1 1479 lea r0, [r2+r3*2] 1480 movq [r1+r3*1], xmm0 1481 psrldq xmm0, 1 1482 movq [r1+r3*2], xmm0 1483 psrldq xmm0, 1 1484 movq [r2+r3*1], xmm0 1485 psrldq xmm0, 1 1486 movq [r2+r3*2], xmm0 1487 psrldq xmm0, 1 1488 movq [r0+r3*1], xmm0 1489 psrldq xmm0, 1 1490 movq [r0+r3*2], xmm0 1491 RET 1492%endmacro 1493 1494INIT_MMX sse2 1495PRED8x8L_DOWN_LEFT 1496INIT_MMX ssse3 1497PRED8x8L_DOWN_LEFT 1498 1499;----------------------------------------------------------------------------- 1500; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft, 1501; int has_topright, int stride) 1502;----------------------------------------------------------------------------- 1503 1504INIT_MMX mmxext 1505cglobal pred8x8l_down_right_8, 4,5 1506 sub r0, r3 1507 lea r4, [r0+r3*2] 1508 movq mm0, [r0+r3*1-8] 1509 punpckhbw mm0, [r0+r3*0-8] 1510 movq mm1, [r4+r3*1-8] 1511 punpckhbw mm1, [r0+r3*2-8] 1512 mov r4, r0 1513 punpckhwd mm1, mm0 1514 lea r0, [r0+r3*4] 1515 movq mm2, [r0+r3*1-8] 1516 punpckhbw mm2, [r0+r3*0-8] 1517 lea r0, [r0+r3*2] 1518 movq mm3, [r0+r3*1-8] 1519 punpckhbw mm3, [r0+r3*0-8] 1520 punpckhwd mm3, mm2 1521 punpckhdq mm3, mm1 1522 lea r0, [r0+r3*2] 1523 movq mm0, [r0+r3*0-8] 1524 movq mm1, [r4] 1525 mov r0, r4 1526 movq mm4, mm3 1527 movq mm2, mm3 1528 PALIGNR mm4, mm0, 7, mm0 1529 PALIGNR mm1, mm2, 1, mm2 1530 test r1, r1 ; top_left 1531 jz .fix_lt_1 1532.do_left: 1533 movq mm0, mm4 1534 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1535 movq mm4, mm0 1536 movq mm7, mm2 1537 movq mm6, mm2 1538 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 1539 psllq mm1, 56 1540 PALIGNR mm7, mm1, 7, mm3 1541 movq mm0, [r0-8] 1542 movq mm3, [r0] 1543 movq mm1, [r0+8] 1544 movq mm2, mm3 1545 movq mm4, mm3 1546 PALIGNR mm2, mm0, 7, mm0 1547 PALIGNR mm1, mm4, 1, mm4 1548 test r1, r1 ; top_left 1549 jz .fix_lt_2 1550 test r2, r2 ; top_right 1551 jz .fix_tr_1 1552.do_top: 1553 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 1554 movq mm5, mm4 1555 jmp .body 1556.fix_lt_1: 1557 movq mm5, mm3 1558 pxor mm5, mm4 1559 psrlq mm5, 56 1560 psllq mm5, 48 1561 pxor mm1, mm5 1562 jmp .do_left 1563.fix_lt_2: 1564 movq mm5, mm3 1565 pxor mm5, mm2 1566 psllq mm5, 56 1567 psrlq mm5, 56 1568 pxor mm2, mm5 1569 test r2, r2 ; top_right 1570 jnz .do_top 1571.fix_tr_1: 1572 movq mm5, mm3 1573 pxor mm5, mm1 1574 psrlq mm5, 56 1575 psllq mm5, 56 1576 pxor mm1, mm5 1577 jmp .do_top 1578.body: 1579 lea r1, [r0+r3*2] 1580 movq mm1, mm7 1581 movq mm7, mm5 1582 movq mm5, mm6 1583 movq mm2, mm7 1584 lea r2, [r1+r3*2] 1585 PALIGNR mm2, mm6, 1, mm0 1586 movq mm3, mm7 1587 PALIGNR mm3, mm6, 7, mm0 1588 movq mm4, mm7 1589 lea r4, [r2+r3*2] 1590 psrlq mm4, 8 1591 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6 1592 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6 1593 movq [r4+r3*2], mm0 1594 movq mm2, mm1 1595 psrlq mm0, 8 1596 psllq mm2, 56 1597 psrlq mm1, 8 1598 por mm0, mm2 1599 movq [r4+r3*1], mm0 1600 movq mm2, mm1 1601 psrlq mm0, 8 1602 psllq mm2, 56 1603 psrlq mm1, 8 1604 por mm0, mm2 1605 movq [r2+r3*2], mm0 1606 movq mm2, mm1 1607 psrlq mm0, 8 1608 psllq mm2, 56 1609 psrlq mm1, 8 1610 por mm0, mm2 1611 movq [r2+r3*1], mm0 1612 movq mm2, mm1 1613 psrlq mm0, 8 1614 psllq mm2, 56 1615 psrlq mm1, 8 1616 por mm0, mm2 1617 movq [r1+r3*2], mm0 1618 movq mm2, mm1 1619 psrlq mm0, 8 1620 psllq mm2, 56 1621 psrlq mm1, 8 1622 por mm0, mm2 1623 movq [r1+r3*1], mm0 1624 movq mm2, mm1 1625 psrlq mm0, 8 1626 psllq mm2, 56 1627 psrlq mm1, 8 1628 por mm0, mm2 1629 movq [r0+r3*2], mm0 1630 psrlq mm0, 8 1631 psllq mm1, 56 1632 por mm0, mm1 1633 movq [r0+r3*1], mm0 1634 RET 1635 1636%macro PRED8x8L_DOWN_RIGHT 0 1637cglobal pred8x8l_down_right_8, 4,5 1638 sub r0, r3 1639 lea r4, [r0+r3*2] 1640 movq mm0, [r0+r3*1-8] 1641 punpckhbw mm0, [r0+r3*0-8] 1642 movq mm1, [r4+r3*1-8] 1643 punpckhbw mm1, [r0+r3*2-8] 1644 mov r4, r0 1645 punpckhwd mm1, mm0 1646 lea r0, [r0+r3*4] 1647 movq mm2, [r0+r3*1-8] 1648 punpckhbw mm2, [r0+r3*0-8] 1649 lea r0, [r0+r3*2] 1650 movq mm3, [r0+r3*1-8] 1651 punpckhbw mm3, [r0+r3*0-8] 1652 punpckhwd mm3, mm2 1653 punpckhdq mm3, mm1 1654 lea r0, [r0+r3*2] 1655 movq mm0, [r0+r3*0-8] 1656 movq mm1, [r4] 1657 mov r0, r4 1658 movq mm4, mm3 1659 movq mm2, mm3 1660 PALIGNR mm4, mm0, 7, mm0 1661 PALIGNR mm1, mm2, 1, mm2 1662 test r1, r1 1663 jz .fix_lt_1 1664 jmp .do_left 1665.fix_lt_1: 1666 movq mm5, mm3 1667 pxor mm5, mm4 1668 psrlq mm5, 56 1669 psllq mm5, 48 1670 pxor mm1, mm5 1671 jmp .do_left 1672.fix_lt_2: 1673 movq mm5, mm3 1674 pxor mm5, mm2 1675 psllq mm5, 56 1676 psrlq mm5, 56 1677 pxor mm2, mm5 1678 test r2, r2 1679 jnz .do_top 1680.fix_tr_1: 1681 movq mm5, mm3 1682 pxor mm5, mm1 1683 psrlq mm5, 56 1684 psllq mm5, 56 1685 pxor mm1, mm5 1686 jmp .do_top 1687.do_left: 1688 movq mm0, mm4 1689 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1690 movq mm4, mm0 1691 movq mm7, mm2 1692 movq2dq xmm3, mm2 1693 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 1694 psllq mm1, 56 1695 PALIGNR mm7, mm1, 7, mm3 1696 movq2dq xmm1, mm7 1697 movq mm0, [r0-8] 1698 movq mm3, [r0] 1699 movq mm1, [r0+8] 1700 movq mm2, mm3 1701 movq mm4, mm3 1702 PALIGNR mm2, mm0, 7, mm0 1703 PALIGNR mm1, mm4, 1, mm4 1704 test r1, r1 1705 jz .fix_lt_2 1706 test r2, r2 1707 jz .fix_tr_1 1708.do_top: 1709 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 1710 movq2dq xmm4, mm4 1711 lea r1, [r0+r3*2] 1712 movdqa xmm0, xmm3 1713 pslldq xmm4, 8 1714 por xmm3, xmm4 1715 lea r2, [r1+r3*2] 1716 pslldq xmm4, 1 1717 por xmm1, xmm4 1718 psrldq xmm0, 7 1719 pslldq xmm0, 15 1720 psrldq xmm0, 7 1721 por xmm1, xmm0 1722 lea r0, [r2+r3*2] 1723 movdqa xmm2, xmm3 1724 psrldq xmm2, 1 1725INIT_XMM cpuname 1726 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 1727 movdqa xmm1, xmm0 1728 psrldq xmm1, 1 1729 movq [r0+r3*2], xmm0 1730 movq [r0+r3*1], xmm1 1731 psrldq xmm0, 2 1732 psrldq xmm1, 2 1733 movq [r2+r3*2], xmm0 1734 movq [r2+r3*1], xmm1 1735 psrldq xmm0, 2 1736 psrldq xmm1, 2 1737 movq [r1+r3*2], xmm0 1738 movq [r1+r3*1], xmm1 1739 psrldq xmm0, 2 1740 psrldq xmm1, 2 1741 movq [r4+r3*2], xmm0 1742 movq [r4+r3*1], xmm1 1743 RET 1744%endmacro 1745 1746INIT_MMX sse2 1747PRED8x8L_DOWN_RIGHT 1748INIT_MMX ssse3 1749PRED8x8L_DOWN_RIGHT 1750 1751;----------------------------------------------------------------------------- 1752; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft, 1753; int has_topright, int stride) 1754;----------------------------------------------------------------------------- 1755 1756INIT_MMX mmxext 1757cglobal pred8x8l_vertical_right_8, 4,5 1758 sub r0, r3 1759 lea r4, [r0+r3*2] 1760 movq mm0, [r0+r3*1-8] 1761 punpckhbw mm0, [r0+r3*0-8] 1762 movq mm1, [r4+r3*1-8] 1763 punpckhbw mm1, [r0+r3*2-8] 1764 mov r4, r0 1765 punpckhwd mm1, mm0 1766 lea r0, [r0+r3*4] 1767 movq mm2, [r0+r3*1-8] 1768 punpckhbw mm2, [r0+r3*0-8] 1769 lea r0, [r0+r3*2] 1770 movq mm3, [r0+r3*1-8] 1771 punpckhbw mm3, [r0+r3*0-8] 1772 punpckhwd mm3, mm2 1773 punpckhdq mm3, mm1 1774 lea r0, [r0+r3*2] 1775 movq mm0, [r0+r3*0-8] 1776 movq mm1, [r4] 1777 mov r0, r4 1778 movq mm4, mm3 1779 movq mm2, mm3 1780 PALIGNR mm4, mm0, 7, mm0 1781 PALIGNR mm1, mm2, 1, mm2 1782 test r1, r1 1783 jz .fix_lt_1 1784 jmp .do_left 1785.fix_lt_1: 1786 movq mm5, mm3 1787 pxor mm5, mm4 1788 psrlq mm5, 56 1789 psllq mm5, 48 1790 pxor mm1, mm5 1791 jmp .do_left 1792.fix_lt_2: 1793 movq mm5, mm3 1794 pxor mm5, mm2 1795 psllq mm5, 56 1796 psrlq mm5, 56 1797 pxor mm2, mm5 1798 test r2, r2 1799 jnz .do_top 1800.fix_tr_1: 1801 movq mm5, mm3 1802 pxor mm5, mm1 1803 psrlq mm5, 56 1804 psllq mm5, 56 1805 pxor mm1, mm5 1806 jmp .do_top 1807.do_left: 1808 movq mm0, mm4 1809 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1810 movq mm7, mm2 1811 movq mm0, [r0-8] 1812 movq mm3, [r0] 1813 movq mm1, [r0+8] 1814 movq mm2, mm3 1815 movq mm4, mm3 1816 PALIGNR mm2, mm0, 7, mm0 1817 PALIGNR mm1, mm4, 1, mm4 1818 test r1, r1 1819 jz .fix_lt_2 1820 test r2, r2 1821 jz .fix_tr_1 1822.do_top: 1823 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 1824 lea r1, [r0+r3*2] 1825 movq mm2, mm6 1826 movq mm3, mm6 1827 PALIGNR mm3, mm7, 7, mm0 1828 PALIGNR mm6, mm7, 6, mm1 1829 movq mm4, mm3 1830 pavgb mm3, mm2 1831 lea r2, [r1+r3*2] 1832 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5 1833 movq [r0+r3*1], mm3 1834 movq [r0+r3*2], mm0 1835 movq mm5, mm0 1836 movq mm6, mm3 1837 movq mm1, mm7 1838 movq mm2, mm1 1839 psllq mm2, 8 1840 movq mm3, mm1 1841 psllq mm3, 16 1842 lea r4, [r2+r3*2] 1843 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4 1844 PALIGNR mm6, mm0, 7, mm2 1845 movq [r1+r3*1], mm6 1846 psllq mm0, 8 1847 PALIGNR mm5, mm0, 7, mm1 1848 movq [r1+r3*2], mm5 1849 psllq mm0, 8 1850 PALIGNR mm6, mm0, 7, mm2 1851 movq [r2+r3*1], mm6 1852 psllq mm0, 8 1853 PALIGNR mm5, mm0, 7, mm1 1854 movq [r2+r3*2], mm5 1855 psllq mm0, 8 1856 PALIGNR mm6, mm0, 7, mm2 1857 movq [r4+r3*1], mm6 1858 psllq mm0, 8 1859 PALIGNR mm5, mm0, 7, mm1 1860 movq [r4+r3*2], mm5 1861 RET 1862 1863%macro PRED8x8L_VERTICAL_RIGHT 0 1864cglobal pred8x8l_vertical_right_8, 4,5,7 1865 ; manually spill XMM registers for Win64 because 1866 ; the code here is initialized with INIT_MMX 1867 WIN64_SPILL_XMM 7 1868 sub r0, r3 1869 lea r4, [r0+r3*2] 1870 movq mm0, [r0+r3*1-8] 1871 punpckhbw mm0, [r0+r3*0-8] 1872 movq mm1, [r4+r3*1-8] 1873 punpckhbw mm1, [r0+r3*2-8] 1874 mov r4, r0 1875 punpckhwd mm1, mm0 1876 lea r0, [r0+r3*4] 1877 movq mm2, [r0+r3*1-8] 1878 punpckhbw mm2, [r0+r3*0-8] 1879 lea r0, [r0+r3*2] 1880 movq mm3, [r0+r3*1-8] 1881 punpckhbw mm3, [r0+r3*0-8] 1882 punpckhwd mm3, mm2 1883 punpckhdq mm3, mm1 1884 lea r0, [r0+r3*2] 1885 movq mm0, [r0+r3*0-8] 1886 movq mm1, [r4] 1887 mov r0, r4 1888 movq mm4, mm3 1889 movq mm2, mm3 1890 PALIGNR mm4, mm0, 7, mm0 1891 PALIGNR mm1, mm2, 1, mm2 1892 test r1, r1 1893 jnz .do_left 1894.fix_lt_1: 1895 movq mm5, mm3 1896 pxor mm5, mm4 1897 psrlq mm5, 56 1898 psllq mm5, 48 1899 pxor mm1, mm5 1900 jmp .do_left 1901.fix_lt_2: 1902 movq mm5, mm3 1903 pxor mm5, mm2 1904 psllq mm5, 56 1905 psrlq mm5, 56 1906 pxor mm2, mm5 1907 test r2, r2 1908 jnz .do_top 1909.fix_tr_1: 1910 movq mm5, mm3 1911 pxor mm5, mm1 1912 psrlq mm5, 56 1913 psllq mm5, 56 1914 pxor mm1, mm5 1915 jmp .do_top 1916.do_left: 1917 movq mm0, mm4 1918 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1919 movq2dq xmm0, mm2 1920 movq mm0, [r0-8] 1921 movq mm3, [r0] 1922 movq mm1, [r0+8] 1923 movq mm2, mm3 1924 movq mm4, mm3 1925 PALIGNR mm2, mm0, 7, mm0 1926 PALIGNR mm1, mm4, 1, mm4 1927 test r1, r1 1928 jz .fix_lt_2 1929 test r2, r2 1930 jz .fix_tr_1 1931.do_top: 1932 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 1933 lea r1, [r0+r3*2] 1934 movq2dq xmm4, mm6 1935 pslldq xmm4, 8 1936 por xmm0, xmm4 1937 movdqa xmm6, [pw_ff00] 1938 movdqa xmm1, xmm0 1939 lea r2, [r1+r3*2] 1940 movdqa xmm2, xmm0 1941 movdqa xmm3, xmm0 1942 pslldq xmm0, 1 1943 pslldq xmm1, 2 1944 pavgb xmm2, xmm0 1945INIT_XMM cpuname 1946 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5 1947 pandn xmm6, xmm4 1948 movdqa xmm5, xmm4 1949 psrlw xmm4, 8 1950 packuswb xmm6, xmm4 1951 movhlps xmm4, xmm6 1952 movhps [r0+r3*2], xmm5 1953 movhps [r0+r3*1], xmm2 1954 psrldq xmm5, 4 1955 movss xmm5, xmm6 1956 psrldq xmm2, 4 1957 movss xmm2, xmm4 1958 lea r0, [r2+r3*2] 1959 psrldq xmm5, 1 1960 psrldq xmm2, 1 1961 movq [r0+r3*2], xmm5 1962 movq [r0+r3*1], xmm2 1963 psrldq xmm5, 1 1964 psrldq xmm2, 1 1965 movq [r2+r3*2], xmm5 1966 movq [r2+r3*1], xmm2 1967 psrldq xmm5, 1 1968 psrldq xmm2, 1 1969 movq [r1+r3*2], xmm5 1970 movq [r1+r3*1], xmm2 1971 RET 1972%endmacro 1973 1974INIT_MMX sse2 1975PRED8x8L_VERTICAL_RIGHT 1976INIT_MMX ssse3 1977PRED8x8L_VERTICAL_RIGHT 1978 1979;----------------------------------------------------------------------------- 1980; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft, 1981; int has_topright, int stride) 1982;----------------------------------------------------------------------------- 1983 1984%macro PRED8x8L_VERTICAL_LEFT 0 1985cglobal pred8x8l_vertical_left_8, 4,4 1986 sub r0, r3 1987 movq mm0, [r0-8] 1988 movq mm3, [r0] 1989 movq mm1, [r0+8] 1990 movq mm2, mm3 1991 movq mm4, mm3 1992 PALIGNR mm2, mm0, 7, mm0 1993 PALIGNR mm1, mm4, 1, mm4 1994 test r1, r1 1995 jz .fix_lt_2 1996 test r2, r2 1997 jz .fix_tr_1 1998 jmp .do_top 1999.fix_lt_2: 2000 movq mm5, mm3 2001 pxor mm5, mm2 2002 psllq mm5, 56 2003 psrlq mm5, 56 2004 pxor mm2, mm5 2005 test r2, r2 2006 jnz .do_top 2007.fix_tr_1: 2008 movq mm5, mm3 2009 pxor mm5, mm1 2010 psrlq mm5, 56 2011 psllq mm5, 56 2012 pxor mm1, mm5 2013 jmp .do_top 2014.fix_tr_2: 2015 punpckhbw mm3, mm3 2016 pshufw mm1, mm3, 0xFF 2017 jmp .do_topright 2018.do_top: 2019 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 2020 movq2dq xmm4, mm4 2021 test r2, r2 2022 jz .fix_tr_2 2023 movq mm0, [r0+8] 2024 movq mm5, mm0 2025 movq mm2, mm0 2026 movq mm4, mm0 2027 psrlq mm5, 56 2028 PALIGNR mm2, mm3, 7, mm3 2029 PALIGNR mm5, mm4, 1, mm4 2030 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 2031.do_topright: 2032 movq2dq xmm3, mm1 2033 lea r1, [r0+r3*2] 2034 pslldq xmm3, 8 2035 por xmm4, xmm3 2036 movdqa xmm2, xmm4 2037 movdqa xmm1, xmm4 2038 movdqa xmm3, xmm4 2039 psrldq xmm2, 1 2040 pslldq xmm1, 1 2041 pavgb xmm3, xmm2 2042 lea r2, [r1+r3*2] 2043INIT_XMM cpuname 2044 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5 2045 psrldq xmm0, 1 2046 movq [r0+r3*1], xmm3 2047 movq [r0+r3*2], xmm0 2048 lea r0, [r2+r3*2] 2049 psrldq xmm3, 1 2050 psrldq xmm0, 1 2051 movq [r1+r3*1], xmm3 2052 movq [r1+r3*2], xmm0 2053 psrldq xmm3, 1 2054 psrldq xmm0, 1 2055 movq [r2+r3*1], xmm3 2056 movq [r2+r3*2], xmm0 2057 psrldq xmm3, 1 2058 psrldq xmm0, 1 2059 movq [r0+r3*1], xmm3 2060 movq [r0+r3*2], xmm0 2061 RET 2062%endmacro 2063 2064INIT_MMX sse2 2065PRED8x8L_VERTICAL_LEFT 2066INIT_MMX ssse3 2067PRED8x8L_VERTICAL_LEFT 2068 2069;----------------------------------------------------------------------------- 2070; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft, 2071; int has_topright, int stride) 2072;----------------------------------------------------------------------------- 2073 2074%macro PRED8x8L_HORIZONTAL_UP 0 2075cglobal pred8x8l_horizontal_up_8, 4,4 2076 sub r0, r3 2077 lea r2, [r0+r3*2] 2078 movq mm0, [r0+r3*1-8] 2079 test r1, r1 2080 lea r1, [r0+r3] 2081 cmovnz r1, r0 2082 punpckhbw mm0, [r1+r3*0-8] 2083 movq mm1, [r2+r3*1-8] 2084 punpckhbw mm1, [r0+r3*2-8] 2085 mov r2, r0 2086 punpckhwd mm1, mm0 2087 lea r0, [r0+r3*4] 2088 movq mm2, [r0+r3*1-8] 2089 punpckhbw mm2, [r0+r3*0-8] 2090 lea r0, [r0+r3*2] 2091 movq mm3, [r0+r3*1-8] 2092 punpckhbw mm3, [r0+r3*0-8] 2093 punpckhwd mm3, mm2 2094 punpckhdq mm3, mm1 2095 lea r0, [r0+r3*2] 2096 movq mm0, [r0+r3*0-8] 2097 movq mm1, [r1+r3*0-8] 2098 mov r0, r2 2099 movq mm4, mm3 2100 movq mm2, mm3 2101 PALIGNR mm4, mm0, 7, mm0 2102 PALIGNR mm1, mm2, 1, mm2 2103 movq mm0, mm4 2104 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 2105 movq mm4, mm0 2106 movq mm7, mm2 2107 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 2108 psllq mm1, 56 2109 PALIGNR mm7, mm1, 7, mm3 2110 lea r1, [r0+r3*2] 2111 pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 2112 psllq mm7, 56 ; l7 .. .. .. .. .. .. .. 2113 movq mm2, mm0 2114 psllw mm0, 8 2115 psrlw mm2, 8 2116 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 2117 movq mm3, mm2 2118 movq mm4, mm2 2119 movq mm5, mm2 2120 psrlq mm2, 8 2121 psrlq mm3, 16 2122 lea r2, [r1+r3*2] 2123 por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1 2124 punpckhbw mm7, mm7 2125 por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2 2126 pavgb mm4, mm2 2127 PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6 2128 movq mm5, mm4 2129 punpcklbw mm4, mm1 ; p4 p3 p2 p1 2130 punpckhbw mm5, mm1 ; p8 p7 p6 p5 2131 movq mm6, mm5 2132 movq mm7, mm5 2133 movq mm0, mm5 2134 PALIGNR mm5, mm4, 2, mm1 2135 pshufw mm1, mm6, 11111001b 2136 PALIGNR mm6, mm4, 4, mm2 2137 pshufw mm2, mm7, 11111110b 2138 PALIGNR mm7, mm4, 6, mm3 2139 pshufw mm3, mm0, 11111111b 2140 movq [r0+r3*1], mm4 2141 movq [r0+r3*2], mm5 2142 lea r0, [r2+r3*2] 2143 movq [r1+r3*1], mm6 2144 movq [r1+r3*2], mm7 2145 movq [r2+r3*1], mm0 2146 movq [r2+r3*2], mm1 2147 movq [r0+r3*1], mm2 2148 movq [r0+r3*2], mm3 2149 RET 2150%endmacro 2151 2152INIT_MMX mmxext 2153PRED8x8L_HORIZONTAL_UP 2154INIT_MMX ssse3 2155PRED8x8L_HORIZONTAL_UP 2156 2157;----------------------------------------------------------------------------- 2158; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft, 2159; int has_topright, int stride) 2160;----------------------------------------------------------------------------- 2161 2162INIT_MMX mmxext 2163cglobal pred8x8l_horizontal_down_8, 4,5 2164 sub r0, r3 2165 lea r4, [r0+r3*2] 2166 movq mm0, [r0+r3*1-8] 2167 punpckhbw mm0, [r0+r3*0-8] 2168 movq mm1, [r4+r3*1-8] 2169 punpckhbw mm1, [r0+r3*2-8] 2170 mov r4, r0 2171 punpckhwd mm1, mm0 2172 lea r0, [r0+r3*4] 2173 movq mm2, [r0+r3*1-8] 2174 punpckhbw mm2, [r0+r3*0-8] 2175 lea r0, [r0+r3*2] 2176 movq mm3, [r0+r3*1-8] 2177 punpckhbw mm3, [r0+r3*0-8] 2178 punpckhwd mm3, mm2 2179 punpckhdq mm3, mm1 2180 lea r0, [r0+r3*2] 2181 movq mm0, [r0+r3*0-8] 2182 movq mm1, [r4] 2183 mov r0, r4 2184 movq mm4, mm3 2185 movq mm2, mm3 2186 PALIGNR mm4, mm0, 7, mm0 2187 PALIGNR mm1, mm2, 1, mm2 2188 test r1, r1 2189 jnz .do_left 2190.fix_lt_1: 2191 movq mm5, mm3 2192 pxor mm5, mm4 2193 psrlq mm5, 56 2194 psllq mm5, 48 2195 pxor mm1, mm5 2196 jmp .do_left 2197.fix_lt_2: 2198 movq mm5, mm3 2199 pxor mm5, mm2 2200 psllq mm5, 56 2201 psrlq mm5, 56 2202 pxor mm2, mm5 2203 test r2, r2 2204 jnz .do_top 2205.fix_tr_1: 2206 movq mm5, mm3 2207 pxor mm5, mm1 2208 psrlq mm5, 56 2209 psllq mm5, 56 2210 pxor mm1, mm5 2211 jmp .do_top 2212.do_left: 2213 movq mm0, mm4 2214 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 2215 movq mm4, mm0 2216 movq mm7, mm2 2217 movq mm6, mm2 2218 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 2219 psllq mm1, 56 2220 PALIGNR mm7, mm1, 7, mm3 2221 movq mm0, [r0-8] 2222 movq mm3, [r0] 2223 movq mm1, [r0+8] 2224 movq mm2, mm3 2225 movq mm4, mm3 2226 PALIGNR mm2, mm0, 7, mm0 2227 PALIGNR mm1, mm4, 1, mm4 2228 test r1, r1 2229 jz .fix_lt_2 2230 test r2, r2 2231 jz .fix_tr_1 2232.do_top: 2233 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 2234 movq mm5, mm4 2235 lea r1, [r0+r3*2] 2236 psllq mm7, 56 2237 movq mm2, mm5 2238 movq mm3, mm6 2239 movq mm4, mm2 2240 PALIGNR mm2, mm6, 7, mm5 2241 PALIGNR mm6, mm7, 7, mm0 2242 lea r2, [r1+r3*2] 2243 PALIGNR mm4, mm3, 1, mm7 2244 movq mm5, mm3 2245 pavgb mm3, mm6 2246 PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7 2247 movq mm4, mm2 2248 movq mm1, mm2 2249 lea r4, [r2+r3*2] 2250 psrlq mm4, 16 2251 psrlq mm1, 8 2252 PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5 2253 movq mm7, mm3 2254 punpcklbw mm3, mm0 2255 punpckhbw mm7, mm0 2256 movq mm1, mm7 2257 movq mm0, mm7 2258 movq mm4, mm7 2259 movq [r4+r3*2], mm3 2260 PALIGNR mm7, mm3, 2, mm5 2261 movq [r4+r3*1], mm7 2262 PALIGNR mm1, mm3, 4, mm5 2263 movq [r2+r3*2], mm1 2264 PALIGNR mm0, mm3, 6, mm3 2265 movq [r2+r3*1], mm0 2266 movq mm2, mm6 2267 movq mm3, mm6 2268 movq [r1+r3*2], mm4 2269 PALIGNR mm6, mm4, 2, mm5 2270 movq [r1+r3*1], mm6 2271 PALIGNR mm2, mm4, 4, mm5 2272 movq [r0+r3*2], mm2 2273 PALIGNR mm3, mm4, 6, mm4 2274 movq [r0+r3*1], mm3 2275 RET 2276 2277%macro PRED8x8L_HORIZONTAL_DOWN 0 2278cglobal pred8x8l_horizontal_down_8, 4,5 2279 sub r0, r3 2280 lea r4, [r0+r3*2] 2281 movq mm0, [r0+r3*1-8] 2282 punpckhbw mm0, [r0+r3*0-8] 2283 movq mm1, [r4+r3*1-8] 2284 punpckhbw mm1, [r0+r3*2-8] 2285 mov r4, r0 2286 punpckhwd mm1, mm0 2287 lea r0, [r0+r3*4] 2288 movq mm2, [r0+r3*1-8] 2289 punpckhbw mm2, [r0+r3*0-8] 2290 lea r0, [r0+r3*2] 2291 movq mm3, [r0+r3*1-8] 2292 punpckhbw mm3, [r0+r3*0-8] 2293 punpckhwd mm3, mm2 2294 punpckhdq mm3, mm1 2295 lea r0, [r0+r3*2] 2296 movq mm0, [r0+r3*0-8] 2297 movq mm1, [r4] 2298 mov r0, r4 2299 movq mm4, mm3 2300 movq mm2, mm3 2301 PALIGNR mm4, mm0, 7, mm0 2302 PALIGNR mm1, mm2, 1, mm2 2303 test r1, r1 2304 jnz .do_left 2305.fix_lt_1: 2306 movq mm5, mm3 2307 pxor mm5, mm4 2308 psrlq mm5, 56 2309 psllq mm5, 48 2310 pxor mm1, mm5 2311 jmp .do_left 2312.fix_lt_2: 2313 movq mm5, mm3 2314 pxor mm5, mm2 2315 psllq mm5, 56 2316 psrlq mm5, 56 2317 pxor mm2, mm5 2318 test r2, r2 2319 jnz .do_top 2320.fix_tr_1: 2321 movq mm5, mm3 2322 pxor mm5, mm1 2323 psrlq mm5, 56 2324 psllq mm5, 56 2325 pxor mm1, mm5 2326 jmp .do_top 2327.fix_tr_2: 2328 punpckhbw mm3, mm3 2329 pshufw mm1, mm3, 0xFF 2330 jmp .do_topright 2331.do_left: 2332 movq mm0, mm4 2333 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 2334 movq2dq xmm0, mm2 2335 pslldq xmm0, 8 2336 movq mm4, mm0 2337 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 2338 movq2dq xmm2, mm1 2339 pslldq xmm2, 15 2340 psrldq xmm2, 8 2341 por xmm0, xmm2 2342 movq mm0, [r0-8] 2343 movq mm3, [r0] 2344 movq mm1, [r0+8] 2345 movq mm2, mm3 2346 movq mm4, mm3 2347 PALIGNR mm2, mm0, 7, mm0 2348 PALIGNR mm1, mm4, 1, mm4 2349 test r1, r1 2350 jz .fix_lt_2 2351 test r2, r2 2352 jz .fix_tr_1 2353.do_top: 2354 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 2355 movq2dq xmm1, mm4 2356 test r2, r2 2357 jz .fix_tr_2 2358 movq mm0, [r0+8] 2359 movq mm5, mm0 2360 movq mm2, mm0 2361 movq mm4, mm0 2362 psrlq mm5, 56 2363 PALIGNR mm2, mm3, 7, mm3 2364 PALIGNR mm5, mm4, 1, mm4 2365 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 2366.do_topright: 2367 movq2dq xmm5, mm1 2368 pslldq xmm5, 8 2369 por xmm1, xmm5 2370INIT_XMM cpuname 2371 lea r2, [r4+r3*2] 2372 movdqa xmm2, xmm1 2373 movdqa xmm3, xmm1 2374 PALIGNR xmm1, xmm0, 7, xmm4 2375 PALIGNR xmm2, xmm0, 9, xmm5 2376 lea r1, [r2+r3*2] 2377 PALIGNR xmm3, xmm0, 8, xmm0 2378 movdqa xmm4, xmm1 2379 pavgb xmm4, xmm3 2380 lea r0, [r1+r3*2] 2381 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5 2382 punpcklbw xmm4, xmm0 2383 movhlps xmm0, xmm4 2384 movq [r0+r3*2], xmm4 2385 movq [r2+r3*2], xmm0 2386 psrldq xmm4, 2 2387 psrldq xmm0, 2 2388 movq [r0+r3*1], xmm4 2389 movq [r2+r3*1], xmm0 2390 psrldq xmm4, 2 2391 psrldq xmm0, 2 2392 movq [r1+r3*2], xmm4 2393 movq [r4+r3*2], xmm0 2394 psrldq xmm4, 2 2395 psrldq xmm0, 2 2396 movq [r1+r3*1], xmm4 2397 movq [r4+r3*1], xmm0 2398 RET 2399%endmacro 2400 2401INIT_MMX sse2 2402PRED8x8L_HORIZONTAL_DOWN 2403INIT_MMX ssse3 2404PRED8x8L_HORIZONTAL_DOWN 2405 2406;------------------------------------------------------------------------------- 2407; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) 2408;------------------------------------------------------------------------------- 2409 2410INIT_MMX mmxext 2411cglobal pred4x4_dc_8, 3,5 2412 pxor mm7, mm7 2413 mov r4, r0 2414 sub r0, r2 2415 movd mm0, [r0] 2416 psadbw mm0, mm7 2417 movzx r1d, byte [r0+r2*1-1] 2418 movd r3d, mm0 2419 add r3d, r1d 2420 movzx r1d, byte [r0+r2*2-1] 2421 lea r0, [r0+r2*2] 2422 add r3d, r1d 2423 movzx r1d, byte [r0+r2*1-1] 2424 add r3d, r1d 2425 movzx r1d, byte [r0+r2*2-1] 2426 add r3d, r1d 2427 add r3d, 4 2428 shr r3d, 3 2429 imul r3d, 0x01010101 2430 mov [r4+r2*0], r3d 2431 mov [r0+r2*0], r3d 2432 mov [r0+r2*1], r3d 2433 mov [r0+r2*2], r3d 2434 RET 2435 2436;----------------------------------------------------------------------------- 2437; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, 2438; int stride) 2439;----------------------------------------------------------------------------- 2440 2441%macro PRED4x4_TM 0 2442cglobal pred4x4_tm_vp8_8, 3,6 2443 sub r0, r2 2444 pxor mm7, mm7 2445 movd mm0, [r0] 2446 punpcklbw mm0, mm7 2447 movzx r4d, byte [r0-1] 2448 mov r5d, 2 2449.loop: 2450 movzx r1d, byte [r0+r2*1-1] 2451 movzx r3d, byte [r0+r2*2-1] 2452 sub r1d, r4d 2453 sub r3d, r4d 2454 movd mm2, r1d 2455 movd mm4, r3d 2456%if cpuflag(mmxext) 2457 pshufw mm2, mm2, 0 2458 pshufw mm4, mm4, 0 2459%else 2460 punpcklwd mm2, mm2 2461 punpcklwd mm4, mm4 2462 punpckldq mm2, mm2 2463 punpckldq mm4, mm4 2464%endif 2465 paddw mm2, mm0 2466 paddw mm4, mm0 2467 packuswb mm2, mm2 2468 packuswb mm4, mm4 2469 movd [r0+r2*1], mm2 2470 movd [r0+r2*2], mm4 2471 lea r0, [r0+r2*2] 2472 dec r5d 2473 jg .loop 2474 REP_RET 2475%endmacro 2476 2477INIT_MMX mmx 2478PRED4x4_TM 2479INIT_MMX mmxext 2480PRED4x4_TM 2481 2482INIT_XMM ssse3 2483cglobal pred4x4_tm_vp8_8, 3,3 2484 sub r0, r2 2485 movq mm6, [tm_shuf] 2486 pxor mm1, mm1 2487 movd mm0, [r0] 2488 punpcklbw mm0, mm1 2489 movd mm7, [r0-4] 2490 pshufb mm7, mm6 2491 lea r1, [r0+r2*2] 2492 movd mm2, [r0+r2*1-4] 2493 movd mm3, [r0+r2*2-4] 2494 movd mm4, [r1+r2*1-4] 2495 movd mm5, [r1+r2*2-4] 2496 pshufb mm2, mm6 2497 pshufb mm3, mm6 2498 pshufb mm4, mm6 2499 pshufb mm5, mm6 2500 psubw mm0, mm7 2501 paddw mm2, mm0 2502 paddw mm3, mm0 2503 paddw mm4, mm0 2504 paddw mm5, mm0 2505 packuswb mm2, mm2 2506 packuswb mm3, mm3 2507 packuswb mm4, mm4 2508 packuswb mm5, mm5 2509 movd [r0+r2*1], mm2 2510 movd [r0+r2*2], mm3 2511 movd [r1+r2*1], mm4 2512 movd [r1+r2*2], mm5 2513 RET 2514 2515;----------------------------------------------------------------------------- 2516; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, 2517; int stride) 2518;----------------------------------------------------------------------------- 2519 2520INIT_MMX mmxext 2521cglobal pred4x4_vertical_vp8_8, 3,3 2522 sub r0, r2 2523 movd m1, [r0-1] 2524 movd m0, [r0] 2525 mova m2, m0 ;t0 t1 t2 t3 2526 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7 2527 lea r1, [r0+r2*2] 2528 psrlq m0, 8 ;t1 t2 t3 t4 2529 PRED4x4_LOWPASS m3, m1, m0, m2, m4 2530 movd [r0+r2*1], m3 2531 movd [r0+r2*2], m3 2532 movd [r1+r2*1], m3 2533 movd [r1+r2*2], m3 2534 RET 2535 2536;----------------------------------------------------------------------------- 2537; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright, 2538; int stride) 2539;----------------------------------------------------------------------------- 2540INIT_MMX mmxext 2541cglobal pred4x4_down_left_8, 3,3 2542 sub r0, r2 2543 movq m1, [r0] 2544 punpckldq m1, [r1] 2545 movq m2, m1 2546 movq m3, m1 2547 psllq m1, 8 2548 pxor m2, m1 2549 psrlq m2, 8 2550 pxor m2, m3 2551 PRED4x4_LOWPASS m0, m1, m2, m3, m4 2552 lea r1, [r0+r2*2] 2553 psrlq m0, 8 2554 movd [r0+r2*1], m0 2555 psrlq m0, 8 2556 movd [r0+r2*2], m0 2557 psrlq m0, 8 2558 movd [r1+r2*1], m0 2559 psrlq m0, 8 2560 movd [r1+r2*2], m0 2561 RET 2562 2563;------------------------------------------------------------------------------ 2564; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright, 2565; int stride) 2566;------------------------------------------------------------------------------ 2567 2568INIT_MMX mmxext 2569cglobal pred4x4_vertical_left_8, 3,3 2570 sub r0, r2 2571 movq m1, [r0] 2572 punpckldq m1, [r1] 2573 movq m3, m1 2574 movq m2, m1 2575 psrlq m3, 8 2576 psrlq m2, 16 2577 movq m4, m3 2578 pavgb m4, m1 2579 PRED4x4_LOWPASS m0, m1, m2, m3, m5 2580 lea r1, [r0+r2*2] 2581 movh [r0+r2*1], m4 2582 movh [r0+r2*2], m0 2583 psrlq m4, 8 2584 psrlq m0, 8 2585 movh [r1+r2*1], m4 2586 movh [r1+r2*2], m0 2587 RET 2588 2589;------------------------------------------------------------------------------ 2590; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright, 2591; int stride) 2592;------------------------------------------------------------------------------ 2593 2594INIT_MMX mmxext 2595cglobal pred4x4_horizontal_up_8, 3,3 2596 sub r0, r2 2597 lea r1, [r0+r2*2] 2598 movd m0, [r0+r2*1-4] 2599 punpcklbw m0, [r0+r2*2-4] 2600 movd m1, [r1+r2*1-4] 2601 punpcklbw m1, [r1+r2*2-4] 2602 punpckhwd m0, m1 2603 movq m1, m0 2604 punpckhbw m1, m1 2605 pshufw m1, m1, 0xFF 2606 punpckhdq m0, m1 2607 movq m2, m0 2608 movq m3, m0 2609 movq m7, m0 2610 psrlq m2, 16 2611 psrlq m3, 8 2612 pavgb m7, m3 2613 PRED4x4_LOWPASS m4, m0, m2, m3, m5 2614 punpcklbw m7, m4 2615 movd [r0+r2*1], m7 2616 psrlq m7, 16 2617 movd [r0+r2*2], m7 2618 psrlq m7, 16 2619 movd [r1+r2*1], m7 2620 movd [r1+r2*2], m1 2621 RET 2622 2623;------------------------------------------------------------------------------ 2624; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src, 2625; const uint8_t *topright, int stride) 2626;------------------------------------------------------------------------------ 2627 2628INIT_MMX mmxext 2629cglobal pred4x4_horizontal_down_8, 3,3 2630 sub r0, r2 2631 lea r1, [r0+r2*2] 2632 movh m0, [r0-4] ; lt .. 2633 punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. .. 2634 psllq m0, 8 ; t2 t1 t0 lt .. .. .. .. 2635 movd m1, [r1+r2*2-4] ; l3 2636 punpcklbw m1, [r1+r2*1-4] ; l2 l3 2637 movd m2, [r0+r2*2-4] ; l1 2638 punpcklbw m2, [r0+r2*1-4] ; l0 l1 2639 punpckhwd m1, m2 ; l0 l1 l2 l3 2640 punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 2641 movq m0, m1 2642 movq m2, m1 2643 movq m5, m1 2644 psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1 2645 psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2 2646 pavgb m5, m2 2647 PRED4x4_LOWPASS m3, m1, m0, m2, m4 2648 punpcklbw m5, m3 2649 psrlq m3, 32 2650 PALIGNR m3, m5, 6, m4 2651 movh [r1+r2*2], m5 2652 psrlq m5, 16 2653 movh [r1+r2*1], m5 2654 psrlq m5, 16 2655 movh [r0+r2*2], m5 2656 movh [r0+r2*1], m3 2657 RET 2658 2659;----------------------------------------------------------------------------- 2660; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src, 2661; const uint8_t *topright, int stride) 2662;----------------------------------------------------------------------------- 2663 2664INIT_MMX mmxext 2665cglobal pred4x4_vertical_right_8, 3,3 2666 sub r0, r2 2667 lea r1, [r0+r2*2] 2668 movh m0, [r0] ; ........t3t2t1t0 2669 movq m5, m0 2670 PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt 2671 pavgb m5, m0 2672 PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0 2673 movq m1, m0 2674 PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1 2675 movq m2, m0 2676 PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2 2677 PRED4x4_LOWPASS m3, m1, m0, m2, m4 2678 movq m1, m3 2679 psrlq m3, 16 2680 psllq m1, 48 2681 movh [r0+r2*1], m5 2682 movh [r0+r2*2], m3 2683 PALIGNR m5, m1, 7, m2 2684 psllq m1, 8 2685 movh [r1+r2*1], m5 2686 PALIGNR m3, m1, 7, m1 2687 movh [r1+r2*2], m3 2688 RET 2689 2690;----------------------------------------------------------------------------- 2691; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright, 2692; int stride) 2693;----------------------------------------------------------------------------- 2694 2695INIT_MMX mmxext 2696cglobal pred4x4_down_right_8, 3,3 2697 sub r0, r2 2698 lea r1, [r0+r2*2] 2699 movq m1, [r1-8] 2700 movq m2, [r0+r2*1-8] 2701 punpckhbw m2, [r0-8] 2702 movh m3, [r0] 2703 punpckhwd m1, m2 2704 PALIGNR m3, m1, 5, m1 2705 movq m1, m3 2706 PALIGNR m3, [r1+r2*1-8], 7, m4 2707 movq m2, m3 2708 PALIGNR m3, [r1+r2*2-8], 7, m4 2709 PRED4x4_LOWPASS m0, m3, m1, m2, m4 2710 movh [r1+r2*2], m0 2711 psrlq m0, 8 2712 movh [r1+r2*1], m0 2713 psrlq m0, 8 2714 movh [r0+r2*2], m0 2715 psrlq m0, 8 2716 movh [r0+r2*1], m0 2717 RET 2718