;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
;*****************************************************************************
;* Copyright (C) 2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

cextern pw_16
cextern pw_1
cextern pb_0

; maximum 10-bit pixel value (1023), replicated across a full register
pw_pixel_max: times 8 dw ((1 << 10)-1)

; Bias constants, multiples of the maximum pixel value.  The unrounded
; 6-tap sum a-5*b+20*c can go negative or exceed 16 bits; these biases
; are subtracted/added around the two filter passes so intermediates fit
; the 16-bit word arithmetic used here.
pad10:  times 8 dw    10*1023
pad20:  times 8 dw    20*1023
pad30:  times 8 dw    30*1023
; de-bias + rounding term for the dword (pmaddwd) horizontal pass,
; applied before the final arithmetic shift right by 10 in H_LOOP
depad:  times 4 dd 32*20*1023 + 512
; de-bias + rounding term for recovering the vertical-only half-pel
; value from the intermediate buffer (see mc12 body)
depad2: times 8 dw 20*1023 + 16*1022 + 16
unpad:  times 8 dw 16*1022/32 ; needs to be mod 16

; 6-tap filter coefficients (1,-5,20,20,-5,1), interleaved in pairs for
; use with pmaddwd in H_LOOP
tap1: times 4 dw  1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5,  1
; low-word dword mask used when repacking two dword result vectors into
; one vector of words
pd_0f: times 4 dd 0xffff

SECTION .text


; AVG_MOV dst(mem), src(reg): average src into dst.
; Substituted for OP_MOV to turn every put_* function body into the
; corresponding avg_* body.
%macro AVG_MOV 2
    pavgw   %2, %1
    mova    %1, %2
%endmacro

; ADDW acc, mem, tmp: acc += mem.
; MMX can use the memory operand directly; the XMM path loads through
; tmp with movu first because the source is not guaranteed aligned.
%macro ADDW 3
%if mmsize == 8
    paddw   %1, %2
%else
    movu    %3, %2
    paddw   %1, %3
%endif
%endmacro

; Shared 6-tap combine: %1 = a = x[-2]+x[3], %2 = b = x[-1]+x[2],
; %3 = c = x[0]+x[1], %4 = rounding constant (pw_16).
; Computes (a - 5*b + 20*c + 16) >> 4 without multiplies:
;   ((((a+16-b) >> 2) - b + c) >> 2) + c
%macro FILT_H 4
    paddw   %1, %4
    psubw   %1, %2  ; a-b
    psraw   %1, 2   ; (a-b)/4
    psubw   %1, %2  ; (a-b)/4-b
    paddw   %1, %3  ; (a-b)/4-b+c
    psraw   %1, 2   ; ((a-b)/4-b+c)/4
    paddw   %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro

; Load the first five source rows src-2*stride .. src+2*stride into
; m0..m4 and leave r1 pointing at src+3*stride, so the next FILT_V can
; fetch row x[3] from [r1].  r1 = src, r2 = stride; clobbers r3.
%macro PRELOAD_V 0
    lea      r3, [r2*3]
    sub      r1, r3
    movu     m0, [r1+r2]
    movu     m1, [r1+r2*2]
    add      r1, r3
    movu     m2, [r1]
    movu     m3, [r1+r2]
    movu     m4, [r1+r2*2]
    add      r1, r3
%endmacro

; One vertical 6-tap output row.  %1-%5 hold rows x[-2]..x[2]; the sixth
; row x[3] is loaded from [r1] into %6.  %7, %8 are temporaries.
; Result in %1: (a-5b+20c+16)>>5, clipped to [0, 1023].
%macro FILT_V 8
    movu     %6, [r1]
    paddw    %1, %6
    mova     %7, %2
    paddw    %7, %5
    mova     %8, %3
    paddw    %8, %4
    FILT_H   %1, %7, %8, [pw_16]
    psraw    %1, 1
    CLIPW    %1, [pb_0], [pw_pixel_max]
%endmacro

; Instantiate an MC macro for every cpu/op combination: mmxext (4-wide)
; and sse2 (8-wide), each as put (plain store) and avg (pavgw with dst).
%macro MC 1
%define OP_MOV mova
INIT_MMX
%1 mmxext, put, 4
INIT_XMM
%1 sse2  , put, 8

%define OP_MOV AVG_MOV
INIT_MMX
%1 mmxext, avg, 4
INIT_XMM
%1 sse2  , avg, 8
%endmacro

; Size-doubling wrapper.  On x86-64 the mmxext variant is not emitted;
; NOTE(review): presumably because every x86-64 CPU has SSE2, so the
; mmxext 2Nx2N wrapper would never be selected there -- confirm.
%macro MCAxA 8
%ifdef ARCH_X86_64
%ifnidn %1,mmxext
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%endif
%else
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%endif
%endmacro

; Build a 2Nx2N function from four NxN stub calls, one per quadrant.
; %4 = N, %5 = 2N; r0 = dst, r1 = src, r2 = stride (pixels are 2 bytes,
; hence the %4*2 byte offsets).
%macro MCAxA_OP 8
cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
%ifdef ARCH_X86_32
    call stub_%2_h264_qpel%4_%3_10_%1
    mov  r0, r0m
    mov  r1, r1m
    add  r0, %4*2            ; top-right quadrant
    add  r1, %4*2
    call stub_%2_h264_qpel%4_%3_10_%1
    mov  r0, r0m
    mov  r1, r1m
    lea  r0, [r0+r2*%4]      ; bottom-left quadrant
    lea  r1, [r1+r2*%4]
    call stub_%2_h264_qpel%4_%3_10_%1
    mov  r0, r0m
    mov  r1, r1m
    lea  r0, [r0+r2*%4+%4*2] ; bottom-right quadrant
    lea  r1, [r1+r2*%4+%4*2]
    call stub_%2_h264_qpel%4_%3_10_%1
    RET
%else ; ARCH_X86_64
    mov  r10, r0             ; r10/r11 preserve dst/src across the stub
    mov  r11, r1             ; calls (the stubs do not touch them)
    call stub_%2_h264_qpel%4_%3_10_%1
    lea  r0, [r10+%4*2]
    lea  r1, [r11+%4*2]
    call stub_%2_h264_qpel%4_%3_10_%1
    lea  r0, [r10+r2*%4]
    lea  r1, [r11+r2*%4]
    call stub_%2_h264_qpel%4_%3_10_%1
    lea  r0, [r10+r2*%4+%4*2]
    lea  r1, [r11+r2*%4+%4*2]
%ifndef UNIX64 ; fall through to function
    call stub_%2_h264_qpel%4_%3_10_%1
    RET
%endif
%endif
%endmacro

;cpu, put/avg, mc, 4/8, ...
; Emit a size-2N wrapper (via MCAxA), the public size-N entry point, and
; open the stub_* label whose body is the code that follows each
; cglobal_mc invocation.  On UNIX64 the entry point needs no prologue or
; epilogue and simply falls through into the stub.
; args: cpu, put/avg, mc position, 4/8, nargs, ngprs, nmmregs
%macro cglobal_mc 7
%assign i %4*2
MCAxA %1, %2, %3, %4, i, %5,%6,%7

cglobal %2_h264_qpel%4_%3_10_%1, %5,%6,%7
%ifndef UNIX64 ; no prologue or epilogue for UNIX64
    call stub_%2_h264_qpel%4_%3_10_%1
    RET
%endif

stub_%2_h264_qpel%4_%3_10_%1:
%endmacro

;-----------------------------------------------------------------------------
; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; Copy (or average) four rows of one register width.  Requires r3 = 3*stride.
%macro COPY4 0
    movu m0, [r1     ]
    OP_MOV [r0     ], m0
    movu m0, [r1+r2  ]
    OP_MOV [r0+r2  ], m0
    movu m0, [r1+r2*2]
    OP_MOV [r0+r2*2], m0
    movu m0, [r1+r3  ]
    OP_MOV [r0+r3  ], m0
%endmacro

; mc00: full-pel position -- a plain copy (put) or average (avg).
%macro MC00 1
INIT_MMX
cglobal_mc mmxext, %1, mc00, 4, 3,4,0
    lea  r3, [r2*3]
    COPY4
    ret                      ; stub return: plain ret, no epilogue

INIT_XMM
cglobal %1_h264_qpel8_mc00_10_sse2, 3,4
    lea  r3, [r2*3]
    COPY4
    lea  r0, [r0+r2*4]
    lea  r1, [r1+r2*4]
    COPY4
    RET

cglobal %1_h264_qpel16_mc00_10_sse2, 3,4
    mov r3d, 8               ; 8 iterations x 2 rows = 16 rows
.loop:
    movu m0, [r1      ]
    movu m1, [r1   +16]
    OP_MOV [r0    ], m0
    OP_MOV [r0 +16], m1
    movu m0, [r1+r2   ]
    movu m1, [r1+r2+16]
    OP_MOV [r0+r2   ], m0
    OP_MOV [r0+r2+16], m1
    lea  r0, [r0+r2*2]
    lea  r1, [r1+r2*2]
    dec r3d
    jg .loop
    REP_RET
%endmacro

%define OP_MOV mova
MC00 put

%define OP_MOV AVG_MOV
MC00 avg

;-----------------------------------------------------------------------------
; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; Instantiate a horizontal-filter macro for every cpu/op combination.
; The plain sse2 instantiation passes an extra argument, which selects
; the simple movu-based load path; the *_cache64 variants build the
; shifted vectors with PALIGNR because unaligned loads are slow on
; those processors.
%macro MC_CACHE 1
%define OP_MOV mova
%define PALIGNR PALIGNR_MMX
INIT_MMX
%1 mmxext       , put, 4
INIT_XMM
%1 sse2_cache64 , put, 8
%define PALIGNR PALIGNR_SSSE3
%1 ssse3_cache64, put, 8
%1 sse2         , put, 8, 0

%define OP_MOV AVG_MOV
%define PALIGNR PALIGNR_MMX
INIT_MMX
%1 mmxext       , avg, 4
INIT_XMM
%1 sse2_cache64 , avg, 8
%define PALIGNR PALIGNR_SSSE3
%1 ssse3_cache64, avg, 8
%1 sse2         , avg, 8, 0
%endmacro

; mc20: horizontal half-pel.  Per row, gather the 6-tap pair sums
; a = x[-2]+x[3] (m2), b = x[-1]+x[2] (m3), c = x[0]+x[1] (m4),
; then FILT_H, shift, clip and store.  Offsets are in bytes (2/pixel).
%macro MC20 3-4
cglobal_mc %1, %2, mc20, %3, 3,4,9
    mov     r3d, %3
    mova    m1, [pw_pixel_max]
%if num_mmregs > 8
    mova    m8, [pw_16]
    %define p16 m8
%else
    %define p16 [pw_16]
%endif
.nextrow
%if %0 == 4
    movu    m2, [r1-4]
    movu    m3, [r1-2]
    movu    m4, [r1+0]
    ADDW    m2, [r1+6], m5
    ADDW    m3, [r1+4], m5
    ADDW    m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    ; two loads cover the whole window; derive the shifted copies
    ; with PALIGNR instead of extra unaligned loads
    movu    m2, [r1-4]
    movu    m0, [r1+6]
    mova    m6, m0
    psrldq  m0, 6

    paddw   m6, m2
    PALIGNR m3, m0, m2, 2, m5
    PALIGNR m7, m0, m2, 8, m5
    paddw   m3, m7
    PALIGNR m4, m0, m2, 4, m5
    PALIGNR m7, m0, m2, 6, m5
    paddw   m4, m7
    SWAP    2, 6
%else
    movu    m2, [r1-4]
    movu    m6, [r1+4]
    PALIGNR m3, m6, m2, 2, m5
    paddw   m3, m6
    PALIGNR m4, m6, m2, 4, m5
    PALIGNR m7, m6, m2, 6, m5
    paddw   m4, m7
    paddw   m2, [r1+6]
%endif
%endif

    FILT_H  m2, m3, m4, p16
    psraw   m2, 1              ; full normalization: (a-5b+20c+16)>>5
    pxor    m0, m0
    CLIPW   m2, m0, m1         ; clip to [0, 1023]
    OP_MOV  [r0], m2
    add     r0, r2
    add     r1, r2
    dec     r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC20

;-----------------------------------------------------------------------------
; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; mc30: quarter-pel -- mc10's body averaged against the pixel one to the
; right (r4 = r1 + 2 bytes).
%macro MC30 3-4
cglobal_mc %1, %2, mc30, %3, 3,5,9
    lea r4, [r1+2]
    jmp stub_%2_h264_qpel%3_mc10_10_%1.body
%endmacro

MC_CACHE MC30

;-----------------------------------------------------------------------------
; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; mc10: quarter-pel -- the mc20 horizontal half-pel result averaged with
; the full-pel column at r4 (r1 here; mc30 enters .body with r4 = r1+2).
%macro MC10 3-4
cglobal_mc %1, %2, mc10, %3, 3,5,9
    mov      r4, r1
.body
    mov      r3d, %3
    mova     m1, [pw_pixel_max]
%if num_mmregs > 8
    mova     m8, [pw_16]
    %define p16 m8
%else
    %define p16 [pw_16]
%endif
.nextrow
%if %0 == 4
    movu     m2, [r1-4]
    movu     m3, [r1-2]
    movu     m4, [r1+0]
    ADDW     m2, [r1+6], m5
    ADDW     m3, [r1+4], m5
    ADDW     m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu     m2, [r1-4]
    movu     m0, [r1+6]
    mova     m6, m0
    psrldq   m0, 6

    paddw    m6, m2
    PALIGNR  m3, m0, m2, 2, m5
    PALIGNR  m7, m0, m2, 8, m5
    paddw    m3, m7
    PALIGNR  m4, m0, m2, 4, m5
    PALIGNR  m7, m0, m2, 6, m5
    paddw    m4, m7
    SWAP     2, 6
%else
    movu     m2, [r1-4]
    movu     m6, [r1+4]
    PALIGNR  m3, m6, m2, 2, m5
    paddw    m3, m6
    PALIGNR  m4, m6, m2, 4, m5
    PALIGNR  m7, m6, m2, 6, m5
    paddw    m4, m7
    paddw    m2, [r1+6]
%endif
%endif

    FILT_H   m2, m3, m4, p16
    psraw    m2, 1
    pxor     m0, m0
    CLIPW    m2, m0, m1
    movu     m3, [r4]          ; nearest full-pel pixels
    pavgw    m2, m3
    OP_MOV   [r0], m2
    add      r0, r2
    add      r1, r2
    add      r4, r2
    dec      r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC10

;-----------------------------------------------------------------------------
; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; Emit one vertical-filter call target.  The register roles rotate
; between instantiations (SWAP in the %rep loops below), so one copy is
; generated per rotation state; %10 names the state.  The .no_addr4
; entry skips advancing r4 for callers that keep no average pointer.
%macro V_FILT 11
v_filt%9_%10_10_%11:
    add    r4, r2
.no_addr4:
    FILT_V m0, m1, m2, m3, m4, m5, m6, m7
    add    r1, r2
    add    r0, r2
    ret
%endmacro

INIT_MMX
RESET_MM_PERMUTATION
%assign i 0
%rep 4
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i, mmxext
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

INIT_XMM
RESET_MM_PERMUTATION
%assign i 0
%rep 6
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i, sse2
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

; mc02: vertical half-pel.  Preload five rows, then per output row call
; the rotating vertical-filter stub and store its result.
%macro MC02 3
cglobal_mc %1, %2, mc02, %3, 3,4,8
    PRELOAD_V

    sub  r0, r2                ; the stub pre-increments r0
%assign j 0
%rep %3
    %assign i (j % 6)
    call v_filt%3_ %+ i %+ _10_%1.no_addr4
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC02

;-----------------------------------------------------------------------------
; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; mc01: quarter-pel -- the vertical half-pel averaged with the full-pel
; row at r4 (r1 here; mc03 enters .body with r4 = r1+stride).
%macro MC01 3
cglobal_mc %1, %2, mc01, %3, 3,5,8
    mov  r4, r1
.body
    PRELOAD_V

    sub  r4, r2
    sub  r0, r2
%assign j 0
%rep %3
    %assign i (j % 6)
    call v_filt%3_ %+ i %+ _10_%1
    movu m7, [r4]
    pavgw m0, m7
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC01

;-----------------------------------------------------------------------------
; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC03 3
cglobal_mc %1, %2, mc03, %3, 3,5,8
    lea  r4, [r1+r2]
    jmp stub_%2_h264_qpel%3_mc01_10_%1.body
%endmacro

MC MC03

;-----------------------------------------------------------------------------
; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; FILT_H with fewer registers, averaged with the FILT_V result in m0.
; m6 and m7 are temporaries; the remaining registers hold the rows
; needed by the next vertical iteration.  Three scratch registers are
; needed, so the row in m5 has to be re-read from memory afterwards:
; unless a 4th macro argument is given, the next source row is
; preloaded into m5 before returning.
%macro H_FILT_AVG 3-4
h_filt%2_%3_10_%1:
    movu  m5, [r4-4]
    ADDW  m5, [r4+6], m7
    movu  m6, [r4-2]
    ADDW  m6, [r4+4], m7
    paddw m5, [pw_16]
    psubw m5, m6   ; a-b
    psraw m5, 2    ; (a-b)/4
    psubw m5, m6   ; (a-b)/4-b
    movu  m6, [r4+0]
    ADDW  m6, [r4+2], m7
    paddw m5, m6   ; (a-b)/4-b+c
    psraw m5, 2    ; ((a-b)/4-b+c)/4
    paddw m5, m6   ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    psraw m5, 1
    CLIPW m5, [pb_0], [pw_pixel_max]
; average the FILT_V and FILT_H results
    pavgw m0, m5
%if %0!=4
    movu  m5, [r1+r5]
%endif
    ret
%endmacro

INIT_MMX
RESET_MM_PERMUTATION
%assign i 0
%rep 3
H_FILT_AVG mmxext, 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
H_FILT_AVG mmxext, 4, i, 0   ; last rotation: skip the m5 preload

INIT_XMM
RESET_MM_PERMUTATION
%assign i 0
%rep 6
%if i==1
H_FILT_AVG sse2, 8, i, 0     ; the i==1 rotation skips the m5 preload
%else
H_FILT_AVG sse2, 8, i
%endif
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

; mc11: quarter-pel diagonal -- average of the vertical (v_filt) and
; horizontal (h_filt) half-pel values, computed row by row with the
; rotating stubs.  r4 = source row for the horizontal filter.
%macro MC11 3
; this REALLY needs x86_64
cglobal_mc %1, %2, mc11, %3, 3,6,8
    mov  r4, r1
.body
    PRELOAD_V

    sub  r0, r2
    sub  r4, r2
    mov  r5, r2
    neg  r5                    ; r5 = -stride, for the [r1+r5] reload
%assign j 0
%rep %3
    %assign i (j % 6)
    call v_filt%3_ %+ i %+ _10_%1
    call h_filt%3_ %+ i %+ _10_%1
%if %3==8 && i==1
    movu m5, [r1+r5]           ; reload the row the i==1 variant skipped
%endif
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC11

;-----------------------------------------------------------------------------
; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; mc31: mc11 body with the vertical filter shifted one pixel right.
%macro MC31 3
cglobal_mc %1, %2, mc31, %3, 3,6,8
    mov  r4, r1
    add  r1, 2
    jmp stub_%2_h264_qpel%3_mc11_10_%1.body
%endmacro

MC MC31

;-----------------------------------------------------------------------------
; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; mc13: mc11 body with the horizontal filter taken one row down.
%macro MC13 3
cglobal_mc %1, %2, mc13, %3, 3,7,12
    lea  r4, [r1+r2]
    jmp stub_%2_h264_qpel%3_mc11_10_%1.body
%endmacro

MC MC13

;-----------------------------------------------------------------------------
; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; mc33: mc11 body shifted one pixel right and one row down.
%macro MC33 3
cglobal_mc %1, %2, mc33, %3, 3,6,8
    lea  r4, [r1+r2]
    add  r1, 2
    jmp stub_%2_h264_qpel%3_mc11_10_%1.body
%endmacro

MC MC33

;-----------------------------------------------------------------------------
; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; 6-tap combine without rounding or shifting: %1 = a, %2 = b, %3 = c;
; result %1 = a - 5*b + 20*c.  Callers subtract pad20 afterwards to keep
; the intermediate within signed 16-bit range.  Clobbers %2 and %3.
%macro FILT_H2 3
    psubw %1, %2  ; a-b
    psubw %2, %3  ; b-c
    psllw %2, 2
    psubw %1, %2  ; a-5*b+4*c
    psllw %3, 4
    paddw %1, %3  ; a-5*b+20*c
%endmacro

; FILT_V variant with No Rounding/shift/clip -- produces the raw
; a-5b+20c vertical sum for the second (horizontal) pass.
%macro FILT_VNRD 8
    movu  %6, [r1]
    paddw %1, %6
    mova  %7, %2
    paddw %7, %5
    mova  %8, %3
    paddw %8, %4
    FILT_H2 %1, %7, %8
%endmacro

; Vertical first pass for the hv positions: filter the source column by
; column strip (COUNT strips of mmsize bytes, %2 output rows each) and
; store the pad20-debiased raw sums into the stack buffer at
; [rsp+gprsize], using a row stride of mmsize*3 bytes.
%macro HV 2
%ifidn %1,sse2
%define PAD 12
%define COUNT 2
%else
%define PAD 4
%define COUNT 3
%endif
put_hv%2_10_%1:
    neg      r2           ; This actually saves instructions
    lea      r1, [r1+r2*2-mmsize+PAD]
    lea      r4, [rsp+PAD+gprsize]
    mov      r3d, COUNT
.v_loop:
    movu     m0, [r1]     ; load rows x[-2]..x[2] (r2 is negated)
    sub      r1, r2
    movu     m1, [r1]
    sub      r1, r2
    movu     m2, [r1]
    sub      r1, r2
    movu     m3, [r1]
    sub      r1, r2
    movu     m4, [r1]
    sub      r1, r2
%assign i 0
%rep %2-1
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw    m0, [pad20]
    movu     [r4+i*mmsize*3], m0
    sub      r1, r2
    SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw    m0, [pad20]
    movu     [r4+i*mmsize*3], m0
    add      r4, mmsize   ; next column strip
    lea      r1, [r1+r2*8+mmsize]
%if %2==8
    lea      r1, [r1+r2*4]
%endif
    dec      r3d
    jg .v_loop
    neg      r2
    ret
%endmacro

INIT_MMX
HV mmxext, 4
INIT_XMM
HV sse2  , 8

; Horizontal second pass for the hv positions: 6-tap filter one row of
; the intermediate buffer at r1 with pmaddwd against tap1/tap2/tap3,
; de-bias and round via depad, shift right by 10, repack the even (m1)
; and odd (m2) dword results into one word vector, then clip.
; Result in m1.  Expects m7 = pw_pixel_max; when num_mmregs > 8 also
; m0 = 0 and m8-m11 = tap1/tap2/tap3/depad, preloaded by the caller.
%macro H_LOOP 2
%if num_mmregs > 8
    %define s1 m8
    %define s2 m9
    %define s3 m10
    %define d1 m11
%else
    %define s1 [tap1]
    %define s2 [tap2]
    %define s3 [tap3]
    %define d1 [depad]
%endif
h%2_loop_op_%1:
    movu      m1, [r1+mmsize-4]
    movu      m2, [r1+mmsize-2]
    mova      m3, [r1+mmsize+0]
    movu      m4, [r1+mmsize+2]
    movu      m5, [r1+mmsize+4]
    movu      m6, [r1+mmsize+6]
%if num_mmregs > 8
    pmaddwd   m1, s1
    pmaddwd   m2, s1
    pmaddwd   m3, s2
    pmaddwd   m4, s2
    pmaddwd   m5, s3
    pmaddwd   m6, s3
    paddd     m1, d1
    paddd     m2, d1
%else
    mova      m0, s1
    pmaddwd   m1, m0
    pmaddwd   m2, m0
    mova      m0, s2
    pmaddwd   m3, m0
    pmaddwd   m4, m0
    mova      m0, s3
    pmaddwd   m5, m0
    pmaddwd   m6, m0
    mova      m0, d1
    paddd     m1, m0
    paddd     m2, m0
%endif
    paddd     m3, m5
    paddd     m4, m6
    paddd     m1, m3
    paddd     m2, m4
    psrad     m1, 10
    psrad     m2, 10
    pslld     m2, 16          ; interleave even/odd results back to words
    pand      m1, [pd_0f]
    por       m1, m2
%if num_mmregs <= 8
    pxor      m0, m0
%endif
    CLIPW     m1, m0, m7
    add       r1, mmsize*3    ; next buffer row
    ret
%endmacro

INIT_MMX
H_LOOP mmxext, 4
INIT_XMM
H_LOOP sse2  , 8

; mc22: half-pel in both directions -- vertical pass into an aligned
; stack buffer, then the pmaddwd horizontal pass row by row.
%macro MC22 3
cglobal_mc %1, %2, mc22, %3, 3,7,12
%define PAD mmsize*8*4*2      ; SIZE*16*4*sizeof(pixel)
    mov       r6, rsp         ; backup stack pointer
    and       rsp, ~(mmsize-1) ; align stack
    sub       rsp, PAD

    call put_hv%3_10_%1

    mov       r3d, %3
    mova      m7, [pw_pixel_max]
%if num_mmregs > 8
    pxor      m0, m0
    mova      m8, [tap1]
    mova      m9, [tap2]
    mova      m10, [tap3]
    mova      m11, [depad]
%endif
    mov       r1, rsp
.h_loop:
    call h%3_loop_op_%1

    OP_MOV    [r0], m1
    add       r0, r2
    dec       r3d
    jg .h_loop

    mov       rsp, r6         ; restore stack pointer
    ret
%endmacro

MC MC22

;-----------------------------------------------------------------------------
; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; mc12: the hv half-pel averaged with the vertical half-pel, which is
; recovered from the intermediate buffer by de-biasing (depad2/unpad)
; and shifting.  r4 = byte offset of the value to average with (0 here;
; mc32 and mc21 jump into .body with a different r4).
%macro MC12 3
cglobal_mc %1, %2, mc12, %3, 3,7,12
%define PAD mmsize*8*4*2        ; SIZE*16*4*sizeof(pixel)
    mov        r6, rsp          ; backup stack pointer
    and        rsp, ~(mmsize-1) ; align stack
    sub        rsp, PAD

    call put_hv%3_10_%1

    xor        r4d, r4d
.body
    mov        r3d, %3
    pxor       m0, m0
    mova       m7, [pw_pixel_max]
%if num_mmregs > 8
    mova       m8, [tap1]
    mova       m9, [tap2]
    mova       m10, [tap3]
    mova       m11, [depad]
%endif
    mov        r1, rsp
.h_loop:
    call h%3_loop_op_%1

    movu       m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
    paddw      m3, [depad2]
    psrlw      m3, 5
    psubw      m3, [unpad]
    CLIPW      m3, m0, m7
    pavgw      m1, m3

    OP_MOV     [r0], m1
    add        r0, r2
    dec        r3d
    jg .h_loop

    mov        rsp, r6          ; restore stack pointer
    ret
%endmacro

MC MC12

;-----------------------------------------------------------------------------
; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; mc32: as mc12 but averaging with the column one pixel to the right
; (r4 = sizeof(pixel)); reuses the mc12 body.
%macro MC32 3
cglobal_mc %1, %2, mc32, %3, 3,7,12
%define PAD mmsize*8*3*2        ; SIZE*16*3*sizeof(pixel)
    mov  r6, rsp                ; backup stack pointer
    and  rsp, ~(mmsize-1)       ; align stack
    sub  rsp, PAD

    call put_hv%3_10_%1

    mov  r4d, 2                 ; sizeof(pixel)
    jmp stub_%2_h264_qpel%3_mc12_10_%1.body
%endmacro

MC MC32

;-----------------------------------------------------------------------------
; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; Horizontal-only first pass: 6-tap filter %2 rows starting at r5 and
; store the pad20-debiased raw sums into the stack buffer, using the
; same mmsize*3 row stride as the hv buffer.
%macro H_NRD 2
put_h%2_10_%1:
    add       rsp, gprsize     ; step over the return address so
                               ; [rsp+r4] addresses the caller's buffer
    mov       r3d, %2
    xor       r4d, r4d
    mova      m6, [pad20]
.nextrow
    movu      m2, [r5-4]
    movu      m3, [r5-2]
    movu      m4, [r5+0]
    ADDW      m2, [r5+6], m5
    ADDW      m3, [r5+4], m5
    ADDW      m4, [r5+2], m5

    FILT_H2   m2, m3, m4
    psubw     m2, m6
    mova      [rsp+r4], m2
    add       r4d, mmsize*3
    add       r5, r2
    dec       r3d
    jg .nextrow
    sub       rsp, gprsize     ; restore for ret
    ret
%endmacro

INIT_MMX
H_NRD mmxext, 4
INIT_XMM
H_NRD sse2  , 8

; mc21: the hv half-pel averaged with the horizontal half-pel.  Runs the
; horizontal pass into one stack buffer and the hv pass into a second
; buffer below it, then reuses the mc12 body with r4 pointing into the
; H buffer.
%macro MC21 3
cglobal_mc %1, %2, mc21, %3, 3,7,12
    mov   r5, r1
.body
%define PAD mmsize*8*3*2        ; SIZE*16*3*sizeof(pixel)
    mov   r6, rsp               ; backup stack pointer
    and   rsp, ~(mmsize-1)      ; align stack

    sub   rsp, PAD
    call put_h%3_10_%1          ; H pass -> upper buffer

    sub   rsp, PAD
    call put_hv%3_10_%1         ; HV pass -> lower buffer

    mov   r4d, PAD-mmsize       ; H buffer
    jmp stub_%2_h264_qpel%3_mc12_10_%1.body
%endmacro

MC MC21

;-----------------------------------------------------------------------------
; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; mc23: as mc21 but with the horizontal pass taken one row down.
%macro MC23 3
cglobal_mc %1, %2, mc23, %3, 3,7,12
    lea   r5, [r1+r2]
    jmp stub_%2_h264_qpel%3_mc21_10_%1.body
%endmacro

MC MC23