;******************************************************************************
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
;* SIMD-optimized halfpel functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
cextern pb_1
cextern pw_2
; 0.5 in Q14, used with pmulhrsw to get a rounded divide by 4 (see SSSE3 path)
pw_8192:         times 8 dw (1<<13)
; shuffle masks that re-interleave the low/high halves after packuswb
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7

SECTION_TEXT

; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Rounded horizontal half-pel: dst = avg(src, src+1), processing 4 rows per
; loop iteration.  h is assumed to be a multiple of 4 (the loop counts down
; by 4 and exits with jne).  Under INIT_XMM sse2 the same body works on
; 16-byte rows, so it is instantiated as put_pixels16_x2 instead.
%macro PUT_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal put_pixels16_x2, 4,5,4
%else
cglobal put_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]          ; r4 = 2*line_size
.loop:
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    ; sse2 pavgb cannot take an unaligned memory operand here, so load first
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r1, r4
    add          r0, r4
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4               ; 4 rows done per iteration
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_X2
INIT_MMX 3dnow
PUT_PIXELS8_X2


; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; 16-pixel-wide variant of the above for 8-byte MMX registers: each row is
; handled as two 8-byte halves (offsets 0 and 8).  Sources must be aligned
; (mova loads).
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
    lea          r4, [r2*2]
.loop:
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r1, r4
    add          r0, r4
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS_16
INIT_MMX 3dnow
PUT_PIXELS_16
; The 8_X2 macro can easily be used here
INIT_XMM sse2
PUT_PIXELS8_X2


; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; "No-rounding" horizontal half-pel.  pavgb computes (a+b+1)>>1 (rounds up);
; saturating-subtracting 1 (pb_1) from one operand beforehand yields the
; round-down average for most inputs.  This is an approximation, not exact
; for every pixel pair — the _exact variant below handles bit-exact cases.
%macro PUT_NO_RND_PIXELS8_X2 0
cglobal put_no_rnd_pixels8_x2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]
.loop:
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, [r1+1]
    mova         m3, [r1+r2+1]
    add          r1, r4
    psubusb      m0, m6              ; bias one operand down by 1 to undo pavgb's +1
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    mova         m0, [r1]
    mova         m1, [r1+1]
    mova         m2, [r1+r2]
    mova         m3, [r1+r2+1]
    add          r0, r4
    add          r1, r4
    psubusb      m0, m6
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2


; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Bit-exact round-down average via the complement trick:
;   ~avg(~a, ~b) == (a+b)>>1
; m6 is all-ones (pcmpeqb m6,m6), so pxor with m6 is a byte-wise NOT.
; Processes 4 rows per iteration; uses jg, so odd trailing counts terminate.
%macro PUT_NO_RND_PIXELS8_X2_EXACT 0
cglobal put_no_rnd_pixels8_x2_exact, 4,5
    lea          r4, [r2*3]          ; r4 = 3*line_size
    pcmpeqb      m6, m6              ; m6 = 0xFF.. (NOT mask)
.loop:
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, [r1+1]
    mova         m3, [r1+r2+1]
    pxor         m0, m6
    pxor         m2, m6
    pxor         m1, m6
    pxor         m3, m6
    PAVGB        m0, m1              ; avg of complements
    PAVGB        m2, m3
    pxor         m0, m6              ; complement back -> floor average
    pxor         m2, m6
    mova       [r0], m0
    mova    [r0+r2], m2
    mova         m0, [r1+r2*2]
    mova         m1, [r1+r2*2+1]
    mova         m2, [r1+r4]
    mova         m3, [r1+r4+1]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    pxor         m0, m6
    pxor         m2, m6
    mova  [r0+r2*2], m0
    mova    [r0+r4], m2
    lea          r1, [r1+r2*4]
    lea          r0, [r0+r2*4]
    sub         r3d, 4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2_EXACT


; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Rounded vertical half-pel: dst row = avg(src row, next src row).
; The previous row is carried across iterations in a register (m0/m2) so each
; source row is loaded only once.  r0 is pre-decremented by line_size so all
; stores use positive offsets [r0+r2]/[r0+r4].
%macro PUT_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal put_pixels16_y2, 4,5,3
%else
cglobal put_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]
    movu         m0, [r1]            ; prime the pipeline with row 0
    sub          r0, r2
.loop:
    movu         m1, [r1+r2]
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu         m1, [r1+r2]
    movu         m0, [r1+r4]
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1              ; m2 still holds the carried previous row
    PAVGB        m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_Y2
INIT_MMX 3dnow
PUT_PIXELS8_Y2
; actually, put_pixels16_y2_sse2
INIT_XMM sse2
PUT_PIXELS8_Y2


; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; No-rounding vertical half-pel; same structure as PUT_PIXELS8_Y2 with the
; psubusb pb_1 bias applied to the shared middle row before both averages.
%macro PUT_NO_RND_PIXELS8_Y2 0
cglobal put_no_rnd_pixels8_y2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2+r2]
    mova         m0, [r1]
    sub          r0, r2
.loop:
    mova         m1, [r1+r2]
    mova         m2, [r1+r4]
    add          r1, r4
    psubusb      m1, m6              ; bias middle row, used by both averages below
    PAVGB        m0, m1
    PAVGB        m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]
    add          r0, r4
    add          r1, r4
    psubusb      m1, m6
    PAVGB        m2, m1
    PAVGB        m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2


; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Bit-exact round-down vertical half-pel using the same complement trick as
; the _x2_exact variant (~avg(~a,~b) == (a+b)>>1).  The carried previous row
; is kept in complemented form across iterations (m0 is complemented once
; before the loop, m2 stays complemented between halves).
%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
cglobal put_no_rnd_pixels8_y2_exact, 4,5
    lea          r4, [r2*3]
    mova         m0, [r1]
    pcmpeqb      m6, m6              ; NOT mask
    add          r1, r2
    pxor         m0, m6              ; keep carried row complemented
.loop:
    mova         m1, [r1]
    mova         m2, [r1+r2]
    pxor         m1, m6
    pxor         m2, m6
    PAVGB        m0, m1
    PAVGB        m1, m2
    pxor         m0, m6
    pxor         m1, m6
    mova       [r0], m0
    mova    [r0+r2], m1
    mova         m1, [r1+r2*2]
    mova         m0, [r1+r4]
    pxor         m1, m6
    pxor         m0, m6
    PAVGB        m2, m1              ; m2 was left complemented above
    PAVGB        m1, m0
    pxor         m2, m6
    pxor         m1, m6
    mova  [r0+r2*2], m2
    mova    [r0+r4], m1
    lea          r1, [r1+r2*4]
    lea          r0, [r0+r2*4]
    sub         r3d, 4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2_EXACT


; void ff_avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; dst = avg(dst, src), 4 rows per iteration.  Only instantiated for 3DNow!
; (the mmxext version presumably lives elsewhere — this file only emits
; INIT_MMX 3dnow below).
%macro AVG_PIXELS8 0
cglobal avg_pixels8, 4,5
    lea          r4, [r2*2]
.loop:
    mova         m0, [r0]
    mova         m1, [r0+r2]
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r1, r4
    add          r0, r4
    mova         m0, [r0]
    mova         m1, [r0+r2]
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX 3dnow
AVG_PIXELS8


; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; dst = avg(dst, horizontal-half-pel(src)).  On plain MMX (no pavgb insn)
; the PAVGB macro needs a scratch register and a 0xFE mask (m5, built from
; pcmpeqd/paddb) for its emulation; mmxext/3dnow/sse2 ignore those extras.
%macro AVG_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal avg_pixels16_x2, 4,5,4
%else
cglobal avg_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]
%if notcpuflag(mmxext)
    pcmpeqd      m5, m5
    paddb        m5, m5              ; m5 = 0xFE.. mask for MMX PAVGB emulation
%endif
.loop:
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m2
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    add          r0, r4
    add          r1, r4
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmx
AVG_PIXELS8_X2
INIT_MMX mmxext
AVG_PIXELS8_X2
INIT_MMX 3dnow
AVG_PIXELS8_X2
; actually avg_pixels16_x2
INIT_XMM sse2
AVG_PIXELS8_X2


; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; dst = avg(dst, vertical-half-pel(src)); same carried-row scheme as
; PUT_PIXELS8_Y2 with an extra average against the destination.
%macro AVG_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal avg_pixels16_y2, 4,5,3
%else
cglobal avg_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]
    movu         m0, [r1]
    sub          r0, r2
.loop:
    movu         m1, [r1+r2]
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    PAVGB        m0, [r0+r2]         ; blend with existing destination
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu         m1, [r1+r2]
    movu         m0, [r1+r4]
    PAVGB        m2, m1
    PAVGB        m1, m0
    add          r0, r4
    add          r1, r4
    PAVGB        m2, [r0+r2]
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_Y2
INIT_MMX 3dnow
AVG_PIXELS8_Y2
; actually avg_pixels16_y2
INIT_XMM sse2
AVG_PIXELS8_Y2


; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Note this is not correctly rounded, and is therefore used for
; not-bitexact output
;
; Approximates the 4-tap (xy) half-pel with cascaded pavgb averages:
; each row is first averaged with its right neighbour, then adjacent
; row results are averaged, then blended with the destination.
%macro AVG_APPROX_PIXELS8_XY2 0
cglobal avg_approx_pixels8_xy2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]
    mova         m0, [r1]
    PAVGB        m0, [r1+1]          ; prime with x-averaged row 0
.loop:
    mova         m2, [r1+r4]
    mova         m1, [r1+r2]
    psubusb      m2, m6              ; -1 bias partially corrects cascaded rounding
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+r4+1]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    PAVGB        m0, [r0]
    PAVGB        m1, [r0+r2]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m0, [r1+r4+1]
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1
    PAVGB        m1, m0
    PAVGB        m2, [r0]
    PAVGB        m1, [r0+r2]
    mova       [r0], m2
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_APPROX_PIXELS8_XY2
INIT_MMX 3dnow
AVG_APPROX_PIXELS8_XY2


; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; Exact xy half-pel: dst = (a+b+c+d+2)>>2 over the 2x2 source neighbourhood,
; computed in 16-bit precision (bytes unpacked against zero in m7, +2 from
; pw_2, >>2 via psrlw).  The horizontal sum of the previous row pair is
; carried across loop halves (m4/m5, then m0/m1) so each row is summed once.
; %1 is 'put' or 'avg'; 'avg' additionally pavgb's against the destination.
; 2 rows per iteration; h assumed even.
%macro SET_PIXELS_XY2 1
%if cpuflag(sse2)
cglobal %1_pixels16_xy2, 4,5,8
%else
cglobal %1_pixels8_xy2, 4,5
%endif
    pxor         m7, m7              ; zero for byte->word unpacking
    mova         m6, [pw_2]          ; rounding constant
    movu         m0, [r1]
    movu         m4, [r1+1]
    mova         m1, m0
    mova         m5, m4
    punpcklbw    m0, m7
    punpcklbw    m4, m7
    punpckhbw    m1, m7
    punpckhbw    m5, m7
    paddusw      m4, m0              ; m4/m5 = row0 + row0-shifted (lo/hi words)
    paddusw      m5, m1
    xor          r4, r4              ; r4 = running row offset
    add          r1, r2
.loop:
    movu         m0, [r1+r4]
    movu         m2, [r1+r4+1]
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7
    punpcklbw    m2, m7
    punpckhbw    m1, m7
    punpckhbw    m3, m7
    paddusw      m0, m2              ; horizontal sum of current row
    paddusw      m1, m3
    paddusw      m4, m6              ; previous sum + 2 (rounding)
    paddusw      m5, m6
    paddusw      m4, m0              ; + current sum
    paddusw      m5, m1
    psrlw        m4, 2               ; /4
    psrlw        m5, 2
%ifidn %1, avg
    mova         m3, [r0+r4]
    packuswb     m4, m5
    PAVGB        m4, m3              ; blend with destination
%else
    packuswb     m4, m5
%endif
    mova    [r0+r4], m4
    add          r4, r2

    movu         m2, [r1+r4]
    movu         m4, [r1+r4+1]
    mova         m3, m2
    mova         m5, m4
    punpcklbw    m2, m7
    punpcklbw    m4, m7
    punpckhbw    m3, m7
    punpckhbw    m5, m7
    paddusw      m4, m2
    paddusw      m5, m3
    paddusw      m0, m6              ; m0/m1 carry the row sum from the first half
    paddusw      m1, m6
    paddusw      m0, m4
    paddusw      m1, m5
    psrlw        m0, 2
    psrlw        m1, 2
%ifidn %1, avg
    mova         m3, [r0+r4]
    packuswb     m0, m1
    PAVGB        m0, m3
%else
    packuswb     m0, m1
%endif
    mova    [r0+r4], m0
    add          r4, r2
    sub         r3d, 2
    jnz .loop
    REP_RET
%endmacro

INIT_MMX mmxext
SET_PIXELS_XY2 avg
INIT_MMX 3dnow
SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg

; SSSE3 xy half-pel.  pmaddubsw against pb_1 sums adjacent byte pairs into
; words (the horizontal a+b step); pmulhrsw with pw_8192 (0.5 in Q14) gives
; a rounded >>2; packuswb + pshufb with pb_interleave* restores byte order
; since pmaddubsw produced even/odd lanes separately.  %1 is put/avg; the
; optional %2 (xmm register count) selects the 16-pixel XMM version.
; Carried row sums work as in SET_PIXELS_XY2.  2 rows per iteration.
%macro SSSE3_PIXELS_XY2 1-2
%if %0 == 2 ; sse2
cglobal %1_pixels16_xy2, 4,5,%2
    mova         m4, [pb_interleave16]
%else
cglobal %1_pixels8_xy2, 4,5
    mova         m4, [pb_interleave8]
%endif
    mova         m5, [pb_1]
    movu         m0, [r1]
    movu         m1, [r1+1]
    pmaddubsw    m0, m5              ; sum adjacent bytes of row 0
    pmaddubsw    m1, m5
    xor          r4, r4
    add          r1, r2
.loop:
    movu         m2, [r1+r4]
    movu         m3, [r1+r4+1]
    pmaddubsw    m2, m5
    pmaddubsw    m3, m5
    paddusw      m0, m2              ; prev-row sum + current-row sum
    paddusw      m1, m3
    pmulhrsw     m0, [pw_8192]       ; rounded divide by 4
    pmulhrsw     m1, [pw_8192]
%ifidn %1, avg
    mova         m6, [r0+r4]
    packuswb     m0, m1
    pshufb       m0, m4              ; undo pmaddubsw lane interleave
    pavgb        m0, m6
%else
    packuswb     m0, m1
    pshufb       m0, m4
%endif
    mova    [r0+r4], m0
    add          r4, r2

    movu         m0, [r1+r4]
    movu         m1, [r1+r4+1]
    pmaddubsw    m0, m5
    pmaddubsw    m1, m5
    paddusw      m2, m0              ; m2/m3 carry the sum from the first half
    paddusw      m3, m1
    pmulhrsw     m2, [pw_8192]
    pmulhrsw     m3, [pw_8192]
%ifidn %1, avg
    mova         m6, [r0+r4]
    packuswb     m2, m3
    pshufb       m2, m4
    pavgb        m2, m6
%else
    packuswb     m2, m3
    pshufb       m2, m4
%endif
    mova    [r0+r4], m2
    add          r4, r2
    sub         r3d, 2
    jnz .loop
    REP_RET
%endmacro

INIT_MMX ssse3
SSSE3_PIXELS_XY2 put
SSSE3_PIXELS_XY2 avg
INIT_XMM ssse3
SSSE3_PIXELS_XY2 put, 6
SSSE3_PIXELS_XY2 avg, 7