1;****************************************************************************** 2;* VP8 MMXEXT optimizations 3;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> 4;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> 5;* 6;* This file is part of Libav. 7;* 8;* Libav is free software; you can redistribute it and/or 9;* modify it under the terms of the GNU Lesser General Public 10;* License as published by the Free Software Foundation; either 11;* version 2.1 of the License, or (at your option) any later version. 12;* 13;* Libav is distributed in the hope that it will be useful, 14;* but WITHOUT ANY WARRANTY; without even the implied warranty of 15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16;* Lesser General Public License for more details. 17;* 18;* You should have received a copy of the GNU Lesser General Public 19;* License along with Libav; if not, write to the Free Software 20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21;****************************************************************************** 22 23%include "x86inc.asm" 24%include "x86util.asm" 25 26SECTION_RODATA 27 28fourtap_filter_hw_m: times 4 dw -6, 123 29 times 4 dw 12, -1 30 times 4 dw -9, 93 31 times 4 dw 50, -6 32 times 4 dw -6, 50 33 times 4 dw 93, -9 34 times 4 dw -1, 12 35 times 4 dw 123, -6 36 37sixtap_filter_hw_m: times 4 dw 2, -11 38 times 4 dw 108, 36 39 times 4 dw -8, 1 40 times 4 dw 3, -16 41 times 4 dw 77, 77 42 times 4 dw -16, 3 43 times 4 dw 1, -8 44 times 4 dw 36, 108 45 times 4 dw -11, 2 46 47fourtap_filter_hb_m: times 8 db -6, 123 48 times 8 db 12, -1 49 times 8 db -9, 93 50 times 8 db 50, -6 51 times 8 db -6, 50 52 times 8 db 93, -9 53 times 8 db -1, 12 54 times 8 db 123, -6 55 56sixtap_filter_hb_m: times 8 db 2, 1 57 times 8 db -11, 108 58 times 8 db 36, -8 59 times 8 db 3, 3 60 times 8 db -16, 77 61 times 8 db 77, -16 62 times 8 db 1, 2 63 times 8 db -8, 36 64 times 8 db 108, -11 65 66fourtap_filter_v_m: times 8 dw -6 67 times 8 dw 123 68 times 8 dw 12 69 times 8 dw -1 70 times 8 dw -9 71 times 8 dw 93 72 times 8 dw 50 73 times 8 dw -6 74 times 8 dw -6 75 times 8 dw 50 76 times 8 dw 93 77 times 8 dw -9 78 times 8 dw -1 79 times 8 dw 12 80 times 8 dw 123 81 times 8 dw -6 82 83sixtap_filter_v_m: times 8 dw 2 84 times 8 dw -11 85 times 8 dw 108 86 times 8 dw 36 87 times 8 dw -8 88 times 8 dw 1 89 times 8 dw 3 90 times 8 dw -16 91 times 8 dw 77 92 times 8 dw 77 93 times 8 dw -16 94 times 8 dw 3 95 times 8 dw 1 96 times 8 dw -8 97 times 8 dw 36 98 times 8 dw 108 99 times 8 dw -11 100 times 8 dw 2 101 102bilinear_filter_vw_m: times 8 dw 1 103 times 8 dw 2 104 times 8 dw 3 105 times 8 dw 4 106 times 8 dw 5 107 times 8 dw 6 108 times 8 dw 7 109 110bilinear_filter_vb_m: times 8 db 7, 1 111 times 8 db 6, 2 112 times 8 db 5, 3 113 times 8 db 4, 4 114 times 8 db 3, 5 115 times 8 db 2, 6 116 times 8 db 1, 7 117 118%ifdef PIC 119%define fourtap_filter_hw r11 120%define sixtap_filter_hw r11 121%define fourtap_filter_hb r11 122%define sixtap_filter_hb r11 123%define fourtap_filter_v r11 124%define sixtap_filter_v r11 125%define bilinear_filter_vw r11 126%define bilinear_filter_vb r11 127%else 128%define fourtap_filter_hw fourtap_filter_hw_m 129%define sixtap_filter_hw sixtap_filter_hw_m 130%define fourtap_filter_hb fourtap_filter_hb_m 131%define sixtap_filter_hb sixtap_filter_hb_m 132%define fourtap_filter_v fourtap_filter_v_m 133%define sixtap_filter_v sixtap_filter_v_m 134%define bilinear_filter_vw bilinear_filter_vw_m 135%define bilinear_filter_vb bilinear_filter_vb_m 136%endif 137 138filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 139filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 140 141filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 142filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 143filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 144 145pw_20091: times 4 dw 20091 146pw_17734: times 4 dw 17734 147 148pb_27_63: times 8 db 27, 63 149pb_18_63: times 8 db 18, 63 150pb_9_63: times 8 db 9, 63 151 152cextern pb_1 153cextern pw_3 154cextern pb_3 155cextern pw_4 156cextern pb_4 157cextern pw_9 158cextern pw_18 159cextern pw_27 160cextern pw_63 161cextern pw_64 162cextern pb_80 163cextern pb_F8 164cextern pb_FE 165 166SECTION .text 167 168;----------------------------------------------------------------------------- 169; subpel MC functions: 170; 171; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, 172; uint8_t *src, int srcstride, 173; int height, int mx, int my); 174;----------------------------------------------------------------------------- 175 176%macro FILTER_SSSE3 3 177cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 178 lea r5d, [r5*3] 179 mova m3, [filter_h6_shuf2] 180 mova m4, [filter_h6_shuf3] 181%ifdef PIC 182 lea r11, [sixtap_filter_hb_m] 183%endif 184 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes 185 mova m6, [sixtap_filter_hb+r5*8-32] 186 mova m7, [sixtap_filter_hb+r5*8-16] 187 188.nextrow 189 movu m0, [r2-2] 190 mova m1, m0 191 mova m2, m0 192%ifidn %1, 4 193; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the 194; shuffle with a memory operand 195 punpcklbw m0, [r2+3] 196%else 197 pshufb m0, [filter_h6_shuf1] 198%endif 199 pshufb m1, m3 200 pshufb m2, m4 201 pmaddubsw m0, m5 202 pmaddubsw m1, m6 203 pmaddubsw m2, m7 204 paddsw m0, m1 205 paddsw m0, m2 206 paddsw m0, [pw_64] 207 psraw m0, 7 208 packuswb m0, m0 209 movh [r0], m0 ; store 210 211 ; go to next line 212 add r0, r1 213 add r2, r3 214 dec r4d ; next row 215 jg .nextrow 216 REP_RET 217 218cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 219 shl r5d, 4 220 mova m2, [pw_64] 221 mova m3, [filter_h2_shuf] 222 mova m4, [filter_h4_shuf] 223%ifdef PIC 224 lea r11, [fourtap_filter_hb_m] 225%endif 226 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes 227 mova m6, [fourtap_filter_hb+r5] 228 229.nextrow 230 movu m0, [r2-1] 231 mova m1, m0 232 pshufb m0, m3 233 pshufb m1, m4 234 pmaddubsw m0, m5 235 pmaddubsw m1, m6 236 paddsw m0, m2 237 paddsw m0, m1 238 psraw m0, 7 239 packuswb m0, m0 240 movh [r0], m0 ; store 241 242 ; go to next line 243 add r0, r1 244 add r2, r3 245 dec r4d ; next row 246 jg .nextrow 247 REP_RET 248 249cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 250 shl r6d, 4 251%ifdef PIC 252 lea r11, [fourtap_filter_hb_m] 253%endif 254 mova m5, [fourtap_filter_hb+r6-16] 255 mova m6, [fourtap_filter_hb+r6] 256 mova m7, [pw_64] 257 258 ; read 3 lines 259 sub r2, r3 260 movh m0, [r2] 261 movh m1, [r2+ r3] 262 movh m2, [r2+2*r3] 263 add r2, r3 264 265.nextrow 266 movh m3, [r2+2*r3] ; read new row 267 mova m4, m0 268 mova m0, m1 269 punpcklbw m4, m1 270 mova m1, m2 271 punpcklbw m2, m3 272 pmaddubsw m4, m5 273 pmaddubsw m2, m6 274 paddsw m4, m2 275 mova m2, m3 276 paddsw m4, m7 277 psraw m4, 7 278 packuswb m4, m4 279 movh [r0], m4 280 281 ; go to next line 282 add r0, r1 283 add r2, r3 284 dec r4d ; next row 285 jg .nextrow 286 REP_RET 287 288cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 289 lea r6d, [r6*3] 290%ifdef PIC 291 lea r11, [sixtap_filter_hb_m] 292%endif 293 lea r6, [sixtap_filter_hb+r6*8] 294 295 ; read 5 lines 296 sub r2, r3 297 sub r2, r3 298 movh m0, [r2] 299 movh m1, [r2+r3] 300 movh m2, [r2+r3*2] 301 lea r2, [r2+r3*2] 302 add r2, r3 303 movh m3, [r2] 304 movh m4, [r2+r3] 305 306.nextrow 307 movh m5, [r2+2*r3] ; read new row 308 mova m6, m0 309 punpcklbw m6, m5 310 mova m0, m1 311 punpcklbw m1, m2 312 mova m7, m3 313 punpcklbw m7, m4 314 pmaddubsw m6, [r6-48] 315 pmaddubsw m1, [r6-32] 316 pmaddubsw m7, [r6-16] 317 paddsw m6, m1 318 paddsw m6, m7 319 mova m1, m2 320 paddsw m6, [pw_64] 321 mova m2, m3 322 psraw m6, 7 323 mova m3, m4 324 packuswb m6, m6 325 mova m4, m5 326 movh [r0], m6 327 328 ; go to next line 329 add r0, r1 330 add r2, r3 331 dec r4d ; next row 332 jg .nextrow 333 REP_RET 334%endmacro 335 336INIT_MMX 337FILTER_SSSE3 4, 0, 0 338INIT_XMM 339FILTER_SSSE3 8, 8, 7 340 341; 4x4 block, H-only 4-tap filter 342cglobal put_vp8_epel4_h4_mmxext, 6, 6 343 shl r5d, 4 344%ifdef PIC 345 lea r11, [fourtap_filter_hw_m] 346%endif 347 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words 348 movq mm5, [fourtap_filter_hw+r5] 349 movq mm7, [pw_64] 350 pxor mm6, mm6 351 352.nextrow 353 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels 354 355 ; first set of 2 pixels 356 movq mm2, mm1 ; byte ABCD.. 357 punpcklbw mm1, mm6 ; byte->word ABCD 358 pshufw mm0, mm2, 9 ; byte CDEF.. 359 punpcklbw mm0, mm6 ; byte->word CDEF 360 pshufw mm3, mm1, 0x94 ; word ABBC 361 pshufw mm1, mm0, 0x94 ; word CDDE 362 pmaddwd mm3, mm4 ; multiply 2px with F0/F1 363 movq mm0, mm1 ; backup for second set of pixels 364 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 365 paddd mm3, mm1 ; finish 1st 2px 366 367 ; second set of 2 pixels, use backup of above 368 punpckhbw mm2, mm6 ; byte->word EFGH 369 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 370 pshufw mm1, mm2, 0x94 ; word EFFG 371 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 372 paddd mm0, mm1 ; finish 2nd 2px 373 374 ; merge two sets of 2 pixels into one set of 4, round/clip/store 375 packssdw mm3, mm0 ; merge dword->word (4px) 376 paddsw mm3, mm7 ; rounding 377 psraw mm3, 7 378 packuswb mm3, mm6 ; clip and word->bytes 379 movd [r0], mm3 ; store 380 381 ; go to next line 382 add r0, r1 383 add r2, r3 384 dec r4d ; next row 385 jg .nextrow 386 REP_RET 387 388; 4x4 block, H-only 6-tap filter 389cglobal put_vp8_epel4_h6_mmxext, 6, 6 390 lea r5d, [r5*3] 391%ifdef PIC 392 lea r11, [sixtap_filter_hw_m] 393%endif 394 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 4tap filter in words 395 movq mm5, [sixtap_filter_hw+r5*8-32] 396 movq mm6, [sixtap_filter_hw+r5*8-16] 397 movq mm7, [pw_64] 398 pxor mm3, mm3 399 400.nextrow 401 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels 402 403 ; first set of 2 pixels 404 movq mm2, mm1 ; byte ABCD.. 405 punpcklbw mm1, mm3 ; byte->word ABCD 406 pshufw mm0, mm2, 0x9 ; byte CDEF.. 407 punpckhbw mm2, mm3 ; byte->word EFGH 408 punpcklbw mm0, mm3 ; byte->word CDEF 409 pshufw mm1, mm1, 0x94 ; word ABBC 410 pshufw mm2, mm2, 0x94 ; word EFFG 411 pmaddwd mm1, mm4 ; multiply 2px with F0/F1 412 pshufw mm3, mm0, 0x94 ; word CDDE 413 movq mm0, mm3 ; backup for second set of pixels 414 pmaddwd mm3, mm5 ; multiply 2px with F2/F3 415 paddd mm1, mm3 ; add to 1st 2px cache 416 movq mm3, mm2 ; backup for second set of pixels 417 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 418 paddd mm1, mm2 ; finish 1st 2px 419 420 ; second set of 2 pixels, use backup of above 421 movd mm2, [r2+3] ; byte FGHI (prevent overreads) 422 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 423 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 424 paddd mm0, mm3 ; add to 2nd 2px cache 425 pxor mm3, mm3 426 punpcklbw mm2, mm3 ; byte->word FGHI 427 pshufw mm2, mm2, 0xE9 ; word GHHI 428 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 429 paddd mm0, mm2 ; finish 2nd 2px 430 431 ; merge two sets of 2 pixels into one set of 4, round/clip/store 432 packssdw mm1, mm0 ; merge dword->word (4px) 433 paddsw mm1, mm7 ; rounding 434 psraw mm1, 7 435 packuswb mm1, mm3 ; clip and word->bytes 436 movd [r0], mm1 ; store 437 438 ; go to next line 439 add r0, r1 440 add r2, r3 441 dec r4d ; next row 442 jg .nextrow 443 REP_RET 444 445INIT_XMM 446cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 447 shl r5d, 5 448%ifdef PIC 449 lea r11, [fourtap_filter_v_m] 450%endif 451 lea r5, [fourtap_filter_v+r5-32] 452 pxor m7, m7 453 mova m4, [pw_64] 454 mova m5, [r5+ 0] 455 mova m6, [r5+16] 456%ifdef m8 457 mova m8, [r5+32] 458 mova m9, [r5+48] 459%endif 460.nextrow 461 movq m0, [r2-1] 462 movq m1, [r2-0] 463 movq m2, [r2+1] 464 movq m3, [r2+2] 465 punpcklbw m0, m7 466 punpcklbw m1, m7 467 punpcklbw m2, m7 468 punpcklbw m3, m7 469 pmullw m0, m5 470 pmullw m1, m6 471%ifdef m8 472 pmullw m2, m8 473 pmullw m3, m9 474%else 475 pmullw m2, [r5+32] 476 pmullw m3, [r5+48] 477%endif 478 paddsw m0, m1 479 paddsw m2, m3 480 paddsw m0, m2 481 paddsw m0, m4 482 psraw m0, 7 483 packuswb m0, m7 484 movh [r0], m0 ; store 485 486 ; go to next line 487 add r0, r1 488 add r2, r3 489 dec r4d ; next row 490 jg .nextrow 491 REP_RET 492 493cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 494 lea r5d, [r5*3] 495 shl r5d, 4 496%ifdef PIC 497 lea r11, [sixtap_filter_v_m] 498%endif 499 lea r5, [sixtap_filter_v+r5-96] 500 pxor m7, m7 501 mova m6, [pw_64] 502%ifdef m8 503 mova m8, [r5+ 0] 504 mova m9, [r5+16] 505 mova m10, [r5+32] 506 mova m11, [r5+48] 507 mova m12, [r5+64] 508 mova m13, [r5+80] 509%endif 510.nextrow 511 movq m0, [r2-2] 512 movq m1, [r2-1] 513 movq m2, [r2-0] 514 movq m3, [r2+1] 515 movq m4, [r2+2] 516 movq m5, [r2+3] 517 punpcklbw m0, m7 518 punpcklbw m1, m7 519 punpcklbw m2, m7 520 punpcklbw m3, m7 521 punpcklbw m4, m7 522 punpcklbw m5, m7 523%ifdef m8 524 pmullw m0, m8 525 pmullw m1, m9 526 pmullw m2, m10 527 pmullw m3, m11 528 pmullw m4, m12 529 pmullw m5, m13 530%else 531 pmullw m0, [r5+ 0] 532 pmullw m1, [r5+16] 533 pmullw m2, [r5+32] 534 pmullw m3, [r5+48] 535 pmullw m4, [r5+64] 536 pmullw m5, [r5+80] 537%endif 538 paddsw m1, m4 539 paddsw m0, m5 540 paddsw m1, m2 541 paddsw m0, m3 542 paddsw m0, m1 543 paddsw m0, m6 544 psraw m0, 7 545 packuswb m0, m7 546 movh [r0], m0 ; store 547 548 ; go to next line 549 add r0, r1 550 add r2, r3 551 dec r4d ; next row 552 jg .nextrow 553 REP_RET 554 555%macro FILTER_V 3 556; 4x4 block, V-only 4-tap filter 557cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 558 shl r6d, 5 559%ifdef PIC 560 lea r11, [fourtap_filter_v_m] 561%endif 562 lea r6, [fourtap_filter_v+r6-32] 563 mova m6, [pw_64] 564 pxor m7, m7 565 mova m5, [r6+48] 566 567 ; read 3 lines 568 sub r2, r3 569 movh m0, [r2] 570 movh m1, [r2+ r3] 571 movh m2, [r2+2*r3] 572 add r2, r3 573 punpcklbw m0, m7 574 punpcklbw m1, m7 575 punpcklbw m2, m7 576 577.nextrow 578 ; first calculate negative taps (to prevent losing positive overflows) 579 movh m4, [r2+2*r3] ; read new row 580 punpcklbw m4, m7 581 mova m3, m4 582 pmullw m0, [r6+0] 583 pmullw m4, m5 584 paddsw m4, m0 585 586 ; then calculate positive taps 587 mova m0, m1 588 pmullw m1, [r6+16] 589 paddsw m4, m1 590 mova m1, m2 591 pmullw m2, [r6+32] 592 paddsw m4, m2 593 mova m2, m3 594 595 ; round/clip/store 596 paddsw m4, m6 597 psraw m4, 7 598 packuswb m4, m7 599 movh [r0], m4 600 601 ; go to next line 602 add r0, r1 603 add r2, r3 604 dec r4d ; next row 605 jg .nextrow 606 REP_RET 607 608 609; 4x4 block, V-only 6-tap filter 610cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 611 shl r6d, 4 612 lea r6, [r6*3] 613%ifdef PIC 614 lea r11, [sixtap_filter_v_m] 615%endif 616 lea r6, [sixtap_filter_v+r6-96] 617 pxor m7, m7 618 619 ; read 5 lines 620 sub r2, r3 621 sub r2, r3 622 movh m0, [r2] 623 movh m1, [r2+r3] 624 movh m2, [r2+r3*2] 625 lea r2, [r2+r3*2] 626 add r2, r3 627 movh m3, [r2] 628 movh m4, [r2+r3] 629 punpcklbw m0, m7 630 punpcklbw m1, m7 631 punpcklbw m2, m7 632 punpcklbw m3, m7 633 punpcklbw m4, m7 634 635.nextrow 636 ; first calculate negative taps (to prevent losing positive overflows) 637 mova m5, m1 638 pmullw m5, [r6+16] 639 mova m6, m4 640 pmullw m6, [r6+64] 641 paddsw m6, m5 642 643 ; then calculate positive taps 644 movh m5, [r2+2*r3] ; read new row 645 punpcklbw m5, m7 646 pmullw m0, [r6+0] 647 paddsw m6, m0 648 mova m0, m1 649 mova m1, m2 650 pmullw m2, [r6+32] 651 paddsw m6, m2 652 mova m2, m3 653 pmullw m3, [r6+48] 654 paddsw m6, m3 655 mova m3, m4 656 mova m4, m5 657 pmullw m5, [r6+80] 658 paddsw m6, m5 659 660 ; round/clip/store 661 paddsw m6, [pw_64] 662 psraw m6, 7 663 packuswb m6, m7 664 movh [r0], m6 665 666 ; go to next line 667 add r0, r1 668 add r2, r3 669 dec r4d ; next row 670 jg .nextrow 671 REP_RET 672%endmacro 673 674INIT_MMX 675FILTER_V mmxext, 4, 0 676INIT_XMM 677FILTER_V sse2, 8, 8 678 679%macro FILTER_BILINEAR 3 680cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 681 mov r5d, 8*16 682 shl r6d, 4 683 sub r5d, r6d 684%ifdef PIC 685 lea r11, [bilinear_filter_vw_m] 686%endif 687 pxor m6, m6 688 mova m4, [bilinear_filter_vw+r5-16] 689 mova m5, [bilinear_filter_vw+r6-16] 690.nextrow 691 movh m0, [r2+r3*0] 692 movh m1, [r2+r3*1] 693 movh m3, [r2+r3*2] 694 punpcklbw m0, m6 695 punpcklbw m1, m6 696 punpcklbw m3, m6 697 mova m2, m1 698 pmullw m0, m4 699 pmullw m1, m5 700 pmullw m2, m4 701 pmullw m3, m5 702 paddsw m0, m1 703 paddsw m2, m3 704 psraw m0, 2 705 psraw m2, 2 706 pavgw m0, m6 707 pavgw m2, m6 708%ifidn %1, mmxext 709 packuswb m0, m0 710 packuswb m2, m2 711 movh [r0+r1*0], m0 712 movh [r0+r1*1], m2 713%else 714 packuswb m0, m2 715 movh [r0+r1*0], m0 716 movhps [r0+r1*1], m0 717%endif 718 719 lea r0, [r0+r1*2] 720 lea r2, [r2+r3*2] 721 sub r4d, 2 722 jg .nextrow 723 REP_RET 724 725cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 726 mov r6d, 8*16 727 shl r5d, 4 728 sub r6d, r5d 729%ifdef PIC 730 lea r11, [bilinear_filter_vw_m] 731%endif 732 pxor m6, m6 733 mova m4, [bilinear_filter_vw+r6-16] 734 mova m5, [bilinear_filter_vw+r5-16] 735.nextrow 736 movh m0, [r2+r3*0+0] 737 movh m1, [r2+r3*0+1] 738 movh m2, [r2+r3*1+0] 739 movh m3, [r2+r3*1+1] 740 punpcklbw m0, m6 741 punpcklbw m1, m6 742 punpcklbw m2, m6 743 punpcklbw m3, m6 744 pmullw m0, m4 745 pmullw m1, m5 746 pmullw m2, m4 747 pmullw m3, m5 748 paddsw m0, m1 749 paddsw m2, m3 750 psraw m0, 2 751 psraw m2, 2 752 pavgw m0, m6 753 pavgw m2, m6 754%ifidn %1, mmxext 755 packuswb m0, m0 756 packuswb m2, m2 757 movh [r0+r1*0], m0 758 movh [r0+r1*1], m2 759%else 760 packuswb m0, m2 761 movh [r0+r1*0], m0 762 movhps [r0+r1*1], m0 763%endif 764 765 lea r0, [r0+r1*2] 766 lea r2, [r2+r3*2] 767 sub r4d, 2 768 jg .nextrow 769 REP_RET 770%endmacro 771 772INIT_MMX 773FILTER_BILINEAR mmxext, 4, 0 774INIT_XMM 775FILTER_BILINEAR sse2, 8, 7 776 777%macro FILTER_BILINEAR_SSSE3 1 778cglobal put_vp8_bilinear%1_v_ssse3, 7,7 779 shl r6d, 4 780%ifdef PIC 781 lea r11, [bilinear_filter_vb_m] 782%endif 783 pxor m4, m4 784 mova m3, [bilinear_filter_vb+r6-16] 785.nextrow 786 movh m0, [r2+r3*0] 787 movh m1, [r2+r3*1] 788 movh m2, [r2+r3*2] 789 punpcklbw m0, m1 790 punpcklbw m1, m2 791 pmaddubsw m0, m3 792 pmaddubsw m1, m3 793 psraw m0, 2 794 psraw m1, 2 795 pavgw m0, m4 796 pavgw m1, m4 797%if mmsize==8 798 packuswb m0, m0 799 packuswb m1, m1 800 movh [r0+r1*0], m0 801 movh [r0+r1*1], m1 802%else 803 packuswb m0, m1 804 movh [r0+r1*0], m0 805 movhps [r0+r1*1], m0 806%endif 807 808 lea r0, [r0+r1*2] 809 lea r2, [r2+r3*2] 810 sub r4d, 2 811 jg .nextrow 812 REP_RET 813 814cglobal put_vp8_bilinear%1_h_ssse3, 7,7 815 shl r5d, 4 816%ifdef PIC 817 lea r11, [bilinear_filter_vb_m] 818%endif 819 pxor m4, m4 820 mova m2, [filter_h2_shuf] 821 mova m3, [bilinear_filter_vb+r5-16] 822.nextrow 823 movu m0, [r2+r3*0] 824 movu m1, [r2+r3*1] 825 pshufb m0, m2 826 pshufb m1, m2 827 pmaddubsw m0, m3 828 pmaddubsw m1, m3 829 psraw m0, 2 830 psraw m1, 2 831 pavgw m0, m4 832 pavgw m1, m4 833%if mmsize==8 834 packuswb m0, m0 835 packuswb m1, m1 836 movh [r0+r1*0], m0 837 movh [r0+r1*1], m1 838%else 839 packuswb m0, m1 840 movh [r0+r1*0], m0 841 movhps [r0+r1*1], m0 842%endif 843 844 lea r0, [r0+r1*2] 845 lea r2, [r2+r3*2] 846 sub r4d, 2 847 jg .nextrow 848 REP_RET 849%endmacro 850 851INIT_MMX 852FILTER_BILINEAR_SSSE3 4 853INIT_XMM 854FILTER_BILINEAR_SSSE3 8 855 856cglobal put_vp8_pixels8_mmx, 5,5 857.nextrow: 858 movq mm0, [r2+r3*0] 859 movq mm1, [r2+r3*1] 860 lea r2, [r2+r3*2] 861 movq [r0+r1*0], mm0 862 movq [r0+r1*1], mm1 863 lea r0, [r0+r1*2] 864 sub r4d, 2 865 jg .nextrow 866 REP_RET 867 868cglobal put_vp8_pixels16_mmx, 5,5 869.nextrow: 870 movq mm0, [r2+r3*0+0] 871 movq mm1, [r2+r3*0+8] 872 movq mm2, [r2+r3*1+0] 873 movq mm3, [r2+r3*1+8] 874 lea r2, [r2+r3*2] 875 movq [r0+r1*0+0], mm0 876 movq [r0+r1*0+8], mm1 877 movq [r0+r1*1+0], mm2 878 movq [r0+r1*1+8], mm3 879 lea r0, [r0+r1*2] 880 sub r4d, 2 881 jg .nextrow 882 REP_RET 883 884cglobal put_vp8_pixels16_sse, 5,5,2 885.nextrow: 886 movups xmm0, [r2+r3*0] 887 movups xmm1, [r2+r3*1] 888 lea r2, [r2+r3*2] 889 movaps [r0+r1*0], xmm0 890 movaps [r0+r1*1], xmm1 891 lea r0, [r0+r1*2] 892 sub r4d, 2 893 jg .nextrow 894 REP_RET 895 896;----------------------------------------------------------------------------- 897; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); 898;----------------------------------------------------------------------------- 899 900%macro ADD_DC 4 901 %4 m2, [r0+%3] 902 %4 m3, [r0+r2+%3] 903 %4 m4, [r1+%3] 904 %4 m5, [r1+r2+%3] 905 paddusb m2, %1 906 paddusb m3, %1 907 paddusb m4, %1 908 paddusb m5, %1 909 psubusb m2, %2 910 psubusb m3, %2 911 psubusb m4, %2 912 psubusb m5, %2 913 %4 [r0+%3], m2 914 %4 [r0+r2+%3], m3 915 %4 [r1+%3], m4 916 %4 [r1+r2+%3], m5 917%endmacro 918 919INIT_MMX 920cglobal vp8_idct_dc_add_mmx, 3, 3 921 ; load data 922 movd m0, [r1] 923 924 ; calculate DC 925 paddw m0, [pw_4] 926 pxor m1, m1 927 psraw m0, 3 928 movd [r1], m1 929 psubw m1, m0 930 packuswb m0, m0 931 packuswb m1, m1 932 punpcklbw m0, m0 933 punpcklbw m1, m1 934 punpcklwd m0, m0 935 punpcklwd m1, m1 936 937 ; add DC 938 lea r1, [r0+r2*2] 939 ADD_DC m0, m1, 0, movh 940 RET 941 942INIT_XMM 943cglobal vp8_idct_dc_add_sse4, 3, 3, 6 944 ; load data 945 movd m0, [r1] 946 pxor m1, m1 947 948 ; calculate DC 949 paddw m0, [pw_4] 950 movd [r1], m1 951 lea r1, [r0+r2*2] 952 movd m2, [r0] 953 movd m3, [r0+r2] 954 movd m4, [r1] 955 movd m5, [r1+r2] 956 psraw m0, 3 957 pshuflw m0, m0, 0 958 punpcklqdq m0, m0 959 punpckldq m2, m3 960 punpckldq m4, m5 961 punpcklbw m2, m1 962 punpcklbw m4, m1 963 paddw m2, m0 964 paddw m4, m0 965 packuswb m2, m4 966 movd [r0], m2 967 pextrd [r0+r2], m2, 1 968 pextrd [r1], m2, 2 969 pextrd [r1+r2], m2, 3 970 RET 971 972;----------------------------------------------------------------------------- 973; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); 974;----------------------------------------------------------------------------- 975 976INIT_MMX 977cglobal vp8_idct_dc_add4y_mmx, 3, 3 978 ; load data 979 movd m0, [r1+32*0] ; A 980 movd m1, [r1+32*2] ; C 981 punpcklwd m0, [r1+32*1] ; A B 982 punpcklwd m1, [r1+32*3] ; C D 983 punpckldq m0, m1 ; A B C D 984 pxor m6, m6 985 986 ; calculate DC 987 paddw m0, [pw_4] 988 movd [r1+32*0], m6 989 movd [r1+32*1], m6 990 movd [r1+32*2], m6 991 movd [r1+32*3], m6 992 psraw m0, 3 993 psubw m6, m0 994 packuswb m0, m0 995 packuswb m6, m6 996 punpcklbw m0, m0 ; AABBCCDD 997 punpcklbw m6, m6 ; AABBCCDD 998 movq m1, m0 999 movq m7, m6 1000 punpcklbw m0, m0 ; AAAABBBB 1001 punpckhbw m1, m1 ; CCCCDDDD 1002 punpcklbw m6, m6 ; AAAABBBB 1003 punpckhbw m7, m7 ; CCCCDDDD 1004 1005 ; add DC 1006 lea r1, [r0+r2*2] 1007 ADD_DC m0, m6, 0, mova 1008 ADD_DC m1, m7, 8, mova 1009 RET 1010 1011INIT_XMM 1012cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 1013 ; load data 1014 movd m0, [r1+32*0] ; A 1015 movd m1, [r1+32*2] ; C 1016 punpcklwd m0, [r1+32*1] ; A B 1017 punpcklwd m1, [r1+32*3] ; C D 1018 punpckldq m0, m1 ; A B C D 1019 pxor m1, m1 1020 1021 ; calculate DC 1022 paddw m0, [pw_4] 1023 movd [r1+32*0], m1 1024 movd [r1+32*1], m1 1025 movd [r1+32*2], m1 1026 movd [r1+32*3], m1 1027 psraw m0, 3 1028 psubw m1, m0 1029 packuswb m0, m0 1030 packuswb m1, m1 1031 punpcklbw m0, m0 1032 punpcklbw m1, m1 1033 punpcklbw m0, m0 1034 punpcklbw m1, m1 1035 1036 ; add DC 1037 lea r1, [r0+r2*2] 1038 ADD_DC m0, m1, 0, mova 1039 RET 1040 1041;----------------------------------------------------------------------------- 1042; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); 1043;----------------------------------------------------------------------------- 1044 1045INIT_MMX 1046cglobal vp8_idct_dc_add4uv_mmx, 3, 3 1047 ; load data 1048 movd m0, [r1+32*0] ; A 1049 movd m1, [r1+32*2] ; C 1050 punpcklwd m0, [r1+32*1] ; A B 1051 punpcklwd m1, [r1+32*3] ; C D 1052 punpckldq m0, m1 ; A B C D 1053 pxor m6, m6 1054 1055 ; calculate DC 1056 paddw m0, [pw_4] 1057 movd [r1+32*0], m6 1058 movd [r1+32*1], m6 1059 movd [r1+32*2], m6 1060 movd [r1+32*3], m6 1061 psraw m0, 3 1062 psubw m6, m0 1063 packuswb m0, m0 1064 packuswb m6, m6 1065 punpcklbw m0, m0 ; AABBCCDD 1066 punpcklbw m6, m6 ; AABBCCDD 1067 movq m1, m0 1068 movq m7, m6 1069 punpcklbw m0, m0 ; AAAABBBB 1070 punpckhbw m1, m1 ; CCCCDDDD 1071 punpcklbw m6, m6 ; AAAABBBB 1072 punpckhbw m7, m7 ; CCCCDDDD 1073 1074 ; add DC 1075 lea r1, [r0+r2*2] 1076 ADD_DC m0, m6, 0, mova 1077 lea r0, [r0+r2*4] 1078 lea r1, [r1+r2*4] 1079 ADD_DC m1, m7, 0, mova 1080 RET 1081 1082;----------------------------------------------------------------------------- 1083; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); 1084;----------------------------------------------------------------------------- 1085 1086; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) 1087; this macro assumes that m6/m7 have words for 20091/17734 loaded 1088%macro VP8_MULTIPLY_SUMSUB 4 1089 mova %3, %1 1090 mova %4, %2 1091 pmulhw %3, m6 ;20091(1) 1092 pmulhw %4, m6 ;20091(2) 1093 paddw %3, %1 1094 paddw %4, %2 1095 paddw %1, %1 1096 paddw %2, %2 1097 pmulhw %1, m7 ;35468(1) 1098 pmulhw %2, m7 ;35468(2) 1099 psubw %1, %4 1100 paddw %2, %3 1101%endmacro 1102 1103; calculate x0=%1+%3; x1=%1-%3 1104; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) 1105; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) 1106; %5/%6 are temporary registers 1107; we assume m6/m7 have constant words 20091/17734 loaded in them 1108%macro VP8_IDCT_TRANSFORM4x4_1D 6 1109 SUMSUB_BA w, %3, %1, %5 ;t0, t1 1110 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 1111 SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3 1112 SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2 1113 SWAP %4, %1 1114 SWAP %4, %3 1115%endmacro 1116 1117INIT_MMX 1118%macro VP8_IDCT_ADD 1 1119cglobal vp8_idct_add_%1, 3, 3 1120 ; load block data 1121 movq m0, [r1+ 0] 1122 movq m1, [r1+ 8] 1123 movq m2, [r1+16] 1124 movq m3, [r1+24] 1125 movq m6, [pw_20091] 1126 movq m7, [pw_17734] 1127%ifidn %1, sse 1128 xorps xmm0, xmm0 1129 movaps [r1+ 0], xmm0 1130 movaps [r1+16], xmm0 1131%else 1132 pxor m4, m4 1133 movq [r1+ 0], m4 1134 movq [r1+ 8], m4 1135 movq [r1+16], m4 1136 movq [r1+24], m4 1137%endif 1138 1139 ; actual IDCT 1140 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 1141 TRANSPOSE4x4W 0, 1, 2, 3, 4 1142 paddw m0, [pw_4] 1143 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 1144 TRANSPOSE4x4W 0, 1, 2, 3, 4 1145 1146 ; store 1147 pxor m4, m4 1148 lea r1, [r0+2*r2] 1149 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 1150 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 1151 1152 RET 1153%endmacro 1154 1155VP8_IDCT_ADD mmx 1156VP8_IDCT_ADD sse 1157 1158;----------------------------------------------------------------------------- 1159; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) 1160;----------------------------------------------------------------------------- 1161 1162%macro SCATTER_WHT 3 1163 movd r1d, m%1 1164 movd r2d, m%2 1165 mov [r0+2*16*(0+%3)], r1w 1166 mov [r0+2*16*(1+%3)], r2w 1167 shr r1d, 16 1168 shr r2d, 16 1169 psrlq m%1, 32 1170 psrlq m%2, 32 1171 mov [r0+2*16*(4+%3)], r1w 1172 mov [r0+2*16*(5+%3)], r2w 1173 movd r1d, m%1 1174 movd r2d, m%2 1175 mov [r0+2*16*(8+%3)], r1w 1176 mov [r0+2*16*(9+%3)], r2w 1177 shr r1d, 16 1178 shr r2d, 16 1179 mov [r0+2*16*(12+%3)], r1w 1180 mov [r0+2*16*(13+%3)], r2w 1181%endmacro 1182 1183%macro HADAMARD4_1D 4 1184 SUMSUB_BADC w, %2, %1, %4, %3 1185 SUMSUB_BADC w, %4, %2, %3, %1 1186 SWAP %1, %4, %3 1187%endmacro 1188 1189%macro VP8_DC_WHT 1 1190cglobal vp8_luma_dc_wht_%1, 2,3 1191 movq m0, [r1] 1192 movq m1, [r1+8] 1193 movq m2, [r1+16] 1194 movq m3, [r1+24] 1195%ifidn %1, sse 1196 xorps xmm0, xmm0 1197 movaps [r1+ 0], xmm0 1198 movaps [r1+16], xmm0 1199%else 1200 pxor m4, m4 1201 movq [r1+ 0], m4 1202 movq [r1+ 8], m4 1203 movq [r1+16], m4 1204 movq [r1+24], m4 1205%endif 1206 HADAMARD4_1D 0, 1, 2, 3 1207 TRANSPOSE4x4W 0, 1, 2, 3, 4 1208 paddw m0, [pw_3] 1209 HADAMARD4_1D 0, 1, 2, 3 1210 psraw m0, 3 1211 psraw m1, 3 1212 psraw m2, 3 1213 psraw m3, 3 1214 SCATTER_WHT 0, 1, 0 1215 SCATTER_WHT 2, 3, 2 1216 RET 1217%endmacro 1218 1219INIT_MMX 1220VP8_DC_WHT mmx 1221VP8_DC_WHT sse 1222 1223;----------------------------------------------------------------------------- 1224; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); 1225;----------------------------------------------------------------------------- 1226 1227; macro called with 7 mm register indexes as argument, and 4 regular registers 1228; 1229; first 4 mm registers will carry the transposed pixel data 1230; the other three are scratchspace (one would be sufficient, but this allows 1231; for more spreading/pipelining and thus faster execution on OOE CPUs) 1232; 1233; first two regular registers are buf+4*stride and buf+5*stride 1234; third is -stride, fourth is +stride 1235%macro READ_8x4_INTERLEAVED 11 1236 ; interleave 8 (A-H) rows of 4 pixels each 1237 movd m%1, [%8+%10*4] ; A0-3 1238 movd m%5, [%9+%10*4] ; B0-3 1239 movd m%2, [%8+%10*2] ; C0-3 1240 movd m%6, [%8+%10] ; D0-3 1241 movd m%3, [%8] ; E0-3 1242 movd m%7, [%9] ; F0-3 1243 movd m%4, [%9+%11] ; G0-3 1244 punpcklbw m%1, m%5 ; A/B interleaved 1245 movd m%5, [%9+%11*2] ; H0-3 1246 punpcklbw m%2, m%6 ; C/D interleaved 1247 punpcklbw m%3, m%7 ; E/F interleaved 1248 punpcklbw m%4, m%5 ; G/H interleaved 1249%endmacro 1250 1251; macro called with 7 mm register indexes as argument, and 5 regular registers 1252; first 11 mean the same as READ_8x4_TRANSPOSED above 1253; fifth regular register is scratchspace to reach the bottom 8 rows, it 1254; will be set to second regular register + 8*stride at the end 1255%macro READ_16x4_INTERLEAVED 12 1256 ; transpose 16 (A-P) rows of 4 pixels each 1257 lea %12, [r0+8*r2] 1258 1259 ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M 1260 movd m%1, [%8+%10*4] ; A0-3 1261 movd m%3, [%12+%10*4] ; I0-3 1262 movd m%2, [%8+%10*2] ; C0-3 1263 movd m%4, [%12+%10*2] ; K0-3 1264 movd m%6, [%8+%10] ; D0-3 1265 movd m%5, [%12+%10] ; L0-3 1266 movd m%7, [%12] ; M0-3 1267 add %12, %11 1268 punpcklbw m%1, m%3 ; A/I 1269 movd m%3, [%8] ; E0-3 1270 punpcklbw m%2, m%4 ; C/K 1271 punpcklbw m%6, m%5 ; D/L 1272 punpcklbw m%3, m%7 ; E/M 1273 punpcklbw m%2, m%6 ; C/D/K/L interleaved 1274 1275 ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P 1276 movd m%5, [%9+%10*4] ; B0-3 1277 movd m%4, [%12+%10*4] ; J0-3 1278 movd m%7, [%9] ; F0-3 1279 movd m%6, [%12] ; N0-3 1280 punpcklbw m%5, m%4 ; B/J 1281 punpcklbw m%7, m%6 ; F/N 1282 punpcklbw m%1, m%5 ; A/B/I/J interleaved 1283 punpcklbw m%3, m%7 ; E/F/M/N interleaved 1284 movd m%4, [%9+%11] ; G0-3 1285 movd m%6, [%12+%11] ; O0-3 1286 movd m%5, [%9+%11*2] ; H0-3 1287 movd m%7, [%12+%11*2] ; P0-3 1288 punpcklbw m%4, m%6 ; G/O 1289 punpcklbw m%5, m%7 ; H/P 1290 punpcklbw m%4, m%5 ; G/H/O/P interleaved 1291%endmacro 1292 1293; write 4 mm registers of 2 dwords each 1294; first four arguments are mm register indexes containing source data 1295; last four are registers containing buf+4*stride, buf+5*stride, 1296; -stride and +stride 1297%macro WRITE_4x2D 8 1298 ; write out (2 dwords per register) 1299 movd [%5+%7*4], m%1 1300 movd [%5+%7*2], m%2 1301 movd [%5], m%3 1302 movd [%6+%8], m%4 1303 punpckhdq m%1, m%1 1304 punpckhdq m%2, m%2 1305 punpckhdq m%3, m%3 1306 punpckhdq m%4, m%4 1307 movd [%6+%7*4], m%1 1308 movd [%5+%7], m%2 1309 movd [%6], m%3 1310 movd [%6+%8*2], m%4 1311%endmacro 1312 1313; write 4 xmm registers of 4 dwords each 1314; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular 1315; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride 1316; we add 1*stride to the third regular registry in the process 1317; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the 1318; same memory region), or 8 if they cover two separate buffers (third one points to 1319; a different memory region than the first two), allowing for more optimal code for 1320; the 16-width case 1321%macro WRITE_4x4D 10 1322 ; write out (4 dwords per register), start with dwords zero 1323 movd [%5+%8*4], m%1 1324 movd [%5], m%2 1325 movd [%7+%8*4], m%3 1326 movd [%7], m%4 1327 1328 ; store dwords 1 1329 psrldq m%1, 4 1330 psrldq m%2, 4 1331 psrldq m%3, 4 1332 psrldq m%4, 4 1333 movd [%6+%8*4], m%1 1334 movd [%6], m%2 1335%if %10 == 16 1336 movd [%6+%9*4], m%3 1337%endif 1338 movd [%7+%9], m%4 1339 1340 ; write dwords 2 1341 psrldq m%1, 4 1342 psrldq m%2, 4 1343%if %10 == 8 1344 movd [%5+%8*2], m%1 1345 movd %5d, m%3 1346%endif 1347 psrldq m%3, 4 1348 psrldq m%4, 4 1349%if %10 == 16 1350 movd [%5+%8*2], m%1 1351%endif 1352 movd [%6+%9], m%2 1353 movd [%7+%8*2], m%3 1354 movd [%7+%9*2], m%4 1355 add %7, %9 1356 1357 ; store dwords 3 1358 psrldq m%1, 4 1359 psrldq m%2, 4 1360 psrldq m%3, 4 1361 psrldq m%4, 4 1362%if %10 == 8 1363 mov [%7+%8*4], %5d 1364 movd [%6+%8*2], m%1 1365%else 1366 movd [%5+%8], m%1 1367%endif 1368 movd [%6+%9*2], m%2 1369 movd [%7+%8*2], m%3 1370 movd [%7+%9*2], m%4 1371%endmacro 1372 1373; write 4 or 8 words in the mmx/xmm registers as 8 lines 1374; 1 and 2 are the registers to write, this can be the same (for SSE2) 1375; for pre-SSE4: 1376; 3 is a general-purpose register that we will clobber 1377; for SSE4: 1378; 3 is a pointer to the destination's 5th line 1379; 4 is a pointer to the destination's 4th line 1380; 5/6 is -stride and +stride 1381%macro WRITE_2x4W 6 1382 movd %3d, %1 1383 punpckhdq %1, %1 1384 mov [%4+%5*4], %3w 1385 shr %3, 16 1386 add %4, %6 1387 mov [%4+%5*4], %3w 1388 1389 movd %3d, %1 1390 add %4, %5 1391 mov [%4+%5*2], %3w 1392 shr %3, 16 1393 mov [%4+%5 ], %3w 1394 1395 movd %3d, %2 1396 punpckhdq %2, %2 1397 mov [%4 ], %3w 1398 shr %3, 16 1399 mov [%4+%6 ], %3w 1400 1401 movd %3d, %2 1402 add %4, %6 1403 mov [%4+%6 ], %3w 1404 shr %3, 16 1405 mov [%4+%6*2], %3w 1406 add %4, %5 1407%endmacro 1408 1409%macro WRITE_8W_SSE2 5 1410 movd %2d, %1 1411 psrldq %1, 4 1412 mov [%3+%4*4], %2w 1413 shr %2, 16 1414 add %3, %5 1415 mov [%3+%4*4], %2w 1416 1417 movd %2d, %1 1418 psrldq %1, 4 1419 add %3, %4 1420 mov [%3+%4*2], %2w 1421 shr %2, 16 1422 mov [%3+%4 ], %2w 1423 1424 movd %2d, %1 1425 psrldq %1, 4 1426 mov [%3 ], %2w 1427 shr %2, 16 1428 mov [%3+%5 ], %2w 1429 1430 movd %2d, %1 1431 add %3, %5 1432 mov [%3+%5 ], %2w 1433 shr %2, 16 1434 mov [%3+%5*2], %2w 1435%endmacro 1436 1437%macro WRITE_8W_SSE4 5 1438 pextrw [%3+%4*4], %1, 0 1439 pextrw [%2+%4*4], %1, 1 1440 pextrw [%3+%4*2], %1, 2 1441 pextrw [%3+%4 ], %1, 3 1442 pextrw [%3 ], %1, 4 1443 pextrw [%2 ], %1, 5 1444 pextrw [%2+%5 ], %1, 6 1445 pextrw [%2+%5*2], %1, 7 1446%endmacro 1447 1448%macro SPLATB_REG_MMX 2-3 1449 movd %1, %2d 1450 punpcklbw %1, %1 1451 punpcklwd %1, %1 1452 punpckldq %1, %1 1453%endmacro 1454 1455%macro SPLATB_REG_MMXEXT 2-3 1456 movd %1, %2d 1457 punpcklbw %1, %1 1458 pshufw %1, %1, 0x0 1459%endmacro 1460 1461%macro SPLATB_REG_SSE2 2-3 1462 movd %1, %2d 1463 punpcklbw %1, %1 1464 pshuflw %1, %1, 0x0 1465 punpcklqdq %1, %1 1466%endmacro 1467 1468%macro SPLATB_REG_SSSE3 3 1469 movd %1, %2d 1470 pshufb %1, %3 1471%endmacro 1472 1473%macro SIMPLE_LOOPFILTER 4 1474cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4 1475%if mmsize == 8 ; mmx/mmxext 1476 mov r3, 2 1477%endif 1478%ifnidn %1, sse2 1479%if mmsize == 16 1480 pxor m0, m0 1481%endif 1482%endif 1483 SPLATB_REG m7, r2, m0 ; splat "flim" into register 1484 1485 ; set up indexes to address 4 rows 1486 mov r2, r1 1487 neg r1 1488%ifidn %2, h 1489 lea r0, [r0+4*r2-2] 1490%endif 1491 1492%if mmsize == 8 ; mmx / mmxext 1493.next8px 1494%endif 1495%ifidn %2, v 1496 ; read 4 half/full rows of pixels 1497 mova m0, [r0+r1*2] ; p1 1498 mova m1, [r0+r1] ; p0 1499 mova m2, [r0] ; q0 1500 mova m3, [r0+r2] ; q1 1501%else ; h 1502 lea r4, [r0+r2] 1503 1504%if mmsize == 8 ; mmx/mmxext 1505 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2 1506%else ; sse2 1507 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3 1508%endif 1509 TRANSPOSE4x4W 0, 1, 2, 3, 4 1510%endif 1511 1512 ; simple_limit 1513 mova m5, m2 ; m5=backup of q0 1514 mova m6, m1 ; m6=backup of p0 1515 psubusb m1, m2 ; p0-q0 1516 psubusb m2, m6 ; q0-p0 1517 por m1, m2 ; FFABS(p0-q0) 1518 paddusb m1, m1 ; m1=FFABS(p0-q0)*2 1519 1520 mova m4, m3 1521 mova m2, m0 1522 psubusb m3, m0 ; q1-p1 1523 psubusb m0, m4 ; p1-q1 1524 por m3, m0 ; FFABS(p1-q1) 1525 mova m0, [pb_80] 1526 pxor m2, m0 1527 pxor m4, m0 1528 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below 1529 pand m3, [pb_FE] 1530 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed 1531 paddusb m3, m1 1532 psubusb m3, m7 1533 pxor m1, m1 1534 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) 1535 1536 ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) 1537 mova m4, m5 1538 pxor m5, m0 1539 pxor m0, m6 1540 psubsb m5, m0 ; q0-p0 (signed) 1541 paddsb m2, m5 1542 paddsb m2, m5 1543 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) 1544 pand m2, m3 ; apply filter mask (m3) 1545 1546 mova m3, [pb_F8] 1547 mova m1, m2 1548 paddsb m2, [pb_4] ; f1<<3=a+4 1549 paddsb m1, [pb_3] ; f2<<3=a+3 1550 pand m2, m3 1551 pand m1, m3 ; cache f2<<3 1552 1553 pxor m0, m0 1554 pxor m3, m3 1555 pcmpgtb m0, m2 ; which values are <0? 1556 psubb m3, m2 ; -f1<<3 1557 psrlq m2, 3 ; +f1 1558 psrlq m3, 3 ; -f1 1559 pand m3, m0 1560 pandn m0, m2 1561 psubusb m4, m0 1562 paddusb m4, m3 ; q0-f1 1563 1564 pxor m0, m0 1565 pxor m3, m3 1566 pcmpgtb m0, m1 ; which values are <0? 1567 psubb m3, m1 ; -f2<<3 1568 psrlq m1, 3 ; +f2 1569 psrlq m3, 3 ; -f2 1570 pand m3, m0 1571 pandn m0, m1 1572 paddusb m6, m0 1573 psubusb m6, m3 ; p0+f2 1574 1575 ; store 1576%ifidn %2, v 1577 mova [r0], m4 1578 mova [r0+r1], m6 1579%else ; h 1580 inc r0 1581 SBUTTERFLY bw, 6, 4, 0 1582 1583%if mmsize == 16 ; sse2 1584%ifidn %1, sse4 1585 inc r4 1586%endif 1587 WRITE_8W m6, r4, r0, r1, r2 1588 lea r4, [r3+r1+1] 1589%ifidn %1, sse4 1590 inc r3 1591%endif 1592 WRITE_8W m4, r3, r4, r1, r2 1593%else ; mmx/mmxext 1594 WRITE_2x4W m6, m4, r4, r0, r1, r2 1595%endif 1596%endif 1597 1598%if mmsize == 8 ; mmx/mmxext 1599 ; next 8 pixels 1600%ifidn %2, v 1601 add r0, 8 ; advance 8 cols = pixels 1602%else ; h 1603 lea r0, [r0+r2*8-1] ; advance 8 rows = lines 1604%endif 1605 dec r3 1606 jg .next8px 1607 REP_RET 1608%else ; sse2 1609 RET 1610%endif 1611%endmacro 1612 1613INIT_MMX 1614%define SPLATB_REG SPLATB_REG_MMX 1615SIMPLE_LOOPFILTER mmx, v, 4, 0 1616SIMPLE_LOOPFILTER mmx, h, 5, 0 1617%define SPLATB_REG SPLATB_REG_MMXEXT 1618SIMPLE_LOOPFILTER mmxext, v, 4, 0 1619SIMPLE_LOOPFILTER mmxext, h, 5, 0 1620INIT_XMM 1621%define SPLATB_REG SPLATB_REG_SSE2 1622%define WRITE_8W WRITE_8W_SSE2 1623SIMPLE_LOOPFILTER sse2, v, 3, 8 1624SIMPLE_LOOPFILTER sse2, h, 5, 8 1625%define SPLATB_REG SPLATB_REG_SSSE3 1626SIMPLE_LOOPFILTER ssse3, v, 3, 8 1627SIMPLE_LOOPFILTER ssse3, h, 5, 8 1628%define WRITE_8W WRITE_8W_SSE4 1629SIMPLE_LOOPFILTER sse4, h, 5, 8 1630 1631;----------------------------------------------------------------------------- 1632; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, 1633; int flimE, int flimI, int hev_thr); 1634;----------------------------------------------------------------------------- 1635 1636%macro INNER_LOOPFILTER 5 1637%if %4 == 8 ; chroma 1638cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5 1639%define dst8_reg r1 1640%define mstride_reg r2 1641%define E_reg r3 1642%define I_reg r4 1643%define hev_thr_reg r5 1644%else ; luma 1645cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 1646%define mstride_reg r1 1647%define E_reg r2 1648%define I_reg r3 1649%define hev_thr_reg r4 1650%ifdef m8 ; x86-64, sse2 1651%define dst8_reg r4 1652%elif mmsize == 16 ; x86-32, sse2 1653%define dst8_reg r5 1654%else ; x86-32, mmx/mmxext 1655%define cnt_reg r5 1656%endif 1657%endif 1658%define dst_reg r0 1659%define stride_reg E_reg 1660%define dst2_reg I_reg 1661%ifndef m8 1662%define stack_reg hev_thr_reg 1663%endif 1664 1665%ifnidn %1, sse2 1666%if mmsize == 16 1667 pxor m7, m7 1668%endif 1669%endif 1670 1671%ifndef m8 ; mmx/mmxext or sse2 on x86-32 1672 ; splat function arguments 1673 SPLATB_REG m0, E_reg, m7 ; E 1674 SPLATB_REG m1, I_reg, m7 ; I 1675 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh 1676 1677 ; align stack 1678 mov stack_reg, rsp ; backup stack pointer 1679 and rsp, ~(mmsize-1) ; align stack 1680%ifidn %2, v 1681 sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr 1682 ; [3]=hev() result 1683%else ; h 1684 sub rsp, mmsize * 5 ; extra storage space for transposes 1685%endif 1686 1687%define flim_E [rsp] 1688%define flim_I [rsp+mmsize] 1689%define hev_thr [rsp+mmsize*2] 1690%define mask_res [rsp+mmsize*3] 1691%define p0backup [rsp+mmsize*3] 1692%define q0backup [rsp+mmsize*4] 1693 1694 mova flim_E, m0 1695 mova flim_I, m1 1696 mova hev_thr, m2 1697 1698%else ; sse2 on x86-64 1699 1700%define flim_E m9 1701%define flim_I m10 1702%define hev_thr m11 1703%define mask_res m12 1704%define p0backup m12 1705%define q0backup m8 1706 1707 ; splat function arguments 1708 SPLATB_REG flim_E, E_reg, m7 ; E 1709 SPLATB_REG flim_I, I_reg, m7 ; I 1710 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh 1711%endif 1712 1713%if mmsize == 8 && %4 == 16 ; mmx/mmxext 1714 mov cnt_reg, 2 1715%endif 1716 mov stride_reg, mstride_reg 1717 neg mstride_reg 1718%ifidn %2, h 1719 lea dst_reg, [dst_reg + stride_reg*4-4] 1720%if %4 == 8 1721 lea dst8_reg, [dst8_reg+ stride_reg*4-4] 1722%endif 1723%endif 1724 1725%if mmsize == 8 1726.next8px 1727%endif 1728 ; read 1729 lea dst2_reg, [dst_reg + stride_reg] 1730%ifidn %2, v 1731%if %4 == 8 && mmsize == 16 1732%define movrow movh 1733%else 1734%define movrow mova 1735%endif 1736 movrow m0, [dst_reg +mstride_reg*4] ; p3 1737 movrow m1, [dst2_reg+mstride_reg*4] ; p2 1738 movrow m2, [dst_reg +mstride_reg*2] ; p1 1739 movrow m5, [dst2_reg] ; q1 1740 movrow m6, [dst2_reg+ stride_reg] ; q2 1741 movrow m7, [dst2_reg+ stride_reg*2] ; q3 1742%if mmsize == 16 && %4 == 8 1743 movhps m0, [dst8_reg+mstride_reg*4] 1744 movhps m2, [dst8_reg+mstride_reg*2] 1745 add dst8_reg, stride_reg 1746 movhps m1, [dst8_reg+mstride_reg*4] 1747 movhps m5, [dst8_reg] 1748 movhps m6, [dst8_reg+ stride_reg] 1749 movhps m7, [dst8_reg+ stride_reg*2] 1750 add dst8_reg, mstride_reg 1751%endif 1752%elif mmsize == 8 ; mmx/mmxext (h) 1753 ; read 8 rows of 8px each 1754 movu m0, [dst_reg +mstride_reg*4] 1755 movu m1, [dst2_reg+mstride_reg*4] 1756 movu m2, [dst_reg +mstride_reg*2] 1757 movu m3, [dst_reg +mstride_reg] 1758 movu m4, [dst_reg] 1759 movu m5, [dst2_reg] 1760 movu m6, [dst2_reg+ stride_reg] 1761 1762 ; 8x8 transpose 1763 TRANSPOSE4x4B 0, 1, 2, 3, 7 1764 mova q0backup, m1 1765 movu m7, [dst2_reg+ stride_reg*2] 1766 TRANSPOSE4x4B 4, 5, 6, 7, 1 1767 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 1768 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 1769 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 1770 mova m1, q0backup 1771 mova q0backup, m2 ; store q0 1772 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 1773 mova p0backup, m5 ; store p0 1774 SWAP 1, 4 1775 SWAP 2, 4 1776 SWAP 6, 3 1777 SWAP 5, 3 1778%else ; sse2 (h) 1779%if %4 == 16 1780 lea dst8_reg, [dst_reg + stride_reg*8] 1781%endif 1782 1783 ; read 16 rows of 8px each, interleave 1784 movh m0, [dst_reg +mstride_reg*4] 1785 movh m1, [dst8_reg+mstride_reg*4] 1786 movh m2, [dst_reg +mstride_reg*2] 1787 movh m5, [dst8_reg+mstride_reg*2] 1788 movh m3, [dst_reg +mstride_reg] 1789 movh m6, [dst8_reg+mstride_reg] 1790 movh m4, [dst_reg] 1791 movh m7, [dst8_reg] 1792 punpcklbw m0, m1 ; A/I 1793 punpcklbw m2, m5 ; C/K 1794 punpcklbw m3, m6 ; D/L 1795 punpcklbw m4, m7 ; E/M 1796 1797 add dst8_reg, stride_reg 1798 movh m1, [dst2_reg+mstride_reg*4] 1799 movh m6, [dst8_reg+mstride_reg*4] 1800 movh m5, [dst2_reg] 1801 movh m7, [dst8_reg] 1802 punpcklbw m1, m6 ; B/J 1803 punpcklbw m5, m7 ; F/N 1804 movh m6, [dst2_reg+ stride_reg] 1805 movh m7, [dst8_reg+ stride_reg] 1806 punpcklbw m6, m7 ; G/O 1807 1808 ; 8x16 transpose 1809 TRANSPOSE4x4B 0, 1, 2, 3, 7 1810%ifdef m8 1811 SWAP 1, 8 1812%else 1813 mova q0backup, m1 1814%endif 1815 movh m7, [dst2_reg+ stride_reg*2] 1816 movh m1, [dst8_reg+ stride_reg*2] 1817 punpcklbw m7, m1 ; H/P 1818 TRANSPOSE4x4B 4, 5, 6, 7, 1 1819 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 1820 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 1821 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 1822%ifdef m8 1823 SWAP 1, 8 1824 SWAP 2, 8 1825%else 1826 mova m1, q0backup 1827 mova q0backup, m2 ; store q0 1828%endif 1829 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 1830%ifdef m12 1831 SWAP 5, 12 1832%else 1833 mova p0backup, m5 ; store p0 1834%endif 1835 SWAP 1, 4 1836 SWAP 2, 4 1837 SWAP 6, 3 1838 SWAP 5, 3 1839%endif 1840 1841 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 1842 mova m4, m1 1843 SWAP 4, 1 1844 psubusb m4, m0 ; p2-p3 1845 psubusb m0, m1 ; p3-p2 1846 por m0, m4 ; abs(p3-p2) 1847 1848 mova m4, m2 1849 SWAP 4, 2 1850 psubusb m4, m1 ; p1-p2 1851 psubusb m1, m2 ; p2-p1 1852 por m1, m4 ; abs(p2-p1) 1853 1854 mova m4, m6 1855 SWAP 4, 6 1856 psubusb m4, m7 ; q2-q3 1857 psubusb m7, m6 ; q3-q2 1858 por m7, m4 ; abs(q3-q2) 1859 1860 mova m4, m5 1861 SWAP 4, 5 1862 psubusb m4, m6 ; q1-q2 1863 psubusb m6, m5 ; q2-q1 1864 por m6, m4 ; abs(q2-q1) 1865 1866%ifidn %1, mmx 1867 mova m4, flim_I 1868 pxor m3, m3 1869 psubusb m0, m4 1870 psubusb m1, m4 1871 psubusb m7, m4 1872 psubusb m6, m4 1873 pcmpeqb m0, m3 ; abs(p3-p2) <= I 1874 pcmpeqb m1, m3 ; abs(p2-p1) <= I 1875 pcmpeqb m7, m3 ; abs(q3-q2) <= I 1876 pcmpeqb m6, m3 ; abs(q2-q1) <= I 1877 pand m0, m1 1878 pand m7, m6 1879 pand m0, m7 1880%else ; mmxext/sse2 1881 pmaxub m0, m1 1882 pmaxub m6, m7 1883 pmaxub m0, m6 1884%endif 1885 1886 ; normal_limit and high_edge_variance for p1-p0, q1-q0 1887 SWAP 7, 3 ; now m7 is zero 1888%ifidn %2, v 1889 movrow m3, [dst_reg +mstride_reg] ; p0 1890%if mmsize == 16 && %4 == 8 1891 movhps m3, [dst8_reg+mstride_reg] 1892%endif 1893%elifdef m12 1894 SWAP 3, 12 1895%else 1896 mova m3, p0backup 1897%endif 1898 1899 mova m1, m2 1900 SWAP 1, 2 1901 mova m6, m3 1902 SWAP 3, 6 1903 psubusb m1, m3 ; p1-p0 1904 psubusb m6, m2 ; p0-p1 1905 por m1, m6 ; abs(p1-p0) 1906%ifidn %1, mmx 1907 mova m6, m1 1908 psubusb m1, m4 1909 psubusb m6, hev_thr 1910 pcmpeqb m1, m7 ; abs(p1-p0) <= I 1911 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh 1912 pand m0, m1 1913 mova mask_res, m6 1914%else ; mmxext/sse2 1915 pmaxub m0, m1 ; max_I 1916 SWAP 1, 4 ; max_hev_thresh 1917%endif 1918 1919 SWAP 6, 4 ; now m6 is I 1920%ifidn %2, v 1921 movrow m4, [dst_reg] ; q0 1922%if mmsize == 16 && %4 == 8 1923 movhps m4, [dst8_reg] 1924%endif 1925%elifdef m8 1926 SWAP 4, 8 1927%else 1928 mova m4, q0backup 1929%endif 1930 mova m1, m4 1931 SWAP 1, 4 1932 mova m7, m5 1933 SWAP 7, 5 1934 psubusb m1, m5 ; q0-q1 1935 psubusb m7, m4 ; q1-q0 1936 por m1, m7 ; abs(q1-q0) 1937%ifidn %1, mmx 1938 mova m7, m1 1939 psubusb m1, m6 1940 psubusb m7, hev_thr 1941 pxor m6, m6 1942 pcmpeqb m1, m6 ; abs(q1-q0) <= I 1943 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh 1944 mova m6, mask_res 1945 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I 1946 pand m6, m7 1947%else ; mmxext/sse2 1948 pxor m7, m7 1949 pmaxub m0, m1 1950 pmaxub m6, m1 1951 psubusb m0, flim_I 1952 psubusb m6, hev_thr 1953 pcmpeqb m0, m7 ; max(abs(..)) <= I 1954 pcmpeqb m6, m7 ; !(max(abs..) > thresh) 1955%endif 1956%ifdef m12 1957 SWAP 6, 12 1958%else 1959 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) 1960%endif 1961 1962 ; simple_limit 1963 mova m1, m3 1964 SWAP 1, 3 1965 mova m6, m4 ; keep copies of p0/q0 around for later use 1966 SWAP 6, 4 1967 psubusb m1, m4 ; p0-q0 1968 psubusb m6, m3 ; q0-p0 1969 por m1, m6 ; abs(q0-p0) 1970 paddusb m1, m1 ; m1=2*abs(q0-p0) 1971 1972 mova m7, m2 1973 SWAP 7, 2 1974 mova m6, m5 1975 SWAP 6, 5 1976 psubusb m7, m5 ; p1-q1 1977 psubusb m6, m2 ; q1-p1 1978 por m7, m6 ; abs(q1-p1) 1979 pxor m6, m6 1980 pand m7, [pb_FE] 1981 psrlq m7, 1 ; abs(q1-p1)/2 1982 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 1983 psubusb m7, flim_E 1984 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E 1985 pand m0, m7 ; normal_limit result 1986 1987 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask 1988%ifdef m8 ; x86-64 && sse2 1989 mova m8, [pb_80] 1990%define pb_80_var m8 1991%else ; x86-32 or mmx/mmxext 1992%define pb_80_var [pb_80] 1993%endif 1994 mova m1, m4 1995 mova m7, m3 1996 pxor m1, pb_80_var 1997 pxor m7, pb_80_var 1998 psubsb m1, m7 ; (signed) q0-p0 1999 mova m6, m2 2000 mova m7, m5 2001 pxor m6, pb_80_var 2002 pxor m7, pb_80_var 2003 psubsb m6, m7 ; (signed) p1-q1 2004 mova m7, mask_res 2005 pandn m7, m6 2006 paddsb m7, m1 2007 paddsb m7, m1 2008 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1) 2009 2010 pand m7, m0 2011 mova m1, [pb_F8] 2012 mova m6, m7 2013 paddsb m7, [pb_3] 2014 paddsb m6, [pb_4] 2015 pand m7, m1 2016 pand m6, m1 2017 2018 pxor m1, m1 2019 pxor m0, m0 2020 pcmpgtb m1, m7 2021 psubb m0, m7 2022 psrlq m7, 3 ; +f2 2023 psrlq m0, 3 ; -f2 2024 pand m0, m1 2025 pandn m1, m7 2026 psubusb m3, m0 2027 paddusb m3, m1 ; p0+f2 2028 2029 pxor m1, m1 2030 pxor m0, m0 2031 pcmpgtb m0, m6 2032 psubb m1, m6 2033 psrlq m6, 3 ; +f1 2034 psrlq m1, 3 ; -f1 2035 pand m1, m0 2036 pandn m0, m6 2037 psubusb m4, m0 2038 paddusb m4, m1 ; q0-f1 2039 2040%ifdef m12 2041 SWAP 6, 12 2042%else 2043 mova m6, mask_res 2044%endif 2045%ifidn %1, mmx 2046 mova m7, [pb_1] 2047%else ; mmxext/sse2 2048 pxor m7, m7 2049%endif 2050 pand m0, m6 2051 pand m1, m6 2052%ifidn %1, mmx 2053 paddusb m0, m7 2054 pand m1, [pb_FE] 2055 pandn m7, m0 2056 psrlq m1, 1 2057 psrlq m7, 1 2058 SWAP 0, 7 2059%else ; mmxext/sse2 2060 psubusb m1, [pb_1] 2061 pavgb m0, m7 ; a 2062 pavgb m1, m7 ; -a 2063%endif 2064 psubusb m5, m0 2065 psubusb m2, m1 2066 paddusb m5, m1 ; q1-a 2067 paddusb m2, m0 ; p1+a 2068 2069 ; store 2070%ifidn %2, v 2071 movrow [dst_reg +mstride_reg*2], m2 2072 movrow [dst_reg +mstride_reg ], m3 2073 movrow [dst_reg], m4 2074 movrow [dst_reg + stride_reg ], m5 2075%if mmsize == 16 && %4 == 8 2076 movhps [dst8_reg+mstride_reg*2], m2 2077 movhps [dst8_reg+mstride_reg ], m3 2078 movhps [dst8_reg], m4 2079 movhps [dst8_reg+ stride_reg ], m5 2080%endif 2081%else ; h 2082 add dst_reg, 2 2083 add dst2_reg, 2 2084 2085 ; 4x8/16 transpose 2086 TRANSPOSE4x4B 2, 3, 4, 5, 6 2087 2088%if mmsize == 8 ; mmx/mmxext (h) 2089 WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg 2090%else ; sse2 (h) 2091 lea dst8_reg, [dst8_reg+mstride_reg+2] 2092 WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 2093%endif 2094%endif 2095 2096%if mmsize == 8 2097%if %4 == 8 ; chroma 2098%ifidn %2, h 2099 sub dst_reg, 2 2100%endif 2101 cmp dst_reg, dst8_reg 2102 mov dst_reg, dst8_reg 2103 jnz .next8px 2104%else 2105%ifidn %2, h 2106 lea dst_reg, [dst_reg + stride_reg*8-2] 2107%else ; v 2108 add dst_reg, 8 2109%endif 2110 dec cnt_reg 2111 jg .next8px 2112%endif 2113%endif 2114 2115%ifndef m8 ; sse2 on x86-32 or mmx/mmxext 2116 mov rsp, stack_reg ; restore stack pointer 2117%endif 2118 RET 2119%endmacro 2120 2121INIT_MMX 2122%define SPLATB_REG SPLATB_REG_MMX 2123INNER_LOOPFILTER mmx, v, 6, 16, 0 2124INNER_LOOPFILTER mmx, h, 6, 16, 0 2125INNER_LOOPFILTER mmx, v, 6, 8, 0 2126INNER_LOOPFILTER mmx, h, 6, 8, 0 2127 2128%define SPLATB_REG SPLATB_REG_MMXEXT 2129INNER_LOOPFILTER mmxext, v, 6, 16, 0 2130INNER_LOOPFILTER mmxext, h, 6, 16, 0 2131INNER_LOOPFILTER mmxext, v, 6, 8, 0 2132INNER_LOOPFILTER mmxext, h, 6, 8, 0 2133 2134INIT_XMM 2135%define SPLATB_REG SPLATB_REG_SSE2 2136INNER_LOOPFILTER sse2, v, 5, 16, 13 2137%ifdef m8 2138INNER_LOOPFILTER sse2, h, 5, 16, 13 2139%else 2140INNER_LOOPFILTER sse2, h, 6, 16, 13 2141%endif 2142INNER_LOOPFILTER sse2, v, 6, 8, 13 2143INNER_LOOPFILTER sse2, h, 6, 8, 13 2144 2145%define SPLATB_REG SPLATB_REG_SSSE3 2146INNER_LOOPFILTER ssse3, v, 5, 16, 13 2147%ifdef m8 2148INNER_LOOPFILTER ssse3, h, 5, 16, 13 2149%else 2150INNER_LOOPFILTER ssse3, h, 6, 16, 13 2151%endif 2152INNER_LOOPFILTER ssse3, v, 6, 8, 13 2153INNER_LOOPFILTER ssse3, h, 6, 8, 13 2154 2155;----------------------------------------------------------------------------- 2156; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, 2157; int flimE, int flimI, int hev_thr); 2158;----------------------------------------------------------------------------- 2159 2160%macro MBEDGE_LOOPFILTER 5 2161%if %4 == 8 ; chroma 2162cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5 2163%define dst8_reg r1 2164%define mstride_reg r2 2165%define E_reg r3 2166%define I_reg r4 2167%define hev_thr_reg r5 2168%else ; luma 2169cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 2170%define mstride_reg r1 2171%define E_reg r2 2172%define I_reg r3 2173%define hev_thr_reg r4 2174%ifdef m8 ; x86-64, sse2 2175%define dst8_reg r4 2176%elif mmsize == 16 ; x86-32, sse2 2177%define dst8_reg r5 2178%else ; x86-32, mmx/mmxext 2179%define cnt_reg r5 2180%endif 2181%endif 2182%define dst_reg r0 2183%define stride_reg E_reg 2184%define dst2_reg I_reg 2185%ifndef m8 2186%define stack_reg hev_thr_reg 2187%endif 2188 2189%define ssse3_or_higher 0 2190%ifnidn %1, sse2 2191%if mmsize == 16 2192%define ssse3_or_higher 1 2193%endif 2194%endif 2195 2196%if ssse3_or_higher 2197 pxor m7, m7 2198%endif 2199 2200%ifndef m8 ; mmx/mmxext or sse2 on x86-32 2201 ; splat function arguments 2202 SPLATB_REG m0, E_reg, m7 ; E 2203 SPLATB_REG m1, I_reg, m7 ; I 2204 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh 2205 2206 ; align stack 2207 mov stack_reg, rsp ; backup stack pointer 2208 and rsp, ~(mmsize-1) ; align stack 2209%if mmsize == 16 2210 sub rsp, mmsize * 7 2211%else 2212 sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr 2213 ; [3]=hev() result 2214 ; [4]=filter tmp result 2215 ; [5]/[6] = p2/q2 backup 2216 ; [7]=lim_res sign result 2217%endif 2218 2219%define flim_E [rsp] 2220%define flim_I [rsp+mmsize] 2221%define hev_thr [rsp+mmsize*2] 2222%define mask_res [rsp+mmsize*3] 2223%define lim_res [rsp+mmsize*4] 2224%define p0backup [rsp+mmsize*3] 2225%define q0backup [rsp+mmsize*4] 2226%define p2backup [rsp+mmsize*5] 2227%define q2backup [rsp+mmsize*6] 2228%if mmsize == 16 2229%define lim_sign [rsp] 2230%else 2231%define lim_sign [rsp+mmsize*7] 2232%endif 2233 2234 mova flim_E, m0 2235 mova flim_I, m1 2236 mova hev_thr, m2 2237 2238%else ; sse2 on x86-64 2239 2240%define flim_E m9 2241%define flim_I m10 2242%define hev_thr m11 2243%define mask_res m12 2244%define lim_res m8 2245%define p0backup m12 2246%define q0backup m8 2247%define p2backup m13 2248%define q2backup m14 2249%define lim_sign m9 2250 2251 ; splat function arguments 2252 SPLATB_REG flim_E, E_reg, m7 ; E 2253 SPLATB_REG flim_I, I_reg, m7 ; I 2254 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh 2255%endif 2256 2257%if mmsize == 8 && %4 == 16 ; mmx/mmxext 2258 mov cnt_reg, 2 2259%endif 2260 mov stride_reg, mstride_reg 2261 neg mstride_reg 2262%ifidn %2, h 2263 lea dst_reg, [dst_reg + stride_reg*4-4] 2264%if %4 == 8 2265 lea dst8_reg, [dst8_reg+ stride_reg*4-4] 2266%endif 2267%endif 2268 2269%if mmsize == 8 2270.next8px 2271%endif 2272 ; read 2273 lea dst2_reg, [dst_reg + stride_reg] 2274%ifidn %2, v 2275%if %4 == 8 && mmsize == 16 2276%define movrow movh 2277%else 2278%define movrow mova 2279%endif 2280 movrow m0, [dst_reg +mstride_reg*4] ; p3 2281 movrow m1, [dst2_reg+mstride_reg*4] ; p2 2282 movrow m2, [dst_reg +mstride_reg*2] ; p1 2283 movrow m5, [dst2_reg] ; q1 2284 movrow m6, [dst2_reg+ stride_reg] ; q2 2285 movrow m7, [dst2_reg+ stride_reg*2] ; q3 2286%if mmsize == 16 && %4 == 8 2287 movhps m0, [dst8_reg+mstride_reg*4] 2288 movhps m2, [dst8_reg+mstride_reg*2] 2289 add dst8_reg, stride_reg 2290 movhps m1, [dst8_reg+mstride_reg*4] 2291 movhps m5, [dst8_reg] 2292 movhps m6, [dst8_reg+ stride_reg] 2293 movhps m7, [dst8_reg+ stride_reg*2] 2294 add dst8_reg, mstride_reg 2295%endif 2296%elif mmsize == 8 ; mmx/mmxext (h) 2297 ; read 8 rows of 8px each 2298 movu m0, [dst_reg +mstride_reg*4] 2299 movu m1, [dst2_reg+mstride_reg*4] 2300 movu m2, [dst_reg +mstride_reg*2] 2301 movu m3, [dst_reg +mstride_reg] 2302 movu m4, [dst_reg] 2303 movu m5, [dst2_reg] 2304 movu m6, [dst2_reg+ stride_reg] 2305 2306 ; 8x8 transpose 2307 TRANSPOSE4x4B 0, 1, 2, 3, 7 2308 mova q0backup, m1 2309 movu m7, [dst2_reg+ stride_reg*2] 2310 TRANSPOSE4x4B 4, 5, 6, 7, 1 2311 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 2312 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 2313 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 2314 mova m1, q0backup 2315 mova q0backup, m2 ; store q0 2316 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 2317 mova p0backup, m5 ; store p0 2318 SWAP 1, 4 2319 SWAP 2, 4 2320 SWAP 6, 3 2321 SWAP 5, 3 2322%else ; sse2 (h) 2323%if %4 == 16 2324 lea dst8_reg, [dst_reg + stride_reg*8] 2325%endif 2326 2327 ; read 16 rows of 8px each, interleave 2328 movh m0, [dst_reg +mstride_reg*4] 2329 movh m1, [dst8_reg+mstride_reg*4] 2330 movh m2, [dst_reg +mstride_reg*2] 2331 movh m5, [dst8_reg+mstride_reg*2] 2332 movh m3, [dst_reg +mstride_reg] 2333 movh m6, [dst8_reg+mstride_reg] 2334 movh m4, [dst_reg] 2335 movh m7, [dst8_reg] 2336 punpcklbw m0, m1 ; A/I 2337 punpcklbw m2, m5 ; C/K 2338 punpcklbw m3, m6 ; D/L 2339 punpcklbw m4, m7 ; E/M 2340 2341 add dst8_reg, stride_reg 2342 movh m1, [dst2_reg+mstride_reg*4] 2343 movh m6, [dst8_reg+mstride_reg*4] 2344 movh m5, [dst2_reg] 2345 movh m7, [dst8_reg] 2346 punpcklbw m1, m6 ; B/J 2347 punpcklbw m5, m7 ; F/N 2348 movh m6, [dst2_reg+ stride_reg] 2349 movh m7, [dst8_reg+ stride_reg] 2350 punpcklbw m6, m7 ; G/O 2351 2352 ; 8x16 transpose 2353 TRANSPOSE4x4B 0, 1, 2, 3, 7 2354%ifdef m8 2355 SWAP 1, 8 2356%else 2357 mova q0backup, m1 2358%endif 2359 movh m7, [dst2_reg+ stride_reg*2] 2360 movh m1, [dst8_reg+ stride_reg*2] 2361 punpcklbw m7, m1 ; H/P 2362 TRANSPOSE4x4B 4, 5, 6, 7, 1 2363 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 2364 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 2365 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 2366%ifdef m8 2367 SWAP 1, 8 2368 SWAP 2, 8 2369%else 2370 mova m1, q0backup 2371 mova q0backup, m2 ; store q0 2372%endif 2373 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 2374%ifdef m12 2375 SWAP 5, 12 2376%else 2377 mova p0backup, m5 ; store p0 2378%endif 2379 SWAP 1, 4 2380 SWAP 2, 4 2381 SWAP 6, 3 2382 SWAP 5, 3 2383%endif 2384 2385 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 2386 mova m4, m1 2387 SWAP 4, 1 2388 psubusb m4, m0 ; p2-p3 2389 psubusb m0, m1 ; p3-p2 2390 por m0, m4 ; abs(p3-p2) 2391 2392 mova m4, m2 2393 SWAP 4, 2 2394 psubusb m4, m1 ; p1-p2 2395 mova p2backup, m1 2396 psubusb m1, m2 ; p2-p1 2397 por m1, m4 ; abs(p2-p1) 2398 2399 mova m4, m6 2400 SWAP 4, 6 2401 psubusb m4, m7 ; q2-q3 2402 psubusb m7, m6 ; q3-q2 2403 por m7, m4 ; abs(q3-q2) 2404 2405 mova m4, m5 2406 SWAP 4, 5 2407 psubusb m4, m6 ; q1-q2 2408 mova q2backup, m6 2409 psubusb m6, m5 ; q2-q1 2410 por m6, m4 ; abs(q2-q1) 2411 2412%ifidn %1, mmx 2413 mova m4, flim_I 2414 pxor m3, m3 2415 psubusb m0, m4 2416 psubusb m1, m4 2417 psubusb m7, m4 2418 psubusb m6, m4 2419 pcmpeqb m0, m3 ; abs(p3-p2) <= I 2420 pcmpeqb m1, m3 ; abs(p2-p1) <= I 2421 pcmpeqb m7, m3 ; abs(q3-q2) <= I 2422 pcmpeqb m6, m3 ; abs(q2-q1) <= I 2423 pand m0, m1 2424 pand m7, m6 2425 pand m0, m7 2426%else ; mmxext/sse2 2427 pmaxub m0, m1 2428 pmaxub m6, m7 2429 pmaxub m0, m6 2430%endif 2431 2432 ; normal_limit and high_edge_variance for p1-p0, q1-q0 2433 SWAP 7, 3 ; now m7 is zero 2434%ifidn %2, v 2435 movrow m3, [dst_reg +mstride_reg] ; p0 2436%if mmsize == 16 && %4 == 8 2437 movhps m3, [dst8_reg+mstride_reg] 2438%endif 2439%elifdef m12 2440 SWAP 3, 12 2441%else 2442 mova m3, p0backup 2443%endif 2444 2445 mova m1, m2 2446 SWAP 1, 2 2447 mova m6, m3 2448 SWAP 3, 6 2449 psubusb m1, m3 ; p1-p0 2450 psubusb m6, m2 ; p0-p1 2451 por m1, m6 ; abs(p1-p0) 2452%ifidn %1, mmx 2453 mova m6, m1 2454 psubusb m1, m4 2455 psubusb m6, hev_thr 2456 pcmpeqb m1, m7 ; abs(p1-p0) <= I 2457 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh 2458 pand m0, m1 2459 mova mask_res, m6 2460%else ; mmxext/sse2 2461 pmaxub m0, m1 ; max_I 2462 SWAP 1, 4 ; max_hev_thresh 2463%endif 2464 2465 SWAP 6, 4 ; now m6 is I 2466%ifidn %2, v 2467 movrow m4, [dst_reg] ; q0 2468%if mmsize == 16 && %4 == 8 2469 movhps m4, [dst8_reg] 2470%endif 2471%elifdef m8 2472 SWAP 4, 8 2473%else 2474 mova m4, q0backup 2475%endif 2476 mova m1, m4 2477 SWAP 1, 4 2478 mova m7, m5 2479 SWAP 7, 5 2480 psubusb m1, m5 ; q0-q1 2481 psubusb m7, m4 ; q1-q0 2482 por m1, m7 ; abs(q1-q0) 2483%ifidn %1, mmx 2484 mova m7, m1 2485 psubusb m1, m6 2486 psubusb m7, hev_thr 2487 pxor m6, m6 2488 pcmpeqb m1, m6 ; abs(q1-q0) <= I 2489 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh 2490 mova m6, mask_res 2491 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I 2492 pand m6, m7 2493%else ; mmxext/sse2 2494 pxor m7, m7 2495 pmaxub m0, m1 2496 pmaxub m6, m1 2497 psubusb m0, flim_I 2498 psubusb m6, hev_thr 2499 pcmpeqb m0, m7 ; max(abs(..)) <= I 2500 pcmpeqb m6, m7 ; !(max(abs..) > thresh) 2501%endif 2502%ifdef m12 2503 SWAP 6, 12 2504%else 2505 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) 2506%endif 2507 2508 ; simple_limit 2509 mova m1, m3 2510 SWAP 1, 3 2511 mova m6, m4 ; keep copies of p0/q0 around for later use 2512 SWAP 6, 4 2513 psubusb m1, m4 ; p0-q0 2514 psubusb m6, m3 ; q0-p0 2515 por m1, m6 ; abs(q0-p0) 2516 paddusb m1, m1 ; m1=2*abs(q0-p0) 2517 2518 mova m7, m2 2519 SWAP 7, 2 2520 mova m6, m5 2521 SWAP 6, 5 2522 psubusb m7, m5 ; p1-q1 2523 psubusb m6, m2 ; q1-p1 2524 por m7, m6 ; abs(q1-p1) 2525 pxor m6, m6 2526 pand m7, [pb_FE] 2527 psrlq m7, 1 ; abs(q1-p1)/2 2528 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 2529 psubusb m7, flim_E 2530 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E 2531 pand m0, m7 ; normal_limit result 2532 2533 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask 2534%ifdef m8 ; x86-64 && sse2 2535 mova m8, [pb_80] 2536%define pb_80_var m8 2537%else ; x86-32 or mmx/mmxext 2538%define pb_80_var [pb_80] 2539%endif 2540 mova m1, m4 2541 mova m7, m3 2542 pxor m1, pb_80_var 2543 pxor m7, pb_80_var 2544 psubsb m1, m7 ; (signed) q0-p0 2545 mova m6, m2 2546 mova m7, m5 2547 pxor m6, pb_80_var 2548 pxor m7, pb_80_var 2549 psubsb m6, m7 ; (signed) p1-q1 2550 mova m7, mask_res 2551 paddsb m6, m1 2552 paddsb m6, m1 2553 paddsb m6, m1 2554 pand m6, m0 2555%ifdef m8 2556 mova lim_res, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge 2557 pand lim_res, m7 2558%else 2559 mova m0, m6 2560 pand m0, m7 2561 mova lim_res, m0 2562%endif 2563 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common 2564 2565 mova m1, [pb_F8] 2566 mova m6, m7 2567 paddsb m7, [pb_3] 2568 paddsb m6, [pb_4] 2569 pand m7, m1 2570 pand m6, m1 2571 2572 pxor m1, m1 2573 pxor m0, m0 2574 pcmpgtb m1, m7 2575 psubb m0, m7 2576 psrlq m7, 3 ; +f2 2577 psrlq m0, 3 ; -f2 2578 pand m0, m1 2579 pandn m1, m7 2580 psubusb m3, m0 2581 paddusb m3, m1 ; p0+f2 2582 2583 pxor m1, m1 2584 pxor m0, m0 2585 pcmpgtb m0, m6 2586 psubb m1, m6 2587 psrlq m6, 3 ; +f1 2588 psrlq m1, 3 ; -f1 2589 pand m1, m0 2590 pandn m0, m6 2591 psubusb m4, m0 2592 paddusb m4, m1 ; q0-f1 2593 2594 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) 2595%if ssse3_or_higher 2596 mova m7, [pb_1] 2597%else 2598 mova m7, [pw_63] 2599%endif 2600%ifdef m8 2601 SWAP 1, 8 2602%else 2603 mova m1, lim_res 2604%endif 2605 pxor m0, m0 2606 mova m6, m1 2607 pcmpgtb m0, m1 ; which are negative 2608%if ssse3_or_higher 2609 punpcklbw m6, m7 ; interleave with "1" for rounding 2610 punpckhbw m1, m7 2611%else 2612 punpcklbw m6, m0 ; signed byte->word 2613 punpckhbw m1, m0 2614%endif 2615 mova lim_sign, m0 2616%if ssse3_or_higher 2617 mova m7, [pb_27_63] 2618%ifndef m8 2619 mova lim_res, m1 2620%endif 2621%ifdef m10 2622 SWAP 0, 10 ; don't lose lim_sign copy 2623%endif 2624 mova m0, m7 2625 pmaddubsw m7, m6 2626 SWAP 6, 7 2627 pmaddubsw m0, m1 2628 SWAP 1, 0 2629%ifdef m10 2630 SWAP 0, 10 2631%else 2632 mova m0, lim_sign 2633%endif 2634%else 2635 mova mask_res, m6 ; backup for later in filter 2636 mova lim_res, m1 2637 pmullw m6, [pw_27] 2638 pmullw m1, [pw_27] 2639 paddw m6, m7 2640 paddw m1, m7 2641%endif 2642 psraw m6, 7 2643 psraw m1, 7 2644 packsswb m6, m1 ; a0 2645 pxor m1, m1 2646 psubb m1, m6 2647 pand m1, m0 ; -a0 2648 pandn m0, m6 ; +a0 2649%if ssse3_or_higher 2650 mova m6, [pb_18_63] ; pipelining 2651%endif 2652 psubusb m3, m1 2653 paddusb m4, m1 2654 paddusb m3, m0 ; p0+a0 2655 psubusb m4, m0 ; q0-a0 2656 2657%if ssse3_or_higher 2658 SWAP 6, 7 2659%ifdef m10 2660 SWAP 1, 10 2661%else 2662 mova m1, lim_res 2663%endif 2664 mova m0, m7 2665 pmaddubsw m7, m6 2666 SWAP 6, 7 2667 pmaddubsw m0, m1 2668 SWAP 1, 0 2669%ifdef m10 2670 SWAP 0, 10 2671%endif 2672 mova m0, lim_sign 2673%else 2674 mova m6, mask_res 2675 mova m1, lim_res 2676 pmullw m6, [pw_18] 2677 pmullw m1, [pw_18] 2678 paddw m6, m7 2679 paddw m1, m7 2680%endif 2681 mova m0, lim_sign 2682 psraw m6, 7 2683 psraw m1, 7 2684 packsswb m6, m1 ; a1 2685 pxor m1, m1 2686 psubb m1, m6 2687 pand m1, m0 ; -a1 2688 pandn m0, m6 ; +a1 2689%if ssse3_or_higher 2690 mova m6, [pb_9_63] 2691%endif 2692 psubusb m2, m1 2693 paddusb m5, m1 2694 paddusb m2, m0 ; p1+a1 2695 psubusb m5, m0 ; q1-a1 2696 2697%if ssse3_or_higher 2698 SWAP 6, 7 2699%ifdef m10 2700 SWAP 1, 10 2701%else 2702 mova m1, lim_res 2703%endif 2704 mova m0, m7 2705 pmaddubsw m7, m6 2706 SWAP 6, 7 2707 pmaddubsw m0, m1 2708 SWAP 1, 0 2709%else 2710%ifdef m8 2711 SWAP 6, 12 2712 SWAP 1, 8 2713%else 2714 mova m6, mask_res 2715 mova m1, lim_res 2716%endif 2717 pmullw m6, [pw_9] 2718 pmullw m1, [pw_9] 2719 paddw m6, m7 2720 paddw m1, m7 2721%endif 2722%ifdef m9 2723 SWAP 7, 9 2724%else 2725 mova m7, lim_sign 2726%endif 2727 psraw m6, 7 2728 psraw m1, 7 2729 packsswb m6, m1 ; a1 2730 pxor m0, m0 2731 psubb m0, m6 2732 pand m0, m7 ; -a1 2733 pandn m7, m6 ; +a1 2734%ifdef m8 2735 SWAP 1, 13 2736 SWAP 6, 14 2737%else 2738 mova m1, p2backup 2739 mova m6, q2backup 2740%endif 2741 psubusb m1, m0 2742 paddusb m6, m0 2743 paddusb m1, m7 ; p1+a1 2744 psubusb m6, m7 ; q1-a1 2745 2746 ; store 2747%ifidn %2, v 2748 movrow [dst2_reg+mstride_reg*4], m1 2749 movrow [dst_reg +mstride_reg*2], m2 2750 movrow [dst_reg +mstride_reg ], m3 2751 movrow [dst_reg], m4 2752 movrow [dst2_reg], m5 2753 movrow [dst2_reg+ stride_reg ], m6 2754%if mmsize == 16 && %4 == 8 2755 add dst8_reg, mstride_reg 2756 movhps [dst8_reg+mstride_reg*2], m1 2757 movhps [dst8_reg+mstride_reg ], m2 2758 movhps [dst8_reg], m3 2759 add dst8_reg, stride_reg 2760 movhps [dst8_reg], m4 2761 movhps [dst8_reg+ stride_reg ], m5 2762 movhps [dst8_reg+ stride_reg*2], m6 2763%endif 2764%else ; h 2765 inc dst_reg 2766 inc dst2_reg 2767 2768 ; 4x8/16 transpose 2769 TRANSPOSE4x4B 1, 2, 3, 4, 0 2770 SBUTTERFLY bw, 5, 6, 0 2771 2772%if mmsize == 8 ; mmx/mmxext (h) 2773 WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg 2774 add dst_reg, 4 2775 WRITE_2x4W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg 2776%else ; sse2 (h) 2777 lea dst8_reg, [dst8_reg+mstride_reg+1] 2778 WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 2779 lea dst_reg, [dst2_reg+mstride_reg+4] 2780 lea dst8_reg, [dst8_reg+mstride_reg+4] 2781%ifidn %1, sse4 2782 add dst2_reg, 4 2783%endif 2784 WRITE_8W m5, dst2_reg, dst_reg, mstride_reg, stride_reg 2785%ifidn %1, sse4 2786 lea dst2_reg, [dst8_reg+ stride_reg] 2787%endif 2788 WRITE_8W m6, dst2_reg, dst8_reg, mstride_reg, stride_reg 2789%endif 2790%endif 2791 2792%if mmsize == 8 2793%if %4 == 8 ; chroma 2794%ifidn %2, h 2795 sub dst_reg, 5 2796%endif 2797 cmp dst_reg, dst8_reg 2798 mov dst_reg, dst8_reg 2799 jnz .next8px 2800%else 2801%ifidn %2, h 2802 lea dst_reg, [dst_reg + stride_reg*8-5] 2803%else ; v 2804 add dst_reg, 8 2805%endif 2806 dec cnt_reg 2807 jg .next8px 2808%endif 2809%endif 2810 2811%ifndef m8 ; sse2 on x86-32 or mmx/mmxext 2812 mov rsp, stack_reg ; restore stack pointer 2813%endif 2814 RET 2815%endmacro 2816 2817INIT_MMX 2818%define SPLATB_REG SPLATB_REG_MMX 2819MBEDGE_LOOPFILTER mmx, v, 6, 16, 0 2820MBEDGE_LOOPFILTER mmx, h, 6, 16, 0 2821MBEDGE_LOOPFILTER mmx, v, 6, 8, 0 2822MBEDGE_LOOPFILTER mmx, h, 6, 8, 0 2823 2824%define SPLATB_REG SPLATB_REG_MMXEXT 2825MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0 2826MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0 2827MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0 2828MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0 2829 2830INIT_XMM 2831%define SPLATB_REG SPLATB_REG_SSE2 2832%define WRITE_8W WRITE_8W_SSE2 2833MBEDGE_LOOPFILTER sse2, v, 5, 16, 15 2834%ifdef m8 2835MBEDGE_LOOPFILTER sse2, h, 5, 16, 15 2836%else 2837MBEDGE_LOOPFILTER sse2, h, 6, 16, 15 2838%endif 2839MBEDGE_LOOPFILTER sse2, v, 6, 8, 15 2840MBEDGE_LOOPFILTER sse2, h, 6, 8, 15 2841 2842%define SPLATB_REG SPLATB_REG_SSSE3 2843MBEDGE_LOOPFILTER ssse3, v, 5, 16, 15 2844%ifdef m8 2845MBEDGE_LOOPFILTER ssse3, h, 5, 16, 15 2846%else 2847MBEDGE_LOOPFILTER ssse3, h, 6, 16, 15 2848%endif 2849MBEDGE_LOOPFILTER ssse3, v, 6, 8, 15 2850MBEDGE_LOOPFILTER ssse3, h, 6, 8, 15 2851 2852%define WRITE_8W WRITE_8W_SSE4 2853%ifdef m8 2854MBEDGE_LOOPFILTER sse4, h, 5, 16, 15 2855%else 2856MBEDGE_LOOPFILTER sse4, h, 6, 16, 15 2857%endif 2858MBEDGE_LOOPFILTER sse4, h, 6, 8, 15 2859