;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)

SECTION .text

cextern pw_2
cextern pw_3
cextern pw_4

; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw %5, %2, %1
    psubusw %4, %1, %2
    por     %4, %5
    psubw   %4, %3
%endmacro

; out: %4 = |%1-%2|<%3
%macro DIFF_LT 5
    psubusw %4, %2, %1
    psubusw %5, %1, %2
    por     %5, %4      ; |%1-%2|
    pxor    %4, %4
    psubw   %5, %3      ; |%1-%2|-%3
    pcmpgtw %4, %5      ; 0 > |%1-%2|-%3
%endmacro

%macro LOAD_AB 4
    movd    %1, %3
    movd    %2, %4
    SPLATW  %1, %1
    SPLATW  %2, %2
%endmacro

; in: %2=tc reg
; out: %1=splatted tc
%macro LOAD_TC 2
    movd        %1, [%2]
    punpcklbw   %1, %1
%if mmsize == 8
    pshufw      %1, %1, 0
%else
    pshuflw     %1, %1, 01010000b
    pshufd      %1, %1, 01010000b
%endif
    psraw       %1, 6
%endmacro

; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
%macro LOAD_MASK 9
    ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha
    ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta
    pand    %8, %9
    ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta
    pxor    %7, %7
    pand    %8, %9
    pcmpgtw %7, %8
%endmacro

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro DEBLOCK_P0_Q0 7
    psubw   %3, %4
    pxor    %7, %7
    paddw   %3, [pw_4]
    psubw   %7, %5
    psubw   %6, %2, %1
    psllw   %6, 2
    paddw   %3, %6
    psraw   %3, 3
    mova    %6, [pw_pixel_max]
    CLIPW   %3, %7, %5
    pxor    %7, %7
    paddw   %1, %3
    psubw   %2, %3
    CLIPW   %1, %7, %6
    CLIPW   %2, %7, %6
%endmacro

; in: %1=x2, %2=x1, %3=p0, %4=q0, %5=mask&tc, %6=tmp
%macro LUMA_Q1 6
    pavgw   %6, %3, %4      ; (p0+q0+1)>>1
    paddw   %1, %6
    pxor    %6, %6
    psraw   %1, 1
    psubw   %6, %5
    psubw   %1, %2
    CLIPW   %1, %6, %5
    paddw   %1, %2
%endmacro

%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT m5, %1, bm, m4, m6
    pxor    m6, m6
    mova    %3, m4
    pcmpgtw m6, tcm
    pand    m4, tcm
    pandn   m6, m7
    pand    m4, m6
    LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro

%macro LUMA_H_STORE 2
%if mmsize == 8
    movq    [r0-4], m0
    movq    [r0+r1-4], m1
    movq    [r0+r1*2-4], m2
    movq    [r0+%2-4], m3
%else
    movq    [r0-4], m0
    movhps  [r0+r1-4], m0
    movq    [r0+r1*2-4], m1
    movhps  [%1-4], m1
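    ; the remaining rows are addressed relative to %1, which the callers of
    ; this (xmm) branch set to r0+3*stride before invoking the macro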
    movq    [%1+r1-4], m2
    movhps  [%1+r1*2-4], m2
    movq    [%1+%2-4], m3
    movhps  [%1+r1*4-4], m3
%endif
%endmacro

%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_10(uint16_t *pix, int stride, int alpha, int beta,
;                           int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
    %assign pad 5*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am  [rsp+mmsize*3]
    %define bm  [rsp+mmsize*4]
    SUB     rsp, pad
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m4, m5, r2d, r3d
    mov     r3, 32/mmsize
    mov     r2, r0
    sub     r0, r1
    mova    am, m4
    sub     r0, r1
    mova    bm, m5
    sub     r0, r1
.loop:
    mova    m0, [r0+r1]
    mova    m1, [r0+r1*2]
    mova    m2, [r2]
    mova    m3, [r2+r1]

    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC m6, r4
    mova    tcm, m6

    mova    m5, [r0]
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova    [r0+r1], m5

    mova    m5, [r2+r1*2]
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova    [r2+r1], m5

    pxor    m5, m5
    mova    m6, tcm
    pcmpgtw m5, tcm
    psubw   m6, ms1
    pandn   m5, m7
    psubw   m6, ms2
    pand    m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova    [r0+r1*2], m1
    mova    [r2], m2

    add     r0, mmsize
    add     r2, mmsize
    add     r4, mmsize/8
    dec     r3
    jg .loop
    ADD     rsp, pad
    RET

cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
    %assign pad 7*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define p1m [rsp+mmsize*3]
    %define p2m [rsp+mmsize*4]
    %define am  [rsp+mmsize*5]
    %define bm  [rsp+mmsize*6]
    SUB     rsp, pad
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m4, m5, r2d, r3d
    mov     r3, r1
    mova    am, m4
    add     r3, r1
    mov     r5, 32/mmsize
    mova    bm, m5
    add     r3, r1
%if mmsize == 16
    mov     r2, r0
    add     r2, r3
%endif
.loop:
%if mmsize == 8
    movq    m2, [r0-8]          ; y q2 q1 q0
    movq    m7, [r0+0]
    movq    m5, [r0+r1-8]
    movq    m3, [r0+r1+0]
    movq    m0, [r0+r1*2-8]
    movq    m6, [r0+r1*2+0]
    movq    m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP    2, 7
    movq    m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    movu    m5, [r0-8]          ; y q2 q1 q0 p0 p1 p2 x
    movu    m0, [r0+r1-8]
    movu    m2, [r0+r1*2-8]
    movu    m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova    tcm, m3

    movu    m4, [r2+r1-8]
    movu    m1, [r2+r1*2-8]
    movu    m3, [r2+r3-8]
    movu    m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6

    mova    m6, tcm
    punpcklqdq m6, m7
    punpckhqdq m5, m4
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
%endif

    mova    p2m, m6
    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC m6, r4
    mova    tcm, m6

    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova    p1m, m5

    mova    m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova    p2m, m5

    pxor    m5, m5
    mova    m6, tcm
    pcmpgtw m5, tcm
    psubw   m6, ms1
    pandn   m5, m7
    psubw   m6, ms2
    pand    m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova    m0, p1m
    mova    m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3

    add     r4, mmsize/8
    lea     r0, [r0+r1*(mmsize/2)]
    lea     r2, [r2+r1*(mmsize/2)]
    dec     r5
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

%if ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;     m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
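; note: also reads the four tc0 bytes from [r4] (via LOAD_TC)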
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC m6, r4
    DIFF_LT m8, m1, m13, m10, m4
    DIFF_LT m9, m2, m13, m11, m4
    pand    m6, m7

    mova    m14, m6
    pxor    m4, m4
    pcmpgtw m6, m4
    pand    m6, m14

    mova    m5, m10
    pand    m5, m6
    LUMA_Q1 m8, m0, m1, m2, m5, m4

    mova    m5, m11
    pand    m5, m6
    LUMA_Q1 m9, m3, m1, m2, m5, m4

    pxor    m4, m4
    psubw   m6, m10
    pcmpgtw m4, m14
    pandn   m4, m7
    psubw   m6, m11
    pand    m4, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6

    SWAP    0, 8
    SWAP    3, 9
%endmacro

%macro DEBLOCK_LUMA_64 0
cglobal deblock_v_luma_10, 5,5,15
    %define p2 m8
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define q2 m9
    %define mask0 m7
    %define mask1 m10
    %define mask2 m11
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m12, m13, r2d, r3d
    mov     r2, r0
    sub     r0, r1
    sub     r0, r1
    sub     r0, r1
    mov     r3, 2
.loop:
    mova    p2, [r0]
    mova    p1, [r0+r1]
    mova    p0, [r0+r1*2]
    mova    q0, [r2]
    mova    q1, [r2+r1]
    mova    q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova    [r0+r1], p1
    mova    [r0+r1*2], p0
    mova    [r2], q0
    mova    [r2+r1], q1
    add     r0, mmsize
    add     r2, mmsize
    add     r4, 2
    dec     r3
    jg .loop
    REP_RET

cglobal deblock_h_luma_10, 5,7,15
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m12, m13, r2d, r3d
    mov     r2, r1
    add     r2, r1
    add     r2, r1
    mov     r5, r0
    add     r5, r2
    mov     r6, 2
.loop:
    movu    m8, [r0-8]          ; y q2 q1 q0 p0 p1 p2 x
    movu    m0, [r0+r1-8]
    movu    m2, [r0+r1*2-8]
    movu    m9, [r5-8]
    movu    m5, [r5+r1-8]
    movu    m1, [r5+r1*2-8]
    movu    m3, [r5+r2-8]
    movu    m7, [r5+r1*4-8]

    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10

    punpckhqdq m8, m5
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    punpcklqdq m9, m7

    DEBLOCK_LUMA_INTER_SSE2

    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add     r4, 2
    lea     r0, [r0+r1*8]
    lea     r5, [r5+r1*8]
    dec     r6
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_64
%endif
%endif

%macro SWAPMOVA 2
%ifid %1
    SWAP %1, %2
%else
    mova %1, %2
%endif
%endmacro

; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%if ARCH_X86_64
    paddw   t0, %3, %2
    mova    t2, %4
    paddw   t2, %3
%else
    mova    t0, %3
    mova    t2, %4
    paddw   t0, %2
    paddw   t2, %3
%endif
    paddw   t0, %1
    paddw   t2, t2
    paddw   t0, %5
    paddw   t2, %9
    paddw   t0, %9              ; (p2 + p1 + p0 + q0 + 2)
    paddw   t2, t0              ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)

    psrlw   t2, 3
    psrlw   t1, t0, 2
    psubw   t2, %3
    psubw   t1, %2
    pand    t2, %8
    pand    t1, %8
    paddw   t2, %3
    paddw   t1, %2
    SWAPMOVA %11, t1

    psubw   t1, t0, %3
    paddw   t0, t0
    psubw   t1, %5
    psubw   t0, %3
    paddw   t1, %6
    paddw   t1, %2
    paddw   t0, %6
    psrlw   t1, 2               ; (2*p1 + p0 + q1 + 2)/4
    psrlw   t0, 3               ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3

    pxor    t0, t1
    pxor    t1, %1
    pand    t0, %8
    pand    t1, %7
    pxor    t0, t1
    pxor    t0, %1
    SWAPMOVA %10, t0
    SWAPMOVA %12, t2
%endmacro

%macro LUMA_INTRA_INIT 1
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    %define t0 m4
    %define t1 m5
    %define t2 m6
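    ; t0-t3 live in registers; t4 and up, created by the %rep below,
    ; alias stack slots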
    %define t3 m7
    %assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
    %assign i i+1
%endrep
    SUB     rsp, pad
%endmacro

; in: %1-%3=tmp, %4=p2, %5=q2
%macro LUMA_INTRA_INTER 5
    LOAD_AB t0, t1, r2d, r3d
    mova    %1, t0
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%if ARCH_X86_64
    mova    %2, t0              ; mask0
    psrlw   t3, %1, 2
%else
    mova    t3, %1
    mova    %2, t0              ; mask0
    psrlw   t3, 2
%endif
    paddw   t3, [pw_2]          ; alpha/4+2
    DIFF_LT m1, m2, t3, t2, t0  ; t2 = |p0-q0| < alpha/4+2
    pand    t2, %2
    mova    t3, %5              ; q2
    mova    %1, t2              ; mask1
    DIFF_LT t3, m2, t1, t2, t0  ; t2 = |q2-q0| < beta
    pand    t2, %1
    mova    t3, %4              ; p2
    mova    %3, t2              ; mask1q
    DIFF_LT t3, m1, t1, t2, t0  ; t2 = |p2-p0| < beta
    pand    t2, %1
    mova    %1, t2              ; mask1p
%endmacro

%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova    t4, t0              ; p3
    mova    t5, t1              ; p2

    movu    m2, [r0]
    movu    m3, [r0+r1]
    movu    t0, [r0+r1*2]
    movu    t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova    t6, t0              ; q2
    mova    t7, t1              ; q3
%else
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r5-8]
    movu    m2, [r4-8]
    movu    m3, [r4+r1-8]
    movu    t2, [r4+r1*2-8]
    movu    t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova    t4, t0              ; p3
    mova    t5, t1              ; p2
    mova    t6, t2              ; q2
    mova    t7, t3              ; q3
%endif
%endmacro

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq    [r0-8], m%1
    movq    [r0+r1-8], m%2
    movq    [r0+r1*2-8], m%3
    movq    [r0+r4-8], m%4
    movq    m%1, %8
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq    [r0], m%5
    movq    [r0+r1], m%6
    movq    [r0+r1*2], m%7
    movq    [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq    [r0-8], m%1
    movq    [r0+r1-8], m%2
    movq    [r0+r1*2-8], m%3
    movq    [r0+r5-8], m%4
    movhps  [r4-8], m%1
    movhps  [r4+r1-8], m%2
    movhps  [r4+r1*2-8], m%3
    movhps  [r4+r5-8], m%4
%ifnum %8
    SWAP    %1, %8
%else
    mova    m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq    [r0], m%5
    movq    [r0+r1], m%6
    movq    [r0+r1*2], m%7
    movq    [r0+r5], m%1
    movhps  [r4], m%5
    movhps  [r4+r1], m%6
    movhps  [r4+r1*2], m%7
    movhps  [r4+r5], m%1
%endif
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra_10, 4,7,16
    %define t0 m1
    %define t1 m2
    %define t2 m4
    %define p2 m8
    %define p1 m9
    %define p0 m10
    %define q0 m11
    %define q1 m12
    %define q2 m13
    %define aa m5
    %define bb m14
    lea     r4, [r1*4]
    lea     r5, [r1*3]          ; 3*stride
    neg     r4
    add     r4, r0              ; pix-4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB aa, bb, r2d, r3d
.loop:
    mova    p2, [r4+r1]
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
    mova    q2, [r0+2*r1]

    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova    t2, aa
    psrlw   t2, 2
    paddw   t2, m0              ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0  ; m6 = |p0-q0| < alpha/4+2
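    ; the strong (intra) filter below is applied per side only where
    ; |p0-q0| < alpha/4+2 and the corresponding |p2-p0| / |q2-q0| < beta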
    DIFF_LT p2, p0, bb, t1, t0  ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0  ; m7 = |q2-q0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,16
    %define t0 m15
    %define t1 m14
    %define t2 m2
    %define q3 m5
    %define q2 m8
    %define q1 m9
    %define q0 m10
    %define p0 m11
    %define p1 m12
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
    %assign pad 24-(stack_offset&15)
    SUB     rsp, pad
    lea     r4, [r1*4]
    lea     r5, [r1*3]          ; 3*stride
    add     r4, r0              ; pix+4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl     r2d, 2
    shl     r3d, 2
.loop:
    movu    q3, [r0-8]
    movu    q2, [r0+r1-8]
    movu    q1, [r0+r1*2-8]
    movu    q0, [r0+r5-8]
    movu    p0, [r4-8]
    movu    p1, [r4+r1-8]
    movu    p2, [r4+r1*2-8]
    movu    p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1

    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw   m1, 2
    paddw   m1, m0              ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0  ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0  ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0  ; m7 = |p2-p0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1

    mova    spill, q3
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova    m7, spill

    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14

    lea     r0, [r0+r1*8]
    lea     r4, [r4+r1*8]
    dec     r6
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64
%endif

%endif

%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 3
    lea     r4, [r1*4]
    lea     r5, [r1*3]
    neg     r4
    add     r4, r0
    mov     r6, 32/mmsize
    shl     r2d, 2
    shl     r3d, 2
.loop:
    mova    m0, [r4+r1*2]       ; p1
    mova    m1, [r4+r5]         ; p0
    mova    m2, [r0]            ; q0
    mova    m3, [r0+r1]         ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova    t3, [r0+r1*2]       ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    ADD     rsp, pad
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea     r4, [r1*3]
    mov     r5, 32/mmsize
%else
    lea     r4, [r1*4]
    lea     r5, [r1*3]          ; 3*stride
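    ; r4 (pix+4*stride, set below) is the base for the lower four rows handled
    ; by LUMA_H_INTRA_LOAD / LUMA_H_INTRA_STORE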
    add     r4, r0              ; pix+4*stride
    mov     r6, 32/mmsize
%endif
    shl     r2d, 2
    shl     r3d, 2
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6

    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova    t3, t6              ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5

    mova    m2, t4
    mova    m0, t11
    mova    m1, t5
    mova    m3, t8
    mova    m6, t6

    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7

    lea     r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec     r5
%else
    lea     r4, [r4+r1*(mmsize/2)]
    dec     r6
%endif
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
%endif

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
    mova    %6, [pw_2]
    paddw   %6, %3
    paddw   %6, %4
    paddw   %7, %6, %2
    paddw   %6, %1
    paddw   %6, %3
    paddw   %7, %4
    psraw   %6, 2
    psraw   %7, 2
    psubw   %6, %1
    psubw   %7, %2
    pand    %6, %5
    pand    %7, %5
    paddw   %1, %6
    paddw   %2, %7
%endmacro

%macro CHROMA_V_LOAD 1
    mova    m0, [r0]            ; p1
    mova    m1, [r0+r1]         ; p0
    mova    m2, [%1]            ; q0
    mova    m3, [%1+r1]         ; q1
%endmacro

%macro CHROMA_V_STORE 0
    mova    [r0+1*r1], m1
    mova    [r0+2*r1], m2
%endmacro

%macro CHROMA_V_LOAD_TC 2
    movd        %1, [%2]
    punpcklbw   %1, %1
    punpcklwd   %1, %1
    psraw       %1, 6
%endmacro

%macro DEBLOCK_CHROMA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
;                             int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
    mov     r5, r0
    sub     r0, r1
    sub     r0, r1
    shl     r2d, 2
    shl     r3d, 2
%if mmsize < 16
    mov     r6, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r5
    LOAD_AB m4, m5, r2d, r3d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor    m4, m4
    CHROMA_V_LOAD_TC m6, r4
    psubw   m6, [pw_3]
    pmaxsw  m6, m4
    pand    m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add     r0, mmsize
    add     r5, mmsize
    add     r4, mmsize/4
    dec     r6
    jg .loop
    REP_RET
%else
    RET
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra_10(uint16_t *pix, int stride, int alpha,
;                                   int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
    mov     r4, r0
    sub     r0, r1
    sub     r0, r1
    shl     r2d, 2
    shl     r3d, 2
%if mmsize < 16
    mov     r5, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r4
    LOAD_AB m4, m5, r2d, r3d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add     r0, mmsize
    add     r4, mmsize
    dec     r5
    jg .loop
    REP_RET
%else
    RET
%endif
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_CHROMA
%endif
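
; For reference, a scalar sketch of the per-pixel update implemented by
; DEBLOCK_P0_Q0 above (C-like pseudocode in this comment only, not assembled;
; the names delta, tc and pixel_max are illustrative):
;   delta = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
;   p0'   = clip(p0 + delta, 0, pixel_max)      ; pixel_max = (1 << 10) - 1
;   q0'   = clip(q0 - delta, 0, pixel_max)
; The callers AND the LOAD_MASK result into tc before DEBLOCK_P0_Q0, which
; zeroes delta for pixels failing the |p0-q0| < alpha, |p1-p0| < beta and
; |q1-q0| < beta tests.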