;*****************************************************************************
;* SSE2-optimized HEVC deblocking code
;*****************************************************************************
;* Copyright (C) 2013 VTT
;*
;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)  ; clip ceiling for 10-bit samples
pw_m1:        times 8 dw -1
pw_m2:        times 8 dw -2
pd_1:         times 4 dd  1

cextern pw_4
cextern pw_8

SECTION .text
INIT_XMM sse2

; expands to [base], ..., [base+7*stride]
; (base3 must be base+3*stride, stride3 must be 3*stride)
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

; in:  8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 words in m0..m3 (transposed, zero-extended to 16 bit)
; clobbers m4..m6
%macro TRANSPOSE4x8B_LOAD 8
    movd             m0, %1
    movd             m2, %2
    movd             m1, %3
    movd             m3, %4

    punpcklbw        m0, m2
    punpcklbw        m1, m3
    punpcklwd        m0, m1

    movd             m4, %5
    movd             m6, %6
    movd             m5, %7
    movd             m3, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m3
    punpcklwd        m4, m5

    punpckhdq        m2, m0, m4
    punpckldq        m0, m4

    ; widen bytes to words
    pxor             m5, m5
    punpckhbw        m1, m0, m5
    punpcklbw        m0, m5
    punpckhbw        m3, m2, m5
    punpcklbw        m2, m5
%endmacro

; in:  4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8 (packed with unsigned saturation)
; clobbers m6
%macro TRANSPOSE8x4B_STORE 8
    packuswb         m0, m0
    packuswb         m1, m1
    packuswb         m2, m2
    packuswb         m3, m3

    punpcklbw        m0, m1
    punpcklbw        m2, m3

    punpckhwd        m6, m0, m2
    punpcklwd        m0, m2

    ; rotate dwords down through m0/m6 and emit one row at a time
    movd             %1, m0
    pshufd           m0, m0, 0x39
    movd             %2, m0
    pshufd           m0, m0, 0x39
    movd             %3, m0
    pshufd           m0, m0, 0x39
    movd             %4, m0

    movd             %5, m6
    pshufd           m6, m6, 0x39
    movd             %6, m6
    pshufd           m6, m6, 0x39
    movd             %7, m6
    pshufd           m6, m6, 0x39
    movd             %8, m6
%endmacro

; in:  8 rows of 4 words in %1..%8
; out: 4 rows of 8 words in m0..m3 (transposed)
; clobbers m4..m6
%macro TRANSPOSE4x8W_LOAD 8
    movq             m0, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklwd        m0, m2
    punpcklwd        m1, m3
    punpckhdq        m2, m0, m1
    punpckldq        m0, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq             m3, %8

    punpcklwd        m4, m6
    punpcklwd        m5, m3
    punpckhdq        m6, m4, m5
    punpckldq        m4, m5

    punpckhqdq       m1, m0, m4
    punpcklqdq       m0, m4
    punpckhqdq       m3, m2, m6
    punpcklqdq       m2, m6
%endmacro

; in:  4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8, clipped to [0, pw_pixel_max]
; clobbers m4..m6
%macro TRANSPOSE8x4W_STORE 8
    pxor             m5, m5; zeros reg
    CLIPW            m0, m5, [pw_pixel_max]
    CLIPW            m1, m5, [pw_pixel_max]
    CLIPW            m2, m5, [pw_pixel_max]
    CLIPW            m3, m5, [pw_pixel_max]

    punpckhwd        m4, m0, m1
    punpcklwd        m0, m1
    punpckhwd        m5, m2, m3
    punpcklwd        m2, m3
    punpckhdq        m6, m0, m2
    punpckldq        m0, m2

    movq             %1, m0
    movhps           %2, m0
    movq             %3, m6
    movhps           %4, m6

    punpckhdq        m6, m4, m5
    punpckldq        m4, m5

    movq             %5, m4
    movhps           %6, m4
    movq             %7, m6
    movhps           %8, m6
%endmacro

; in:  8 rows of 8 bytes in %1..%8
; out: 8 rows of 8 words in m0..m7 (transposed, zero-extended to 16 bit)
; clobbers m9, m13, m15 (x86-64 only)
%macro TRANSPOSE8x8B_LOAD 8
    movq             m7, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklbw        m7, m2
    punpcklbw        m1, m3
    punpcklwd        m3, m7, m1
    punpckhwd        m7, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq            m15, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m15
    punpcklwd        m9, m4, m5
    punpckhwd        m4, m5

    punpckldq        m1, m3, m9;  0, 1
    punpckhdq        m3, m9;      2, 3

    punpckldq        m5, m7, m4;  4, 5
    punpckhdq        m7, m4;      6, 7

    ; widen bytes to words
    pxor            m13, m13

    punpcklbw        m0, m1, m13; 0 in 16 bit
    punpckhbw        m1, m13;     1 in 16 bit

    punpcklbw        m2, m3, m13; 2
    punpckhbw        m3, m13;     3

    punpcklbw        m4, m5, m13; 4
    punpckhbw        m5, m13;     5

    punpcklbw        m6, m7, m13; 6
    punpckhbw        m7, m13;     7
%endmacro


; in:  8 rows of 8 words in m0..m7
; out: 8 rows of 8 bytes in %1..%8 (packed with unsigned saturation)
; clobbers m8..m11 (x86-64 only)
%macro TRANSPOSE8x8B_STORE 8
    packuswb         m0, m0
    packuswb         m1, m1
    packuswb         m2, m2
    packuswb         m3, m3
    packuswb         m4, m4
    packuswb         m5, m5
    packuswb         m6, m6
    packuswb         m7, m7

    punpcklbw        m0, m1
    punpcklbw        m2, m3

    punpckhwd        m8, m0, m2
    punpcklwd        m0, m2

    punpcklbw        m4, m5
    punpcklbw        m6, m7

    punpckhwd        m9, m4, m6
    punpcklwd        m4, m6

    punpckhdq       m10, m0, m4;   2, 3
    punpckldq        m0, m4;       0, 1

    punpckldq       m11, m8, m9;   4, 5
    punpckhdq        m8, m9;       6, 7
    movq             %1, m0
    movhps           %2, m0
    movq             %3, m10
    movhps           %4, m10
    movq             %5, m11
    movhps           %6, m11
    movq             %7, m8
    movhps           %8, m8
%endmacro

; in:  8 rows of 8 words in %1..%8
; out: 8 rows of 8 words in m0..m7 (transposed)
; clobbers m8 (x86-64 only)
%macro TRANSPOSE8x8W_LOAD 8
    movdqu           m0, %1
    movdqu           m1, %2
    movdqu           m2, %3
    movdqu           m3, %4
    movdqu           m4, %5
    movdqu           m5, %6
    movdqu           m6, %7
    movdqu           m7, %8
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
%endmacro

; in:  8 rows of 8 words in m0..m7
; out: 8 rows of 8 words in %1..%8, clipped to [0, pw_pixel_max]
; clobbers m8 (x86-64 only)
%macro TRANSPOSE8x8W_STORE 8
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8

    pxor             m8, m8
    CLIPW            m0, m8, [pw_pixel_max]
    CLIPW            m1, m8, [pw_pixel_max]
    CLIPW            m2, m8, [pw_pixel_max]
    CLIPW            m3, m8, [pw_pixel_max]
    CLIPW            m4, m8, [pw_pixel_max]
    CLIPW            m5, m8, [pw_pixel_max]
    CLIPW            m6, m8, [pw_pixel_max]
    CLIPW            m7, m8, [pw_pixel_max]

    movdqu           %1, m0
    movdqu           %2, m1
    movdqu           %3, m2
    movdqu           %4, m3
    movdqu           %5, m4
    movdqu           %6, m5
    movdqu           %7, m6
    movdqu           %8, m7
%endmacro


; %1 = mova %1, (%2 & m11) | (%1 & ~m11)
; in:  %2 clobbered
; out: %1
; mask in m11, clobbers m10
%macro MASKED_COPY 2
    pand             %2, m11 ; and mask
    pandn           m10, m11, %1; and -mask
    por              %2, m10
    mova             %1, %2
%endmacro

; %1 = mova %1, (%2 & %3) | (%1 & ~%3)
; in:  %2 clobbered
; out: %1
; mask in %3, will be clobbered
%macro MASKED_COPY2 3
    pand             %2, %3 ; and mask
    pandn            %3, %1; and -mask
    por              %2, %3
    mova             %1, %2
%endmacro

ALIGN 16
; chroma deblock: input rows p1 p0 q0 q1 in m0..m3 (words), tc pair at tcq (r2).
; %1 = bit depth.  Output: filtered p0 in m1, q0 in m2 (unclipped to pixel range;
; caller clips/packs).  Clobbers m4..m6.
%macro CHROMA_DEBLOCK_BODY 1
    psubw            m4, m2, m1; q0 - p0
    psubw            m5, m0, m3; p1 - q1
    psllw            m4, 2; << 2
    paddw            m5, m4;

    ;tc calculations
    movd             m6, [tcq]; tc0
    punpcklwd        m6, m6
    movd             m4, [tcq+4]; tc1
    punpcklwd        m4, m4
    shufps           m6, m4, 0; tc0, tc1
    pmullw           m4, m6, [pw_m1]; -tc0, -tc1
    ;end tc calculations

    paddw            m5, [pw_4]; +4
    psraw            m5, 3; >> 3

    psllw            m4, %1-8; << (BIT_DEPTH - 8)
    psllw            m6, %1-8; << (BIT_DEPTH - 8)
    pmaxsw           m5, m4
    pminsw           m5, m6; delta0 = av_clip(delta0, -tc, tc)
    paddw            m1, m5; p0 + delta0
    psubw            m2, m5; q0 - delta0
%endmacro

; input in m0 ... m7, betas in r2 tcs in r3.
; Output in m1...m6
; luma deblock: rows p3..q3 in m0..m7 (words), beta pair at betaq (r2),
; tc pair at tcq (r3).  %1 = bit depth, %2 = v/h (label suffix only).
; Jumps to .bypassluma when nothing is filtered, falls through to .weakfilter /
; .store labels that the caller provides.  Requires x86-64 (uses m8..m15 and
; r7..r14).
; FIX: the beta0/beta1 combine below used "pshufd m13, m14, 0", which
; broadcasts beta1 into every lane and drops beta0; use shufps (low dwords of
; both sources) as every other pair-combine in this file does.
%macro LUMA_DEBLOCK_BODY 2
    psllw            m9, m2, 1; *2
    psubw           m10, m1, m9
    paddw           m10, m3
    ABS1            m10, m11 ; 0dp0, 0dp3 , 1dp0, 1dp3

    psllw            m9, m5, 1; *2
    psubw           m11, m6, m9
    paddw           m11, m4
    ABS1            m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3

    ;beta calculations
    mov             r11, [betaq];
    shl             r11, %1 - 8
    movd            m13, r11d; beta0
    add           betaq, 4;
    punpcklwd       m13, m13
    mov             r12, [betaq];
    shl             r12, %1 - 8
    movd            m14, r12d; beta1
    punpcklwd       m14, m14
    shufps          m13, m14, 0; beta0, beta1
    ;end beta calculations

    paddw            m9, m10, m11;   0d0, 0d3 , 1d0, 1d3

    pshufhw         m14, m9, 0x0f ;0b00001111;  0d3 0d3 0d0 0d0 in high
    pshuflw         m14, m14, 0x0f ;0b00001111;  1d3 1d3 1d0 1d0 in low

    pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
    pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3

    paddw           m14, m9; 0d0+0d3, 1d0+1d3

    ;compare
    pcmpgtw         m15, m13, m14; beta0, beta1
    movmskps        r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
    cmp             r13, 0
    je              .bypassluma

    ;weak / strong decision compare to beta_2
    psraw           m15, m13, 2;   beta >> 2
    psllw            m8, m9, 1;
    pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
    movmskps        r14, m15;
    ;end weak / strong decision

    ; weak filter nd_p/q calculation
    pshufd           m8, m10, 0x31
    psrld            m8, 16
    paddw            m8, m10
    movd            r7d, m8
    and              r7, 0xffff; 1dp0 + 1dp3
    pshufd           m8, m8, 0x4E
    movd            r8d, m8
    and              r8, 0xffff; 0dp0 + 0dp3

    pshufd           m8, m11, 0x31
    psrld            m8, 16
    paddw            m8, m11
    movd            r9d, m8
    and              r9, 0xffff; 1dq0 + 1dq3
    pshufd           m8, m8, 0x4E
    movd           r10d, m8
    and             r10, 0xffff; 0dq0 + 0dq3
    ; end calc for weak filter

    ; filtering mask
    mov              r2, r13
    shr              r2, 3
    movd            m15, r2d
    and             r13, 1
    movd            m11, r13d
    shufps          m11, m15, 0
    shl              r2, 1
    or              r13, r2

    pcmpeqd         m11, [pd_1]; filtering mask

    ;decide between strong and weak filtering
    ;tc25 calculations
    mov             r2d, [tcq];
    shl              r2, %1 - 8
    movd             m8, r2d; tc0
    add             tcq, 4;
    mov             r3d, [tcq];
    shl              r3, %1 - 8
    movd             m9, r3d; tc1
    add             r2d, r3d; tc0 + tc1
    jz             .bypassluma
    punpcklwd        m8, m8
    punpcklwd        m9, m9
    shufps           m8, m9, 0; tc0, tc1
    mova             m9, m8
    psllw            m8, 2; tc << 2
    pavgw            m8, m9; tc25 = ((tc * 5 + 1) >> 1)
    ;end tc25 calculations

    ;----beta_3 comparison-----
    psubw           m12, m0, m3;      p3 - p0
    ABS1            m12, m14; abs(p3 - p0)

    psubw           m15, m7, m4;      q3 - q0
    ABS1            m15, m14; abs(q3 - q0)

    paddw           m12, m15; abs(p3 - p0) + abs(q3 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    psraw           m13, 3; beta >> 3
    pcmpgtw         m13, m12;
    movmskps         r2, m13;
    and             r14, r2; strong mask , beta_2 and beta_3 comparisons
    ;----beta_3 comparison end-----
    ;----tc25 comparison---
    psubw           m12, m3, m4;      p0 - q0
    ABS1            m12, m14; abs(p0 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    pcmpgtw          m8, m12; tc25 comparisons
    movmskps         r2, m8;
    and             r14, r2; strong mask, beta_2, beta_3 and tc25 comparisons
    ;----tc25 comparison end---
    mov              r2, r14;
    shr              r2, 1;
    and             r14, r2; strong mask, bits 2 and 0

    pmullw          m14, m9, [pw_m2]; -tc * 2
    psllw            m9, 1;  tc * 2

    and             r14, 5; 0b101
    mov              r2, r14; strong mask
    shr             r14, 2;
    movd            m12, r14d; store to xmm for mask generation
    shl             r14, 1
    and              r2, 1
    movd            m10, r2d; store to xmm for mask generation
    or              r14, r2; final strong mask, bits 1 and 0
    jz              .weakfilter

    shufps          m10, m12, 0
    pcmpeqd         m10, [pd_1]; strong mask

    mova            m13, [pw_4]; 4 in every cell
    pand            m11, m10; combine filtering mask and strong mask
    paddw           m12, m2, m3;          p1 +   p0
    paddw           m12, m4;              p1 +   p0 +   q0
    mova            m10, m12; copy
    psllw           m12, 1;             2*p1 + 2*p0 + 2*q0
    paddw           m12, m1;       p2 + 2*p1 + 2*p0 + 2*q0
    paddw           m12, m5;       p2 + 2*p1 + 2*p0 + 2*q0 + q1
    paddw           m12, m13;      p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
    psraw           m12, 3;      ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
    psubw           m12, m3;     ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
    pmaxsw          m12, m14
    pminsw          m12, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m12, m3; p0'

    paddw           m15, m1, m10; p2 + p1 + p0 + q0
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13; p2 + p1 + p0 + q0 + 2
    psraw           m15, 2;  (p2 + p1 + p0 + q0 + 2) >> 2
    psubw           m15, m2; ((p2 + p1 + p0 + q0 + 2) >> 2) - p1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m2; p1'

    paddw            m8, m1, m0;     p3 +   p2
    psllw            m8, 1;        2*p3 + 2*p2
    paddw            m8, m1;       2*p3 + 3*p2
    paddw            m8, m10;      2*p3 + 3*p2 + p1 + p0 + q0
    psllw           m13, 1; 4 in every cell
    paddw            m8, m13;      2*p3 + 3*p2 + p1 + p0 + q0 + 4
    psraw            m8, 3;       (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    psubw            m8, m1;     ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m1; p2'
    MASKED_COPY      m1, m8

    paddw            m8, m3, m4;      p0 + q0
    paddw            m8, m5;          p0 + q0 + q1
    psllw            m8, 1;         2*p0 + 2*q0 + 2*q1
    paddw            m8, m2;   p1 + 2*p0 + 2*q0 + 2*q1
    paddw            m8, m6;   p1 + 2*p0 + 2*q0 + 2*q1 + q2
    paddw            m8, m13;  p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
    psraw            m8, 3;   (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3
    psubw            m8, m4;
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m4; q0'
    MASKED_COPY      m2, m15

    paddw           m15, m3, m4;   p0 + q0
    paddw           m15, m5;       p0 + q0 + q1
    mova            m10, m15;
    paddw           m15, m6;       p0 + q0 + q1 + q2
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13;      p0 + q0 + q1 + q2 + 2
    psraw           m15, 2;       (p0 + q0 + q1 + q2 + 2) >> 2
    psubw           m15, m5;     ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m5; q1'

    paddw           m13, m7;      q3 + 2
    paddw           m13, m6;      q3 + q2 + 2
    psllw           m13, 1;     2*q3 + 2*q2 + 4
    paddw           m13, m6;    2*q3 + 3*q2 + 4
    paddw           m13, m10;   2*q3 + 3*q2 + q1 + q0 + p0 + 4
    psraw           m13, 3;    (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
    psubw           m13, m6;  ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
    pmaxsw          m13, m14
    pminsw          m13, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m13, m6; q2'

    MASKED_COPY      m6, m13
    MASKED_COPY      m5, m15
    MASKED_COPY      m4, m8
    MASKED_COPY      m3, m12

.weakfilter:
    not             r14; strong mask -> weak mask
    and             r14, r13; final weak filtering mask, bits 0 and 1
    jz              .store

    ; weak filtering mask
    mov              r2, r14
    shr              r2, 1
    movd            m12, r2d
    and             r14, 1
    movd            m11, r14d
    shufps          m11, m12, 0
    pcmpeqd         m11, [pd_1]; filtering mask

    mov             r13, r11; beta0
    shr             r13, 1;
    add             r11, r13
    shr             r11, 3; ((beta0+(beta0>>1))>>3))

    mov             r13, r12; beta1
    shr             r13, 1;
    add             r12, r13
    shr             r12, 3; ((beta1+(beta1>>1))>>3))

    mova            m13, [pw_8]
    psubw           m12, m4, m3 ; q0 - p0
    psllw           m10, m12, 3; 8 * (q0 - p0)
    paddw           m12, m10 ; 9 * (q0 - p0)

    psubw           m10, m5, m2 ; q1 - p1
    psllw            m8, m10, 1; 2 * ( q1 - p1 )
    paddw           m10, m8; 3 * ( q1 - p1 )
    psubw           m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
    paddw           m12, m13; + 8
    psraw           m12, 4; >> 4 , delta0
    PABSW           m13, m12; abs(delta0)


    psllw           m10, m9, 2; 8 * tc
    paddw           m10, m9; 10 * tc
    pcmpgtw         m10, m13
    pand            m11, m10; only filter when abs(delta0) < 10 * tc

    psraw            m9, 1;   tc * 2 -> tc
    psraw           m14, 1; -tc * 2 -> -tc

    pmaxsw          m12, m14
    pminsw          m12, m9;  av_clip(delta0, -tc, tc)

    psraw            m9, 1;   tc -> tc / 2
    pmullw          m14, m9, [pw_m1]; -tc / 2

    pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
    psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
    paddw           m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
    psraw           m15, 1;  (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip(deltap1, -tc/2, tc/2)
    paddw           m15, m2; p1'

    ;beta calculations
    movd            m10, r11d; beta0
    punpcklwd       m10, m10
    movd            m13, r12d; beta1
    punpcklwd       m13, m13
    shufps          m10, m13, 0; betax0, betax1

    movd            m13, r7d; 1dp0 + 1dp3
    movd             m8, r8d; 0dp0 + 0dp3
    punpcklwd        m8, m8
    punpcklwd       m13, m13
    shufps          m13, m8, 0;
    pcmpgtw          m8, m10, m13
    pand             m8, m11
    ;end beta calculations
    MASKED_COPY2     m2, m15, m8; write p1'

    pavgw            m8, m6, m4;   (q2 + q0 + 1) >> 1
    psubw            m8, m5;  ((q2 + q0 + 1) >> 1) - q1
    psubw            m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0)
    psraw            m8, 1;   ((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip(deltaq1, -tc/2, tc/2)
    paddw            m8, m5; q1'

    movd            m13, r9d;  1dq0 + 1dq3
    movd            m15, r10d; 0dq0 + 0dq3
    punpcklwd       m15, m15
    punpcklwd       m13, m13
    shufps          m13, m15, 0; dq0 + dq3

    pcmpgtw         m10, m13; compare to ((beta+(beta>>1))>>3)
    pand            m10, m11
    MASKED_COPY2     m5, m8, m10; write q1'

    paddw           m15, m3, m12 ; p0 + delta0
    MASKED_COPY      m3, m15

    psubw            m8, m4, m12 ; q0 - delta0
    MASKED_COPY      m4, m8
%endmacro

INIT_XMM sse2
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q)
;-----------------------------------------------------------------------------
cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 2
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 8
    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    RET

cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 10
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q)
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
    mov           pix0q, pixq
    sub           pix0q, strideq
    sub           pix0q, strideq
    movq             m0, [pix0q];           p1
    movq             m1, [pix0q+strideq];   p0
    movq             m2, [pixq];            q0
    movq             m3, [pixq+strideq];    q1
    pxor             m5, m5; zeros reg
    punpcklbw        m0, m5
    punpcklbw        m1, m5
    punpcklbw        m2, m5
    punpcklbw        m3, m5
    CHROMA_DEBLOCK_BODY 8
    packuswb         m1, m1 ; p0' packed in bytes on low quadword
    packuswb         m2, m2 ; q0' packed in bytes on low quadword
    movq [pix0q+strideq], m1
    movq         [pixq], m2
    RET

cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
    mov           pix0q, pixq
    sub           pix0q, strideq
    sub           pix0q, strideq
    movu             m0, [pix0q];           p1
    movu             m1, [pix0q+strideq];   p0
    movu             m2, [pixq];            q0
    movu             m3, [pixq+strideq];    q1
    CHROMA_DEBLOCK_BODY 10
    pxor             m5, m5; zeros reg
    CLIPW            m1, m5, [pw_pixel_max]
    CLIPW            m2, m5, [pw_pixel_max]
    movu [pix0q+strideq], m1
    movu         [pixq], m2
    RET

%if ARCH_X86_64
; the luma bodies need m8..m15 and many scratch GPRs, hence x86-64 only
%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int *_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_v_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc
    sub              r0, 4
    lea              r5, [3*r1]
    mov              r6, r0
    add              r0, r5
    TRANSPOSE8x8B_LOAD  PASS8ROWS(r6, r0, r1, r5)
    LUMA_DEBLOCK_BODY 8, v
.store:
    TRANSPOSE8x8B_STORE PASS8ROWS(r6, r0, r1, r5)
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc
    sub            pixq, 8
    lea              r5, [3*strideq]
    mov              r6, pixq
    add            pixq, r5
    TRANSPOSE8x8W_LOAD  PASS8ROWS(r6, pixq, strideq, r5)
    LUMA_DEBLOCK_BODY 10, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5)
.bypassluma:
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int *_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
    lea     src3strideq, [3*strideq]
    mov           pix0q, pixq
    sub           pix0q, src3strideq
    sub           pix0q, strideq
    movdqu           m0, [pix0q];               p3
    movdqu           m1, [pix0q+strideq];       p2
    movdqu           m2, [pix0q+2*strideq];     p1
    movdqu           m3, [pix0q+src3strideq];   p0
    movdqu           m4, [pixq];                q0
    movdqu           m5, [pixq+strideq];        q1
    movdqu           m6, [pixq+2*strideq];      q2
    movdqu           m7, [pixq+src3strideq];    q3
    pxor             m8, m8
    punpcklbw        m0, m8
    punpcklbw        m1, m8
    punpcklbw        m2, m8
    punpcklbw        m3, m8
    punpcklbw        m4, m8
    punpcklbw        m5, m8
    punpcklbw        m6, m8
    punpcklbw        m7, m8
    LUMA_DEBLOCK_BODY 8, h
.store:
    packuswb         m1, m1; p2
    packuswb         m2, m2; p1
    packuswb         m3, m3; p0
    packuswb         m4, m4; q0
    packuswb         m5, m5; q1
    packuswb         m6, m6; q2
    movq      [r5+r1],   m1;   p2
    movq      [r5+2*r1], m2;   p1
    movq      [r5+r6],   m3;   p0
    movq      [r0],      m4;   q0
    movq      [r0+r1],   m5;   q1
    movq      [r0+2*r1], m6;   q2
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
    lea     src3strideq, [3*strideq]
    mov           pix0q, pixq
    sub           pix0q, src3strideq
    sub           pix0q, strideq
    movdqu           m0, [pix0q];               p3
    movdqu           m1, [pix0q+strideq];       p2
    movdqu           m2, [pix0q+2*strideq];     p1
    movdqu           m3, [pix0q+src3strideq];   p0
    movdqu           m4, [pixq];                q0
    movdqu           m5, [pixq+strideq];        q1
    movdqu           m6, [pixq+2*strideq];      q2
    movdqu           m7, [pixq+src3strideq];    q3
    LUMA_DEBLOCK_BODY 10, h
.store:
    pxor             m8, m8; zeros reg
    CLIPW            m1, m8, [pw_pixel_max]
    CLIPW            m2, m8, [pw_pixel_max]
    CLIPW            m3, m8, [pw_pixel_max]
    CLIPW            m4, m8, [pw_pixel_max]
    CLIPW            m5, m8, [pw_pixel_max]
    CLIPW            m6, m8, [pw_pixel_max]
    movdqu [pix0q+strideq],     m1;   p2
    movdqu [pix0q+2*strideq],   m2;   p1
    movdqu [pix0q+src3strideq], m3;   p0
    movdqu [pixq],              m4;   q0
    movdqu [pixq+strideq],      m5;   q1
    movdqu [pixq+2*strideq],    m6;   q2
.bypassluma:
    RET
%endmacro

INIT_XMM sse2
LOOP_FILTER_LUMA
INIT_XMM ssse3
LOOP_FILTER_LUMA
%endif