;*****************************************************************************
;* MMX/SSE2-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21;***************************************************************************** 22 23%include "x86inc.asm" 24 25SECTION_RODATA 26pb_00: times 16 db 0x00 27pb_01: times 16 db 0x01 28pb_03: times 16 db 0x03 29pb_a1: times 16 db 0xa1 30 31SECTION .text 32 33; expands to [base],...,[base+7*stride] 34%define PASS8ROWS(base, base3, stride, stride3) \ 35 [base], [base+stride], [base+stride*2], [base3], \ 36 [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] 37 38; in: 8 rows of 4 bytes in %1..%8 39; out: 4 rows of 8 bytes in m0..m3 40%macro TRANSPOSE4x8_LOAD 8 41 movd m0, %1 42 movd m2, %2 43 movd m1, %3 44 movd m3, %4 45 punpcklbw m0, m2 46 punpcklbw m1, m3 47 movq m2, m0 48 punpcklwd m0, m1 49 punpckhwd m2, m1 50 51 movd m4, %5 52 movd m6, %6 53 movd m5, %7 54 movd m7, %8 55 punpcklbw m4, m6 56 punpcklbw m5, m7 57 movq m6, m4 58 punpcklwd m4, m5 59 punpckhwd m6, m5 60 61 movq m1, m0 62 movq m3, m2 63 punpckldq m0, m4 64 punpckhdq m1, m4 65 punpckldq m2, m6 66 punpckhdq m3, m6 67%endmacro 68 69; in: 4 rows of 8 bytes in m0..m3 70; out: 8 rows of 4 bytes in %1..%8 71%macro TRANSPOSE8x4_STORE 8 72 movq m4, m0 73 movq m5, m1 74 movq m6, m2 75 punpckhdq m4, m4 76 punpckhdq m5, m5 77 punpckhdq m6, m6 78 79 punpcklbw m0, m1 80 punpcklbw m2, m3 81 movq m1, m0 82 punpcklwd m0, m2 83 punpckhwd m1, m2 84 movd %1, m0 85 punpckhdq m0, m0 86 movd %2, m0 87 movd %3, m1 88 punpckhdq m1, m1 89 movd %4, m1 90 91 punpckhdq m3, m3 92 punpcklbw m4, m5 93 punpcklbw m6, m3 94 movq m5, m4 95 punpcklwd m4, m6 96 punpckhwd m5, m6 97 movd %5, m4 98 punpckhdq m4, m4 99 movd %6, m4 100 movd %7, m5 101 punpckhdq m5, m5 102 movd %8, m5 103%endmacro 104 105%macro SBUTTERFLY 4 106 movq %4, %2 107 punpckl%1 %2, %3 108 punpckh%1 %4, %3 109%endmacro 110 111; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 112; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] 113%macro TRANSPOSE6x8_MEM 9 114 movq m0, %1 115 movq m1, %2 116 movq m2, %3 117 movq m3, %4 118 movq m4, %5 119 movq m5, %6 120 movq m6, %7 121 SBUTTERFLY bw, m0, m1, m7 122 SBUTTERFLY bw, m2, m3, m1 123 SBUTTERFLY bw, m4, m5, m3 124 movq [%9+0x10], m1 125 SBUTTERFLY bw, m6, %8, m5 126 SBUTTERFLY wd, m0, m2, m1 127 SBUTTERFLY wd, m4, m6, m2 128 punpckhdq m0, m4 129 movq [%9+0x00], m0 130 SBUTTERFLY wd, m7, [%9+0x10], m6 131 SBUTTERFLY wd, m3, m5, m4 132 SBUTTERFLY dq, m7, m3, m0 133 SBUTTERFLY dq, m1, m2, m5 134 punpckldq m6, m4 135 movq [%9+0x10], m1 136 movq [%9+0x20], m5 137 movq [%9+0x30], m7 138 movq [%9+0x40], m0 139 movq [%9+0x50], m6 140%endmacro 141 142; in: 8 rows of 8 in %1..%8 143; out: 8 rows of 8 in %9..%16 144%macro TRANSPOSE8x8_MEM 16 145 movq m0, %1 146 movq m1, %2 147 movq m2, %3 148 movq m3, %4 149 movq m4, %5 150 movq m5, %6 151 movq m6, %7 152 SBUTTERFLY bw, m0, m1, m7 153 SBUTTERFLY bw, m2, m3, m1 154 SBUTTERFLY bw, m4, m5, m3 155 SBUTTERFLY bw, m6, %8, m5 156 movq %9, m3 157 SBUTTERFLY wd, m0, m2, m3 158 SBUTTERFLY wd, m4, m6, m2 159 SBUTTERFLY wd, m7, m1, m6 160 movq %11, m2 161 movq m2, %9 162 SBUTTERFLY wd, m2, m5, m1 163 SBUTTERFLY dq, m0, m4, m5 164 SBUTTERFLY dq, m7, m2, m4 165 movq %9, m0 166 movq %10, m5 167 movq %13, m7 168 movq %14, m4 169 SBUTTERFLY dq, m3, %11, m0 170 SBUTTERFLY dq, m6, m1, m5 171 movq %11, m3 172 movq %12, m0 173 movq %15, m6 174 movq %16, m5 175%endmacro 176 177; out: %4 = |%1-%2|>%3 178; clobbers: %5 179%macro DIFF_GT 5 180 mova %5, %2 181 mova %4, %1 182 psubusb %5, %1 183 psubusb %4, %2 184 por %4, %5 185 psubusb %4, %3 186%endmacro 187 188; out: %4 = |%1-%2|>%3 189; clobbers: %5 190%macro DIFF_GT2 5 191 mova %5, %2 192 mova %4, %1 193 psubusb %5, %1 194 psubusb %4, %2 195 psubusb %5, %3 196 psubusb %4, %3 197 pcmpeqb %4, %5 198%endmacro 199 200%macro SPLATW 1 201%ifidn m0, xmm0 202 pshuflw %1, %1, 0 203 punpcklqdq %1, %1 204%else 205 pshufw %1, %1, 0 206%endif 207%endmacro 208 209; in: m0=p1 m1=p0 
m2=q0 m3=q1 %1=alpha-1 %2=beta-1 210; out: m5=beta-1, m7=mask, %3=alpha-1 211; clobbers: m4,m6 212%macro LOAD_MASK 2-3 213 movd m4, %1 214 movd m5, %2 215 SPLATW m4 216 SPLATW m5 217 packuswb m4, m4 ; 16x alpha-1 218 packuswb m5, m5 ; 16x beta-1 219%if %0>2 220 mova %3, m4 221%endif 222 DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1 223 DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1 224 por m7, m4 225 DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1 226 por m7, m4 227 pxor m6, m6 228 pcmpeqb m7, m6 229%endmacro 230 231; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask) 232; out: m1=p0' m2=q0' 233; clobbers: m0,3-6 234%macro DEBLOCK_P0_Q0 0 235 mova m5, m1 236 pxor m5, m2 ; p0^q0 237 pand m5, [pb_01 GLOBAL] ; (p0^q0)&1 238 pcmpeqb m4, m4 239 pxor m3, m4 240 pavgb m3, m0 ; (p1 - q1 + 256)>>1 241 pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 242 pxor m4, m1 243 pavgb m4, m2 ; (q0 - p0 + 256)>>1 244 pavgb m3, m5 245 paddusb m3, m4 ; d+128+33 246 mova m6, [pb_a1 GLOBAL] 247 psubusb m6, m3 248 psubusb m3, [pb_a1 GLOBAL] 249 pminub m6, m7 250 pminub m3, m7 251 psubusb m1, m6 252 psubusb m2, m3 253 paddusb m1, m3 254 paddusb m2, m6 255%endmacro 256 257; in: m1=p0 m2=q0 258; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp 259; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) 260; clobbers: q2, tmp, tc0 261%macro LUMA_Q1 6 262 mova %6, m1 263 pavgb %6, m2 264 pavgb %2, %6 ; avg(p2,avg(p0,q0)) 265 pxor %6, %3 266 pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1 267 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 268 mova %6, %1 269 psubusb %6, %5 270 paddusb %5, %1 271 pmaxub %2, %6 272 pminub %2, %5 273 mova %4, %2 274%endmacro 275 276%ifdef ARCH_X86_64 277;----------------------------------------------------------------------------- 278; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) 279;----------------------------------------------------------------------------- 280INIT_XMM 281cglobal x264_deblock_v_luma_sse2, 5,5,10 282 
movd m8, [r4] ; tc0 283 lea r4, [r1*3] 284 dec r2d ; alpha-1 285 neg r4 286 dec r3d ; beta-1 287 add r4, r0 ; pix-3*stride 288 289 mova m0, [r4+r1] ; p1 290 mova m1, [r4+2*r1] ; p0 291 mova m2, [r0] ; q0 292 mova m3, [r0+r1] ; q1 293 LOAD_MASK r2d, r3d 294 295 punpcklbw m8, m8 296 punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] 297 pcmpeqb m9, m9 298 pcmpeqb m9, m8 299 pandn m9, m7 300 pand m8, m9 301 302 movdqa m3, [r4] ; p2 303 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 304 pand m6, m9 305 mova m7, m8 306 psubb m7, m6 307 pand m6, m8 308 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 309 310 movdqa m4, [r0+2*r1] ; q2 311 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 312 pand m6, m9 313 pand m8, m6 314 psubb m7, m6 315 mova m3, [r0+r1] 316 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 317 318 DEBLOCK_P0_Q0 319 mova [r4+2*r1], m1 320 mova [r0], m2 321 RET 322 323;----------------------------------------------------------------------------- 324; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) 325;----------------------------------------------------------------------------- 326INIT_MMX 327cglobal x264_deblock_h_luma_sse2, 5,7 328 movsxd r10, r1d 329 lea r11, [r10+r10*2] 330 lea r6, [r0-4] 331 lea r5, [r0-4+r11] 332%ifdef WIN64 333 sub rsp, 0x98 334 %define pix_tmp rsp+0x30 335%else 336 sub rsp, 0x68 337 %define pix_tmp rsp 338%endif 339 340 ; transpose 6x16 -> tmp space 341 TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp 342 lea r6, [r6+r10*8] 343 lea r5, [r5+r10*8] 344 TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8 345 346 ; vertical filter 347 ; alpha, beta, tc0 are still in r2d, r3d, r4 348 ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them 349 lea r0, [pix_tmp+0x30] 350 mov r1d, 0x10 351%ifdef WIN64 352 mov [rsp+0x20], r4 353%endif 354 call x264_deblock_v_luma_sse2 355 356 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) 357 
add r6, 2 358 add r5, 2 359 movq m0, [pix_tmp+0x18] 360 movq m1, [pix_tmp+0x28] 361 movq m2, [pix_tmp+0x38] 362 movq m3, [pix_tmp+0x48] 363 TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) 364 365 shl r10, 3 366 sub r6, r10 367 sub r5, r10 368 shr r10, 3 369 movq m0, [pix_tmp+0x10] 370 movq m1, [pix_tmp+0x20] 371 movq m2, [pix_tmp+0x30] 372 movq m3, [pix_tmp+0x40] 373 TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) 374 375%ifdef WIN64 376 add rsp, 0x98 377%else 378 add rsp, 0x68 379%endif 380 RET 381 382%else 383 384%macro DEBLOCK_LUMA 3 385;----------------------------------------------------------------------------- 386; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) 387;----------------------------------------------------------------------------- 388cglobal x264_deblock_%2_luma_%1, 5,5 389 lea r4, [r1*3] 390 dec r2 ; alpha-1 391 neg r4 392 dec r3 ; beta-1 393 add r4, r0 ; pix-3*stride 394 %assign pad 2*%3+12-(stack_offset&15) 395 SUB esp, pad 396 397 mova m0, [r4+r1] ; p1 398 mova m1, [r4+2*r1] ; p0 399 mova m2, [r0] ; q0 400 mova m3, [r0+r1] ; q1 401 LOAD_MASK r2, r3 402 403 mov r3, r4mp 404 movd m4, [r3] ; tc0 405 punpcklbw m4, m4 406 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] 407 mova [esp+%3], m4 ; tc 408 pcmpeqb m3, m3 409 pcmpgtb m4, m3 410 pand m4, m7 411 mova [esp], m4 ; mask 412 413 mova m3, [r4] ; p2 414 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 415 pand m6, m4 416 pand m4, [esp+%3] ; tc 417 mova m7, m4 418 psubb m7, m6 419 pand m6, m4 420 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 421 422 mova m4, [r0+2*r1] ; q2 423 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 424 mova m5, [esp] ; mask 425 pand m6, m5 426 mova m5, [esp+%3] ; tc 427 pand m5, m6 428 psubb m7, m6 429 mova m3, [r0+r1] 430 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 431 432 DEBLOCK_P0_Q0 433 mova [r4+2*r1], m1 434 mova [r0], m2 435 ADD esp, pad 436 RET 437 
438;----------------------------------------------------------------------------- 439; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) 440;----------------------------------------------------------------------------- 441INIT_MMX 442cglobal x264_deblock_h_luma_%1, 0,5 443 mov r0, r0mp 444 mov r3, r1m 445 lea r4, [r3*3] 446 sub r0, 4 447 lea r1, [r0+r4] 448 %assign pad 0x78-(stack_offset&15) 449 SUB esp, pad 450%define pix_tmp esp+12 451 452 ; transpose 6x16 -> tmp space 453 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp 454 lea r0, [r0+r3*8] 455 lea r1, [r1+r3*8] 456 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8 457 458 ; vertical filter 459 lea r0, [pix_tmp+0x30] 460 PUSH dword r4m 461 PUSH dword r3m 462 PUSH dword r2m 463 PUSH dword 16 464 PUSH dword r0 465 call x264_deblock_%2_luma_%1 466%ifidn %2, v8 467 add dword [esp ], 8 ; pix_tmp+0x38 468 add dword [esp+16], 2 ; tc0+2 469 call x264_deblock_%2_luma_%1 470%endif 471 ADD esp, 20 472 473 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) 474 mov r0, r0mp 475 sub r0, 2 476 lea r1, [r0+r4] 477 478 movq m0, [pix_tmp+0x10] 479 movq m1, [pix_tmp+0x20] 480 movq m2, [pix_tmp+0x30] 481 movq m3, [pix_tmp+0x40] 482 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) 483 484 lea r0, [r0+r3*8] 485 lea r1, [r1+r3*8] 486 movq m0, [pix_tmp+0x18] 487 movq m1, [pix_tmp+0x28] 488 movq m2, [pix_tmp+0x38] 489 movq m3, [pix_tmp+0x48] 490 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) 491 492 ADD esp, pad 493 RET 494%endmacro ; DEBLOCK_LUMA 495 496INIT_XMM 497DEBLOCK_LUMA sse2, v, 16 498 499%endif ; ARCH 500 501 502 503%macro LUMA_INTRA_P012 4 ; p0..p3 in memory 504 mova t0, p2 505 mova t1, p0 506 pavgb t0, p1 507 pavgb t1, q0 508 pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 509 mova t5, t1 510 mova t2, p2 511 mova t3, p0 512 paddb t2, p1 513 paddb t3, q0 514 paddb t2, t3 515 mova t3, t2 516 mova t4, t2 517 psrlw t2, 1 518 pavgb t2, mpb_00 519 
pxor t2, t0 520 pand t2, mpb_01 521 psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; 522 523 mova t1, p2 524 mova t2, p2 525 pavgb t1, q1 526 psubb t2, q1 527 paddb t3, t3 528 psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 529 pand t2, mpb_01 530 psubb t1, t2 531 pavgb t1, p1 532 pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 533 psrlw t3, 2 534 pavgb t3, mpb_00 535 pxor t3, t1 536 pand t3, mpb_01 537 psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 538 539 mova t3, p0 540 mova t2, p0 541 pxor t3, q1 542 pavgb t2, q1 543 pand t3, mpb_01 544 psubb t2, t3 545 pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 546 547 pxor t1, t2 548 pxor t2, p0 549 pand t1, mask1p 550 pand t2, mask0 551 pxor t1, t2 552 pxor t1, p0 553 mova %1, t1 ; store p0 554 555 mova t1, %4 ; p3 556 mova t2, t1 557 pavgb t1, p2 558 paddb t2, p2 559 pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 560 paddb t2, t2 561 paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 562 psrlw t2, 2 563 pavgb t2, mpb_00 564 pxor t2, t1 565 pand t2, mpb_01 566 psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8 567 568 pxor t0, p1 569 pxor t1, p2 570 pand t0, mask1p 571 pand t1, mask1p 572 pxor t0, p1 573 pxor t1, p2 574 mova %2, t0 ; store p1 575 mova %3, t1 ; store p2 576%endmacro 577 578%macro LUMA_INTRA_SWAP_PQ 0 579 %define q1 m0 580 %define q0 m1 581 %define p0 m2 582 %define p1 m3 583 %define p2 q2 584 %define mask1p mask1q 585%endmacro 586 587%macro DEBLOCK_LUMA_INTRA 2 588 %define p1 m0 589 %define p0 m1 590 %define q0 m2 591 %define q1 m3 592 %define t0 m4 593 %define t1 m5 594 %define t2 m6 595 %define t3 m7 596%ifdef ARCH_X86_64 597 %define p2 m8 598 %define q2 m9 599 %define t4 m10 600 %define t5 m11 601 %define mask0 m12 602 %define mask1p m13 603 %define mask1q [rsp-24] 604 %define mpb_00 m14 605 %define mpb_01 m15 606%else 607 %define spill(x) [esp+16*x+((stack_offset+4)&15)] 608 %define p2 [r4+r1] 609 %define q2 [r0+2*r1] 610 %define t4 spill(0) 611 %define t5 spill(1) 612 %define mask0 spill(2) 613 %define mask1p spill(3) 614 %define mask1q 
spill(4) 615 %define mpb_00 [pb_00 GLOBAL] 616 %define mpb_01 [pb_01 GLOBAL] 617%endif 618 619;----------------------------------------------------------------------------- 620; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) 621;----------------------------------------------------------------------------- 622cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 623%ifndef ARCH_X86_64 624 sub esp, 0x60 625%endif 626 lea r4, [r1*4] 627 lea r5, [r1*3] ; 3*stride 628 dec r2d ; alpha-1 629 jl .end 630 neg r4 631 dec r3d ; beta-1 632 jl .end 633 add r4, r0 ; pix-4*stride 634 mova p1, [r4+2*r1] 635 mova p0, [r4+r5] 636 mova q0, [r0] 637 mova q1, [r0+r1] 638%ifdef ARCH_X86_64 639 pxor mpb_00, mpb_00 640 mova mpb_01, [pb_01 GLOBAL] 641 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 642 SWAP 7, 12 ; m12=mask0 643 pavgb t5, mpb_00 644 pavgb t5, mpb_01 ; alpha/4+1 645 movdqa p2, [r4+r1] 646 movdqa q2, [r0+2*r1] 647 DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 648 DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1 649 DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1 650 pand t0, mask0 651 pand t4, t0 652 pand t2, t0 653 mova mask1q, t4 654 mova mask1p, t2 655%else 656 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 657 mova m4, t5 658 mova mask0, m7 659 pavgb m4, [pb_00 GLOBAL] 660 pavgb m4, [pb_01 GLOBAL] ; alpha/4+1 661 DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 662 pand m6, mask0 663 DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 664 pand m4, m6 665 mova mask1p, m4 666 DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1 667 pand m4, m6 668 mova mask1q, m4 669%endif 670 LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4] 671 LUMA_INTRA_SWAP_PQ 672 LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] 673.end: 674%ifndef ARCH_X86_64 675 add esp, 0x60 676%endif 677 RET 678 679INIT_MMX 680%ifdef ARCH_X86_64 681;----------------------------------------------------------------------------- 682; void 
x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) 683;----------------------------------------------------------------------------- 684cglobal x264_deblock_h_luma_intra_%1, 4,7 685 movsxd r10, r1d 686 lea r11, [r10*3] 687 lea r6, [r0-4] 688 lea r5, [r0-4+r11] 689 sub rsp, 0x88 690 %define pix_tmp rsp 691 692 ; transpose 8x16 -> tmp space 693 TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) 694 lea r6, [r6+r10*8] 695 lea r5, [r5+r10*8] 696 TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) 697 698 lea r0, [pix_tmp+0x40] 699 mov r1, 0x10 700 call x264_deblock_v_luma_intra_%1 701 702 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) 703 lea r5, [r6+r11] 704 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) 705 shl r10, 3 706 sub r6, r10 707 sub r5, r10 708 shr r10, 3 709 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) 710 add rsp, 0x88 711 RET 712%else 713cglobal x264_deblock_h_luma_intra_%1, 2,4 714 lea r3, [r1*3] 715 sub r0, 4 716 lea r2, [r0+r3] 717%assign pad 0x8c-(stack_offset&15) 718 SUB rsp, pad 719 %define pix_tmp rsp 720 721 ; transpose 8x16 -> tmp space 722 TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) 723 lea r0, [r0+r1*8] 724 lea r2, [r2+r1*8] 725 TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) 726 727 lea r0, [pix_tmp+0x40] 728 PUSH dword r3m 729 PUSH dword r2m 730 PUSH dword 16 731 PUSH r0 732 call x264_deblock_%2_luma_intra_%1 733%ifidn %2, v8 734 add dword [rsp], 8 ; pix_tmp+8 735 call x264_deblock_%2_luma_intra_%1 736%endif 737 ADD esp, 16 738 739 mov r1, r1m 740 mov r0, r0mp 741 lea r3, [r1*3] 742 sub r0, 4 743 lea r2, [r0+r3] 744 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) 745 
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) 746 lea r0, [r0+r1*8] 747 lea r2, [r2+r1*8] 748 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) 749 ADD rsp, pad 750 RET 751%endif ; ARCH_X86_64 752%endmacro ; DEBLOCK_LUMA_INTRA 753 754INIT_XMM 755DEBLOCK_LUMA_INTRA sse2, v 756%ifndef ARCH_X86_64 757INIT_MMX 758DEBLOCK_LUMA_INTRA mmxext, v8 759%endif 760