;*****************************************************************************
;* MMX/SSE2-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21;***************************************************************************** 22 23%include "x86inc.asm" 24 25SECTION_RODATA 26pb_00: times 16 db 0x00 27pb_01: times 16 db 0x01 28pb_03: times 16 db 0x03 29pb_a1: times 16 db 0xa1 30 31SECTION .text 32 33; expands to [base],...,[base+7*stride] 34%define PASS8ROWS(base, base3, stride, stride3) \ 35 [base], [base+stride], [base+stride*2], [base3], \ 36 [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] 37 38; in: 8 rows of 4 bytes in %1..%8 39; out: 4 rows of 8 bytes in m0..m3 40%macro TRANSPOSE4x8_LOAD 8 41 movd m0, %1 42 movd m2, %2 43 movd m1, %3 44 movd m3, %4 45 punpcklbw m0, m2 46 punpcklbw m1, m3 47 movq m2, m0 48 punpcklwd m0, m1 49 punpckhwd m2, m1 50 51 movd m4, %5 52 movd m6, %6 53 movd m5, %7 54 movd m7, %8 55 punpcklbw m4, m6 56 punpcklbw m5, m7 57 movq m6, m4 58 punpcklwd m4, m5 59 punpckhwd m6, m5 60 61 movq m1, m0 62 movq m3, m2 63 punpckldq m0, m4 64 punpckhdq m1, m4 65 punpckldq m2, m6 66 punpckhdq m3, m6 67%endmacro 68 69; in: 4 rows of 8 bytes in m0..m3 70; out: 8 rows of 4 bytes in %1..%8 71%macro TRANSPOSE8x4_STORE 8 72 movq m4, m0 73 movq m5, m1 74 movq m6, m2 75 punpckhdq m4, m4 76 punpckhdq m5, m5 77 punpckhdq m6, m6 78 79 punpcklbw m0, m1 80 punpcklbw m2, m3 81 movq m1, m0 82 punpcklwd m0, m2 83 punpckhwd m1, m2 84 movd %1, m0 85 punpckhdq m0, m0 86 movd %2, m0 87 movd %3, m1 88 punpckhdq m1, m1 89 movd %4, m1 90 91 punpckhdq m3, m3 92 punpcklbw m4, m5 93 punpcklbw m6, m3 94 movq m5, m4 95 punpcklwd m4, m6 96 punpckhwd m5, m6 97 movd %5, m4 98 punpckhdq m4, m4 99 movd %6, m4 100 movd %7, m5 101 punpckhdq m5, m5 102 movd %8, m5 103%endmacro 104 105%macro SBUTTERFLY 4 106 movq %4, %2 107 punpckl%1 %2, %3 108 punpckh%1 %4, %3 109%endmacro 110 111; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 112; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] 113%macro TRANSPOSE6x8_MEM 9 114 movq m0, %1 115 movq m1, %2 116 movq m2, %3 117 movq m3, %4 118 movq m4, %5 119 movq m5, %6 120 movq m6, %7 121 SBUTTERFLY bw, m0, m1, m7 122 SBUTTERFLY bw, m2, m3, m1 123 SBUTTERFLY bw, m4, m5, m3 124 movq [%9+0x10], m1 125 SBUTTERFLY bw, m6, %8, m5 126 SBUTTERFLY wd, m0, m2, m1 127 SBUTTERFLY wd, m4, m6, m2 128 punpckhdq m0, m4 129 movq [%9+0x00], m0 130 SBUTTERFLY wd, m7, [%9+0x10], m6 131 SBUTTERFLY wd, m3, m5, m4 132 SBUTTERFLY dq, m7, m3, m0 133 SBUTTERFLY dq, m1, m2, m5 134 punpckldq m6, m4 135 movq [%9+0x10], m1 136 movq [%9+0x20], m5 137 movq [%9+0x30], m7 138 movq [%9+0x40], m0 139 movq [%9+0x50], m6 140%endmacro 141 142; in: 8 rows of 8 in %1..%8 143; out: 8 rows of 8 in %9..%16 144%macro TRANSPOSE8x8_MEM 16 145 movq m0, %1 146 movq m1, %2 147 movq m2, %3 148 movq m3, %4 149 movq m4, %5 150 movq m5, %6 151 movq m6, %7 152 SBUTTERFLY bw, m0, m1, m7 153 SBUTTERFLY bw, m2, m3, m1 154 SBUTTERFLY bw, m4, m5, m3 155 SBUTTERFLY bw, m6, %8, m5 156 movq %9, m3 157 SBUTTERFLY wd, m0, m2, m3 158 SBUTTERFLY wd, m4, m6, m2 159 SBUTTERFLY wd, m7, m1, m6 160 movq %11, m2 161 movq m2, %9 162 SBUTTERFLY wd, m2, m5, m1 163 SBUTTERFLY dq, m0, m4, m5 164 SBUTTERFLY dq, m7, m2, m4 165 movq %9, m0 166 movq %10, m5 167 movq %13, m7 168 movq %14, m4 169 SBUTTERFLY dq, m3, %11, m0 170 SBUTTERFLY dq, m6, m1, m5 171 movq %11, m3 172 movq %12, m0 173 movq %15, m6 174 movq %16, m5 175%endmacro 176 177; out: %4 = |%1-%2|>%3 178; clobbers: %5 179%macro DIFF_GT 5 180 mova %5, %2 181 mova %4, %1 182 psubusb %5, %1 183 psubusb %4, %2 184 por %4, %5 185 psubusb %4, %3 186%endmacro 187 188; out: %4 = |%1-%2|>%3 189; clobbers: %5 190%macro DIFF_GT2 5 191 mova %5, %2 192 mova %4, %1 193 psubusb %5, %1 194 psubusb %4, %2 195 psubusb %5, %3 196 psubusb %4, %3 197 pcmpeqb %4, %5 198%endmacro 199 200%macro SPLATW 1 201%ifidn m0, xmm0 202 pshuflw %1, %1, 0 203 punpcklqdq %1, %1 204%else 205 pshufw %1, %1, 0 206%endif 207%endmacro 208 209; in: m0=p1 m1=p0 
m2=q0 m3=q1 %1=alpha-1 %2=beta-1 210; out: m5=beta-1, m7=mask, %3=alpha-1 211; clobbers: m4,m6 212%macro LOAD_MASK 2-3 213 movd m4, %1 214 movd m5, %2 215 SPLATW m4 216 SPLATW m5 217 packuswb m4, m4 ; 16x alpha-1 218 packuswb m5, m5 ; 16x beta-1 219%if %0>2 220 mova %3, m4 221%endif 222 DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1 223 DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1 224 por m7, m4 225 DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1 226 por m7, m4 227 pxor m6, m6 228 pcmpeqb m7, m6 229%endmacro 230 231; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask) 232; out: m1=p0' m2=q0' 233; clobbers: m0,3-6 234%macro DEBLOCK_P0_Q0 0 235 mova m5, m1 236 pxor m5, m2 ; p0^q0 237 pand m5, [pb_01 GLOBAL] ; (p0^q0)&1 238 pcmpeqb m4, m4 239 pxor m3, m4 240 pavgb m3, m0 ; (p1 - q1 + 256)>>1 241 pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 242 pxor m4, m1 243 pavgb m4, m2 ; (q0 - p0 + 256)>>1 244 pavgb m3, m5 245 paddusb m3, m4 ; d+128+33 246 mova m6, [pb_a1 GLOBAL] 247 psubusb m6, m3 248 psubusb m3, [pb_a1 GLOBAL] 249 pminub m6, m7 250 pminub m3, m7 251 psubusb m1, m6 252 psubusb m2, m3 253 paddusb m1, m3 254 paddusb m2, m6 255%endmacro 256 257; in: m1=p0 m2=q0 258; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp 259; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) 260; clobbers: q2, tmp, tc0 261%macro LUMA_Q1 6 262 mova %6, m1 263 pavgb %6, m2 264 pavgb %2, %6 ; avg(p2,avg(p0,q0)) 265 pxor %6, %3 266 pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1 267 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 268 mova %6, %1 269 psubusb %6, %5 270 paddusb %5, %1 271 pmaxub %2, %6 272 pminub %2, %5 273 mova %4, %2 274%endmacro 275 276%ifdef ARCH_X86_64 277;----------------------------------------------------------------------------- 278; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) 279;----------------------------------------------------------------------------- 280INIT_XMM 281cglobal x264_deblock_v_luma_sse2 282 movd m8, 
[r4] ; tc0 283 lea r4, [r1*3] 284 dec r2d ; alpha-1 285 neg r4 286 dec r3d ; beta-1 287 add r4, r0 ; pix-3*stride 288 289 mova m0, [r4+r1] ; p1 290 mova m1, [r4+2*r1] ; p0 291 mova m2, [r0] ; q0 292 mova m3, [r0+r1] ; q1 293 LOAD_MASK r2d, r3d 294 295 punpcklbw m8, m8 296 punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] 297 pcmpeqb m9, m9 298 pcmpeqb m9, m8 299 pandn m9, m7 300 pand m8, m9 301 302 movdqa m3, [r4] ; p2 303 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 304 pand m6, m9 305 mova m7, m8 306 psubb m7, m6 307 pand m6, m8 308 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 309 310 movdqa m4, [r0+2*r1] ; q2 311 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 312 pand m6, m9 313 pand m8, m6 314 psubb m7, m6 315 mova m3, [r0+r1] 316 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 317 318 DEBLOCK_P0_Q0 319 mova [r4+2*r1], m1 320 mova [r0], m2 321 ret 322 323;----------------------------------------------------------------------------- 324; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) 325;----------------------------------------------------------------------------- 326INIT_MMX 327cglobal x264_deblock_h_luma_sse2 328 movsxd r10, esi 329 lea r11, [r10+r10*2] 330 lea rax, [r0-4] 331 lea r9, [r0-4+r11] 332 sub rsp, 0x68 333 %define pix_tmp rsp 334 335 ; transpose 6x16 -> tmp space 336 TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp 337 lea rax, [rax+r10*8] 338 lea r9, [r9 +r10*8] 339 TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8 340 341 ; vertical filter 342 ; alpha, beta, tc0 are still in r2d, r3d, r4 343 ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them 344 lea r0, [pix_tmp+0x30] 345 mov esi, 0x10 346 call x264_deblock_v_luma_sse2 347 348 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) 349 add rax, 2 350 add r9, 2 351 movq m0, [pix_tmp+0x18] 352 movq m1, [pix_tmp+0x28] 353 movq m2, [pix_tmp+0x38] 354 movq m3, [pix_tmp+0x48] 
355 TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) 356 357 shl r10, 3 358 sub rax, r10 359 sub r9, r10 360 shr r10, 3 361 movq m0, [pix_tmp+0x10] 362 movq m1, [pix_tmp+0x20] 363 movq m2, [pix_tmp+0x30] 364 movq m3, [pix_tmp+0x40] 365 TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) 366 367 add rsp, 0x68 368 ret 369 370%else 371 372%macro DEBLOCK_LUMA 3 373;----------------------------------------------------------------------------- 374; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) 375;----------------------------------------------------------------------------- 376cglobal x264_deblock_%2_luma_%1, 5,5 377 lea r4, [r1*3] 378 dec r2 ; alpha-1 379 neg r4 380 dec r3 ; beta-1 381 add r4, r0 ; pix-3*stride 382 %assign pad 2*%3+12-(stack_offset&15) 383 SUB esp, pad 384 385 mova m0, [r4+r1] ; p1 386 mova m1, [r4+2*r1] ; p0 387 mova m2, [r0] ; q0 388 mova m3, [r0+r1] ; q1 389 LOAD_MASK r2, r3 390 391 mov r3, r4m 392 movd m4, [r3] ; tc0 393 punpcklbw m4, m4 394 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] 395 mova [esp+%3], m4 ; tc 396 pcmpeqb m3, m3 397 pcmpgtb m4, m3 398 pand m4, m7 399 mova [esp], m4 ; mask 400 401 mova m3, [r4] ; p2 402 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 403 pand m6, m4 404 pand m4, [esp+%3] ; tc 405 mova m7, m4 406 psubb m7, m6 407 pand m6, m4 408 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 409 410 mova m4, [r0+2*r1] ; q2 411 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 412 mova m5, [esp] ; mask 413 pand m6, m5 414 mova m5, [esp+%3] ; tc 415 pand m5, m6 416 psubb m7, m6 417 mova m3, [r0+r1] 418 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 419 420 DEBLOCK_P0_Q0 421 mova [r4+2*r1], m1 422 mova [r0], m2 423 ADD esp, pad 424 RET 425 426;----------------------------------------------------------------------------- 427; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) 
428;----------------------------------------------------------------------------- 429INIT_MMX 430cglobal x264_deblock_h_luma_%1, 0,5 431 mov r0, r0m 432 mov r3, r1m 433 lea r4, [r3*3] 434 sub r0, 4 435 lea r1, [r0+r4] 436 %assign pad 0x78-(stack_offset&15) 437 SUB esp, pad 438%define pix_tmp esp+12 439 440 ; transpose 6x16 -> tmp space 441 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp 442 lea r0, [r0+r3*8] 443 lea r1, [r1+r3*8] 444 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8 445 446 ; vertical filter 447 lea r0, [pix_tmp+0x30] 448 PUSH dword r4m 449 PUSH dword r3m 450 PUSH dword r2m 451 PUSH dword 16 452 PUSH dword r0 453 call x264_deblock_%2_luma_%1 454%ifidn %2, v8 455 add dword [esp ], 8 ; pix_tmp+0x38 456 add dword [esp+16], 2 ; tc0+2 457 call x264_deblock_%2_luma_%1 458%endif 459 ADD esp, 20 460 461 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) 462 mov r0, r0m 463 sub r0, 2 464 lea r1, [r0+r4] 465 466 movq m0, [pix_tmp+0x10] 467 movq m1, [pix_tmp+0x20] 468 movq m2, [pix_tmp+0x30] 469 movq m3, [pix_tmp+0x40] 470 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) 471 472 lea r0, [r0+r3*8] 473 lea r1, [r1+r3*8] 474 movq m0, [pix_tmp+0x18] 475 movq m1, [pix_tmp+0x28] 476 movq m2, [pix_tmp+0x38] 477 movq m3, [pix_tmp+0x48] 478 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) 479 480 ADD esp, pad 481 RET 482%endmacro ; DEBLOCK_LUMA 483 484INIT_XMM 485DEBLOCK_LUMA sse2, v, 16 486 487%endif ; ARCH 488 489 490 491%macro LUMA_INTRA_P012 4 ; p0..p3 in memory 492 mova t0, p2 493 mova t1, p0 494 pavgb t0, p1 495 pavgb t1, q0 496 pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 497 mova t5, t1 498 mova t2, p2 499 mova t3, p0 500 paddb t2, p1 501 paddb t3, q0 502 paddb t2, t3 503 mova t3, t2 504 mova t4, t2 505 psrlw t2, 1 506 pavgb t2, mpb_00 507 pxor t2, t0 508 pand t2, mpb_01 509 psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; 510 511 mova t1, p2 512 mova t2, p2 513 pavgb t1, q1 514 psubb t2, q1 515 paddb t3, t3 516 psubb t3, t2 ; 
p2+2*p1+2*p0+2*q0+q1 517 pand t2, mpb_01 518 psubb t1, t2 519 pavgb t1, p1 520 pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 521 psrlw t3, 2 522 pavgb t3, mpb_00 523 pxor t3, t1 524 pand t3, mpb_01 525 psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 526 527 mova t3, p0 528 mova t2, p0 529 pxor t3, q1 530 pavgb t2, q1 531 pand t3, mpb_01 532 psubb t2, t3 533 pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 534 535 pxor t1, t2 536 pxor t2, p0 537 pand t1, mask1p 538 pand t2, mask0 539 pxor t1, t2 540 pxor t1, p0 541 mova %1, t1 ; store p0 542 543 mova t1, %4 ; p3 544 mova t2, t1 545 pavgb t1, p2 546 paddb t2, p2 547 pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 548 paddb t2, t2 549 paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 550 psrlw t2, 2 551 pavgb t2, mpb_00 552 pxor t2, t1 553 pand t2, mpb_01 554 psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8 555 556 pxor t0, p1 557 pxor t1, p2 558 pand t0, mask1p 559 pand t1, mask1p 560 pxor t0, p1 561 pxor t1, p2 562 mova %2, t0 ; store p1 563 mova %3, t1 ; store p2 564%endmacro 565 566%macro LUMA_INTRA_SWAP_PQ 0 567 %define q1 m0 568 %define q0 m1 569 %define p0 m2 570 %define p1 m3 571 %define p2 q2 572 %define mask1p mask1q 573%endmacro 574 575%macro DEBLOCK_LUMA_INTRA 2 576 %define p1 m0 577 %define p0 m1 578 %define q0 m2 579 %define q1 m3 580 %define t0 m4 581 %define t1 m5 582 %define t2 m6 583 %define t3 m7 584%ifdef ARCH_X86_64 585 %define p2 m8 586 %define q2 m9 587 %define t4 m10 588 %define t5 m11 589 %define mask0 m12 590 %define mask1p m13 591 %define mask1q [rsp-24] 592 %define mpb_00 m14 593 %define mpb_01 m15 594%else 595 %define spill(x) [esp+16*x+((stack_offset+4)&15)] 596 %define p2 [r4+r1] 597 %define q2 [r0+2*r1] 598 %define t4 spill(0) 599 %define t5 spill(1) 600 %define mask0 spill(2) 601 %define mask1p spill(3) 602 %define mask1q spill(4) 603 %define mpb_00 [pb_00 GLOBAL] 604 %define mpb_01 [pb_01 GLOBAL] 605%endif 606 607;----------------------------------------------------------------------------- 608; void 
x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) 609;----------------------------------------------------------------------------- 610cglobal x264_deblock_%2_luma_intra_%1, 4,6 611%ifndef ARCH_X86_64 612 sub esp, 0x60 613%endif 614 lea r4, [r1*4] 615 lea r5, [r1*3] ; 3*stride 616 dec r2d ; alpha-1 617 jl .end 618 neg r4 619 dec r3d ; beta-1 620 jl .end 621 add r4, r0 ; pix-4*stride 622 mova p1, [r4+2*r1] 623 mova p0, [r4+r5] 624 mova q0, [r0] 625 mova q1, [r0+r1] 626%ifdef ARCH_X86_64 627 pxor mpb_00, mpb_00 628 mova mpb_01, [pb_01 GLOBAL] 629 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 630 SWAP 7, 12 ; m12=mask0 631 pavgb t5, mpb_00 632 pavgb t5, mpb_01 ; alpha/4+1 633 movdqa p2, [r4+r1] 634 movdqa q2, [r0+2*r1] 635 DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 636 DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1 637 DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1 638 pand t0, mask0 639 pand t4, t0 640 pand t2, t0 641 mova mask1q, t4 642 mova mask1p, t2 643%else 644 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 645 mova m4, t5 646 mova mask0, m7 647 pavgb m4, [pb_00 GLOBAL] 648 pavgb m4, [pb_01 GLOBAL] ; alpha/4+1 649 DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 650 pand m6, mask0 651 DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 652 pand m4, m6 653 mova mask1p, m4 654 DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1 655 pand m4, m6 656 mova mask1q, m4 657%endif 658 LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4] 659 LUMA_INTRA_SWAP_PQ 660 LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] 661.end: 662%ifndef ARCH_X86_64 663 add esp, 0x60 664%endif 665 RET 666 667INIT_MMX 668%ifdef ARCH_X86_64 669;----------------------------------------------------------------------------- 670; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) 671;----------------------------------------------------------------------------- 672cglobal 
x264_deblock_h_luma_intra_%1 673 movsxd r10, r1d 674 lea r11, [r10*3] 675 lea rax, [r0-4] 676 lea r9, [r0-4+r11] 677 sub rsp, 0x88 678 %define pix_tmp rsp 679 680 ; transpose 8x16 -> tmp space 681 TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) 682 lea rax, [rax+r10*8] 683 lea r9, [r9+r10*8] 684 TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) 685 686 lea r0, [pix_tmp+0x40] 687 mov r1, 0x10 688 call x264_deblock_v_luma_intra_%1 689 690 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) 691 lea r9, [rax+r11] 692 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11) 693 shl r10, 3 694 sub rax, r10 695 sub r9, r10 696 shr r10, 3 697 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11) 698 add rsp, 0x88 699 ret 700%else 701cglobal x264_deblock_h_luma_intra_%1, 2,4 702 lea r3, [r1*3] 703 sub r0, 4 704 lea r2, [r0+r3] 705%assign pad 0x8c-(stack_offset&15) 706 SUB rsp, pad 707 %define pix_tmp rsp 708 709 ; transpose 8x16 -> tmp space 710 TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) 711 lea r0, [r0+r1*8] 712 lea r2, [r2+r1*8] 713 TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) 714 715 lea r0, [pix_tmp+0x40] 716 PUSH dword r3m 717 PUSH dword r2m 718 PUSH dword 16 719 PUSH r0 720 call x264_deblock_%2_luma_intra_%1 721%ifidn %2, v8 722 add dword [rsp], 8 ; pix_tmp+8 723 call x264_deblock_%2_luma_intra_%1 724%endif 725 ADD esp, 16 726 727 mov r1, r1m 728 mov r0, r0m 729 lea r3, [r1*3] 730 sub r0, 4 731 lea r2, [r0+r3] 732 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) 733 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) 734 lea r0, [r0+r1*8] 735 lea r2, [r2+r1*8] 736 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, 
pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) 737 ADD rsp, pad 738 RET 739%endif ; ARCH_X86_64 740%endmacro ; DEBLOCK_LUMA_INTRA 741 742INIT_XMM 743DEBLOCK_LUMA_INTRA sse2, v 744%ifndef ARCH_X86_64 745INIT_MMX 746DEBLOCK_LUMA_INTRA mmxext, v8 747%endif 748