;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; Maximum sample value for 10-bit content; used as the upper clip bound.
pw_pixel_max: times 8 dw ((1 << 10)-1)
; Rounding constant added before the final >>6 of the inverse transform.
pd_32:        times 4 dd 32
; scan8 table: offset of each 4x4 block inside the nnzc[6*8] array
; (same layout as the scan8 table used by the C decoder — luma first,
; then the two chroma planes; presumably kept in sync with h264 C code —
; TODO confirm against the C table if modifying).
scan8_mem: db  4+ 1*8,  5+ 1*8,  4+ 2*8,  5+ 2*8
           db  6+ 1*8,  7+ 1*8,  6+ 2*8,  7+ 2*8
           db  4+ 3*8,  5+ 3*8,  4+ 4*8,  5+ 4*8
           db  6+ 3*8,  7+ 3*8,  6+ 4*8,  7+ 4*8
           db  4+ 6*8,  5+ 6*8,  4+ 7*8,  5+ 7*8
           db  6+ 6*8,  7+ 6*8,  6+ 7*8,  7+ 7*8
           db  4+ 8*8,  5+ 8*8,  4+ 9*8,  5+ 9*8
           db  6+ 8*8,  7+ 8*8,  6+ 9*8,  7+ 9*8
           db  4+11*8,  5+11*8,  4+12*8,  5+12*8
           db  6+11*8,  7+11*8,  6+12*8,  7+12*8
           db  4+13*8,  5+13*8,  4+14*8,  5+14*8
           db  6+13*8,  7+13*8,  6+14*8,  7+14*8

; In PIC builds the table address is expected in r11 instead of being
; referenced absolutely.
%ifdef PIC
%define scan8 r11
%else
%define scan8 scan8_mem
%endif

SECTION .text
;-----------------------------------------------------------------------------
; void h264_idct_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
; Finish two transformed rows and add them to the destination:
; %1,%2 = two dword rows; %3 = scratch reg; %4 = zero reg;
; %5 = dst pointer; %6 = stride (bytes).
; Shifts both rows right by 6 (rounding bias pd_32 was added earlier),
; packs dwords to words, adds two 8-byte rows of dst, clips to
; [0, pw_pixel_max] and stores back.
%macro STORE_DIFFx2 6
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    movq        %3, [%5]
    movhps      %3, [%5+%6]
    paddsw      %1, %3
    CLIPW       %1, %4, [pw_pixel_max]
    movq      [%5], %1
    movhps [%5+%6], %1
%endmacro

; Same idea for one full 16-byte row:
; %1,%2 = dword rows; %3 = zero reg; %4 = max reg; %5 = dst address.
%macro STORE_DIFF16 5
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    paddsw      %1, [%5]
    CLIPW       %1, %3, %4
    mova      [%5], %1
%endmacro

; Full 4x4 iDCT + add for one block.
; %1 = dst (modified: advanced by 2 rows), %2 = coeffs, %3 = stride
%macro IDCT4_ADD_10 3
    mova     m0, [%2+ 0]
    mova     m1, [%2+16]
    mova     m2, [%2+32]
    mova     m3, [%2+48]
    IDCT4_1D d,0,1,2,3,4,5          ; vertical pass
    TRANSPOSE4x4D 0,1,2,3,4
    paddd    m0, [pd_32]            ; rounding for the final >>6
    IDCT4_1D d,0,1,2,3,4,5          ; horizontal pass
    pxor     m5, m5
    STORE_DIFFx2 m0, m1, m4, m5, %1, %3
    lea      %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro

; Public entry point: r0 = dst, r1 = block, r2 = stride.
%macro IDCT_ADD_10 1
cglobal h264_idct_add_10_%1, 3,3
    IDCT4_ADD_10 r0, r1, r2
    RET
%endmacro

INIT_XMM
IDCT_ADD_10 sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT_ADD_10 avx
%endif

;-----------------------------------------------------------------------------
; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block,
;                 int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
; Internal helper (plain label, called with `call`):
; in: r5 = block offset, r0 = dst base, r2 = coeffs, r3 = stride.
; Clobbers r5 and m0-m5.
%macro ADD4x4IDCT 1
add4x4_idct_%1:
    add      r5, r0                 ; r5 = dst + block_offset
    mova     m0, [r2+ 0]
    mova     m1, [r2+16]
    mova     m2, [r2+32]
    mova     m3, [r2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd    m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor     m5, m5
    STORE_DIFFx2 m0, m1, m4, m5, r5, r3
    lea      r5, [r5+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, r5, r3
    ret
%endmacro

INIT_XMM
ALIGN 16
ADD4x4IDCT sse2
%ifdef HAVE_AVX
INIT_AVX
ALIGN 16
ADD4x4IDCT avx
%endif

; One 4x4 block of the add16 loop: skip if the nnz flag for this block
; (scan8 offset %3) is zero, else fetch block_offset[%2] and transform.
%macro ADD16_OP 3
    cmp  byte [r4+%3], 0
    jz .skipblock%2
    mov  r5d, [r1+%2*4]
    call add4x4_idct_%1
.skipblock%2:
%if %2<15
    add  r2, 64                     ; advance to next block's coeffs
%endif
%endmacro

%macro IDCT_ADD16_10 1
cglobal h264_idct_add16_10_%1, 5,6
    ADD16_OP %1, 0, 4+1*8
    ADD16_OP %1, 1, 5+1*8
    ADD16_OP %1, 2, 4+2*8
    ADD16_OP %1, 3, 5+2*8
    ADD16_OP %1, 4, 6+1*8
    ADD16_OP %1, 5, 7+1*8
    ADD16_OP %1, 6, 6+2*8
    ADD16_OP %1, 7, 7+2*8
    ADD16_OP %1, 8, 4+3*8
    ADD16_OP %1, 9, 5+3*8
    ADD16_OP %1, 10, 4+4*8
    ADD16_OP %1, 11, 5+4*8
    ADD16_OP %1, 12, 6+3*8
    ADD16_OP %1, 13, 7+3*8
    ADD16_OP %1, 14, 6+4*8
    ADD16_OP %1, 15, 7+4*8
    REP_RET
%endmacro

INIT_XMM
IDCT_ADD16_10 sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT_ADD16_10 avx
%endif

;-----------------------------------------------------------------------------
; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
; Add the broadcast DC value in m0 to 4 rows at %1.
; %2 = stride, %3 = stride*3; expects m6 = pw_pixel_max.
; Clobbers m1-m5.
%macro IDCT_DC_ADD_OP_10 3
    pxor m5, m5
%if avx_enabled
    ; AVX 3-operand form avoids the separate mova+paddw pairs.
    paddw m1, m0, [%1+0   ]
    paddw m2, m0, [%1+%2  ]
    paddw m3, m0, [%1+%2*2]
    paddw m4, m0, [%1+%3  ]
%else
    mova  m1, [%1+0   ]
    mova  m2, [%1+%2  ]
    mova  m3, [%1+%2*2]
    mova  m4, [%1+%3  ]
    paddw m1, m0
    paddw m2, m0
    paddw m3, m0
    paddw m4, m0
%endif
    CLIPW m1, m5, m6
    CLIPW m2, m5, m6
    CLIPW m3, m5, m6
    CLIPW m4, m5, m6
    mova [%1+0   ], m1
    mova [%1+%2  ], m2
    mova [%1+%2*2], m3
    mova [%1+%3  ], m4
%endmacro

INIT_MMX
cglobal h264_idct_dc_add_10_mmx2,3,3
    movd   m0, [r1]
    paddd  m0, [pd_32]              ; (dc + 32) >> 6
    psrad  m0, 6
    lea    r1, [r2*3]
    pshufw m0, m0, 0                ; broadcast DC to all 4 words
    mova   m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET

;-----------------------------------------------------------------------------
; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 1
cglobal h264_idct8_dc_add_10_%1,3,3,7
    mov  r1d, [r1]
    add  r1, 32                     ; (dc + 32) >> 6 in GPRs
    sar  r1, 6
    movd m0, r1d
    lea  r1, [r2*3]
    SPLATW m0, m0, 0
    mova m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1    ; rows 0-3
    lea  r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1    ; rows 4-7
    RET
%endmacro

INIT_XMM
IDCT8_DC_ADD sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT8_DC_ADD avx
%endif

;-----------------------------------------------------------------------------
; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block,
;                      int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
; AC path for a pair of blocks %2 and %2+1: run the full 4x4 iDCT on both,
; then jump back into the main sequence.
%macro AC 2
.ac%2
    mov  r5d, [r1+(%2+0)*4]
    call add4x4_idct_%1
    mov  r5d, [r1+(%2+1)*4]
    add  r2, 64
    call add4x4_idct_%1
    add  r2, 64
    jmp .skipadd%2
%endmacro

%assign last_block 16
; Handle blocks %2 and %2+1 together: the word compare tests the two
; adjacent nnz bytes at scan8 offset %3 at once. If either has AC
; coefficients, take the .ac path; otherwise, if either block has a
; nonzero DC (first dword of its coeff block), do the cheap DC-only add.
%macro ADD16_OP_INTRA 3
    cmp  word [r4+%3], 0
    jnz .ac%2
    mov  r5d, [r2+ 0]
    or   r5d, [r2+64]
    jz .skipblock%2
    mov  r5d, [r1+(%2+0)*4]
    call idct_dc_add_%1
.skipblock%2:
%if %2<last_block-2
    add  r2, 128                    ; skip coeffs of both blocks
%endif
.skipadd%2:
%endmacro

%macro IDCT_ADD16INTRA_10 1
; Internal helper: DC-only add for two adjacent 4x4 blocks.
; in: r5 = block offset, r0 = dst, r2 = coeffs, r3 = stride; clobbers r6.
idct_dc_add_%1:
    add    r5, r0
    movq   m0, [r2+ 0]              ; DC of block n in low half ...
    movhps m0, [r2+64]              ; ... DC of block n+1 in high half
    paddd  m0, [pd_32]
    psrad  m0, 6
    pshufhw m0, m0, 0               ; broadcast each DC within its half
    pshuflw m0, m0, 0
    lea    r6, [r3*3]
    mova   m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r5, r3, r6
    ret

cglobal h264_idct_add16intra_10_%1,5,7,8
    ADD16_OP_INTRA %1, 0, 4+1*8
    ADD16_OP_INTRA %1, 2, 4+2*8
    ADD16_OP_INTRA %1, 4, 6+1*8
    ADD16_OP_INTRA %1, 6, 6+2*8
    ADD16_OP_INTRA %1, 8, 4+3*8
    ADD16_OP_INTRA %1, 10, 4+4*8
    ADD16_OP_INTRA %1, 12, 6+3*8
    ADD16_OP_INTRA %1, 14, 6+4*8
    REP_RET
    ; Out-of-line AC tails, one per ADD16_OP_INTRA above.
    AC %1, 8
    AC %1, 10
    AC %1, 12
    AC %1, 14
    AC %1, 0
    AC %1, 2
    AC %1, 4
    AC %1, 6
%endmacro

INIT_XMM
IDCT_ADD16INTRA_10 sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT_ADD16INTRA_10 avx
%endif

%assign last_block 36
;-----------------------------------------------------------------------------
; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block,
;                int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
; Chroma add: dst is an array of two plane pointers; coeff blocks 16-19
; (Cb) and 32-35 (Cr) live at fixed offsets inside *block.
%macro IDCT_ADD8 1
cglobal h264_idct_add8_10_%1,5,7
%ifdef ARCH_X86_64
    mov      r10, r0                ; keep dst[] base; r0 is repointed below
%endif
    add      r2, 1024               ; coeffs of block 16 (16*64 bytes in)
    mov      r0, [r0]               ; r0 = dst[0] (first chroma plane)
    ADD16_OP_INTRA %1, 16, 4+ 6*8
    ADD16_OP_INTRA %1, 18, 4+ 7*8
    add      r2, 1024-128*2         ; jump to coeffs of block 32
%ifdef ARCH_X86_64
    mov      r0, [r10+gprsize]      ; r0 = dst[1] (second chroma plane)
%else
    mov      r0, r0m                ; reload dst[] base from the stack arg
    mov      r0, [r0+gprsize]
%endif
    ADD16_OP_INTRA %1, 32, 4+11*8
    ADD16_OP_INTRA %1, 34, 4+12*8
    REP_RET
    ; Out-of-line AC tails for the four ADD16_OP_INTRA uses above.
    AC %1, 16
    AC %1, 18
    AC %1, 32
    AC %1, 34

%endmacro ; IDCT_ADD8

INIT_XMM
IDCT_ADD8 sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT_ADD8 avx
%endif

;-----------------------------------------------------------------------------
; void h264_idct8_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
; One 8-point 1-D iDCT pass over dword coefficients.
; Rows 1,2,3,5,6,7 are expected in m1-m3,m5-m7 (see IDCT8_1D_FULL);
; rows 0 and 4 are loaded from memory operands %1 and %2 at the end so
; all 8 never need to be live at once. The register/SWAP choreography is
; order-sensitive — do not reorder.
%macro IDCT8_1D 2
    SWAP 0, 1
    psrad    m4, m5, 1
    psrad    m1, m0, 1
    paddd    m4, m5
    paddd    m1, m0
    paddd    m4, m7
    paddd    m1, m5
    psubd    m4, m0
    paddd    m1, m3

    psubd    m0, m3
    psubd    m5, m3
    paddd    m0, m7
    psubd    m5, m7
    psrad    m3, 1
    psrad    m7, 1
    psubd    m0, m3
    psubd    m5, m7

    SWAP 1, 7
    psrad    m1, m7, 2
    psrad    m3, m4, 2
    paddd    m3, m0
    psrad    m0, 2
    paddd    m1, m5
    psrad    m5, 2
    psubd    m0, m4
    psubd    m7, m5

    SWAP 5, 6
    psrad    m4, m2, 1
    psrad    m6, m5, 1
    psubd    m4, m5
    paddd    m6, m2

    mova     m2, %1                 ; row 0
    mova     m5, %2                 ; row 4
    SUMSUB_BA d, 5, 2
    SUMSUB_BA d, 6, 5
    SUMSUB_BA d, 4, 2
    SUMSUB_BA d, 7, 6
    SUMSUB_BA d, 0, 4
    SUMSUB_BA d, 3, 2
    SUMSUB_BA d, 1, 5
    SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

; Load rows 1,2,3,5,6,7 of an 8x4 dword tile at %1 and run IDCT8_1D
; (rows 0/4 are passed as memory operands).
%macro IDCT8_1D_FULL 1
    mova     m7, [%1+112*2]
    mova     m6, [%1+ 96*2]
    mova     m5, [%1+ 80*2]
    mova     m3, [%1+ 48*2]
    mova     m2, [%1+ 32*2]
    mova     m1, [%1+ 16*2]
    IDCT8_1D [%1], [%1+ 64*2]
%endmacro

; First (vertical) pass + transpose into the stack scratch buffer.
; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_SSE_START 2
    IDCT8_1D_FULL %1
%ifdef ARCH_X86_64
    ; With 16 XMM regs only two 4x4 quadrants are spilled here; the rest
    ; stay in m8-m15 (see IDCT8_ADD below).
    TRANSPOSE4x4D 0,1,2,3,8
    mova [%2    ], m0
    TRANSPOSE4x4D 4,5,6,7,8
    mova [%2+8*2], m4
%else
    mova [%1], m7                   ; spill m7; only 8 regs available
    TRANSPOSE4x4D 0,1,2,3,7
    mova m7, [%1]
    mova [%2     ], m0
    mova [%2+16*2], m1
    mova [%2+32*2], m2
    mova [%2+48*2], m3
    TRANSPOSE4x4D 4,5,6,7,3
    mova [%2+ 8*2], m4
    mova [%2+24*2], m5
    mova [%2+40*2], m6
    mova [%2+56*2], m7
%endif
%endmacro

; Second (horizontal) pass + add to dst (x86_32 path).
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE_END 3
    IDCT8_1D_FULL %2
    mova  [%2     ], m6             ; spill m6/m7: STORE_DIFFx2 needs scratch
    mova  [%2+16*2], m7

    pxor     m7, m7
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
    lea      %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m6, m7, %1, %3
    mova     m0, [%2     ]
    mova     m1, [%2+16*2]
    lea      %1, [%1+%3*2]
    STORE_DIFFx2 m4, m5, m6, m7, %1, %3
    lea      %1, [%1+%3*2]
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro

%macro IDCT8_ADD 1
cglobal h264_idct8_add_10_%1, 3,4,16
%ifndef UNIX64
    ; Non-UNIX64 ABIs: realign the stack, then fall into the shared body.
    %assign pad 16-gprsize-(stack_offset&15)
    sub      rsp, pad
    call h264_idct8_add1_10_%1
    add      rsp, pad
    RET
%endif

ALIGN 16
; TODO: does not need to use stack
; Shared body (also called by h264_idct8_add4): r0=dst, r1=block, r2=stride.
h264_idct8_add1_10_%1:
%assign pad 256+16-gprsize
    sub      rsp, pad
    add      dword [r1], 32         ; fold the >>6 rounding into DC

%ifdef ARCH_X86_64
    IDCT8_ADD_SSE_START r1, rsp
    ; Park the first half's rows in m9-m15 while the second half runs.
    SWAP  1,  9
    SWAP  2, 10
    SWAP  3, 11
    SWAP  5, 13
    SWAP  6, 14
    SWAP  7, 15
    IDCT8_ADD_SSE_START r1+16, rsp+128
    PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
    IDCT8_1D [rsp], [rsp+128]
    SWAP  0,  8
    SWAP  1,  9
    SWAP  2, 10
    SWAP  3, 11
    SWAP  4, 12
    SWAP  5, 13
    SWAP  6, 14
    SWAP  7, 15
    IDCT8_1D [rsp+16], [rsp+144]
    ; Row 0 stored inline since m0 is needed as the zero reg afterwards.
    psrad    m8, 6
    psrad    m0, 6
    packssdw m8, m0
    paddsw   m8, [r0]
    pxor     m0, m0
    CLIPW    m8, m0, [pw_pixel_max]
    mova   [r0], m8
    mova     m8, [pw_pixel_max]
    STORE_DIFF16 m9,  m1, m0, m8, r0+r2
    lea      r0, [r0+r2*2]
    STORE_DIFF16 m10, m2, m0, m8, r0
    STORE_DIFF16 m11, m3, m0, m8, r0+r2
    lea      r0, [r0+r2*2]
    STORE_DIFF16 m12, m4, m0, m8, r0
    STORE_DIFF16 m13, m5, m0, m8, r0+r2
    lea      r0, [r0+r2*2]
    STORE_DIFF16 m14, m6, m0, m8, r0
    STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
    ; x86_32: process the 8x8 as two 8x4 halves through the stack buffer.
    IDCT8_ADD_SSE_START r1,    rsp
    IDCT8_ADD_SSE_START r1+16, rsp+128
    lea      r3, [r0+8]
    IDCT8_ADD_SSE_END r0, rsp,    r2
    IDCT8_ADD_SSE_END r3, rsp+16, r2
%endif ; ARCH_X86_64

    add      rsp, pad
    ret
%endmacro

INIT_XMM
IDCT8_ADD sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT8_ADD avx
%endif

;-----------------------------------------------------------------------------
; h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block,
;                 int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
; One 8x8 block: skip if its nnz flag (scan8 offset %3) is zero, else
; compute dst = base + block_offset[%2] and call the shared 8x8 body.
%macro IDCT8_ADD4_OP 3
    cmp       byte [r4+%3], 0
    jz .skipblock%2
    mov      r0d, [r6+%2*4]
    add       r0, r5
    call h264_idct8_add1_10_%1
.skipblock%2:
%if %2<12
    add       r1, 256               ; next 8x8 block's coefficients
%endif
%endmacro

%macro IDCT8_ADD4 1
cglobal h264_idct8_add4_10_%1, 0,7,16
    ; Args are loaded manually (0 declared) so registers can be laid out
    ; to match what h264_idct8_add1 expects: r0=dst, r1=block, r2=stride.
    %assign pad 16-gprsize-(stack_offset&15)
    SUB      rsp, pad
    mov      r5, r0mp               ; r5 = dst base
    mov      r6, r1mp               ; r6 = block_offset
    mov      r1, r2mp               ; r1 = block
    mov      r2d, r3m               ; r2 = stride
    movifnidn r4, r4mp              ; r4 = nnzc
    IDCT8_ADD4_OP %1,  0, 4+1*8
    IDCT8_ADD4_OP %1,  4, 6+1*8
    IDCT8_ADD4_OP %1,  8, 4+3*8
    IDCT8_ADD4_OP %1, 12, 6+3*8
    ADD      rsp, pad
    RET
%endmacro ; IDCT8_ADD4

INIT_XMM
IDCT8_ADD4 sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT8_ADD4 avx
%endif