;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Per-pixel clip ceiling for 10-bit content: 2^10 - 1 = 1023, replicated
; across 8 words so a whole XMM register of pixels can be clipped at once.
pw_pixel_max: times 8 dw ((1 << 10)-1)
; Rounding bias added before the final >>6 of the second iDCT pass.
pd_32:        times 4 dd 32

SECTION .text

;-----------------------------------------------------------------------------
; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
; Finish two rows of 4 coefficients each and add them to the destination:
;   %1, %2: registers holding 4 dword coefficients each (consumed; %1 is
;           overwritten with the packed result)
;   %3:     scratch register (receives the two dst rows)
;   %4:     zero register, used as the lower clip bound for CLIPW
;   %5:     dst pointer (byte address of the first row)
;   %6:     stride in bytes
; Coefficients are scaled down by 6 (final iDCT shift), packed to words,
; added to the existing pixels with signed saturation, clipped to
; [0, pw_pixel_max], and written back as two 8-byte rows.
%macro STORE_DIFFx2 6
    psrad     %1, 6
    psrad     %2, 6
    packssdw  %1, %2
    movq      %3, [%5]
    movhps    %3, [%5+%6]
    paddsw    %1, %3
    CLIPW     %1, %4, [pw_pixel_max]
    movq      [%5], %1
    movhps    [%5+%6], %1
%endmacro

; Same idea as STORE_DIFFx2 but for one full 8-pixel row held in memory:
;   %1, %2: registers of 4 dword coefficients each (%1 gets the result)
;   %3:     lower clip bound register (zero)
;   %4:     upper clip bound (register preloaded with pw_pixel_max)
;   %5:     dst address of the row (read-modify-write, 16 bytes)
%macro STORE_DIFF16 5
    psrad     %1, 6
    psrad     %2, 6
    packssdw  %1, %2
    paddsw    %1, [%5]
    CLIPW     %1, %3, %4
    mova      [%5], %1
%endmacro

;dst, in, stride
; Full 4x4 iDCT + add for one block of 16 dword coefficients:
; row pass, transpose, +32 rounding bias folded into row 0, column pass,
; clear the coefficient block (decoder contract: block is zeroed after use),
; then add the four result rows to dst via STORE_DIFFx2.
%macro IDCT4_ADD_10 3
    mova      m0, [%2+ 0]
    mova      m1, [%2+16]
    mova      m2, [%2+32]
    mova      m3, [%2+48]
    IDCT4_1D  d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd     m0, [pd_32]
    IDCT4_1D  d,0,1,2,3,4,5
    pxor      m5, m5
    mova      [%2+ 0], m5
    mova      [%2+16], m5
    mova      [%2+32], m5
    mova      [%2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, %1, %3
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro

; Public entry point: r0 = dst, r1 = block, r2 = stride.
%macro IDCT_ADD_10 0
cglobal h264_idct_add_10, 3,3
    IDCT4_ADD_10 r0, r1, r2
    RET
%endmacro

INIT_XMM sse2
IDCT_ADD_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
; Internal helper (plain label, not cglobal): same body as IDCT4_ADD_10 but
; with fixed registers, called repeatedly by the add16 loop.
; In: r0 = dst base, r2 = current block, r3 = stride,
;     r5 = pixel offset of this block within dst (becomes absolute on entry).
; Clobbers r5 and m0-m5; zeroes the 64-byte coefficient block.
%macro ADD4x4IDCT 0
add4x4_idct %+ SUFFIX:
    add       r5, r0
    mova      m0, [r2+ 0]
    mova      m1, [r2+16]
    mova      m2, [r2+32]
    mova      m3, [r2+48]
    IDCT4_1D  d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd     m0, [pd_32]
    IDCT4_1D  d,0,1,2,3,4,5
    pxor      m5, m5
    mova      [r2+ 0], m5
    mova      [r2+16], m5
    mova      [r2+32], m5
    mova      [r2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, r5, r3
    lea       r5, [r5+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, r5, r3
    ret
%endmacro

INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
%endif

; Process 4x4 block %1 if its non-zero-count entry is set:
;   %1: block index 0..15 (selects block_offset[%1] and advances r2)
;   %2: byte index into the 8-wide nnzc table for this block
; Skips the iDCT entirely when nnzc says the block is all-zero; always
; advances r2 by 64 bytes to the next coefficient block (except after the
; last one).
%macro ADD16_OP 2
    cmp       byte [r4+%2], 0
    jz .skipblock%1
    mov       r5d, [r1+%1*4]
    call add4x4_idct %+ SUFFIX
.skipblock%1:
%if %1<15
    add       r2, 64
%endif
%endmacro

; r0 = dst, r1 = block_offset, r2 = block, r3 = stride, r4 = nnzc.
; The 16 blocks are visited in the decoder's block order; the second
; ADD16_OP argument is the matching index into nnzc[] (8 entries per row).
%macro IDCT_ADD16_10 0
cglobal h264_idct_add16_10, 5,6
    ADD16_OP 0, 4+1*8
    ADD16_OP 1, 5+1*8
    ADD16_OP 2, 4+2*8
    ADD16_OP 3, 5+2*8
    ADD16_OP 4, 6+1*8
    ADD16_OP 5, 7+1*8
    ADD16_OP 6, 6+2*8
    ADD16_OP 7, 7+2*8
    ADD16_OP 8, 4+3*8
    ADD16_OP 9, 5+3*8
    ADD16_OP 10, 4+4*8
    ADD16_OP 11, 5+4*8
    ADD16_OP 12, 6+3*8
    ADD16_OP 13, 7+3*8
    ADD16_OP 14, 6+4*8
    ADD16_OP 15, 7+4*8
    REP_RET
%endmacro

INIT_XMM sse2
IDCT_ADD16_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
; Add the broadcast DC value in m0 to four rows of pixels and clip:
;   %1: dst pointer, %2: stride, %3: register holding stride*3
; Expects m0 = splatted DC, m6 = pw_pixel_max; zeroes m5 as the low bound.
; The AVX path uses three-operand paddw to skip the separate mova copies.
%macro IDCT_DC_ADD_OP_10 3
    pxor      m5, m5
%if avx_enabled
    paddw     m1, m0, [%1+0   ]
    paddw     m2, m0, [%1+%2  ]
    paddw     m3, m0, [%1+%2*2]
    paddw     m4, m0, [%1+%3  ]
%else
    mova      m1, [%1+0   ]
    mova      m2, [%1+%2  ]
    mova      m3, [%1+%2*2]
    mova      m4, [%1+%3  ]
    paddw     m1, m0
    paddw     m2, m0
    paddw     m3, m0
    paddw     m4, m0
%endif
    CLIPW     m1, m5, m6
    CLIPW     m2, m5, m6
    CLIPW     m3, m5, m6
    CLIPW     m4, m5, m6
    mova      [%1+0   ], m1
    mova      [%1+%2  ], m2
    mova      [%1+%2*2], m3
    mova      [%1+%3  ], m4
%endmacro

; DC-only 4x4 iDCT-add: (block[0] + 32) >> 6 splatted across a register and
; added to a 4x4 region. r0 = dst, r1 = block, r2 = stride. The DC
; coefficient is cleared in the block after being read.
INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3
    movd      m0, [r1]
    mov dword [r1], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    lea       r1, [r2*3]                  ; r1 no longer needed as a pointer; reuse for stride*3
    pshufw    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET

;-----------------------------------------------------------------------------
; void ff_h264_idct8_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
; Same as the 4x4 DC add but covers an 8x8 region: two 8x4 passes of
; IDCT_DC_ADD_OP_10 with dst advanced by 4 rows in between.
%macro IDCT8_DC_ADD 0
cglobal h264_idct8_dc_add_10,3,4,7
    movd      m0, [r1]
    mov dword[r1], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    lea       r1, [r2*3]
    SPLATW    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea       r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
%endmacro

INIT_XMM sse2
IDCT8_DC_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
; Out-of-line AC path for a pair of 4x4 blocks (%1 and %1+1): run the full
; iDCT on both, then jump back behind the pair's dispatch point.
%macro AC 1
.ac%1:
    mov       r5d, [r1+(%1+0)*4]
    call add4x4_idct %+ SUFFIX
    mov       r5d, [r1+(%1+1)*4]
    add       r2, 64
    call add4x4_idct %+ SUFFIX
    add       r2, 64
    jmp .skipadd%1
%endmacro

%assign last_block 16
; Dispatch for a PAIR of 4x4 blocks in the intra path:
;   %1: even block index, %2: nnzc byte index of the first block of the pair
; A 16-bit compare tests the two adjacent nnzc bytes at once: any nonzero
; -> full AC iDCT (out-of-line .ac label); otherwise, if either block still
; has a nonzero DC coefficient, the cheap DC-only add runs instead.
%macro ADD16_OP_INTRA 2
    cmp       word [r4+%2], 0
    jnz .ac%1
    mov       r5d, [r2+ 0]
    or        r5d, [r2+64]
    jz .skipblock%1
    mov       r5d, [r1+(%1+0)*4]
    call idct_dc_add %+ SUFFIX
.skipblock%1:
%if %1<last_block-2
    add       r2, 128
%endif
.skipadd%1:
%endmacro

%macro IDCT_ADD16INTRA_10 0
; Helper: DC-only add for TWO adjacent 4x4 blocks at once. The two DC
; values are loaded into the low/high halves of m0, biased, shifted and
; splatted per half, then added as a single 8x4 region (the two blocks are
; horizontally adjacent in dst). Both DCs are cleared in the block.
; In: r0 = dst base, r2 = block pair, r3 = stride, r5 = pixel offset.
idct_dc_add %+ SUFFIX:
    add       r5, r0
    movq      m0, [r2+ 0]
    movhps    m0, [r2+64]
    mov dword [r2+ 0], 0
    mov dword [r2+64], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    pshufhw   m0, m0, 0
    pshuflw   m0, m0, 0
    lea       r6, [r3*3]
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r5, r3, r6
    ret

; r0 = dst, r1 = block_offset, r2 = block, r3 = stride, r4 = nnzc.
; The AC bodies are emitted after REP_RET so the common (DC-only/skip)
; path stays compact.
cglobal h264_idct_add16intra_10,5,7,8
    ADD16_OP_INTRA 0, 4+1*8
    ADD16_OP_INTRA 2, 4+2*8
    ADD16_OP_INTRA 4, 6+1*8
    ADD16_OP_INTRA 6, 6+2*8
    ADD16_OP_INTRA 8, 4+3*8
    ADD16_OP_INTRA 10, 4+4*8
    ADD16_OP_INTRA 12, 6+3*8
    ADD16_OP_INTRA 14, 6+4*8
    REP_RET
    AC 8
    AC 10
    AC 12
    AC 14
    AC 0
    AC 2
    AC 4
    AC 6
%endmacro

INIT_XMM sse2
IDCT_ADD16INTRA_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
%endif

%assign last_block 36
;-----------------------------------------------------------------------------
; void ff_h264_idct_add8_10(pixel **dst, const int *block_offset,
;                           int16_t *block, int stride,
;                           const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
; Chroma variant: dst is an ARRAY of plane pointers. The first plane uses
; blocks 16-19, the second blocks 32-35 (block offsets in bytes: each 4x4
; block occupies 64 bytes, see ADD16_OP_INTRA's "add r2, 128" per pair).
%macro IDCT_ADD8 0
cglobal h264_idct_add8_10,5,8,7
%if ARCH_X86_64
    mov       r7, r0                     ; keep &dst[0] so dst[1] can be loaded later
%endif
    add       r2, 1024                   ; skip the 16 luma blocks (16 * 64 bytes)
    mov       r0, [r0]                   ; r0 = dst[0] (first chroma plane)
    ADD16_OP_INTRA 16, 4+ 6*8
    ADD16_OP_INTRA 18, 4+ 7*8
    add       r2, 1024-128*2             ; jump from block 20 to block 32 (second plane)
%if ARCH_X86_64
    mov       r0, [r7+gprsize]           ; r0 = dst[1]
%else
    mov       r0, r0m                    ; 32-bit: reload dst array pointer from the stack
    mov       r0, [r0+gprsize]
%endif
    ADD16_OP_INTRA 32, 4+11*8
    ADD16_OP_INTRA 34, 4+12*8
    REP_RET
    AC 16
    AC 18
    AC 32
    AC 34

%endmacro ; IDCT_ADD8

INIT_XMM sse2
IDCT_ADD8
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
; One 1-D 8-point iDCT pass on dword coefficients.
; On entry rows 1,2,3,5,6,7 are in m1,m2,m3,m5,m6,m7 (loaded by
; IDCT8_1D_FULL); %1 and %2 are memory operands holding rows 0 and 4,
; loaded late to reduce register pressure. Results end up in m0-m7 in
; natural row order (see the final SWAP's "70315246 -> 01234567" note).
; NOTE(review): the SWAP/shift sequencing is exact-order-sensitive; do not
; reorder instructions here.
%macro IDCT8_1D 2
    SWAP      0, 1
    psrad     m4, m5, 1
    psrad     m1, m0, 1
    paddd     m4, m5
    paddd     m1, m0
    paddd     m4, m7
    paddd     m1, m5
    psubd     m4, m0
    paddd     m1, m3

    psubd     m0, m3
    psubd     m5, m3
    paddd     m0, m7
    psubd     m5, m7
    psrad     m3, 1
    psrad     m7, 1
    psubd     m0, m3
    psubd     m5, m7

    SWAP      1, 7
    psrad     m1, m7, 2
    psrad     m3, m4, 2
    paddd     m3, m0
    psrad     m0, 2
    paddd     m1, m5
    psrad     m5, 2
    psubd     m0, m4
    psubd     m7, m5

    SWAP      5, 6
    psrad     m4, m2, 1
    psrad     m6, m5, 1
    psubd     m4, m5
    paddd     m6, m2

    mova      m2, %1
    mova      m5, %2
    SUMSUB_BA d, 5, 2
    SUMSUB_BA d, 6, 5
    SUMSUB_BA d, 4, 2
    SUMSUB_BA d, 7, 6
    SUMSUB_BA d, 0, 4
    SUMSUB_BA d, 3, 2
    SUMSUB_BA d, 1, 5
    SWAP      7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

; Load all odd/even rows of an 8x4 dword sub-block at %1 (row stride
; 16*2 bytes) and run the 1-D pass; rows 0 and 4 are passed as memory
; operands so IDCT8_1D can load them last.
%macro IDCT8_1D_FULL 1
    mova      m7, [%1+112*2]
    mova      m6, [%1+ 96*2]
    mova      m5, [%1+ 80*2]
    mova      m3, [%1+ 48*2]
    mova      m2, [%1+ 32*2]
    mova      m1, [%1+ 16*2]
    IDCT8_1D  [%1], [%1+ 64*2]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
; Row pass + 4x4 transposes of one 8x4 half. On x86-64 the transposed
; halves stay in registers (only m0/m4 spill to %2); on x86-32 there are
; too few registers, so everything is transposed via memory, with m7
; temporarily parked in the source block.
%macro IDCT8_ADD_SSE_START 2
    IDCT8_1D_FULL %1
%if ARCH_X86_64
    TRANSPOSE4x4D 0,1,2,3,8
    mova      [%2    ], m0
    TRANSPOSE4x4D 4,5,6,7,8
    mova      [%2+8*2], m4
%else
    mova      [%1], m7
    TRANSPOSE4x4D 0,1,2,3,7
    mova      m7, [%1]
    mova      [%2     ], m0
    mova      [%2+16*2], m1
    mova      [%2+32*2], m2
    mova      [%2+48*2], m3
    TRANSPOSE4x4D 4,5,6,7,3
    mova      [%2+ 8*2], m4
    mova      [%2+24*2], m5
    mova      [%2+40*2], m6
    mova      [%2+56*2], m7
%endif
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; Column pass + store for one 8x4 half (x86-32 path). m6/m7 are spilled
; to %2 because STORE_DIFFx2 needs them as scratch/zero, then reloaded
; into m0/m1 for the final two rows.
%macro IDCT8_ADD_SSE_END 3
    IDCT8_1D_FULL %2
    mova      [%2     ], m6
    mova      [%2+16*2], m7

    pxor      m7, m7
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m6, m7, %1, %3
    mova      m0, [%2     ]
    mova      m1, [%2+16*2]
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m4, m5, m6, m7, %1, %3
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro

; Full 8x8 iDCT + add. r0 = dst, r1 = block, r2 = stride.
; The real work lives in the h264_idct8_add1_10 helper (also called by
; h264_idct8_add4_10); on UNIX64 the cglobal entry simply falls through
; into it, elsewhere the stack is aligned first and the helper is called.
%macro IDCT8_ADD 0
cglobal h264_idct8_add_10, 3,4,16
%if UNIX64 == 0
    %assign pad 16-gprsize-(stack_offset&15)
    sub       rsp, pad
    call h264_idct8_add1_10 %+ SUFFIX
    add       rsp, pad
    RET
%endif

ALIGN 16
; TODO: does not need to use stack
h264_idct8_add1_10 %+ SUFFIX:
%assign pad 256+16-gprsize
    sub       rsp, pad
    add dword [r1], 32   ; fold the +32 rounding bias into the DC coefficient

%if ARCH_X86_64
    ; 64-bit: both 8x4 halves are row-passed and transposed, the second
    ; half's results are moved into m8-m15/re-PERMUTEd so the two column
    ; passes can run mostly from registers.
    IDCT8_ADD_SSE_START r1, rsp
    SWAP      1, 9
    SWAP      2, 10
    SWAP      3, 11
    SWAP      5, 13
    SWAP      6, 14
    SWAP      7, 15
    IDCT8_ADD_SSE_START r1+16, rsp+128
    PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
    IDCT8_1D  [rsp], [rsp+128]
    SWAP      0, 8
    SWAP      1, 9
    SWAP      2, 10
    SWAP      3, 11
    SWAP      4, 12
    SWAP      5, 13
    SWAP      6, 14
    SWAP      7, 15
    IDCT8_1D  [rsp+16], [rsp+144]
    ; Row 0 is finished inline (m8 + m0) while m0 is then freed to zero
    ; out the whole 256-byte coefficient block, as the decoder expects.
    psrad     m8, 6
    psrad     m0, 6
    packssdw  m8, m0
    paddsw    m8, [r0]
    pxor      m0, m0
    mova      [r1+  0], m0
    mova      [r1+ 16], m0
    mova      [r1+ 32], m0
    mova      [r1+ 48], m0
    mova      [r1+ 64], m0
    mova      [r1+ 80], m0
    mova      [r1+ 96], m0
    mova      [r1+112], m0
    mova      [r1+128], m0
    mova      [r1+144], m0
    mova      [r1+160], m0
    mova      [r1+176], m0
    mova      [r1+192], m0
    mova      [r1+208], m0
    mova      [r1+224], m0
    mova      [r1+240], m0
    CLIPW     m8, m0, [pw_pixel_max]
    mova      [r0], m8
    mova      m8, [pw_pixel_max]
    STORE_DIFF16 m9, m1, m0, m8, r0+r2
    lea       r0, [r0+r2*2]
    STORE_DIFF16 m10, m2, m0, m8, r0
    STORE_DIFF16 m11, m3, m0, m8, r0+r2
    lea       r0, [r0+r2*2]
    STORE_DIFF16 m12, m4, m0, m8, r0
    STORE_DIFF16 m13, m5, m0, m8, r0+r2
    lea       r0, [r0+r2*2]
    STORE_DIFF16 m14, m6, m0, m8, r0
    STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
    ; 32-bit: left and right 8x4 halves go through memory (rsp scratch),
    ; then the coefficient block is zeroed with m7 (zero after SSE_END).
    IDCT8_ADD_SSE_START r1,    rsp
    IDCT8_ADD_SSE_START r1+16, rsp+128
    lea       r3, [r0+8]
    IDCT8_ADD_SSE_END r0, rsp,    r2
    IDCT8_ADD_SSE_END r3, rsp+16, r2
    mova      [r1+  0], m7
    mova      [r1+ 16], m7
    mova      [r1+ 32], m7
    mova      [r1+ 48], m7
    mova      [r1+ 64], m7
    mova      [r1+ 80], m7
    mova      [r1+ 96], m7
    mova      [r1+112], m7
    mova      [r1+128], m7
    mova      [r1+144], m7
    mova      [r1+160], m7
    mova      [r1+176], m7
    mova      [r1+192], m7
    mova      [r1+208], m7
    mova      [r1+224], m7
    mova      [r1+240], m7
%endif ; ARCH_X86_64

    add       rsp, pad
    ret
%endmacro

INIT_XMM sse2
IDCT8_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
; Process 8x8 block %1 if its nnzc entry is set:
;   %1: block index (0,4,8,12), %2: nnzc byte index
; Sets up r0 = dst + block_offset[%1] and calls the 8x8 helper above;
; r1 advances by 256 bytes (one 8x8 dword coefficient block) each step.
%macro IDCT8_ADD4_OP 2
    cmp       byte [r4+%2], 0
    jz .skipblock%1
    mov       r0d, [r6+%1*4]
    add       r0, r5
    call h264_idct8_add1_10 %+ SUFFIX
.skipblock%1:
%if %1<12
    add       r1, 256
%endif
%endmacro

; Declared with 0 register args: all five parameters are loaded manually
; from the stack into the registers the helper expects
; (r5 = dst, r6 = block_offset, r1 = block, r2 = stride, r4 = nnzc).
%macro IDCT8_ADD4 0
cglobal h264_idct8_add4_10, 0,7,16
    %assign pad 16-gprsize-(stack_offset&15)
    SUB       rsp, pad
    mov       r5, r0mp
    mov       r6, r1mp
    mov       r1, r2mp
    mov       r2d, r3m
    movifnidn r4, r4mp
    IDCT8_ADD4_OP  0, 4+1*8
    IDCT8_ADD4_OP  4, 6+1*8
    IDCT8_ADD4_OP  8, 4+3*8
    IDCT8_ADD4_OP 12, 6+3*8
    ADD       rsp, pad
    RET
%endmacro ; IDCT8_ADD4

INIT_XMM sse2
IDCT8_ADD4
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
%endif