1;***************************************************************************** 2;* x86inc.asm: x264asm abstraction layer 3;***************************************************************************** 4;* Copyright (C) 2005-2011 x264 project 5;* 6;* Authors: Loren Merritt <lorenm@u.washington.edu> 7;* Anton Mitrofanov <BugMaster@narod.ru> 8;* Jason Garrett-Glaser <darkshikari@gmail.com> 9;* 10;* Permission to use, copy, modify, and/or distribute this software for any 11;* purpose with or without fee is hereby granted, provided that the above 12;* copyright notice and this permission notice appear in all copies. 13;* 14;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 15;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 16;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 17;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 18;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 19;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 20;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 21;***************************************************************************** 22 23; This is a header file for the x264ASM assembly language, which uses 24; NASM/YASM syntax combined with a large number of macros to provide easy 25; abstraction between different calling conventions (x86_32, win64, linux64). 26; It also has various other useful features to simplify writing the kind of 27; DSP functions that are most often used in x264. 28 29; Unlike the rest of x264, this file is available under an ISC license, as it 30; has significant usefulness outside of x264 and we want it to be available 31; to the largest audience possible. Of course, if you modify it for your own 32; purposes to add a new feature, we strongly encourage contributing a patch 33; as this feature might be useful for others as well. 
; Send patches or ideas to x264-devel@videolan.org .

; Symbol prefix applied by cglobal/cextern/const (project name).
%define program_name ff

; Pick the 64-bit ABI flavor from the assembler's output format.
; NOTE(review): configure passes yasm "-f win32 -m amd64" style flags here,
; so "win32" is the expected format name even for 64-bit Windows — confirm
; against the build system before changing.
%ifdef ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64
    %else
        %define UNIX64
    %endif
%endif

; C symbol mangling: some platforms (defined via PREFIX) want a leading
; underscore on global symbols.
%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register, assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
; and x264's strides are all positive), but is not guaranteed by the ABI.

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
%macro SECTION_RODATA 0-1 16        ; optional arg: alignment (default 16)
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; aout does not support align=
%macro SECTION_TEXT 0-1 16          ; optional arg: alignment (default 16)
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

%ifdef WIN64
    %define PIC
%elifndef ARCH_X86_64
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel                     ; make [sym] RIP-relative by default
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used.
; pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

; Declare one numbered argument register and all its views.
; %1=index, %2=qword name, %3=dword, %4=word, %5=byte, %6=original arg location
%macro DECLARE_REG 6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %define r%1m %6
    %ifid %6 ; i.e. it's a register
        %define r%1mp %2
    %elifdef ARCH_X86_64 ; memory
        %define r%1mp qword %6
    %else
        %define r%1mp dword %6
    %endif
    %define r%1 %2
%endmacro

; Map rXX/eXX size-suffixed aliases for a named GPR (%1=16-bit name, %2=low byte).
%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1b %2
    %define e%1b %2
%ifndef ARCH_X86_64
    %define r%1 e%1     ; on x86_32 the "native" reg is the 32-bit one
%endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

; Define tNq/tNd/tNw/tNb aliases forwarding to whatever tN currently maps to.
%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9

%ifdef ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

; PUSH/POP/SUB/ADD wrappers keep stack_offset (distance from entry rsp)
; accurate so the rNm stack-argument defines stay valid.
%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

; mov, but elided when src and dst are the same register.
%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

; movsxd, but elided when src and dst are identical.
%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

; Compile-time assertion on a preprocessor expression.
%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

; Bind human-readable names to argument registers: "nameq/d/w/b/m" -> rNq/d/w/b/m.
; Undefines the previous set first so DEFINE_ARGS can be re-invoked mid-function.
%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %assign n_arg_names %%i
%endmacro

%ifdef WIN64 ; Windows x64 ;=================================================

; Win64: args in rcx, rdx, r8, r9; stack args start past the return address
; and the caller's 32-byte shadow space (hence the +40 offsets below).
DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 1, rdx, edx, dx,  dl,  edx
DECLARE_REG 2, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 3, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di,  dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si,  sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp + stack_offset + 8 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    %if regs_used > 4
        push r4                     ; rdi/rsi are callee-saved on Win64
        push r5
        %assign stack_offset stack_offset+16
    %endif
    WIN64_SPILL_XMM %3
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

; Save callee-saved xmm6..xmm15 (Win64 ABI) if the function uses them.
; Allocates (n-6)*16+16 bytes: 16 per spilled reg plus 16 for alignment slack.
%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    %if mmsize == 8
        %assign xmm_regs_used 0     ; mmx code can't touch xmm regs
    %endif
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        sub rsp, (xmm_regs_used-6)*16+16
        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
        %endrep
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    ; BUGFIX: undo exactly what WIN64_SPILL_XMM added, and only if it added
    ; anything. The old form "stack_offset-(xmm_regs_used-6)*16+16" was
    ; missing parentheses (off by 32) and also ran when nothing was spilled,
    ; corrupting stack_offset and thus every later rNm stack-arg reference.
    %if xmm_regs_used > 6
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    %if regs_used > 4
        pop r5
        pop r4
    %endif
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 4 || xmm_regs_used > 6
        RET
    %else
        rep ret
    %endif
%endmacro

%elifdef ARCH_X86_64 ; *nix x64 ;=============================================

; SysV AMD64: args in rdi, rsi, rdx, rcx, r8, r9; 7th+ args on the stack.
DECLARE_REG 0, rdi, edi, di,  dil, edi
DECLARE_REG 1, rsi, esi, si,  sil, esi
DECLARE_REG 2, rdx, edx, dx,  dl,  edx
DECLARE_REG 3, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 4, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 5, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    ret
%endmacro

%macro REP_RET 0
    rep ret
%endmacro

%else ; X86_32 ;==============================================================

; x86_32 cdecl: everything on the stack; offsets assume return addr at [esp].
; esi/edi/ebp have no byte form usable here, hence "null".
DECLARE_REG 0, eax, eax, ax, al,   [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl,   [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl,   [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl,   [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
%define rsp esp

%macro PUSH_IF_USED 1 ; reg_id
    %if %1 < regs_used
        push r%1
        %assign stack_offset stack_offset+4
    %endif
%endmacro

%macro POP_IF_USED 1 ; reg_id
    %if %1 < regs_used
        pop r%1
    %endif
%endmacro

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [esp + stack_offset + 4 + %1*4]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    PUSH_IF_USED 3                  ; ebx/esi/edi/ebp are callee-saved
    PUSH_IF_USED 4
    PUSH_IF_USED 5
    PUSH_IF_USED 6
    LOAD_IF_USED 0, %1
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    POP_IF_USED 6
    POP_IF_USED 5
    POP_IF_USED 4
    POP_IF_USED 3
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 3
        RET
    %else
        rep ret
    %endif
%endmacro

%endif ;======================================================================

; No-op stubs so portable code can use the WIN64_* macros unconditionally.
%ifndef WIN64
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%endif

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; Public entry point: forwards to cglobal_internal with the cpu SUFFIX
; (set by INIT_CPUFLAGS) appended to the function name.
%macro cglobal 1-2+ ; name, [PROLOGUE args]
%if %0 == 1
    cglobal_internal %1 %+ SUFFIX
%else
    cglobal_internal %1 %+ SUFFIX, %2
%endif
%endmacro
%macro cglobal_internal 1-2+
    %ifndef cglobaled_%1
        ; First sight of this name: mangle it and remember that we did,
        ; so later uses of the bare name expand to the mangled symbol.
        %xdefine %1 mangle(program_name %+ _ %+ %1)
        %xdefine %1.skip_prologue %1 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %1, 1
    %endif
    %xdefine current_function %1
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden   ; hidden visibility: not exported from the DSO
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro

; Declare an external symbol defined by this project (mangled with the prefix).
%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; Define a global data symbol: const name, <data directives>
%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags
; Each flag ORs in the flags it implies, so cpuflag(sse) is true for any
; sse-or-better cpuflags value.

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx

; Standalone attribute flags (no implication chain).
%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)

; True iff every bit of cpuflags_x (including implied flags) is set.
%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))

; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-2
    %if %0 >= 1
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if mmsize == 16 && notcpuflag(sse2)
            ; sse1-only code must use the float forms of full-width mov.
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, sse3
            %define movu lddqu
        %endif
    %else
        ; No cpu specified: plain (unsuffixed) function names.
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %endif
%endmacro

; merge mmx and sse*

; Token-pasting helpers: define/undef a symbol whose name is %1 concatenated
; with the expansion of %2.
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    ; %%i continues from 8: undefine m8..m15 left over from a prior INIT_XMM.
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %ifdef ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

; FIXME: INIT_AVX can be replaced by INIT_XMM avx
%macro INIT_AVX 0
    INIT_XMM
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %ifdef ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %undef movh                     ; no half-register mov for ymm
    %define movnta vmovntps
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine tmp%2 m%2
    %xdefine ntmp%2 nm%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 tmp%2
    %xdefine nm%1 ntmp%2
    %undef tmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%n1 n %+ %1
    %xdefine %%n2 n %+ %2
    %xdefine tmp m %+ %%n1
    CAT_XDEFINE m, %%n1, m %+ %%n2
    CAT_XDEFINE m, %%n2, tmp
    CAT_XDEFINE n, m %+ %%n1, %%n1
    CAT_XDEFINE n, m %+ %%n2, %%n2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE n, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1, %1 %+ SUFFIX
%endmacro
%macro call_internal 2
    %xdefine %%i %1
    %ifndef cglobaled_%1
        %ifdef cglobaled_%2
            %xdefine %%i %2
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
; (add/sub of 128 encodes as an 8-bit immediate with the opposite sign).
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

; sizeofmm#/sizeofxmm#/sizeofymm# lookup tables: byte width per register name.
%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
%assign i i+1
%endrep
%undef i

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %ifid %5
        %define %%size sizeof%5
    %else
        %define %%size mmsize       ; memory operand: assume current vector width
    %endif
    %if %%size==32
        v%1 %5, %6, %7              ; ymm ops always use the VEX-encoded form
    %else
        %if %%size==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && sizeof%5==16
                    v%1 %5, %6, %7
                %else
                    ; SSE emulation of the 3-operand form: copy src1 to dst, then 2-op.
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
%macro RUN_AVX_INSTR1 8
    %assign %%swap 0
    %if avx_enabled
        %ifnid %6
            %assign %%swap 1
        %endif
    %elifnidn %5, %6
        %ifnid %7
            %assign %%swap 1
        %endif
    %endif
    %if %%swap && %3 == 0 && %8 == 1
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e.
; doesn't matter which src arg is which), 0 if not
; Defines a wrapper macro named after the instruction that dispatches to
; RUN_AVX_INSTR(1) based on how many operands the call site supplied.
%macro AVX_INSTR 4
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro

AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 0, 0
AVX_INSTR cmpps, 1, 0, 0
AVX_INSTR cmpsd, 1, 0, 0
AVX_INSTR cmpss, 1, 0, 0
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
AVX_INSTR maxss, 1, 0, 1
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movss, 1, 0, 0
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pcmpestri, 0, 0, 0
AVX_INSTR pcmpestrm, 0, 0, 0
AVX_INSTR pcmpistri, 0, 0, 0
AVX_INSTR pcmpistrm, 0, 0, 0
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
; Defines q0000..q3333 so e.g. q3120 expands to the imm8 for that lane order.
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
%assign i i+1
%endrep
%undef i
%undef j

; Fused multiply-accumulate wrappers: use the single XOP instruction when
; available, otherwise emulate with mul (%2) followed by add (%3).
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %else
            %6 %1, %2, %3
            %7 %1, %4
        %endif
    %endmacro
%endmacro

FMA_INSTR  pmacsdd,  pmulld, paddd
FMA_INSTR  pmacsww,  pmullw, paddw
FMA_INSTR  pmadcswd, pmaddwd, paddd