;*****************************************************************************
;* x86inc.asm
;*****************************************************************************
;* Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

; Pick the 64-bit calling convention from the object format being produced.
; Exactly one of WIN64 / UNIX64 is defined when ARCH_X86_64 is set; neither is
; defined for 32-bit builds. The Windows ABI is selected for every format name
; a 64-bit Windows build may report (win32 with a 64-bit machine setting, win64,
; or x64); anything else is treated as the *nix ABI.
%ifdef ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64
    %else
        %define UNIX64
    %endif
%endif

; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register, assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
; and x264's strides are all positive), but is not guaranteed by the ABI.

; SECTION_RODATA [alignment]
; Switches to the read-only data section.
;   %1 = section alignment in bytes (optional, defaults to 16)
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so the Mach-O formats place constants in .text instead. The 32-bit Mach-O
; variant additionally emits a "fakegot" label at the start of the section.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; PIC support macros.
; x86_64 can't fit 64bit address literals in most instruction types,
; so shared objects (under the assumption that they might be anywhere
; in memory) must use an address mode that does fit.
; So all accesses to global variables must use this macro, e.g.
;     mov eax, [foo GLOBAL]
; instead of
;     mov eax, [foo]
;
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.

; PIC is forced on for WIN64, forced off for 32-bit builds, and left as the
; build system set it for the remaining (64-bit, non-Windows) case.
%ifdef WIN64
    %define PIC
%else
    %ifndef ARCH_X86_64
        %undef PIC
    %endif
%endif

; GLOBAL expands to the RIP-relative qualifier under PIC and to nothing otherwise.
%ifndef PIC
    %define GLOBAL
%else
    %define GLOBAL wrt rip
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

; DECLARE_REG index, native, dword, word, byte, home
; Defines the rN* family of names for argument/register slot N.
;   %1 = slot number N
;   %2 = native-size register name   (becomes rN and rNq)
;   %3 = dword register name         (rNd)
;   %4 = word register name          (rNw)
;   %5 = byte register name          (rNb)
;   %6 = where the argument lives on entry (rNm): a register or a memory operand
; rNmp is the native-size view of %6: the native register when %6 is an
; identifier, otherwise %6 with an explicit qword/dword size override.
%macro DECLARE_REG 6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %define r%1m %6
    %ifid %6 ; i.e. it's a register
        %define r%1mp %2
    %elifdef ARCH_X86_64 ; memory
        %define r%1mp qword %6
    %else
        %define r%1mp dword %6
    %endif
    %define r%1  %2
%endmacro

; DECLARE_REG_SIZE word_name, byte_name
; Gives a named register (e.g. ax) the same q/d/w/b suffix scheme as rN, under
; both its r- and e- prefixed spellings: raxq/eaxq, raxd/eaxd, raxw/eaxw, raxb/eaxb.
;   %1 = 16-bit register name (ax, bx, ...)
;   %2 = 8-bit register name  (al, bl, ...)
; On 32-bit builds r<name> is additionally aliased to e<name>.
%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1b %2
    %define e%1b %2
%ifndef ARCH_X86_64
    %define r%1 e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl

; t# defines for when per-arch register allocation is more complex than just function arguments

; DECLARE_REG_TMP n0, n1, ...
; Defines t0, t1, ... in argument order, each as r<n>; e.g.
; "DECLARE_REG_TMP 2,0" makes t0 = r2 and t1 = r0.
%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

; DECLARE_REG_TMP_SIZE n0, n1, ...
; For each listed N, defines tNq/tNd/tNw/tNb as tN with the matching size
; suffix pasted on, so they follow whatever tN is defined as when they are used.
%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7

; Size in bytes of one general-purpose register / one push slot.
%ifdef ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

; PUSH / POP / SUB / ADD
; Same as the lower-case instructions, but they also keep the assemble-time
; variable stack_offset in step, so that the rNm stack operands (which are
; written relative to stack_offset) keep pointing at the original arguments.
; SUB and ADD only adjust stack_offset when the destination is rsp.
%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

; movifnidn dst, src
; Emits "mov dst, src" unless the two operands are textually identical.
%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

; movsxdifnidn dst, src
; Emits "movsxd dst, src" unless the two operands are textually identical.
%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

; ASSERT condition
; Raises an assemble-time error when the condition evaluates to 0.
%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

; DEFINE_ARGS name0, name1, ...
; Binds symbolic names to the argument registers: for the i-th name it defines
; <name>q, <name>d, <name>w, <name>b and <name>m as aliases of r<i>q, r<i>d,
; r<i>w, r<i>b and r<i>m. The names from the previous invocation (remembered in
; arg_name<i> / n_arg_names) are undefined first, so names never leak from one
; function into the next.
%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %assign n_arg_names %%i
%endmacro

%ifdef WIN64 ; Windows x64 ;=================================================

; Arguments 0-3 arrive in rcx, rdx, r8, r9; arguments 4 and up are on the stack.
DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 1, rdx, edx, dx,  dl,  edx
DECLARE_REG 2, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 3, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di,  dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si,  sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]

; Loads argument %1 from its stack slot into r%1 if the function takes more
; than %1 arguments.
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp + stack_offset + 8 + %1*8]
    %endif
%endmacro

; Function entry for WIN64.
;   %1 = number of arguments
;   %2 = number of GPRs used (at most 7, and at least %1)
;   %3 = number of xmm registers used (optional, at most 16)
;   %4 = argument names, forwarded to DEFINE_ARGS
; Saves r4/r5 (rdi/rsi) when more than 4 GPRs are used, spills xmm6 and up to
; the stack when more than 6 xmm registers are used, then loads the stack
; arguments 4-6 that the function declared.
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    %if %0 > 2
        %assign xmm_regs_used %3
    %else
        %assign xmm_regs_used 0
    %endif
    ASSERT xmm_regs_used <= 16
    %if regs_used > 4
        push r4
        push r5
        %assign stack_offset stack_offset+16
    %endif
    %if xmm_regs_used > 6
        ; 16 bytes per saved register plus 16 bytes of padding; the save area
        ; itself starts at rsp+8.
        sub rsp, (xmm_regs_used-6)*16+16
        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
        %endrep
    %endif
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

; Reloads the xmm registers spilled by PROLOGUE from the save area addressed
; through %1 and releases that area by adding its size to %1. Does nothing when
; no xmm registers were spilled. Leaves stack_offset and xmm_regs_used alone.
%macro RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

; Restores the spilled xmm registers in the middle of a function and marks them
; as restored (xmm_regs_used = 0) so that a later RET does not do it again.
; stack_offset is reduced by exactly the amount PROLOGUE added for the save
; area, (xmm_regs_used-6)*16+16, and only when an area was actually reserved.
%macro RESTORE_XMM 1
    RESTORE_XMM_INTERNAL %1
    %if xmm_regs_used > 6
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro

; Undoes PROLOGUE (xmm restore, then pop r5/r4) and returns.
%macro RET 0
    RESTORE_XMM_INTERNAL rsp
    %if regs_used > 4
        pop r5
        pop r4
    %endif
    ret
%endmacro

; RET when there is something to restore, otherwise the 2-byte "rep ret".
%macro REP_RET 0
    %if regs_used > 4 || xmm_regs_used > 6
        RET
    %else
        rep ret
    %endif
%endmacro

%elifdef ARCH_X86_64 ; *nix x64 ;=============================================

; Arguments 0-5 arrive in rdi, rsi, rdx, rcx, r8, r9; arguments 6 and up are on
; the stack.
DECLARE_REG 0, rdi, edi, di,  dil, edi
DECLARE_REG 1, rsi, esi, si,  sil, esi
DECLARE_REG 2, rdx, edx, dx,  dl,  edx
DECLARE_REG 3, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 4, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 5, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]

; Loads argument %1 from the stack into r%1 if the function takes more than %1
; arguments. The address is written so that slot 6 resolves to [rsp + 8]; it
; does not include stack_offset (this PROLOGUE pushes nothing before using it).
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro

; Function entry for *nix x64.
;   %1 = number of arguments
;   %2 = number of GPRs used (at most 7, and at least %1)
;   %3 = number of xmm registers used (accepted, not used here)
;   %4 = argument names, forwarded to DEFINE_ARGS
; Nothing is saved; only argument 6 may need loading from the stack.
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    ret
%endmacro

%macro REP_RET 0
    rep ret
%endmacro

%else ; X86_32 ;==============================================================

; All arguments arrive on the stack. The byte-register name of slots 4-6 is the
; placeholder "null".
DECLARE_REG 0, eax, eax, ax, al,   [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl,   [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl,   [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl,   [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
%define rsp esp

; Pushes r%1 (and accounts for it in stack_offset) if the function uses more
; than %1 registers.
%macro PUSH_IF_USED 1 ; reg_id
    %if %1 < regs_used
        push r%1
        %assign stack_offset stack_offset+4
    %endif
%endmacro

; Pops r%1 if PUSH_IF_USED pushed it. stack_offset is left unchanged.
%macro POP_IF_USED 1 ; reg_id
    %if %1 < regs_used
        pop r%1
    %endif
%endmacro

; Loads argument %1 from its stack slot into r%1 if the function takes more
; than %1 arguments.
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [esp + stack_offset + 4 + %1*4]
    %endif
%endmacro

; Function entry for x86_32.
;   %1 = number of arguments
;   %2 = number of GPRs used (at most 7, and at least %1)
;   %3 = number of xmm registers used (accepted, not used here)
;   %4 = argument names, forwarded to DEFINE_ARGS
; Saves r3-r6 (ebx, esi, edi, ebp) as far as they are used, then loads every
; declared argument from the stack.
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    PUSH_IF_USED 3
    PUSH_IF_USED 4
    PUSH_IF_USED 5
    PUSH_IF_USED 6
    LOAD_IF_USED 0, %1
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

; Pops the registers PROLOGUE pushed, in reverse order, and returns.
%macro RET 0
    POP_IF_USED 6
    POP_IF_USED 5
    POP_IF_USED 4
    POP_IF_USED 3
    ret
%endmacro

; RET when registers were pushed, otherwise the 2-byte "rep ret".
%macro REP_RET 0
    %if regs_used > 3
        RET
    %else
        rep ret
    %endif
%endmacro

%endif ;======================================================================



;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Symbol prefix for C linkage
; cglobal name [, PROLOGUE arguments...]
; Declares an exported function:
;   - redefines <name> as ff_<name>, with a leading underscore when PREFIX is
;     defined, and <name>.skip_prologue as the matching local label name
;   - exports the symbol (as a hidden function on ELF)
;   - aligns to function_align and emits the label
;   - resets the mm register permutation and sets stack_offset to 0
;   - when further arguments are given, passes them to PROLOGUE
%macro cglobal 1-2+
    %xdefine %1 ff_%1
    %ifdef PREFIX
        %xdefine %1 _ %+ %1
    %endif
    %xdefine %1.skip_prologue %1 %+ .skip_prologue
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro

; cextern name
; Declares an external symbol, with a leading underscore when PREFIX is defined.
%macro cextern 1
    %ifdef PREFIX
        %xdefine %1 _%1
    %endif
    extern %1
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

%assign FENC_STRIDE 16
%assign FDEC_STRIDE 32

; merge mmx and sse*

; CAT_XDEFINE a, b, value
; %xdefine's the name formed by pasting a and b together as value.
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

; CAT_UNDEF a, b
; Undefines the name formed by pasting a and b together.
%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

; INIT_MMX
; Selects the 8-byte MMX register file for the generic names:
;   m0..m7 map to mm0..mm7, and nmm0..nmm7 map each register back to its number
;   mova/movu/movh/movnt map to movq/movq/movd/movntq
; RESET_MM_PERMUTATION is pointed back at this macro, and the names m8..m15 and
; nmm8..nmm15 are undefined.
%macro INIT_MMX 0
    %define RESET_MM_PERMUTATION INIT_MMX
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnt movntq

    ; registers 0..7: forward and reverse name maps
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep

    ; %%i carries on from 8 here, so this clears 8..15
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nmm, %%i
        %assign %%i %%i+1
    %endrep
%endmacro

; INIT_XMM
; Selects the 16-byte XMM register file for the generic names:
;   m0..m(num_mmregs-1) map to xmm0.., and nxmmN maps each register back to N
;   mova/movu/movh/movnt map to movdqa/movdqu/movq/movntdq
; num_mmregs is 16 when ARCH_X86_64 is defined and 8 otherwise.
; RESET_MM_PERMUTATION is pointed back at this macro.
%macro INIT_XMM 0
    %define RESET_MM_PERMUTATION INIT_XMM
    %define mmsize 16
    %ifdef ARCH_X86_64
        %define num_mmregs 16
    %else
        %define num_mmregs 8
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnt movntdq

    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
%endmacro

; MMX is the default register file until a file asks for something else.
INIT_MMX

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

; PERMUTE dst0, src0, dst1, src1, ...
; Renames registers pairwise: after the call m<dst> names what m<src> named
; before it. All sources are snapshotted in a first pass (tmp<src>, ntmp<src>)
; before any destination is rewritten in the second, so the pairs may overlap.
%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine tmp%2 m%2
    %xdefine ntmp%2 nm%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 tmp%2
    %xdefine nm%1 ntmp%2
    %undef tmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro

; SWAP a, b [, c, ...]
; Exchanges register names along a chain: a<->b, then b<->c, and so on, keeping
; the reverse (n-prefixed) lookup names in step. Arguments are normally register
; numbers ("SWAP 0,1"); register names ("SWAP m0,m1") are handled by the second
; branch.
%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    ; numeric arguments: exchange m<a> and m<b> through a temporary
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%n1 n %+ %1
    %xdefine %%n2 n %+ %2
    %xdefine tmp m %+ %%n1
    CAT_XDEFINE m, %%n1, m %+ %%n2
    CAT_XDEFINE m, %%n2, tmp
    CAT_XDEFINE n, m %+ %%n1, %%n1
    CAT_XDEFINE n, m %+ %%n2, %%n2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro

; SAVE_MM_PERMUTATION name
; Records the current mapping of m0..m(num_mmregs-1) as <name>_m0, <name>_m1, ...
%macro SAVE_MM_PERMUTATION 1
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %1_m, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

; LOAD_MM_PERMUTATION name
; Re-installs a mapping recorded by SAVE_MM_PERMUTATION under <name>, updating
; the reverse lookup names as well.
%macro LOAD_MM_PERMUTATION 1
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, %1_m %+ %%i
        CAT_XDEFINE n, m %+ %%i, %%i
        %assign %%i %%i+1
    %endrep
%endmacro

; call target
; Emits the call; if a permutation was saved under the target's name
; (<target>_m0 is defined), that permutation is loaded afterwards.
%macro call 1
    call %1
    %ifdef %1_m0
        LOAD_MM_PERMUTATION %1
    %endif
%endmacro

;Substitutions that reduce instruction size but are functionally equivalent

; add dst, src
; A literal 128 is rewritten as "sub dst, -128"; everything else is emitted
; unchanged.
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

; sub dst, src
; A literal 128 is rewritten as "add dst, -128"; everything else is emitted
; unchanged.
%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro