;*****************************************************************************
;* x86inc.asm
;*****************************************************************************
;* Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register, assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
; and x264's strides are all positive), but is not guaranteed by the ABI.

; SECTION_RODATA: switch to the section used for read-only constants.
; Kludge: Something on OS X fails to align .rodata even given an align
; attribute, so both Mach-O formats place constants in a 16-byte aligned
; .text section instead.
%macro SECTION_RODATA 0
    %ifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=16
        ; Anchor label: the 32-bit Mach-O definition of GLOBAL subtracts
        ; this symbol when forming addresses.
        fakegot:
    %elifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=16
    %else
        SECTION .rodata align=16
    %endif
%endmacro

; PIC support macros. All these macros are totally harmless when PIC is
; not defined but can ruin everything if misused in PIC mode.
; On x86_32, shared objects cannot directly access global variables by
; address; they need to go through the GOT (global offset table). Most OSes
; do not care about it and let you load non-shared .so objects (Linux,
; Win32...). However, OS X requires PIC code in its .dylib objects.
;
; - GLOBAL should be used as a suffix for global addressing, eg.
;     picgetgot ebx
;     mov eax, [foo GLOBAL]
;   instead of
;     mov eax, [foo]
;
; - picgetgot computes the GOT address into the given register in PIC
;   mode, otherwise does nothing. You need to do this before using GLOBAL.
;   Before in both execution order and compiled code order (so GLOBAL knows
;   which register the GOT is in).

%ifdef PIC
    %ifdef ARCH_X86_64
        ; 64-bit PIC: GLOBAL becomes a RIP-relative qualifier, so no base
        ; register is required and picgetgot emits nothing.
        %define PIC64
        %define GLOBAL wrt rip
        %macro picgetgot 1
        %endmacro
    %else
        ; 32-bit PIC: a base register must be computed at run time.
        ; GLOBAL is (re)defined by each picgetgot expansion so that it names
        ; the register most recently passed to picgetgot.
        %define PIC32
        %ifnidn __OUTPUT_FORMAT__,macho
            ; elf
            extern _GLOBAL_OFFSET_TABLE_
            %macro picgetgot 1
                call %%getgot           ; pushes the address of %%getgot
              %%getgot:
                pop %1                  ; %1 = run-time address of %%getgot
                add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
                %undef GLOBAL
                %define GLOBAL + %1 wrt ..gotoff
            %endmacro
        %else
            ; There is no real global offset table on OS X, but we still
            ; need to reference our variables by offset.
            %macro picgetgot 1
                call %%getgot           ; pushes the address of %%getgot
              %%getgot:
                pop %1                  ; %1 = run-time address of %%getgot
                add %1, $$ - %%getgot   ; %1 = run-time address of section start ($$)
                %undef GLOBAL
                %define GLOBAL + %1 - fakegot
            %endmacro
        %endif
    %endif
%else
    ; Non-PIC build: GLOBAL expands to nothing and picgetgot emits nothing.
    %define GLOBAL
    %macro picgetgot 1
    %endmacro
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed.
; %3 = whether global constants are used in this function. inits x86_32 PIC if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
; which are slow when a normal ret follows a branch.

; DECLARE_REG id, qword, dword, word, byte, arg_location
; Defines the abstract register r<id>:
;   r<id>q / r<id>d / r<id>w / r<id>b  sized names of the register
;   r<id>m                             operand holding argument <id> on entry
;   r<id>                              same as r<id>q
%macro DECLARE_REG 6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %define r%1m %6
    %define r%1  %2
%endmacro

; DECLARE_REG_SIZE word_name, byte_name
; Lets a q/d/w/b suffix be appended to a concrete register name as well, in
; both its r- and e-prefixed spelling (e.g. raxd and eaxd both give eax).
; On x86_32 the r-prefixed name itself is mapped to the e-prefixed register.
%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1b %2
    %define e%1b %2
%ifndef ARCH_X86_64
    %define r%1 e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl

; Size in bytes of one general-purpose register / one stack slot.
%ifdef ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

; PUSH / POP / SUB / ADD: like the plain instructions, but they also keep
; stack_offset (bytes pushed since function entry) up to date so that the
; r<N>m stack-argument operands stay valid.
%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp              ; only stack-pointer adjustments are tracked
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp              ; only stack-pointer adjustments are tracked
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

; mov / movsxd that emit nothing when both operands are textually identical.
%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

; ASSERT expr: abort assembly if the expression evaluates to 0.
; The diagnostic names the failing expression.
%macro ASSERT 1
    %if (%1) == 0
        %error assertion ``%1'' failed
    %endif
%endmacro

; DEFINE_ARGS name0, name1, ...
; Makes <name>q/d/w/b aliases for r0, r1, ... in order. Aliases created by a
; previous DEFINE_ARGS are removed first; arg_name<i> and n_arg_names record
; what has to be removed next time.
%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %assign n_arg_names %%i
%endmacro

%ifdef ARCH_X86_64 ;==========================================================

; Register count of the most recent PROLOGUE. Read by the Win64 RET/REP_RET;
; initialised here so they also assemble in a function without a PROLOGUE.
%assign regs_used 0

%ifidn __OUTPUT_FORMAT__,win32

; Win64: arguments 0-3 arrive in registers, later ones on the stack.
; r4 (rdi) and r5 (rsi) are callee-saved in the Microsoft x64 calling
; convention, so PROLOGUE pushes them before use and RET restores them.
; NOTE(review): xmm6-xmm15 are callee-saved in that convention as well and
; are not preserved by these macros.
DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 1, rdx, edx, dx,  dl,  edx
DECLARE_REG 2, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 3, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di,  dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si,  sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]

%macro PUSH_IF_USED 1 ; reg_id
    %if %1 < regs_used
        PUSH r%1
    %endif
%endmacro

; Plain pop: stack_offset is left alone so that code following an early RET
; in the same function still sees the post-PROLOGUE offset.
%macro POP_IF_USED 1 ; reg_id
    %if %1 < regs_used
        pop r%1
    %endif
%endmacro

; stack_offset accounts for the registers pushed by PROLOGUE.
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp + stack_offset + 8 + %1*8]
    %endif
%endmacro

%else ;=======================================================================

; Other x86_64 targets: arguments 0-5 arrive in registers, later ones on
; the stack. None of r0-r6 needs saving here.
DECLARE_REG 0, rdi, edi, di,  dil, edi
DECLARE_REG 1, rsi, esi, si,  sil, esi
DECLARE_REG 2, rdx, edx, dx,  dl,  edx
DECLARE_REG 3, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 4, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 5, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]

; Only ever used for argument 6, i.e. [rsp + 8]; nothing is pushed before it.
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro

%endif ; !WIN64

%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    %assign stack_offset 0
    %assign regs_used %2
%ifidn __OUTPUT_FORMAT__,win32
    ; Save the callee-saved registers before they are overwritten below.
    PUSH_IF_USED 4
    PUSH_IF_USED 5
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
%endif
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
%ifidn __OUTPUT_FORMAT__,win32
    POP_IF_USED 5
    POP_IF_USED 4
%endif
    ret
%endmacro

%macro REP_RET 0
%ifidn __OUTPUT_FORMAT__,win32
    %if regs_used > 4           ; something to pop: the 2-byte form does not apply
        RET
    %else
        rep ret
    %endif
%else
    rep ret
%endif
%endmacro

%else ; X86_32 ;==============================================================

; x86_32: every argument arrives on the stack. r3-r6 (ebx, esi, edi, ebp)
; are pushed by PROLOGUE when used and popped by RET.
DECLARE_REG 0, eax, eax, ax, al,   [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl,   [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl,   [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl,   [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
%define rsp esp

%macro PUSH_IF_USED 1 ; reg_id
    %if %1 < regs_used
        push r%1
        %assign stack_offset stack_offset+4
    %endif
%endmacro

; stack_offset is deliberately not adjusted here (see RET).
%macro POP_IF_USED 1 ; reg_id
    %if %1 < regs_used
        pop r%1
    %endif
%endmacro

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [esp + stack_offset + 4 + %1*4]
    %endif
%endmacro

%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
    ASSERT %2 >= %1
    %assign stack_offset 0
    %assign regs_used %2
    %ifdef PIC
        %if %3
            ; one extra register (r<#regs>) is reserved for the GOT pointer
            %assign regs_used regs_used+1
        %endif
    %endif
    ASSERT regs_used <= 7
    PUSH_IF_USED 3
    PUSH_IF_USED 4
    PUSH_IF_USED 5
    PUSH_IF_USED 6
    LOAD_IF_USED 0, %1
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    %if %3
        picgetgot r%2
    %endif
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    POP_IF_USED 6
    POP_IF_USED 5
    POP_IF_USED 4
    POP_IF_USED 3
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 3           ; something to pop: the 2-byte form does not apply
        RET
    %else
        rep ret
    %endif
%endmacro

%endif ;======================================================================



;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Symbol prefix for C linkage
; cglobal name [, PROLOGUE arguments...]
; Renames <name> to ff_<name> (with a leading underscore when PREFIX is
; defined), exports it, aligns and emits its label, resets the mm register
; permutation and, when further arguments are given, expands PROLOGUE.
%macro cglobal 1-2+
    %xdefine %1 ff_%1
    %ifdef PREFIX
        %xdefine %1 _ %+ %1
    %endif
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro

; cextern name: declare an external symbol, adding the leading underscore
; when PREFIX is defined.
%macro cextern 1
    %ifdef PREFIX
        extern _%1
        %define %1 _%1
    %else
        extern %1
    %endif
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

%assign FENC_STRIDE 16
%assign FDEC_STRIDE 32

; merge mmx and sse*

; CAT_XDEFINE a, b, value: %xdefine the identifier formed by pasting a and b.
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

; CAT_UNDEF a, b: %undef the identifier formed by pasting a and b.
%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

; INIT_MMX: select the 64-bit MMX register set.
;   m0..m7           -> mm0..mm7
;   nmm0..nmm7       -> 0..7 (register name back to its index)
;   m8..m15 and nmm8..nmm15 are undefined
;   mmsize, num_mmregs, mova/movu/movh/movnt take their MMX values
; RESET_MM_PERMUTATION is pointed at this macro.
%macro INIT_MMX 0
    %define RESET_MM_PERMUTATION INIT_MMX
    %define mmsize 8
    %define num_mmregs 8
    %define mova  movq
    %define movu  movq
    %define movh  movd
    %define movnt movntq
    %assign %%reg 0
    %rep 8
        CAT_XDEFINE m, %%reg, mm %+ %%reg
        CAT_XDEFINE nmm, %%reg, %%reg
        %assign %%reg %%reg+1
    %endrep
    ; %%reg continues from 8 here: drop the names above the MMX range.
    %rep 8
        CAT_UNDEF m, %%reg
        CAT_UNDEF nmm, %%reg
        %assign %%reg %%reg+1
    %endrep
%endmacro

; INIT_XMM: select the 128-bit XMM register set.
;   m<i>    -> xmm<i> and nxmm<i> -> i, for i below num_mmregs
;   num_mmregs is 16 when ARCH_X86_64 is defined, otherwise 8
;   mmsize, mova/movu/movh/movnt take their SSE2 values
; RESET_MM_PERMUTATION is pointed at this macro.
%macro INIT_XMM 0
    %define RESET_MM_PERMUTATION INIT_XMM
    %define mmsize 16
    %ifdef ARCH_X86_64
        %define num_mmregs 16
    %else
        %define num_mmregs 8
    %endif
    %define mova  movdqa
    %define movu  movdqu
    %define movh  movq
    %define movnt movntdq
    %assign %%reg 0
    %rep num_mmregs
        CAT_XDEFINE m, %%reg, xmm %+ %%reg
        CAT_XDEFINE nxmm, %%reg, %%reg
        %assign %%reg %%reg+1
    %endrep
%endmacro

INIT_MMX

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

; PERMUTE dst0,src0, dst1,src1, ...
; For every pair, m<dst> is given the register that m<src> named before the
; PERMUTE. All sources are captured first, so the pairs act simultaneously.
; The n<register> reverse map (register name -> index), which INIT_* sets up
; and SWAP maintains, is updated for every destination so that a later
; register-name SWAP ("SWAP m0,m1") infers the right indices.
%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine tmp%2 m%2          ; snapshot of the source register
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 tmp%2
    CAT_XDEFINE n, m%1, %1      ; n<new register of m%1> = %1
    %undef tmp%2
    %rotate 2
%endrep
%endmacro

; SWAP a, b [, c ...]
; Exchanges the registers named by consecutive arguments (a<->b, then b<->c,
; ...), keeping the n<register> reverse map in step.
%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    ; Called with indices: "SWAP 0,1".
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%n1 n %+ %1
    %xdefine %%n2 n %+ %2
    %xdefine tmp m %+ %%n1
    CAT_XDEFINE m, %%n1, m %+ %%n2
    CAT_XDEFINE m, %%n2, tmp
    CAT_XDEFINE n, m %+ %%n1, %%n1
    CAT_XDEFINE n, m %+ %%n2, %%n2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro

; SAVE_MM_PERMUTATION name: record the current m0..m<num_mmregs-1> mapping
; as <name>_m0, <name>_m1, ...
%macro SAVE_MM_PERMUTATION 1
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %1_m, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

; LOAD_MM_PERMUTATION name: reinstate a mapping recorded by
; SAVE_MM_PERMUTATION, including the n<register> reverse map.
%macro LOAD_MM_PERMUTATION 1
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, %1_m %+ %%i
        CAT_XDEFINE n, m %+ %%i, %%i    ; n<restored register> = index
        %assign %%i %%i+1
    %endrep
%endmacro

; call target: emits the call, then adopts the permutation saved under the
; target's name (if SAVE_MM_PERMUTATION was used for it).
%macro call 1
    call %1
    %ifdef %1_m0
        LOAD_MM_PERMUTATION %1
    %endif
%endmacro

; substitutions which are functionally identical but reduce code size
%define movdqa movaps
%define movdqu movups
