;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible.  Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well.  Send patches or ideas
; to x264-devel@videolan.org .
%define program_name ff

%ifdef ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64
    %else
        %define UNIX64
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif
; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
; and x264's strides are all positive), but is not guaranteed by the ABI.

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
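
; Usage sketch (a hypothetical constant, not part of this file): request 32-byte
; alignment for data that will be loaded with AVX:
;     SECTION_RODATA 32
;     pw_1: times 16 dw 1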

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

%ifdef WIN64
    %define PIC
%elifndef ARCH_X86_64
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
; which are slow when a normal ret follows a branch.
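
; A commented sketch of how the pieces fit together (a hypothetical function,
; assuming an earlier INIT_XMM, 16-byte-aligned pointers, and a positive len
; that is a multiple of mmsize):
;     cglobal add_vectors, 3,4,2, dst, src, len, off ; 3 args, 4 gprs, 2 xmm regs
;         xor   offq, offq
;     .loop:
;         mova  m0, [srcq+offq]
;         paddw m0, [dstq+offq]
;         mova  [dstq+offq], m0
;         add   offq, mmsize
;         cmp   offq, lenq
;         jl    .loop
;         REP_RET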

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
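;
; e.g. on UNIX64, "cglobal foo, 2,2" gives r0/r0q = rdi, r0d = edi, r0w = di,
; r0b = dil, and r0m/r0mp refer back to where arg 0 arrived (here a register,
; so r0mp is rdi as well); on x86_32 the same names map to eax plus a stack
; slot for r0m instead.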

%macro DECLARE_REG 6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %define r%1m %6
    %ifid %6 ; i.e. it's a register
        %define r%1mp %2
    %elifdef ARCH_X86_64 ; memory
        %define r%1mp qword %6
    %else
        %define r%1mp dword %6
    %endif
    %define r%1  %2
%endmacro

%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1b %2
    %define e%1b %2
%ifndef ARCH_X86_64
    %define r%1  e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro
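
; e.g. "DECLARE_REG_TMP 2,0,1" (hypothetical) makes t0=r2, t1=r0 and t2=r1, so a
; shared function body can be written in terms of t# while each calling
; convention picks its own underlying argument registers.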

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9

%ifdef ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %assign n_arg_names %%i
%endmacro
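
; e.g. a shared body might re-label only the registers it actually uses:
; "DEFINE_ARGS dst, stride, tmp" (hypothetical names) makes dstq/dstd/dstm etc.
; aliases for r0, strideq... for r1 and tmpq... for r2, replacing whatever names
; an earlier cglobal/PROLOGUE set up.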

%ifdef WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 1, rdx, edx, dx,  dl,  edx
DECLARE_REG 2, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 3, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di,  dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si,  sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp + stack_offset + 8 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    %if regs_used > 4
        push r4
        push r5
        %assign stack_offset stack_offset+16
    %endif
    WIN64_SPILL_XMM %3
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    %if mmsize == 8
        %assign xmm_regs_used 0
    %endif
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        sub rsp, (xmm_regs_used-6)*16+16
        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
        %endrep
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro
%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %if xmm_regs_used > 6
        ; undo exactly what WIN64_SPILL_XMM added to stack_offset
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    %if regs_used > 4
        pop r5
        pop r4
    %endif
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 4 || xmm_regs_used > 6
        RET
    %else
        rep ret
    %endif
%endmacro

%elifdef ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0, rdi, edi, di,  dil, edi
DECLARE_REG 1, rsi, esi, si,  sil, esi
DECLARE_REG 2, rdx, edx, dx,  dl,  edx
DECLARE_REG 3, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 4, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 5, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    ret
%endmacro

%macro REP_RET 0
    rep ret
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, eax, ax, al,   [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl,   [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl,   [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl,   [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
%define rsp esp

%macro PUSH_IF_USED 1 ; reg_id
    %if %1 < regs_used
        push r%1
        %assign stack_offset stack_offset+4
    %endif
%endmacro

%macro POP_IF_USED 1 ; reg_id
    %if %1 < regs_used
        pop r%1
    %endif
%endmacro

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [esp + stack_offset + 4 + %1*4]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    PUSH_IF_USED 3
    PUSH_IF_USED 4
    PUSH_IF_USED 5
    PUSH_IF_USED 6
    LOAD_IF_USED 0, %1
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    POP_IF_USED 6
    POP_IF_USED 5
    POP_IF_USED 4
    POP_IF_USED 3
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 3
        RET
    %else
        rep ret
    %endif
%endmacro

%endif ;======================================================================

%ifndef WIN64
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%endif



;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
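; e.g. with "%define program_name ff" above, PREFIX unset and "INIT_XMM sse2"
; active, "cglobal scalarproduct, 3,3,2" (hypothetical) emits the global label
; ff_scalarproduct_sse2, and later references to scalarproduct resolve to that
; mangled name.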
%macro cglobal 1-2+ ; name, [PROLOGUE args]
%if %0 == 1
    cglobal_internal %1 %+ SUFFIX
%else
    cglobal_internal %1 %+ SUFFIX, %2
%endif
%endmacro
%macro cglobal_internal 1-2+
    %ifndef cglobaled_%1
        %xdefine %1 mangle(program_name %+ _ %+ %1)
        %xdefine %1.skip_prologue %1 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %1, 1
    %endif
    %xdefine current_function %1
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro
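
; e.g. "const pw_42, times 8 dw 42" (hypothetical) emits a global ff_pw_42 label
; followed by its data in whatever section is currently open (normally one
; started with SECTION_RODATA), so it can also be referenced from C.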

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)

%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))

; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX & co.
%macro INIT_CPUFLAGS 0-2
    %if %0 >= 1
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if mmsize == 16 && notcpuflag(sse2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, sse3
            %define movu lddqu
        %endif
    %else
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %endif
%endmacro

; merge mmx and sse*

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
    CAT_XDEFINE m, %%i, mm %+ %%i
    CAT_XDEFINE nmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    %rep 8
    CAT_UNDEF m, %%i
    CAT_UNDEF nmm, %%i
    %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %ifdef ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, xmm %+ %%i
    CAT_XDEFINE nxmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

; FIXME: INIT_AVX can be replaced by INIT_XMM avx
%macro INIT_AVX 0
    INIT_XMM
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %ifdef ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %define movnta vmovntps
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, ymm %+ %%i
    CAT_XDEFINE nymm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro
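
; e.g. "INIT_XMM sse2" (unlike the bare INIT_XMM below) sets mmsize to 16, maps
; m0..m7 (m0..m15 on x86_64) onto xmm registers, points mova/movu/movh/movnta at
; the SSE2 forms, and makes cglobal append _sse2 to function names.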

INIT_XMM

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.
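;
; e.g. (register names, not contents, are exchanged):
;     mova  m0, [r0]
;     mova  m1, [r1]
;     SWAP 0, 1
;     mova  [r2], m0   ; stores the data that was loaded from [r1], at no runtime cost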

%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine tmp%2 m%2
    %xdefine ntmp%2 nm%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 tmp%2
    %xdefine nm%1 ntmp%2
    %undef tmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%n1 n %+ %1
    %xdefine %%n2 n %+ %2
    %xdefine tmp m %+ %%n1
    CAT_XDEFINE m, %%n1, m %+ %%n2
    CAT_XDEFINE m, %%n2, tmp
    CAT_XDEFINE n, m %+ %%n1, %%n1
    CAT_XDEFINE n, m %+ %%n2, %%n2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
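;
; e.g. a hypothetical helper that returns its result in m0:
;     cglobal sum_internal
;         ...                  ; result ends up in whatever m0 currently maps to
;         SAVE_MM_PERMUTATION
;         ret
; a caller that uses the "call" macro below then picks the permutation up
; automatically, so its m0 names the same physical register.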
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
    %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE n, m %+ %%i, %%i
        %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1, %1 %+ SUFFIX
%endmacro
%macro call_internal 2
    %xdefine %%i %1
    %ifndef cglobaled_%1
        %ifdef cglobaled_%2
            %xdefine %%i %2
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro
; Substitutions that reduce instruction size but are functionally equivalent:
; +128 doesn't fit in a sign-extended 8-bit immediate while -128 does, so
; "add x, 128" is encoded as "sub x, -128" (and vice versa) to get the shorter form.
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
%assign i i+1
%endrep
%undef i

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %ifid %5
        %define %%size sizeof%5
    %else
        %define %%size mmsize
    %endif
    %if %%size==32
        v%1 %5, %6, %7
    %else
        %if %%size==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && sizeof%5==16
                    v%1 %5, %6, %7
                %else
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
%macro RUN_AVX_INSTR1 8
    %assign %%swap 0
    %if avx_enabled
        %ifnid %6
            %assign %%swap 1
        %endif
    %elifnidn %5, %6
        %ifnid %7
            %assign %%swap 1
        %endif
    %endif
    %if %%swap && %3 == 0 && %8 == 1
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 4
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
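
; e.g. once the wrappers below are defined, "paddw m0, m1, m2" assembles to
; vpaddw when avx_enabled is set, and to "movdqa m0, m1" + "paddw m0, m2"
; otherwise; the plain 2-operand form "paddw m0, m1" is passed through unchanged.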

AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 0, 0
AVX_INSTR cmpps, 1, 0, 0
AVX_INSTR cmpsd, 1, 0, 0
AVX_INSTR cmpss, 1, 0, 0
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
AVX_INSTR maxss, 1, 0, 1
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movss, 1, 0, 0
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pcmpestri, 0, 0, 0
AVX_INSTR pcmpestrm, 0, 0, 0
AVX_INSTR pcmpistri, 0, 0, 0
AVX_INSTR pcmpistrm, 0, 0, 0
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
%assign i i+1
%endrep
%undef i
%undef j
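
; e.g. the leftmost digit selects the highest destination element, so q1032 is
; the imm8 for which "pshufd m0, m1, q1032" swaps the two dwords within each
; 64-bit half of the register.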

%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %else
            %6 %1, %2, %3
            %7 %1, %4
        %endif
    %endmacro
%endmacro

FMA_INSTR  pmacsdd,  pmulld, paddd
FMA_INSTR  pmacsww,  pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
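
; e.g. "pmacsdd m0, m1, m2, m3" becomes XOP's vpmacsdd (m0 = m1*m2 + m3, low 32
; bits) when the xop cpuflag is set, and otherwise falls back to
; "pmulld m0, m1, m2" + "paddd m0, m3" (note the fallback still needs SSE4.1 for
; pmulld).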