// floatundisf.S revision 276789
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.

#include "../assembly.h"

// float __floatundisf(du_int a);
//
// Converts an unsigned 64-bit integer to single precision.
//
// Syntax: AT&T (GNU as), run through the C preprocessor.
// Target: i386 with SSE2.
// In:     4(%esp) = low 32 bits of a, 8(%esp) = high 32 bits of a
// Out:    result on the x87 stack in st(0)
// Clobb:  eax, ecx, edx, xmm0-xmm3, flags; the dword at 4(%esp) is
//         overwritten with the result bits before it is loaded into st(0).

// Note that there is a hardware instruction, fildll, that does most of what
// this function needs to do.  However, because of our ia32 ABI, it will take
// a write-small read-large stall, so the software implementation here is
// actually several cycles faster.

// This is a branch-free implementation.  A branchy implementation might be
// faster for the common case if you know something a priori about the input
// distribution.

/* branch-free x87 implementation - one cycle slower than without x87.

#ifdef __i386__

CONST_SECTION
.balign 8

		.quad	0x43f0000000000000
twop64:	.quad	0x0000000000000000

#define			TWOp64			twop64-0b(%ecx,%eax,8)

.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__floatundisf)
	movl		8(%esp),		%eax
	movd		8(%esp),		%xmm1
	movd		4(%esp),		%xmm0
	punpckldq	%xmm1,			%xmm0
	calll		0f
0:	popl		%ecx
	sarl		$31,			%eax
	movq		%xmm0,			4(%esp)
	fildll		4(%esp)
	faddl		TWOp64
	fstps		4(%esp)
	flds		4(%esp)
	ret
END_COMPILERRT_FUNCTION(__floatundisf)

#endif // __i386__

*/

/* branch-free, x87-free implementation - faster at the expense of code size */

#ifdef __i386__

CONST_SECTION

// Layout note: STICKY indexes the sticky table with %eax, which is either
// -1 or 0, scaled by 8.  Index 0 reads the zero quad at sticky; index -1
// reads the 8 bytes just before sticky, i.e. the 0xfff quad that ends the
// twop52 table.  twop52 must therefore stay exactly 16 bytes long and sticky
// must stay at the very next 16-byte boundary.
	.balign 16
twop52:
	.quad 0x4330000000000000		// 0x1.0p52 as a double
	.quad 0x0000000000000fff		// low-12-bit mask, read as sticky[-1]

	.balign 16
sticky:
	.quad 0x0000000000000000		// sticky[0]: mask for small inputs
	.long 0x00000012			// not referenced by the code in this file

	.balign 16
twelve:
	.long 0x00000000			// not referenced by the code in this file

// Both operands are addressed relative to local label 0, whose run-time
// address the function pops into %ecx (position-independent access).
#define			TWOp52			twop52-0b(%ecx)
#define			STICKY			sticky-0b(%ecx,%eax,8)

.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__floatundisf)
	// Gather the 64-bit argument into the low quad of xmm0 and keep a copy
	// of its high word in eax for the magnitude test.
	movl		8(%esp),		%eax	// high 32 bits of input
	movd		8(%esp),		%xmm1
	movd		4(%esp),		%xmm0
	punpckldq	%xmm1,			%xmm0	// xmm0 = input (64 bits)

	// ecx = address of label 0, the base for TWOp52 / STICKY.
	calll		0f
0:	popl		%ecx

	// "big input" means input >= 2^52: the sum below carries into the sign
	// bit exactly when (high word >> 1) >= 0x00080000.
	shrl		%eax					// high 31 bits of input as sint32
	addl		$0x7ff80000,	%eax
	sarl		$31,			%eax	// (big input) ? -1 : 0
	movsd		STICKY,			%xmm1	// (big input) ? 0xfff : 0
	movl		$12,			%edx
	andl		%eax,			%edx	// (big input) ? 12 : 0
	movd		%edx,			%xmm3
	andpd		%xmm0,			%xmm1	// (big input) ? input & 0xfff : 0
	movsd		TWOp52,			%xmm2	// 0x1.0p52
	psrlq		%xmm3,			%xmm0	// (big input) ? input >> 12 : input
	orpd		%xmm2,			%xmm1	// 0x1.0p52 | ((big input) ? input & 0xfff : 0)
	orpd		%xmm1,			%xmm0	// 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input)
	subsd		%xmm2,			%xmm0	// (double)((big input) ? (input >> 12 | input & 0xfff) : input)
	cvtsd2ss	%xmm0,			%xmm0	// (float)((big input) ? (input >> 12 | input & 0xfff) : input)

	// Undo the >> 12 by adding 12 to the exponent field of the float
	// (xmm3 holds 12 or 0 in its low dword, moved up to bit 23).
	pslld		$23,			%xmm3
	paddd		%xmm3,			%xmm0	// (float)input

	// Hand the result back on the x87 stack via the argument slot.
	movd		%xmm0,			4(%esp)
	flds		4(%esp)
	ret
END_COMPILERRT_FUNCTION(__floatundisf)

#endif // __i386__

#if defined(__linux__) || defined(__FreeBSD__)
// Mark this object as not requiring an executable stack.
	.section .note.GNU-stack,"",%progbits
#endif