// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "../assembly.h"

// float __floatundisf(du_int a);

// Note that there is a hardware instruction, fildll, that does most of what
// this function needs to do. However, because of our ia32 ABI, it will take
// a write-small read-large stall, so the software implementation here is
// actually several cycles faster.

// This is a branch-free implementation. A branchy implementation might be
// faster for the common case if you know something a priori about the input
// distribution.

/* branch-free x87 implementation - one cycle slower than without x87.

#ifdef __i386__

CONST_SECTION
.balign 3

	.quad	0x43f0000000000000
twop64:	.quad	0x0000000000000000

#define TWOp64 twop64-0b(%ecx,%eax,8)

.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__floatundisf)
	movl		8(%esp),	%eax
	movd		8(%esp),	%xmm1
	movd		4(%esp),	%xmm0
	punpckldq	%xmm1,		%xmm0
	calll		0f
0:	popl		%ecx
	sarl		$31,		%eax
	movq		%xmm0,		4(%esp)
	fildll		4(%esp)
	faddl		TWOp64
	fstps		4(%esp)
	flds		4(%esp)
	ret
END_COMPILERRT_FUNCTION(__floatundisf)

#endif // __i386__

*/

// branch-free, x87-free implementation - faster at the expense of code size

#ifdef __i386__

CONST_SECTION

// Constant pool. It is addressed relative to the local label 0 inside the
// function (see TWOp52 / STICKY), so no absolute relocations are needed.
//
// Layout dependency: STICKY is indexed with %eax in {0, -1} and scale 8, so
// it reads either `sticky` (all zero) or `sticky - 8`. The latter is the
// second quad of `twop52` (0xfff) only because `twop52` is exactly 16 bytes
// long and both labels are 16-byte aligned. Do not insert anything between
// the two tables.

	.balign 16
twop52:
	.quad 0x4330000000000000	// bit pattern of the double 0x1.0p52
	.quad 0x0000000000000fff	// mask of the 12 bits shifted out for big inputs

	.balign 16
sticky:
	.quad 0x0000000000000000	// mask used for small inputs: keep nothing
	.long 0x00000012		// not referenced by the code below

	.balign 16
twelve:
	.long 0x00000000		// not referenced by the code below

// Both macros expect %ecx to hold the address of the most recent local
// label 0; STICKY additionally expects %eax to be 0 or -1.
#define TWOp52 twop52-0b(%ecx)
#define STICKY sticky-0b(%ecx,%eax,8)

//-----------------------------------------------------------------------
// float __floatundisf(du_int a)
//
// Converts an unsigned 64-bit integer to float without branches and
// without x87 arithmetic (x87 is used only to hand back the result).
//
// In:    4(%esp) = low 32 bits of a, 8(%esp) = high 32 bits of a
// Out:   st(0)   = (float)a
// Clobb: %eax, %ecx, %edx, %xmm0-%xmm3, flags; the argument slot at
//        4(%esp) is overwritten with the result bits.
//
// Method: an input below 2^52 fits exactly in the significand of a double,
// so it is OR-ed into the bit pattern of 0x1.0p52 and 0x1.0p52 is then
// subtracted, giving (double)a exactly. A "big" input (>= 2^52) is first
// shifted right by 12 with the 12 dropped bits OR-ed back into the low
// bits as sticky bits, converted the same way, rounded to float, and then
// rescaled by adding 12 to the float exponent field.
//-----------------------------------------------------------------------
.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__floatundisf)
	// Assemble the 64-bit input in the low quad of %xmm0 and keep a copy
	// of the high word in %eax for the range test.
	movl		8(%esp),	%eax
	movd		8(%esp),	%xmm1
	movd		4(%esp),	%xmm0
	punpckldq	%xmm1,		%xmm0

	// Position-independent base: %ecx = address of the local label 0.
	calll		0f
0:	popl		%ecx

	// (hi >> 1) + 0x7ff80000 has its sign bit set exactly when
	// hi >= 0x00100000, i.e. when the input is >= 2^52.
	shrl		%eax				// high 31 bits of input as sint32
	addl		$0x7ff80000,	%eax
	sarl		$31,		%eax		// (big input) ? -1 : 0
	movsd		STICKY,		%xmm1		// (big input) ? 0xfff : 0
	movl		$12,		%edx
	andl		%eax,		%edx		// (big input) ? 12 : 0
	movd		%edx,		%xmm3
	andpd		%xmm0,		%xmm1		// (big input) ? input & 0xfff : 0
	movsd		TWOp52,		%xmm2		// 0x1.0p52
	psrlq		%xmm3,		%xmm0		// (big input) ? input >> 12 : input
	orpd		%xmm2,		%xmm1		// 0x1.0p52 + ((big input) ? input & 0xfff : input)
	orpd		%xmm1,		%xmm0		// 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input)
	subsd		%xmm2,		%xmm0		// (double)((big input) ? (input >> 12 | input & 0xfff) : input)
	cvtsd2ss	%xmm0,		%xmm0		// (float)((big input) ? (input >> 12 | input & 0xfff) : input)

	// Undo the pre-shift: 12 << 23 lands in the float exponent field, so
	// the integer add multiplies the result by 2^12 (adds 0 when not big).
	pslld		$23,		%xmm3
	paddd		%xmm3,		%xmm0		// (float)input

	// The result is returned in st(0), so bounce the float through the
	// argument slot on the stack.
	movd		%xmm0,		4(%esp)
	flds		4(%esp)
	ret
END_COMPILERRT_FUNCTION(__floatundisf)

#endif // __i386__

NO_EXEC_STACK_DIRECTIVE
