// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.

#include "../assembly.h"

// float __floatundisf(du_int a);
//
// Converts an unsigned 64-bit integer to single precision.
//
// Syntax: AT&T (GAS), run through the C preprocessor.
// In:     a on the stack; low word at 4(%esp), high word at 8(%esp).
// Out:    result in st(0).
// Clobb:  eax, ecx, edx, xmm0-xmm3, flags.  The argument slot at 4(%esp) is
//         overwritten; it is used as scratch to move the result to the x87.
// Needs:  SSE2.

// Note that there is a hardware instruction, fildll, that does most of what
// this function needs to do.  However, because of our ia32 ABI, it will take
// a write-small read-large stall, so the software implementation here is
// actually several cycles faster.

// This is a branch-free implementation.  A branchy implementation might be
// faster for the common case if you know something a priori about the input
// distribution.

/* branch-free x87 implementation - one cycle slower than without x87.
   Kept for reference only; it is not assembled.
   (.balign takes a byte count, so the quad table below is aligned to 8.)

#ifdef __i386__

.const
.balign 8

		.quad	0x43f0000000000000
twop64:	.quad	0x0000000000000000

#define			TWOp64			twop64-0b(%ecx,%eax,8)

.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__floatundisf)
	movl		8(%esp),		%eax
	movd		8(%esp),		%xmm1
	movd		4(%esp),		%xmm0
	punpckldq	%xmm1,			%xmm0
	calll		0f
0:	popl		%ecx
	sarl		$31,			%eax
	movq		%xmm0,			4(%esp)
	fildll		4(%esp)
	faddl		TWOp64
	fstps		4(%esp)
	flds		4(%esp)
	ret
END_COMPILERRT_FUNCTION(__floatundisf)

#endif // __i386__

*/

/* branch-free, x87-free implementation - faster at the expense of code size */

#ifdef __i386__

#if defined(__APPLE__)
	.const
#elif defined(__ELF__)
	.section .rodata
#else
	.section .rdata,"rd"
#endif

// Constant pool.  LAYOUT MATTERS: STICKY is indexed with %eax = -1 or 0, so
// the quad immediately before the sticky label (the second quad of twop52)
// is the mask selected for big inputs.  Do not insert, remove or reorder
// anything between twop52 and sticky.
	.balign 16
twop52:
	.quad 0x4330000000000000	// bit pattern of the double 0x1.0p52
	.quad 0x0000000000000fff	// sticky[-1]: low 12 bits (big input)

	.balign 16
sticky:
	.quad 0x0000000000000000	// sticky[0]: empty mask (small input)
	.long 0x00000012		// never read by the code in this file

	.balign 16
twelve:
	.long 0x00000000		// never referenced by the code in this file

// Both operands are relative to the PIC base in %ecx (address of label 0
// inside the function).  STICKY additionally requires %eax to be -1 or 0.
#define			TWOp52			twop52-0b(%ecx)
#define			STICKY			sticky-0b(%ecx,%eax,8)

.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__floatundisf)
	// eax = high word of input; xmm0 = full 64-bit input (low qword).
	movl		8(%esp),		%eax
	movd		8(%esp),		%xmm1
	movd		4(%esp),		%xmm0
	punpckldq	%xmm1,			%xmm0

	// Position-independent base: ecx = address of label 0.
	calll		0f
0:	popl		%ecx

	// "big input" means input >= 2^52, i.e. high word >= 0x00100000.
	// (high >> 1) + 0x7ff80000 has its sign bit set exactly in that case.
	shrl		%eax					// high 31 bits of input as sint32
	addl		$0x7ff80000,	%eax
	sarl		$31,			%eax	// (big input) ? -1 : 0
	movsd		STICKY,			%xmm1	// (big input) ? 0xfff : 0
	movl		$12,			%edx
	andl		%eax,			%edx	// (big input) ? 12 : 0
	movd		%edx,			%xmm3
	andpd		%xmm0,			%xmm1	// (big input) ? input & 0xfff : 0
	movsd		TWOp52,			%xmm2	// 0x1.0p52
	psrlq		%xmm3,			%xmm0	// (big input) ? input >> 12 : input
	orpd		%xmm2,			%xmm1	// 0x1.0p52 + ((big input) ? input & 0xfff : 0)
	orpd		%xmm1,			%xmm0	// 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input)
	subsd		%xmm2,			%xmm0	// (double)((big input) ? (input >> 12 | input & 0xfff) : input)
	cvtsd2ss	%xmm0,			%xmm0	// (float)((big input) ? (input >> 12 | input & 0xfff) : input)

	// Undo the >> 12 by adding 12 to the biased exponent field of the float.
	pslld		$23,			%xmm3	// (big input) ? 12 << 23 : 0
	paddd		%xmm3,			%xmm0	// (float)input

	// Return the float in st(0), going through the argument slot.
	movd		%xmm0,			4(%esp)
	flds		4(%esp)
	ret
END_COMPILERRT_FUNCTION(__floatundisf)

#endif // __i386__