// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "../assembly.h"

// float __floatundisf(du_int a);

// Note that there is a hardware instruction, fildll, that does most of what
// this function needs to do. However, because of our ia32 ABI, it will take
// a write-small read-large stall, so the software implementation here is
// actually several cycles faster.

// This is a branch-free implementation. A branchy implementation might be
// faster for the common case if you know something a priori about the input
// distribution.

/* branch-free x87 implementation - one cycle slower than without x87.

// This variant is disabled (it lives entirely inside this block comment) and
// is kept for reference only.
//
// Table layout: TWOp64 indexes with %eax * 8 where %eax is 0 or -1 (the sign
// of the high input word after sarl $31), so it selects either the zero quad
// at twop64 or the quad 8 bytes before it, 0x43f0000000000000, which is the
// IEEE-754 double bit pattern of 2^64.
//
// NOTE(review): ".balign 3" is not a power of two. If this variant is ever
// revived it presumably wants 8-byte alignment (".balign 8") - confirm.

#ifdef __i386__

CONST_SECTION
.balign 3

	.quad	0x43f0000000000000
twop64:	.quad	0x0000000000000000

#define TWOp64 twop64-0b(%ecx,%eax,8)

.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__floatundisf)
	movl		8(%esp),	%eax
	movd		8(%esp),	%xmm1
	movd		4(%esp),	%xmm0
	punpckldq	%xmm1,		%xmm0
	calll		0f
0:	popl		%ecx
	sarl		$31,		%eax
	movq		%xmm0,		4(%esp)
	fildll		4(%esp)
	faddl		TWOp64
	fstps		4(%esp)
	flds		4(%esp)
	ret
END_COMPILERRT_FUNCTION(__floatundisf)

#endif // __i386__

*/

// branch-free, x87-free implementation - faster at the expense of code size

#ifdef __i386__

CONST_SECTION

// Constant pool for the SSE2 implementation.
//
// The layout is load-bearing: STICKY addresses "sticky + %eax * 8" with %eax
// equal to 0 or -1. Index 0 reads the zero quad at "sticky"; index -1 reads
// the quad 8 bytes before it, which is the 0xfff mask stored as the second
// quad of "twop52". That only works while "twop52" stays exactly 16 bytes
// long and "sticky" follows it at the next 16-byte boundary, so do not
// insert, remove or reorder entries between the two labels.

	.balign 16
twop52:
	.quad 0x4330000000000000	// bit pattern of the double 0x1.0p52
	.quad 0x0000000000000fff	// low-12-bit mask, read as STICKY with index -1

	.balign 16
sticky:
	.quad 0x0000000000000000	// empty mask, read as STICKY with index 0
	// NOTE(review): the word below is not referenced by any code in this file.
	// Its value 0x12 is 18 decimal, not 12; the shift count 12 is materialised
	// with an immediate in the function instead.
	.long 0x00000012

	.balign 16
twelve:
	// NOTE(review): this label is not referenced by any code in this file.
	.long 0x00000000

// Both macros are relative to the local label "0:", whose run-time address
// the function obtains in %ecx with a calll/popl pair. This keeps the
// constant accesses position independent.
#define TWOp52 twop52-0b(%ecx)
#define STICKY sticky-0b(%ecx,%eax,8)

.text
.balign 4
// float __floatundisf(du_int a)
//
// Converts the unsigned 64-bit integer found on the stack (low word at
// 4(%esp), high word at 8(%esp)) to single precision without branches and
// without using x87 arithmetic. The result is stored over the low argument
// word and loaded onto the x87 stack with flds just before returning.
//
// Outline:
//   1. Build an all-ones / all-zeros mask in %eax telling whether the input
//      is "big", i.e. whether (high word >> 1) + 0x7ff80000 has its sign bit
//      set, which happens once the input reaches 2^52.
//   2. For big inputs shift the value right by 12 and OR the original low 12
//      bits back in (the "sticky" bits, selected through the STICKY table),
//      so the value fits below the 0x1.0p52 bit pattern.
//   3. OR in the bit pattern of 0x1.0p52 and subtract 0x1.0p52 as a double,
//      which yields the (possibly shifted) value as a double.
//   4. Narrow to float, then add (shift count << 23) to the float bit pattern
//      to undo the shift by bumping the exponent field.
//
// Registers written: %eax, %ecx, %edx, %xmm0-%xmm3.
DEFINE_COMPILERRT_FUNCTION(__floatundisf)
	movl		8(%esp),	%eax	// high word of the input
	movd		8(%esp),	%xmm1
	movd		4(%esp),	%xmm0
	punpckldq	%xmm1,		%xmm0	// xmm0 = high:low, the full 64-bit input

	// Obtain the address of label 0 in %ecx; TWOp52 and STICKY are
	// expressed relative to it.
	calll		0f
0:	popl		%ecx
	shrl		%eax			// high 31 bits of input as sint32
	addl		$0x7ff80000,	%eax
	sarl		$31,		%eax	// (big input) ? -1 : 0
	movsd		STICKY,		%xmm1	// (big input) ? 0xfff : 0
	movl		$12,		%edx
	andl		%eax,		%edx	// (big input) ? 12 : 0
	movd		%edx,		%xmm3	// shift count, reused below for the exponent fix-up
	andpd		%xmm0,		%xmm1	// (big input) ? input & 0xfff : 0
	movsd		TWOp52,		%xmm2	// 0x1.0p52
	psrlq		%xmm3,		%xmm0	// (big input) ? input >> 12 : input
	orpd		%xmm2,		%xmm1	// 0x1.0p52 + ((big input) ? input & 0xfff : input)
	orpd		%xmm1,		%xmm0	// 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input)
	subsd		%xmm2,		%xmm0	// (double)((big input) ? (input >> 12 | input & 0xfff) : input)
	cvtsd2ss	%xmm0,		%xmm0	// (float)((big input) ? (input >> 12 | input & 0xfff) : input)
	pslld		$23,		%xmm3	// move the shift count (12 or 0) up to bit 23
	paddd		%xmm3,		%xmm0	// (float)input
	movd		%xmm0,		4(%esp)	// spill the float bits over the low argument word
	flds		4(%esp)			// load the result onto the x87 stack
	ret
END_COMPILERRT_FUNCTION(__floatundisf)

#endif // __i386__

NO_EXEC_STACK_DIRECTIVE
