//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Copyright 2018 Ulf Adams
// Copyright (c) Microsoft Corporation. All rights reserved.

// Boost Software License - Version 1.0 - August 17th, 2003

// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
// this license (the "Software") to use, reproduce, display, distribute,
// execute, and transmit the Software, and to prepare derivative works of the
// Software, and to permit third-parties to whom the Software is furnished to
// do so, all subject to the following:

// The copyright notices in the Software and this entire statement, including
// the above license grant, this restriction and the following disclaimer,
// must be included in all copies of the Software, in whole or in part, and
// all derivative works of the Software, unless such copies or derivative
// works are solely in the form of machine-executable object code generated by
// a source language processor.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
#ifndef _LIBCPP_SRC_INCLUDE_RYU_DS2_INTRINSICS_H
#define _LIBCPP_SRC_INCLUDE_RYU_DS2_INTRINSICS_H

// Avoid formatting to keep the changes with the original code minimal.
// clang-format off

#include <__assert>
#include <__config>

#include "include/ryu/ryu.h"

_LIBCPP_BEGIN_NAMESPACE_STD
#if defined(_M_X64) && defined(_MSC_VER)
#define _LIBCPP_INTRINSIC128 1
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_umul128(const uint64_t __a, const uint64_t __b, uint64_t* const __productHi) {
  return _umul128(__a, __b, __productHi);
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_shiftright128(const uint64_t __lo, const uint64_t __hi, const uint32_t __dist) {
  // For the __shiftright128 intrinsic, the shift value is always
  // modulo 64.
  // In the current implementation of the double-precision version
  // of Ryu, the shift value is always < 64.
  // (The shift value is in the range [49, 58].)
  // Check this here in case a future change requires larger shift
  // values. In this case this function needs to be adjusted.
  _LIBCPP_ASSERT(__dist < 64, "");
  return __shiftright128(__lo, __hi, static_cast<unsigned char>(__dist));
}
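
// Both wrappers map to single x64 instructions: _umul128 computes the full
// 64x64 -> 128-bit product (low half returned, high half stored through the
// pointer), and __shiftright128 (SHRD) extracts 64 bits starting at bit
// __dist of the 128-bit value (__hi << 64) | __lo. As a worked example,
// __ryu_umul128(0x100000000u, 0x100000000u, &__hi) returns 0 and sets
// __hi to 1, since 2^32 * 2^32 == 2^64.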

// ^^^ intrinsics available ^^^ / vvv __int128 available vvv
#elif defined(__SIZEOF_INT128__) && ( \
    (defined(__clang__) && !defined(_MSC_VER)) || \
    (defined(__GNUC__) && !defined(__clang__) && !defined(__CUDACC__)))
#define _LIBCPP_INTRINSIC128 1
  // We have unsigned __int128 support in clang or gcc
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_umul128(const uint64_t __a, const uint64_t __b, uint64_t* const __productHi) {
  auto __temp = __a * (unsigned __int128)__b;
  *__productHi = __temp >> 64;
  return static_cast<uint64_t>(__temp);
}
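
// With optimizations enabled, this unsigned __int128 multiplication of two
// 64-bit operands typically lowers to a single widening multiply (one MUL on
// x86-64, a MUL/UMULH pair on AArch64) rather than a call into the compiler
// runtime.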

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_shiftright128(const uint64_t __lo, const uint64_t __hi, const uint32_t __dist) {
  // In the current implementation of the double-precision version
  // of Ryu, the shift value is always < 64.
  // (The shift value is in the range [49, 58].)
  // Check this here in case a future change requires larger shift
  // values. In this case this function needs to be adjusted.
  _LIBCPP_ASSERT(__dist < 64, "");
  auto __temp = __lo | ((unsigned __int128)__hi << 64);
  // For x64 128-bit shifts using the `shrd` instruction and two 64-bit
  // registers, the shift value is modulo 64.  Thus the `& 63` is free.
  return static_cast<uint64_t>(__temp >> (__dist & 63));
}
#else // ^^^ __int128 available ^^^ / vvv intrinsics unavailable vvv

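// Portable 64x64 -> 128-bit multiplication built from four 32x32 -> 64-bit
// partial products. Writing __a == __aHi * 2^32 + __aLo (and likewise for
// __b), the product decomposes as
//   __a * __b == __b11 * 2^64 + (__b01 + __b10) * 2^32 + __b00.
// __mid1 and __mid2 fold the two middle terms together with the carry out
// of the low 32 bits; none of these sums can overflow, because a 64-bit
// partial product is at most (2^32 - 1)^2 and adding a 32-bit carry keeps
// the total below 2^64.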
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline _LIBCPP_ALWAYS_INLINE uint64_t __ryu_umul128(const uint64_t __a, const uint64_t __b, uint64_t* const __productHi) {
  // TRANSITION, VSO-634761
  // The casts here help MSVC to avoid calls to the __allmul library function.
  const uint32_t __aLo = static_cast<uint32_t>(__a);
  const uint32_t __aHi = static_cast<uint32_t>(__a >> 32);
  const uint32_t __bLo = static_cast<uint32_t>(__b);
  const uint32_t __bHi = static_cast<uint32_t>(__b >> 32);

  const uint64_t __b00 = static_cast<uint64_t>(__aLo) * __bLo;
  const uint64_t __b01 = static_cast<uint64_t>(__aLo) * __bHi;
  const uint64_t __b10 = static_cast<uint64_t>(__aHi) * __bLo;
  const uint64_t __b11 = static_cast<uint64_t>(__aHi) * __bHi;

  const uint32_t __b00Lo = static_cast<uint32_t>(__b00);
  const uint32_t __b00Hi = static_cast<uint32_t>(__b00 >> 32);

  const uint64_t __mid1 = __b10 + __b00Hi;
  const uint32_t __mid1Lo = static_cast<uint32_t>(__mid1);
  const uint32_t __mid1Hi = static_cast<uint32_t>(__mid1 >> 32);

  const uint64_t __mid2 = __b01 + __mid1Lo;
  const uint32_t __mid2Lo = static_cast<uint32_t>(__mid2);
  const uint32_t __mid2Hi = static_cast<uint32_t>(__mid2 >> 32);

  const uint64_t __pHi = __b11 + __mid1Hi + __mid2Hi;
  const uint64_t __pLo = (static_cast<uint64_t>(__mid2Lo) << 32) | __b00Lo;

  *__productHi = __pHi;
  return __pLo;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_shiftright128(const uint64_t __lo, const uint64_t __hi, const uint32_t __dist) {
  // We don't need to handle the case __dist >= 64 here (see above).
  _LIBCPP_ASSERT(__dist < 64, "");
#ifdef _LIBCPP_64_BIT
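  // __dist must also be nonzero in this branch: with __dist == 0 the
  // expression below would shift __hi left by 64 bits, which is undefined.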
  _LIBCPP_ASSERT(__dist > 0, "");
  return (__hi << (64 - __dist)) | (__lo >> __dist);
#else // ^^^ 64-bit ^^^ / vvv 32-bit vvv
  // Avoid a 64-bit shift by taking advantage of the range of shift values.
  _LIBCPP_ASSERT(__dist >= 32, "");
  return (__hi << (64 - __dist)) | (static_cast<uint32_t>(__lo >> 32) >> (__dist - 32));
#endif // ^^^ 32-bit ^^^
}

#endif // ^^^ intrinsics unavailable ^^^

#ifndef _LIBCPP_64_BIT

// Returns the high 64 bits of the 128-bit product of __a and __b.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __umulh(const uint64_t __a, const uint64_t __b) {
  // Reuse the __ryu_umul128 implementation.
  // Optimizers will likely eliminate the instructions used to compute the
  // low part of the product.
  uint64_t __hi;
  (void) __ryu_umul128(__a, __b, &__hi);
  return __hi;
}

// On 32-bit platforms, compilers typically generate calls to library
// functions for 64-bit divisions, even if the divisor is a constant.
//
// TRANSITION, LLVM-37932
//
// The functions here perform division-by-constant using multiplications
// in the same way as 64-bit compilers would do.
//
// NB:
// The multipliers and shift values are the ones generated by clang x64
// for expressions like x/5, x/10, etc.

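// All of these follow the standard multiply-and-shift (Granlund-Montgomery)
// scheme: for a divisor d, pick m == ceil(2^k / d) with k large enough that
// x / d == (x * m) >> k for every uint64_t x. __umulh already discards
// 64 bits, so only k - 64 remains as an explicit shift:
//   __div5:   0xCCCCCCCCCCCCCCCDu == ceil(2^66 / 5),  k == 66
//   __div10:  0xCCCCCCCCCCCCCCCDu == ceil(2^67 / 10), k == 67
//   __div100: pre-shifting by 2 reduces the divisor to 25;
//             0x28F5C28F5C28F5C3u == ceil(2^66 / 25), k == 66
//   __div1e8: 0xABCC77118461CEFDu == ceil(2^90 / 10^8), k == 90
//   __div1e9: pre-shifting by 9 reduces the divisor to 1953125
//             (10^9 == 2^9 * 1953125);
//             0x44B82FA09B5A53u == ceil(2^75 / 1953125), k == 75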
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div5(const uint64_t __x) {
  return __umulh(__x, 0xCCCCCCCCCCCCCCCDu) >> 2;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div10(const uint64_t __x) {
  return __umulh(__x, 0xCCCCCCCCCCCCCCCDu) >> 3;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div100(const uint64_t __x) {
  return __umulh(__x >> 2, 0x28F5C28F5C28F5C3u) >> 2;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e8(const uint64_t __x) {
  return __umulh(__x, 0xABCC77118461CEFDu) >> 26;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e9(const uint64_t __x) {
  return __umulh(__x >> 9, 0x44B82FA09B5A53u) >> 11;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint32_t __mod1e9(const uint64_t __x) {
  // Avoid 64-bit math as much as possible.
  // Returning static_cast<uint32_t>(__x - 1000000000 * __div1e9(__x)) would
  // perform 32x64-bit multiplication and 64-bit subtraction.
  // __x and 1000000000 * __div1e9(__x) are guaranteed to differ by
  // less than 10^9 < 2^32, so the difference fits in 32 bits and we can
  // subtract the two low halves modulo 2^32; any borrow out of the low
  // half is immaterial.
  // We can also simplify static_cast<uint32_t>(1000000000 * __div1e9(__x)).
  // We can truncate before multiplying instead of after, as multiplying
  // the highest 32 bits of __div1e9(__x) can't affect the lowest 32 bits.
  return static_cast<uint32_t>(__x) - 1000000000 * static_cast<uint32_t>(__div1e9(__x));
}
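
// Worked example of the truncation argument: for __x == 4294967301
// (2^32 + 5), __div1e9(__x) == 4 and the exact remainder is 294967301.
// Using only the low halves: static_cast<uint32_t>(__x) == 5 and
// 1000000000 * 4 == 4000000000, so 5 - 4000000000 wraps modulo 2^32 to
// 5 - 4000000000 + 4294967296 == 294967301, the correct result.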

#else // ^^^ 32-bit ^^^ / vvv 64-bit vvv

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div5(const uint64_t __x) {
  return __x / 5;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div10(const uint64_t __x) {
  return __x / 10;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div100(const uint64_t __x) {
  return __x / 100;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e8(const uint64_t __x) {
  return __x / 100000000;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e9(const uint64_t __x) {
  return __x / 1000000000;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint32_t __mod1e9(const uint64_t __x) {
  return static_cast<uint32_t>(__x - 1000000000 * __div1e9(__x));
}

#endif // ^^^ 64-bit ^^^

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint32_t __pow5Factor(uint64_t __value) {
  uint32_t __count = 0;
  for (;;) {
    _LIBCPP_ASSERT(__value != 0, "");
    const uint64_t __q = __div5(__value);
    const uint32_t __r = static_cast<uint32_t>(__value) - 5 * static_cast<uint32_t>(__q);
    if (__r != 0) {
      break;
    }
    __value = __q;
    ++__count;
  }
  return __count;
}
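
// __pow5Factor returns the largest power of 5 dividing __value, e.g.
// __pow5Factor(125) == 3 and __pow5Factor(6) == 0. The remainder __r uses
// the same low-32-bit truncation trick as __mod1e9: __value and 5 * __q
// differ by less than 5, so subtracting the low halves modulo 2^32 is
// exact. The assert matters because a __value of 0 would make __r == 0 on
// every iteration and the loop would never terminate.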

// Returns true if __value is divisible by 5^__p.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline bool __multipleOfPowerOf5(const uint64_t __value, const uint32_t __p) {
  // I tried a case distinction on __p, but there was no performance difference.
  return __pow5Factor(__value) >= __p;
}
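
// For example, __multipleOfPowerOf5(75, 2) is true (75 == 3 * 5^2), while
// __multipleOfPowerOf5(75, 3) is false.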

// Returns true if __value is divisible by 2^__p.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline bool __multipleOfPowerOf2(const uint64_t __value, const uint32_t __p) {
  _LIBCPP_ASSERT(__value != 0, "");
  _LIBCPP_ASSERT(__p < 64, "");
  // __builtin_ctzll doesn't appear to be faster here.
  return (__value & ((1ull << __p) - 1)) == 0;
}
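
// The mask (1ull << __p) - 1 selects the low __p bits; __value is divisible
// by 2^__p exactly when all of them are zero. For example,
// __multipleOfPowerOf2(40, 3) is true (40 == 0b101000), while
// __multipleOfPowerOf2(40, 4) is false.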

_LIBCPP_END_NAMESPACE_STD

// clang-format on

#endif // _LIBCPP_SRC_INCLUDE_RYU_DS2_INTRINSICS_H