1b8e8012aSKirill Okhotnikov //===-- Common header for fmod implementations ------------------*- C++ -*-===// 2b8e8012aSKirill Okhotnikov // 3b8e8012aSKirill Okhotnikov // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4b8e8012aSKirill Okhotnikov // See https://llvm.org/LICENSE.txt for license information. 5b8e8012aSKirill Okhotnikov // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6b8e8012aSKirill Okhotnikov // 7b8e8012aSKirill Okhotnikov //===----------------------------------------------------------------------===// 8b8e8012aSKirill Okhotnikov 9270547f3SGuillaume Chatelet #ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_FMOD_H 10270547f3SGuillaume Chatelet #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_FMOD_H 11b8e8012aSKirill Okhotnikov 121d894788SGuillaume Chatelet #include "src/__support/CPP/bit.h" 1391eb0b65SGuillaume Chatelet #include "src/__support/CPP/limits.h" 14f7226150SGuillaume Chatelet #include "src/__support/CPP/type_traits.h" 15b8e8012aSKirill Okhotnikov #include "src/__support/FPUtil/FEnvImpl.h" 16b8e8012aSKirill Okhotnikov #include "src/__support/FPUtil/FPBits.h" 17*5ff3ff33SPetr Hosek #include "src/__support/macros/config.h" 18737e1cd1SGuillaume Chatelet #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY 19b8e8012aSKirill Okhotnikov 20*5ff3ff33SPetr Hosek namespace LIBC_NAMESPACE_DECL { 21b8e8012aSKirill Okhotnikov namespace fputil { 22b8e8012aSKirill Okhotnikov namespace generic { 23b8e8012aSKirill Okhotnikov 24b8e8012aSKirill Okhotnikov // Objective: 25b8e8012aSKirill Okhotnikov // The algorithm uses integer arithmetic (max uint64_t) for general case. 26b8e8012aSKirill Okhotnikov // Some common cases, like abs(x) < abs(y) or abs(x) < 1000 * abs(y) are 27b8e8012aSKirill Okhotnikov // treated specially to increase performance. The part of checking special 28b8e8012aSKirill Okhotnikov // cases, numbers NaN, INF etc. treated separately. 29b8e8012aSKirill Okhotnikov // 30b8e8012aSKirill Okhotnikov // Objective: 31b8e8012aSKirill Okhotnikov // 1) FMod definition (https://cplusplus.com/reference/cmath/fmod/): 32b8e8012aSKirill Okhotnikov // fmod = numer - tquot * denom, where tquot is the truncated 33b8e8012aSKirill Okhotnikov // (i.e., rounded towards zero) result of: numer/denom. 34b8e8012aSKirill Okhotnikov // 2) FMod with negative x and/or y can be trivially converted to fmod for 35b8e8012aSKirill Okhotnikov // positive x and y. Therefore the algorithm below works only with 36b8e8012aSKirill Okhotnikov // positive numbers. 37b8e8012aSKirill Okhotnikov // 3) All positive floating point numbers can be represented as m * 2^e, 38b8e8012aSKirill Okhotnikov // where "m" is positive integer and "e" is signed. 39b8e8012aSKirill Okhotnikov // 4) FMod function can be calculated in integer numbers (x > y): 40b8e8012aSKirill Okhotnikov // fmod = m_x * 2^e_x - tquot * m_y * 2^e_y 41b8e8012aSKirill Okhotnikov // = 2^e_y * (m_x * 2^(e_x - e^y) - tquot * m_y). 42b8e8012aSKirill Okhotnikov // All variables in parentheses are unsigned integers. 43b8e8012aSKirill Okhotnikov // 44b8e8012aSKirill Okhotnikov // Mathematical background: 45b8e8012aSKirill Okhotnikov // Input x,y in the algorithm is represented (mathematically) like m_x*2^e_x 46b8e8012aSKirill Okhotnikov // and m_y*2^e_y. This is an ambiguous number representation. For example: 47b8e8012aSKirill Okhotnikov // m * 2^e = (2 * m) * 2^(e-1) 48b8e8012aSKirill Okhotnikov // The algorithm uses the facts that 49b8e8012aSKirill Okhotnikov // r = a % b = (a % (N * b)) % b, 50b8e8012aSKirill Okhotnikov // (a * c) % (b * c) = (a % b) * c 51b8e8012aSKirill Okhotnikov // where N is positive integer number. a, b and c - positive. Let's adopt 52b8e8012aSKirill Okhotnikov // the formula for representation above. 53b8e8012aSKirill Okhotnikov // a = m_x * 2^e_x, b = m_y * 2^e_y, N = 2^k 54b8e8012aSKirill Okhotnikov // r(k) = a % b = (m_x * 2^e_x) % (2^k * m_y * 2^e_y) 55b8e8012aSKirill Okhotnikov // = 2^(e_y + k) * (m_x * 2^(e_x - e_y - k) % m_y) 56b8e8012aSKirill Okhotnikov // r(k) = m_r * 2^e_r = (m_x % m_y) * 2^(m_y + k) 57b8e8012aSKirill Okhotnikov // = (2^p * (m_x % m_y) * 2^(e_y + k - p)) 58b8e8012aSKirill Okhotnikov // m_r = 2^p * (m_x % m_y), e_r = m_y + k - p 59b8e8012aSKirill Okhotnikov // 60b8e8012aSKirill Okhotnikov // Algorithm description: 61b8e8012aSKirill Okhotnikov // First, let write x = m_x * 2^e_x and y = m_y * 2^e_y with m_x, m_y, e_x, e_y 62b8e8012aSKirill Okhotnikov // are integers (m_x amd m_y positive). 63b8e8012aSKirill Okhotnikov // Then the naive implementation of the fmod function with a simple 64b8e8012aSKirill Okhotnikov // for/while loop: 65b8e8012aSKirill Okhotnikov // while (e_x > e_y) { 66b8e8012aSKirill Okhotnikov // m_x *= 2; --e_x; // m_x * 2^e_x == 2 * m_x * 2^(e_x - 1) 67b8e8012aSKirill Okhotnikov // m_x %= m_y; 68b8e8012aSKirill Okhotnikov // } 69b8e8012aSKirill Okhotnikov // On the other hand, the algorithm exploits the fact that m_x, m_y are the 70b8e8012aSKirill Okhotnikov // mantissas of floating point numbers, which use less bits than the storage 71b8e8012aSKirill Okhotnikov // integers: 24 / 32 for floats and 53 / 64 for doubles, so if in each step of 72b8e8012aSKirill Okhotnikov // the iteration, we can left shift m_x as many bits as the storage integer 73b8e8012aSKirill Okhotnikov // type can hold, the exponent reduction per step will be at least 32 - 24 = 8 74b8e8012aSKirill Okhotnikov // for floats and 64 - 53 = 11 for doubles (double example below): 75b8e8012aSKirill Okhotnikov // while (e_x > e_y) { 76b8e8012aSKirill Okhotnikov // m_x <<= 11; e_x -= 11; // m_x * 2^e_x == 2^11 * m_x * 2^(e_x - 11) 77b8e8012aSKirill Okhotnikov // m_x %= m_y; 78b8e8012aSKirill Okhotnikov // } 79b8e8012aSKirill Okhotnikov // Some extra improvements are done: 80b8e8012aSKirill Okhotnikov // 1) Shift m_y maximum to the right, which can significantly improve 81b8e8012aSKirill Okhotnikov // performance for small integer numbers (y = 3 for example). 82b8e8012aSKirill Okhotnikov // The m_x shift in the loop can be 62 instead of 11 for double. 83b8e8012aSKirill Okhotnikov // 2) For some architectures with very slow division, it can be better to 84b8e8012aSKirill Okhotnikov // calculate inverse value ones, and after do multiplication in the loop. 85b8e8012aSKirill Okhotnikov // 3) "likely" special cases are treated specially to improve performance. 86b8e8012aSKirill Okhotnikov // 87b8e8012aSKirill Okhotnikov // Simple example: 88b8e8012aSKirill Okhotnikov // The examples below use byte for simplicity. 89b8e8012aSKirill Okhotnikov // 1) Shift hy maximum to right without losing bits and increase iy value 90b8e8012aSKirill Okhotnikov // m_y = 0b00101100 e_y = 20 after shift m_y = 0b00001011 e_y = 22. 91b8e8012aSKirill Okhotnikov // 2) m_x = m_x % m_y. 92b8e8012aSKirill Okhotnikov // 3) Move m_x maximum to left. Note that after (m_x = m_x % m_y) CLZ in m_x 93b8e8012aSKirill Okhotnikov // is not lower than CLZ in m_y. m_x=0b00001001 e_x = 100, m_x=0b10010000, 94b8e8012aSKirill Okhotnikov // e_x = 100-4 = 96. 95b8e8012aSKirill Okhotnikov // 4) Repeat (2) until e_x == e_y. 96b8e8012aSKirill Okhotnikov // 97b8e8012aSKirill Okhotnikov // Complexity analysis (double): 98b8e8012aSKirill Okhotnikov // Converting x,y to (m_x,e_x),(m_y, e_y): CTZ/shift/AND/OR/if. Loop count: 99b8e8012aSKirill Okhotnikov // (m_x - m_y) / (64 - "length of m_y"). 100b8e8012aSKirill Okhotnikov // max("length of m_y") = 53, 101b8e8012aSKirill Okhotnikov // max(e_x - e_y) = 2048 102b8e8012aSKirill Okhotnikov // Maximum operation is 186. For rare "unrealistic" cases. 103b8e8012aSKirill Okhotnikov // 104b8e8012aSKirill Okhotnikov // Special cases (double): 105b8e8012aSKirill Okhotnikov // Supposing that case where |y| > 1e-292 and |x/y|<2000 is very common 106b8e8012aSKirill Okhotnikov // special processing is implemented. No m_y alignment, no loop: 107b8e8012aSKirill Okhotnikov // result = (m_x * 2^(e_x - e_y)) % m_y. 108b8e8012aSKirill Okhotnikov // When x and y are both subnormal (rare case but...) the 109b8e8012aSKirill Okhotnikov // result = m_x % m_y. 110b8e8012aSKirill Okhotnikov // Simplified conversion back to double. 111b8e8012aSKirill Okhotnikov 112b8e8012aSKirill Okhotnikov // Exceptional cases handler according to cppreference.com 113b8e8012aSKirill Okhotnikov // https://en.cppreference.com/w/cpp/numeric/math/fmod 114b8e8012aSKirill Okhotnikov // and POSIX standard described in Linux man 115b8e8012aSKirill Okhotnikov // https://man7.org/linux/man-pages/man3/fmod.3p.html 116b8e8012aSKirill Okhotnikov // C standard for the function is not full, so not by default (although it can 117b8e8012aSKirill Okhotnikov // be implemented in another handler. 11827aca975SKirill Okhotnikov // Signaling NaN converted to quiet NaN with FE_INVALID exception. 11927aca975SKirill Okhotnikov // https://www.open-std.org/JTC1/SC22/WG14/www/docs/n1011.htm 1204d21e752Slntue template <typename T> struct FModDivisionSimpleHelper { 1214d21e752Slntue LIBC_INLINE constexpr static T execute(int exp_diff, int sides_zeroes_count, 1224d21e752Slntue T m_x, T m_y) { 1234d21e752Slntue while (exp_diff > sides_zeroes_count) { 1244d21e752Slntue exp_diff -= sides_zeroes_count; 1254d21e752Slntue m_x <<= sides_zeroes_count; 1264d21e752Slntue m_x %= m_y; 1274d21e752Slntue } 1284d21e752Slntue m_x <<= exp_diff; 1294d21e752Slntue m_x %= m_y; 1304d21e752Slntue return m_x; 1314d21e752Slntue } 1324d21e752Slntue }; 133b8e8012aSKirill Okhotnikov 1344d21e752Slntue template <typename T> struct FModDivisionInvMultHelper { 1354d21e752Slntue LIBC_INLINE constexpr static T execute(int exp_diff, int sides_zeroes_count, 1364d21e752Slntue T m_x, T m_y) { 1374d21e752Slntue constexpr int LENGTH = sizeof(T) * CHAR_BIT; 1384d21e752Slntue if (exp_diff > sides_zeroes_count) { 1394d21e752Slntue T inv_hy = (cpp::numeric_limits<T>::max() / m_y); 1404d21e752Slntue while (exp_diff > sides_zeroes_count) { 1414d21e752Slntue exp_diff -= sides_zeroes_count; 1424d21e752Slntue T hd = (m_x * inv_hy) >> (LENGTH - sides_zeroes_count); 1434d21e752Slntue m_x <<= sides_zeroes_count; 1444d21e752Slntue m_x -= hd * m_y; 1454d21e752Slntue while (LIBC_UNLIKELY(m_x > m_y)) 1464d21e752Slntue m_x -= m_y; 1474d21e752Slntue } 1484d21e752Slntue T hd = (m_x * inv_hy) >> (LENGTH - exp_diff); 1494d21e752Slntue m_x <<= exp_diff; 1504d21e752Slntue m_x -= hd * m_y; 1514d21e752Slntue while (LIBC_UNLIKELY(m_x > m_y)) 1524d21e752Slntue m_x -= m_y; 1534d21e752Slntue } else { 1544d21e752Slntue m_x <<= exp_diff; 1554d21e752Slntue m_x %= m_y; 1564d21e752Slntue } 1574d21e752Slntue return m_x; 1584d21e752Slntue } 1594d21e752Slntue }; 1604d21e752Slntue 1614d21e752Slntue template <typename T, typename U = typename FPBits<T>::StorageType, 1624d21e752Slntue typename DivisionHelper = FModDivisionSimpleHelper<U>> 1634d21e752Slntue class FMod { 1640ccec541SMikhail R. Gadelha static_assert(cpp::is_floating_point_v<T> && 1650ccec541SMikhail R. Gadelha is_unsigned_integral_or_big_int_v<U> && 1664d21e752Slntue (sizeof(U) * CHAR_BIT > FPBits<T>::FRACTION_LEN), 1674d21e752Slntue "FMod instantiated with invalid type."); 1684d21e752Slntue 1694d21e752Slntue private: 1704d21e752Slntue using FPB = FPBits<T>; 1714d21e752Slntue using StorageType = typename FPB::StorageType; 172b8e8012aSKirill Okhotnikov 173e35c7149STue Ly LIBC_INLINE static bool pre_check(T x, T y, T &out) { 174a9af317fSGuillaume Chatelet using FPB = fputil::FPBits<T>; 175ace383dfSGuillaume Chatelet const T quiet_nan = FPB::quiet_nan().get_val(); 176a9af317fSGuillaume Chatelet FPB sx(x), sy(y); 17729f8e076SGuillaume Chatelet if (LIBC_LIKELY(!sy.is_zero() && !sy.is_inf_or_nan() && 1784d21e752Slntue !sx.is_inf_or_nan())) 179b8e8012aSKirill Okhotnikov return false; 180b8e8012aSKirill Okhotnikov 18127aca975SKirill Okhotnikov if (sx.is_nan() || sy.is_nan()) { 1824d21e752Slntue if (sx.is_signaling_nan() || sy.is_signaling_nan()) 183e35c7149STue Ly fputil::raise_except_if_required(FE_INVALID); 184e35c7149STue Ly out = quiet_nan; 185b8e8012aSKirill Okhotnikov return true; 186b8e8012aSKirill Okhotnikov } 187b8e8012aSKirill Okhotnikov 18827aca975SKirill Okhotnikov if (sx.is_inf() || sy.is_zero()) { 189e35c7149STue Ly fputil::raise_except_if_required(FE_INVALID); 190e35c7149STue Ly fputil::set_errno_if_required(EDOM); 191e35c7149STue Ly out = quiet_nan; 19227aca975SKirill Okhotnikov return true; 19327aca975SKirill Okhotnikov } 19427aca975SKirill Okhotnikov 195b8e8012aSKirill Okhotnikov out = x; 196b8e8012aSKirill Okhotnikov return true; 197b8e8012aSKirill Okhotnikov } 198b8e8012aSKirill Okhotnikov 199a9af317fSGuillaume Chatelet LIBC_INLINE static constexpr FPB eval_internal(FPB sx, FPB sy) { 200b8e8012aSKirill Okhotnikov 20129f8e076SGuillaume Chatelet if (LIBC_LIKELY(sx.uintval() <= sy.uintval())) { 202b8e8012aSKirill Okhotnikov if (sx.uintval() < sy.uintval()) 203b8e8012aSKirill Okhotnikov return sx; // |x|<|y| return x 2044d21e752Slntue return FPB::zero(); // |x|=|y| return 0.0 205b8e8012aSKirill Okhotnikov } 206b8e8012aSKirill Okhotnikov 2077b387d27SGuillaume Chatelet int e_x = sx.get_biased_exponent(); 2087b387d27SGuillaume Chatelet int e_y = sy.get_biased_exponent(); 209b8e8012aSKirill Okhotnikov 2103546f4daSGuillaume Chatelet // Most common case where |y| is "very normal" and |x/y| < 2^EXP_LEN 2113546f4daSGuillaume Chatelet if (LIBC_LIKELY(e_y > int(FPB::FRACTION_LEN) && 2123546f4daSGuillaume Chatelet e_x - e_y <= int(FPB::EXP_LEN))) { 2133546f4daSGuillaume Chatelet StorageType m_x = sx.get_explicit_mantissa(); 2143546f4daSGuillaume Chatelet StorageType m_y = sy.get_explicit_mantissa(); 2150cdb0b74SOverMighty StorageType d = (e_x == e_y) 2160cdb0b74SOverMighty ? (m_x - m_y) 2170cdb0b74SOverMighty : static_cast<StorageType>(m_x << (e_x - e_y)) % m_y; 218b8e8012aSKirill Okhotnikov if (d == 0) 2194d21e752Slntue return FPB::zero(); 220b8e8012aSKirill Okhotnikov // iy - 1 because of "zero power" for number with power 1 221a9af317fSGuillaume Chatelet return FPB::make_value(d, e_y - 1); 222b8e8012aSKirill Okhotnikov } 2234d21e752Slntue // Both subnormal special case. 22429f8e076SGuillaume Chatelet if (LIBC_UNLIKELY(e_x == 0 && e_y == 0)) { 225a9af317fSGuillaume Chatelet FPB d; 226b8e8012aSKirill Okhotnikov d.set_mantissa(sx.uintval() % sy.uintval()); 227b8e8012aSKirill Okhotnikov return d; 228b8e8012aSKirill Okhotnikov } 229b8e8012aSKirill Okhotnikov 230b8e8012aSKirill Okhotnikov // Note that hx is not subnormal by conditions above. 2314d21e752Slntue U m_x = static_cast<U>(sx.get_explicit_mantissa()); 232b8e8012aSKirill Okhotnikov e_x--; 233b8e8012aSKirill Okhotnikov 2344d21e752Slntue U m_y = static_cast<U>(sy.get_explicit_mantissa()); 2354d21e752Slntue constexpr int DEFAULT_LEAD_ZEROS = 2364d21e752Slntue sizeof(U) * CHAR_BIT - FPB::FRACTION_LEN - 1; 2374d21e752Slntue int lead_zeros_m_y = DEFAULT_LEAD_ZEROS; 23829f8e076SGuillaume Chatelet if (LIBC_LIKELY(e_y > 0)) { 239b8e8012aSKirill Okhotnikov e_y--; 240b8e8012aSKirill Okhotnikov } else { 2414d21e752Slntue m_y = static_cast<U>(sy.get_mantissa()); 2421d894788SGuillaume Chatelet lead_zeros_m_y = cpp::countl_zero(m_y); 243b8e8012aSKirill Okhotnikov } 244b8e8012aSKirill Okhotnikov 245b8e8012aSKirill Okhotnikov // Assume hy != 0 2461d894788SGuillaume Chatelet int tail_zeros_m_y = cpp::countr_zero(m_y); 247b8e8012aSKirill Okhotnikov int sides_zeroes_count = lead_zeros_m_y + tail_zeros_m_y; 248b8e8012aSKirill Okhotnikov // n > 0 by conditions above 249b8e8012aSKirill Okhotnikov int exp_diff = e_x - e_y; 250b8e8012aSKirill Okhotnikov { 251b8e8012aSKirill Okhotnikov // Shift hy right until the end or n = 0 252b8e8012aSKirill Okhotnikov int right_shift = exp_diff < tail_zeros_m_y ? exp_diff : tail_zeros_m_y; 253b8e8012aSKirill Okhotnikov m_y >>= right_shift; 254b8e8012aSKirill Okhotnikov exp_diff -= right_shift; 255b8e8012aSKirill Okhotnikov e_y += right_shift; 256b8e8012aSKirill Okhotnikov } 257b8e8012aSKirill Okhotnikov 258b8e8012aSKirill Okhotnikov { 259b8e8012aSKirill Okhotnikov // Shift hx left until the end or n = 0 2604d21e752Slntue int left_shift = 2614d21e752Slntue exp_diff < DEFAULT_LEAD_ZEROS ? exp_diff : DEFAULT_LEAD_ZEROS; 262b8e8012aSKirill Okhotnikov m_x <<= left_shift; 263b8e8012aSKirill Okhotnikov exp_diff -= left_shift; 264b8e8012aSKirill Okhotnikov } 265b8e8012aSKirill Okhotnikov 266b8e8012aSKirill Okhotnikov m_x %= m_y; 26729f8e076SGuillaume Chatelet if (LIBC_UNLIKELY(m_x == 0)) 2684d21e752Slntue return FPB::zero(); 269b8e8012aSKirill Okhotnikov 270b8e8012aSKirill Okhotnikov if (exp_diff == 0) 2714d21e752Slntue return FPB::make_value(static_cast<StorageType>(m_x), e_y); 272b8e8012aSKirill Okhotnikov 2734d21e752Slntue // hx next can't be 0, because hx < hy, hy % 2 == 1 hx * 2^i % hy != 0 274b8e8012aSKirill Okhotnikov m_x = DivisionHelper::execute(exp_diff, sides_zeroes_count, m_x, m_y); 2754d21e752Slntue return FPB::make_value(static_cast<StorageType>(m_x), e_y); 276b8e8012aSKirill Okhotnikov } 277b8e8012aSKirill Okhotnikov 278b8e8012aSKirill Okhotnikov public: 27959c809cdSSiva Chandra Reddy LIBC_INLINE static T eval(T x, T y) { 2804d21e752Slntue if (T out; LIBC_UNLIKELY(pre_check(x, y, out))) 281b8e8012aSKirill Okhotnikov return out; 282a9af317fSGuillaume Chatelet FPB sx(x), sy(y); 28311ec512fSGuillaume Chatelet Sign sign = sx.sign(); 28411ec512fSGuillaume Chatelet sx.set_sign(Sign::POS); 28511ec512fSGuillaume Chatelet sy.set_sign(Sign::POS); 286a9af317fSGuillaume Chatelet FPB result = eval_internal(sx, sy); 287b8e8012aSKirill Okhotnikov result.set_sign(sign); 288b8e8012aSKirill Okhotnikov return result.get_val(); 289b8e8012aSKirill Okhotnikov } 290b8e8012aSKirill Okhotnikov }; 291b8e8012aSKirill Okhotnikov 292b8e8012aSKirill Okhotnikov } // namespace generic 293b8e8012aSKirill Okhotnikov } // namespace fputil 294*5ff3ff33SPetr Hosek } // namespace LIBC_NAMESPACE_DECL 295b8e8012aSKirill Okhotnikov 296270547f3SGuillaume Chatelet #endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_FMOD_H 297