FPUtil/generic/FMod.h

b8e8012aSKirill Okhotnikov//===-- Common header for fmod implementations ------------------*- C++ -*-===//
b8e8012aSKirill Okhotnikov//
b8e8012aSKirill Okhotnikov// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
b8e8012aSKirill Okhotnikov// See https://llvm.org/LICENSE.txt for license information.
b8e8012aSKirill Okhotnikov// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
b8e8012aSKirill Okhotnikov//
b8e8012aSKirill Okhotnikov//===----------------------------------------------------------------------===//
b8e8012aSKirill Okhotnikov
270547f3SGuillaume Chatelet#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_FMOD_H
270547f3SGuillaume Chatelet#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_FMOD_H
b8e8012aSKirill Okhotnikov
1d894788SGuillaume Chatelet#include "src/__support/CPP/bit.h"
91eb0b65SGuillaume Chatelet#include "src/__support/CPP/limits.h"
f7226150SGuillaume Chatelet#include "src/__support/CPP/type_traits.h"
b8e8012aSKirill Okhotnikov#include "src/__support/FPUtil/FEnvImpl.h"
b8e8012aSKirill Okhotnikov#include "src/__support/FPUtil/FPBits.h"
*5ff3ff33SPetr Hosek#include "src/__support/macros/config.h"
737e1cd1SGuillaume Chatelet#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
b8e8012aSKirill Okhotnikov
*5ff3ff33SPetr Hoseknamespace LIBC_NAMESPACE_DECL {
b8e8012aSKirill Okhotnikovnamespace fputil {
b8e8012aSKirill Okhotnikovnamespace generic {
b8e8012aSKirill Okhotnikov
b8e8012aSKirill Okhotnikov//  Objective:
b8e8012aSKirill Okhotnikov//    The  algorithm uses  integer arithmetic  (max uint64_t)  for general case.
b8e8012aSKirill Okhotnikov//    Some common  cases, like  abs(x) < abs(y)  or  abs(x) < 1000 *  abs(y) are
b8e8012aSKirill Okhotnikov//    treated specially to increase  performance.  The part of checking  special
b8e8012aSKirill Okhotnikov//    cases, numbers NaN, INF etc. treated separately.
b8e8012aSKirill Okhotnikov//
b8e8012aSKirill Okhotnikov//  Objective:
b8e8012aSKirill Okhotnikov//    1) FMod definition (https://cplusplus.com/reference/cmath/fmod/):
b8e8012aSKirill Okhotnikov//       fmod = numer - tquot * denom, where tquot is the truncated
b8e8012aSKirill Okhotnikov//       (i.e., rounded towards zero) result of: numer/denom.
b8e8012aSKirill Okhotnikov//    2) FMod with negative x and/or y can be trivially converted to fmod for
b8e8012aSKirill Okhotnikov//       positive x and y. Therefore the algorithm below works only with
b8e8012aSKirill Okhotnikov//       positive numbers.
b8e8012aSKirill Okhotnikov//    3) All positive floating point numbers can be represented as m * 2^e,
b8e8012aSKirill Okhotnikov//       where "m" is positive integer and "e" is signed.
b8e8012aSKirill Okhotnikov//    4) FMod function can be calculated in integer numbers (x > y):
b8e8012aSKirill Okhotnikov//         fmod = m_x * 2^e_x - tquot * m_y * 2^e_y
//              = 2^e_y * (m_x * 2^(e_x - e_y) - tquot * m_y).
b8e8012aSKirill Okhotnikov//       All variables in parentheses are unsigned integers.
b8e8012aSKirill Okhotnikov//
b8e8012aSKirill Okhotnikov//  Mathematical background:
b8e8012aSKirill Okhotnikov//    Input x,y in the algorithm is represented (mathematically) like m_x*2^e_x
b8e8012aSKirill Okhotnikov//    and m_y*2^e_y. This is an ambiguous number representation. For example:
b8e8012aSKirill Okhotnikov//      m * 2^e = (2 * m) * 2^(e-1)
b8e8012aSKirill Okhotnikov//    The algorithm uses the facts that
b8e8012aSKirill Okhotnikov//      r = a % b = (a % (N * b)) % b,
b8e8012aSKirill Okhotnikov//      (a * c) % (b * c) = (a % b) * c
b8e8012aSKirill Okhotnikov//    where N is positive  integer number. a, b and c - positive. Let's  adopt
b8e8012aSKirill Okhotnikov//    the formula for representation above.
b8e8012aSKirill Okhotnikov//      a = m_x * 2^e_x, b = m_y * 2^e_y, N = 2^k
//      r(k) = a % (N * b) = (m_x * 2^e_x) % (2^k * m_y * 2^e_y)
//           = 2^(e_y + k) * (m_x * 2^(e_x - e_y - k) % m_y)
//      r(k) = m_r * 2^e_r = (m_x % m_y) * 2^(e_y + k)
//           = (2^p * (m_x % m_y) * 2^(e_y + k - p))
//        m_r = 2^p * (m_x % m_y), e_r = e_y + k - p
b8e8012aSKirill Okhotnikov//
b8e8012aSKirill Okhotnikov//  Algorithm description:
//  First, let us write x = m_x * 2^e_x and y = m_y * 2^e_y, where m_x, m_y,
//  e_x, e_y are integers (m_x and m_y positive).
b8e8012aSKirill Okhotnikov//  Then the naive  implementation of the fmod function with a simple
b8e8012aSKirill Okhotnikov//  for/while loop:
b8e8012aSKirill Okhotnikov//      while (e_x > e_y) {
b8e8012aSKirill Okhotnikov//        m_x *= 2; --e_x; //  m_x * 2^e_x == 2 * m_x * 2^(e_x - 1)
b8e8012aSKirill Okhotnikov//        m_x %= m_y;
b8e8012aSKirill Okhotnikov//      }
b8e8012aSKirill Okhotnikov//  On the other hand, the algorithm exploits the fact that m_x, m_y are the
b8e8012aSKirill Okhotnikov//  mantissas of floating point numbers, which use less bits than the storage
b8e8012aSKirill Okhotnikov//  integers: 24 / 32 for floats and 53 / 64 for doubles, so if in each step of
b8e8012aSKirill Okhotnikov//  the iteration, we can left shift m_x as many bits as the storage integer
b8e8012aSKirill Okhotnikov//  type can hold, the exponent reduction per step will be at least 32 - 24 = 8
b8e8012aSKirill Okhotnikov//  for floats and 64 - 53 = 11 for doubles (double example below):
b8e8012aSKirill Okhotnikov//      while (e_x > e_y) {
b8e8012aSKirill Okhotnikov//        m_x <<= 11; e_x -= 11; //  m_x * 2^e_x == 2^11 * m_x * 2^(e_x - 11)
b8e8012aSKirill Okhotnikov//        m_x %= m_y;
b8e8012aSKirill Okhotnikov//      }
b8e8012aSKirill Okhotnikov//  Some extra improvements are done:
b8e8012aSKirill Okhotnikov//    1) Shift m_y maximum to the right, which can significantly improve
b8e8012aSKirill Okhotnikov//       performance for small integer numbers (y = 3 for example).
b8e8012aSKirill Okhotnikov//       The m_x shift in the loop can be 62 instead of 11 for double.
//    2) For some architectures with very slow division, it can be better to
//       calculate the inverse value once, and then do multiplications in the
//       loop.
b8e8012aSKirill Okhotnikov//    3) "likely" special cases are treated specially to improve performance.
b8e8012aSKirill Okhotnikov//
b8e8012aSKirill Okhotnikov//  Simple example:
b8e8012aSKirill Okhotnikov//  The examples below use byte for simplicity.
//    1) Shift m_y maximum to the right without losing bits and increase e_y:
//       m_y = 0b00101100 e_y = 20 after shift m_y = 0b00001011 e_y = 22.
b8e8012aSKirill Okhotnikov//    2) m_x = m_x % m_y.
b8e8012aSKirill Okhotnikov//    3) Move m_x maximum to left. Note that after (m_x = m_x % m_y) CLZ in m_x
b8e8012aSKirill Okhotnikov//    is not lower than CLZ in m_y. m_x=0b00001001 e_x = 100, m_x=0b10010000,
b8e8012aSKirill Okhotnikov//       e_x = 100-4 = 96.
b8e8012aSKirill Okhotnikov//    4) Repeat (2) until e_x == e_y.
b8e8012aSKirill Okhotnikov//
b8e8012aSKirill Okhotnikov//  Complexity analysis (double):
//    Converting x,y to (m_x,e_x),(m_y, e_y): CTZ/shift/AND/OR/if. Loop count:
//      (e_x - e_y) / (64 - "length of m_y").
//      max("length of m_y") = 53,
//      max(e_x - e_y) = 2048
//    The maximum loop count is 2048 / (64 - 53) = 186, reached only in rare
//    "unrealistic" cases.
b8e8012aSKirill Okhotnikov//
b8e8012aSKirill Okhotnikov//  Special cases (double):
b8e8012aSKirill Okhotnikov//    Supposing  that  case  where |y| > 1e-292 and |x/y|<2000  is  very  common
b8e8012aSKirill Okhotnikov//    special processing is implemented. No m_y alignment, no loop:
b8e8012aSKirill Okhotnikov//      result = (m_x * 2^(e_x - e_y)) % m_y.
b8e8012aSKirill Okhotnikov//    When x and y are both subnormal (rare case but...) the
b8e8012aSKirill Okhotnikov//      result = m_x % m_y.
b8e8012aSKirill Okhotnikov//    Simplified conversion back to double.
b8e8012aSKirill Okhotnikov
b8e8012aSKirill Okhotnikov// Exceptional cases handler according to cppreference.com
b8e8012aSKirill Okhotnikov//    https://en.cppreference.com/w/cpp/numeric/math/fmod
b8e8012aSKirill Okhotnikov// and POSIX standard described in Linux man
b8e8012aSKirill Okhotnikov//   https://man7.org/linux/man-pages/man3/fmod.3p.html
// The C standard does not fully specify these cases, so it is not followed by
// default (although it could be implemented in another handler).
27aca975SKirill Okhotnikov// Signaling NaN converted to quiet NaN with FE_INVALID exception.
27aca975SKirill Okhotnikov//    https://www.open-std.org/JTC1/SC22/WG14/www/docs/n1011.htm
4d21e752Slntuetemplate <typename T> struct FModDivisionSimpleHelper {
4d21e752Slntue  LIBC_INLINE constexpr static T execute(int exp_diff, int sides_zeroes_count,
4d21e752Slntue                                         T m_x, T m_y) {
4d21e752Slntue    while (exp_diff > sides_zeroes_count) {
4d21e752Slntue      exp_diff -= sides_zeroes_count;
4d21e752Slntue      m_x <<= sides_zeroes_count;
4d21e752Slntue      m_x %= m_y;
4d21e752Slntue    }
4d21e752Slntue    m_x <<= exp_diff;
4d21e752Slntue    m_x %= m_y;
4d21e752Slntue    return m_x;
4d21e752Slntue  }
4d21e752Slntue};
b8e8012aSKirill Okhotnikov
4d21e752Slntuetemplate <typename T> struct FModDivisionInvMultHelper {
4d21e752Slntue  LIBC_INLINE constexpr static T execute(int exp_diff, int sides_zeroes_count,
4d21e752Slntue                                         T m_x, T m_y) {
4d21e752Slntue    constexpr int LENGTH = sizeof(T) * CHAR_BIT;
4d21e752Slntue    if (exp_diff > sides_zeroes_count) {
4d21e752Slntue      T inv_hy = (cpp::numeric_limits<T>::max() / m_y);
4d21e752Slntue      while (exp_diff > sides_zeroes_count) {
4d21e752Slntue        exp_diff -= sides_zeroes_count;
4d21e752Slntue        T hd = (m_x * inv_hy) >> (LENGTH - sides_zeroes_count);
4d21e752Slntue        m_x <<= sides_zeroes_count;
4d21e752Slntue        m_x -= hd * m_y;
4d21e752Slntue        while (LIBC_UNLIKELY(m_x > m_y))
4d21e752Slntue          m_x -= m_y;
4d21e752Slntue      }
4d21e752Slntue      T hd = (m_x * inv_hy) >> (LENGTH - exp_diff);
4d21e752Slntue      m_x <<= exp_diff;
4d21e752Slntue      m_x -= hd * m_y;
4d21e752Slntue      while (LIBC_UNLIKELY(m_x > m_y))
4d21e752Slntue        m_x -= m_y;
4d21e752Slntue    } else {
4d21e752Slntue      m_x <<= exp_diff;
4d21e752Slntue      m_x %= m_y;
4d21e752Slntue    }
4d21e752Slntue    return m_x;
4d21e752Slntue  }
4d21e752Slntue};
4d21e752Slntue
4d21e752Slntuetemplate <typename T, typename U = typename FPBits<T>::StorageType,
4d21e752Slntue          typename DivisionHelper = FModDivisionSimpleHelper<U>>
4d21e752Slntueclass FMod {
0ccec541SMikhail R. Gadelha  static_assert(cpp::is_floating_point_v<T> &&
0ccec541SMikhail R. Gadelha                    is_unsigned_integral_or_big_int_v<U> &&
4d21e752Slntue                    (sizeof(U) * CHAR_BIT > FPBits<T>::FRACTION_LEN),
4d21e752Slntue                "FMod instantiated with invalid type.");
4d21e752Slntue
4d21e752Slntueprivate:
4d21e752Slntue  using FPB = FPBits<T>;
4d21e752Slntue  using StorageType = typename FPB::StorageType;
b8e8012aSKirill Okhotnikov
e35c7149STue Ly  LIBC_INLINE static bool pre_check(T x, T y, T &out) {
a9af317fSGuillaume Chatelet    using FPB = fputil::FPBits<T>;
ace383dfSGuillaume Chatelet    const T quiet_nan = FPB::quiet_nan().get_val();
a9af317fSGuillaume Chatelet    FPB sx(x), sy(y);
29f8e076SGuillaume Chatelet    if (LIBC_LIKELY(!sy.is_zero() && !sy.is_inf_or_nan() &&
4d21e752Slntue                    !sx.is_inf_or_nan()))
b8e8012aSKirill Okhotnikov      return false;
b8e8012aSKirill Okhotnikov
27aca975SKirill Okhotnikov    if (sx.is_nan() || sy.is_nan()) {
4d21e752Slntue      if (sx.is_signaling_nan() || sy.is_signaling_nan())
e35c7149STue Ly        fputil::raise_except_if_required(FE_INVALID);
e35c7149STue Ly      out = quiet_nan;
b8e8012aSKirill Okhotnikov      return true;
b8e8012aSKirill Okhotnikov    }
b8e8012aSKirill Okhotnikov
27aca975SKirill Okhotnikov    if (sx.is_inf() || sy.is_zero()) {
e35c7149STue Ly      fputil::raise_except_if_required(FE_INVALID);
e35c7149STue Ly      fputil::set_errno_if_required(EDOM);
e35c7149STue Ly      out = quiet_nan;
27aca975SKirill Okhotnikov      return true;
27aca975SKirill Okhotnikov    }
27aca975SKirill Okhotnikov
b8e8012aSKirill Okhotnikov    out = x;
b8e8012aSKirill Okhotnikov    return true;
b8e8012aSKirill Okhotnikov  }
b8e8012aSKirill Okhotnikov
a9af317fSGuillaume Chatelet  LIBC_INLINE static constexpr FPB eval_internal(FPB sx, FPB sy) {
b8e8012aSKirill Okhotnikov
29f8e076SGuillaume Chatelet    if (LIBC_LIKELY(sx.uintval() <= sy.uintval())) {
b8e8012aSKirill Okhotnikov      if (sx.uintval() < sy.uintval())
b8e8012aSKirill Okhotnikov        return sx;             // |x|<|y| return x
4d21e752Slntue      return FPB::zero();      // |x|=|y| return 0.0
b8e8012aSKirill Okhotnikov    }
b8e8012aSKirill Okhotnikov
7b387d27SGuillaume Chatelet    int e_x = sx.get_biased_exponent();
7b387d27SGuillaume Chatelet    int e_y = sy.get_biased_exponent();
b8e8012aSKirill Okhotnikov
3546f4daSGuillaume Chatelet    // Most common case where |y| is "very normal" and |x/y| < 2^EXP_LEN
3546f4daSGuillaume Chatelet    if (LIBC_LIKELY(e_y > int(FPB::FRACTION_LEN) &&
3546f4daSGuillaume Chatelet                    e_x - e_y <= int(FPB::EXP_LEN))) {
3546f4daSGuillaume Chatelet      StorageType m_x = sx.get_explicit_mantissa();
3546f4daSGuillaume Chatelet      StorageType m_y = sy.get_explicit_mantissa();
0cdb0b74SOverMighty      StorageType d = (e_x == e_y)
0cdb0b74SOverMighty                          ? (m_x - m_y)
0cdb0b74SOverMighty                          : static_cast<StorageType>(m_x << (e_x - e_y)) % m_y;
b8e8012aSKirill Okhotnikov      if (d == 0)
4d21e752Slntue        return FPB::zero();
b8e8012aSKirill Okhotnikov      // iy - 1 because of "zero power" for number with power 1
a9af317fSGuillaume Chatelet      return FPB::make_value(d, e_y - 1);
b8e8012aSKirill Okhotnikov    }
4d21e752Slntue    // Both subnormal special case.
29f8e076SGuillaume Chatelet    if (LIBC_UNLIKELY(e_x == 0 && e_y == 0)) {
a9af317fSGuillaume Chatelet      FPB d;
b8e8012aSKirill Okhotnikov      d.set_mantissa(sx.uintval() % sy.uintval());
b8e8012aSKirill Okhotnikov      return d;
b8e8012aSKirill Okhotnikov    }
b8e8012aSKirill Okhotnikov
b8e8012aSKirill Okhotnikov    // Note that hx is not subnormal by conditions above.
4d21e752Slntue    U m_x = static_cast<U>(sx.get_explicit_mantissa());
b8e8012aSKirill Okhotnikov    e_x--;
b8e8012aSKirill Okhotnikov
4d21e752Slntue    U m_y = static_cast<U>(sy.get_explicit_mantissa());
4d21e752Slntue    constexpr int DEFAULT_LEAD_ZEROS =
4d21e752Slntue        sizeof(U) * CHAR_BIT - FPB::FRACTION_LEN - 1;
4d21e752Slntue    int lead_zeros_m_y = DEFAULT_LEAD_ZEROS;
29f8e076SGuillaume Chatelet    if (LIBC_LIKELY(e_y > 0)) {
b8e8012aSKirill Okhotnikov      e_y--;
b8e8012aSKirill Okhotnikov    } else {
4d21e752Slntue      m_y = static_cast<U>(sy.get_mantissa());
1d894788SGuillaume Chatelet      lead_zeros_m_y = cpp::countl_zero(m_y);
b8e8012aSKirill Okhotnikov    }
b8e8012aSKirill Okhotnikov
b8e8012aSKirill Okhotnikov    // Assume hy != 0
1d894788SGuillaume Chatelet    int tail_zeros_m_y = cpp::countr_zero(m_y);
b8e8012aSKirill Okhotnikov    int sides_zeroes_count = lead_zeros_m_y + tail_zeros_m_y;
b8e8012aSKirill Okhotnikov    // n > 0 by conditions above
b8e8012aSKirill Okhotnikov    int exp_diff = e_x - e_y;
b8e8012aSKirill Okhotnikov    {
b8e8012aSKirill Okhotnikov      // Shift hy right until the end or n = 0
b8e8012aSKirill Okhotnikov      int right_shift = exp_diff < tail_zeros_m_y ? exp_diff : tail_zeros_m_y;
b8e8012aSKirill Okhotnikov      m_y >>= right_shift;
b8e8012aSKirill Okhotnikov      exp_diff -= right_shift;
b8e8012aSKirill Okhotnikov      e_y += right_shift;
b8e8012aSKirill Okhotnikov    }
b8e8012aSKirill Okhotnikov
b8e8012aSKirill Okhotnikov    {
b8e8012aSKirill Okhotnikov      // Shift hx left until the end or n = 0
4d21e752Slntue      int left_shift =
4d21e752Slntue          exp_diff < DEFAULT_LEAD_ZEROS ? exp_diff : DEFAULT_LEAD_ZEROS;
b8e8012aSKirill Okhotnikov      m_x <<= left_shift;
b8e8012aSKirill Okhotnikov      exp_diff -= left_shift;
b8e8012aSKirill Okhotnikov    }
b8e8012aSKirill Okhotnikov
b8e8012aSKirill Okhotnikov    m_x %= m_y;
29f8e076SGuillaume Chatelet    if (LIBC_UNLIKELY(m_x == 0))
4d21e752Slntue      return FPB::zero();
b8e8012aSKirill Okhotnikov
b8e8012aSKirill Okhotnikov    if (exp_diff == 0)
4d21e752Slntue      return FPB::make_value(static_cast<StorageType>(m_x), e_y);
b8e8012aSKirill Okhotnikov
4d21e752Slntue    // hx next can't be 0, because hx < hy, hy % 2 == 1 hx * 2^i % hy != 0
b8e8012aSKirill Okhotnikov    m_x = DivisionHelper::execute(exp_diff, sides_zeroes_count, m_x, m_y);
4d21e752Slntue    return FPB::make_value(static_cast<StorageType>(m_x), e_y);
b8e8012aSKirill Okhotnikov  }
b8e8012aSKirill Okhotnikov
b8e8012aSKirill Okhotnikovpublic:
59c809cdSSiva Chandra Reddy  LIBC_INLINE static T eval(T x, T y) {
4d21e752Slntue    if (T out; LIBC_UNLIKELY(pre_check(x, y, out)))
b8e8012aSKirill Okhotnikov      return out;
a9af317fSGuillaume Chatelet    FPB sx(x), sy(y);
11ec512fSGuillaume Chatelet    Sign sign = sx.sign();
11ec512fSGuillaume Chatelet    sx.set_sign(Sign::POS);
11ec512fSGuillaume Chatelet    sy.set_sign(Sign::POS);
a9af317fSGuillaume Chatelet    FPB result = eval_internal(sx, sy);
b8e8012aSKirill Okhotnikov    result.set_sign(sign);
b8e8012aSKirill Okhotnikov    return result.get_val();
b8e8012aSKirill Okhotnikov  }
b8e8012aSKirill Okhotnikov};
b8e8012aSKirill Okhotnikov
b8e8012aSKirill Okhotnikov} // namespace generic
b8e8012aSKirill Okhotnikov} // namespace fputil
*5ff3ff33SPetr Hosek} // namespace LIBC_NAMESPACE_DECL
b8e8012aSKirill Okhotnikov
270547f3SGuillaume Chatelet#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_FMOD_H