arm-optimized-routines/math/erff.c

/*
 * Single-precision erf(x) function.
 *
 * Copyright (c) 2020-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */
31914882SAlex Richardson
31914882SAlex Richardson#include <stdint.h>
31914882SAlex Richardson#include <math.h>
31914882SAlex Richardson#include "math_config.h"
*f3087befSAndrew Turner#include "test_defs.h"
*f3087befSAndrew Turner#include "test_sig.h"
31914882SAlex Richardson
/* 2/sqrt(pi) - 1 rounded to single precision: the slope of erf at the origin
   minus one, so that erf(x) ~= x + TwoOverSqrtPiMinusOne * x for tiny x.  */
#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
/* Short aliases for the two polynomial coefficient tables.  __erff_data is
   not declared in this file (presumably it comes from math_config.h).  */
#define A __erff_data.erff_poly_A
#define B __erff_data.erff_poly_B
31914882SAlex Richardson
/* Return the 12 most significant bits of X's 32-bit representation as
   produced by asuint (a helper not defined in this file; presumably the raw
   bit pattern of the float).  */
static inline uint32_t
top12 (float x)
{
  uint32_t bits = asuint (x);
  return bits >> 20;
}
31914882SAlex Richardson
31914882SAlex Richardson/* Efficient implementation of erff
31914882SAlex Richardson   using either a pure polynomial approximation or
31914882SAlex Richardson   the exponential of a polynomial.
31914882SAlex Richardson   Worst-case error is 1.09ulps at 0x1.c111acp-1.  */
31914882SAlex Richardsonfloat
31914882SAlex Richardsonerff (float x)
31914882SAlex Richardson{
31914882SAlex Richardson  float r, x2, u;
31914882SAlex Richardson
31914882SAlex Richardson  /* Get top word.  */
31914882SAlex Richardson  uint32_t ix = asuint (x);
31914882SAlex Richardson  uint32_t sign = ix >> 31;
31914882SAlex Richardson  uint32_t ia12 = top12 (x) & 0x7ff;
31914882SAlex Richardson
31914882SAlex Richardson  /* Limit of both intervals is 0.875 for performance reasons but coefficients
31914882SAlex Richardson     computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy
31914882SAlex Richardson     from 0.94 to 1.1ulps.  */
31914882SAlex Richardson  if (ia12 < 0x3f6)
31914882SAlex Richardson    { /* a = |x| < 0.875.  */
31914882SAlex Richardson
31914882SAlex Richardson      /* Tiny and subnormal cases.  */
31914882SAlex Richardson      if (unlikely (ia12 < 0x318))
31914882SAlex Richardson	{ /* |x| < 2^(-28).  */
31914882SAlex Richardson	  if (unlikely (ia12 < 0x040))
31914882SAlex Richardson	    { /* |x| < 2^(-119).  */
31914882SAlex Richardson	      float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
31914882SAlex Richardson	      return check_uflowf (y);
31914882SAlex Richardson	    }
31914882SAlex Richardson	  return x + TwoOverSqrtPiMinusOne * x;
31914882SAlex Richardson	}
31914882SAlex Richardson
31914882SAlex Richardson      x2 = x * x;
31914882SAlex Richardson
31914882SAlex Richardson      /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2).  */
31914882SAlex Richardson      r = A[5];
31914882SAlex Richardson      r = fmaf (r, x2, A[4]);
31914882SAlex Richardson      r = fmaf (r, x2, A[3]);
31914882SAlex Richardson      r = fmaf (r, x2, A[2]);
31914882SAlex Richardson      r = fmaf (r, x2, A[1]);
31914882SAlex Richardson      r = fmaf (r, x2, A[0]);
31914882SAlex Richardson      r = fmaf (r, x, x);
31914882SAlex Richardson    }
31914882SAlex Richardson  else if (ia12 < 0x408)
31914882SAlex Richardson    { /* |x| < 4.0 - Use a custom Estrin scheme.  */
31914882SAlex Richardson
31914882SAlex Richardson      float a = fabsf (x);
31914882SAlex Richardson      /* Start with Estrin scheme on high order (small magnitude) coefficients.  */
31914882SAlex Richardson      r = fmaf (B[6], a, B[5]);
31914882SAlex Richardson      u = fmaf (B[4], a, B[3]);
31914882SAlex Richardson      x2 = x * x;
31914882SAlex Richardson      r = fmaf (r, x2, u);
31914882SAlex Richardson      /* Then switch to pure Horner scheme.  */
31914882SAlex Richardson      r = fmaf (r, a, B[2]);
31914882SAlex Richardson      r = fmaf (r, a, B[1]);
31914882SAlex Richardson      r = fmaf (r, a, B[0]);
31914882SAlex Richardson      r = fmaf (r, a, a);
31914882SAlex Richardson      /* Single precision exponential with ~0.5ulps,
31914882SAlex Richardson	 ensures erff has max. rel. error
31914882SAlex Richardson	 < 1ulp on [0.921875, 4.0],
31914882SAlex Richardson	 < 1.1ulps on [0.875, 4.0].  */
31914882SAlex Richardson      r = expf (-r);
31914882SAlex Richardson      /* Explicit copysign (calling copysignf increases latency).  */
31914882SAlex Richardson      if (sign)
31914882SAlex Richardson	r = -1.0f + r;
31914882SAlex Richardson      else
31914882SAlex Richardson	r = 1.0f - r;
31914882SAlex Richardson    }
31914882SAlex Richardson  else
31914882SAlex Richardson    { /* |x| >= 4.0.  */
31914882SAlex Richardson
31914882SAlex Richardson      /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1.  */
31914882SAlex Richardson      if (unlikely (ia12 >= 0x7f8))
31914882SAlex Richardson	return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
31914882SAlex Richardson
31914882SAlex Richardson      /* Explicit copysign (calling copysignf increases latency).  */
31914882SAlex Richardson      if (sign)
31914882SAlex Richardson	r = -1.0f;
31914882SAlex Richardson      else
31914882SAlex Richardson	r = 1.0f;
31914882SAlex Richardson    }
31914882SAlex Richardson  return r;
31914882SAlex Richardson}
*f3087befSAndrew Turner
*f3087befSAndrew TurnerTEST_SIG (S, F, 1, erf, -6.0, 6.0)
*f3087befSAndrew TurnerTEST_ULP (erff, 0.6)
*f3087befSAndrew TurnerTEST_ULP_NONNEAREST (erff, 0.9)
*f3087befSAndrew TurnerTEST_INTERVAL (erff, 0, 0xffff0000, 10000)
*f3087befSAndrew TurnerTEST_SYM_INTERVAL (erff, 0x1p-127, 0x1p-26, 40000)
*f3087befSAndrew TurnerTEST_SYM_INTERVAL (erff, 0x1p-26, 0x1p3, 40000)
*f3087befSAndrew TurnerTEST_INTERVAL (erff, 0, inf, 40000)