xref: /freebsd-src/contrib/arm-optimized-routines/math/erff.c (revision f3087bef11543b42e0d69b708f367097a4118d24)
131914882SAlex Richardson /*
231914882SAlex Richardson  * Single-precision erf(x) function.
331914882SAlex Richardson  *
4*f3087befSAndrew Turner  * Copyright (c) 2020-2024, Arm Limited.
5072a4ba8SAndrew Turner  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
631914882SAlex Richardson  */
731914882SAlex Richardson 
831914882SAlex Richardson #include <stdint.h>
931914882SAlex Richardson #include <math.h>
1031914882SAlex Richardson #include "math_config.h"
11*f3087befSAndrew Turner #include "test_defs.h"
12*f3087befSAndrew Turner #include "test_sig.h"
1331914882SAlex Richardson 
1431914882SAlex Richardson #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
1531914882SAlex Richardson #define A __erff_data.erff_poly_A
1631914882SAlex Richardson #define B __erff_data.erff_poly_B
1731914882SAlex Richardson 
1831914882SAlex Richardson /* Top 12 bits of a float.  */
1931914882SAlex Richardson static inline uint32_t
2031914882SAlex Richardson top12 (float x)
2131914882SAlex Richardson {
2231914882SAlex Richardson   return asuint (x) >> 20;
2331914882SAlex Richardson }
2431914882SAlex Richardson 
2531914882SAlex Richardson /* Efficient implementation of erff
2631914882SAlex Richardson    using either a pure polynomial approximation or
2731914882SAlex Richardson    the exponential of a polynomial.
2831914882SAlex Richardson    Worst-case error is 1.09ulps at 0x1.c111acp-1.  */
2931914882SAlex Richardson float
3031914882SAlex Richardson erff (float x)
3131914882SAlex Richardson {
3231914882SAlex Richardson   float r, x2, u;
3331914882SAlex Richardson 
3431914882SAlex Richardson   /* Get top word.  */
3531914882SAlex Richardson   uint32_t ix = asuint (x);
3631914882SAlex Richardson   uint32_t sign = ix >> 31;
3731914882SAlex Richardson   uint32_t ia12 = top12 (x) & 0x7ff;
3831914882SAlex Richardson 
3931914882SAlex Richardson   /* Limit of both intervals is 0.875 for performance reasons but coefficients
4031914882SAlex Richardson      computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy
4131914882SAlex Richardson      from 0.94 to 1.1ulps.  */
4231914882SAlex Richardson   if (ia12 < 0x3f6)
4331914882SAlex Richardson     { /* a = |x| < 0.875.  */
4431914882SAlex Richardson 
4531914882SAlex Richardson       /* Tiny and subnormal cases.  */
4631914882SAlex Richardson       if (unlikely (ia12 < 0x318))
4731914882SAlex Richardson 	{ /* |x| < 2^(-28).  */
4831914882SAlex Richardson 	  if (unlikely (ia12 < 0x040))
4931914882SAlex Richardson 	    { /* |x| < 2^(-119).  */
5031914882SAlex Richardson 	      float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
5131914882SAlex Richardson 	      return check_uflowf (y);
5231914882SAlex Richardson 	    }
5331914882SAlex Richardson 	  return x + TwoOverSqrtPiMinusOne * x;
5431914882SAlex Richardson 	}
5531914882SAlex Richardson 
5631914882SAlex Richardson       x2 = x * x;
5731914882SAlex Richardson 
5831914882SAlex Richardson       /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2).  */
5931914882SAlex Richardson       r = A[5];
6031914882SAlex Richardson       r = fmaf (r, x2, A[4]);
6131914882SAlex Richardson       r = fmaf (r, x2, A[3]);
6231914882SAlex Richardson       r = fmaf (r, x2, A[2]);
6331914882SAlex Richardson       r = fmaf (r, x2, A[1]);
6431914882SAlex Richardson       r = fmaf (r, x2, A[0]);
6531914882SAlex Richardson       r = fmaf (r, x, x);
6631914882SAlex Richardson     }
6731914882SAlex Richardson   else if (ia12 < 0x408)
6831914882SAlex Richardson     { /* |x| < 4.0 - Use a custom Estrin scheme.  */
6931914882SAlex Richardson 
7031914882SAlex Richardson       float a = fabsf (x);
7131914882SAlex Richardson       /* Start with Estrin scheme on high order (small magnitude) coefficients.  */
7231914882SAlex Richardson       r = fmaf (B[6], a, B[5]);
7331914882SAlex Richardson       u = fmaf (B[4], a, B[3]);
7431914882SAlex Richardson       x2 = x * x;
7531914882SAlex Richardson       r = fmaf (r, x2, u);
7631914882SAlex Richardson       /* Then switch to pure Horner scheme.  */
7731914882SAlex Richardson       r = fmaf (r, a, B[2]);
7831914882SAlex Richardson       r = fmaf (r, a, B[1]);
7931914882SAlex Richardson       r = fmaf (r, a, B[0]);
8031914882SAlex Richardson       r = fmaf (r, a, a);
8131914882SAlex Richardson       /* Single precision exponential with ~0.5ulps,
8231914882SAlex Richardson 	 ensures erff has max. rel. error
8331914882SAlex Richardson 	 < 1ulp on [0.921875, 4.0],
8431914882SAlex Richardson 	 < 1.1ulps on [0.875, 4.0].  */
8531914882SAlex Richardson       r = expf (-r);
8631914882SAlex Richardson       /* Explicit copysign (calling copysignf increases latency).  */
8731914882SAlex Richardson       if (sign)
8831914882SAlex Richardson 	r = -1.0f + r;
8931914882SAlex Richardson       else
9031914882SAlex Richardson 	r = 1.0f - r;
9131914882SAlex Richardson     }
9231914882SAlex Richardson   else
9331914882SAlex Richardson     { /* |x| >= 4.0.  */
9431914882SAlex Richardson 
9531914882SAlex Richardson       /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1.  */
9631914882SAlex Richardson       if (unlikely (ia12 >= 0x7f8))
9731914882SAlex Richardson 	return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
9831914882SAlex Richardson 
9931914882SAlex Richardson       /* Explicit copysign (calling copysignf increases latency).  */
10031914882SAlex Richardson       if (sign)
10131914882SAlex Richardson 	r = -1.0f;
10231914882SAlex Richardson       else
10331914882SAlex Richardson 	r = 1.0f;
10431914882SAlex Richardson     }
10531914882SAlex Richardson   return r;
10631914882SAlex Richardson }
107*f3087befSAndrew Turner 
108*f3087befSAndrew Turner TEST_SIG (S, F, 1, erf, -6.0, 6.0)
109*f3087befSAndrew Turner TEST_ULP (erff, 0.6)
110*f3087befSAndrew Turner TEST_ULP_NONNEAREST (erff, 0.9)
111*f3087befSAndrew Turner TEST_INTERVAL (erff, 0, 0xffff0000, 10000)
112*f3087befSAndrew Turner TEST_SYM_INTERVAL (erff, 0x1p-127, 0x1p-26, 40000)
113*f3087befSAndrew Turner TEST_SYM_INTERVAL (erff, 0x1p-26, 0x1p3, 40000)
114*f3087befSAndrew Turner TEST_INTERVAL (erff, 0, inf, 40000)
115