131914882SAlex Richardson /* 231914882SAlex Richardson * Single-precision erf(x) function. 331914882SAlex Richardson * 4*f3087befSAndrew Turner * Copyright (c) 2020-2024, Arm Limited. 5072a4ba8SAndrew Turner * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 631914882SAlex Richardson */ 731914882SAlex Richardson 831914882SAlex Richardson #include <stdint.h> 931914882SAlex Richardson #include <math.h> 1031914882SAlex Richardson #include "math_config.h" 11*f3087befSAndrew Turner #include "test_defs.h" 12*f3087befSAndrew Turner #include "test_sig.h" 1331914882SAlex Richardson 1431914882SAlex Richardson #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f 1531914882SAlex Richardson #define A __erff_data.erff_poly_A 1631914882SAlex Richardson #define B __erff_data.erff_poly_B 1731914882SAlex Richardson 1831914882SAlex Richardson /* Top 12 bits of a float. */ 1931914882SAlex Richardson static inline uint32_t 2031914882SAlex Richardson top12 (float x) 2131914882SAlex Richardson { 2231914882SAlex Richardson return asuint (x) >> 20; 2331914882SAlex Richardson } 2431914882SAlex Richardson 2531914882SAlex Richardson /* Efficient implementation of erff 2631914882SAlex Richardson using either a pure polynomial approximation or 2731914882SAlex Richardson the exponential of a polynomial. 2831914882SAlex Richardson Worst-case error is 1.09ulps at 0x1.c111acp-1. */ 2931914882SAlex Richardson float 3031914882SAlex Richardson erff (float x) 3131914882SAlex Richardson { 3231914882SAlex Richardson float r, x2, u; 3331914882SAlex Richardson 3431914882SAlex Richardson /* Get top word. */ 3531914882SAlex Richardson uint32_t ix = asuint (x); 3631914882SAlex Richardson uint32_t sign = ix >> 31; 3731914882SAlex Richardson uint32_t ia12 = top12 (x) & 0x7ff; 3831914882SAlex Richardson 3931914882SAlex Richardson /* Limit of both intervals is 0.875 for performance reasons but coefficients 4031914882SAlex Richardson computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy 4131914882SAlex Richardson from 0.94 to 1.1ulps. */ 4231914882SAlex Richardson if (ia12 < 0x3f6) 4331914882SAlex Richardson { /* a = |x| < 0.875. */ 4431914882SAlex Richardson 4531914882SAlex Richardson /* Tiny and subnormal cases. */ 4631914882SAlex Richardson if (unlikely (ia12 < 0x318)) 4731914882SAlex Richardson { /* |x| < 2^(-28). */ 4831914882SAlex Richardson if (unlikely (ia12 < 0x040)) 4931914882SAlex Richardson { /* |x| < 2^(-119). */ 5031914882SAlex Richardson float y = fmaf (TwoOverSqrtPiMinusOne, x, x); 5131914882SAlex Richardson return check_uflowf (y); 5231914882SAlex Richardson } 5331914882SAlex Richardson return x + TwoOverSqrtPiMinusOne * x; 5431914882SAlex Richardson } 5531914882SAlex Richardson 5631914882SAlex Richardson x2 = x * x; 5731914882SAlex Richardson 5831914882SAlex Richardson /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2). */ 5931914882SAlex Richardson r = A[5]; 6031914882SAlex Richardson r = fmaf (r, x2, A[4]); 6131914882SAlex Richardson r = fmaf (r, x2, A[3]); 6231914882SAlex Richardson r = fmaf (r, x2, A[2]); 6331914882SAlex Richardson r = fmaf (r, x2, A[1]); 6431914882SAlex Richardson r = fmaf (r, x2, A[0]); 6531914882SAlex Richardson r = fmaf (r, x, x); 6631914882SAlex Richardson } 6731914882SAlex Richardson else if (ia12 < 0x408) 6831914882SAlex Richardson { /* |x| < 4.0 - Use a custom Estrin scheme. */ 6931914882SAlex Richardson 7031914882SAlex Richardson float a = fabsf (x); 7131914882SAlex Richardson /* Start with Estrin scheme on high order (small magnitude) coefficients. */ 7231914882SAlex Richardson r = fmaf (B[6], a, B[5]); 7331914882SAlex Richardson u = fmaf (B[4], a, B[3]); 7431914882SAlex Richardson x2 = x * x; 7531914882SAlex Richardson r = fmaf (r, x2, u); 7631914882SAlex Richardson /* Then switch to pure Horner scheme. */ 7731914882SAlex Richardson r = fmaf (r, a, B[2]); 7831914882SAlex Richardson r = fmaf (r, a, B[1]); 7931914882SAlex Richardson r = fmaf (r, a, B[0]); 8031914882SAlex Richardson r = fmaf (r, a, a); 8131914882SAlex Richardson /* Single precision exponential with ~0.5ulps, 8231914882SAlex Richardson ensures erff has max. rel. error 8331914882SAlex Richardson < 1ulp on [0.921875, 4.0], 8431914882SAlex Richardson < 1.1ulps on [0.875, 4.0]. */ 8531914882SAlex Richardson r = expf (-r); 8631914882SAlex Richardson /* Explicit copysign (calling copysignf increases latency). */ 8731914882SAlex Richardson if (sign) 8831914882SAlex Richardson r = -1.0f + r; 8931914882SAlex Richardson else 9031914882SAlex Richardson r = 1.0f - r; 9131914882SAlex Richardson } 9231914882SAlex Richardson else 9331914882SAlex Richardson { /* |x| >= 4.0. */ 9431914882SAlex Richardson 9531914882SAlex Richardson /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */ 9631914882SAlex Richardson if (unlikely (ia12 >= 0x7f8)) 9731914882SAlex Richardson return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x; 9831914882SAlex Richardson 9931914882SAlex Richardson /* Explicit copysign (calling copysignf increases latency). */ 10031914882SAlex Richardson if (sign) 10131914882SAlex Richardson r = -1.0f; 10231914882SAlex Richardson else 10331914882SAlex Richardson r = 1.0f; 10431914882SAlex Richardson } 10531914882SAlex Richardson return r; 10631914882SAlex Richardson } 107*f3087befSAndrew Turner 108*f3087befSAndrew Turner TEST_SIG (S, F, 1, erf, -6.0, 6.0) 109*f3087befSAndrew Turner TEST_ULP (erff, 0.6) 110*f3087befSAndrew Turner TEST_ULP_NONNEAREST (erff, 0.9) 111*f3087befSAndrew Turner TEST_INTERVAL (erff, 0, 0xffff0000, 10000) 112*f3087befSAndrew Turner TEST_SYM_INTERVAL (erff, 0x1p-127, 0x1p-26, 40000) 113*f3087befSAndrew Turner TEST_SYM_INTERVAL (erff, 0x1p-26, 0x1p3, 40000) 114*f3087befSAndrew Turner TEST_INTERVAL (erff, 0, inf, 40000) 115