131914882SAlex Richardson /* 231914882SAlex Richardson * Single-precision erf(x) function. 331914882SAlex Richardson * 431914882SAlex Richardson * Copyright (c) 2020, Arm Limited. 5*072a4ba8SAndrew Turner * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 631914882SAlex Richardson */ 731914882SAlex Richardson 831914882SAlex Richardson #include <stdint.h> 931914882SAlex Richardson #include <math.h> 1031914882SAlex Richardson #include "math_config.h" 1131914882SAlex Richardson 1231914882SAlex Richardson #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f 1331914882SAlex Richardson #define A __erff_data.erff_poly_A 1431914882SAlex Richardson #define B __erff_data.erff_poly_B 1531914882SAlex Richardson 1631914882SAlex Richardson /* Top 12 bits of a float. */ 1731914882SAlex Richardson static inline uint32_t 1831914882SAlex Richardson top12 (float x) 1931914882SAlex Richardson { 2031914882SAlex Richardson return asuint (x) >> 20; 2131914882SAlex Richardson } 2231914882SAlex Richardson 2331914882SAlex Richardson /* Efficient implementation of erff 2431914882SAlex Richardson using either a pure polynomial approximation or 2531914882SAlex Richardson the exponential of a polynomial. 2631914882SAlex Richardson Worst-case error is 1.09ulps at 0x1.c111acp-1. */ 2731914882SAlex Richardson float 2831914882SAlex Richardson erff (float x) 2931914882SAlex Richardson { 3031914882SAlex Richardson float r, x2, u; 3131914882SAlex Richardson 3231914882SAlex Richardson /* Get top word. */ 3331914882SAlex Richardson uint32_t ix = asuint (x); 3431914882SAlex Richardson uint32_t sign = ix >> 31; 3531914882SAlex Richardson uint32_t ia12 = top12 (x) & 0x7ff; 3631914882SAlex Richardson 3731914882SAlex Richardson /* Limit of both intervals is 0.875 for performance reasons but coefficients 3831914882SAlex Richardson computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy 3931914882SAlex Richardson from 0.94 to 1.1ulps. */ 4031914882SAlex Richardson if (ia12 < 0x3f6) 4131914882SAlex Richardson { /* a = |x| < 0.875. */ 4231914882SAlex Richardson 4331914882SAlex Richardson /* Tiny and subnormal cases. */ 4431914882SAlex Richardson if (unlikely (ia12 < 0x318)) 4531914882SAlex Richardson { /* |x| < 2^(-28). */ 4631914882SAlex Richardson if (unlikely (ia12 < 0x040)) 4731914882SAlex Richardson { /* |x| < 2^(-119). */ 4831914882SAlex Richardson float y = fmaf (TwoOverSqrtPiMinusOne, x, x); 4931914882SAlex Richardson return check_uflowf (y); 5031914882SAlex Richardson } 5131914882SAlex Richardson return x + TwoOverSqrtPiMinusOne * x; 5231914882SAlex Richardson } 5331914882SAlex Richardson 5431914882SAlex Richardson x2 = x * x; 5531914882SAlex Richardson 5631914882SAlex Richardson /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2). */ 5731914882SAlex Richardson r = A[5]; 5831914882SAlex Richardson r = fmaf (r, x2, A[4]); 5931914882SAlex Richardson r = fmaf (r, x2, A[3]); 6031914882SAlex Richardson r = fmaf (r, x2, A[2]); 6131914882SAlex Richardson r = fmaf (r, x2, A[1]); 6231914882SAlex Richardson r = fmaf (r, x2, A[0]); 6331914882SAlex Richardson r = fmaf (r, x, x); 6431914882SAlex Richardson } 6531914882SAlex Richardson else if (ia12 < 0x408) 6631914882SAlex Richardson { /* |x| < 4.0 - Use a custom Estrin scheme. */ 6731914882SAlex Richardson 6831914882SAlex Richardson float a = fabsf (x); 6931914882SAlex Richardson /* Start with Estrin scheme on high order (small magnitude) coefficients. */ 7031914882SAlex Richardson r = fmaf (B[6], a, B[5]); 7131914882SAlex Richardson u = fmaf (B[4], a, B[3]); 7231914882SAlex Richardson x2 = x * x; 7331914882SAlex Richardson r = fmaf (r, x2, u); 7431914882SAlex Richardson /* Then switch to pure Horner scheme. */ 7531914882SAlex Richardson r = fmaf (r, a, B[2]); 7631914882SAlex Richardson r = fmaf (r, a, B[1]); 7731914882SAlex Richardson r = fmaf (r, a, B[0]); 7831914882SAlex Richardson r = fmaf (r, a, a); 7931914882SAlex Richardson /* Single precision exponential with ~0.5ulps, 8031914882SAlex Richardson ensures erff has max. rel. error 8131914882SAlex Richardson < 1ulp on [0.921875, 4.0], 8231914882SAlex Richardson < 1.1ulps on [0.875, 4.0]. */ 8331914882SAlex Richardson r = expf (-r); 8431914882SAlex Richardson /* Explicit copysign (calling copysignf increases latency). */ 8531914882SAlex Richardson if (sign) 8631914882SAlex Richardson r = -1.0f + r; 8731914882SAlex Richardson else 8831914882SAlex Richardson r = 1.0f - r; 8931914882SAlex Richardson } 9031914882SAlex Richardson else 9131914882SAlex Richardson { /* |x| >= 4.0. */ 9231914882SAlex Richardson 9331914882SAlex Richardson /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */ 9431914882SAlex Richardson if (unlikely (ia12 >= 0x7f8)) 9531914882SAlex Richardson return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x; 9631914882SAlex Richardson 9731914882SAlex Richardson /* Explicit copysign (calling copysignf increases latency). */ 9831914882SAlex Richardson if (sign) 9931914882SAlex Richardson r = -1.0f; 10031914882SAlex Richardson else 10131914882SAlex Richardson r = 1.0f; 10231914882SAlex Richardson } 10331914882SAlex Richardson return r; 10431914882SAlex Richardson } 105