1*f3087befSAndrew Turner /* 2*f3087befSAndrew Turner * Single-precision scalar tan(x) function. 3*f3087befSAndrew Turner * 4*f3087befSAndrew Turner * Copyright (c) 2021-2024, Arm Limited. 5*f3087befSAndrew Turner * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6*f3087befSAndrew Turner */ 7*f3087befSAndrew Turner #include "math_config.h" 8*f3087befSAndrew Turner #include "test_sig.h" 9*f3087befSAndrew Turner #include "test_defs.h" 10*f3087befSAndrew Turner #include "poly_scalar_f32.h" 11*f3087befSAndrew Turner 12*f3087befSAndrew Turner /* Useful constants. */ 13*f3087befSAndrew Turner #define NegPio2_1 (-0x1.921fb6p+0f) 14*f3087befSAndrew Turner #define NegPio2_2 (0x1.777a5cp-25f) 15*f3087befSAndrew Turner #define NegPio2_3 (0x1.ee59dap-50f) 16*f3087befSAndrew Turner /* Reduced from 0x1p20 to 0x1p17 to ensure 3.5ulps. */ 17*f3087befSAndrew Turner #define RangeVal (0x1p17f) 18*f3087befSAndrew Turner #define InvPio2 ((0x1.45f306p-1f)) 19*f3087befSAndrew Turner #define Shift (0x1.8p+23f) 20*f3087befSAndrew Turner #define AbsMask (0x7fffffff) 21*f3087befSAndrew Turner #define Pio4 (0x1.921fb6p-1) 22*f3087befSAndrew Turner /* 2PI * 2^-64. */ 23*f3087befSAndrew Turner #define Pio2p63 (0x1.921FB54442D18p-62) 24*f3087befSAndrew Turner 25*f3087befSAndrew Turner static inline float 26*f3087befSAndrew Turner eval_P (float z) 27*f3087befSAndrew Turner { 28*f3087befSAndrew Turner return pw_horner_5_f32 (z, z * z, __tanf_poly_data.poly_tan); 29*f3087befSAndrew Turner } 30*f3087befSAndrew Turner 31*f3087befSAndrew Turner static inline float 32*f3087befSAndrew Turner eval_Q (float z) 33*f3087befSAndrew Turner { 34*f3087befSAndrew Turner return pairwise_poly_3_f32 (z, z * z, __tanf_poly_data.poly_cotan); 35*f3087befSAndrew Turner } 36*f3087befSAndrew Turner 37*f3087befSAndrew Turner /* Reduction of the input argument x using Cody-Waite approach, such that x = r 38*f3087befSAndrew Turner + n * pi/2 with r lives in [-pi/4, pi/4] and n is a signed integer. */ 39*f3087befSAndrew Turner static inline float 40*f3087befSAndrew Turner reduce (float x, int32_t *in) 41*f3087befSAndrew Turner { 42*f3087befSAndrew Turner /* n = rint(x/(pi/2)). */ 43*f3087befSAndrew Turner float r = x; 44*f3087befSAndrew Turner float q = fmaf (InvPio2, r, Shift); 45*f3087befSAndrew Turner float n = q - Shift; 46*f3087befSAndrew Turner /* There is no rounding here, n is representable by a signed integer. */ 47*f3087befSAndrew Turner *in = (int32_t) n; 48*f3087befSAndrew Turner /* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4). */ 49*f3087befSAndrew Turner r = fmaf (NegPio2_1, n, r); 50*f3087befSAndrew Turner r = fmaf (NegPio2_2, n, r); 51*f3087befSAndrew Turner r = fmaf (NegPio2_3, n, r); 52*f3087befSAndrew Turner return r; 53*f3087befSAndrew Turner } 54*f3087befSAndrew Turner 55*f3087befSAndrew Turner /* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic. 56*f3087befSAndrew Turner XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored). 57*f3087befSAndrew Turner Return the modulo between -PI/4 and PI/4 and store the quadrant in NP. 58*f3087befSAndrew Turner Reduction uses a table of 4/PI with 192 bits of precision. A 32x96->128 bit 59*f3087befSAndrew Turner multiply computes the exact 2.62-bit fixed-point modulo. Since the result 60*f3087befSAndrew Turner can have at most 29 leading zeros after the binary point, the double 61*f3087befSAndrew Turner precision result is accurate to 33 bits. */ 62*f3087befSAndrew Turner static inline double 63*f3087befSAndrew Turner reduce_large (uint32_t xi, int *np) 64*f3087befSAndrew Turner { 65*f3087befSAndrew Turner const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15]; 66*f3087befSAndrew Turner int shift = (xi >> 23) & 7; 67*f3087befSAndrew Turner uint64_t n, res0, res1, res2; 68*f3087befSAndrew Turner 69*f3087befSAndrew Turner xi = (xi & 0xffffff) | 0x800000; 70*f3087befSAndrew Turner xi <<= shift; 71*f3087befSAndrew Turner 72*f3087befSAndrew Turner res0 = xi * arr[0]; 73*f3087befSAndrew Turner res1 = (uint64_t) xi * arr[4]; 74*f3087befSAndrew Turner res2 = (uint64_t) xi * arr[8]; 75*f3087befSAndrew Turner res0 = (res2 >> 32) | (res0 << 32); 76*f3087befSAndrew Turner res0 += res1; 77*f3087befSAndrew Turner 78*f3087befSAndrew Turner n = (res0 + (1ULL << 61)) >> 62; 79*f3087befSAndrew Turner res0 -= n << 62; 80*f3087befSAndrew Turner double x = (int64_t) res0; 81*f3087befSAndrew Turner *np = n; 82*f3087befSAndrew Turner return x * Pio2p63; 83*f3087befSAndrew Turner } 84*f3087befSAndrew Turner 85*f3087befSAndrew Turner /* Top 12 bits of the float representation with the sign bit cleared. */ 86*f3087befSAndrew Turner static inline uint32_t 87*f3087befSAndrew Turner top12 (float x) 88*f3087befSAndrew Turner { 89*f3087befSAndrew Turner return (asuint (x) >> 20); 90*f3087befSAndrew Turner } 91*f3087befSAndrew Turner 92*f3087befSAndrew Turner /* Fast single-precision tan implementation. 93*f3087befSAndrew Turner Maximum ULP error: 3.293ulps. 94*f3087befSAndrew Turner tanf(0x1.c849eap+16) got -0x1.fe8d98p-1 want -0x1.fe8d9ep-1. */ 95*f3087befSAndrew Turner float 96*f3087befSAndrew Turner tanf (float x) 97*f3087befSAndrew Turner { 98*f3087befSAndrew Turner /* Get top words. */ 99*f3087befSAndrew Turner uint32_t ix = asuint (x); 100*f3087befSAndrew Turner uint32_t ia = ix & AbsMask; 101*f3087befSAndrew Turner uint32_t ia12 = ia >> 20; 102*f3087befSAndrew Turner 103*f3087befSAndrew Turner /* Dispatch between no reduction (small numbers), fast reduction and 104*f3087befSAndrew Turner slow large numbers reduction. The reduction step determines r float 105*f3087befSAndrew Turner (|r| < pi/4) and n signed integer such that x = r + n * pi/2. */ 106*f3087befSAndrew Turner int32_t n; 107*f3087befSAndrew Turner float r; 108*f3087befSAndrew Turner if (ia12 < top12 (Pio4)) 109*f3087befSAndrew Turner { 110*f3087befSAndrew Turner /* Optimize small values. */ 111*f3087befSAndrew Turner if (unlikely (ia12 < top12 (0x1p-12f))) 112*f3087befSAndrew Turner { 113*f3087befSAndrew Turner if (unlikely (ia12 < top12 (0x1p-126f))) 114*f3087befSAndrew Turner /* Force underflow for tiny x. */ 115*f3087befSAndrew Turner force_eval_float (x * x); 116*f3087befSAndrew Turner return x; 117*f3087befSAndrew Turner } 118*f3087befSAndrew Turner 119*f3087befSAndrew Turner /* tan (x) ~= x + x^3 * P(x^2). */ 120*f3087befSAndrew Turner float x2 = x * x; 121*f3087befSAndrew Turner float y = eval_P (x2); 122*f3087befSAndrew Turner return fmaf (x2, x * y, x); 123*f3087befSAndrew Turner } 124*f3087befSAndrew Turner /* Similar to other trigonometric routines, fast inaccurate reduction is 125*f3087befSAndrew Turner performed for values of x from pi/4 up to RangeVal. In order to keep 126*f3087befSAndrew Turner errors below 3.5ulps, we set the value of RangeVal to 2^17. This might 127*f3087befSAndrew Turner differ for other trigonometric routines. Above this value more advanced 128*f3087befSAndrew Turner but slower reduction techniques need to be implemented to reach a similar 129*f3087befSAndrew Turner accuracy. */ 130*f3087befSAndrew Turner else if (ia12 < top12 (RangeVal)) 131*f3087befSAndrew Turner { 132*f3087befSAndrew Turner /* Fast inaccurate reduction. */ 133*f3087befSAndrew Turner r = reduce (x, &n); 134*f3087befSAndrew Turner } 135*f3087befSAndrew Turner else if (ia12 < 0x7f8) 136*f3087befSAndrew Turner { 137*f3087befSAndrew Turner /* Slow accurate reduction. */ 138*f3087befSAndrew Turner uint32_t sign = ix & ~AbsMask; 139*f3087befSAndrew Turner double dar = reduce_large (ia, &n); 140*f3087befSAndrew Turner float ar = (float) dar; 141*f3087befSAndrew Turner r = asfloat (asuint (ar) ^ sign); 142*f3087befSAndrew Turner } 143*f3087befSAndrew Turner else 144*f3087befSAndrew Turner { 145*f3087befSAndrew Turner /* tan(Inf or NaN) is NaN. */ 146*f3087befSAndrew Turner return __math_invalidf (x); 147*f3087befSAndrew Turner } 148*f3087befSAndrew Turner 149*f3087befSAndrew Turner /* If x lives in an interval where |tan(x)| 150*f3087befSAndrew Turner - is finite then use an approximation of tangent in the form 151*f3087befSAndrew Turner tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). 152*f3087befSAndrew Turner - grows to infinity then use an approximation of cotangent in the form 153*f3087befSAndrew Turner cotan(z) ~ 1/z + z * Q(z^2), where the reciprocal can be computed early. 154*f3087befSAndrew Turner Using symmetries of tangent and the identity tan(r) = cotan(pi/2 - r), 155*f3087befSAndrew Turner we only need to change the sign of r to obtain tan(x) from cotan(r). 156*f3087befSAndrew Turner This 2-interval approach requires 2 different sets of coefficients P and 157*f3087befSAndrew Turner Q, where Q is a lower order polynomial than P. */ 158*f3087befSAndrew Turner 159*f3087befSAndrew Turner /* Determine if x lives in an interval where |tan(x)| grows to infinity. */ 160*f3087befSAndrew Turner uint32_t alt = (uint32_t) n & 1; 161*f3087befSAndrew Turner 162*f3087befSAndrew Turner /* Perform additional reduction if required. */ 163*f3087befSAndrew Turner float z = alt ? -r : r; 164*f3087befSAndrew Turner 165*f3087befSAndrew Turner /* Prepare backward transformation. */ 166*f3087befSAndrew Turner float z2 = r * r; 167*f3087befSAndrew Turner float offset = alt ? 1.0f / z : z; 168*f3087befSAndrew Turner float scale = alt ? z : z * z2; 169*f3087befSAndrew Turner 170*f3087befSAndrew Turner /* Evaluate polynomial approximation of tan or cotan. */ 171*f3087befSAndrew Turner float p = alt ? eval_Q (z2) : eval_P (z2); 172*f3087befSAndrew Turner 173*f3087befSAndrew Turner /* A unified way of assembling the result on both interval types. */ 174*f3087befSAndrew Turner return fmaf (scale, p, offset); 175*f3087befSAndrew Turner } 176*f3087befSAndrew Turner 177*f3087befSAndrew Turner TEST_SIG (S, F, 1, tan, -3.1, 3.1) 178*f3087befSAndrew Turner TEST_ULP (tanf, 2.80) 179*f3087befSAndrew Turner TEST_INTERVAL (tanf, 0, 0xffff0000, 10000) 180*f3087befSAndrew Turner TEST_SYM_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000) 181*f3087befSAndrew Turner TEST_SYM_INTERVAL (tanf, 0x1p-14, 0.7, 50000) 182*f3087befSAndrew Turner TEST_SYM_INTERVAL (tanf, 0.7, 1.5, 50000) 183*f3087befSAndrew Turner TEST_SYM_INTERVAL (tanf, 1.5, 0x1p17, 50000) 184*f3087befSAndrew Turner TEST_SYM_INTERVAL (tanf, 0x1p17, 0x1p54, 50000) 185*f3087befSAndrew Turner TEST_SYM_INTERVAL (tanf, 0x1p54, inf, 50000) 186