15a02ffc3SAndrew Turner /* 25a02ffc3SAndrew Turner * Implementation of the true gamma function (as opposed to lgamma) 35a02ffc3SAndrew Turner * for 128-bit long double. 45a02ffc3SAndrew Turner * 55a02ffc3SAndrew Turner * Copyright (c) 2006-2024, Arm Limited. 65a02ffc3SAndrew Turner * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 75a02ffc3SAndrew Turner */ 85a02ffc3SAndrew Turner 95a02ffc3SAndrew Turner /* 105a02ffc3SAndrew Turner * This module implements the float128 gamma function under the name 115a02ffc3SAndrew Turner * tgamma128. It's expected to be suitable for integration into system 125a02ffc3SAndrew Turner * maths libraries under the standard name tgammal, if long double is 135a02ffc3SAndrew Turner * 128-bit. Such a library will probably want to check the error 145a02ffc3SAndrew Turner * handling and optimize the initial process of extracting the 155a02ffc3SAndrew Turner * exponent, which is done here by simple and portable (but 165a02ffc3SAndrew Turner * potentially slower) methods. 175a02ffc3SAndrew Turner */ 185a02ffc3SAndrew Turner 195a02ffc3SAndrew Turner #include <float.h> 205a02ffc3SAndrew Turner #include <math.h> 215a02ffc3SAndrew Turner #include <stdbool.h> 225a02ffc3SAndrew Turner #include <stddef.h> 235a02ffc3SAndrew Turner 245a02ffc3SAndrew Turner /* Only binary128 format is supported. */ 255a02ffc3SAndrew Turner #if LDBL_MANT_DIG == 113 265a02ffc3SAndrew Turner 275a02ffc3SAndrew Turner #include "tgamma128.h" 285a02ffc3SAndrew Turner 295a02ffc3SAndrew Turner #define lenof(x) (sizeof(x)/sizeof(*(x))) 305a02ffc3SAndrew Turner 315a02ffc3SAndrew Turner /* 325a02ffc3SAndrew Turner * Helper routine to evaluate a polynomial via Horner's rule 335a02ffc3SAndrew Turner */ 345a02ffc3SAndrew Turner static long double poly(const long double *coeffs, size_t n, long double x) 355a02ffc3SAndrew Turner { 365a02ffc3SAndrew Turner long double result = coeffs[--n]; 375a02ffc3SAndrew Turner 385a02ffc3SAndrew Turner while (n > 0) 395a02ffc3SAndrew Turner result = (result * x) + coeffs[--n]; 405a02ffc3SAndrew Turner 415a02ffc3SAndrew Turner return result; 425a02ffc3SAndrew Turner } 435a02ffc3SAndrew Turner 445a02ffc3SAndrew Turner /* 455a02ffc3SAndrew Turner * Compute sin(pi*x) / pi, for use in the reflection formula that 465a02ffc3SAndrew Turner * relates gamma(-x) and gamma(x). 475a02ffc3SAndrew Turner */ 485a02ffc3SAndrew Turner static long double sin_pi_x_over_pi(long double x) 495a02ffc3SAndrew Turner { 505a02ffc3SAndrew Turner int quo; 515a02ffc3SAndrew Turner long double fracpart = remquol(x, 0.5L, &quo); 525a02ffc3SAndrew Turner 535a02ffc3SAndrew Turner long double sign = 1.0L; 545a02ffc3SAndrew Turner if (quo & 2) 555a02ffc3SAndrew Turner sign = -sign; 565a02ffc3SAndrew Turner quo &= 1; 575a02ffc3SAndrew Turner 585a02ffc3SAndrew Turner if (quo == 0 && fabsl(fracpart) < 0x1.p-58L) { 595a02ffc3SAndrew Turner /* For numbers this size, sin(pi*x) is so close to pi*x that 605a02ffc3SAndrew Turner * sin(pi*x)/pi is indistinguishable from x in float128 */ 615a02ffc3SAndrew Turner return sign * fracpart; 625a02ffc3SAndrew Turner } 635a02ffc3SAndrew Turner 645a02ffc3SAndrew Turner if (quo == 0) { 655a02ffc3SAndrew Turner return sign * sinl(pi*fracpart) / pi; 665a02ffc3SAndrew Turner } else { 675a02ffc3SAndrew Turner return sign * cosl(pi*fracpart) / pi; 685a02ffc3SAndrew Turner } 695a02ffc3SAndrew Turner } 705a02ffc3SAndrew Turner 715a02ffc3SAndrew Turner /* Return tgamma(x) on the assumption that x >= 8. */ 725a02ffc3SAndrew Turner static long double tgamma_large(long double x, 735a02ffc3SAndrew Turner bool negative, long double negadjust) 745a02ffc3SAndrew Turner { 755a02ffc3SAndrew Turner /* 765a02ffc3SAndrew Turner * In this range we compute gamma(x) as x^(x-1/2) * e^-x * K, 775a02ffc3SAndrew Turner * where K is a correction factor computed as a polynomial in 1/x. 785a02ffc3SAndrew Turner * 795a02ffc3SAndrew Turner * (Vaguely inspired by the form of the Lanczos approximation, but 805a02ffc3SAndrew Turner * I tried the Lanczos approximation itself and it suffers badly 815a02ffc3SAndrew Turner * from big cancellation leading to loss of significance.) 825a02ffc3SAndrew Turner */ 835a02ffc3SAndrew Turner long double t = 1/x; 845a02ffc3SAndrew Turner long double p = poly(coeffs_large, lenof(coeffs_large), t); 855a02ffc3SAndrew Turner 865a02ffc3SAndrew Turner /* 875a02ffc3SAndrew Turner * To avoid overflow in cases where x^(x-0.5) does overflow 885a02ffc3SAndrew Turner * but gamma(x) does not, we split x^(x-0.5) in half and 895a02ffc3SAndrew Turner * multiply back up _after_ multiplying the shrinking factor 905a02ffc3SAndrew Turner * of exp(-(x-0.5)). 915a02ffc3SAndrew Turner * 925a02ffc3SAndrew Turner * Note that computing x-0.5 and (x-0.5)/2 is exact for the 935a02ffc3SAndrew Turner * relevant range of x, so the only sources of error are pow 945a02ffc3SAndrew Turner * and exp themselves, plus the multiplications. 955a02ffc3SAndrew Turner */ 965a02ffc3SAndrew Turner long double powhalf = powl(x, (x-0.5L)/2.0L); 975a02ffc3SAndrew Turner long double expret = expl(-(x-0.5L)); 985a02ffc3SAndrew Turner 995a02ffc3SAndrew Turner if (!negative) { 1005a02ffc3SAndrew Turner return (expret * powhalf) * powhalf * p; 1015a02ffc3SAndrew Turner } else { 1025a02ffc3SAndrew Turner /* 1035a02ffc3SAndrew Turner * Apply the reflection formula as commented below, but 1045a02ffc3SAndrew Turner * carefully: negadjust has magnitude less than 1, so it can 1055a02ffc3SAndrew Turner * turn a case where gamma(+x) would overflow into a case 1065a02ffc3SAndrew Turner * where gamma(-x) doesn't underflow. Not only that, but the 1075a02ffc3SAndrew Turner * FP format has greater range in the tiny domain due to 1085a02ffc3SAndrew Turner * denormals. For both reasons, it's not good enough to 1095a02ffc3SAndrew Turner * compute the positive result and then adjust it. 1105a02ffc3SAndrew Turner */ 1115a02ffc3SAndrew Turner long double ret = 1 / ((expret * powhalf) * (x * negadjust) * p); 1125a02ffc3SAndrew Turner return ret / powhalf; 1135a02ffc3SAndrew Turner } 1145a02ffc3SAndrew Turner } 1155a02ffc3SAndrew Turner 1165a02ffc3SAndrew Turner /* Return tgamma(x) on the assumption that 0 <= x < 1/32. */ 1175a02ffc3SAndrew Turner static long double tgamma_tiny(long double x, 1185a02ffc3SAndrew Turner bool negative, long double negadjust) 1195a02ffc3SAndrew Turner { 1205a02ffc3SAndrew Turner /* 1215a02ffc3SAndrew Turner * For x near zero, we use a polynomial approximation to 1225a02ffc3SAndrew Turner * g = 1/(x*gamma(x)), and then return 1/(g*x). 1235a02ffc3SAndrew Turner */ 1245a02ffc3SAndrew Turner long double g = poly(coeffs_tiny, lenof(coeffs_tiny), x); 1255a02ffc3SAndrew Turner if (!negative) 1265a02ffc3SAndrew Turner return 1.0L / (g*x); 1275a02ffc3SAndrew Turner else 1285a02ffc3SAndrew Turner return g / negadjust; 1295a02ffc3SAndrew Turner } 1305a02ffc3SAndrew Turner 1315a02ffc3SAndrew Turner /* Return tgamma(x) on the assumption that 0 <= x < 2^-113. */ 1325a02ffc3SAndrew Turner static long double tgamma_ultratiny(long double x, bool negative, 1335a02ffc3SAndrew Turner long double negadjust) 1345a02ffc3SAndrew Turner { 1355a02ffc3SAndrew Turner /* On this interval, gamma can't even be distinguished from 1/x, 1365a02ffc3SAndrew Turner * so we skip the polynomial evaluation in tgamma_tiny, partly to 1375a02ffc3SAndrew Turner * save time and partly to avoid the tiny intermediate values 1385a02ffc3SAndrew Turner * setting the underflow exception flag. */ 1395a02ffc3SAndrew Turner if (!negative) 1405a02ffc3SAndrew Turner return 1.0L / x; 1415a02ffc3SAndrew Turner else 1425a02ffc3SAndrew Turner return 1.0L / negadjust; 1435a02ffc3SAndrew Turner } 1445a02ffc3SAndrew Turner 1455a02ffc3SAndrew Turner /* Return tgamma(x) on the assumption that 1 <= x <= 2. */ 1465a02ffc3SAndrew Turner static long double tgamma_central(long double x) 1475a02ffc3SAndrew Turner { 1485a02ffc3SAndrew Turner /* 1495a02ffc3SAndrew Turner * In this central interval, our strategy is to finding the 1505a02ffc3SAndrew Turner * difference between x and the point where gamma has a minimum, 1515a02ffc3SAndrew Turner * and approximate based on that. 1525a02ffc3SAndrew Turner */ 1535a02ffc3SAndrew Turner 1545a02ffc3SAndrew Turner /* The difference between the input x and the minimum x. The first 1555a02ffc3SAndrew Turner * subtraction is expected to be exact, since x and min_hi have 1565a02ffc3SAndrew Turner * the same exponent (unless x=2, in which case it will still be 1575a02ffc3SAndrew Turner * exact). */ 1585a02ffc3SAndrew Turner long double t = (x - min_x_hi) - min_x_lo; 1595a02ffc3SAndrew Turner 1605a02ffc3SAndrew Turner /* 1615a02ffc3SAndrew Turner * Now use two different polynomials for the intervals [1,m] and 1625a02ffc3SAndrew Turner * [m,2]. 1635a02ffc3SAndrew Turner */ 1645a02ffc3SAndrew Turner long double p; 1655a02ffc3SAndrew Turner if (t < 0) 1665a02ffc3SAndrew Turner p = poly(coeffs_central_neg, lenof(coeffs_central_neg), -t); 1675a02ffc3SAndrew Turner else 1685a02ffc3SAndrew Turner p = poly(coeffs_central_pos, lenof(coeffs_central_pos), t); 1695a02ffc3SAndrew Turner 1705a02ffc3SAndrew Turner return (min_y_lo + p * (t*t)) + min_y_hi; 1715a02ffc3SAndrew Turner } 1725a02ffc3SAndrew Turner 1735a02ffc3SAndrew Turner long double tgamma128(long double x) 1745a02ffc3SAndrew Turner { 1755a02ffc3SAndrew Turner /* 1765a02ffc3SAndrew Turner * Start by extracting the number's sign and exponent, and ruling 1775a02ffc3SAndrew Turner * out cases of non-normalized numbers. 1785a02ffc3SAndrew Turner * 1795a02ffc3SAndrew Turner * For an implementation integrated into a system libm, it would 1805a02ffc3SAndrew Turner * almost certainly be quicker to do this by direct bitwise access 1815a02ffc3SAndrew Turner * to the input float128 value, using whatever is the local idiom 1825a02ffc3SAndrew Turner * for knowing its endianness. 1835a02ffc3SAndrew Turner * 1845a02ffc3SAndrew Turner * Integration into a system libc may also need to worry about 1855a02ffc3SAndrew Turner * setting errno, if that's the locally preferred way to report 1865a02ffc3SAndrew Turner * math.h errors. 1875a02ffc3SAndrew Turner */ 1885a02ffc3SAndrew Turner int sign = signbit(x); 1895a02ffc3SAndrew Turner int exponent; 1905a02ffc3SAndrew Turner switch (fpclassify(x)) { 1915a02ffc3SAndrew Turner case FP_NAN: 1925a02ffc3SAndrew Turner return x+x; /* propagate QNaN, make SNaN throw an exception */ 1935a02ffc3SAndrew Turner case FP_ZERO: 1945a02ffc3SAndrew Turner return 1/x; /* divide by zero on purpose to indicate a pole */ 1955a02ffc3SAndrew Turner case FP_INFINITE: 1965a02ffc3SAndrew Turner if (sign) { 1975a02ffc3SAndrew Turner return x-x; /* gamma(-inf) has indeterminate sign, so provoke an 1985a02ffc3SAndrew Turner * IEEE invalid operation exception to indicate that */ 1995a02ffc3SAndrew Turner } 2005a02ffc3SAndrew Turner return x; /* but gamma(+inf) is just +inf with no error */ 2015a02ffc3SAndrew Turner case FP_SUBNORMAL: 2025a02ffc3SAndrew Turner exponent = -16384; 2035a02ffc3SAndrew Turner break; 2045a02ffc3SAndrew Turner default: 2055a02ffc3SAndrew Turner frexpl(x, &exponent); 2065a02ffc3SAndrew Turner exponent--; 2075a02ffc3SAndrew Turner break; 2085a02ffc3SAndrew Turner } 2095a02ffc3SAndrew Turner 2105a02ffc3SAndrew Turner bool negative = false; 2115a02ffc3SAndrew Turner long double negadjust = 0.0L; 2125a02ffc3SAndrew Turner 2135a02ffc3SAndrew Turner if (sign) { 2145a02ffc3SAndrew Turner /* 2155a02ffc3SAndrew Turner * Euler's reflection formula is 2165a02ffc3SAndrew Turner * 2175a02ffc3SAndrew Turner * gamma(1-x) gamma(x) = pi/sin(pi*x) 2185a02ffc3SAndrew Turner * 2195a02ffc3SAndrew Turner * pi 2205a02ffc3SAndrew Turner * => gamma(x) = -------------------- 2215a02ffc3SAndrew Turner * gamma(1-x) sin(pi*x) 2225a02ffc3SAndrew Turner * 2235a02ffc3SAndrew Turner * But computing 1-x is going to lose a lot of accuracy when x 2245a02ffc3SAndrew Turner * is very small, so instead we transform using the recurrence 2255a02ffc3SAndrew Turner * gamma(t+1)=t gamma(t). Setting t=-x, this gives us 2265a02ffc3SAndrew Turner * gamma(1-x) = -x gamma(-x), so we now have 2275a02ffc3SAndrew Turner * 2285a02ffc3SAndrew Turner * pi 2295a02ffc3SAndrew Turner * gamma(x) = ---------------------- 2305a02ffc3SAndrew Turner * -x gamma(-x) sin(pi*x) 2315a02ffc3SAndrew Turner * 2325a02ffc3SAndrew Turner * which relates gamma(x) to gamma(-x), which is much nicer, 2335a02ffc3SAndrew Turner * since x can be turned into -x without rounding. 2345a02ffc3SAndrew Turner */ 2355a02ffc3SAndrew Turner negadjust = sin_pi_x_over_pi(x); 2365a02ffc3SAndrew Turner negative = true; 2375a02ffc3SAndrew Turner x = -x; 2385a02ffc3SAndrew Turner 2395a02ffc3SAndrew Turner /* 2405a02ffc3SAndrew Turner * Now the ultimate answer we want is 2415a02ffc3SAndrew Turner * 2425a02ffc3SAndrew Turner * 1 / (gamma(x) * x * negadjust) 2435a02ffc3SAndrew Turner * 2445a02ffc3SAndrew Turner * where x is the positive value we've just turned it into. 2455a02ffc3SAndrew Turner * 2465a02ffc3SAndrew Turner * For some of the cases below, we'll compute gamma(x) 2475a02ffc3SAndrew Turner * normally and then compute this adjusted value afterwards. 2485a02ffc3SAndrew Turner * But for others, we can implement the reciprocal operation 2495a02ffc3SAndrew Turner * in this formula by _avoiding_ an inversion that the 2505a02ffc3SAndrew Turner * sub-case was going to do anyway. 2515a02ffc3SAndrew Turner */ 2525a02ffc3SAndrew Turner 2535a02ffc3SAndrew Turner if (negadjust == 0) { 2545a02ffc3SAndrew Turner /* 2555a02ffc3SAndrew Turner * Special case for negative integers. Applying the 2565a02ffc3SAndrew Turner * reflection formula would cause division by zero, but 2575a02ffc3SAndrew Turner * standards would prefer we treat this error case as an 2585a02ffc3SAndrew Turner * invalid operation and return NaN instead. (Possibly 2595a02ffc3SAndrew Turner * because otherwise you'd have to decide which sign of 2605a02ffc3SAndrew Turner * infinity to return, and unlike the x=0 case, there's no 2615a02ffc3SAndrew Turner * sign of zero available to disambiguate.) 2625a02ffc3SAndrew Turner */ 2635a02ffc3SAndrew Turner return negadjust / negadjust; 2645a02ffc3SAndrew Turner } 2655a02ffc3SAndrew Turner } 2665a02ffc3SAndrew Turner 2675a02ffc3SAndrew Turner /* 2685a02ffc3SAndrew Turner * Split the positive domain into various cases. For cases where 2695a02ffc3SAndrew Turner * we do the negative-number adjustment the usual way, we'll leave 2705a02ffc3SAndrew Turner * the answer in 'g' and drop out of the if statement. 2715a02ffc3SAndrew Turner */ 2725a02ffc3SAndrew Turner long double g; 2735a02ffc3SAndrew Turner 2745a02ffc3SAndrew Turner if (exponent >= 11) { 2755a02ffc3SAndrew Turner /* 2765a02ffc3SAndrew Turner * gamma of any positive value this large overflows, and gamma 2775a02ffc3SAndrew Turner * of any negative value underflows. 2785a02ffc3SAndrew Turner */ 2795a02ffc3SAndrew Turner if (!negative) { 2805a02ffc3SAndrew Turner long double huge = 0x1p+12288L; 2815a02ffc3SAndrew Turner return huge * huge; /* provoke an overflow */ 2825a02ffc3SAndrew Turner } else { 2835a02ffc3SAndrew Turner long double tiny = 0x1p-12288L; 2845a02ffc3SAndrew Turner return tiny * tiny * negadjust; /* underflow, of the right sign */ 2855a02ffc3SAndrew Turner } 2865a02ffc3SAndrew Turner } else if (exponent >= 3) { 2875a02ffc3SAndrew Turner /* Negative-number adjustment happens inside here */ 2885a02ffc3SAndrew Turner return tgamma_large(x, negative, negadjust); 2895a02ffc3SAndrew Turner } else if (exponent < -113) { 2905a02ffc3SAndrew Turner /* Negative-number adjustment happens inside here */ 2915a02ffc3SAndrew Turner return tgamma_ultratiny(x, negative, negadjust); 2925a02ffc3SAndrew Turner } else if (exponent < -5) { 2935a02ffc3SAndrew Turner /* Negative-number adjustment happens inside here */ 2945a02ffc3SAndrew Turner return tgamma_tiny(x, negative, negadjust); 2955a02ffc3SAndrew Turner } else if (exponent == 0) { 2965a02ffc3SAndrew Turner g = tgamma_central(x); 2975a02ffc3SAndrew Turner } else if (exponent < 0) { 2985a02ffc3SAndrew Turner /* 2995a02ffc3SAndrew Turner * For x in [1/32,1) we range-reduce upwards to the interval 3005a02ffc3SAndrew Turner * [1,2), using the inverse of the normal recurrence formula: 3015a02ffc3SAndrew Turner * gamma(x) = gamma(x+1)/x. 3025a02ffc3SAndrew Turner */ 3035a02ffc3SAndrew Turner g = tgamma_central(1+x) / x; 3045a02ffc3SAndrew Turner } else { 3055a02ffc3SAndrew Turner /* 3065a02ffc3SAndrew Turner * For x in [2,8) we range-reduce downwards to the interval 3075a02ffc3SAndrew Turner * [1,2) by repeated application of the recurrence formula. 3085a02ffc3SAndrew Turner * 3095a02ffc3SAndrew Turner * Actually multiplying (x-1) by (x-2) by (x-3) and so on 3105a02ffc3SAndrew Turner * would introduce multiple ULPs of rounding error. We can get 3115a02ffc3SAndrew Turner * better accuracy by writing x = (k+1/2) + t, where k is an 3125a02ffc3SAndrew Turner * integer and |t|<1/2, and expanding out the obvious factor 3135a02ffc3SAndrew Turner * (x-1)(x-2)...(x-k+1) as a polynomial in t. 3145a02ffc3SAndrew Turner */ 3155a02ffc3SAndrew Turner long double mult; 3165a02ffc3SAndrew Turner int i = x; 3175a02ffc3SAndrew Turner if (i == 2) { /* x in [2,3) */ 3185a02ffc3SAndrew Turner mult = (x-1); 3195a02ffc3SAndrew Turner } else { 3205a02ffc3SAndrew Turner long double t = x - (i + 0.5L); 3215a02ffc3SAndrew Turner switch (i) { 3225a02ffc3SAndrew Turner /* E.g. for x=3.5+t, we want 3235a02ffc3SAndrew Turner * (x-1)(x-2) = (2.5+t)(1.5+t) = 3.75 + 4t + t^2 */ 3245a02ffc3SAndrew Turner case 3: 3255a02ffc3SAndrew Turner mult = 3.75L+t*(4.0L+t); 3265a02ffc3SAndrew Turner break; 3275a02ffc3SAndrew Turner case 4: 3285a02ffc3SAndrew Turner mult = 13.125L+t*(17.75L+t*(7.5L+t)); 3295a02ffc3SAndrew Turner break; 3305a02ffc3SAndrew Turner case 5: 3315a02ffc3SAndrew Turner mult = 59.0625L+t*(93.0L+t*(51.50L+t*(12.0L+t))); 3325a02ffc3SAndrew Turner break; 3335a02ffc3SAndrew Turner case 6: 3345a02ffc3SAndrew Turner mult = 324.84375L+t*(570.5625L+t*(376.250L+t*( 3355a02ffc3SAndrew Turner 117.5L+t*(17.5L+t)))); 3365a02ffc3SAndrew Turner break; 3375a02ffc3SAndrew Turner case 7: 3385a02ffc3SAndrew Turner mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*( 3395a02ffc3SAndrew Turner 1140.0L+t*(231.25L+t*(24.0L+t))))); 3405a02ffc3SAndrew Turner break; 341*f3087befSAndrew Turner default: 342*f3087befSAndrew Turner __builtin_unreachable(); 3435a02ffc3SAndrew Turner } 3445a02ffc3SAndrew Turner } 3455a02ffc3SAndrew Turner 3465a02ffc3SAndrew Turner g = tgamma_central(x - (i-1)) * mult; 3475a02ffc3SAndrew Turner } 3485a02ffc3SAndrew Turner 3495a02ffc3SAndrew Turner if (!negative) { 3505a02ffc3SAndrew Turner /* Positive domain: return g unmodified */ 3515a02ffc3SAndrew Turner return g; 3525a02ffc3SAndrew Turner } else { 3535a02ffc3SAndrew Turner /* Negative domain: apply the reflection formula as commented above */ 3545a02ffc3SAndrew Turner return 1.0L / (g * x * negadjust); 3555a02ffc3SAndrew Turner } 3565a02ffc3SAndrew Turner } 3575a02ffc3SAndrew Turner 3585a02ffc3SAndrew Turner #endif 359