libm/ld80/s_expl.c

*cfe182f3Schristos/*-
*cfe182f3Schristos * SPDX-License-Identifier: BSD-2-Clause
*cfe182f3Schristos *
*cfe182f3Schristos * Copyright (c) 2009-2013 Steven G. Kargl
*cfe182f3Schristos * All rights reserved.
*cfe182f3Schristos *
*cfe182f3Schristos * Redistribution and use in source and binary forms, with or without
*cfe182f3Schristos * modification, are permitted provided that the following conditions
*cfe182f3Schristos * are met:
*cfe182f3Schristos * 1. Redistributions of source code must retain the above copyright
*cfe182f3Schristos *    notice unmodified, this list of conditions, and the following
*cfe182f3Schristos *    disclaimer.
*cfe182f3Schristos * 2. Redistributions in binary form must reproduce the above copyright
*cfe182f3Schristos *    notice, this list of conditions and the following disclaimer in the
*cfe182f3Schristos *    documentation and/or other materials provided with the distribution.
*cfe182f3Schristos *
*cfe182f3Schristos * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
*cfe182f3Schristos * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
*cfe182f3Schristos * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
*cfe182f3Schristos * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
*cfe182f3Schristos * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
*cfe182f3Schristos * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
*cfe182f3Schristos * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
*cfe182f3Schristos * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
*cfe182f3Schristos * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
*cfe182f3Schristos * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*cfe182f3Schristos *
*cfe182f3Schristos * Optimized by Bruce D. Evans.
*cfe182f3Schristos */
*cfe182f3Schristos
*cfe182f3Schristos#include <sys/cdefs.h>
*cfe182f3Schristos/**
*cfe182f3Schristos * Compute the exponential of x for Intel 80-bit format.  This is based on:
*cfe182f3Schristos *
*cfe182f3Schristos *   PTP Tang, "Table-driven implementation of the exponential function
*cfe182f3Schristos *   in IEEE floating-point arithmetic," ACM Trans. Math. Soft., 15,
*cfe182f3Schristos *   144-157 (1989).
*cfe182f3Schristos *
*cfe182f3Schristos * where the 32 table entries have been expanded to INTERVALS (see below).
*cfe182f3Schristos */
*cfe182f3Schristos
*cfe182f3Schristos#include <float.h>
*cfe182f3Schristos
*cfe182f3Schristos#ifdef __FreeBSD__
*cfe182f3Schristos#include "fpmath.h"
*cfe182f3Schristos#endif
*cfe182f3Schristos#include "math.h"
*cfe182f3Schristos#include "math_private.h"
*cfe182f3Schristos#include "k_expl.h"
*cfe182f3Schristos
/*
 * Scaling constants used by the range-error and underflow paths below:
 *   huge      = 2**10000,  multiplied by itself for results above o_threshold;
 *   tiny      = 2**-10000, multiplied by itself in expl() for x below
 *               u_threshold, and used as "tiny - 1" in expm1l();
 *   twom10000 = 2**-10000, used by expl() to undo the +10000 exponent
 *               offset it applies when k < LDBL_MIN_EXP.
 * huge and tiny are volatile so that those products are evaluated at run
 * time (presumably so the floating-point exception flags get raised --
 * confirm); twom10000 is an ordinary constant.
 */
*cfe182f3Schristos/* XXX Prevent compilers from erroneously constant folding these: */
*cfe182f3Schristosstatic const volatile long double
*cfe182f3Schristoshuge = 0x1p10000L,
*cfe182f3Schristostiny = 0x1p-10000L;
*cfe182f3Schristos
*cfe182f3Schristosstatic const long double
*cfe182f3Schristostwom10000 = 0x1p-10000L;
*cfe182f3Schristos
/*
 * Overflow and underflow thresholds.  o_threshold is compared against x in
 * both expl() and expm1l(); u_threshold is only used by expl().  The
 * o_threshold and u_threshold macros select the long double member
 * (extu_ld) of the corresponding union.
 *
 * NOTE(review): LD80C() is defined outside this file; its arguments look
 * like (64-bit significand, binary exponent, decimal value) -- confirm.
 */
*cfe182f3Schristosstatic const union ieee_ext_u
*cfe182f3Schristos/* log(2**16384 - 0.5) rounded towards zero: */
*cfe182f3Schristos/* log(2**16384 - 0.5 + 1) rounded towards zero for expm1l() is the same: */
*cfe182f3Schristoso_thresholdu = LD80C(0xb17217f7d1cf79ab, 13,  11356.5234062941439488L),
*cfe182f3Schristos#define o_threshold	 (o_thresholdu.extu_ld)
*cfe182f3Schristos/* log(2**(-16381-64-1)) rounded towards zero: */
*cfe182f3Schristosu_thresholdu = LD80C(0xb21dfe7f09e2baa9, 13, -11399.4985314888605581L);
*cfe182f3Schristos#define u_threshold	 (u_thresholdu.extu_ld)
*cfe182f3Schristos
/*
 * expl(x): exponential of x for the 80-bit long double format.
 *
 * The sign/exponent word of x is inspected first so that exceptional
 * inputs never reach the kernel:
 *   - maximum exponent (Inf, NaN or unsupported encodings): returns
 *     -1 / x when the sign bit is set (0 for -Inf, NaN for a NaN) and
 *     x + x when it is clear (+Inf stays +Inf, NaN stays NaN);
 *   - |x| >= 8192 and x > o_threshold: returns huge * huge;
 *   - |x| >= 8192 and x < u_threshold: returns tiny * tiny;
 *   - |x| < 0x1p-75: returns 1 + x.
 * Every other x is handed to __k_expl(), whose hi and lo outputs are
 * summed and then scaled by 2**k.
 *
 * NOTE(review): GET_EXPSIGN, SET_LDBL_EXPSIGN, RETURNF, ENTERI/RETURNI,
 * SUM2P, BIAS and __k_expl() are defined outside this file; what is said
 * about them here is inferred from how they are used -- confirm against
 * math_private.h and k_expl.h.
 */
*cfe182f3Schristoslong double
*cfe182f3Schristosexpl(long double x)
*cfe182f3Schristos{
*cfe182f3Schristos	union ieee_ext_u u;
*cfe182f3Schristos	long double hi, lo, t, twopk;
*cfe182f3Schristos	int k;
*cfe182f3Schristos	uint16_t hx, ix;
*cfe182f3Schristos
*cfe182f3Schristos	/* Filter out exceptional cases. */
*cfe182f3Schristos	u.extu_ld = x;
*cfe182f3Schristos	hx = GET_EXPSIGN(&u);
	/* ix is hx with the sign bit (0x8000) masked off. */
*cfe182f3Schristos	ix = hx & 0x7fff;
*cfe182f3Schristos	if (ix >= BIAS + 13) {		/* |x| >= 8192 or x is NaN */
*cfe182f3Schristos		if (ix == BIAS + LDBL_MAX_EXP) {
*cfe182f3Schristos			if (hx & 0x8000)  /* x is -Inf, -NaN or unsupported */
*cfe182f3Schristos				RETURNF(-1 / x);
*cfe182f3Schristos			RETURNF(x + x);	/* x is +Inf, +NaN or unsupported */
*cfe182f3Schristos		}
		/* huge and tiny are volatile, so these products run at run time. */
*cfe182f3Schristos		if (x > o_threshold)
*cfe182f3Schristos			RETURNF(huge * huge);
*cfe182f3Schristos		if (x < u_threshold)
*cfe182f3Schristos			RETURNF(tiny * tiny);
*cfe182f3Schristos	} else if (ix < BIAS - 75) {	/* |x| < 0x1p-75 (includes pseudos) */
*cfe182f3Schristos		RETURNF(1 + x);		/* 1 with inexact iff x != 0 */
*cfe182f3Schristos	}
*cfe182f3Schristos
*cfe182f3Schristos	ENTERI();
*cfe182f3Schristos
	/* twopk starts as 1; only its sign/exponent word is rewritten below. */
*cfe182f3Schristos	twopk = 1;
*cfe182f3Schristos	__k_expl(x, &hi, &lo, &k);
*cfe182f3Schristos	t = SUM2P(hi, lo);
*cfe182f3Schristos
*cfe182f3Schristos	/* Scale by 2**k. */
*cfe182f3Schristos	if (k >= LDBL_MIN_EXP) {
		/*
		 * BIAS + LDBL_MAX_EXP is the exponent tested for Inf/NaN
		 * above, so 2**k cannot be built directly for that k;
		 * scale by 2 and by 2**16383 instead.
		 */
*cfe182f3Schristos		if (k == LDBL_MAX_EXP)
*cfe182f3Schristos			RETURNI(t * 2 * 0x1p16383L);
*cfe182f3Schristos		SET_LDBL_EXPSIGN(twopk, BIAS + k);
*cfe182f3Schristos		RETURNI(t * twopk);
*cfe182f3Schristos	} else {
		/*
		 * k < LDBL_MIN_EXP: build 2**(k + 10000) and let the final
		 * multiplication by twom10000 (2**-10000) bring the result
		 * down to its true magnitude.
		 */
*cfe182f3Schristos		SET_LDBL_EXPSIGN(twopk, BIAS + k + 10000);
*cfe182f3Schristos		RETURNI(t * twopk * twom10000);
*cfe182f3Schristos	}
*cfe182f3Schristos}
*cfe182f3Schristos
*cfe182f3Schristos/**
*cfe182f3Schristos * Compute expm1l(x) for Intel 80-bit format.  This is based on:
*cfe182f3Schristos *
*cfe182f3Schristos *   PTP Tang, "Table-driven implementation of the Expm1 function
*cfe182f3Schristos *   in IEEE floating-point arithmetic," ACM Trans. Math. Soft., 18,
*cfe182f3Schristos *   211-222 (1992).
*cfe182f3Schristos */
*cfe182f3Schristos
*cfe182f3Schristos/*
*cfe182f3Schristos * Our T1 and T2 are chosen to be approximately the points where method
*cfe182f3Schristos * A and method B have the same accuracy.  Tang's T1 and T2 are the
*cfe182f3Schristos * points where method A's accuracy changes by a full bit.  For Tang,
*cfe182f3Schristos * this drop in accuracy makes method A immediately less accurate than
*cfe182f3Schristos * method B, but our larger INTERVALS makes method A 2 bits more
*cfe182f3Schristos * accurate so it remains the most accurate method significantly
*cfe182f3Schristos * closer to the origin despite losing the full bit in our extended
*cfe182f3Schristos * range for it.
*cfe182f3Schristos */
/*
 * expm1l() evaluates its polynomial directly on x when T1 < x < T2 and
 * falls through to the table-driven reduction for every other x.
 */
*cfe182f3Schristosstatic const double
*cfe182f3SchristosT1 = -0.1659,				/* ~-30.625/128 * log(2) */
*cfe182f3SchristosT2 =  0.1659;				/* ~30.625/128 * log(2) */
*cfe182f3Schristos
*cfe182f3Schristos/*
*cfe182f3Schristos * Domain [-0.1659, 0.1659], range ~[-2.6155e-22, 2.5507e-23]:
*cfe182f3Schristos * |(exp(x)-1-x-x**2/2)/x - p(x)| < 2**-71.6
*cfe182f3Schristos *
*cfe182f3Schristos * XXX the coeffs aren't very carefully rounded, and I get 2.8 more bits,
*cfe182f3Schristos * but unlike for ld128 we can't drop any terms.
*cfe182f3Schristos */
/*
 * Coefficients of q(x) = B3*x**3 + B4*x**4 + ... + B12*x**12, the
 * polynomial that expm1l() adds to x + x**2/2 when T1 < x < T2.
 * B3 (about 1/6) and B4 (about 1/24) are long double constants built with
 * LD80C() and read through .extu_ld; B5..B12 are plain doubles, with the
 * hexadecimal form of each value given in its trailing comment.
 */
*cfe182f3Schristosstatic const union ieee_ext_u
*cfe182f3SchristosB3 = LD80C(0xaaaaaaaaaaaaaaab, -3,  1.66666666666666666671e-1L),
*cfe182f3SchristosB4 = LD80C(0xaaaaaaaaaaaaaaac, -5,  4.16666666666666666712e-2L);
*cfe182f3Schristos
*cfe182f3Schristosstatic const double
*cfe182f3SchristosB5  =  8.3333333333333245e-3,		/*  0x1.111111111110cp-7 */
*cfe182f3SchristosB6  =  1.3888888888888861e-3,		/*  0x1.6c16c16c16c0ap-10 */
*cfe182f3SchristosB7  =  1.9841269841532042e-4,		/*  0x1.a01a01a0319f9p-13 */
*cfe182f3SchristosB8  =  2.4801587302069236e-5,		/*  0x1.a01a01a03cbbcp-16 */
*cfe182f3SchristosB9  =  2.7557316558468562e-6,		/*  0x1.71de37fd33d67p-19 */
*cfe182f3SchristosB10 =  2.7557315829785151e-7,		/*  0x1.27e4f91418144p-22 */
*cfe182f3SchristosB11 =  2.5063168199779829e-8,		/*  0x1.ae94fabdc6b27p-26 */
*cfe182f3SchristosB12 =  2.0887164654459567e-9;		/*  0x1.1f122d6413fe1p-29 */
*cfe182f3Schristos
/*
 * expm1l(x): exp(x) - 1 for the 80-bit long double format.
 *
 * Structure:
 *   1. Exceptional inputs are filtered from the sign/exponent word:
 *      Inf/NaN/unsupported encodings, x > o_threshold (huge * huge) and
 *      any negative x with |x| >= 64 (tiny - 1).
 *   2. For T1 < x < T2 the result is x + x**2/2 + q(x), where q(x) is the
 *      B3..B12 polynomial; for |x| < 0x1p-74 it is x itself.
 *   3. Otherwise x is reduced to k*ln2 + endpoint[n2] + r1 + r2, the
 *      table entry tbl[n2] is combined with a polynomial in r = r1 + r2,
 *      and the 1 is subtracted in a way that depends on k (see the
 *      individual cases below).
 *
 * NOTE(review): INV_L, L1, L2, INTERVALS, LOG2_INTERVALS, A2..A6, tbl[],
 * rnintl(), irint(), SUM2P, GET_EXPSIGN/SET_EXPSIGN, RETURNF and
 * ENTERI/RETURNI are defined outside this file; what is said about them
 * here is inferred from how they are used -- confirm against k_expl.h and
 * math_private.h.
 */
*cfe182f3Schristoslong double
*cfe182f3Schristosexpm1l(long double x)
*cfe182f3Schristos{
*cfe182f3Schristos	union ieee_ext_u u, v;
*cfe182f3Schristos	long double fn, hx2_hi, hx2_lo, q, r, r1, r2, t, twomk, twopk, x_hi;
*cfe182f3Schristos	long double x_lo, x2, z;
*cfe182f3Schristos	long double x4;
*cfe182f3Schristos	int k, n, n2;
*cfe182f3Schristos	uint16_t hx, ix;
*cfe182f3Schristos
*cfe182f3Schristos	/* Filter out exceptional cases. */
*cfe182f3Schristos	u.extu_ld = x;
*cfe182f3Schristos	hx = GET_EXPSIGN(&u);
	/* ix is hx with the sign bit (0x8000) masked off. */
*cfe182f3Schristos	ix = hx & 0x7fff;
*cfe182f3Schristos	if (ix >= BIAS + 6) {		/* |x| >= 64 or x is NaN */
*cfe182f3Schristos		if (ix == BIAS + LDBL_MAX_EXP) {
			/* -1 / x - 1 is -1 for -Inf and NaN for a NaN. */
*cfe182f3Schristos			if (hx & 0x8000)  /* x is -Inf, -NaN or unsupported */
*cfe182f3Schristos				RETURNF(-1 / x - 1);
*cfe182f3Schristos			RETURNF(x + x);	/* x is +Inf, +NaN or unsupported */
*cfe182f3Schristos		}
*cfe182f3Schristos		if (x > o_threshold)
*cfe182f3Schristos			RETURNF(huge * huge);
*cfe182f3Schristos		/*
*cfe182f3Schristos		 * expm1l() never underflows, but it must avoid
*cfe182f3Schristos		 * unrepresentable large negative exponents.  We used a
*cfe182f3Schristos		 * much smaller threshold for large |x| above than in
*cfe182f3Schristos		 * expl() so as to handle not so large negative exponents
*cfe182f3Schristos		 * in the same way as large ones here.
*cfe182f3Schristos		 */
*cfe182f3Schristos		if (hx & 0x8000)	/* x <= -64 */
*cfe182f3Schristos			RETURNF(tiny - 1);	/* good for x < -65ln2 - eps */
*cfe182f3Schristos	}
*cfe182f3Schristos
*cfe182f3Schristos	ENTERI();
*cfe182f3Schristos
	/* Small |x|: work on x directly, without argument reduction. */
*cfe182f3Schristos	if (T1 < x && x < T2) {
*cfe182f3Schristos		if (ix < BIAS - 74) {	/* |x| < 0x1p-74 (includes pseudos) */
*cfe182f3Schristos			/* x (rounded) with inexact if x != 0: */
*cfe182f3Schristos			RETURNI(x == 0 ? x :
*cfe182f3Schristos			    (0x1p100 * x + fabsl(x)) * 0x1p-100);
*cfe182f3Schristos		}
*cfe182f3Schristos
		/*
		 * q = B3*x**3 + B4*x**4 + ... + B12*x**12, grouped so that
		 * the sub-sums can be formed from x, x2 and x4.
		 */
*cfe182f3Schristos		x2 = x * x;
*cfe182f3Schristos		x4 = x2 * x2;
*cfe182f3Schristos		q = x4 * (x2 * (x4 *
*cfe182f3Schristos		    /*
*cfe182f3Schristos		     * XXX the number of terms is no longer good for
*cfe182f3Schristos		     * pairwise grouping of all except B3, and the
*cfe182f3Schristos		     * grouping is no longer from highest down.
*cfe182f3Schristos		     */
*cfe182f3Schristos		    (x2 *            B12  + (x * B11 + B10)) +
*cfe182f3Schristos		    (x2 * (x * B9 +  B8) +  (x * B7 +  B6))) +
*cfe182f3Schristos			  (x * B5 +  B4.extu_ld)) + x2 * x * B3.extu_ld;
*cfe182f3Schristos
		/*
		 * Split x into a float-precision head x_hi and the remainder
		 * x_lo, then form x**2/2 as hx2_hi + hx2_lo:
		 *   hx2_hi = x_hi**2 / 2
		 *   hx2_lo = x_lo * (x + x_hi) / 2 = (x**2 - x_hi**2) / 2
		 * The two returns below add the same terms (x, x**2/2 and q)
		 * with different grouping depending on the magnitude of x.
		 */
*cfe182f3Schristos		x_hi = (float)x;
*cfe182f3Schristos		x_lo = x - x_hi;
*cfe182f3Schristos		hx2_hi = x_hi * x_hi / 2;
*cfe182f3Schristos		hx2_lo = x_lo * (x + x_hi) / 2;
*cfe182f3Schristos		if (ix >= BIAS - 7)
*cfe182f3Schristos			RETURNI((hx2_hi + x_hi) + (hx2_lo + x_lo + q));
*cfe182f3Schristos		else
*cfe182f3Schristos			RETURNI(x + (hx2_lo + q + hx2_hi));
*cfe182f3Schristos	}
*cfe182f3Schristos
*cfe182f3Schristos	/* Reduce x to (k*ln2 + endpoint[n2] + r1 + r2). */
*cfe182f3Schristos	fn = rnintl(x * INV_L);
*cfe182f3Schristos	n = irint(fn);
	/*
	 * n2 indexes tbl[]; the unsigned cast keeps it non-negative when n
	 * is negative.  NOTE(review): k relies on >> of a negative int being
	 * an arithmetic shift, which is implementation-defined in C.
	 */
*cfe182f3Schristos	n2 = (unsigned)n % INTERVALS;
*cfe182f3Schristos	k = n >> LOG2_INTERVALS;
*cfe182f3Schristos	r1 = x - fn * L1;
*cfe182f3Schristos	r2 = fn * -L2;
*cfe182f3Schristos	r = r1 + r2;
*cfe182f3Schristos
*cfe182f3Schristos	/* Prepare scale factor. */
	/* twopk = 2**k: start from 1 and overwrite the sign/exponent word. */
*cfe182f3Schristos	v.extu_ld = 1;
*cfe182f3Schristos	SET_EXPSIGN(&v, BIAS + k);
*cfe182f3Schristos	twopk = v.extu_ld;
*cfe182f3Schristos
*cfe182f3Schristos	/*
*cfe182f3Schristos	 * Evaluate lower terms of
*cfe182f3Schristos	 * expl(endpoint[n2] + r1 + r2) = tbl[n2] * expl(r1 + r2).
*cfe182f3Schristos	 */
	/* q = r2 + A2*r**2 + A3*r**3 + A4*r**4 + A5*r**5 + A6*r**6. */
*cfe182f3Schristos	z = r * r;
*cfe182f3Schristos	q = r2 + z * (A2 + r * A3) + z * z * (A4 + r * A5) + z * z * z * A6;
*cfe182f3Schristos
*cfe182f3Schristos	t = (long double)tbl[n2].lo + tbl[n2].hi;
*cfe182f3Schristos
	/* k == 0: no scaling; the 1 is subtracted from the table's high part. */
*cfe182f3Schristos	if (k == 0) {
*cfe182f3Schristos		t = SUM2P(tbl[n2].hi - 1, tbl[n2].lo * (r1 + 1) + t * q +
*cfe182f3Schristos		    tbl[n2].hi * r1);
*cfe182f3Schristos		RETURNI(t);
*cfe182f3Schristos	}
	/* k == -1: subtract 2 before halving, i.e. (tbl*exp(r) - 2) / 2. */
*cfe182f3Schristos	if (k == -1) {
*cfe182f3Schristos		t = SUM2P(tbl[n2].hi - 2, tbl[n2].lo * (r1 + 1) + t * q +
*cfe182f3Schristos		    tbl[n2].hi * r1);
*cfe182f3Schristos		RETURNI(t / 2);
*cfe182f3Schristos	}
	/* k < -7: scale by 2**k first, then subtract 1. */
*cfe182f3Schristos	if (k < -7) {
*cfe182f3Schristos		t = SUM2P(tbl[n2].hi, tbl[n2].lo + t * (q + r1));
*cfe182f3Schristos		RETURNI(t * twopk - 1);
*cfe182f3Schristos	}
	/*
	 * k > 2 * LDBL_MANT_DIG - 1: scale first, then subtract 1.  For
	 * k == LDBL_MAX_EXP, BIAS + k is the exponent tested for Inf/NaN
	 * above, so twopk is not used and the scaling is done by 2 and by
	 * 2**16383 instead.
	 */
*cfe182f3Schristos	if (k > 2 * LDBL_MANT_DIG - 1) {
*cfe182f3Schristos		t = SUM2P(tbl[n2].hi, tbl[n2].lo + t * (q + r1));
*cfe182f3Schristos		if (k == LDBL_MAX_EXP)
*cfe182f3Schristos			RETURNI(t * 2 * 0x1p16383L - 1);
*cfe182f3Schristos		RETURNI(t * twopk - 1);
*cfe182f3Schristos	}
*cfe182f3Schristos
	/*
	 * Remaining k: subtract twomk = 2**-k before scaling by 2**k, which
	 * amounts to subtracting 1 from the scaled result.  twomk is folded
	 * into the low part when k > LDBL_MANT_DIG - 1 and into the high
	 * part otherwise.
	 */
*cfe182f3Schristos	SET_EXPSIGN(&v, BIAS - k);
*cfe182f3Schristos	twomk = v.extu_ld;
*cfe182f3Schristos
*cfe182f3Schristos	if (k > LDBL_MANT_DIG - 1)
*cfe182f3Schristos		t = SUM2P(tbl[n2].hi, tbl[n2].lo - twomk + t * (q + r1));
*cfe182f3Schristos	else
*cfe182f3Schristos		t = SUM2P(tbl[n2].hi - twomk, tbl[n2].lo + t * (q + r1));
*cfe182f3Schristos	RETURNI(t * twopk);
*cfe182f3Schristos}