1#include <clc/clc.h> 2#include <clc/clcmacro.h> 3#include <clc/math/math.h> 4#include <clc/math/tables.h> 5 6/* Refer to the exp routine for the underlying algorithm */ 7 8_CLC_OVERLOAD _CLC_DEF float expm1(float x) { 9 const float X_MAX = 0x1.62e42ep+6f; // 128*log2 : 88.722839111673 10 const float X_MIN = -0x1.9d1da0p+6f; // -149*log2 : -103.27892990343184 11 12 const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657 13 const float R_LOG2_BY_64_LD = 0x1.620000p-7f; // log2/64 lead: 0.0108032227 14 const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388 15 16 uint xi = as_uint(x); 17 int n = (int)(x * R_64_BY_LOG2); 18 float fn = (float)n; 19 20 int j = n & 0x3f; 21 int m = n >> 6; 22 23 float r = mad(fn, -R_LOG2_BY_64_TL, mad(fn, -R_LOG2_BY_64_LD, x)); 24 25 // Truncated Taylor series 26 float z2 = mad(r*r, mad(r, mad(r, 0x1.555556p-5f, 0x1.555556p-3f), 0.5f), r); 27 28 float m2 = as_float((m + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); 29 float2 tv = USE_TABLE(exp_tbl_ep, j); 30 31 float two_to_jby64_h = tv.s0 * m2; 32 float two_to_jby64_t = tv.s1 * m2; 33 float two_to_jby64 = two_to_jby64_h + two_to_jby64_t; 34 35 z2 = mad(z2, two_to_jby64, two_to_jby64_t) + (two_to_jby64_h - 1.0f); 36 //Make subnormals work 37 z2 = x == 0.f ? x : z2; 38 z2 = x < X_MIN | m < -24 ? -1.0f : z2; 39 z2 = x > X_MAX ? as_float(PINFBITPATT_SP32) : z2; 40 z2 = isnan(x) ? x : z2; 41 42 return z2; 43} 44 45_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, expm1, float) 46 47#ifdef cl_khr_fp64 48 49#include "exp_helper.h" 50 51#pragma OPENCL EXTENSION cl_khr_fp64 : enable 52 53_CLC_OVERLOAD _CLC_DEF double expm1(double x) { 54 const double max_expm1_arg = 709.8; 55 const double min_expm1_arg = -37.42994775023704; 56 const double log_OnePlus_OneByFour = 0.22314355131420976; //0x3FCC8FF7C79A9A22 = log(1+1/4) 57 const double log_OneMinus_OneByFour = -0.28768207245178096; //0xBFD269621134DB93 = log(1-1/4) 58 const double sixtyfour_by_lnof2 = 92.33248261689366; //0x40571547652b82fe 59 const double lnof2_by_64_head = 0.010830424696223417; //0x3f862e42fefa0000 60 const double lnof2_by_64_tail = 2.5728046223276688e-14; //0x3d1cf79abc9e3b39 61 62 // First, assume log(1-1/4) < x < log(1+1/4) i.e -0.28768 < x < 0.22314 63 double u = as_double(as_ulong(x) & 0xffffffffff000000UL); 64 double v = x - u; 65 double y = u * u * 0.5; 66 double z = v * (x + u) * 0.5; 67 68 double q = fma(x, 69 fma(x, 70 fma(x, 71 fma(x, 72 fma(x, 73 fma(x, 74 fma(x, 75 fma(x,2.4360682937111612e-8, 2.7582184028154370e-7), 76 2.7558212415361945e-6), 77 2.4801576918453420e-5), 78 1.9841269447671544e-4), 79 1.3888888890687830e-3), 80 8.3333333334012270e-3), 81 4.1666666666665560e-2), 82 1.6666666666666632e-1); 83 q *= x * x * x; 84 85 double z1g = (u + y) + (q + (v + z)); 86 double z1 = x + (y + (q + z)); 87 z1 = y >= 0x1.0p-7 ? z1g : z1; 88 89 // Now assume outside interval around 0 90 int n = (int)(x * sixtyfour_by_lnof2); 91 int j = n & 0x3f; 92 int m = n >> 6; 93 94 double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j); 95 double f1 = tv.s0; 96 double f2 = tv.s1; 97 double f = f1 + f2; 98 99 double dn = -n; 100 double r = fma(dn, lnof2_by_64_tail, fma(dn, lnof2_by_64_head, x)); 101 102 q = fma(r, 103 fma(r, 104 fma(r, 105 fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03), 106 4.16666666662260795726e-02), 107 1.66666666665260878863e-01), 108 5.00000000000000008883e-01); 109 q = fma(r*r, q, r); 110 111 double twopm = as_double((long)(m + EXPBIAS_DP64) << EXPSHIFTBITS_DP64); 112 double twopmm = as_double((long)(EXPBIAS_DP64 - m) << EXPSHIFTBITS_DP64); 113 114 // Computations for m > 52, including where result is close to Inf 115 ulong uval = as_ulong(0x1.0p+1023 * (f1 + (f * q + (f2)))); 116 int e = (int)(uval >> EXPSHIFTBITS_DP64) + 1; 117 118 double zme1024 = as_double(((long)e << EXPSHIFTBITS_DP64) | (uval & MANTBITS_DP64)); 119 zme1024 = e == 2047 ? as_double(PINFBITPATT_DP64) : zme1024; 120 121 double zmg52 = twopm * (f1 + fma(f, q, f2 - twopmm)); 122 zmg52 = m == 1024 ? zme1024 : zmg52; 123 124 // For m < 53 125 double zml53 = twopm * ((f1 - twopmm) + fma(f1, q, f2*(1.0 + q))); 126 127 // For m < -7 128 double zmln7 = fma(twopm, f1 + fma(f, q, f2), -1.0); 129 130 z = m < 53 ? zml53 : zmg52; 131 z = m < -7 ? zmln7 : z; 132 z = x > log_OneMinus_OneByFour & x < log_OnePlus_OneByFour ? z1 : z; 133 z = x > max_expm1_arg ? as_double(PINFBITPATT_DP64) : z; 134 z = x < min_expm1_arg ? -1.0 : z; 135 136 return z; 137} 138 139_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, expm1, double) 140 141#endif 142 143#ifdef cl_khr_fp16 144 145#pragma OPENCL EXTENSION cl_khr_fp16 : enable 146 147_CLC_DEFINE_UNARY_BUILTIN_FP16(expm1) 148 149#endif 150