1/* 2 * Copyright (c) 2014 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a copy 5 * of this software and associated documentation files (the "Software"), to deal 6 * in the Software without restriction, including without limitation the rights 7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 * copies of the Software, and to permit persons to whom the Software is 9 * furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 * THE SOFTWARE. 21 */ 22 23#include "sincos_helpers.h" 24#include <clc/clc.h> 25#include <clc/integer/clc_clz.h> 26#include <clc/integer/clc_mul_hi.h> 27#include <clc/math/clc_mad.h> 28#include <clc/math/clc_trunc.h> 29#include <clc/math/math.h> 30#include <clc/math/tables.h> 31#include <clc/shared/clc_max.h> 32 33#define bitalign(hi, lo, shift) ((hi) << (32 - (shift))) | ((lo) >> (shift)); 34 35#define bytealign(src0, src1, src2) \ 36 ((uint)(((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3) * 8))) 37 38_CLC_DEF float __clc_sinf_piby4(float x, float y) { 39 // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... 40 // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... 41 // = x * f(w) 42 // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... 43 // We use a minimax approximation of (f(w) - 1) / w 44 // because this produces an expansion in even powers of x. 45 46 const float c1 = -0.1666666666e0f; 47 const float c2 = 0.8333331876e-2f; 48 const float c3 = -0.198400874e-3f; 49 const float c4 = 0.272500015e-5f; 50 const float c5 = -2.5050759689e-08f; // 0xb2d72f34 51 const float c6 = 1.5896910177e-10f; // 0x2f2ec9d3 52 53 float z = x * x; 54 float v = z * x; 55 float r = __clc_mad( 56 z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), c3), c2); 57 float ret = 58 x - __clc_mad(v, -c1, __clc_mad(z, __clc_mad(y, 0.5f, -v * r), -y)); 59 60 return ret; 61} 62 63_CLC_DEF float __clc_cosf_piby4(float x, float y) { 64 // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... 65 // = f(w) 66 // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... 67 // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) 68 // because this produces an expansion in even powers of x. 69 70 const float c1 = 0.416666666e-1f; 71 const float c2 = -0.138888876e-2f; 72 const float c3 = 0.248006008e-4f; 73 const float c4 = -0.2730101334e-6f; 74 const float c5 = 2.0875723372e-09f; // 0x310f74f6 75 const float c6 = -1.1359647598e-11f; // 0xad47d74e 76 77 float z = x * x; 78 float r = 79 z * 80 __clc_mad( 81 z, 82 __clc_mad(z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), c3), 83 c2), 84 c1); 85 86 // if |x| < 0.3 87 float qx = 0.0f; 88 89 int ix = as_int(x) & EXSIGNBIT_SP32; 90 91 // 0.78125 > |x| >= 0.3 92 float xby4 = as_float(ix - 0x01000000); 93 qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx; 94 95 // x > 0.78125 96 qx = ix > 0x3f480000 ? 0.28125f : qx; 97 98 float hz = __clc_mad(z, 0.5f, -qx); 99 float a = 1.0f - qx; 100 float ret = a - (hz - __clc_mad(z, r, -x * y)); 101 return ret; 102} 103 104_CLC_DEF float __clc_tanf_piby4(float x, int regn) { 105 // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4]. 106 float r = x * x; 107 108 float a = 109 __clc_mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f); 110 111 float b = __clc_mad( 112 r, 113 __clc_mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f), 114 1.15588821434688393452299f); 115 116 float t = __clc_mad(x * r, native_divide(a, b), x); 117 float tr = -MATH_RECIP(t); 118 119 return regn & 1 ? tr : t; 120} 121 122_CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, 123 float bt) { 124 if (HAVE_HW_FMA32()) { 125 float ph = a * b; 126 *hi = ph; 127 *lo = fma(a, b, -ph); 128 } else { 129 float ah = as_float(as_uint(a) & 0xfffff000U); 130 float at = a - ah; 131 float ph = a * b; 132 float pt = __clc_mad( 133 at, bt, __clc_mad(at, bh, __clc_mad(ah, bt, __clc_mad(ah, bh, -ph)))); 134 *hi = ph; 135 *lo = pt; 136 } 137} 138 139_CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x) { 140 // 72 bits of pi/2 141 const float fpiby2_1 = (float)0xC90FDA / 0x1.0p+23f; 142 const float fpiby2_1_h = (float)0xC90 / 0x1.0p+11f; 143 const float fpiby2_1_t = (float)0xFDA / 0x1.0p+23f; 144 145 const float fpiby2_2 = (float)0xA22168 / 0x1.0p+47f; 146 const float fpiby2_2_h = (float)0xA22 / 0x1.0p+35f; 147 const float fpiby2_2_t = (float)0x168 / 0x1.0p+47f; 148 149 const float fpiby2_3 = (float)0xC234C4 / 0x1.0p+71f; 150 const float fpiby2_3_h = (float)0xC23 / 0x1.0p+59f; 151 const float fpiby2_3_t = (float)0x4C4 / 0x1.0p+71f; 152 153 const float twobypi = 0x1.45f306p-1f; 154 155 float fnpi2 = __clc_trunc(__clc_mad(x, twobypi, 0.5f)); 156 157 // subtract n * pi/2 from x 158 float rhead, rtail; 159 __clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t); 160 float v = x - rhead; 161 float rem = v + (((x - v) - rhead) - rtail); 162 163 float rhead2, rtail2; 164 __clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t); 165 v = rem - rhead2; 166 rem = v + (((rem - v) - rhead2) - rtail2); 167 168 float rhead3, rtail3; 169 __clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t); 170 v = rem - rhead3; 171 172 *hi = v + ((rem - v) - rhead3); 173 *lo = -rtail3; 174 return fnpi2; 175} 176 177_CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x) { 178 float fnpi2 = __clc_removePi2S(r, rr, x); 179 return (int)fnpi2 & 0x3; 180} 181 182#define FULL_MUL(A, B, HI, LO) \ 183 LO = A * B; \ 184 HI = __clc_mul_hi(A, B) 185 186#define FULL_MAD(A, B, C, HI, LO) \ 187 LO = ((A) * (B) + (C)); \ 188 HI = __clc_mul_hi(A, B); \ 189 HI += LO < C 190 191_CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x) { 192 int xe = (int)(as_uint(x) >> 23) - 127; 193 uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU); 194 195 // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 196 // FE5163AB 197 const uint b6 = 0xA2F9836EU; 198 const uint b5 = 0x4E441529U; 199 const uint b4 = 0xFC2757D1U; 200 const uint b3 = 0xF534DDC0U; 201 const uint b2 = 0xDB629599U; 202 const uint b1 = 0x3C439041U; 203 const uint b0 = 0xFE5163ABU; 204 205 uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1; 206 207 FULL_MUL(xm, b0, c0, p0); 208 FULL_MAD(xm, b1, c0, c1, p1); 209 FULL_MAD(xm, b2, c1, c0, p2); 210 FULL_MAD(xm, b3, c0, c1, p3); 211 FULL_MAD(xm, b4, c1, c0, p4); 212 FULL_MAD(xm, b5, c0, c1, p5); 213 FULL_MAD(xm, b6, c1, p7, p6); 214 215 uint fbits = 224 + 23 - xe; 216 217 // shift amount to get 2 lsb of integer part at top 2 bits 218 // min: 25 (xe=18) max: 134 (xe=127) 219 uint shift = 256U - 2 - fbits; 220 221 // Shift by up to 134/32 = 4 words 222 int c = shift > 31; 223 p7 = c ? p6 : p7; 224 p6 = c ? p5 : p6; 225 p5 = c ? p4 : p5; 226 p4 = c ? p3 : p4; 227 p3 = c ? p2 : p3; 228 p2 = c ? p1 : p2; 229 p1 = c ? p0 : p1; 230 shift -= (-c) & 32; 231 232 c = shift > 31; 233 p7 = c ? p6 : p7; 234 p6 = c ? p5 : p6; 235 p5 = c ? p4 : p5; 236 p4 = c ? p3 : p4; 237 p3 = c ? p2 : p3; 238 p2 = c ? p1 : p2; 239 shift -= (-c) & 32; 240 241 c = shift > 31; 242 p7 = c ? p6 : p7; 243 p6 = c ? p5 : p6; 244 p5 = c ? p4 : p5; 245 p4 = c ? p3 : p4; 246 p3 = c ? p2 : p3; 247 shift -= (-c) & 32; 248 249 c = shift > 31; 250 p7 = c ? p6 : p7; 251 p6 = c ? p5 : p6; 252 p5 = c ? p4 : p5; 253 p4 = c ? p3 : p4; 254 shift -= (-c) & 32; 255 256 // bitalign cannot handle a shift of 32 257 c = shift > 0; 258 shift = 32 - shift; 259 uint t7 = bitalign(p7, p6, shift); 260 uint t6 = bitalign(p6, p5, shift); 261 uint t5 = bitalign(p5, p4, shift); 262 p7 = c ? t7 : p7; 263 p6 = c ? t6 : p6; 264 p5 = c ? t5 : p5; 265 266 // Get 2 lsb of int part and msb of fraction 267 int i = p7 >> 29; 268 269 // Scoot up 2 more bits so only fraction remains 270 p7 = bitalign(p7, p6, 30); 271 p6 = bitalign(p6, p5, 30); 272 p5 = bitalign(p5, p4, 30); 273 274 // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5 275 uint flip = i & 1 ? 0xffffffffU : 0U; 276 uint sign = i & 1 ? 0x80000000U : 0U; 277 p7 = p7 ^ flip; 278 p6 = p6 ^ flip; 279 p5 = p5 ^ flip; 280 281 // Find exponent and shift away leading zeroes and hidden bit 282 xe = __clc_clz(p7) + 1; 283 shift = 32 - xe; 284 p7 = bitalign(p7, p6, shift); 285 p6 = bitalign(p6, p5, shift); 286 287 // Most significant part of fraction 288 float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9)); 289 290 // Shift out bits we captured on q1 291 p7 = bitalign(p7, p6, 32 - 23); 292 293 // Get 24 more bits of fraction in another float, there are not long strings 294 // of zeroes here 295 int xxe = __clc_clz(p7) + 1; 296 p7 = bitalign(p7, p6, 32 - xxe); 297 float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9)); 298 299 // At this point, the fraction q1 + q0 is correct to at least 48 bits 300 // Now we need to multiply the fraction by pi/2 301 // This loses us about 4 bits 302 // pi/2 = C90 FDA A22 168 C23 4C4 303 304 const float pio2h = (float)0xc90fda / 0x1.0p+23f; 305 const float pio2hh = (float)0xc90 / 0x1.0p+11f; 306 const float pio2ht = (float)0xfda / 0x1.0p+23f; 307 const float pio2t = (float)0xa22168 / 0x1.0p+47f; 308 309 float rh, rt; 310 311 if (HAVE_HW_FMA32()) { 312 rh = q1 * pio2h; 313 rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh))); 314 } else { 315 float q1h = as_float(as_uint(q1) & 0xfffff000); 316 float q1t = q1 - q1h; 317 rh = q1 * pio2h; 318 rt = __clc_mad( 319 q1t, pio2ht, 320 __clc_mad(q1t, pio2hh, 321 __clc_mad(q1h, pio2ht, __clc_mad(q1h, pio2hh, -rh)))); 322 rt = __clc_mad(q0, pio2h, __clc_mad(q1, pio2t, rt)); 323 } 324 325 float t = rh + rt; 326 rt = rt - (t - rh); 327 328 *r = t; 329 *rr = rt; 330 return ((i >> 1) + (i & 1)) & 0x3; 331} 332 333_CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) { 334 if (x < 0x1.0p+23f) 335 return __clc_argReductionSmallS(r, rr, x); 336 else 337 return __clc_argReductionLargeS(r, rr, x); 338} 339 340#ifdef cl_khr_fp64 341 342#pragma OPENCL EXTENSION cl_khr_fp64 : enable 343 344// Reduction for medium sized arguments 345_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, 346 int *regn) { 347 // How many pi/2 is x a multiple of? 348 const double two_by_pi = 0x1.45f306dc9c883p-1; 349 double dnpi2 = __clc_trunc(fma(x, two_by_pi, 0.5)); 350 351 const double piby2_h = -7074237752028440.0 / 0x1.0p+52; 352 const double piby2_m = -2483878800010755.0 / 0x1.0p+105; 353 const double piby2_t = -3956492004828932.0 / 0x1.0p+158; 354 355 // Compute product of npi2 with 159 bits of 2/pi 356 double p_hh = piby2_h * dnpi2; 357 double p_ht = fma(piby2_h, dnpi2, -p_hh); 358 double p_mh = piby2_m * dnpi2; 359 double p_mt = fma(piby2_m, dnpi2, -p_mh); 360 double p_th = piby2_t * dnpi2; 361 double p_tt = fma(piby2_t, dnpi2, -p_th); 362 363 // Reduce to 159 bits 364 double ph = p_hh; 365 double pm = p_ht + p_mh; 366 double t = p_mh - (pm - p_ht); 367 double pt = p_th + t + p_mt + p_tt; 368 t = ph + pm; 369 pm = pm - (t - ph); 370 ph = t; 371 t = pm + pt; 372 pt = pt - (t - pm); 373 pm = t; 374 375 // Subtract from x 376 t = x + ph; 377 double qh = t + pm; 378 double qt = pm - (qh - t) + pt; 379 380 *r = qh; 381 *rr = qt; 382 *regn = (int)(long)dnpi2 & 0x3; 383} 384 385// Given positive argument x, reduce it to the range [-pi/4,pi/4] using 386// extra precision, and return the result in r, rr. 387// Return value "regn" tells how many lots of pi/2 were subtracted 388// from x to put it in the range [-pi/4,pi/4], mod 4. 389 390_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, 391 int *regn) { 392 393 long ux = as_long(x); 394 int e = (int)(ux >> 52) - 1023; 395 int i = __clc_max(23, (e >> 3) + 17); 396 int j = 150 - i; 397 int j16 = j & ~0xf; 398 double fract_temp; 399 400 // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary 401 // byte boundary 402 uint4 q0 = USE_TABLE(pibits_tbl, j16); 403 uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16)); 404 uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32)); 405 406 int k = (j >> 2) & 0x3; 407 int4 c = (int4)k == (int4)(0, 1, 2, 3); 408 409 uint u0, u1, u2, u3, u4, u5, u6; 410 411 u0 = c.s1 ? q0.s1 : q0.s0; 412 u0 = c.s2 ? q0.s2 : u0; 413 u0 = c.s3 ? q0.s3 : u0; 414 415 u1 = c.s1 ? q0.s2 : q0.s1; 416 u1 = c.s2 ? q0.s3 : u1; 417 u1 = c.s3 ? q1.s0 : u1; 418 419 u2 = c.s1 ? q0.s3 : q0.s2; 420 u2 = c.s2 ? q1.s0 : u2; 421 u2 = c.s3 ? q1.s1 : u2; 422 423 u3 = c.s1 ? q1.s0 : q0.s3; 424 u3 = c.s2 ? q1.s1 : u3; 425 u3 = c.s3 ? q1.s2 : u3; 426 427 u4 = c.s1 ? q1.s1 : q1.s0; 428 u4 = c.s2 ? q1.s2 : u4; 429 u4 = c.s3 ? q1.s3 : u4; 430 431 u5 = c.s1 ? q1.s2 : q1.s1; 432 u5 = c.s2 ? q1.s3 : u5; 433 u5 = c.s3 ? q2.s0 : u5; 434 435 u6 = c.s1 ? q1.s3 : q1.s2; 436 u6 = c.s2 ? q2.s0 : u6; 437 u6 = c.s3 ? q2.s1 : u6; 438 439 uint v0 = bytealign(u1, u0, j); 440 uint v1 = bytealign(u2, u1, j); 441 uint v2 = bytealign(u3, u2, j); 442 uint v3 = bytealign(u4, u3, j); 443 uint v4 = bytealign(u5, u4, j); 444 uint v5 = bytealign(u6, u5, j); 445 446 // Place those 192 bits in 4 48-bit doubles along with correct exponent 447 // If i > 1018 we would get subnormals so we scale p up and x down to get the 448 // same product 449 i = 2 + 8 * i; 450 x *= i > 1018 ? 0x1.0p-136 : 1.0; 451 i -= i > 1018 ? 136 : 0; 452 453 uint ua = (uint)(1023 + 52 - i) << 20; 454 double a = as_double((uint2)(0, ua)); 455 double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a; 456 ua += 0x03000000U; 457 a = as_double((uint2)(0, ua)); 458 double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a; 459 ua += 0x03000000U; 460 a = as_double((uint2)(0, ua)); 461 double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a; 462 ua += 0x03000000U; 463 a = as_double((uint2)(0, ua)); 464 double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a; 465 466 // Exact multiply 467 double f0h = p0 * x; 468 double f0l = fma(p0, x, -f0h); 469 double f1h = p1 * x; 470 double f1l = fma(p1, x, -f1h); 471 double f2h = p2 * x; 472 double f2l = fma(p2, x, -f2h); 473 double f3h = p3 * x; 474 double f3l = fma(p3, x, -f3h); 475 476 // Accumulate product into 4 doubles 477 double s, t; 478 479 double f3 = f3h + f2h; 480 t = f2h - (f3 - f3h); 481 s = f3l + t; 482 t = t - (s - f3l); 483 484 double f2 = s + f1h; 485 t = f1h - (f2 - s) + t; 486 s = f2l + t; 487 t = t - (s - f2l); 488 489 double f1 = s + f0h; 490 t = f0h - (f1 - s) + t; 491 s = f1l + t; 492 493 double f0 = s + f0l; 494 495 // Strip off unwanted large integer bits 496 f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp); 497 f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0; 498 499 // Compute least significant integer bits 500 t = f3 + f2; 501 double di = t - fract(t, &fract_temp); 502 i = (float)di; 503 504 // Shift out remaining integer part 505 f3 -= di; 506 s = f3 + f2; 507 t = f2 - (s - f3); 508 f3 = s; 509 f2 = t; 510 s = f2 + f1; 511 t = f1 - (s - f2); 512 f2 = s; 513 f1 = t; 514 f1 += f0; 515 516 // Subtract 1 if fraction is >= 0.5, and update regn 517 int g = f3 >= 0.5; 518 i += g; 519 f3 -= (float)g; 520 521 // Shift up bits 522 s = f3 + f2; 523 t = f2 - (s - f3); 524 f3 = s; 525 f2 = t + f1; 526 527 // Multiply precise fraction by pi/2 to get radians 528 const double p2h = 7074237752028440.0 / 0x1.0p+52; 529 const double p2t = 4967757600021510.0 / 0x1.0p+106; 530 531 double rhi = f3 * p2h; 532 double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi))); 533 534 *r = rhi + rlo; 535 *rr = rlo - (*r - rhi); 536 *regn = i & 0x3; 537} 538 539_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) { 540 // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... 541 // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... 542 // = x * f(w) 543 // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... 544 // We use a minimax approximation of (f(w) - 1) / w 545 // because this produces an expansion in even powers of x. 546 // If xx (the tail of x) is non-zero, we add a correction 547 // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx) 548 // is an approximation to cos(x)*sin(xx) valid because 549 // xx is tiny relative to x. 550 551 // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... 552 // = f(w) 553 // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... 554 // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) 555 // because this produces an expansion in even powers of x. 556 // If xx (the tail of x) is non-zero, we subtract a correction 557 // term g(x,xx) = x*xx to the result, where g(x,xx) 558 // is an approximation to sin(x)*sin(xx) valid because 559 // xx is tiny relative to x. 560 561 const double sc1 = -0.166666666666666646259241729; 562 const double sc2 = 0.833333333333095043065222816e-2; 563 const double sc3 = -0.19841269836761125688538679e-3; 564 const double sc4 = 0.275573161037288022676895908448e-5; 565 const double sc5 = -0.25051132068021699772257377197e-7; 566 const double sc6 = 0.159181443044859136852668200e-9; 567 568 const double cc1 = 0.41666666666666665390037e-1; 569 const double cc2 = -0.13888888888887398280412e-2; 570 const double cc3 = 0.248015872987670414957399e-4; 571 const double cc4 = -0.275573172723441909470836e-6; 572 const double cc5 = 0.208761463822329611076335e-8; 573 const double cc6 = -0.113826398067944859590880e-10; 574 575 double x2 = x * x; 576 double x3 = x2 * x; 577 double r = 0.5 * x2; 578 double t = 1.0 - r; 579 580 double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2); 581 582 double cp = 583 t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), 584 x2, cc1), 585 x2 * x2, fma(x, xx, (1.0 - t) - r)); 586 587 double2 ret; 588 ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5 * xx), x2, -xx)); 589 ret.hi = cp; 590 591 return ret; 592} 593 594#endif 595