xref: /llvm-project/libclc/generic/lib/math/sincos_helpers.cl (revision 7441e87fe05376782d0ddb90a13e1756eb1b1976)
1/*
2 * Copyright (c) 2014 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 * THE SOFTWARE.
21 */
22
23#include "sincos_helpers.h"
24#include <clc/clc.h>
25#include <clc/integer/clc_clz.h>
26#include <clc/integer/clc_mul_hi.h>
27#include <clc/math/clc_mad.h>
28#include <clc/math/clc_trunc.h>
29#include <clc/math/math.h>
30#include <clc/math/tables.h>
31#include <clc/shared/clc_max.h>
32
33#define bitalign(hi, lo, shift) ((hi) << (32 - (shift))) | ((lo) >> (shift));
34
35#define bytealign(src0, src1, src2)                                            \
36  ((uint)(((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3) * 8)))
37
38_CLC_DEF float __clc_sinf_piby4(float x, float y) {
39  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
40  // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
41  // = x * f(w)
42  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
43  // We use a minimax approximation of (f(w) - 1) / w
44  // because this produces an expansion in even powers of x.
45
46  const float c1 = -0.1666666666e0f;
47  const float c2 = 0.8333331876e-2f;
48  const float c3 = -0.198400874e-3f;
49  const float c4 = 0.272500015e-5f;
50  const float c5 = -2.5050759689e-08f; // 0xb2d72f34
51  const float c6 = 1.5896910177e-10f;  // 0x2f2ec9d3
52
53  float z = x * x;
54  float v = z * x;
55  float r = __clc_mad(
56      z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), c3), c2);
57  float ret =
58      x - __clc_mad(v, -c1, __clc_mad(z, __clc_mad(y, 0.5f, -v * r), -y));
59
60  return ret;
61}
62
63_CLC_DEF float __clc_cosf_piby4(float x, float y) {
64  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
65  // = f(w)
66  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
67  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
68  // because this produces an expansion in even powers of x.
69
70  const float c1 = 0.416666666e-1f;
71  const float c2 = -0.138888876e-2f;
72  const float c3 = 0.248006008e-4f;
73  const float c4 = -0.2730101334e-6f;
74  const float c5 = 2.0875723372e-09f;  // 0x310f74f6
75  const float c6 = -1.1359647598e-11f; // 0xad47d74e
76
77  float z = x * x;
78  float r =
79      z *
80      __clc_mad(
81          z,
82          __clc_mad(z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), c3),
83                    c2),
84          c1);
85
86  // if |x| < 0.3
87  float qx = 0.0f;
88
89  int ix = as_int(x) & EXSIGNBIT_SP32;
90
91  //  0.78125 > |x| >= 0.3
92  float xby4 = as_float(ix - 0x01000000);
93  qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx;
94
95  // x > 0.78125
96  qx = ix > 0x3f480000 ? 0.28125f : qx;
97
98  float hz = __clc_mad(z, 0.5f, -qx);
99  float a = 1.0f - qx;
100  float ret = a - (hz - __clc_mad(z, r, -x * y));
101  return ret;
102}
103
104_CLC_DEF float __clc_tanf_piby4(float x, int regn) {
105  // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4].
106  float r = x * x;
107
108  float a =
109      __clc_mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f);
110
111  float b = __clc_mad(
112      r,
113      __clc_mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f),
114      1.15588821434688393452299f);
115
116  float t = __clc_mad(x * r, native_divide(a, b), x);
117  float tr = -MATH_RECIP(t);
118
119  return regn & 1 ? tr : t;
120}
121
122_CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh,
123                             float bt) {
124  if (HAVE_HW_FMA32()) {
125    float ph = a * b;
126    *hi = ph;
127    *lo = fma(a, b, -ph);
128  } else {
129    float ah = as_float(as_uint(a) & 0xfffff000U);
130    float at = a - ah;
131    float ph = a * b;
132    float pt = __clc_mad(
133        at, bt, __clc_mad(at, bh, __clc_mad(ah, bt, __clc_mad(ah, bh, -ph))));
134    *hi = ph;
135    *lo = pt;
136  }
137}
138
139_CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x) {
140  // 72 bits of pi/2
141  const float fpiby2_1 = (float)0xC90FDA / 0x1.0p+23f;
142  const float fpiby2_1_h = (float)0xC90 / 0x1.0p+11f;
143  const float fpiby2_1_t = (float)0xFDA / 0x1.0p+23f;
144
145  const float fpiby2_2 = (float)0xA22168 / 0x1.0p+47f;
146  const float fpiby2_2_h = (float)0xA22 / 0x1.0p+35f;
147  const float fpiby2_2_t = (float)0x168 / 0x1.0p+47f;
148
149  const float fpiby2_3 = (float)0xC234C4 / 0x1.0p+71f;
150  const float fpiby2_3_h = (float)0xC23 / 0x1.0p+59f;
151  const float fpiby2_3_t = (float)0x4C4 / 0x1.0p+71f;
152
153  const float twobypi = 0x1.45f306p-1f;
154
155  float fnpi2 = __clc_trunc(__clc_mad(x, twobypi, 0.5f));
156
157  // subtract n * pi/2 from x
158  float rhead, rtail;
159  __clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t);
160  float v = x - rhead;
161  float rem = v + (((x - v) - rhead) - rtail);
162
163  float rhead2, rtail2;
164  __clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t);
165  v = rem - rhead2;
166  rem = v + (((rem - v) - rhead2) - rtail2);
167
168  float rhead3, rtail3;
169  __clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t);
170  v = rem - rhead3;
171
172  *hi = v + ((rem - v) - rhead3);
173  *lo = -rtail3;
174  return fnpi2;
175}
176
177_CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x) {
178  float fnpi2 = __clc_removePi2S(r, rr, x);
179  return (int)fnpi2 & 0x3;
180}
181
182#define FULL_MUL(A, B, HI, LO)                                                 \
183  LO = A * B;                                                                  \
184  HI = __clc_mul_hi(A, B)
185
186#define FULL_MAD(A, B, C, HI, LO)                                              \
187  LO = ((A) * (B) + (C));                                                      \
188  HI = __clc_mul_hi(A, B);                                                     \
189  HI += LO < C
190
191_CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x) {
192  int xe = (int)(as_uint(x) >> 23) - 127;
193  uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU);
194
195  // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041
196  // FE5163AB
197  const uint b6 = 0xA2F9836EU;
198  const uint b5 = 0x4E441529U;
199  const uint b4 = 0xFC2757D1U;
200  const uint b3 = 0xF534DDC0U;
201  const uint b2 = 0xDB629599U;
202  const uint b1 = 0x3C439041U;
203  const uint b0 = 0xFE5163ABU;
204
205  uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1;
206
207  FULL_MUL(xm, b0, c0, p0);
208  FULL_MAD(xm, b1, c0, c1, p1);
209  FULL_MAD(xm, b2, c1, c0, p2);
210  FULL_MAD(xm, b3, c0, c1, p3);
211  FULL_MAD(xm, b4, c1, c0, p4);
212  FULL_MAD(xm, b5, c0, c1, p5);
213  FULL_MAD(xm, b6, c1, p7, p6);
214
215  uint fbits = 224 + 23 - xe;
216
217  // shift amount to get 2 lsb of integer part at top 2 bits
218  //   min: 25 (xe=18) max: 134 (xe=127)
219  uint shift = 256U - 2 - fbits;
220
221  // Shift by up to 134/32 = 4 words
222  int c = shift > 31;
223  p7 = c ? p6 : p7;
224  p6 = c ? p5 : p6;
225  p5 = c ? p4 : p5;
226  p4 = c ? p3 : p4;
227  p3 = c ? p2 : p3;
228  p2 = c ? p1 : p2;
229  p1 = c ? p0 : p1;
230  shift -= (-c) & 32;
231
232  c = shift > 31;
233  p7 = c ? p6 : p7;
234  p6 = c ? p5 : p6;
235  p5 = c ? p4 : p5;
236  p4 = c ? p3 : p4;
237  p3 = c ? p2 : p3;
238  p2 = c ? p1 : p2;
239  shift -= (-c) & 32;
240
241  c = shift > 31;
242  p7 = c ? p6 : p7;
243  p6 = c ? p5 : p6;
244  p5 = c ? p4 : p5;
245  p4 = c ? p3 : p4;
246  p3 = c ? p2 : p3;
247  shift -= (-c) & 32;
248
249  c = shift > 31;
250  p7 = c ? p6 : p7;
251  p6 = c ? p5 : p6;
252  p5 = c ? p4 : p5;
253  p4 = c ? p3 : p4;
254  shift -= (-c) & 32;
255
256  // bitalign cannot handle a shift of 32
257  c = shift > 0;
258  shift = 32 - shift;
259  uint t7 = bitalign(p7, p6, shift);
260  uint t6 = bitalign(p6, p5, shift);
261  uint t5 = bitalign(p5, p4, shift);
262  p7 = c ? t7 : p7;
263  p6 = c ? t6 : p6;
264  p5 = c ? t5 : p5;
265
266  // Get 2 lsb of int part and msb of fraction
267  int i = p7 >> 29;
268
269  // Scoot up 2 more bits so only fraction remains
270  p7 = bitalign(p7, p6, 30);
271  p6 = bitalign(p6, p5, 30);
272  p5 = bitalign(p5, p4, 30);
273
274  // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5
275  uint flip = i & 1 ? 0xffffffffU : 0U;
276  uint sign = i & 1 ? 0x80000000U : 0U;
277  p7 = p7 ^ flip;
278  p6 = p6 ^ flip;
279  p5 = p5 ^ flip;
280
281  // Find exponent and shift away leading zeroes and hidden bit
282  xe = __clc_clz(p7) + 1;
283  shift = 32 - xe;
284  p7 = bitalign(p7, p6, shift);
285  p6 = bitalign(p6, p5, shift);
286
287  // Most significant part of fraction
288  float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9));
289
290  // Shift out bits we captured on q1
291  p7 = bitalign(p7, p6, 32 - 23);
292
293  // Get 24 more bits of fraction in another float, there are not long strings
294  // of zeroes here
295  int xxe = __clc_clz(p7) + 1;
296  p7 = bitalign(p7, p6, 32 - xxe);
297  float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9));
298
299  // At this point, the fraction q1 + q0 is correct to at least 48 bits
300  // Now we need to multiply the fraction by pi/2
301  // This loses us about 4 bits
302  // pi/2 = C90 FDA A22 168 C23 4C4
303
304  const float pio2h = (float)0xc90fda / 0x1.0p+23f;
305  const float pio2hh = (float)0xc90 / 0x1.0p+11f;
306  const float pio2ht = (float)0xfda / 0x1.0p+23f;
307  const float pio2t = (float)0xa22168 / 0x1.0p+47f;
308
309  float rh, rt;
310
311  if (HAVE_HW_FMA32()) {
312    rh = q1 * pio2h;
313    rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh)));
314  } else {
315    float q1h = as_float(as_uint(q1) & 0xfffff000);
316    float q1t = q1 - q1h;
317    rh = q1 * pio2h;
318    rt = __clc_mad(
319        q1t, pio2ht,
320        __clc_mad(q1t, pio2hh,
321                  __clc_mad(q1h, pio2ht, __clc_mad(q1h, pio2hh, -rh))));
322    rt = __clc_mad(q0, pio2h, __clc_mad(q1, pio2t, rt));
323  }
324
325  float t = rh + rt;
326  rt = rt - (t - rh);
327
328  *r = t;
329  *rr = rt;
330  return ((i >> 1) + (i & 1)) & 0x3;
331}
332
333_CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) {
334  if (x < 0x1.0p+23f)
335    return __clc_argReductionSmallS(r, rr, x);
336  else
337    return __clc_argReductionLargeS(r, rr, x);
338}
339
340#ifdef cl_khr_fp64
341
342#pragma OPENCL EXTENSION cl_khr_fp64 : enable
343
344// Reduction for medium sized arguments
345_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr,
346                                           int *regn) {
347  // How many pi/2 is x a multiple of?
348  const double two_by_pi = 0x1.45f306dc9c883p-1;
349  double dnpi2 = __clc_trunc(fma(x, two_by_pi, 0.5));
350
351  const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
352  const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
353  const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
354
355  // Compute product of npi2 with 159 bits of 2/pi
356  double p_hh = piby2_h * dnpi2;
357  double p_ht = fma(piby2_h, dnpi2, -p_hh);
358  double p_mh = piby2_m * dnpi2;
359  double p_mt = fma(piby2_m, dnpi2, -p_mh);
360  double p_th = piby2_t * dnpi2;
361  double p_tt = fma(piby2_t, dnpi2, -p_th);
362
363  // Reduce to 159 bits
364  double ph = p_hh;
365  double pm = p_ht + p_mh;
366  double t = p_mh - (pm - p_ht);
367  double pt = p_th + t + p_mt + p_tt;
368  t = ph + pm;
369  pm = pm - (t - ph);
370  ph = t;
371  t = pm + pt;
372  pt = pt - (t - pm);
373  pm = t;
374
375  // Subtract from x
376  t = x + ph;
377  double qh = t + pm;
378  double qt = pm - (qh - t) + pt;
379
380  *r = qh;
381  *rr = qt;
382  *regn = (int)(long)dnpi2 & 0x3;
383}
384
385// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
386// extra precision, and return the result in r, rr.
387// Return value "regn" tells how many lots of pi/2 were subtracted
388// from x to put it in the range [-pi/4,pi/4], mod 4.
389
390_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr,
391                                          int *regn) {
392
393  long ux = as_long(x);
394  int e = (int)(ux >> 52) - 1023;
395  int i = __clc_max(23, (e >> 3) + 17);
396  int j = 150 - i;
397  int j16 = j & ~0xf;
398  double fract_temp;
399
400  // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary
401  // byte boundary
402  uint4 q0 = USE_TABLE(pibits_tbl, j16);
403  uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
404  uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
405
406  int k = (j >> 2) & 0x3;
407  int4 c = (int4)k == (int4)(0, 1, 2, 3);
408
409  uint u0, u1, u2, u3, u4, u5, u6;
410
411  u0 = c.s1 ? q0.s1 : q0.s0;
412  u0 = c.s2 ? q0.s2 : u0;
413  u0 = c.s3 ? q0.s3 : u0;
414
415  u1 = c.s1 ? q0.s2 : q0.s1;
416  u1 = c.s2 ? q0.s3 : u1;
417  u1 = c.s3 ? q1.s0 : u1;
418
419  u2 = c.s1 ? q0.s3 : q0.s2;
420  u2 = c.s2 ? q1.s0 : u2;
421  u2 = c.s3 ? q1.s1 : u2;
422
423  u3 = c.s1 ? q1.s0 : q0.s3;
424  u3 = c.s2 ? q1.s1 : u3;
425  u3 = c.s3 ? q1.s2 : u3;
426
427  u4 = c.s1 ? q1.s1 : q1.s0;
428  u4 = c.s2 ? q1.s2 : u4;
429  u4 = c.s3 ? q1.s3 : u4;
430
431  u5 = c.s1 ? q1.s2 : q1.s1;
432  u5 = c.s2 ? q1.s3 : u5;
433  u5 = c.s3 ? q2.s0 : u5;
434
435  u6 = c.s1 ? q1.s3 : q1.s2;
436  u6 = c.s2 ? q2.s0 : u6;
437  u6 = c.s3 ? q2.s1 : u6;
438
439  uint v0 = bytealign(u1, u0, j);
440  uint v1 = bytealign(u2, u1, j);
441  uint v2 = bytealign(u3, u2, j);
442  uint v3 = bytealign(u4, u3, j);
443  uint v4 = bytealign(u5, u4, j);
444  uint v5 = bytealign(u6, u5, j);
445
446  // Place those 192 bits in 4 48-bit doubles along with correct exponent
447  // If i > 1018 we would get subnormals so we scale p up and x down to get the
448  // same product
449  i = 2 + 8 * i;
450  x *= i > 1018 ? 0x1.0p-136 : 1.0;
451  i -= i > 1018 ? 136 : 0;
452
453  uint ua = (uint)(1023 + 52 - i) << 20;
454  double a = as_double((uint2)(0, ua));
455  double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a;
456  ua += 0x03000000U;
457  a = as_double((uint2)(0, ua));
458  double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
459  ua += 0x03000000U;
460  a = as_double((uint2)(0, ua));
461  double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
462  ua += 0x03000000U;
463  a = as_double((uint2)(0, ua));
464  double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
465
466  // Exact multiply
467  double f0h = p0 * x;
468  double f0l = fma(p0, x, -f0h);
469  double f1h = p1 * x;
470  double f1l = fma(p1, x, -f1h);
471  double f2h = p2 * x;
472  double f2l = fma(p2, x, -f2h);
473  double f3h = p3 * x;
474  double f3l = fma(p3, x, -f3h);
475
476  // Accumulate product into 4 doubles
477  double s, t;
478
479  double f3 = f3h + f2h;
480  t = f2h - (f3 - f3h);
481  s = f3l + t;
482  t = t - (s - f3l);
483
484  double f2 = s + f1h;
485  t = f1h - (f2 - s) + t;
486  s = f2l + t;
487  t = t - (s - f2l);
488
489  double f1 = s + f0h;
490  t = f0h - (f1 - s) + t;
491  s = f1l + t;
492
493  double f0 = s + f0l;
494
495  // Strip off unwanted large integer bits
496  f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
497  f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
498
499  // Compute least significant integer bits
500  t = f3 + f2;
501  double di = t - fract(t, &fract_temp);
502  i = (float)di;
503
504  // Shift out remaining integer part
505  f3 -= di;
506  s = f3 + f2;
507  t = f2 - (s - f3);
508  f3 = s;
509  f2 = t;
510  s = f2 + f1;
511  t = f1 - (s - f2);
512  f2 = s;
513  f1 = t;
514  f1 += f0;
515
516  // Subtract 1 if fraction is >= 0.5, and update regn
517  int g = f3 >= 0.5;
518  i += g;
519  f3 -= (float)g;
520
521  // Shift up bits
522  s = f3 + f2;
523  t = f2 - (s - f3);
524  f3 = s;
525  f2 = t + f1;
526
527  // Multiply precise fraction by pi/2 to get radians
528  const double p2h = 7074237752028440.0 / 0x1.0p+52;
529  const double p2t = 4967757600021510.0 / 0x1.0p+106;
530
531  double rhi = f3 * p2h;
532  double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
533
534  *r = rhi + rlo;
535  *rr = rlo - (*r - rhi);
536  *regn = i & 0x3;
537}
538
539_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
540  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
541  //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
542  //                      = x * f(w)
543  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
544  // We use a minimax approximation of (f(w) - 1) / w
545  // because this produces an expansion in even powers of x.
546  // If xx (the tail of x) is non-zero, we add a correction
547  // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
548  // is an approximation to cos(x)*sin(xx) valid because
549  // xx is tiny relative to x.
550
551  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
552  //                      = f(w)
553  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
554  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
555  // because this produces an expansion in even powers of x.
556  // If xx (the tail of x) is non-zero, we subtract a correction
557  // term g(x,xx) = x*xx to the result, where g(x,xx)
558  // is an approximation to sin(x)*sin(xx) valid because
559  // xx is tiny relative to x.
560
561  const double sc1 = -0.166666666666666646259241729;
562  const double sc2 = 0.833333333333095043065222816e-2;
563  const double sc3 = -0.19841269836761125688538679e-3;
564  const double sc4 = 0.275573161037288022676895908448e-5;
565  const double sc5 = -0.25051132068021699772257377197e-7;
566  const double sc6 = 0.159181443044859136852668200e-9;
567
568  const double cc1 = 0.41666666666666665390037e-1;
569  const double cc2 = -0.13888888888887398280412e-2;
570  const double cc3 = 0.248015872987670414957399e-4;
571  const double cc4 = -0.275573172723441909470836e-6;
572  const double cc5 = 0.208761463822329611076335e-8;
573  const double cc6 = -0.113826398067944859590880e-10;
574
575  double x2 = x * x;
576  double x3 = x2 * x;
577  double r = 0.5 * x2;
578  double t = 1.0 - r;
579
580  double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
581
582  double cp =
583      t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2),
584                  x2, cc1),
585              x2 * x2, fma(x, xx, (1.0 - t) - r));
586
587  double2 ret;
588  ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5 * xx), x2, -xx));
589  ret.hi = cp;
590
591  return ret;
592}
593
594#endif
595