/*
 * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include <clc/clc.h>
#include <clc/clcmacro.h>
#include <clc/math/math.h>
#include <clc/math/tables.h>

/*
 * Single-precision atan2pi: the angle of the point (x, y) expressed in units
 * of pi, so the result lies in [-1, 1].
 *
 * The ratio min(|x|,|y|) / max(|x|,|y|) is fed to a rational approximation of
 * atan, the value is divided by pi, and the remaining octants, zeros,
 * infinities and NaNs are patched in with selects. The sign of y is applied
 * last.
 */
_CLC_OVERLOAD _CLC_DEF float atan2pi(float y, float x) {
    const float pi = 0x1.921fb6p+1f;

    const float abs_x = fabs(x);
    const float abs_y = fabs(y);
    const float mag_min = min(abs_x, abs_y);
    const float mag_max = max(abs_x, abs_y);

    // Pre-scale the denominator when it is very large, as a "regular" divide
    // would, then undo the scaling on the quotient.
    const float scale = mag_max > 0x1.0p+96f ? 0x1.0p-32f : 1.0f;
    const float ratio = scale * MATH_DIVIDE(mag_min, scale * mag_max);

    const float ratio2 = ratio * ratio;

    // Numerator and denominator of the rational correction term.
    const float num = mad(ratio2, mad(ratio2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * ratio2 * ratio;
    const float den = mad(ratio2, mad(ratio2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f);

    // Result for octant 0, already divided by pi.
    float result = MATH_DIVIDE(mad(num, MATH_RECIP(den), ratio), pi);

    // Reflect into the other octants: first across the diagonal (|y| > |x|),
    // then across the y axis (x < 0).
    result = (abs_y > abs_x) ? 0.5f - result : result;
    result = (x < 0.0f) ? 1.0f - result : result;

    // y == 0: 0 when the sign bit of x is clear, 1 when it is set.
    const float on_x_axis = (as_int(x) < 0) ? 1.0f : 0.0f;
    result = (y == 0.0f) ? on_x_axis : result;

    // Both x and y infinite: 1/4 for positive x, 3/4 otherwise.
    const float both_inf = (x > 0.0f) ? 0.25f : 0.75f;
    result = ((abs_x == INFINITY) & (abs_y == INFINITY)) ? both_inf : result;

    // Either argument NaN: return a quiet NaN.
    result = (isnan(x) | isnan(y)) ? as_float(QNANBITPATT_SP32) : result;

    // Everything above was computed for y >= 0; transfer the sign of y.
    return copysign(result, y);
}

_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, atan2pi, float, float)

#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/*
 * Double-precision atan2pi: the angle of the point (x, y) expressed in units
 * of pi.
 *
 * The general result is built as a head/tail pair from either a table lookup
 * plus a short series (v/u > 1/16) or a direct polynomial (smaller v/u), where
 * u and v are the larger and smaller magnitudes. It is then mapped to the
 * right quadrant and divided by pi. A chain of selects afterwards overrides
 * that value for extreme exponent differences, zeros, infinities and NaNs;
 * later selects take precedence over earlier ones.
 */
_CLC_OVERLOAD _CLC_DEF double atan2pi(double y, double x) {
    const double pi = 3.1415926535897932e+00;          /* 0x400921fb54442d18 */
    const double pi_head = 3.1415926218032836e+00;     /* 0x400921fb50000000 */
    const double pi_tail = 3.1786509547056392e-08;     /* 0x3e6110b4611a6263 */
    const double piby2_head = 1.5707963267948965e+00;  /* 0x3ff921fb54442d18 */
    const double piby2_tail = 6.1232339957367660e-17;  /* 0x3c91a62633145c07 */

    // Remember the caller's values; x and y themselves may be rescaled below.
    const double x_in = x;
    const double y_in = y;

    // Sign bit and biased exponent, read from the high 32-bit word.
    const int xneg = as_int2(x).hi < 0;
    const int xexp = (as_int2(x).hi >> 20) & 0x7ff;
    const int yneg = as_int2(y).hi < 0;
    const int yexp = (as_int2(y).hi >> 20) & 0x7ff;

    // True when both biased exponents are below 1021, i.e. both magnitudes
    // are below 1/4. In that case both arguments are scaled up by 2^1024.
    const int both_small = (xexp < 1021) & (yexp < 1021);

    const double x_scaled = ldexp(x, 1024);
    const int xexp_scaled = (as_int2(x_scaled).hi >> 20) & 0x7ff;
    const double y_scaled = ldexp(y, 1024);
    const int yexp_scaled = (as_int2(y_scaled).hi >> 20) & 0x7ff;

    const int diffexp = both_small ? (yexp_scaled - xexp_scaled) : (yexp - xexp);
    x = both_small ? x_scaled : x;
    y = both_small ? y_scaled : y;

    // General case: work on magnitudes, ordered so that v <= u.
    const double abs_x = fabs(x);
    const double abs_y = fabs(y);
    const int swap_vu = abs_x < abs_y;
    const double u = swap_vu ? abs_y : abs_x;
    const double v = swap_vu ? abs_x : abs_y;

    const double vbyu = v / u;

    // Path 1: general v/u. Table lookup plus a short series around the
    // tabulated point c = index / 256.
    double tbl_head, tbl_tail;
    {
        const double val = vbyu > 0.0625 ? vbyu : 0.063;
        const int index = convert_int(fma(256.0, val, 0.5));
        const double2 entry = USE_TABLE(atan_jby256_tbl, (index - 16));
        const double c = (double)index * 0x1.0p-8;

        // Scale u and v by 2^(-exponent of u) to bring u close to 1.
        const int m = -((int)(as_ulong(u) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
        const double um = ldexp(u, m);
        const double vm = ldexp(v, m);

        // Split um into its 26 leading bits and the remainder.
        const double um_head = as_double(as_ulong(um) & 0xfffffffff8000000UL);
        const double um_tail = um - um_head;

        const double r = MATH_DIVIDE(fma(-c, um_tail, fma(-c, um_head, vm)), fma(c, vm, um));

        // Polynomial approximation to atan(r), added to the table's tail.
        const double s = r * r;
        tbl_head = entry.s0;
        tbl_tail = entry.s1 + fma((s * fma(-s, 0.19999918038989143496, 0.33333333333224095522)), -r, r);
    }

    // Path 2: small v/u. Polynomial in v/u plus a correction for the rounding
    // error of the quotient, computed from 32-bit head/tail splits.
    double poly;
    {
        const double u_head = as_double(as_ulong(u) & 0xffffffff00000000UL);
        const double u_tail = u - u_head;
        const double vbyu_head = as_double(as_ulong(vbyu) & 0xffffffff00000000UL);
        const double vbyu_tail = vbyu - vbyu_head;

        const double s = vbyu * vbyu;
        poly = vbyu + fma(-vbyu * s,
                          fma(-s,
                              fma(-s,
                                  fma(-s,
                                      fma(-s, 0.90029810285449784439E-01,
                                          0.11110736283514525407),
                                      0.14285713561807169030),
                                  0.19999999999393223405),
                              0.33333333333333170500),
                          MATH_DIVIDE(fma(-u, vbyu_tail, fma(-u_tail, vbyu_head, fma(-u_head, vbyu_head, v))), u));
    }

    // Below the tiny threshold v/u itself is used; otherwise the polynomial.
    // Both small-ratio variants have a zero head.
    const double small_tail = vbyu < 0x1.d12ed0af1a27fp-27 ? vbyu : poly;

    const int use_table = vbyu > 0.0625;
    double q1 = use_table ? tbl_head : 0.0;
    double q2 = use_table ? tbl_tail : small_tail;

    // Map the head/tail pair into the right quadrant, then divide by pi.
    q1 = swap_vu ? piby2_head - q1 : q1;
    q2 = swap_vu ? piby2_tail - q2 : q2;
    q1 = xneg ? pi_head - q1 : q1;
    q2 = xneg ? pi_tail - q2 : q2;
    q1 = MATH_DIVIDE(q1 + q2, pi);

    // Constants carrying the sign of y, used by the special cases below.
    const double signed_one = yneg ? -1.0 : 1.0;
    const double signed_half = yneg ? -0.5 : 0.5;
    const double signed_quarter = yneg ? -0.25 : 0.25;
    const double signed_three_quarters = yneg ? -0.75 : 0.75;

    // Start from the general result with the sign of y applied.
    double result = yneg ? -q1 : q1;

    // Both arguments infinite: +-3/4 for negative x, +-1/4 otherwise.
    const double both_inf = xneg ? signed_three_quarters : signed_quarter;
    result = (isinf(y_in) & isinf(x_in)) ? both_inf : result;

    // abs(x)/abs(y) > 2^56 and x < 0: the result is +-1.
    result = ((diffexp < -56) && xneg) ? signed_one : result;

    // x positive and dominant over y by a factor of 2^28: use (y/x)/pi.
    const double tiny_angle = MATH_DIVIDE(MATH_DIVIDE(y, x), pi);
    result = ((diffexp < -28) & (xneg == 0)) ? tiny_angle : result;

    // abs(y)/abs(x) > 2^56: atan(y/x) is insignificant compared to pi/2, so
    // the result is +-1/2.
    result = (diffexp > 56) ? signed_half : result;

    // Zero x gives +-1/2 depending on the sign of y.
    result = (x_in == 0.0) ? signed_half : result;

    // Zero y gives +-0 for positive x and +-1 for negative x.
    const double on_x_axis = xneg ? signed_one : y_in;
    result = (y_in == 0.0) ? on_x_axis : result;

    // NaN in either argument propagates; a NaN x wins over a NaN y.
    result = isnan(y_in) ? y_in : result;
    result = isnan(x_in) ? x_in : result;

    return result;
}

_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atan2pi, double, double)

#endif

#ifdef cl_khr_fp16

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

_CLC_DEFINE_BINARY_BUILTIN_FP16(atan2pi)

#endif