/* Copyright (C) 2007-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#ifndef _SQRT_MACROS_H_
#define _SQRT_MACROS_H_

/* Integer square-root helpers for the BID decimal floating-point
   library:

     short_sqrt128 (A10)        -- 64-bit integer approximation of
                                   sqrt(A10) for a 128-bit coefficient.
     long_sqrt128 (pCS, C256)   -- 128-bit integer approximation of
                                   sqrt(C256) for a 256-bit coefficient,
                                   stored through *pCS.

   Two implementations are selected at compile time:
     - DOUBLE_EXTENDED_ON: uses the 80-bit extended type BINARY80 and a
       hardware SQRT80, with compensated (FENCE'd) summation to recover
       the conversion error of the multi-word operand.
     - otherwise: uses double-precision 1/sqrt as a seed and corrects it
       with multi-word integer Newton-style error terms.

   FENCE forces the compiler to evaluate an FP expression exactly as
   written (no reassociation/reordering); the compensated-summation and
   error-term code below is only correct if these operations happen in
   the stated order.  */
#define FENCE __fence

#if DOUBLE_EXTENDED_ON

extern BINARY80 SQRT80 (BINARY80);


/* Return (UINT64) sqrt (A10) where A10 = A10.w[1]*2^64 + A10.w[0],
   computed via the 80-bit extended type and hardware SQRT80.  */
__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  BINARY80 lx, ly, l64;
  int_float f64;

  // 2^64 (0x5f800000 is the IEEE single-precision encoding of 2^64)
  f64.i = 0x5f800000;
  l64 = (BINARY80) f64.d;
  // Assemble the 128-bit operand in extended precision: hi*2^64 + lo.
  lx = (BINARY80) A10.w[1] * l64 + (BINARY80) A10.w[0];
  ly = SQRT80 (lx);
  return (UINT64) ly;
}


/* Store in *pCS a 128-bit approximation of sqrt (C256), where C256 is
   a 256-bit coefficient spread over C256.w[3..0] (w[3] most
   significant).  The 80-bit sqrt of the (rounded) sum of the four
   words is refined by a first-order correction computed from the
   compensated rounding error.  */
__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT256 C4;
  UINT128 CS;
  UINT64 X;
  SINT64 SE;
  BINARY80 l64, lm64, l128, lxL, lx, ly, lS, lSH, lSL, lE, l3, l2,
    l1, l0, lp, lCl;
  int_float fx, f64, fm64;
  // NOTE(review): C4, ly, fx and ple appear unused in this function.
  int *ple = (int *) &lx;

  // 2^64 (IEEE single-precision bit pattern)
  f64.i = 0x5f800000;
  l64 = (BINARY80) f64.d;

  // lx := C256.w[3]*2^192 + C256.w[2]*2^128 + C256.w[1]*2^64 + C256.w[0]
  // Each partial sum is FENCE'd so l3, l2, l1, l0 can later be used to
  // recover the rounding error of this summation.
  l128 = l64 * l64;
  lx = l3 = (BINARY80) C256.w[3] * l64 * l128;
  l2 = (BINARY80) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (BINARY80) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (BINARY80) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  lS = SQRT80 (lx);

  // get coefficient: split lS into two 64-bit words, CS.w[1]*2^64 + CS.w[0]
  // 2^(-64) (IEEE single-precision bit pattern)
  fm64.i = 0x1f800000;
  lm64 = (BINARY80) fm64.d;
  CS.w[1] = (UINT64) (lS * lm64);
  CS.w[0] = (UINT64) (lS - (BINARY80) CS.w[1] * l64);

  ///////////////////////////////////////
  // CAUTION!
  // little endian code only
  // add solution for big endian
  //////////////////////////////////////
  // lSH := lS with the low 32 bits of its significand cleared, so that
  // lSH*lSH below is exact; lSL is the discarded low part.
  lSH = lS;
  *((UINT64 *) & lSH) &= 0xffffffff00000000ull;

  // correction for C256 rounding: lCl accumulates the part of
  // l3+l2+l1+l0 that was lost when forming lx (compensated summation).
  lCl = FENCE (l3 - lx);
  lCl = FENCE (lCl + l2);
  lCl = FENCE (lCl + l1);
  lCl = FENCE (lCl + l0);

  lSL = lS - lSH;

  //////////////////////////////////////////
  // Watch for compiler re-ordering
  //
  /////////////////////////////////////////
  // C256-S^2, expanded as lx - (lSH+lSL)^2 term by term so each
  // product/difference stays exact in extended precision.
  lxL = FENCE (lx - lSH * lSH);
  lp = lSH * lSL;
  lp += lp;
  lxL = FENCE (lxL - lp);
  lSL *= lSL;
  lxL = FENCE (lxL - lSL);
  lCl += lxL;

  // correction term: first-order expansion
  // sqrt(x + e) ~= sqrt(x) + e / (2*sqrt(x))
  lE = lCl / (lS + lS);

  // get low part of coefficient: apply the signed correction to the
  // 128-bit integer CS with explicit carry/borrow propagation.
  X = CS.w[0];
  if (lCl >= 0) {
    SE = (SINT64) (lE);
    CS.w[0] += SE;
    if (CS.w[0] < X)    // carry out of the low word
      CS.w[1]++;
  } else {
    SE = (SINT64) (-lE);
    CS.w[0] -= SE;
    if (CS.w[0] > X)    // borrow out of the low word
      CS.w[1]--;
  }

  pCS->w[0] = CS.w[0];
  pCS->w[1] = CS.w[1];
}

#else

extern double sqrt (double);

/* Return (UINT64) sqrt (A10) where A10 = A10.w[1]*2^64 + A10.w[0].
   Without extended precision, seed with RS = 1/sqrt(A10) in double,
   then form S ~= A10*RS and remove the first-order seed error eps
   (S*(1 -/+ eps)) using multi-word integer arithmetic.  */
__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  UINT256 ARS, ARS0, AE0, AE, S;

  UINT64 MY, ES, CY;
  double lx, l64;
  int_double f64, ly;
  int ey, k;

  // 2^64 (0x43f0000000000000 is the IEEE double encoding of 2^64)
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;
  lx = (double) A10.w[1] * l64 + (double) A10.w[0];
  ly.d = 1.0 / sqrt (lx);

  // MY = significand of 1/sqrt(lx) with the implicit bit restored;
  // ey = negated unbiased exponent (0x3ff is the double bias).
  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);

  // A10*RS^2 (should be ~1; the deviation is the seed error eps)
  __mul_64x128_to_192 (ARS0, MY, A10);
  __mul_64x192_to_256 (ARS, MY, ARS0);

  // shr by 2*ey+40, to get a 64-bit value
  k = (ey << 1) + 104 - 64;
  if (k >= 128) {
    if (k > 128)
      ES = (ARS.w[2] >> (k - 128)) | (ARS.w[3] << (192 - k));
    else
      ES = ARS.w[2];
  } else {
    if (k >= 64) {
      ARS.w[0] = ARS.w[1];
      ARS.w[1] = ARS.w[2];
      k -= 64;
    }
    if (k) {
      __shr_128 (ARS, ARS, k);
    }
    ES = ARS.w[0];
  }

  // signed halve: eps = eps0/2
  ES = ((SINT64) ES) >> 1;

  if (((SINT64) ES) < 0) {
    ES = -ES;

    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    // S = A*RS + A*RS*eps (eps was negative, so add the magnitude)
    __add_carry_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] + AE.w[2] + CY;
  } else {
    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    // S = A*RS - A*RS*eps
    __sub_borrow_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] - AE.w[2] - CY;
  }

  // scale the result back down by its exponent
  k = ey + 51;

  if (k >= 64) {
    if (k >= 128) {
      S.w[0] = S.w[2];
      S.w[1] = 0;
      k -= 128;
    } else {
      S.w[0] = S.w[1];
      S.w[1] = S.w[2];
    }
    k -= 64;
  }
  if (k) {
    __shr_128 (S, S, k);
  }

  // round to nearest: add half an ulp, then drop the rounding bit
  return (UINT64) ((S.w[0] + 1) >> 1);

}



/* Store in *pCS a 128-bit approximation of sqrt (C256) for a 256-bit
   coefficient C256 (w[3] most significant).  Seeds with a double
   1/sqrt, then applies first-order (A*RS*eps) and second-order
   (A*RS*3/2*eps^2) corrections in multi-word integer arithmetic.  */
__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT512 ARS0, ARS;
  UINT256 ARS00, AE, AE2, S;
  UINT128 ES, ES2, ARS1;
  UINT64 ES32, CY, MY;
  double l64, l128, lx, l2, l1, l0;
  int_double f64, ly;
  int ey, k, k2;

  // 2^64 (IEEE double bit pattern)
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;

  // lx := sum of the four 64-bit words scaled to their positions;
  // FENCE keeps the accumulation order fixed.
  l128 = l64 * l64;
  lx = (double) C256.w[3] * l64 * l128;
  l2 = (double) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (double) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (double) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  ly.d = 1.0 / sqrt (lx);

  // significand (with implicit bit) and negated unbiased exponent of
  // the reciprocal-sqrt seed
  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);

  // A10*RS^2, scaled by 2^(2*ey+104)
  __mul_64x256_to_320 (ARS0, MY, C256);
  __mul_64x320_to_384 (ARS, MY, ARS0);

  // shr by k=(2*ey+104)-128
  // expect k is in the range (192, 256) if result in [10^33, 10^34)
  // apply an additional signed shift by 1 at the same time (to get eps=eps0/2)
  k = (ey << 1) + 104 - 128 - 192;
  k2 = 64 - k;
  ES.w[0] = (ARS.w[3] >> (k + 1)) | (ARS.w[4] << (k2 - 1));
  ES.w[1] = (ARS.w[4] >> k) | (ARS.w[5] << k2);
  ES.w[1] = ((SINT64) ES.w[1]) >> 1;

  // A*RS >> 192 (for error term computation)
  ARS1.w[0] = ARS0.w[3];
  ARS1.w[1] = ARS0.w[4];

  // A*RS>>64
  ARS00.w[0] = ARS0.w[1];
  ARS00.w[1] = ARS0.w[2];
  ARS00.w[2] = ARS0.w[3];
  ARS00.w[3] = ARS0.w[4];

  if (((SINT64) ES.w[1]) < 0) {
    // eps is negative: negate the 128-bit value (two's complement
    // across both words) and ADD the correction below.
    ES.w[0] = -ES.w[0];
    ES.w[1] = -ES.w[1];
    if (ES.w[0])
      ES.w[1]--;

    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    __add_carry_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __add_carry_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] + AE.w[3] + CY;
  } else {
    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    __sub_borrow_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __sub_borrow_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] - AE.w[3] - CY;
  }

  // 3/2*eps^2, scaled by 2^128
  ES32 = ES.w[1] + (ES.w[1] >> 1);
  __mul_64x64_to_128 (ES2, ES32, ES.w[1]);
  // A*RS*3/2*eps^2 (second-order term; always added -- eps^2 >= 0)
  __mul_128x128_to_256 (AE2, ES2, ARS1);

  // result, scaled by 2^(ey+52-64)
  __add_carry_out (S.w[0], CY, S.w[0], AE2.w[0]);
  __add_carry_in_out (S.w[1], CY, S.w[1], AE2.w[1], CY);
  __add_carry_in_out (S.w[2], CY, S.w[2], AE2.w[2], CY);
  S.w[3] = S.w[3] + AE2.w[3] + CY;

  // k in (0, 64)
  k = ey + 51 - 128;
  k2 = 64 - k;
  S.w[0] = (S.w[1] >> k) | (S.w[2] << k2);
  S.w[1] = (S.w[2] >> k) | (S.w[3] << k2);

  // round to nearest: add half an ulp with carry into the high word
  S.w[0]++;
  if (!S.w[0])
    S.w[1]++;

  // drop the rounding bit, re-packing the 128-bit result
  pCS->w[0] = (S.w[1] << 63) | (S.w[0] >> 1);
  pCS->w[1] = S.w[1] >> 1;

}

#endif
#endif