1 /* Copyright (C) 2007, 2009 Free Software Foundation, Inc. 2 3 This file is part of GCC. 4 5 GCC is free software; you can redistribute it and/or modify it under 6 the terms of the GNU General Public License as published by the Free 7 Software Foundation; either version 3, or (at your option) any later 8 version. 9 10 GCC is distributed in the hope that it will be useful, but WITHOUT ANY 11 WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 for more details. 14 15 Under Section 7 of GPL version 3, you are granted additional 16 permissions described in the GCC Runtime Library Exception, version 17 3.1, as published by the Free Software Foundation. 18 19 You should have received a copy of the GNU General Public License and 20 a copy of the GCC Runtime Library Exception along with this program; 21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 22 <http://www.gnu.org/licenses/>. */ 23 24 /***************************************************************************** 25 * BID64 square root 26 ***************************************************************************** 27 * 28 * Algorithm description: 29 * 30 * if(exponent_x is odd) 31 * scale coefficient_x by 10, adjust exponent 32 * - get lower estimate for number of digits in coefficient_x 33 * - scale coefficient x to between 31 and 33 decimal digits 34 * - in parallel, check for exact case and return if true 35 * - get high part of result coefficient using double precision sqrt 36 * - compute remainder and refine coefficient in one iteration (which 37 * modifies it by at most 1) 38 * - result exponent is easy to compute from the adjusted arg. exponent 39 * 40 ****************************************************************************/ 41 42 #include "bid_internal.h" 43 #include "bid_sqrt_macros.h" 44 #ifdef UNCHANGED_BINARY_STATUS_FLAGS 45 #include <fenv.h> 46 47 #define FE_ALL_FLAGS FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW|FE_INEXACT 48 #endif 49 50 extern double sqrt (double); 51 52 #if DECIMAL_CALL_BY_REFERENCE 53 54 void 55 bid64_sqrt (UINT64 * pres, 56 UINT64 * 57 px _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM 58 _EXC_INFO_PARAM) { 59 UINT64 x; 60 #else 61 62 UINT64 63 bid64_sqrt (UINT64 x _RND_MODE_PARAM _EXC_FLAGS_PARAM 64 _EXC_MASKS_PARAM _EXC_INFO_PARAM) { 65 #endif 66 UINT128 CA, CT; 67 UINT64 sign_x, coefficient_x; 68 UINT64 Q, Q2, A10, C4, R, R2, QE, res; 69 SINT64 D; 70 int_double t_scale; 71 int_float tempx; 72 double da, dq, da_h, da_l, dqe; 73 int exponent_x, exponent_q, bin_expon_cx; 74 int digits_x; 75 int scale; 76 #ifdef UNCHANGED_BINARY_STATUS_FLAGS 77 fexcept_t binaryflags = 0; 78 #endif 79 80 #if DECIMAL_CALL_BY_REFERENCE 81 #if !DECIMAL_GLOBAL_ROUNDING 82 _IDEC_round rnd_mode = *prnd_mode; 83 #endif 84 x = *px; 85 #endif 86 87 // unpack arguments, check for NaN or Infinity 88 if (!unpack_BID64 (&sign_x, &exponent_x, &coefficient_x, x)) { 89 // x is Inf. or NaN or 0 90 if ((x & INFINITY_MASK64) == INFINITY_MASK64) { 91 res = coefficient_x; 92 if ((coefficient_x & SSNAN_MASK64) == SINFINITY_MASK64) // -Infinity 93 { 94 res = NAN_MASK64; 95 #ifdef SET_STATUS_FLAGS 96 __set_status_flags (pfpsf, INVALID_EXCEPTION); 97 #endif 98 } 99 #ifdef SET_STATUS_FLAGS 100 if ((x & SNAN_MASK64) == SNAN_MASK64) // sNaN 101 __set_status_flags (pfpsf, INVALID_EXCEPTION); 102 #endif 103 BID_RETURN (res & QUIET_MASK64); 104 } 105 // x is 0 106 exponent_x = (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1; 107 res = sign_x | (((UINT64) exponent_x) << 53); 108 BID_RETURN (res); 109 } 110 // x<0? 111 if (sign_x && coefficient_x) { 112 res = NAN_MASK64; 113 #ifdef SET_STATUS_FLAGS 114 __set_status_flags (pfpsf, INVALID_EXCEPTION); 115 #endif 116 BID_RETURN (res); 117 } 118 #ifdef UNCHANGED_BINARY_STATUS_FLAGS 119 (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS); 120 #endif 121 //--- get number of bits in the coefficient of x --- 122 tempx.d = (float) coefficient_x; 123 bin_expon_cx = ((tempx.i >> 23) & 0xff) - 0x7f; 124 digits_x = estimate_decimal_digits[bin_expon_cx]; 125 // add test for range 126 if (coefficient_x >= power10_index_binexp[bin_expon_cx]) 127 digits_x++; 128 129 A10 = coefficient_x; 130 if (exponent_x & 1) { 131 A10 = (A10 << 2) + A10; 132 A10 += A10; 133 } 134 135 dqe = sqrt ((double) A10); 136 QE = (UINT32) dqe; 137 if (QE * QE == A10) { 138 res = 139 very_fast_get_BID64 (0, (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1, 140 QE); 141 #ifdef UNCHANGED_BINARY_STATUS_FLAGS 142 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS); 143 #endif 144 BID_RETURN (res); 145 } 146 // if exponent is odd, scale coefficient by 10 147 scale = 31 - digits_x; 148 exponent_q = exponent_x - scale; 149 scale += (exponent_q & 1); // exp. bias is even 150 151 CT = power10_table_128[scale]; 152 __mul_64x128_short (CA, coefficient_x, CT); 153 154 // 2^64 155 t_scale.i = 0x43f0000000000000ull; 156 // convert CA to DP 157 da_h = CA.w[1]; 158 da_l = CA.w[0]; 159 da = da_h * t_scale.d + da_l; 160 161 dq = sqrt (da); 162 163 Q = (UINT64) dq; 164 165 // get sign(sqrt(CA)-Q) 166 R = CA.w[0] - Q * Q; 167 R = ((SINT64) R) >> 63; 168 D = R + R + 1; 169 170 exponent_q = (exponent_q + DECIMAL_EXPONENT_BIAS) >> 1; 171 172 #ifdef SET_STATUS_FLAGS 173 __set_status_flags (pfpsf, INEXACT_EXCEPTION); 174 #endif 175 176 #ifndef IEEE_ROUND_NEAREST 177 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY 178 if (!((rnd_mode) & 3)) { 179 #endif 180 #endif 181 182 // midpoint to check 183 Q2 = Q + Q + D; 184 C4 = CA.w[0] << 2; 185 186 // get sign(-sqrt(CA)+Midpoint) 187 R2 = Q2 * Q2 - C4; 188 R2 = ((SINT64) R2) >> 63; 189 190 // adjust Q if R!=R2 191 Q += (D & (R ^ R2)); 192 #ifndef IEEE_ROUND_NEAREST 193 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY 194 } else { 195 C4 = CA.w[0]; 196 Q += D; 197 if ((SINT64) (Q * Q - C4) > 0) 198 Q--; 199 if (rnd_mode == ROUNDING_UP) 200 Q++; 201 } 202 #endif 203 #endif 204 205 res = fast_get_BID64 (0, exponent_q, Q); 206 #ifdef UNCHANGED_BINARY_STATUS_FLAGS 207 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS); 208 #endif 209 BID_RETURN (res); 210 } 211 212 213 TYPE0_FUNCTION_ARG1 (UINT64, bid64q_sqrt, x) 214 215 UINT256 M256, C4, C8; 216 UINT128 CX, CX2, A10, S2, T128, CS, CSM, CS2, C256, CS1, 217 mul_factor2_long = { {0x0ull, 0x0ull} }, QH, Tmp, TP128, Qh, Ql; 218 UINT64 sign_x, Carry, B10, res, mul_factor, mul_factor2 = 0x0ull, CS0; 219 SINT64 D; 220 int_float fx, f64; 221 int exponent_x, bin_expon_cx, done = 0; 222 int digits, scale, exponent_q = 0, exact = 1, amount, extra_digits; 223 #ifdef UNCHANGED_BINARY_STATUS_FLAGS 224 fexcept_t binaryflags = 0; 225 #endif 226 227 // unpack arguments, check for NaN or Infinity 228 if (!unpack_BID128_value (&sign_x, &exponent_x, &CX, x)) { 229 res = CX.w[1]; 230 // NaN ? 231 if ((x.w[1] & 0x7c00000000000000ull) == 0x7c00000000000000ull) { 232 #ifdef SET_STATUS_FLAGS 233 if ((x.w[1] & 0x7e00000000000000ull) == 0x7e00000000000000ull) // sNaN 234 __set_status_flags (pfpsf, INVALID_EXCEPTION); 235 #endif 236 Tmp.w[1] = (CX.w[1] & 0x00003fffffffffffull); 237 Tmp.w[0] = CX.w[0]; 238 TP128 = reciprocals10_128[18]; 239 __mul_128x128_full (Qh, Ql, Tmp, TP128); 240 amount = recip_scale[18]; 241 __shr_128 (Tmp, Qh, amount); 242 res = (CX.w[1] & 0xfc00000000000000ull) | Tmp.w[0]; 243 BID_RETURN (res); 244 } 245 // x is Infinity? 246 if ((x.w[1] & 0x7800000000000000ull) == 0x7800000000000000ull) { 247 if (sign_x) { 248 // -Inf, return NaN 249 res = 0x7c00000000000000ull; 250 #ifdef SET_STATUS_FLAGS 251 __set_status_flags (pfpsf, INVALID_EXCEPTION); 252 #endif 253 } 254 BID_RETURN (res); 255 } 256 // x is 0 otherwise 257 258 exponent_x = 259 ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) + 260 DECIMAL_EXPONENT_BIAS; 261 if (exponent_x < 0) 262 exponent_x = 0; 263 if (exponent_x > DECIMAL_MAX_EXPON_64) 264 exponent_x = DECIMAL_MAX_EXPON_64; 265 //res= sign_x | (((UINT64)exponent_x)<<53); 266 res = get_BID64 (sign_x, exponent_x, 0, rnd_mode, pfpsf); 267 BID_RETURN (res); 268 } 269 if (sign_x) { 270 res = 0x7c00000000000000ull; 271 #ifdef SET_STATUS_FLAGS 272 __set_status_flags (pfpsf, INVALID_EXCEPTION); 273 #endif 274 BID_RETURN (res); 275 } 276 #ifdef UNCHANGED_BINARY_STATUS_FLAGS 277 (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS); 278 #endif 279 280 // 2^64 281 f64.i = 0x5f800000; 282 283 // fx ~ CX 284 fx.d = (float) CX.w[1] * f64.d + (float) CX.w[0]; 285 bin_expon_cx = ((fx.i >> 23) & 0xff) - 0x7f; 286 digits = estimate_decimal_digits[bin_expon_cx]; 287 288 A10 = CX; 289 if (exponent_x & 1) { 290 A10.w[1] = (CX.w[1] << 3) | (CX.w[0] >> 61); 291 A10.w[0] = CX.w[0] << 3; 292 CX2.w[1] = (CX.w[1] << 1) | (CX.w[0] >> 63); 293 CX2.w[0] = CX.w[0] << 1; 294 __add_128_128 (A10, A10, CX2); 295 } 296 297 C256.w[1] = A10.w[1]; 298 C256.w[0] = A10.w[0]; 299 CS.w[0] = short_sqrt128 (A10); 300 CS.w[1] = 0; 301 mul_factor = 0; 302 // check for exact result 303 if (CS.w[0] < 10000000000000000ull) { 304 if (CS.w[0] * CS.w[0] == A10.w[0]) { 305 __sqr64_fast (S2, CS.w[0]); 306 if (S2.w[1] == A10.w[1]) // && S2.w[0]==A10.w[0]) 307 { 308 res = 309 get_BID64 (0, 310 ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) + 311 DECIMAL_EXPONENT_BIAS, CS.w[0], rnd_mode, pfpsf); 312 #ifdef UNCHANGED_BINARY_STATUS_FLAGS 313 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS); 314 #endif 315 BID_RETURN (res); 316 } 317 } 318 if (CS.w[0] >= 1000000000000000ull) { 319 done = 1; 320 exponent_q = exponent_x; 321 C256.w[1] = A10.w[1]; 322 C256.w[0] = A10.w[0]; 323 } 324 #ifdef SET_STATUS_FLAGS 325 __set_status_flags (pfpsf, INEXACT_EXCEPTION); 326 #endif 327 exact = 0; 328 } else { 329 B10 = 0x3333333333333334ull; 330 __mul_64x64_to_128_full (CS2, CS.w[0], B10); 331 CS0 = CS2.w[1] >> 1; 332 if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) { 333 #ifdef SET_STATUS_FLAGS 334 __set_status_flags (pfpsf, INEXACT_EXCEPTION); 335 #endif 336 exact = 0; 337 } 338 done = 1; 339 CS.w[0] = CS0; 340 exponent_q = exponent_x + 2; 341 mul_factor = 10; 342 mul_factor2 = 100; 343 if (CS.w[0] >= 10000000000000000ull) { 344 __mul_64x64_to_128_full (CS2, CS.w[0], B10); 345 CS0 = CS2.w[1] >> 1; 346 if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) { 347 #ifdef SET_STATUS_FLAGS 348 __set_status_flags (pfpsf, INEXACT_EXCEPTION); 349 #endif 350 exact = 0; 351 } 352 exponent_q += 2; 353 CS.w[0] = CS0; 354 mul_factor = 100; 355 mul_factor2 = 10000; 356 } 357 if (exact) { 358 CS0 = CS.w[0] * mul_factor; 359 __sqr64_fast (CS1, CS0) 360 if ((CS1.w[0] != A10.w[0]) || (CS1.w[1] != A10.w[1])) { 361 #ifdef SET_STATUS_FLAGS 362 __set_status_flags (pfpsf, INEXACT_EXCEPTION); 363 #endif 364 exact = 0; 365 } 366 } 367 } 368 369 if (!done) { 370 // get number of digits in CX 371 D = CX.w[1] - power10_index_binexp_128[bin_expon_cx].w[1]; 372 if (D > 0 373 || (!D && CX.w[0] >= power10_index_binexp_128[bin_expon_cx].w[0])) 374 digits++; 375 376 // if exponent is odd, scale coefficient by 10 377 scale = 31 - digits; 378 exponent_q = exponent_x - scale; 379 scale += (exponent_q & 1); // exp. bias is even 380 381 T128 = power10_table_128[scale]; 382 __mul_128x128_low (C256, CX, T128); 383 384 385 CS.w[0] = short_sqrt128 (C256); 386 } 387 //printf("CS=%016I64x\n",CS.w[0]); 388 389 exponent_q = 390 ((exponent_q - DECIMAL_EXPONENT_BIAS_128) >> 1) + 391 DECIMAL_EXPONENT_BIAS; 392 if ((exponent_q < 0) && (exponent_q + MAX_FORMAT_DIGITS >= 0)) { 393 extra_digits = -exponent_q; 394 exponent_q = 0; 395 396 // get coeff*(2^M[extra_digits])/10^extra_digits 397 __mul_64x64_to_128 (QH, CS.w[0], reciprocals10_64[extra_digits]); 398 399 // now get P/10^extra_digits: shift Q_high right by M[extra_digits]-128 400 amount = short_recip_scale[extra_digits]; 401 402 CS0 = QH.w[1] >> amount; 403 404 #ifdef SET_STATUS_FLAGS 405 if (exact) { 406 if (CS.w[0] != CS0 * power10_table_128[extra_digits].w[0]) 407 exact = 0; 408 } 409 if (!exact) 410 __set_status_flags (pfpsf, UNDERFLOW_EXCEPTION | INEXACT_EXCEPTION); 411 #endif 412 413 CS.w[0] = CS0; 414 if (!mul_factor) 415 mul_factor = 1; 416 mul_factor *= power10_table_128[extra_digits].w[0]; 417 __mul_64x64_to_128 (mul_factor2_long, mul_factor, mul_factor); 418 if (mul_factor2_long.w[1]) 419 mul_factor2 = 0; 420 else 421 mul_factor2 = mul_factor2_long.w[1]; 422 } 423 // 4*C256 424 C4.w[1] = (C256.w[1] << 2) | (C256.w[0] >> 62); 425 C4.w[0] = C256.w[0] << 2; 426 427 #ifndef IEEE_ROUND_NEAREST 428 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY 429 if (!((rnd_mode) & 3)) { 430 #endif 431 #endif 432 // compare to midpoints 433 CSM.w[0] = (CS.w[0] + CS.w[0]) | 1; 434 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C4.w[1],C4.w[0],CSM.w[1],CSM.w[0], CS.w[0]); 435 if (mul_factor) 436 CSM.w[0] *= mul_factor; 437 // CSM^2 438 __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]); 439 //__mul_128x128_to_256(M256, CSM, CSM); 440 441 if (C4.w[1] > M256.w[1] || 442 (C4.w[1] == M256.w[1] && C4.w[0] > M256.w[0])) { 443 // round up 444 CS.w[0]++; 445 } else { 446 C8.w[0] = CS.w[0] << 3; 447 C8.w[1] = 0; 448 if (mul_factor) { 449 if (mul_factor2) { 450 __mul_64x64_to_128 (C8, C8.w[0], mul_factor2); 451 } else { 452 __mul_64x128_low (C8, C8.w[0], mul_factor2_long); 453 } 454 } 455 // M256 - 8*CSM 456 __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]); 457 M256.w[1] = M256.w[1] - C8.w[1] - Carry; 458 459 // if CSM' > C256, round up 460 if (M256.w[1] > C4.w[1] || 461 (M256.w[1] == C4.w[1] && M256.w[0] > C4.w[0])) { 462 // round down 463 if (CS.w[0]) 464 CS.w[0]--; 465 } 466 } 467 #ifndef IEEE_ROUND_NEAREST 468 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY 469 } else { 470 CS.w[0]++; 471 CSM.w[0] = CS.w[0]; 472 C8.w[0] = CSM.w[0] << 1; 473 if (mul_factor) 474 CSM.w[0] *= mul_factor; 475 __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]); 476 C8.w[1] = 0; 477 if (mul_factor) { 478 if (mul_factor2) { 479 __mul_64x64_to_128 (C8, C8.w[0], mul_factor2); 480 } else { 481 __mul_64x128_low (C8, C8.w[0], mul_factor2_long); 482 } 483 } 484 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C256.w[1],C256.w[0],M256.w[1],M256.w[0], CS.w[0]); 485 486 if (M256.w[1] > C256.w[1] || 487 (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0])) { 488 __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]); 489 M256.w[1] = M256.w[1] - Carry - C8.w[1]; 490 M256.w[0]++; 491 if (!M256.w[0]) { 492 M256.w[1]++; 493 494 } 495 496 if ((M256.w[1] > C256.w[1] || 497 (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0])) 498 && (CS.w[0] > 1)) { 499 500 CS.w[0]--; 501 502 if (CS.w[0] > 1) { 503 __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]); 504 M256.w[1] = M256.w[1] - Carry - C8.w[1]; 505 M256.w[0]++; 506 if (!M256.w[0]) { 507 M256.w[1]++; 508 } 509 510 if (M256.w[1] > C256.w[1] || 511 (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0])) 512 CS.w[0]--; 513 } 514 } 515 } 516 517 else { 518 /*__add_carry_out(M256.w[0], Carry, M256.w[0], C8.w[0]); 519 M256.w[1] = M256.w[1] + Carry + C8.w[1]; 520 M256.w[0]++; 521 if(!M256.w[0]) 522 { 523 M256.w[1]++; 524 } 525 CS.w[0]++; 526 if(M256.w[1]<C256.w[1] || 527 (M256.w[1]==C256.w[1] && M256.w[0]<=C256.w[0])) 528 { 529 CS.w[0]++; 530 }*/ 531 CS.w[0]++; 532 } 533 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact); 534 // RU? 535 if (((rnd_mode) != ROUNDING_UP) || exact) { 536 if (CS.w[0]) 537 CS.w[0]--; 538 } 539 540 } 541 #endif 542 #endif 543 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact); 544 545 res = get_BID64 (0, exponent_q, CS.w[0], rnd_mode, pfpsf); 546 #ifdef UNCHANGED_BINARY_STATUS_FLAGS 547 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS); 548 #endif 549 BID_RETURN (res); 550 551 552 } 553