1 /* $OpenBSD: softfloat.c,v 1.3 2008/10/07 22:06:29 martynas Exp $ */ 2 /* $NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $ */ 3 4 /* 5 * This version hacked for use with gcc -msoft-float by bjh21. 6 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 7 * itself). 8 */ 9 10 /* 11 * Things you may want to define: 12 * 13 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 14 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 15 * properly renamed. 16 */ 17 18 /* 19 =============================================================================== 20 21 This C source file is part of the SoftFloat IEC/IEEE Floating-point 22 Arithmetic Package, Release 2a. 23 24 Written by John R. Hauser. This work was made possible in part by the 25 International Computer Science Institute, located at Suite 600, 1947 Center 26 Street, Berkeley, California 94704. Funding was partially provided by the 27 National Science Foundation under grant MIP-9311980. The original version 28 of this code was written as part of a project to build a fixed-point vector 29 processor in collaboration with the University of California at Berkeley, 30 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 31 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 32 arithmetic/SoftFloat.html'. 33 34 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable 35 effort has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT 36 WILL AT TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS 37 RESTRICTED TO PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL 38 RESPONSIBILITY FOR ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM 39 THEIR OWN USE OF THE SOFTWARE, AND WHO ALSO EFFECTIVELY INDEMNIFY 40 (possibly via similar legal warning) JOHN HAUSER AND THE INTERNATIONAL 41 COMPUTER SCIENCE INSTITUTE AGAINST ALL LOSSES, COSTS, OR OTHER PROBLEMS 42 ARISING FROM THE USE OF THE SOFTWARE BY THEIR CUSTOMERS AND CLIENTS. 43 44 Derivative works are acceptable, even for commercial purposes, so long as 45 (1) they include prominent notice that the work is derivative, and (2) they 46 include prominent notice akin to these four paragraphs for those parts of 47 this code that are retained. 48 49 =============================================================================== 50 */ 51 52 #include <sys/cdefs.h> 53 #if defined(LIBC_SCCS) && !defined(lint) 54 __RCSID("$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $"); 55 #endif /* LIBC_SCCS and not lint */ 56 57 #ifdef SOFTFLOAT_FOR_GCC 58 #include "softfloat-for-gcc.h" 59 #endif 60 61 #include "milieu.h" 62 #include "softfloat.h" 63 64 /* 65 * Conversions between floats as stored in memory and floats as 66 * SoftFloat uses them 67 */ 68 #ifndef FLOAT64_DEMANGLE 69 #define FLOAT64_DEMANGLE(a) (a) 70 #endif 71 #ifndef FLOAT64_MANGLE 72 #define FLOAT64_MANGLE(a) (a) 73 #endif 74 75 /* 76 ------------------------------------------------------------------------------- 77 Floating-point rounding mode, extended double-precision rounding precision, 78 and exception flags. 79 ------------------------------------------------------------------------------- 80 */ 81 82 /* 83 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday. 84 * Right now, it does not. I've removed all other dynamic global 85 * variables. [ross] 86 */ 87 #ifdef FLOATX80 88 int8 floatx80_rounding_precision = 80; 89 #endif 90 91 /* 92 ------------------------------------------------------------------------------- 93 Primitive arithmetic functions, including multi-word arithmetic, and 94 division and square root approximations. (Can be specialized to target if 95 desired.) 96 ------------------------------------------------------------------------------- 97 */ 98 #include "softfloat-macros.h" 99 100 /* 101 ------------------------------------------------------------------------------- 102 Functions and definitions to determine: (1) whether tininess for underflow 103 is detected before or after rounding by default, (2) what (if anything) 104 happens when exceptions are raised, (3) how signaling NaNs are distinguished 105 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 106 are propagated from function inputs to output. These details are target- 107 specific. 108 ------------------------------------------------------------------------------- 109 */ 110 #include "softfloat-specialize.h" 111 112 #ifndef SOFTFLOAT_FOR_GCC /* Not used */ 113 /* 114 ------------------------------------------------------------------------------- 115 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 116 and 7, and returns the properly rounded 32-bit integer corresponding to the 117 input. If `zSign' is 1, the input is negated before being converted to an 118 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 119 is simply rounded to an integer, with the inexact exception raised if the 120 input cannot be represented exactly as an integer. However, if the fixed- 121 point input is too large, the invalid exception is raised and the largest 122 positive or negative integer is returned. 123 ------------------------------------------------------------------------------- 124 */ 125 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 126 { 127 int8 roundingMode; 128 flag roundNearestEven; 129 int8 roundIncrement, roundBits; 130 int32 z; 131 132 roundingMode = float_rounding_mode(); 133 roundNearestEven = ( roundingMode == float_round_nearest_even ); 134 roundIncrement = 0x40; 135 if ( ! roundNearestEven ) { 136 if ( roundingMode == float_round_to_zero ) { 137 roundIncrement = 0; 138 } 139 else { 140 roundIncrement = 0x7F; 141 if ( zSign ) { 142 if ( roundingMode == float_round_up ) roundIncrement = 0; 143 } 144 else { 145 if ( roundingMode == float_round_down ) roundIncrement = 0; 146 } 147 } 148 } 149 roundBits = absZ & 0x7F; 150 absZ = ( absZ + roundIncrement )>>7; 151 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 152 z = absZ; 153 if ( zSign ) z = - z; 154 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 155 float_raise( float_flag_invalid ); 156 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 157 } 158 if ( roundBits ) float_set_inexact(); 159 return z; 160 161 } 162 163 /* 164 ------------------------------------------------------------------------------- 165 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 166 `absZ1', with binary point between bits 63 and 64 (between the input words), 167 and returns the properly rounded 64-bit integer corresponding to the input. 168 If `zSign' is 1, the input is negated before being converted to an integer. 169 Ordinarily, the fixed-point input is simply rounded to an integer, with 170 the inexact exception raised if the input cannot be represented exactly as 171 an integer. However, if the fixed-point input is too large, the invalid 172 exception is raised and the largest positive or negative integer is 173 returned. 174 ------------------------------------------------------------------------------- 175 */ 176 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 177 { 178 int8 roundingMode; 179 flag roundNearestEven, increment; 180 int64 z; 181 182 roundingMode = float_rounding_mode(); 183 roundNearestEven = ( roundingMode == float_round_nearest_even ); 184 increment = ( (sbits64) absZ1 < 0 ); 185 if ( ! roundNearestEven ) { 186 if ( roundingMode == float_round_to_zero ) { 187 increment = 0; 188 } 189 else { 190 if ( zSign ) { 191 increment = ( roundingMode == float_round_down ) && absZ1; 192 } 193 else { 194 increment = ( roundingMode == float_round_up ) && absZ1; 195 } 196 } 197 } 198 if ( increment ) { 199 ++absZ0; 200 if ( absZ0 == 0 ) goto overflow; 201 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 202 } 203 z = absZ0; 204 if ( zSign ) z = - z; 205 if ( z && ( ( z < 0 ) ^ zSign ) ) { 206 overflow: 207 float_raise( float_flag_invalid ); 208 return 209 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 210 : LIT64( 0x7FFFFFFFFFFFFFFF ); 211 } 212 if ( absZ1 ) float_set_inexact(); 213 return z; 214 215 } 216 217 #ifdef __alpha__ 218 /* 219 ------------------------------------------------------------------------------- 220 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 221 `absZ1', with binary point between bits 63 and 64 (between the input words), 222 and returns the properly rounded 64-bit integer corresponding to the input. 223 If `zSign' is 1, the input is negated before being converted to an integer. 224 Ordinarily, the fixed-point input is simply rounded to an integer, with 225 the inexact exception raised if the input cannot be represented exactly as 226 an integer. 227 ------------------------------------------------------------------------------- 228 */ 229 static int64 roundAndPackInt64NoOverflow( flag zSign, bits64 absZ0, 230 bits64 absZ1 ) 231 { 232 int8 roundingMode; 233 flag roundNearestEven, increment; 234 int64 z; 235 236 roundingMode = float_rounding_mode(); 237 roundNearestEven = ( roundingMode == float_round_nearest_even ); 238 increment = ( (sbits64) absZ1 < 0 ); 239 if ( ! roundNearestEven ) { 240 if ( roundingMode == float_round_to_zero ) { 241 increment = 0; 242 } 243 else { 244 if ( zSign ) { 245 increment = ( roundingMode == float_round_down ) && absZ1; 246 } 247 else { 248 increment = ( roundingMode == float_round_up ) && absZ1; 249 } 250 } 251 } 252 if ( increment ) { 253 ++absZ0; 254 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 255 } 256 z = absZ0; 257 if ( zSign ) z = - z; 258 if ( absZ1 ) float_set_inexact(); 259 return z; 260 261 } 262 #endif /* __alpha__ */ 263 #endif 264 265 /* 266 ------------------------------------------------------------------------------- 267 Returns the fraction bits of the single-precision floating-point value `a'. 268 ------------------------------------------------------------------------------- 269 */ 270 INLINE bits32 extractFloat32Frac( float32 a ) 271 { 272 273 return a & 0x007FFFFF; 274 275 } 276 277 /* 278 ------------------------------------------------------------------------------- 279 Returns the exponent bits of the single-precision floating-point value `a'. 280 ------------------------------------------------------------------------------- 281 */ 282 INLINE int16 extractFloat32Exp( float32 a ) 283 { 284 285 return ( a>>23 ) & 0xFF; 286 287 } 288 289 /* 290 ------------------------------------------------------------------------------- 291 Returns the sign bit of the single-precision floating-point value `a'. 292 ------------------------------------------------------------------------------- 293 */ 294 INLINE flag extractFloat32Sign( float32 a ) 295 { 296 297 return a>>31; 298 299 } 300 301 /* 302 ------------------------------------------------------------------------------- 303 Normalizes the subnormal single-precision floating-point value represented 304 by the denormalized significand `aSig'. The normalized exponent and 305 significand are stored at the locations pointed to by `zExpPtr' and 306 `zSigPtr', respectively. 307 ------------------------------------------------------------------------------- 308 */ 309 static void 310 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 311 { 312 int8 shiftCount; 313 314 shiftCount = countLeadingZeros32( aSig ) - 8; 315 *zSigPtr = aSig<<shiftCount; 316 *zExpPtr = 1 - shiftCount; 317 318 } 319 320 /* 321 ------------------------------------------------------------------------------- 322 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 323 single-precision floating-point value, returning the result. After being 324 shifted into the proper positions, the three fields are simply added 325 together to form the result. This means that any integer portion of `zSig' 326 will be added into the exponent. Since a properly normalized significand 327 will have an integer portion equal to 1, the `zExp' input should be 1 less 328 than the desired result exponent whenever `zSig' is a complete, normalized 329 significand. 330 ------------------------------------------------------------------------------- 331 */ 332 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 333 { 334 335 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 336 337 } 338 339 /* 340 ------------------------------------------------------------------------------- 341 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 342 and significand `zSig', and returns the proper single-precision floating- 343 point value corresponding to the abstract input. Ordinarily, the abstract 344 value is simply rounded and packed into the single-precision format, with 345 the inexact exception raised if the abstract input cannot be represented 346 exactly. However, if the abstract value is too large, the overflow and 347 inexact exceptions are raised and an infinity or maximal finite value is 348 returned. If the abstract value is too small, the input value is rounded to 349 a subnormal number, and the underflow and inexact exceptions are raised if 350 the abstract input cannot be represented exactly as a subnormal single- 351 precision floating-point number. 352 The input significand `zSig' has its binary point between bits 30 353 and 29, which is 7 bits to the left of the usual location. This shifted 354 significand must be normalized or smaller. If `zSig' is not normalized, 355 `zExp' must be 0; in that case, the result returned is a subnormal number, 356 and it must not require rounding. In the usual case that `zSig' is 357 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 358 The handling of underflow and overflow follows the IEC/IEEE Standard for 359 Binary Floating-Point Arithmetic. 360 ------------------------------------------------------------------------------- 361 */ 362 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 363 { 364 int8 roundingMode; 365 flag roundNearestEven; 366 int8 roundIncrement, roundBits; 367 flag isTiny; 368 369 roundingMode = float_rounding_mode(); 370 roundNearestEven = ( roundingMode == float_round_nearest_even ); 371 roundIncrement = 0x40; 372 if ( ! roundNearestEven ) { 373 if ( roundingMode == float_round_to_zero ) { 374 roundIncrement = 0; 375 } 376 else { 377 roundIncrement = 0x7F; 378 if ( zSign ) { 379 if ( roundingMode == float_round_up ) roundIncrement = 0; 380 } 381 else { 382 if ( roundingMode == float_round_down ) roundIncrement = 0; 383 } 384 } 385 } 386 roundBits = zSig & 0x7F; 387 if ( 0xFD <= (bits16) zExp ) { 388 if ( ( 0xFD < zExp ) 389 || ( ( zExp == 0xFD ) 390 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 391 ) { 392 float_raise( float_flag_overflow | float_flag_inexact ); 393 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 394 } 395 if ( zExp < 0 ) { 396 isTiny = 397 ( float_detect_tininess == float_tininess_before_rounding ) 398 || ( zExp < -1 ) 399 || ( zSig + roundIncrement < 0x80000000 ); 400 shift32RightJamming( zSig, - zExp, &zSig ); 401 zExp = 0; 402 roundBits = zSig & 0x7F; 403 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 404 } 405 } 406 if ( roundBits ) float_set_inexact(); 407 zSig = ( zSig + roundIncrement )>>7; 408 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 409 if ( zSig == 0 ) zExp = 0; 410 return packFloat32( zSign, zExp, zSig ); 411 412 } 413 414 /* 415 ------------------------------------------------------------------------------- 416 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 417 and significand `zSig', and returns the proper single-precision floating- 418 point value corresponding to the abstract input. This routine is just like 419 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 420 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 421 floating-point exponent. 422 ------------------------------------------------------------------------------- 423 */ 424 static float32 425 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 426 { 427 int8 shiftCount; 428 429 shiftCount = countLeadingZeros32( zSig ) - 1; 430 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 431 432 } 433 434 /* 435 ------------------------------------------------------------------------------- 436 Returns the fraction bits of the double-precision floating-point value `a'. 437 ------------------------------------------------------------------------------- 438 */ 439 INLINE bits64 extractFloat64Frac( float64 a ) 440 { 441 442 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 443 444 } 445 446 /* 447 ------------------------------------------------------------------------------- 448 Returns the exponent bits of the double-precision floating-point value `a'. 449 ------------------------------------------------------------------------------- 450 */ 451 INLINE int16 extractFloat64Exp( float64 a ) 452 { 453 454 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 455 456 } 457 458 /* 459 ------------------------------------------------------------------------------- 460 Returns the sign bit of the double-precision floating-point value `a'. 461 ------------------------------------------------------------------------------- 462 */ 463 INLINE flag extractFloat64Sign( float64 a ) 464 { 465 466 return FLOAT64_DEMANGLE(a)>>63; 467 468 } 469 470 /* 471 ------------------------------------------------------------------------------- 472 Normalizes the subnormal double-precision floating-point value represented 473 by the denormalized significand `aSig'. The normalized exponent and 474 significand are stored at the locations pointed to by `zExpPtr' and 475 `zSigPtr', respectively. 476 ------------------------------------------------------------------------------- 477 */ 478 static void 479 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 480 { 481 int8 shiftCount; 482 483 shiftCount = countLeadingZeros64( aSig ) - 11; 484 *zSigPtr = aSig<<shiftCount; 485 *zExpPtr = 1 - shiftCount; 486 487 } 488 489 /* 490 ------------------------------------------------------------------------------- 491 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 492 double-precision floating-point value, returning the result. After being 493 shifted into the proper positions, the three fields are simply added 494 together to form the result. This means that any integer portion of `zSig' 495 will be added into the exponent. Since a properly normalized significand 496 will have an integer portion equal to 1, the `zExp' input should be 1 less 497 than the desired result exponent whenever `zSig' is a complete, normalized 498 significand. 499 ------------------------------------------------------------------------------- 500 */ 501 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 502 { 503 504 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 505 ( ( (bits64) zExp )<<52 ) + zSig ); 506 507 } 508 509 /* 510 ------------------------------------------------------------------------------- 511 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 512 and significand `zSig', and returns the proper double-precision floating- 513 point value corresponding to the abstract input. Ordinarily, the abstract 514 value is simply rounded and packed into the double-precision format, with 515 the inexact exception raised if the abstract input cannot be represented 516 exactly. However, if the abstract value is too large, the overflow and 517 inexact exceptions are raised and an infinity or maximal finite value is 518 returned. If the abstract value is too small, the input value is rounded to 519 a subnormal number, and the underflow and inexact exceptions are raised if 520 the abstract input cannot be represented exactly as a subnormal double- 521 precision floating-point number. 522 The input significand `zSig' has its binary point between bits 62 523 and 61, which is 10 bits to the left of the usual location. This shifted 524 significand must be normalized or smaller. If `zSig' is not normalized, 525 `zExp' must be 0; in that case, the result returned is a subnormal number, 526 and it must not require rounding. In the usual case that `zSig' is 527 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 528 The handling of underflow and overflow follows the IEC/IEEE Standard for 529 Binary Floating-Point Arithmetic. 530 ------------------------------------------------------------------------------- 531 */ 532 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 533 { 534 int8 roundingMode; 535 flag roundNearestEven; 536 int16 roundIncrement, roundBits; 537 flag isTiny; 538 539 roundingMode = float_rounding_mode(); 540 roundNearestEven = ( roundingMode == float_round_nearest_even ); 541 roundIncrement = 0x200; 542 if ( ! roundNearestEven ) { 543 if ( roundingMode == float_round_to_zero ) { 544 roundIncrement = 0; 545 } 546 else { 547 roundIncrement = 0x3FF; 548 if ( zSign ) { 549 if ( roundingMode == float_round_up ) roundIncrement = 0; 550 } 551 else { 552 if ( roundingMode == float_round_down ) roundIncrement = 0; 553 } 554 } 555 } 556 roundBits = zSig & 0x3FF; 557 if ( 0x7FD <= (bits16) zExp ) { 558 if ( ( 0x7FD < zExp ) 559 || ( ( zExp == 0x7FD ) 560 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 561 ) { 562 float_raise( float_flag_overflow | float_flag_inexact ); 563 return FLOAT64_MANGLE( 564 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 565 ( roundIncrement == 0 )); 566 } 567 if ( zExp < 0 ) { 568 isTiny = 569 ( float_detect_tininess == float_tininess_before_rounding ) 570 || ( zExp < -1 ) 571 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 572 shift64RightJamming( zSig, - zExp, &zSig ); 573 zExp = 0; 574 roundBits = zSig & 0x3FF; 575 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 576 } 577 } 578 if ( roundBits ) float_set_inexact(); 579 zSig = ( zSig + roundIncrement )>>10; 580 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 581 if ( zSig == 0 ) zExp = 0; 582 return packFloat64( zSign, zExp, zSig ); 583 584 } 585 586 /* 587 ------------------------------------------------------------------------------- 588 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 589 and significand `zSig', and returns the proper double-precision floating- 590 point value corresponding to the abstract input. This routine is just like 591 `roundAndPackFloat64' except that `zSig' does not have to be normalized. 592 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 593 floating-point exponent. 594 ------------------------------------------------------------------------------- 595 */ 596 static float64 597 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 598 { 599 int8 shiftCount; 600 601 shiftCount = countLeadingZeros64( zSig ) - 1; 602 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 603 604 } 605 606 #ifdef FLOATX80 607 608 /* 609 ------------------------------------------------------------------------------- 610 Returns the fraction bits of the extended double-precision floating-point 611 value `a'. 612 ------------------------------------------------------------------------------- 613 */ 614 INLINE bits64 extractFloatx80Frac( floatx80 a ) 615 { 616 617 return a.low; 618 619 } 620 621 /* 622 ------------------------------------------------------------------------------- 623 Returns the exponent bits of the extended double-precision floating-point 624 value `a'. 625 ------------------------------------------------------------------------------- 626 */ 627 INLINE int32 extractFloatx80Exp( floatx80 a ) 628 { 629 630 return a.high & 0x7FFF; 631 632 } 633 634 /* 635 ------------------------------------------------------------------------------- 636 Returns the sign bit of the extended double-precision floating-point value 637 `a'. 638 ------------------------------------------------------------------------------- 639 */ 640 INLINE flag extractFloatx80Sign( floatx80 a ) 641 { 642 643 return a.high>>15; 644 645 } 646 647 /* 648 ------------------------------------------------------------------------------- 649 Normalizes the subnormal extended double-precision floating-point value 650 represented by the denormalized significand `aSig'. The normalized exponent 651 and significand are stored at the locations pointed to by `zExpPtr' and 652 `zSigPtr', respectively. 653 ------------------------------------------------------------------------------- 654 */ 655 static void 656 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 657 { 658 int8 shiftCount; 659 660 shiftCount = countLeadingZeros64( aSig ); 661 *zSigPtr = aSig<<shiftCount; 662 *zExpPtr = 1 - shiftCount; 663 664 } 665 666 /* 667 ------------------------------------------------------------------------------- 668 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 669 extended double-precision floating-point value, returning the result. 670 ------------------------------------------------------------------------------- 671 */ 672 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 673 { 674 floatx80 z; 675 676 z.low = zSig; 677 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 678 return z; 679 680 } 681 682 /* 683 ------------------------------------------------------------------------------- 684 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 685 and extended significand formed by the concatenation of `zSig0' and `zSig1', 686 and returns the proper extended double-precision floating-point value 687 corresponding to the abstract input. Ordinarily, the abstract value is 688 rounded and packed into the extended double-precision format, with the 689 inexact exception raised if the abstract input cannot be represented 690 exactly. However, if the abstract value is too large, the overflow and 691 inexact exceptions are raised and an infinity or maximal finite value is 692 returned. If the abstract value is too small, the input value is rounded to 693 a subnormal number, and the underflow and inexact exceptions are raised if 694 the abstract input cannot be represented exactly as a subnormal extended 695 double-precision floating-point number. 696 If `roundingPrecision' is 32 or 64, the result is rounded to the same 697 number of bits as single or double precision, respectively. Otherwise, the 698 result is rounded to the full precision of the extended double-precision 699 format. 700 The input significand must be normalized or smaller. If the input 701 significand is not normalized, `zExp' must be 0; in that case, the result 702 returned is a subnormal number, and it must not require rounding. The 703 handling of underflow and overflow follows the IEC/IEEE Standard for Binary 704 Floating-Point Arithmetic. 705 ------------------------------------------------------------------------------- 706 */ 707 static floatx80 708 roundAndPackFloatx80( 709 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 710 ) 711 { 712 int8 roundingMode; 713 flag roundNearestEven, increment, isTiny; 714 int64 roundIncrement, roundMask, roundBits; 715 716 roundingMode = float_rounding_mode(); 717 roundNearestEven = ( roundingMode == float_round_nearest_even ); 718 if ( roundingPrecision == 80 ) goto precision80; 719 if ( roundingPrecision == 64 ) { 720 roundIncrement = LIT64( 0x0000000000000400 ); 721 roundMask = LIT64( 0x00000000000007FF ); 722 } 723 else if ( roundingPrecision == 32 ) { 724 roundIncrement = LIT64( 0x0000008000000000 ); 725 roundMask = LIT64( 0x000000FFFFFFFFFF ); 726 } 727 else { 728 goto precision80; 729 } 730 zSig0 |= ( zSig1 != 0 ); 731 if ( ! roundNearestEven ) { 732 if ( roundingMode == float_round_to_zero ) { 733 roundIncrement = 0; 734 } 735 else { 736 roundIncrement = roundMask; 737 if ( zSign ) { 738 if ( roundingMode == float_round_up ) roundIncrement = 0; 739 } 740 else { 741 if ( roundingMode == float_round_down ) roundIncrement = 0; 742 } 743 } 744 } 745 roundBits = zSig0 & roundMask; 746 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 747 if ( ( 0x7FFE < zExp ) 748 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 749 ) { 750 goto overflow; 751 } 752 if ( zExp <= 0 ) { 753 isTiny = 754 ( float_detect_tininess == float_tininess_before_rounding ) 755 || ( zExp < 0 ) 756 || ( zSig0 <= zSig0 + roundIncrement ); 757 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 758 zExp = 0; 759 roundBits = zSig0 & roundMask; 760 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 761 if ( roundBits ) float_set_inexact(); 762 zSig0 += roundIncrement; 763 if ( (sbits64) zSig0 < 0 ) zExp = 1; 764 roundIncrement = roundMask + 1; 765 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 766 roundMask |= roundIncrement; 767 } 768 zSig0 &= ~ roundMask; 769 return packFloatx80( zSign, zExp, zSig0 ); 770 } 771 } 772 if ( roundBits ) float_set_inexact(); 773 zSig0 += roundIncrement; 774 if ( zSig0 < roundIncrement ) { 775 ++zExp; 776 zSig0 = LIT64( 0x8000000000000000 ); 777 } 778 roundIncrement = roundMask + 1; 779 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 780 roundMask |= roundIncrement; 781 } 782 zSig0 &= ~ roundMask; 783 if ( zSig0 == 0 ) zExp = 0; 784 return packFloatx80( zSign, zExp, zSig0 ); 785 precision80: 786 increment = ( (sbits64) zSig1 < 0 ); 787 if ( ! roundNearestEven ) { 788 if ( roundingMode == float_round_to_zero ) { 789 increment = 0; 790 } 791 else { 792 if ( zSign ) { 793 increment = ( roundingMode == float_round_down ) && zSig1; 794 } 795 else { 796 increment = ( roundingMode == float_round_up ) && zSig1; 797 } 798 } 799 } 800 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 801 if ( ( 0x7FFE < zExp ) 802 || ( ( zExp == 0x7FFE ) 803 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 804 && increment 805 ) 806 ) { 807 roundMask = 0; 808 overflow: 809 float_raise( float_flag_overflow | float_flag_inexact ); 810 if ( ( roundingMode == float_round_to_zero ) 811 || ( zSign && ( roundingMode == float_round_up ) ) 812 || ( ! zSign && ( roundingMode == float_round_down ) ) 813 ) { 814 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 815 } 816 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 817 } 818 if ( zExp <= 0 ) { 819 isTiny = 820 ( float_detect_tininess == float_tininess_before_rounding ) 821 || ( zExp < 0 ) 822 || ! increment 823 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 824 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 825 zExp = 0; 826 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 827 if ( zSig1 ) float_set_inexact(); 828 if ( roundNearestEven ) { 829 increment = ( (sbits64) zSig1 < 0 ); 830 } 831 else { 832 if ( zSign ) { 833 increment = ( roundingMode == float_round_down ) && zSig1; 834 } 835 else { 836 increment = ( roundingMode == float_round_up ) && zSig1; 837 } 838 } 839 if ( increment ) { 840 ++zSig0; 841 zSig0 &= 842 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 843 if ( (sbits64) zSig0 < 0 ) zExp = 1; 844 } 845 return packFloatx80( zSign, zExp, zSig0 ); 846 } 847 } 848 if ( zSig1 ) float_set_inexact(); 849 if ( increment ) { 850 ++zSig0; 851 if ( zSig0 == 0 ) { 852 ++zExp; 853 zSig0 = LIT64( 0x8000000000000000 ); 854 } 855 else { 856 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 857 } 858 } 859 else { 860 if ( zSig0 == 0 ) zExp = 0; 861 } 862 return packFloatx80( zSign, zExp, zSig0 ); 863 864 } 865 866 /* 867 ------------------------------------------------------------------------------- 868 Takes an abstract floating-point value having sign `zSign', exponent 869 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 870 and returns the proper extended double-precision floating-point value 871 corresponding to the abstract input. This routine is just like 872 `roundAndPackFloatx80' except that the input significand does not have to be 873 normalized. 874 ------------------------------------------------------------------------------- 875 */ 876 static floatx80 877 normalizeRoundAndPackFloatx80( 878 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 879 ) 880 { 881 int8 shiftCount; 882 883 if ( zSig0 == 0 ) { 884 zSig0 = zSig1; 885 zSig1 = 0; 886 zExp -= 64; 887 } 888 shiftCount = countLeadingZeros64( zSig0 ); 889 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 890 zExp -= shiftCount; 891 return 892 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 893 894 } 895 896 #endif 897 898 #ifdef FLOAT128 899 900 /* 901 ------------------------------------------------------------------------------- 902 Returns the least-significant 64 fraction bits of the quadruple-precision 903 floating-point value `a'. 904 ------------------------------------------------------------------------------- 905 */ 906 INLINE bits64 extractFloat128Frac1( float128 a ) 907 { 908 909 return a.low; 910 911 } 912 913 /* 914 ------------------------------------------------------------------------------- 915 Returns the most-significant 48 fraction bits of the quadruple-precision 916 floating-point value `a'. 917 ------------------------------------------------------------------------------- 918 */ 919 INLINE bits64 extractFloat128Frac0( float128 a ) 920 { 921 922 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 923 924 } 925 926 /* 927 ------------------------------------------------------------------------------- 928 Returns the exponent bits of the quadruple-precision floating-point value 929 `a'. 930 ------------------------------------------------------------------------------- 931 */ 932 INLINE int32 extractFloat128Exp( float128 a ) 933 { 934 935 return ( a.high>>48 ) & 0x7FFF; 936 937 } 938 939 /* 940 ------------------------------------------------------------------------------- 941 Returns the sign bit of the quadruple-precision floating-point value `a'. 942 ------------------------------------------------------------------------------- 943 */ 944 INLINE flag extractFloat128Sign( float128 a ) 945 { 946 947 return a.high>>63; 948 949 } 950 951 /* 952 ------------------------------------------------------------------------------- 953 Normalizes the subnormal quadruple-precision floating-point value 954 represented by the denormalized significand formed by the concatenation of 955 `aSig0' and `aSig1'. The normalized exponent is stored at the location 956 pointed to by `zExpPtr'. The most significant 49 bits of the normalized 957 significand are stored at the location pointed to by `zSig0Ptr', and the 958 least significant 64 bits of the normalized significand are stored at the 959 location pointed to by `zSig1Ptr'. 960 ------------------------------------------------------------------------------- 961 */ 962 static void 963 normalizeFloat128Subnormal( 964 bits64 aSig0, 965 bits64 aSig1, 966 int32 *zExpPtr, 967 bits64 *zSig0Ptr, 968 bits64 *zSig1Ptr 969 ) 970 { 971 int8 shiftCount; 972 973 if ( aSig0 == 0 ) { 974 shiftCount = countLeadingZeros64( aSig1 ) - 15; 975 if ( shiftCount < 0 ) { 976 *zSig0Ptr = aSig1>>( - shiftCount ); 977 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 978 } 979 else { 980 *zSig0Ptr = aSig1<<shiftCount; 981 *zSig1Ptr = 0; 982 } 983 *zExpPtr = - shiftCount - 63; 984 } 985 else { 986 shiftCount = countLeadingZeros64( aSig0 ) - 15; 987 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 988 *zExpPtr = 1 - shiftCount; 989 } 990 991 } 992 993 /* 994 ------------------------------------------------------------------------------- 995 Packs the sign `zSign', the exponent `zExp', and the significand formed 996 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 997 floating-point value, returning the result. After being shifted into the 998 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 999 added together to form the most significant 32 bits of the result. This 1000 means that any integer portion of `zSig0' will be added into the exponent. 1001 Since a properly normalized significand will have an integer portion equal 1002 to 1, the `zExp' input should be 1 less than the desired result exponent 1003 whenever `zSig0' and `zSig1' concatenated form a complete, normalized 1004 significand. 1005 ------------------------------------------------------------------------------- 1006 */ 1007 INLINE float128 1008 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1009 { 1010 float128 z; 1011 1012 z.low = zSig1; 1013 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 1014 return z; 1015 1016 } 1017 1018 /* 1019 ------------------------------------------------------------------------------- 1020 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1021 and extended significand formed by the concatenation of `zSig0', `zSig1', 1022 and `zSig2', and returns the proper quadruple-precision floating-point value 1023 corresponding to the abstract input. Ordinarily, the abstract value is 1024 simply rounded and packed into the quadruple-precision format, with the 1025 inexact exception raised if the abstract input cannot be represented 1026 exactly. However, if the abstract value is too large, the overflow and 1027 inexact exceptions are raised and an infinity or maximal finite value is 1028 returned. If the abstract value is too small, the input value is rounded to 1029 a subnormal number, and the underflow and inexact exceptions are raised if 1030 the abstract input cannot be represented exactly as a subnormal quadruple- 1031 precision floating-point number. 1032 The input significand must be normalized or smaller. If the input 1033 significand is not normalized, `zExp' must be 0; in that case, the result 1034 returned is a subnormal number, and it must not require rounding. In the 1035 usual case that the input significand is normalized, `zExp' must be 1 less 1036 than the ``true'' floating-point exponent. The handling of underflow and 1037 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1038 ------------------------------------------------------------------------------- 1039 */ 1040 static float128 1041 roundAndPackFloat128( 1042 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 1043 { 1044 int8 roundingMode; 1045 flag roundNearestEven, increment, isTiny; 1046 1047 roundingMode = float_rounding_mode(); 1048 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1049 increment = ( (sbits64) zSig2 < 0 ); 1050 if ( ! roundNearestEven ) { 1051 if ( roundingMode == float_round_to_zero ) { 1052 increment = 0; 1053 } 1054 else { 1055 if ( zSign ) { 1056 increment = ( roundingMode == float_round_down ) && zSig2; 1057 } 1058 else { 1059 increment = ( roundingMode == float_round_up ) && zSig2; 1060 } 1061 } 1062 } 1063 if ( 0x7FFD <= (bits32) zExp ) { 1064 if ( ( 0x7FFD < zExp ) 1065 || ( ( zExp == 0x7FFD ) 1066 && eq128( 1067 LIT64( 0x0001FFFFFFFFFFFF ), 1068 LIT64( 0xFFFFFFFFFFFFFFFF ), 1069 zSig0, 1070 zSig1 1071 ) 1072 && increment 1073 ) 1074 ) { 1075 float_raise( float_flag_overflow | float_flag_inexact ); 1076 if ( ( roundingMode == float_round_to_zero ) 1077 || ( zSign && ( roundingMode == float_round_up ) ) 1078 || ( ! zSign && ( roundingMode == float_round_down ) ) 1079 ) { 1080 return 1081 packFloat128( 1082 zSign, 1083 0x7FFE, 1084 LIT64( 0x0000FFFFFFFFFFFF ), 1085 LIT64( 0xFFFFFFFFFFFFFFFF ) 1086 ); 1087 } 1088 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1089 } 1090 if ( zExp < 0 ) { 1091 isTiny = 1092 ( float_detect_tininess == float_tininess_before_rounding ) 1093 || ( zExp < -1 ) 1094 || ! increment 1095 || lt128( 1096 zSig0, 1097 zSig1, 1098 LIT64( 0x0001FFFFFFFFFFFF ), 1099 LIT64( 0xFFFFFFFFFFFFFFFF ) 1100 ); 1101 shift128ExtraRightJamming( 1102 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1103 zExp = 0; 1104 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1105 if ( roundNearestEven ) { 1106 increment = ( (sbits64) zSig2 < 0 ); 1107 } 1108 else { 1109 if ( zSign ) { 1110 increment = ( roundingMode == float_round_down ) && zSig2; 1111 } 1112 else { 1113 increment = ( roundingMode == float_round_up ) && zSig2; 1114 } 1115 } 1116 } 1117 } 1118 if ( zSig2 ) float_set_inexact(); 1119 if ( increment ) { 1120 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1121 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1122 } 1123 else { 1124 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1125 } 1126 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1127 1128 } 1129 1130 /* 1131 ------------------------------------------------------------------------------- 1132 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1133 and significand formed by the concatenation of `zSig0' and `zSig1', and 1134 returns the proper quadruple-precision floating-point value corresponding 1135 to the abstract input. This routine is just like `roundAndPackFloat128' 1136 except that the input significand has fewer bits and does not have to be 1137 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1138 point exponent. 1139 ------------------------------------------------------------------------------- 1140 */ 1141 static float128 1142 normalizeRoundAndPackFloat128( 1143 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1144 { 1145 int8 shiftCount; 1146 bits64 zSig2; 1147 1148 if ( zSig0 == 0 ) { 1149 zSig0 = zSig1; 1150 zSig1 = 0; 1151 zExp -= 64; 1152 } 1153 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1154 if ( 0 <= shiftCount ) { 1155 zSig2 = 0; 1156 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1157 } 1158 else { 1159 shift128ExtraRightJamming( 1160 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1161 } 1162 zExp -= shiftCount; 1163 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1164 1165 } 1166 1167 #endif 1168 1169 /* 1170 ------------------------------------------------------------------------------- 1171 Returns the result of converting the 32-bit two's complement integer `a' 1172 to the single-precision floating-point format. The conversion is performed 1173 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1174 ------------------------------------------------------------------------------- 1175 */ 1176 float32 int32_to_float32( int32 a ) 1177 { 1178 flag zSign; 1179 1180 if ( a == 0 ) return 0; 1181 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1182 zSign = ( a < 0 ); 1183 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 1184 1185 } 1186 1187 /* 1188 ------------------------------------------------------------------------------- 1189 Returns the result of converting the 32-bit two's complement integer `a' 1190 to the double-precision floating-point format. The conversion is performed 1191 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1192 ------------------------------------------------------------------------------- 1193 */ 1194 float64 int32_to_float64( int32 a ) 1195 { 1196 flag zSign; 1197 uint32 absA; 1198 int8 shiftCount; 1199 bits64 zSig; 1200 1201 if ( a == 0 ) return 0; 1202 zSign = ( a < 0 ); 1203 absA = zSign ? - a : a; 1204 shiftCount = countLeadingZeros32( absA ) + 21; 1205 zSig = absA; 1206 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1207 1208 } 1209 1210 #ifdef FLOATX80 1211 1212 /* 1213 ------------------------------------------------------------------------------- 1214 Returns the result of converting the 32-bit two's complement integer `a' 1215 to the extended double-precision floating-point format. The conversion 1216 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1217 Arithmetic. 1218 ------------------------------------------------------------------------------- 1219 */ 1220 floatx80 int32_to_floatx80( int32 a ) 1221 { 1222 flag zSign; 1223 uint32 absA; 1224 int8 shiftCount; 1225 bits64 zSig; 1226 1227 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1228 zSign = ( a < 0 ); 1229 absA = zSign ? - a : a; 1230 shiftCount = countLeadingZeros32( absA ) + 32; 1231 zSig = absA; 1232 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1233 1234 } 1235 1236 #endif 1237 1238 #ifdef FLOAT128 1239 1240 /* 1241 ------------------------------------------------------------------------------- 1242 Returns the result of converting the 32-bit two's complement integer `a' to 1243 the quadruple-precision floating-point format. The conversion is performed 1244 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1245 ------------------------------------------------------------------------------- 1246 */ 1247 float128 int32_to_float128( int32 a ) 1248 { 1249 flag zSign; 1250 uint32 absA; 1251 int8 shiftCount; 1252 bits64 zSig0; 1253 1254 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1255 zSign = ( a < 0 ); 1256 absA = zSign ? - a : a; 1257 shiftCount = countLeadingZeros32( absA ) + 17; 1258 zSig0 = absA; 1259 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1260 1261 } 1262 1263 #endif 1264 1265 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1266 /* 1267 ------------------------------------------------------------------------------- 1268 Returns the result of converting the 64-bit two's complement integer `a' 1269 to the single-precision floating-point format. The conversion is performed 1270 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1271 ------------------------------------------------------------------------------- 1272 */ 1273 float32 int64_to_float32( int64 a ) 1274 { 1275 flag zSign; 1276 uint64 absA; 1277 int8 shiftCount; 1278 1279 if ( a == 0 ) return 0; 1280 zSign = ( a < 0 ); 1281 absA = zSign ? - a : a; 1282 shiftCount = countLeadingZeros64( absA ) - 40; 1283 if ( 0 <= shiftCount ) { 1284 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1285 } 1286 else { 1287 shiftCount += 7; 1288 if ( shiftCount < 0 ) { 1289 shift64RightJamming( absA, - shiftCount, &absA ); 1290 } 1291 else { 1292 absA <<= shiftCount; 1293 } 1294 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1295 } 1296 1297 } 1298 1299 /* 1300 ------------------------------------------------------------------------------- 1301 Returns the result of converting the 64-bit two's complement integer `a' 1302 to the double-precision floating-point format. The conversion is performed 1303 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1304 ------------------------------------------------------------------------------- 1305 */ 1306 float64 int64_to_float64( int64 a ) 1307 { 1308 flag zSign; 1309 1310 if ( a == 0 ) return 0; 1311 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1312 return packFloat64( 1, 0x43E, 0 ); 1313 } 1314 zSign = ( a < 0 ); 1315 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1316 1317 } 1318 1319 #ifdef FLOATX80 1320 1321 /* 1322 ------------------------------------------------------------------------------- 1323 Returns the result of converting the 64-bit two's complement integer `a' 1324 to the extended double-precision floating-point format. The conversion 1325 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1326 Arithmetic. 1327 ------------------------------------------------------------------------------- 1328 */ 1329 floatx80 int64_to_floatx80( int64 a ) 1330 { 1331 flag zSign; 1332 uint64 absA; 1333 int8 shiftCount; 1334 1335 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1336 zSign = ( a < 0 ); 1337 absA = zSign ? - a : a; 1338 shiftCount = countLeadingZeros64( absA ); 1339 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1340 1341 } 1342 1343 #endif 1344 1345 #ifdef FLOAT128 1346 1347 /* 1348 ------------------------------------------------------------------------------- 1349 Returns the result of converting the 64-bit two's complement integer `a' to 1350 the quadruple-precision floating-point format. The conversion is performed 1351 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1352 ------------------------------------------------------------------------------- 1353 */ 1354 float128 int64_to_float128( int64 a ) 1355 { 1356 flag zSign; 1357 uint64 absA; 1358 int8 shiftCount; 1359 int32 zExp; 1360 bits64 zSig0, zSig1; 1361 1362 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1363 zSign = ( a < 0 ); 1364 absA = zSign ? - a : a; 1365 shiftCount = countLeadingZeros64( absA ) + 49; 1366 zExp = 0x406E - shiftCount; 1367 if ( 64 <= shiftCount ) { 1368 zSig1 = 0; 1369 zSig0 = absA; 1370 shiftCount -= 64; 1371 } 1372 else { 1373 zSig1 = absA; 1374 zSig0 = 0; 1375 } 1376 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1377 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1378 1379 } 1380 1381 #endif 1382 #endif /* !SOFTFLOAT_FOR_GCC */ 1383 1384 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1385 /* 1386 ------------------------------------------------------------------------------- 1387 Returns the result of converting the single-precision floating-point value 1388 `a' to the 32-bit two's complement integer format. The conversion is 1389 performed according to the IEC/IEEE Standard for Binary Floating-Point 1390 Arithmetic---which means in particular that the conversion is rounded 1391 according to the current rounding mode. If `a' is a NaN, the largest 1392 positive integer is returned. Otherwise, if the conversion overflows, the 1393 largest integer with the same sign as `a' is returned. 1394 ------------------------------------------------------------------------------- 1395 */ 1396 int32 float32_to_int32( float32 a ) 1397 { 1398 flag aSign; 1399 int16 aExp, shiftCount; 1400 bits32 aSig; 1401 bits64 aSig64; 1402 1403 aSig = extractFloat32Frac( a ); 1404 aExp = extractFloat32Exp( a ); 1405 aSign = extractFloat32Sign( a ); 1406 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1407 if ( aExp ) aSig |= 0x00800000; 1408 shiftCount = 0xAF - aExp; 1409 aSig64 = aSig; 1410 aSig64 <<= 32; 1411 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1412 return roundAndPackInt32( aSign, aSig64 ); 1413 1414 } 1415 #endif /* !SOFTFLOAT_FOR_GCC */ 1416 1417 /* 1418 ------------------------------------------------------------------------------- 1419 Returns the result of converting the single-precision floating-point value 1420 `a' to the 32-bit two's complement integer format. The conversion is 1421 performed according to the IEC/IEEE Standard for Binary Floating-Point 1422 Arithmetic, except that the conversion is always rounded toward zero. 1423 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1424 the conversion overflows, the largest integer with the same sign as `a' is 1425 returned. 1426 ------------------------------------------------------------------------------- 1427 */ 1428 int32 float32_to_int32_round_to_zero( float32 a ) 1429 { 1430 flag aSign; 1431 int16 aExp, shiftCount; 1432 bits32 aSig; 1433 int32 z; 1434 1435 aSig = extractFloat32Frac( a ); 1436 aExp = extractFloat32Exp( a ); 1437 aSign = extractFloat32Sign( a ); 1438 shiftCount = aExp - 0x9E; 1439 if ( 0 <= shiftCount ) { 1440 if ( a != 0xCF000000 ) { 1441 float_raise( float_flag_invalid ); 1442 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1443 } 1444 return (sbits32) 0x80000000; 1445 } 1446 else if ( aExp <= 0x7E ) { 1447 if ( aExp | aSig ) float_set_inexact(); 1448 return 0; 1449 } 1450 aSig = ( aSig | 0x00800000 )<<8; 1451 z = aSig>>( - shiftCount ); 1452 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1453 float_set_inexact(); 1454 } 1455 if ( aSign ) z = - z; 1456 return z; 1457 1458 } 1459 1460 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1461 /* 1462 ------------------------------------------------------------------------------- 1463 Returns the result of converting the single-precision floating-point value 1464 `a' to the 64-bit two's complement integer format. The conversion is 1465 performed according to the IEC/IEEE Standard for Binary Floating-Point 1466 Arithmetic---which means in particular that the conversion is rounded 1467 according to the current rounding mode. If `a' is a NaN, the largest 1468 positive integer is returned. Otherwise, if the conversion overflows, the 1469 largest integer with the same sign as `a' is returned. 1470 ------------------------------------------------------------------------------- 1471 */ 1472 int64 float32_to_int64( float32 a ) 1473 { 1474 flag aSign; 1475 int16 aExp, shiftCount; 1476 bits32 aSig; 1477 bits64 aSig64, aSigExtra; 1478 1479 aSig = extractFloat32Frac( a ); 1480 aExp = extractFloat32Exp( a ); 1481 aSign = extractFloat32Sign( a ); 1482 shiftCount = 0xBE - aExp; 1483 if ( shiftCount < 0 ) { 1484 float_raise( float_flag_invalid ); 1485 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1486 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1487 } 1488 return (sbits64) LIT64( 0x8000000000000000 ); 1489 } 1490 if ( aExp ) aSig |= 0x00800000; 1491 aSig64 = aSig; 1492 aSig64 <<= 40; 1493 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1494 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1495 1496 } 1497 1498 /* 1499 ------------------------------------------------------------------------------- 1500 Returns the result of converting the single-precision floating-point value 1501 `a' to the 64-bit two's complement integer format. The conversion is 1502 performed according to the IEC/IEEE Standard for Binary Floating-Point 1503 Arithmetic, except that the conversion is always rounded toward zero. If 1504 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1505 conversion overflows, the largest integer with the same sign as `a' is 1506 returned. 1507 ------------------------------------------------------------------------------- 1508 */ 1509 int64 float32_to_int64_round_to_zero( float32 a ) 1510 { 1511 flag aSign; 1512 int16 aExp, shiftCount; 1513 bits32 aSig; 1514 bits64 aSig64; 1515 int64 z; 1516 1517 aSig = extractFloat32Frac( a ); 1518 aExp = extractFloat32Exp( a ); 1519 aSign = extractFloat32Sign( a ); 1520 shiftCount = aExp - 0xBE; 1521 if ( 0 <= shiftCount ) { 1522 if ( a != 0xDF000000 ) { 1523 float_raise( float_flag_invalid ); 1524 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1525 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1526 } 1527 } 1528 return (sbits64) LIT64( 0x8000000000000000 ); 1529 } 1530 else if ( aExp <= 0x7E ) { 1531 if ( aExp | aSig ) float_set_inexact(); 1532 return 0; 1533 } 1534 aSig64 = aSig | 0x00800000; 1535 aSig64 <<= 40; 1536 z = aSig64>>( - shiftCount ); 1537 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1538 float_set_inexact(); 1539 } 1540 if ( aSign ) z = - z; 1541 return z; 1542 1543 } 1544 #endif /* !SOFTFLOAT_FOR_GCC */ 1545 1546 /* 1547 ------------------------------------------------------------------------------- 1548 Returns the result of converting the single-precision floating-point value 1549 `a' to the double-precision floating-point format. The conversion is 1550 performed according to the IEC/IEEE Standard for Binary Floating-Point 1551 Arithmetic. 1552 ------------------------------------------------------------------------------- 1553 */ 1554 float64 float32_to_float64( float32 a ) 1555 { 1556 flag aSign; 1557 int16 aExp; 1558 bits32 aSig; 1559 1560 aSig = extractFloat32Frac( a ); 1561 aExp = extractFloat32Exp( a ); 1562 aSign = extractFloat32Sign( a ); 1563 if ( aExp == 0xFF ) { 1564 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1565 return packFloat64( aSign, 0x7FF, 0 ); 1566 } 1567 if ( aExp == 0 ) { 1568 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1569 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1570 --aExp; 1571 } 1572 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1573 1574 } 1575 1576 #ifdef FLOATX80 1577 1578 /* 1579 ------------------------------------------------------------------------------- 1580 Returns the result of converting the single-precision floating-point value 1581 `a' to the extended double-precision floating-point format. The conversion 1582 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1583 Arithmetic. 1584 ------------------------------------------------------------------------------- 1585 */ 1586 floatx80 float32_to_floatx80( float32 a ) 1587 { 1588 flag aSign; 1589 int16 aExp; 1590 bits32 aSig; 1591 1592 aSig = extractFloat32Frac( a ); 1593 aExp = extractFloat32Exp( a ); 1594 aSign = extractFloat32Sign( a ); 1595 if ( aExp == 0xFF ) { 1596 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1597 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1598 } 1599 if ( aExp == 0 ) { 1600 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1601 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1602 } 1603 aSig |= 0x00800000; 1604 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1605 1606 } 1607 1608 #endif 1609 1610 #ifdef FLOAT128 1611 1612 /* 1613 ------------------------------------------------------------------------------- 1614 Returns the result of converting the single-precision floating-point value 1615 `a' to the double-precision floating-point format. The conversion is 1616 performed according to the IEC/IEEE Standard for Binary Floating-Point 1617 Arithmetic. 1618 ------------------------------------------------------------------------------- 1619 */ 1620 float128 float32_to_float128( float32 a ) 1621 { 1622 flag aSign; 1623 int16 aExp; 1624 bits32 aSig; 1625 1626 aSig = extractFloat32Frac( a ); 1627 aExp = extractFloat32Exp( a ); 1628 aSign = extractFloat32Sign( a ); 1629 if ( aExp == 0xFF ) { 1630 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1631 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1632 } 1633 if ( aExp == 0 ) { 1634 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1635 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1636 --aExp; 1637 } 1638 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1639 1640 } 1641 1642 #endif 1643 1644 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1645 /* 1646 ------------------------------------------------------------------------------- 1647 Rounds the single-precision floating-point value `a' to an integer, and 1648 returns the result as a single-precision floating-point value. The 1649 operation is performed according to the IEC/IEEE Standard for Binary 1650 Floating-Point Arithmetic. 1651 ------------------------------------------------------------------------------- 1652 */ 1653 float32 float32_round_to_int( float32 a ) 1654 { 1655 flag aSign; 1656 int16 aExp; 1657 bits32 lastBitMask, roundBitsMask; 1658 int8 roundingMode; 1659 float32 z; 1660 1661 aExp = extractFloat32Exp( a ); 1662 if ( 0x96 <= aExp ) { 1663 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1664 return propagateFloat32NaN( a, a ); 1665 } 1666 return a; 1667 } 1668 if ( aExp <= 0x7E ) { 1669 if ( (bits32) ( a<<1 ) == 0 ) return a; 1670 float_set_inexact(); 1671 aSign = extractFloat32Sign( a ); 1672 switch ( float_rounding_mode() ) { 1673 case float_round_nearest_even: 1674 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1675 return packFloat32( aSign, 0x7F, 0 ); 1676 } 1677 break; 1678 case float_round_down: 1679 return aSign ? 0xBF800000 : 0; 1680 case float_round_up: 1681 return aSign ? 0x80000000 : 0x3F800000; 1682 } 1683 return packFloat32( aSign, 0, 0 ); 1684 } 1685 lastBitMask = 1; 1686 lastBitMask <<= 0x96 - aExp; 1687 roundBitsMask = lastBitMask - 1; 1688 z = a; 1689 roundingMode = float_rounding_mode(); 1690 if ( roundingMode == float_round_nearest_even ) { 1691 z += lastBitMask>>1; 1692 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1693 } 1694 else if ( roundingMode != float_round_to_zero ) { 1695 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1696 z += roundBitsMask; 1697 } 1698 } 1699 z &= ~ roundBitsMask; 1700 if ( z != a ) float_set_inexact(); 1701 return z; 1702 1703 } 1704 #endif /* !SOFTFLOAT_FOR_GCC */ 1705 1706 /* 1707 ------------------------------------------------------------------------------- 1708 Returns the result of adding the absolute values of the single-precision 1709 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1710 before being returned. `zSign' is ignored if the result is a NaN. 1711 The addition is performed according to the IEC/IEEE Standard for Binary 1712 Floating-Point Arithmetic. 1713 ------------------------------------------------------------------------------- 1714 */ 1715 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1716 { 1717 int16 aExp, bExp, zExp; 1718 bits32 aSig, bSig, zSig; 1719 int16 expDiff; 1720 1721 aSig = extractFloat32Frac( a ); 1722 aExp = extractFloat32Exp( a ); 1723 bSig = extractFloat32Frac( b ); 1724 bExp = extractFloat32Exp( b ); 1725 expDiff = aExp - bExp; 1726 aSig <<= 6; 1727 bSig <<= 6; 1728 if ( 0 < expDiff ) { 1729 if ( aExp == 0xFF ) { 1730 if ( aSig ) return propagateFloat32NaN( a, b ); 1731 return a; 1732 } 1733 if ( bExp == 0 ) { 1734 --expDiff; 1735 } 1736 else { 1737 bSig |= 0x20000000; 1738 } 1739 shift32RightJamming( bSig, expDiff, &bSig ); 1740 zExp = aExp; 1741 } 1742 else if ( expDiff < 0 ) { 1743 if ( bExp == 0xFF ) { 1744 if ( bSig ) return propagateFloat32NaN( a, b ); 1745 return packFloat32( zSign, 0xFF, 0 ); 1746 } 1747 if ( aExp == 0 ) { 1748 ++expDiff; 1749 } 1750 else { 1751 aSig |= 0x20000000; 1752 } 1753 shift32RightJamming( aSig, - expDiff, &aSig ); 1754 zExp = bExp; 1755 } 1756 else { 1757 if ( aExp == 0xFF ) { 1758 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1759 return a; 1760 } 1761 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1762 zSig = 0x40000000 + aSig + bSig; 1763 zExp = aExp; 1764 goto roundAndPack; 1765 } 1766 aSig |= 0x20000000; 1767 zSig = ( aSig + bSig )<<1; 1768 --zExp; 1769 if ( (sbits32) zSig < 0 ) { 1770 zSig = aSig + bSig; 1771 ++zExp; 1772 } 1773 roundAndPack: 1774 return roundAndPackFloat32( zSign, zExp, zSig ); 1775 1776 } 1777 1778 /* 1779 ------------------------------------------------------------------------------- 1780 Returns the result of subtracting the absolute values of the single- 1781 precision floating-point values `a' and `b'. If `zSign' is 1, the 1782 difference is negated before being returned. `zSign' is ignored if the 1783 result is a NaN. The subtraction is performed according to the IEC/IEEE 1784 Standard for Binary Floating-Point Arithmetic. 1785 ------------------------------------------------------------------------------- 1786 */ 1787 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1788 { 1789 int16 aExp, bExp, zExp; 1790 bits32 aSig, bSig, zSig; 1791 int16 expDiff; 1792 1793 aSig = extractFloat32Frac( a ); 1794 aExp = extractFloat32Exp( a ); 1795 bSig = extractFloat32Frac( b ); 1796 bExp = extractFloat32Exp( b ); 1797 expDiff = aExp - bExp; 1798 aSig <<= 7; 1799 bSig <<= 7; 1800 if ( 0 < expDiff ) goto aExpBigger; 1801 if ( expDiff < 0 ) goto bExpBigger; 1802 if ( aExp == 0xFF ) { 1803 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1804 float_raise( float_flag_invalid ); 1805 return float32_default_nan; 1806 } 1807 if ( aExp == 0 ) { 1808 aExp = 1; 1809 bExp = 1; 1810 } 1811 if ( bSig < aSig ) goto aBigger; 1812 if ( aSig < bSig ) goto bBigger; 1813 return packFloat32( float_rounding_mode() == float_round_down, 0, 0 ); 1814 bExpBigger: 1815 if ( bExp == 0xFF ) { 1816 if ( bSig ) return propagateFloat32NaN( a, b ); 1817 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1818 } 1819 if ( aExp == 0 ) { 1820 ++expDiff; 1821 } 1822 else { 1823 aSig |= 0x40000000; 1824 } 1825 shift32RightJamming( aSig, - expDiff, &aSig ); 1826 bSig |= 0x40000000; 1827 bBigger: 1828 zSig = bSig - aSig; 1829 zExp = bExp; 1830 zSign ^= 1; 1831 goto normalizeRoundAndPack; 1832 aExpBigger: 1833 if ( aExp == 0xFF ) { 1834 if ( aSig ) return propagateFloat32NaN( a, b ); 1835 return a; 1836 } 1837 if ( bExp == 0 ) { 1838 --expDiff; 1839 } 1840 else { 1841 bSig |= 0x40000000; 1842 } 1843 shift32RightJamming( bSig, expDiff, &bSig ); 1844 aSig |= 0x40000000; 1845 aBigger: 1846 zSig = aSig - bSig; 1847 zExp = aExp; 1848 normalizeRoundAndPack: 1849 --zExp; 1850 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1851 1852 } 1853 1854 /* 1855 ------------------------------------------------------------------------------- 1856 Returns the result of adding the single-precision floating-point values `a' 1857 and `b'. The operation is performed according to the IEC/IEEE Standard for 1858 Binary Floating-Point Arithmetic. 1859 ------------------------------------------------------------------------------- 1860 */ 1861 float32 float32_add( float32 a, float32 b ) 1862 { 1863 flag aSign, bSign; 1864 1865 aSign = extractFloat32Sign( a ); 1866 bSign = extractFloat32Sign( b ); 1867 if ( aSign == bSign ) { 1868 return addFloat32Sigs( a, b, aSign ); 1869 } 1870 else { 1871 return subFloat32Sigs( a, b, aSign ); 1872 } 1873 1874 } 1875 1876 /* 1877 ------------------------------------------------------------------------------- 1878 Returns the result of subtracting the single-precision floating-point values 1879 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1880 for Binary Floating-Point Arithmetic. 1881 ------------------------------------------------------------------------------- 1882 */ 1883 float32 float32_sub( float32 a, float32 b ) 1884 { 1885 flag aSign, bSign; 1886 1887 aSign = extractFloat32Sign( a ); 1888 bSign = extractFloat32Sign( b ); 1889 if ( aSign == bSign ) { 1890 return subFloat32Sigs( a, b, aSign ); 1891 } 1892 else { 1893 return addFloat32Sigs( a, b, aSign ); 1894 } 1895 1896 } 1897 1898 /* 1899 ------------------------------------------------------------------------------- 1900 Returns the result of multiplying the single-precision floating-point values 1901 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1902 for Binary Floating-Point Arithmetic. 1903 ------------------------------------------------------------------------------- 1904 */ 1905 float32 float32_mul( float32 a, float32 b ) 1906 { 1907 flag aSign, bSign, zSign; 1908 int16 aExp, bExp, zExp; 1909 bits32 aSig, bSig; 1910 bits64 zSig64; 1911 bits32 zSig; 1912 1913 aSig = extractFloat32Frac( a ); 1914 aExp = extractFloat32Exp( a ); 1915 aSign = extractFloat32Sign( a ); 1916 bSig = extractFloat32Frac( b ); 1917 bExp = extractFloat32Exp( b ); 1918 bSign = extractFloat32Sign( b ); 1919 zSign = aSign ^ bSign; 1920 if ( aExp == 0xFF ) { 1921 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1922 return propagateFloat32NaN( a, b ); 1923 } 1924 if ( ( bExp | bSig ) == 0 ) { 1925 float_raise( float_flag_invalid ); 1926 return float32_default_nan; 1927 } 1928 return packFloat32( zSign, 0xFF, 0 ); 1929 } 1930 if ( bExp == 0xFF ) { 1931 if ( bSig ) return propagateFloat32NaN( a, b ); 1932 if ( ( aExp | aSig ) == 0 ) { 1933 float_raise( float_flag_invalid ); 1934 return float32_default_nan; 1935 } 1936 return packFloat32( zSign, 0xFF, 0 ); 1937 } 1938 if ( aExp == 0 ) { 1939 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1940 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1941 } 1942 if ( bExp == 0 ) { 1943 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1944 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1945 } 1946 zExp = aExp + bExp - 0x7F; 1947 aSig = ( aSig | 0x00800000 )<<7; 1948 bSig = ( bSig | 0x00800000 )<<8; 1949 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1950 zSig = zSig64; 1951 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1952 zSig <<= 1; 1953 --zExp; 1954 } 1955 return roundAndPackFloat32( zSign, zExp, zSig ); 1956 1957 } 1958 1959 /* 1960 ------------------------------------------------------------------------------- 1961 Returns the result of dividing the single-precision floating-point value `a' 1962 by the corresponding value `b'. The operation is performed according to the 1963 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1964 ------------------------------------------------------------------------------- 1965 */ 1966 float32 float32_div( float32 a, float32 b ) 1967 { 1968 flag aSign, bSign, zSign; 1969 int16 aExp, bExp, zExp; 1970 bits32 aSig, bSig, zSig; 1971 1972 aSig = extractFloat32Frac( a ); 1973 aExp = extractFloat32Exp( a ); 1974 aSign = extractFloat32Sign( a ); 1975 bSig = extractFloat32Frac( b ); 1976 bExp = extractFloat32Exp( b ); 1977 bSign = extractFloat32Sign( b ); 1978 zSign = aSign ^ bSign; 1979 if ( aExp == 0xFF ) { 1980 if ( aSig ) return propagateFloat32NaN( a, b ); 1981 if ( bExp == 0xFF ) { 1982 if ( bSig ) return propagateFloat32NaN( a, b ); 1983 float_raise( float_flag_invalid ); 1984 return float32_default_nan; 1985 } 1986 return packFloat32( zSign, 0xFF, 0 ); 1987 } 1988 if ( bExp == 0xFF ) { 1989 if ( bSig ) return propagateFloat32NaN( a, b ); 1990 return packFloat32( zSign, 0, 0 ); 1991 } 1992 if ( bExp == 0 ) { 1993 if ( bSig == 0 ) { 1994 if ( ( aExp | aSig ) == 0 ) { 1995 float_raise( float_flag_invalid ); 1996 return float32_default_nan; 1997 } 1998 float_raise( float_flag_divbyzero ); 1999 return packFloat32( zSign, 0xFF, 0 ); 2000 } 2001 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2002 } 2003 if ( aExp == 0 ) { 2004 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2005 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2006 } 2007 zExp = aExp - bExp + 0x7D; 2008 aSig = ( aSig | 0x00800000 )<<7; 2009 bSig = ( bSig | 0x00800000 )<<8; 2010 if ( bSig <= ( aSig + aSig ) ) { 2011 aSig >>= 1; 2012 ++zExp; 2013 } 2014 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 2015 if ( ( zSig & 0x3F ) == 0 ) { 2016 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 2017 } 2018 return roundAndPackFloat32( zSign, zExp, zSig ); 2019 2020 } 2021 2022 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2023 /* 2024 ------------------------------------------------------------------------------- 2025 Returns the remainder of the single-precision floating-point value `a' 2026 with respect to the corresponding value `b'. The operation is performed 2027 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2028 ------------------------------------------------------------------------------- 2029 */ 2030 float32 float32_rem( float32 a, float32 b ) 2031 { 2032 flag aSign, bSign, zSign; 2033 int16 aExp, bExp, expDiff; 2034 bits32 aSig, bSig; 2035 bits32 q; 2036 bits64 aSig64, bSig64, q64; 2037 bits32 alternateASig; 2038 sbits32 sigMean; 2039 2040 aSig = extractFloat32Frac( a ); 2041 aExp = extractFloat32Exp( a ); 2042 aSign = extractFloat32Sign( a ); 2043 bSig = extractFloat32Frac( b ); 2044 bExp = extractFloat32Exp( b ); 2045 bSign = extractFloat32Sign( b ); 2046 if ( aExp == 0xFF ) { 2047 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2048 return propagateFloat32NaN( a, b ); 2049 } 2050 float_raise( float_flag_invalid ); 2051 return float32_default_nan; 2052 } 2053 if ( bExp == 0xFF ) { 2054 if ( bSig ) return propagateFloat32NaN( a, b ); 2055 return a; 2056 } 2057 if ( bExp == 0 ) { 2058 if ( bSig == 0 ) { 2059 float_raise( float_flag_invalid ); 2060 return float32_default_nan; 2061 } 2062 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2063 } 2064 if ( aExp == 0 ) { 2065 if ( aSig == 0 ) return a; 2066 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2067 } 2068 expDiff = aExp - bExp; 2069 aSig |= 0x00800000; 2070 bSig |= 0x00800000; 2071 if ( expDiff < 32 ) { 2072 aSig <<= 8; 2073 bSig <<= 8; 2074 if ( expDiff < 0 ) { 2075 if ( expDiff < -1 ) return a; 2076 aSig >>= 1; 2077 } 2078 q = ( bSig <= aSig ); 2079 if ( q ) aSig -= bSig; 2080 if ( 0 < expDiff ) { 2081 q = ( ( (bits64) aSig )<<32 ) / bSig; 2082 q >>= 32 - expDiff; 2083 bSig >>= 2; 2084 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2085 } 2086 else { 2087 aSig >>= 2; 2088 bSig >>= 2; 2089 } 2090 } 2091 else { 2092 if ( bSig <= aSig ) aSig -= bSig; 2093 aSig64 = ( (bits64) aSig )<<40; 2094 bSig64 = ( (bits64) bSig )<<40; 2095 expDiff -= 64; 2096 while ( 0 < expDiff ) { 2097 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2098 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2099 aSig64 = - ( ( bSig * q64 )<<38 ); 2100 expDiff -= 62; 2101 } 2102 expDiff += 64; 2103 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2104 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2105 q = q64>>( 64 - expDiff ); 2106 bSig <<= 6; 2107 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2108 } 2109 do { 2110 alternateASig = aSig; 2111 ++q; 2112 aSig -= bSig; 2113 } while ( 0 <= (sbits32) aSig ); 2114 sigMean = aSig + alternateASig; 2115 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2116 aSig = alternateASig; 2117 } 2118 zSign = ( (sbits32) aSig < 0 ); 2119 if ( zSign ) aSig = - aSig; 2120 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2121 2122 } 2123 #endif /* !SOFTFLOAT_FOR_GCC */ 2124 2125 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2126 /* 2127 ------------------------------------------------------------------------------- 2128 Returns the square root of the single-precision floating-point value `a'. 2129 The operation is performed according to the IEC/IEEE Standard for Binary 2130 Floating-Point Arithmetic. 2131 ------------------------------------------------------------------------------- 2132 */ 2133 float32 float32_sqrt( float32 a ) 2134 { 2135 flag aSign; 2136 int16 aExp, zExp; 2137 bits32 aSig, zSig; 2138 bits64 rem, term; 2139 2140 aSig = extractFloat32Frac( a ); 2141 aExp = extractFloat32Exp( a ); 2142 aSign = extractFloat32Sign( a ); 2143 if ( aExp == 0xFF ) { 2144 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2145 if ( ! aSign ) return a; 2146 float_raise( float_flag_invalid ); 2147 return float32_default_nan; 2148 } 2149 if ( aSign ) { 2150 if ( ( aExp | aSig ) == 0 ) return a; 2151 float_raise( float_flag_invalid ); 2152 return float32_default_nan; 2153 } 2154 if ( aExp == 0 ) { 2155 if ( aSig == 0 ) return 0; 2156 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2157 } 2158 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2159 aSig = ( aSig | 0x00800000 )<<8; 2160 zSig = estimateSqrt32( aExp, aSig ) + 2; 2161 if ( ( zSig & 0x7F ) <= 5 ) { 2162 if ( zSig < 2 ) { 2163 zSig = 0x7FFFFFFF; 2164 goto roundAndPack; 2165 } 2166 aSig >>= aExp & 1; 2167 term = ( (bits64) zSig ) * zSig; 2168 rem = ( ( (bits64) aSig )<<32 ) - term; 2169 while ( (sbits64) rem < 0 ) { 2170 --zSig; 2171 rem += ( ( (bits64) zSig )<<1 ) | 1; 2172 } 2173 zSig |= ( rem != 0 ); 2174 } 2175 shift32RightJamming( zSig, 1, &zSig ); 2176 roundAndPack: 2177 return roundAndPackFloat32( 0, zExp, zSig ); 2178 2179 } 2180 #endif /* !SOFTFLOAT_FOR_GCC */ 2181 2182 /* 2183 ------------------------------------------------------------------------------- 2184 Returns 1 if the single-precision floating-point value `a' is equal to 2185 the corresponding value `b', and 0 otherwise. The comparison is performed 2186 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2187 ------------------------------------------------------------------------------- 2188 */ 2189 flag float32_eq( float32 a, float32 b ) 2190 { 2191 2192 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2193 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2194 ) { 2195 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2196 float_raise( float_flag_invalid ); 2197 } 2198 return 0; 2199 } 2200 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2201 2202 } 2203 2204 /* 2205 ------------------------------------------------------------------------------- 2206 Returns 1 if the single-precision floating-point value `a' is less than 2207 or equal to the corresponding value `b', and 0 otherwise. The comparison 2208 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2209 Arithmetic. 2210 ------------------------------------------------------------------------------- 2211 */ 2212 flag float32_le( float32 a, float32 b ) 2213 { 2214 flag aSign, bSign; 2215 2216 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2217 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2218 ) { 2219 float_raise( float_flag_invalid ); 2220 return 0; 2221 } 2222 aSign = extractFloat32Sign( a ); 2223 bSign = extractFloat32Sign( b ); 2224 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2225 return ( a == b ) || ( aSign ^ ( a < b ) ); 2226 2227 } 2228 2229 /* 2230 ------------------------------------------------------------------------------- 2231 Returns 1 if the single-precision floating-point value `a' is less than 2232 the corresponding value `b', and 0 otherwise. The comparison is performed 2233 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2234 ------------------------------------------------------------------------------- 2235 */ 2236 flag float32_lt( float32 a, float32 b ) 2237 { 2238 flag aSign, bSign; 2239 2240 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2241 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2242 ) { 2243 float_raise( float_flag_invalid ); 2244 return 0; 2245 } 2246 aSign = extractFloat32Sign( a ); 2247 bSign = extractFloat32Sign( b ); 2248 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2249 return ( a != b ) && ( aSign ^ ( a < b ) ); 2250 2251 } 2252 2253 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2254 /* 2255 ------------------------------------------------------------------------------- 2256 Returns 1 if the single-precision floating-point value `a' is equal to 2257 the corresponding value `b', and 0 otherwise. The invalid exception is 2258 raised if either operand is a NaN. Otherwise, the comparison is performed 2259 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2260 ------------------------------------------------------------------------------- 2261 */ 2262 flag float32_eq_signaling( float32 a, float32 b ) 2263 { 2264 2265 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2266 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2267 ) { 2268 float_raise( float_flag_invalid ); 2269 return 0; 2270 } 2271 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2272 2273 } 2274 2275 /* 2276 ------------------------------------------------------------------------------- 2277 Returns 1 if the single-precision floating-point value `a' is less than or 2278 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2279 cause an exception. Otherwise, the comparison is performed according to the 2280 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2281 ------------------------------------------------------------------------------- 2282 */ 2283 flag float32_le_quiet( float32 a, float32 b ) 2284 { 2285 flag aSign, bSign; 2286 2287 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2288 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2289 ) { 2290 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2291 float_raise( float_flag_invalid ); 2292 } 2293 return 0; 2294 } 2295 aSign = extractFloat32Sign( a ); 2296 bSign = extractFloat32Sign( b ); 2297 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2298 return ( a == b ) || ( aSign ^ ( a < b ) ); 2299 2300 } 2301 2302 /* 2303 ------------------------------------------------------------------------------- 2304 Returns 1 if the single-precision floating-point value `a' is less than 2305 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2306 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2307 Standard for Binary Floating-Point Arithmetic. 2308 ------------------------------------------------------------------------------- 2309 */ 2310 flag float32_lt_quiet( float32 a, float32 b ) 2311 { 2312 flag aSign, bSign; 2313 2314 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2315 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2316 ) { 2317 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2318 float_raise( float_flag_invalid ); 2319 } 2320 return 0; 2321 } 2322 aSign = extractFloat32Sign( a ); 2323 bSign = extractFloat32Sign( b ); 2324 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2325 return ( a != b ) && ( aSign ^ ( a < b ) ); 2326 2327 } 2328 #endif /* !SOFTFLOAT_FOR_GCC */ 2329 2330 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2331 /* 2332 ------------------------------------------------------------------------------- 2333 Returns the result of converting the double-precision floating-point value 2334 `a' to the 32-bit two's complement integer format. The conversion is 2335 performed according to the IEC/IEEE Standard for Binary Floating-Point 2336 Arithmetic---which means in particular that the conversion is rounded 2337 according to the current rounding mode. If `a' is a NaN, the largest 2338 positive integer is returned. Otherwise, if the conversion overflows, the 2339 largest integer with the same sign as `a' is returned. 2340 ------------------------------------------------------------------------------- 2341 */ 2342 int32 float64_to_int32( float64 a ) 2343 { 2344 flag aSign; 2345 int16 aExp, shiftCount; 2346 bits64 aSig; 2347 2348 aSig = extractFloat64Frac( a ); 2349 aExp = extractFloat64Exp( a ); 2350 aSign = extractFloat64Sign( a ); 2351 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2352 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2353 shiftCount = 0x42C - aExp; 2354 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2355 return roundAndPackInt32( aSign, aSig ); 2356 2357 } 2358 #endif /* !SOFTFLOAT_FOR_GCC */ 2359 2360 /* 2361 ------------------------------------------------------------------------------- 2362 Returns the result of converting the double-precision floating-point value 2363 `a' to the 32-bit two's complement integer format. The conversion is 2364 performed according to the IEC/IEEE Standard for Binary Floating-Point 2365 Arithmetic, except that the conversion is always rounded toward zero. 2366 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2367 the conversion overflows, the largest integer with the same sign as `a' is 2368 returned. 2369 ------------------------------------------------------------------------------- 2370 */ 2371 int32 float64_to_int32_round_to_zero( float64 a ) 2372 { 2373 flag aSign; 2374 int16 aExp, shiftCount; 2375 bits64 aSig, savedASig; 2376 int32 z; 2377 2378 aSig = extractFloat64Frac( a ); 2379 aExp = extractFloat64Exp( a ); 2380 aSign = extractFloat64Sign( a ); 2381 if ( 0x41E < aExp ) { 2382 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2383 goto invalid; 2384 } 2385 else if ( aExp < 0x3FF ) { 2386 if ( aExp || aSig ) float_set_inexact(); 2387 return 0; 2388 } 2389 aSig |= LIT64( 0x0010000000000000 ); 2390 shiftCount = 0x433 - aExp; 2391 savedASig = aSig; 2392 aSig >>= shiftCount; 2393 z = aSig; 2394 if ( aSign ) z = - z; 2395 if ( ( z < 0 ) ^ aSign ) { 2396 invalid: 2397 float_raise( float_flag_invalid ); 2398 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2399 } 2400 if ( ( aSig<<shiftCount ) != savedASig ) { 2401 float_set_inexact(); 2402 } 2403 return z; 2404 2405 } 2406 2407 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2408 /* 2409 ------------------------------------------------------------------------------- 2410 Returns the result of converting the double-precision floating-point value 2411 `a' to the 64-bit two's complement integer format. The conversion is 2412 performed according to the IEC/IEEE Standard for Binary Floating-Point 2413 Arithmetic---which means in particular that the conversion is rounded 2414 according to the current rounding mode. If `a' is a NaN, the largest 2415 positive integer is returned. Otherwise, if the conversion overflows, the 2416 largest integer with the same sign as `a' is returned. 2417 ------------------------------------------------------------------------------- 2418 */ 2419 int64 float64_to_int64( float64 a ) 2420 { 2421 flag aSign; 2422 int16 aExp, shiftCount; 2423 bits64 aSig, aSigExtra; 2424 2425 aSig = extractFloat64Frac( a ); 2426 aExp = extractFloat64Exp( a ); 2427 aSign = extractFloat64Sign( a ); 2428 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2429 shiftCount = 0x433 - aExp; 2430 if ( shiftCount <= 0 ) { 2431 if ( 0x43E < aExp ) { 2432 float_raise( float_flag_invalid ); 2433 if ( ! aSign 2434 || ( ( aExp == 0x7FF ) 2435 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2436 ) { 2437 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2438 } 2439 return (sbits64) LIT64( 0x8000000000000000 ); 2440 } 2441 aSigExtra = 0; 2442 aSig <<= - shiftCount; 2443 } 2444 else { 2445 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2446 } 2447 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2448 2449 } 2450 2451 #ifdef __alpha__ 2452 /* 2453 ------------------------------------------------------------------------------- 2454 Returns the result of converting the double-precision floating-point value 2455 `a' to the 64-bit two's complement integer format. The conversion is 2456 performed according to the IEC/IEEE Standard for Binary Floating-Point 2457 Arithmetic---which means in particular that the conversion is rounded 2458 according to the current rounding mode. If `a' is a NaN, the invalid 2459 exception is raised and zero is returned. 2460 ------------------------------------------------------------------------------- 2461 */ 2462 int64 float64_to_int64_no_overflow( float64 a ) 2463 { 2464 flag aSign; 2465 int16 aExp, shiftCount; 2466 bits64 aSig, aSigExtra; 2467 2468 aSig = extractFloat64Frac( a ); 2469 aExp = extractFloat64Exp( a ); 2470 aSign = extractFloat64Sign( a ); 2471 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2472 shiftCount = 0x433 - aExp; 2473 if ( shiftCount <= 0 ) { 2474 if ( 0x43E < aExp ) { 2475 float_raise( float_flag_invalid ); 2476 return 0; 2477 } 2478 aSigExtra = 0; 2479 aSig <<= - shiftCount; 2480 } 2481 else { 2482 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2483 } 2484 return roundAndPackInt64NoOverflow( aSign, aSig, aSigExtra ); 2485 2486 } 2487 #endif /* __alpha__ */ 2488 2489 /* 2490 ------------------------------------------------------------------------------- 2491 Returns the result of converting the double-precision floating-point value 2492 `a' to the 64-bit two's complement integer format. The conversion is 2493 performed according to the IEC/IEEE Standard for Binary Floating-Point 2494 Arithmetic, except that the conversion is always rounded toward zero. 2495 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2496 the conversion overflows, the largest integer with the same sign as `a' is 2497 returned. 2498 ------------------------------------------------------------------------------- 2499 */ 2500 int64 float64_to_int64_round_to_zero( float64 a ) 2501 { 2502 flag aSign; 2503 int16 aExp, shiftCount; 2504 bits64 aSig; 2505 int64 z; 2506 2507 aSig = extractFloat64Frac( a ); 2508 aExp = extractFloat64Exp( a ); 2509 aSign = extractFloat64Sign( a ); 2510 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2511 shiftCount = aExp - 0x433; 2512 if ( 0 <= shiftCount ) { 2513 if ( 0x43E <= aExp ) { 2514 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2515 float_raise( float_flag_invalid ); 2516 if ( ! aSign 2517 || ( ( aExp == 0x7FF ) 2518 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2519 ) { 2520 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2521 } 2522 } 2523 return (sbits64) LIT64( 0x8000000000000000 ); 2524 } 2525 z = aSig<<shiftCount; 2526 } 2527 else { 2528 if ( aExp < 0x3FE ) { 2529 if ( aExp | aSig ) float_set_inexact(); 2530 return 0; 2531 } 2532 z = aSig>>( - shiftCount ); 2533 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2534 float_set_inexact(); 2535 } 2536 } 2537 if ( aSign ) z = - z; 2538 return z; 2539 2540 } 2541 #endif /* !SOFTFLOAT_FOR_GCC */ 2542 2543 /* 2544 ------------------------------------------------------------------------------- 2545 Returns the result of converting the double-precision floating-point value 2546 `a' to the single-precision floating-point format. The conversion is 2547 performed according to the IEC/IEEE Standard for Binary Floating-Point 2548 Arithmetic. 2549 ------------------------------------------------------------------------------- 2550 */ 2551 float32 float64_to_float32( float64 a ) 2552 { 2553 flag aSign; 2554 int16 aExp; 2555 bits64 aSig; 2556 bits32 zSig; 2557 2558 aSig = extractFloat64Frac( a ); 2559 aExp = extractFloat64Exp( a ); 2560 aSign = extractFloat64Sign( a ); 2561 if ( aExp == 0x7FF ) { 2562 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2563 return packFloat32( aSign, 0xFF, 0 ); 2564 } 2565 shift64RightJamming( aSig, 22, &aSig ); 2566 zSig = aSig; 2567 if ( aExp || zSig ) { 2568 zSig |= 0x40000000; 2569 aExp -= 0x381; 2570 } 2571 return roundAndPackFloat32( aSign, aExp, zSig ); 2572 2573 } 2574 2575 #ifdef FLOATX80 2576 2577 /* 2578 ------------------------------------------------------------------------------- 2579 Returns the result of converting the double-precision floating-point value 2580 `a' to the extended double-precision floating-point format. The conversion 2581 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2582 Arithmetic. 2583 ------------------------------------------------------------------------------- 2584 */ 2585 floatx80 float64_to_floatx80( float64 a ) 2586 { 2587 flag aSign; 2588 int16 aExp; 2589 bits64 aSig; 2590 2591 aSig = extractFloat64Frac( a ); 2592 aExp = extractFloat64Exp( a ); 2593 aSign = extractFloat64Sign( a ); 2594 if ( aExp == 0x7FF ) { 2595 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2596 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2597 } 2598 if ( aExp == 0 ) { 2599 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2600 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2601 } 2602 return 2603 packFloatx80( 2604 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2605 2606 } 2607 2608 #endif 2609 2610 #ifdef FLOAT128 2611 2612 /* 2613 ------------------------------------------------------------------------------- 2614 Returns the result of converting the double-precision floating-point value 2615 `a' to the quadruple-precision floating-point format. The conversion is 2616 performed according to the IEC/IEEE Standard for Binary Floating-Point 2617 Arithmetic. 2618 ------------------------------------------------------------------------------- 2619 */ 2620 float128 float64_to_float128( float64 a ) 2621 { 2622 flag aSign; 2623 int16 aExp; 2624 bits64 aSig, zSig0, zSig1; 2625 2626 aSig = extractFloat64Frac( a ); 2627 aExp = extractFloat64Exp( a ); 2628 aSign = extractFloat64Sign( a ); 2629 if ( aExp == 0x7FF ) { 2630 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2631 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2632 } 2633 if ( aExp == 0 ) { 2634 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2635 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2636 --aExp; 2637 } 2638 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2639 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2640 2641 } 2642 2643 #endif 2644 2645 #ifndef SOFTFLOAT_FOR_GCC 2646 /* 2647 ------------------------------------------------------------------------------- 2648 Rounds the double-precision floating-point value `a' to an integer, and 2649 returns the result as a double-precision floating-point value. The 2650 operation is performed according to the IEC/IEEE Standard for Binary 2651 Floating-Point Arithmetic. 2652 ------------------------------------------------------------------------------- 2653 */ 2654 float64 float64_round_to_int( float64 a ) 2655 { 2656 flag aSign; 2657 int16 aExp; 2658 bits64 lastBitMask, roundBitsMask; 2659 int8 roundingMode; 2660 float64 z; 2661 2662 aExp = extractFloat64Exp( a ); 2663 if ( 0x433 <= aExp ) { 2664 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2665 return propagateFloat64NaN( a, a ); 2666 } 2667 return a; 2668 } 2669 if ( aExp < 0x3FF ) { 2670 if ( (bits64) ( a<<1 ) == 0 ) return a; 2671 float_set_inexact(); 2672 aSign = extractFloat64Sign( a ); 2673 switch ( float_rounding_mode() ) { 2674 case float_round_nearest_even: 2675 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2676 return packFloat64( aSign, 0x3FF, 0 ); 2677 } 2678 break; 2679 case float_round_down: 2680 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2681 case float_round_up: 2682 return 2683 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2684 } 2685 return packFloat64( aSign, 0, 0 ); 2686 } 2687 lastBitMask = 1; 2688 lastBitMask <<= 0x433 - aExp; 2689 roundBitsMask = lastBitMask - 1; 2690 z = a; 2691 roundingMode = float_rounding_mode(); 2692 if ( roundingMode == float_round_nearest_even ) { 2693 z += lastBitMask>>1; 2694 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2695 } 2696 else if ( roundingMode != float_round_to_zero ) { 2697 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2698 z += roundBitsMask; 2699 } 2700 } 2701 z &= ~ roundBitsMask; 2702 if ( z != a ) float_set_inexact(); 2703 return z; 2704 2705 } 2706 #endif 2707 2708 /* 2709 ------------------------------------------------------------------------------- 2710 Returns the result of adding the absolute values of the double-precision 2711 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2712 before being returned. `zSign' is ignored if the result is a NaN. 2713 The addition is performed according to the IEC/IEEE Standard for Binary 2714 Floating-Point Arithmetic. 2715 ------------------------------------------------------------------------------- 2716 */ 2717 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2718 { 2719 int16 aExp, bExp, zExp; 2720 bits64 aSig, bSig, zSig; 2721 int16 expDiff; 2722 2723 aSig = extractFloat64Frac( a ); 2724 aExp = extractFloat64Exp( a ); 2725 bSig = extractFloat64Frac( b ); 2726 bExp = extractFloat64Exp( b ); 2727 expDiff = aExp - bExp; 2728 aSig <<= 9; 2729 bSig <<= 9; 2730 if ( 0 < expDiff ) { 2731 if ( aExp == 0x7FF ) { 2732 if ( aSig ) return propagateFloat64NaN( a, b ); 2733 return a; 2734 } 2735 if ( bExp == 0 ) { 2736 --expDiff; 2737 } 2738 else { 2739 bSig |= LIT64( 0x2000000000000000 ); 2740 } 2741 shift64RightJamming( bSig, expDiff, &bSig ); 2742 zExp = aExp; 2743 } 2744 else if ( expDiff < 0 ) { 2745 if ( bExp == 0x7FF ) { 2746 if ( bSig ) return propagateFloat64NaN( a, b ); 2747 return packFloat64( zSign, 0x7FF, 0 ); 2748 } 2749 if ( aExp == 0 ) { 2750 ++expDiff; 2751 } 2752 else { 2753 aSig |= LIT64( 0x2000000000000000 ); 2754 } 2755 shift64RightJamming( aSig, - expDiff, &aSig ); 2756 zExp = bExp; 2757 } 2758 else { 2759 if ( aExp == 0x7FF ) { 2760 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2761 return a; 2762 } 2763 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2764 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2765 zExp = aExp; 2766 goto roundAndPack; 2767 } 2768 aSig |= LIT64( 0x2000000000000000 ); 2769 zSig = ( aSig + bSig )<<1; 2770 --zExp; 2771 if ( (sbits64) zSig < 0 ) { 2772 zSig = aSig + bSig; 2773 ++zExp; 2774 } 2775 roundAndPack: 2776 return roundAndPackFloat64( zSign, zExp, zSig ); 2777 2778 } 2779 2780 /* 2781 ------------------------------------------------------------------------------- 2782 Returns the result of subtracting the absolute values of the double- 2783 precision floating-point values `a' and `b'. If `zSign' is 1, the 2784 difference is negated before being returned. `zSign' is ignored if the 2785 result is a NaN. The subtraction is performed according to the IEC/IEEE 2786 Standard for Binary Floating-Point Arithmetic. 2787 ------------------------------------------------------------------------------- 2788 */ 2789 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2790 { 2791 int16 aExp, bExp, zExp; 2792 bits64 aSig, bSig, zSig; 2793 int16 expDiff; 2794 2795 aSig = extractFloat64Frac( a ); 2796 aExp = extractFloat64Exp( a ); 2797 bSig = extractFloat64Frac( b ); 2798 bExp = extractFloat64Exp( b ); 2799 expDiff = aExp - bExp; 2800 aSig <<= 10; 2801 bSig <<= 10; 2802 if ( 0 < expDiff ) goto aExpBigger; 2803 if ( expDiff < 0 ) goto bExpBigger; 2804 if ( aExp == 0x7FF ) { 2805 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2806 float_raise( float_flag_invalid ); 2807 return float64_default_nan; 2808 } 2809 if ( aExp == 0 ) { 2810 aExp = 1; 2811 bExp = 1; 2812 } 2813 if ( bSig < aSig ) goto aBigger; 2814 if ( aSig < bSig ) goto bBigger; 2815 return packFloat64( float_rounding_mode() == float_round_down, 0, 0 ); 2816 bExpBigger: 2817 if ( bExp == 0x7FF ) { 2818 if ( bSig ) return propagateFloat64NaN( a, b ); 2819 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2820 } 2821 if ( aExp == 0 ) { 2822 ++expDiff; 2823 } 2824 else { 2825 aSig |= LIT64( 0x4000000000000000 ); 2826 } 2827 shift64RightJamming( aSig, - expDiff, &aSig ); 2828 bSig |= LIT64( 0x4000000000000000 ); 2829 bBigger: 2830 zSig = bSig - aSig; 2831 zExp = bExp; 2832 zSign ^= 1; 2833 goto normalizeRoundAndPack; 2834 aExpBigger: 2835 if ( aExp == 0x7FF ) { 2836 if ( aSig ) return propagateFloat64NaN( a, b ); 2837 return a; 2838 } 2839 if ( bExp == 0 ) { 2840 --expDiff; 2841 } 2842 else { 2843 bSig |= LIT64( 0x4000000000000000 ); 2844 } 2845 shift64RightJamming( bSig, expDiff, &bSig ); 2846 aSig |= LIT64( 0x4000000000000000 ); 2847 aBigger: 2848 zSig = aSig - bSig; 2849 zExp = aExp; 2850 normalizeRoundAndPack: 2851 --zExp; 2852 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2853 2854 } 2855 2856 /* 2857 ------------------------------------------------------------------------------- 2858 Returns the result of adding the double-precision floating-point values `a' 2859 and `b'. The operation is performed according to the IEC/IEEE Standard for 2860 Binary Floating-Point Arithmetic. 2861 ------------------------------------------------------------------------------- 2862 */ 2863 float64 float64_add( float64 a, float64 b ) 2864 { 2865 flag aSign, bSign; 2866 2867 aSign = extractFloat64Sign( a ); 2868 bSign = extractFloat64Sign( b ); 2869 if ( aSign == bSign ) { 2870 return addFloat64Sigs( a, b, aSign ); 2871 } 2872 else { 2873 return subFloat64Sigs( a, b, aSign ); 2874 } 2875 2876 } 2877 2878 /* 2879 ------------------------------------------------------------------------------- 2880 Returns the result of subtracting the double-precision floating-point values 2881 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2882 for Binary Floating-Point Arithmetic. 2883 ------------------------------------------------------------------------------- 2884 */ 2885 float64 float64_sub( float64 a, float64 b ) 2886 { 2887 flag aSign, bSign; 2888 2889 aSign = extractFloat64Sign( a ); 2890 bSign = extractFloat64Sign( b ); 2891 if ( aSign == bSign ) { 2892 return subFloat64Sigs( a, b, aSign ); 2893 } 2894 else { 2895 return addFloat64Sigs( a, b, aSign ); 2896 } 2897 2898 } 2899 2900 /* 2901 ------------------------------------------------------------------------------- 2902 Returns the result of multiplying the double-precision floating-point values 2903 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2904 for Binary Floating-Point Arithmetic. 2905 ------------------------------------------------------------------------------- 2906 */ 2907 float64 float64_mul( float64 a, float64 b ) 2908 { 2909 flag aSign, bSign, zSign; 2910 int16 aExp, bExp, zExp; 2911 bits64 aSig, bSig, zSig0, zSig1; 2912 2913 aSig = extractFloat64Frac( a ); 2914 aExp = extractFloat64Exp( a ); 2915 aSign = extractFloat64Sign( a ); 2916 bSig = extractFloat64Frac( b ); 2917 bExp = extractFloat64Exp( b ); 2918 bSign = extractFloat64Sign( b ); 2919 zSign = aSign ^ bSign; 2920 if ( aExp == 0x7FF ) { 2921 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2922 return propagateFloat64NaN( a, b ); 2923 } 2924 if ( ( bExp | bSig ) == 0 ) { 2925 float_raise( float_flag_invalid ); 2926 return float64_default_nan; 2927 } 2928 return packFloat64( zSign, 0x7FF, 0 ); 2929 } 2930 if ( bExp == 0x7FF ) { 2931 if ( bSig ) return propagateFloat64NaN( a, b ); 2932 if ( ( aExp | aSig ) == 0 ) { 2933 float_raise( float_flag_invalid ); 2934 return float64_default_nan; 2935 } 2936 return packFloat64( zSign, 0x7FF, 0 ); 2937 } 2938 if ( aExp == 0 ) { 2939 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2940 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2941 } 2942 if ( bExp == 0 ) { 2943 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2944 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2945 } 2946 zExp = aExp + bExp - 0x3FF; 2947 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2948 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2949 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2950 zSig0 |= ( zSig1 != 0 ); 2951 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2952 zSig0 <<= 1; 2953 --zExp; 2954 } 2955 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2956 2957 } 2958 2959 /* 2960 ------------------------------------------------------------------------------- 2961 Returns the result of dividing the double-precision floating-point value `a' 2962 by the corresponding value `b'. The operation is performed according to 2963 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2964 ------------------------------------------------------------------------------- 2965 */ 2966 float64 float64_div( float64 a, float64 b ) 2967 { 2968 flag aSign, bSign, zSign; 2969 int16 aExp, bExp, zExp; 2970 bits64 aSig, bSig, zSig; 2971 bits64 rem0, rem1; 2972 bits64 term0, term1; 2973 2974 aSig = extractFloat64Frac( a ); 2975 aExp = extractFloat64Exp( a ); 2976 aSign = extractFloat64Sign( a ); 2977 bSig = extractFloat64Frac( b ); 2978 bExp = extractFloat64Exp( b ); 2979 bSign = extractFloat64Sign( b ); 2980 zSign = aSign ^ bSign; 2981 if ( aExp == 0x7FF ) { 2982 if ( aSig ) return propagateFloat64NaN( a, b ); 2983 if ( bExp == 0x7FF ) { 2984 if ( bSig ) return propagateFloat64NaN( a, b ); 2985 float_raise( float_flag_invalid ); 2986 return float64_default_nan; 2987 } 2988 return packFloat64( zSign, 0x7FF, 0 ); 2989 } 2990 if ( bExp == 0x7FF ) { 2991 if ( bSig ) return propagateFloat64NaN( a, b ); 2992 return packFloat64( zSign, 0, 0 ); 2993 } 2994 if ( bExp == 0 ) { 2995 if ( bSig == 0 ) { 2996 if ( ( aExp | aSig ) == 0 ) { 2997 float_raise( float_flag_invalid ); 2998 return float64_default_nan; 2999 } 3000 float_raise( float_flag_divbyzero ); 3001 return packFloat64( zSign, 0x7FF, 0 ); 3002 } 3003 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3004 } 3005 if ( aExp == 0 ) { 3006 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 3007 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3008 } 3009 zExp = aExp - bExp + 0x3FD; 3010 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 3011 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3012 if ( bSig <= ( aSig + aSig ) ) { 3013 aSig >>= 1; 3014 ++zExp; 3015 } 3016 zSig = estimateDiv128To64( aSig, 0, bSig ); 3017 if ( ( zSig & 0x1FF ) <= 2 ) { 3018 mul64To128( bSig, zSig, &term0, &term1 ); 3019 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3020 while ( (sbits64) rem0 < 0 ) { 3021 --zSig; 3022 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3023 } 3024 zSig |= ( rem1 != 0 ); 3025 } 3026 return roundAndPackFloat64( zSign, zExp, zSig ); 3027 3028 } 3029 3030 #ifndef SOFTFLOAT_FOR_GCC 3031 /* 3032 ------------------------------------------------------------------------------- 3033 Returns the remainder of the double-precision floating-point value `a' 3034 with respect to the corresponding value `b'. The operation is performed 3035 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3036 ------------------------------------------------------------------------------- 3037 */ 3038 float64 float64_rem( float64 a, float64 b ) 3039 { 3040 flag aSign, bSign, zSign; 3041 int16 aExp, bExp, expDiff; 3042 bits64 aSig, bSig; 3043 bits64 q, alternateASig; 3044 sbits64 sigMean; 3045 3046 aSig = extractFloat64Frac( a ); 3047 aExp = extractFloat64Exp( a ); 3048 aSign = extractFloat64Sign( a ); 3049 bSig = extractFloat64Frac( b ); 3050 bExp = extractFloat64Exp( b ); 3051 bSign = extractFloat64Sign( b ); 3052 if ( aExp == 0x7FF ) { 3053 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3054 return propagateFloat64NaN( a, b ); 3055 } 3056 float_raise( float_flag_invalid ); 3057 return float64_default_nan; 3058 } 3059 if ( bExp == 0x7FF ) { 3060 if ( bSig ) return propagateFloat64NaN( a, b ); 3061 return a; 3062 } 3063 if ( bExp == 0 ) { 3064 if ( bSig == 0 ) { 3065 float_raise( float_flag_invalid ); 3066 return float64_default_nan; 3067 } 3068 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3069 } 3070 if ( aExp == 0 ) { 3071 if ( aSig == 0 ) return a; 3072 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3073 } 3074 expDiff = aExp - bExp; 3075 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3076 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3077 if ( expDiff < 0 ) { 3078 if ( expDiff < -1 ) return a; 3079 aSig >>= 1; 3080 } 3081 q = ( bSig <= aSig ); 3082 if ( q ) aSig -= bSig; 3083 expDiff -= 64; 3084 while ( 0 < expDiff ) { 3085 q = estimateDiv128To64( aSig, 0, bSig ); 3086 q = ( 2 < q ) ? q - 2 : 0; 3087 aSig = - ( ( bSig>>2 ) * q ); 3088 expDiff -= 62; 3089 } 3090 expDiff += 64; 3091 if ( 0 < expDiff ) { 3092 q = estimateDiv128To64( aSig, 0, bSig ); 3093 q = ( 2 < q ) ? q - 2 : 0; 3094 q >>= 64 - expDiff; 3095 bSig >>= 2; 3096 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3097 } 3098 else { 3099 aSig >>= 2; 3100 bSig >>= 2; 3101 } 3102 do { 3103 alternateASig = aSig; 3104 ++q; 3105 aSig -= bSig; 3106 } while ( 0 <= (sbits64) aSig ); 3107 sigMean = aSig + alternateASig; 3108 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3109 aSig = alternateASig; 3110 } 3111 zSign = ( (sbits64) aSig < 0 ); 3112 if ( zSign ) aSig = - aSig; 3113 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3114 3115 } 3116 3117 /* 3118 ------------------------------------------------------------------------------- 3119 Returns the square root of the double-precision floating-point value `a'. 3120 The operation is performed according to the IEC/IEEE Standard for Binary 3121 Floating-Point Arithmetic. 3122 ------------------------------------------------------------------------------- 3123 */ 3124 float64 float64_sqrt( float64 a ) 3125 { 3126 flag aSign; 3127 int16 aExp, zExp; 3128 bits64 aSig, zSig, doubleZSig; 3129 bits64 rem0, rem1, term0, term1; 3130 3131 aSig = extractFloat64Frac( a ); 3132 aExp = extractFloat64Exp( a ); 3133 aSign = extractFloat64Sign( a ); 3134 if ( aExp == 0x7FF ) { 3135 if ( aSig ) return propagateFloat64NaN( a, a ); 3136 if ( ! aSign ) return a; 3137 float_raise( float_flag_invalid ); 3138 return float64_default_nan; 3139 } 3140 if ( aSign ) { 3141 if ( ( aExp | aSig ) == 0 ) return a; 3142 float_raise( float_flag_invalid ); 3143 return float64_default_nan; 3144 } 3145 if ( aExp == 0 ) { 3146 if ( aSig == 0 ) return 0; 3147 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3148 } 3149 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3150 aSig |= LIT64( 0x0010000000000000 ); 3151 zSig = estimateSqrt32( aExp, aSig>>21 ); 3152 aSig <<= 9 - ( aExp & 1 ); 3153 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3154 if ( ( zSig & 0x1FF ) <= 5 ) { 3155 doubleZSig = zSig<<1; 3156 mul64To128( zSig, zSig, &term0, &term1 ); 3157 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3158 while ( (sbits64) rem0 < 0 ) { 3159 --zSig; 3160 doubleZSig -= 2; 3161 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3162 } 3163 zSig |= ( ( rem0 | rem1 ) != 0 ); 3164 } 3165 return roundAndPackFloat64( 0, zExp, zSig ); 3166 3167 } 3168 #endif 3169 3170 /* 3171 ------------------------------------------------------------------------------- 3172 Returns 1 if the double-precision floating-point value `a' is equal to the 3173 corresponding value `b', and 0 otherwise. The comparison is performed 3174 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3175 ------------------------------------------------------------------------------- 3176 */ 3177 flag float64_eq( float64 a, float64 b ) 3178 { 3179 3180 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3181 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3182 ) { 3183 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3184 float_raise( float_flag_invalid ); 3185 } 3186 return 0; 3187 } 3188 return ( a == b ) || 3189 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3190 3191 } 3192 3193 /* 3194 ------------------------------------------------------------------------------- 3195 Returns 1 if the double-precision floating-point value `a' is less than or 3196 equal to the corresponding value `b', and 0 otherwise. The comparison is 3197 performed according to the IEC/IEEE Standard for Binary Floating-Point 3198 Arithmetic. 3199 ------------------------------------------------------------------------------- 3200 */ 3201 flag float64_le( float64 a, float64 b ) 3202 { 3203 flag aSign, bSign; 3204 3205 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3206 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3207 ) { 3208 float_raise( float_flag_invalid ); 3209 return 0; 3210 } 3211 aSign = extractFloat64Sign( a ); 3212 bSign = extractFloat64Sign( b ); 3213 if ( aSign != bSign ) 3214 return aSign || 3215 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3216 0 ); 3217 return ( a == b ) || 3218 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3219 3220 } 3221 3222 /* 3223 ------------------------------------------------------------------------------- 3224 Returns 1 if the double-precision floating-point value `a' is less than 3225 the corresponding value `b', and 0 otherwise. The comparison is performed 3226 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3227 ------------------------------------------------------------------------------- 3228 */ 3229 flag float64_lt( float64 a, float64 b ) 3230 { 3231 flag aSign, bSign; 3232 3233 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3234 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3235 ) { 3236 float_raise( float_flag_invalid ); 3237 return 0; 3238 } 3239 aSign = extractFloat64Sign( a ); 3240 bSign = extractFloat64Sign( b ); 3241 if ( aSign != bSign ) 3242 return aSign && 3243 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3244 0 ); 3245 return ( a != b ) && 3246 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3247 3248 } 3249 3250 #ifndef SOFTFLOAT_FOR_GCC 3251 /* 3252 ------------------------------------------------------------------------------- 3253 Returns 1 if the double-precision floating-point value `a' is equal to the 3254 corresponding value `b', and 0 otherwise. The invalid exception is raised 3255 if either operand is a NaN. Otherwise, the comparison is performed 3256 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3257 ------------------------------------------------------------------------------- 3258 */ 3259 flag float64_eq_signaling( float64 a, float64 b ) 3260 { 3261 3262 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3263 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3264 ) { 3265 float_raise( float_flag_invalid ); 3266 return 0; 3267 } 3268 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3269 3270 } 3271 3272 /* 3273 ------------------------------------------------------------------------------- 3274 Returns 1 if the double-precision floating-point value `a' is less than or 3275 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3276 cause an exception. Otherwise, the comparison is performed according to the 3277 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3278 ------------------------------------------------------------------------------- 3279 */ 3280 flag float64_le_quiet( float64 a, float64 b ) 3281 { 3282 flag aSign, bSign; 3283 3284 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3285 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3286 ) { 3287 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3288 float_raise( float_flag_invalid ); 3289 } 3290 return 0; 3291 } 3292 aSign = extractFloat64Sign( a ); 3293 bSign = extractFloat64Sign( b ); 3294 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3295 return ( a == b ) || ( aSign ^ ( a < b ) ); 3296 3297 } 3298 3299 /* 3300 ------------------------------------------------------------------------------- 3301 Returns 1 if the double-precision floating-point value `a' is less than 3302 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3303 exception. Otherwise, the comparison is performed according to the IEC/IEEE 3304 Standard for Binary Floating-Point Arithmetic. 3305 ------------------------------------------------------------------------------- 3306 */ 3307 flag float64_lt_quiet( float64 a, float64 b ) 3308 { 3309 flag aSign, bSign; 3310 3311 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3312 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3313 ) { 3314 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3315 float_raise( float_flag_invalid ); 3316 } 3317 return 0; 3318 } 3319 aSign = extractFloat64Sign( a ); 3320 bSign = extractFloat64Sign( b ); 3321 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3322 return ( a != b ) && ( aSign ^ ( a < b ) ); 3323 3324 } 3325 #endif 3326 3327 #ifdef FLOATX80 3328 3329 /* 3330 ------------------------------------------------------------------------------- 3331 Returns the result of converting the extended double-precision floating- 3332 point value `a' to the 32-bit two's complement integer format. The 3333 conversion is performed according to the IEC/IEEE Standard for Binary 3334 Floating-Point Arithmetic---which means in particular that the conversion 3335 is rounded according to the current rounding mode. If `a' is a NaN, the 3336 largest positive integer is returned. Otherwise, if the conversion 3337 overflows, the largest integer with the same sign as `a' is returned. 3338 ------------------------------------------------------------------------------- 3339 */ 3340 int32 floatx80_to_int32( floatx80 a ) 3341 { 3342 flag aSign; 3343 int32 aExp, shiftCount; 3344 bits64 aSig; 3345 3346 aSig = extractFloatx80Frac( a ); 3347 aExp = extractFloatx80Exp( a ); 3348 aSign = extractFloatx80Sign( a ); 3349 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3350 shiftCount = 0x4037 - aExp; 3351 if ( shiftCount <= 0 ) shiftCount = 1; 3352 shift64RightJamming( aSig, shiftCount, &aSig ); 3353 return roundAndPackInt32( aSign, aSig ); 3354 3355 } 3356 3357 /* 3358 ------------------------------------------------------------------------------- 3359 Returns the result of converting the extended double-precision floating- 3360 point value `a' to the 32-bit two's complement integer format. The 3361 conversion is performed according to the IEC/IEEE Standard for Binary 3362 Floating-Point Arithmetic, except that the conversion is always rounded 3363 toward zero. If `a' is a NaN, the largest positive integer is returned. 3364 Otherwise, if the conversion overflows, the largest integer with the same 3365 sign as `a' is returned. 3366 ------------------------------------------------------------------------------- 3367 */ 3368 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3369 { 3370 flag aSign; 3371 int32 aExp, shiftCount; 3372 bits64 aSig, savedASig; 3373 int32 z; 3374 3375 aSig = extractFloatx80Frac( a ); 3376 aExp = extractFloatx80Exp( a ); 3377 aSign = extractFloatx80Sign( a ); 3378 if ( 0x401E < aExp ) { 3379 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3380 goto invalid; 3381 } 3382 else if ( aExp < 0x3FFF ) { 3383 if ( aExp || aSig ) float_set_inexact(); 3384 return 0; 3385 } 3386 shiftCount = 0x403E - aExp; 3387 savedASig = aSig; 3388 aSig >>= shiftCount; 3389 z = aSig; 3390 if ( aSign ) z = - z; 3391 if ( ( z < 0 ) ^ aSign ) { 3392 invalid: 3393 float_raise( float_flag_invalid ); 3394 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3395 } 3396 if ( ( aSig<<shiftCount ) != savedASig ) { 3397 float_set_inexact(); 3398 } 3399 return z; 3400 3401 } 3402 3403 /* 3404 ------------------------------------------------------------------------------- 3405 Returns the result of converting the extended double-precision floating- 3406 point value `a' to the 64-bit two's complement integer format. The 3407 conversion is performed according to the IEC/IEEE Standard for Binary 3408 Floating-Point Arithmetic---which means in particular that the conversion 3409 is rounded according to the current rounding mode. If `a' is a NaN, 3410 the largest positive integer is returned. Otherwise, if the conversion 3411 overflows, the largest integer with the same sign as `a' is returned. 3412 ------------------------------------------------------------------------------- 3413 */ 3414 int64 floatx80_to_int64( floatx80 a ) 3415 { 3416 flag aSign; 3417 int32 aExp, shiftCount; 3418 bits64 aSig, aSigExtra; 3419 3420 aSig = extractFloatx80Frac( a ); 3421 aExp = extractFloatx80Exp( a ); 3422 aSign = extractFloatx80Sign( a ); 3423 shiftCount = 0x403E - aExp; 3424 if ( shiftCount <= 0 ) { 3425 if ( shiftCount ) { 3426 float_raise( float_flag_invalid ); 3427 if ( ! aSign 3428 || ( ( aExp == 0x7FFF ) 3429 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3430 ) { 3431 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3432 } 3433 return (sbits64) LIT64( 0x8000000000000000 ); 3434 } 3435 aSigExtra = 0; 3436 } 3437 else { 3438 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3439 } 3440 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3441 3442 } 3443 3444 /* 3445 ------------------------------------------------------------------------------- 3446 Returns the result of converting the extended double-precision floating- 3447 point value `a' to the 64-bit two's complement integer format. The 3448 conversion is performed according to the IEC/IEEE Standard for Binary 3449 Floating-Point Arithmetic, except that the conversion is always rounded 3450 toward zero. If `a' is a NaN, the largest positive integer is returned. 3451 Otherwise, if the conversion overflows, the largest integer with the same 3452 sign as `a' is returned. 3453 ------------------------------------------------------------------------------- 3454 */ 3455 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3456 { 3457 flag aSign; 3458 int32 aExp, shiftCount; 3459 bits64 aSig; 3460 int64 z; 3461 3462 aSig = extractFloatx80Frac( a ); 3463 aExp = extractFloatx80Exp( a ); 3464 aSign = extractFloatx80Sign( a ); 3465 shiftCount = aExp - 0x403E; 3466 if ( 0 <= shiftCount ) { 3467 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3468 if ( ( a.high != 0xC03E ) || aSig ) { 3469 float_raise( float_flag_invalid ); 3470 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3471 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3472 } 3473 } 3474 return (sbits64) LIT64( 0x8000000000000000 ); 3475 } 3476 else if ( aExp < 0x3FFF ) { 3477 if ( aExp | aSig ) float_set_inexact(); 3478 return 0; 3479 } 3480 z = aSig>>( - shiftCount ); 3481 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3482 float_set_inexact(); 3483 } 3484 if ( aSign ) z = - z; 3485 return z; 3486 3487 } 3488 3489 /* 3490 ------------------------------------------------------------------------------- 3491 Returns the result of converting the extended double-precision floating- 3492 point value `a' to the single-precision floating-point format. The 3493 conversion is performed according to the IEC/IEEE Standard for Binary 3494 Floating-Point Arithmetic. 3495 ------------------------------------------------------------------------------- 3496 */ 3497 float32 floatx80_to_float32( floatx80 a ) 3498 { 3499 flag aSign; 3500 int32 aExp; 3501 bits64 aSig; 3502 3503 aSig = extractFloatx80Frac( a ); 3504 aExp = extractFloatx80Exp( a ); 3505 aSign = extractFloatx80Sign( a ); 3506 if ( aExp == 0x7FFF ) { 3507 if ( (bits64) ( aSig<<1 ) ) { 3508 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3509 } 3510 return packFloat32( aSign, 0xFF, 0 ); 3511 } 3512 shift64RightJamming( aSig, 33, &aSig ); 3513 if ( aExp || aSig ) aExp -= 0x3F81; 3514 return roundAndPackFloat32( aSign, aExp, aSig ); 3515 3516 } 3517 3518 /* 3519 ------------------------------------------------------------------------------- 3520 Returns the result of converting the extended double-precision floating- 3521 point value `a' to the double-precision floating-point format. The 3522 conversion is performed according to the IEC/IEEE Standard for Binary 3523 Floating-Point Arithmetic. 3524 ------------------------------------------------------------------------------- 3525 */ 3526 float64 floatx80_to_float64( floatx80 a ) 3527 { 3528 flag aSign; 3529 int32 aExp; 3530 bits64 aSig, zSig; 3531 3532 aSig = extractFloatx80Frac( a ); 3533 aExp = extractFloatx80Exp( a ); 3534 aSign = extractFloatx80Sign( a ); 3535 if ( aExp == 0x7FFF ) { 3536 if ( (bits64) ( aSig<<1 ) ) { 3537 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3538 } 3539 return packFloat64( aSign, 0x7FF, 0 ); 3540 } 3541 shift64RightJamming( aSig, 1, &zSig ); 3542 if ( aExp || aSig ) aExp -= 0x3C01; 3543 return roundAndPackFloat64( aSign, aExp, zSig ); 3544 3545 } 3546 3547 #ifdef FLOAT128 3548 3549 /* 3550 ------------------------------------------------------------------------------- 3551 Returns the result of converting the extended double-precision floating- 3552 point value `a' to the quadruple-precision floating-point format. The 3553 conversion is performed according to the IEC/IEEE Standard for Binary 3554 Floating-Point Arithmetic. 3555 ------------------------------------------------------------------------------- 3556 */ 3557 float128 floatx80_to_float128( floatx80 a ) 3558 { 3559 flag aSign; 3560 int16 aExp; 3561 bits64 aSig, zSig0, zSig1; 3562 3563 aSig = extractFloatx80Frac( a ); 3564 aExp = extractFloatx80Exp( a ); 3565 aSign = extractFloatx80Sign( a ); 3566 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3567 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3568 } 3569 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3570 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3571 3572 } 3573 3574 #endif 3575 3576 /* 3577 ------------------------------------------------------------------------------- 3578 Rounds the extended double-precision floating-point value `a' to an integer, 3579 and returns the result as an extended quadruple-precision floating-point 3580 value. The operation is performed according to the IEC/IEEE Standard for 3581 Binary Floating-Point Arithmetic. 3582 ------------------------------------------------------------------------------- 3583 */ 3584 floatx80 floatx80_round_to_int( floatx80 a ) 3585 { 3586 flag aSign; 3587 int32 aExp; 3588 bits64 lastBitMask, roundBitsMask; 3589 int8 roundingMode; 3590 floatx80 z; 3591 3592 aExp = extractFloatx80Exp( a ); 3593 if ( 0x403E <= aExp ) { 3594 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3595 return propagateFloatx80NaN( a, a ); 3596 } 3597 return a; 3598 } 3599 if ( aExp < 0x3FFF ) { 3600 if ( ( aExp == 0 ) 3601 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3602 return a; 3603 } 3604 float_set_inexact(); 3605 aSign = extractFloatx80Sign( a ); 3606 switch ( float_rounding_mode() ) { 3607 case float_round_nearest_even: 3608 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3609 ) { 3610 return 3611 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3612 } 3613 break; 3614 case float_round_down: 3615 return 3616 aSign ? 3617 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3618 : packFloatx80( 0, 0, 0 ); 3619 case float_round_up: 3620 return 3621 aSign ? packFloatx80( 1, 0, 0 ) 3622 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3623 } 3624 return packFloatx80( aSign, 0, 0 ); 3625 } 3626 lastBitMask = 1; 3627 lastBitMask <<= 0x403E - aExp; 3628 roundBitsMask = lastBitMask - 1; 3629 z = a; 3630 roundingMode = float_rounding_mode(); 3631 if ( roundingMode == float_round_nearest_even ) { 3632 z.low += lastBitMask>>1; 3633 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3634 } 3635 else if ( roundingMode != float_round_to_zero ) { 3636 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3637 z.low += roundBitsMask; 3638 } 3639 } 3640 z.low &= ~ roundBitsMask; 3641 if ( z.low == 0 ) { 3642 ++z.high; 3643 z.low = LIT64( 0x8000000000000000 ); 3644 } 3645 if ( z.low != a.low ) float_set_inexact(); 3646 return z; 3647 3648 } 3649 3650 /* 3651 ------------------------------------------------------------------------------- 3652 Returns the result of adding the absolute values of the extended double- 3653 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3654 negated before being returned. `zSign' is ignored if the result is a NaN. 3655 The addition is performed according to the IEC/IEEE Standard for Binary 3656 Floating-Point Arithmetic. 3657 ------------------------------------------------------------------------------- 3658 */ 3659 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3660 { 3661 int32 aExp, bExp, zExp; 3662 bits64 aSig, bSig, zSig0, zSig1; 3663 int32 expDiff; 3664 3665 aSig = extractFloatx80Frac( a ); 3666 aExp = extractFloatx80Exp( a ); 3667 bSig = extractFloatx80Frac( b ); 3668 bExp = extractFloatx80Exp( b ); 3669 expDiff = aExp - bExp; 3670 if ( 0 < expDiff ) { 3671 if ( aExp == 0x7FFF ) { 3672 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3673 return a; 3674 } 3675 if ( bExp == 0 ) --expDiff; 3676 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3677 zExp = aExp; 3678 } 3679 else if ( expDiff < 0 ) { 3680 if ( bExp == 0x7FFF ) { 3681 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3682 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3683 } 3684 if ( aExp == 0 ) ++expDiff; 3685 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3686 zExp = bExp; 3687 } 3688 else { 3689 if ( aExp == 0x7FFF ) { 3690 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3691 return propagateFloatx80NaN( a, b ); 3692 } 3693 return a; 3694 } 3695 zSig1 = 0; 3696 zSig0 = aSig + bSig; 3697 if ( aExp == 0 ) { 3698 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3699 goto roundAndPack; 3700 } 3701 zExp = aExp; 3702 goto shiftRight1; 3703 } 3704 zSig0 = aSig + bSig; 3705 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3706 shiftRight1: 3707 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3708 zSig0 |= LIT64( 0x8000000000000000 ); 3709 ++zExp; 3710 roundAndPack: 3711 return 3712 roundAndPackFloatx80( 3713 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3714 3715 } 3716 3717 /* 3718 ------------------------------------------------------------------------------- 3719 Returns the result of subtracting the absolute values of the extended 3720 double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3721 difference is negated before being returned. `zSign' is ignored if the 3722 result is a NaN. The subtraction is performed according to the IEC/IEEE 3723 Standard for Binary Floating-Point Arithmetic. 3724 ------------------------------------------------------------------------------- 3725 */ 3726 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3727 { 3728 int32 aExp, bExp, zExp; 3729 bits64 aSig, bSig, zSig0, zSig1; 3730 int32 expDiff; 3731 floatx80 z; 3732 3733 aSig = extractFloatx80Frac( a ); 3734 aExp = extractFloatx80Exp( a ); 3735 bSig = extractFloatx80Frac( b ); 3736 bExp = extractFloatx80Exp( b ); 3737 expDiff = aExp - bExp; 3738 if ( 0 < expDiff ) goto aExpBigger; 3739 if ( expDiff < 0 ) goto bExpBigger; 3740 if ( aExp == 0x7FFF ) { 3741 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3742 return propagateFloatx80NaN( a, b ); 3743 } 3744 float_raise( float_flag_invalid ); 3745 z.low = floatx80_default_nan_low; 3746 z.high = floatx80_default_nan_high; 3747 return z; 3748 } 3749 if ( aExp == 0 ) { 3750 aExp = 1; 3751 bExp = 1; 3752 } 3753 zSig1 = 0; 3754 if ( bSig < aSig ) goto aBigger; 3755 if ( aSig < bSig ) goto bBigger; 3756 return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 ); 3757 bExpBigger: 3758 if ( bExp == 0x7FFF ) { 3759 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3760 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3761 } 3762 if ( aExp == 0 ) ++expDiff; 3763 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3764 bBigger: 3765 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3766 zExp = bExp; 3767 zSign ^= 1; 3768 goto normalizeRoundAndPack; 3769 aExpBigger: 3770 if ( aExp == 0x7FFF ) { 3771 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3772 return a; 3773 } 3774 if ( bExp == 0 ) --expDiff; 3775 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3776 aBigger: 3777 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3778 zExp = aExp; 3779 normalizeRoundAndPack: 3780 return 3781 normalizeRoundAndPackFloatx80( 3782 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3783 3784 } 3785 3786 /* 3787 ------------------------------------------------------------------------------- 3788 Returns the result of adding the extended double-precision floating-point 3789 values `a' and `b'. The operation is performed according to the IEC/IEEE 3790 Standard for Binary Floating-Point Arithmetic. 3791 ------------------------------------------------------------------------------- 3792 */ 3793 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3794 { 3795 flag aSign, bSign; 3796 3797 aSign = extractFloatx80Sign( a ); 3798 bSign = extractFloatx80Sign( b ); 3799 if ( aSign == bSign ) { 3800 return addFloatx80Sigs( a, b, aSign ); 3801 } 3802 else { 3803 return subFloatx80Sigs( a, b, aSign ); 3804 } 3805 3806 } 3807 3808 /* 3809 ------------------------------------------------------------------------------- 3810 Returns the result of subtracting the extended double-precision floating- 3811 point values `a' and `b'. The operation is performed according to the 3812 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3813 ------------------------------------------------------------------------------- 3814 */ 3815 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3816 { 3817 flag aSign, bSign; 3818 3819 aSign = extractFloatx80Sign( a ); 3820 bSign = extractFloatx80Sign( b ); 3821 if ( aSign == bSign ) { 3822 return subFloatx80Sigs( a, b, aSign ); 3823 } 3824 else { 3825 return addFloatx80Sigs( a, b, aSign ); 3826 } 3827 3828 } 3829 3830 /* 3831 ------------------------------------------------------------------------------- 3832 Returns the result of multiplying the extended double-precision floating- 3833 point values `a' and `b'. The operation is performed according to the 3834 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3835 ------------------------------------------------------------------------------- 3836 */ 3837 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3838 { 3839 flag aSign, bSign, zSign; 3840 int32 aExp, bExp, zExp; 3841 bits64 aSig, bSig, zSig0, zSig1; 3842 floatx80 z; 3843 3844 aSig = extractFloatx80Frac( a ); 3845 aExp = extractFloatx80Exp( a ); 3846 aSign = extractFloatx80Sign( a ); 3847 bSig = extractFloatx80Frac( b ); 3848 bExp = extractFloatx80Exp( b ); 3849 bSign = extractFloatx80Sign( b ); 3850 zSign = aSign ^ bSign; 3851 if ( aExp == 0x7FFF ) { 3852 if ( (bits64) ( aSig<<1 ) 3853 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3854 return propagateFloatx80NaN( a, b ); 3855 } 3856 if ( ( bExp | bSig ) == 0 ) goto invalid; 3857 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3858 } 3859 if ( bExp == 0x7FFF ) { 3860 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3861 if ( ( aExp | aSig ) == 0 ) { 3862 invalid: 3863 float_raise( float_flag_invalid ); 3864 z.low = floatx80_default_nan_low; 3865 z.high = floatx80_default_nan_high; 3866 return z; 3867 } 3868 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3869 } 3870 if ( aExp == 0 ) { 3871 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3872 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3873 } 3874 if ( bExp == 0 ) { 3875 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3876 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3877 } 3878 zExp = aExp + bExp - 0x3FFE; 3879 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3880 if ( 0 < (sbits64) zSig0 ) { 3881 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3882 --zExp; 3883 } 3884 return 3885 roundAndPackFloatx80( 3886 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3887 3888 } 3889 3890 /* 3891 ------------------------------------------------------------------------------- 3892 Returns the result of dividing the extended double-precision floating-point 3893 value `a' by the corresponding value `b'. The operation is performed 3894 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3895 ------------------------------------------------------------------------------- 3896 */ 3897 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3898 { 3899 flag aSign, bSign, zSign; 3900 int32 aExp, bExp, zExp; 3901 bits64 aSig, bSig, zSig0, zSig1; 3902 bits64 rem0, rem1, rem2, term0, term1, term2; 3903 floatx80 z; 3904 3905 aSig = extractFloatx80Frac( a ); 3906 aExp = extractFloatx80Exp( a ); 3907 aSign = extractFloatx80Sign( a ); 3908 bSig = extractFloatx80Frac( b ); 3909 bExp = extractFloatx80Exp( b ); 3910 bSign = extractFloatx80Sign( b ); 3911 zSign = aSign ^ bSign; 3912 if ( aExp == 0x7FFF ) { 3913 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3914 if ( bExp == 0x7FFF ) { 3915 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3916 goto invalid; 3917 } 3918 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3919 } 3920 if ( bExp == 0x7FFF ) { 3921 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3922 return packFloatx80( zSign, 0, 0 ); 3923 } 3924 if ( bExp == 0 ) { 3925 if ( bSig == 0 ) { 3926 if ( ( aExp | aSig ) == 0 ) { 3927 invalid: 3928 float_raise( float_flag_invalid ); 3929 z.low = floatx80_default_nan_low; 3930 z.high = floatx80_default_nan_high; 3931 return z; 3932 } 3933 float_raise( float_flag_divbyzero ); 3934 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3935 } 3936 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3937 } 3938 if ( aExp == 0 ) { 3939 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3940 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3941 } 3942 zExp = aExp - bExp + 0x3FFE; 3943 rem1 = 0; 3944 if ( bSig <= aSig ) { 3945 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3946 ++zExp; 3947 } 3948 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3949 mul64To128( bSig, zSig0, &term0, &term1 ); 3950 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3951 while ( (sbits64) rem0 < 0 ) { 3952 --zSig0; 3953 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3954 } 3955 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3956 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3957 mul64To128( bSig, zSig1, &term1, &term2 ); 3958 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3959 while ( (sbits64) rem1 < 0 ) { 3960 --zSig1; 3961 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3962 } 3963 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3964 } 3965 return 3966 roundAndPackFloatx80( 3967 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3968 3969 } 3970 3971 /* 3972 ------------------------------------------------------------------------------- 3973 Returns the remainder of the extended double-precision floating-point value 3974 `a' with respect to the corresponding value `b'. The operation is performed 3975 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3976 ------------------------------------------------------------------------------- 3977 */ 3978 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3979 { 3980 flag aSign, bSign, zSign; 3981 int32 aExp, bExp, expDiff; 3982 bits64 aSig0, aSig1, bSig; 3983 bits64 q, term0, term1, alternateASig0, alternateASig1; 3984 floatx80 z; 3985 3986 aSig0 = extractFloatx80Frac( a ); 3987 aExp = extractFloatx80Exp( a ); 3988 aSign = extractFloatx80Sign( a ); 3989 bSig = extractFloatx80Frac( b ); 3990 bExp = extractFloatx80Exp( b ); 3991 bSign = extractFloatx80Sign( b ); 3992 if ( aExp == 0x7FFF ) { 3993 if ( (bits64) ( aSig0<<1 ) 3994 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3995 return propagateFloatx80NaN( a, b ); 3996 } 3997 goto invalid; 3998 } 3999 if ( bExp == 0x7FFF ) { 4000 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 4001 return a; 4002 } 4003 if ( bExp == 0 ) { 4004 if ( bSig == 0 ) { 4005 invalid: 4006 float_raise( float_flag_invalid ); 4007 z.low = floatx80_default_nan_low; 4008 z.high = floatx80_default_nan_high; 4009 return z; 4010 } 4011 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4012 } 4013 if ( aExp == 0 ) { 4014 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 4015 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4016 } 4017 bSig |= LIT64( 0x8000000000000000 ); 4018 zSign = aSign; 4019 expDiff = aExp - bExp; 4020 aSig1 = 0; 4021 if ( expDiff < 0 ) { 4022 if ( expDiff < -1 ) return a; 4023 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 4024 expDiff = 0; 4025 } 4026 q = ( bSig <= aSig0 ); 4027 if ( q ) aSig0 -= bSig; 4028 expDiff -= 64; 4029 while ( 0 < expDiff ) { 4030 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4031 q = ( 2 < q ) ? q - 2 : 0; 4032 mul64To128( bSig, q, &term0, &term1 ); 4033 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4034 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 4035 expDiff -= 62; 4036 } 4037 expDiff += 64; 4038 if ( 0 < expDiff ) { 4039 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4040 q = ( 2 < q ) ? q - 2 : 0; 4041 q >>= 64 - expDiff; 4042 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 4043 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4044 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 4045 while ( le128( term0, term1, aSig0, aSig1 ) ) { 4046 ++q; 4047 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4048 } 4049 } 4050 else { 4051 term1 = 0; 4052 term0 = bSig; 4053 } 4054 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 4055 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4056 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4057 && ( q & 1 ) ) 4058 ) { 4059 aSig0 = alternateASig0; 4060 aSig1 = alternateASig1; 4061 zSign = ! zSign; 4062 } 4063 return 4064 normalizeRoundAndPackFloatx80( 4065 80, zSign, bExp + expDiff, aSig0, aSig1 ); 4066 4067 } 4068 4069 /* 4070 ------------------------------------------------------------------------------- 4071 Returns the square root of the extended double-precision floating-point 4072 value `a'. The operation is performed according to the IEC/IEEE Standard 4073 for Binary Floating-Point Arithmetic. 4074 ------------------------------------------------------------------------------- 4075 */ 4076 floatx80 floatx80_sqrt( floatx80 a ) 4077 { 4078 flag aSign; 4079 int32 aExp, zExp; 4080 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 4081 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4082 floatx80 z; 4083 4084 aSig0 = extractFloatx80Frac( a ); 4085 aExp = extractFloatx80Exp( a ); 4086 aSign = extractFloatx80Sign( a ); 4087 if ( aExp == 0x7FFF ) { 4088 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 4089 if ( ! aSign ) return a; 4090 goto invalid; 4091 } 4092 if ( aSign ) { 4093 if ( ( aExp | aSig0 ) == 0 ) return a; 4094 invalid: 4095 float_raise( float_flag_invalid ); 4096 z.low = floatx80_default_nan_low; 4097 z.high = floatx80_default_nan_high; 4098 return z; 4099 } 4100 if ( aExp == 0 ) { 4101 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4102 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4103 } 4104 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4105 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4106 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4107 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4108 doubleZSig0 = zSig0<<1; 4109 mul64To128( zSig0, zSig0, &term0, &term1 ); 4110 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4111 while ( (sbits64) rem0 < 0 ) { 4112 --zSig0; 4113 doubleZSig0 -= 2; 4114 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4115 } 4116 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4117 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4118 if ( zSig1 == 0 ) zSig1 = 1; 4119 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4120 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4121 mul64To128( zSig1, zSig1, &term2, &term3 ); 4122 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4123 while ( (sbits64) rem1 < 0 ) { 4124 --zSig1; 4125 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4126 term3 |= 1; 4127 term2 |= doubleZSig0; 4128 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4129 } 4130 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4131 } 4132 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4133 zSig0 |= doubleZSig0; 4134 return 4135 roundAndPackFloatx80( 4136 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4137 4138 } 4139 4140 /* 4141 ------------------------------------------------------------------------------- 4142 Returns 1 if the extended double-precision floating-point value `a' is 4143 equal to the corresponding value `b', and 0 otherwise. The comparison is 4144 performed according to the IEC/IEEE Standard for Binary Floating-Point 4145 Arithmetic. 4146 ------------------------------------------------------------------------------- 4147 */ 4148 flag floatx80_eq( floatx80 a, floatx80 b ) 4149 { 4150 4151 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4152 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4153 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4154 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4155 ) { 4156 if ( floatx80_is_signaling_nan( a ) 4157 || floatx80_is_signaling_nan( b ) ) { 4158 float_raise( float_flag_invalid ); 4159 } 4160 return 0; 4161 } 4162 return 4163 ( a.low == b.low ) 4164 && ( ( a.high == b.high ) 4165 || ( ( a.low == 0 ) 4166 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4167 ); 4168 4169 } 4170 4171 /* 4172 ------------------------------------------------------------------------------- 4173 Returns 1 if the extended double-precision floating-point value `a' is 4174 less than or equal to the corresponding value `b', and 0 otherwise. The 4175 comparison is performed according to the IEC/IEEE Standard for Binary 4176 Floating-Point Arithmetic. 4177 ------------------------------------------------------------------------------- 4178 */ 4179 flag floatx80_le( floatx80 a, floatx80 b ) 4180 { 4181 flag aSign, bSign; 4182 4183 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4184 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4185 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4186 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4187 ) { 4188 float_raise( float_flag_invalid ); 4189 return 0; 4190 } 4191 aSign = extractFloatx80Sign( a ); 4192 bSign = extractFloatx80Sign( b ); 4193 if ( aSign != bSign ) { 4194 return 4195 aSign 4196 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4197 == 0 ); 4198 } 4199 return 4200 aSign ? le128( b.high, b.low, a.high, a.low ) 4201 : le128( a.high, a.low, b.high, b.low ); 4202 4203 } 4204 4205 /* 4206 ------------------------------------------------------------------------------- 4207 Returns 1 if the extended double-precision floating-point value `a' is 4208 less than the corresponding value `b', and 0 otherwise. The comparison 4209 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4210 Arithmetic. 4211 ------------------------------------------------------------------------------- 4212 */ 4213 flag floatx80_lt( floatx80 a, floatx80 b ) 4214 { 4215 flag aSign, bSign; 4216 4217 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4218 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4219 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4220 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4221 ) { 4222 float_raise( float_flag_invalid ); 4223 return 0; 4224 } 4225 aSign = extractFloatx80Sign( a ); 4226 bSign = extractFloatx80Sign( b ); 4227 if ( aSign != bSign ) { 4228 return 4229 aSign 4230 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4231 != 0 ); 4232 } 4233 return 4234 aSign ? lt128( b.high, b.low, a.high, a.low ) 4235 : lt128( a.high, a.low, b.high, b.low ); 4236 4237 } 4238 4239 /* 4240 ------------------------------------------------------------------------------- 4241 Returns 1 if the extended double-precision floating-point value `a' is equal 4242 to the corresponding value `b', and 0 otherwise. The invalid exception is 4243 raised if either operand is a NaN. Otherwise, the comparison is performed 4244 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4245 ------------------------------------------------------------------------------- 4246 */ 4247 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4248 { 4249 4250 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4251 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4252 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4253 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4254 ) { 4255 float_raise( float_flag_invalid ); 4256 return 0; 4257 } 4258 return 4259 ( a.low == b.low ) 4260 && ( ( a.high == b.high ) 4261 || ( ( a.low == 0 ) 4262 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4263 ); 4264 4265 } 4266 4267 /* 4268 ------------------------------------------------------------------------------- 4269 Returns 1 if the extended double-precision floating-point value `a' is less 4270 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4271 do not cause an exception. Otherwise, the comparison is performed according 4272 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4273 ------------------------------------------------------------------------------- 4274 */ 4275 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4276 { 4277 flag aSign, bSign; 4278 4279 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4280 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4281 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4282 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4283 ) { 4284 if ( floatx80_is_signaling_nan( a ) 4285 || floatx80_is_signaling_nan( b ) ) { 4286 float_raise( float_flag_invalid ); 4287 } 4288 return 0; 4289 } 4290 aSign = extractFloatx80Sign( a ); 4291 bSign = extractFloatx80Sign( b ); 4292 if ( aSign != bSign ) { 4293 return 4294 aSign 4295 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4296 == 0 ); 4297 } 4298 return 4299 aSign ? le128( b.high, b.low, a.high, a.low ) 4300 : le128( a.high, a.low, b.high, b.low ); 4301 4302 } 4303 4304 /* 4305 ------------------------------------------------------------------------------- 4306 Returns 1 if the extended double-precision floating-point value `a' is less 4307 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4308 an exception. Otherwise, the comparison is performed according to the 4309 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4310 ------------------------------------------------------------------------------- 4311 */ 4312 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4313 { 4314 flag aSign, bSign; 4315 4316 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4317 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4318 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4319 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4320 ) { 4321 if ( floatx80_is_signaling_nan( a ) 4322 || floatx80_is_signaling_nan( b ) ) { 4323 float_raise( float_flag_invalid ); 4324 } 4325 return 0; 4326 } 4327 aSign = extractFloatx80Sign( a ); 4328 bSign = extractFloatx80Sign( b ); 4329 if ( aSign != bSign ) { 4330 return 4331 aSign 4332 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4333 != 0 ); 4334 } 4335 return 4336 aSign ? lt128( b.high, b.low, a.high, a.low ) 4337 : lt128( a.high, a.low, b.high, b.low ); 4338 4339 } 4340 4341 #endif 4342 4343 #ifdef FLOAT128 4344 4345 /* 4346 ------------------------------------------------------------------------------- 4347 Returns the result of converting the quadruple-precision floating-point 4348 value `a' to the 32-bit two's complement integer format. The conversion 4349 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4350 Arithmetic---which means in particular that the conversion is rounded 4351 according to the current rounding mode. If `a' is a NaN, the largest 4352 positive integer is returned. Otherwise, if the conversion overflows, the 4353 largest integer with the same sign as `a' is returned. 4354 ------------------------------------------------------------------------------- 4355 */ 4356 int32 float128_to_int32( float128 a ) 4357 { 4358 flag aSign; 4359 int32 aExp, shiftCount; 4360 bits64 aSig0, aSig1; 4361 4362 aSig1 = extractFloat128Frac1( a ); 4363 aSig0 = extractFloat128Frac0( a ); 4364 aExp = extractFloat128Exp( a ); 4365 aSign = extractFloat128Sign( a ); 4366 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4367 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4368 aSig0 |= ( aSig1 != 0 ); 4369 shiftCount = 0x4028 - aExp; 4370 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4371 return roundAndPackInt32( aSign, aSig0 ); 4372 4373 } 4374 4375 /* 4376 ------------------------------------------------------------------------------- 4377 Returns the result of converting the quadruple-precision floating-point 4378 value `a' to the 32-bit two's complement integer format. The conversion 4379 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4380 Arithmetic, except that the conversion is always rounded toward zero. If 4381 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4382 conversion overflows, the largest integer with the same sign as `a' is 4383 returned. 4384 ------------------------------------------------------------------------------- 4385 */ 4386 int32 float128_to_int32_round_to_zero( float128 a ) 4387 { 4388 flag aSign; 4389 int32 aExp, shiftCount; 4390 bits64 aSig0, aSig1, savedASig; 4391 int32 z; 4392 4393 aSig1 = extractFloat128Frac1( a ); 4394 aSig0 = extractFloat128Frac0( a ); 4395 aExp = extractFloat128Exp( a ); 4396 aSign = extractFloat128Sign( a ); 4397 aSig0 |= ( aSig1 != 0 ); 4398 if ( 0x401E < aExp ) { 4399 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4400 goto invalid; 4401 } 4402 else if ( aExp < 0x3FFF ) { 4403 if ( aExp || aSig0 ) float_set_inexact(); 4404 return 0; 4405 } 4406 aSig0 |= LIT64( 0x0001000000000000 ); 4407 shiftCount = 0x402F - aExp; 4408 savedASig = aSig0; 4409 aSig0 >>= shiftCount; 4410 z = aSig0; 4411 if ( aSign ) z = - z; 4412 if ( ( z < 0 ) ^ aSign ) { 4413 invalid: 4414 float_raise( float_flag_invalid ); 4415 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4416 } 4417 if ( ( aSig0<<shiftCount ) != savedASig ) { 4418 float_set_inexact(); 4419 } 4420 return z; 4421 4422 } 4423 4424 /* 4425 ------------------------------------------------------------------------------- 4426 Returns the result of converting the quadruple-precision floating-point 4427 value `a' to the 64-bit two's complement integer format. The conversion 4428 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4429 Arithmetic---which means in particular that the conversion is rounded 4430 according to the current rounding mode. If `a' is a NaN, the largest 4431 positive integer is returned. Otherwise, if the conversion overflows, the 4432 largest integer with the same sign as `a' is returned. 4433 ------------------------------------------------------------------------------- 4434 */ 4435 int64 float128_to_int64( float128 a ) 4436 { 4437 flag aSign; 4438 int32 aExp, shiftCount; 4439 bits64 aSig0, aSig1; 4440 4441 aSig1 = extractFloat128Frac1( a ); 4442 aSig0 = extractFloat128Frac0( a ); 4443 aExp = extractFloat128Exp( a ); 4444 aSign = extractFloat128Sign( a ); 4445 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4446 shiftCount = 0x402F - aExp; 4447 if ( shiftCount <= 0 ) { 4448 if ( 0x403E < aExp ) { 4449 float_raise( float_flag_invalid ); 4450 if ( ! aSign 4451 || ( ( aExp == 0x7FFF ) 4452 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4453 ) 4454 ) { 4455 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4456 } 4457 return (sbits64) LIT64( 0x8000000000000000 ); 4458 } 4459 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4460 } 4461 else { 4462 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4463 } 4464 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4465 4466 } 4467 4468 /* 4469 ------------------------------------------------------------------------------- 4470 Returns the result of converting the quadruple-precision floating-point 4471 value `a' to the 64-bit two's complement integer format. The conversion 4472 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4473 Arithmetic, except that the conversion is always rounded toward zero. 4474 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4475 the conversion overflows, the largest integer with the same sign as `a' is 4476 returned. 4477 ------------------------------------------------------------------------------- 4478 */ 4479 int64 float128_to_int64_round_to_zero( float128 a ) 4480 { 4481 flag aSign; 4482 int32 aExp, shiftCount; 4483 bits64 aSig0, aSig1; 4484 int64 z; 4485 4486 aSig1 = extractFloat128Frac1( a ); 4487 aSig0 = extractFloat128Frac0( a ); 4488 aExp = extractFloat128Exp( a ); 4489 aSign = extractFloat128Sign( a ); 4490 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4491 shiftCount = aExp - 0x402F; 4492 if ( 0 < shiftCount ) { 4493 if ( 0x403E <= aExp ) { 4494 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4495 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4496 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4497 if ( aSig1 ) float_set_inexact(); 4498 } 4499 else { 4500 float_raise( float_flag_invalid ); 4501 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4502 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4503 } 4504 } 4505 return (sbits64) LIT64( 0x8000000000000000 ); 4506 } 4507 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4508 if ( (bits64) ( aSig1<<shiftCount ) ) { 4509 float_set_inexact(); 4510 } 4511 } 4512 else { 4513 if ( aExp < 0x3FFF ) { 4514 if ( aExp | aSig0 | aSig1 ) { 4515 float_set_inexact(); 4516 } 4517 return 0; 4518 } 4519 z = aSig0>>( - shiftCount ); 4520 if ( aSig1 4521 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4522 float_set_inexact(); 4523 } 4524 } 4525 if ( aSign ) z = - z; 4526 return z; 4527 4528 } 4529 4530 /* 4531 ------------------------------------------------------------------------------- 4532 Returns the result of converting the quadruple-precision floating-point 4533 value `a' to the single-precision floating-point format. The conversion 4534 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4535 Arithmetic. 4536 ------------------------------------------------------------------------------- 4537 */ 4538 float32 float128_to_float32( float128 a ) 4539 { 4540 flag aSign; 4541 int32 aExp; 4542 bits64 aSig0, aSig1; 4543 bits32 zSig; 4544 4545 aSig1 = extractFloat128Frac1( a ); 4546 aSig0 = extractFloat128Frac0( a ); 4547 aExp = extractFloat128Exp( a ); 4548 aSign = extractFloat128Sign( a ); 4549 if ( aExp == 0x7FFF ) { 4550 if ( aSig0 | aSig1 ) { 4551 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4552 } 4553 return packFloat32( aSign, 0xFF, 0 ); 4554 } 4555 aSig0 |= ( aSig1 != 0 ); 4556 shift64RightJamming( aSig0, 18, &aSig0 ); 4557 zSig = aSig0; 4558 if ( aExp || zSig ) { 4559 zSig |= 0x40000000; 4560 aExp -= 0x3F81; 4561 } 4562 return roundAndPackFloat32( aSign, aExp, zSig ); 4563 4564 } 4565 4566 /* 4567 ------------------------------------------------------------------------------- 4568 Returns the result of converting the quadruple-precision floating-point 4569 value `a' to the double-precision floating-point format. The conversion 4570 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4571 Arithmetic. 4572 ------------------------------------------------------------------------------- 4573 */ 4574 float64 float128_to_float64( float128 a ) 4575 { 4576 flag aSign; 4577 int32 aExp; 4578 bits64 aSig0, aSig1; 4579 4580 aSig1 = extractFloat128Frac1( a ); 4581 aSig0 = extractFloat128Frac0( a ); 4582 aExp = extractFloat128Exp( a ); 4583 aSign = extractFloat128Sign( a ); 4584 if ( aExp == 0x7FFF ) { 4585 if ( aSig0 | aSig1 ) { 4586 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4587 } 4588 return packFloat64( aSign, 0x7FF, 0 ); 4589 } 4590 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4591 aSig0 |= ( aSig1 != 0 ); 4592 if ( aExp || aSig0 ) { 4593 aSig0 |= LIT64( 0x4000000000000000 ); 4594 aExp -= 0x3C01; 4595 } 4596 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4597 4598 } 4599 4600 #ifdef FLOATX80 4601 4602 /* 4603 ------------------------------------------------------------------------------- 4604 Returns the result of converting the quadruple-precision floating-point 4605 value `a' to the extended double-precision floating-point format. The 4606 conversion is performed according to the IEC/IEEE Standard for Binary 4607 Floating-Point Arithmetic. 4608 ------------------------------------------------------------------------------- 4609 */ 4610 floatx80 float128_to_floatx80( float128 a ) 4611 { 4612 flag aSign; 4613 int32 aExp; 4614 bits64 aSig0, aSig1; 4615 4616 aSig1 = extractFloat128Frac1( a ); 4617 aSig0 = extractFloat128Frac0( a ); 4618 aExp = extractFloat128Exp( a ); 4619 aSign = extractFloat128Sign( a ); 4620 if ( aExp == 0x7FFF ) { 4621 if ( aSig0 | aSig1 ) { 4622 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4623 } 4624 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4625 } 4626 if ( aExp == 0 ) { 4627 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4628 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4629 } 4630 else { 4631 aSig0 |= LIT64( 0x0001000000000000 ); 4632 } 4633 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4634 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4635 4636 } 4637 4638 #endif 4639 4640 /* 4641 ------------------------------------------------------------------------------- 4642 Rounds the quadruple-precision floating-point value `a' to an integer, and 4643 returns the result as a quadruple-precision floating-point value. The 4644 operation is performed according to the IEC/IEEE Standard for Binary 4645 Floating-Point Arithmetic. 4646 ------------------------------------------------------------------------------- 4647 */ 4648 float128 float128_round_to_int( float128 a ) 4649 { 4650 flag aSign; 4651 int32 aExp; 4652 bits64 lastBitMask, roundBitsMask; 4653 int8 roundingMode; 4654 float128 z; 4655 4656 aExp = extractFloat128Exp( a ); 4657 if ( 0x402F <= aExp ) { 4658 if ( 0x406F <= aExp ) { 4659 if ( ( aExp == 0x7FFF ) 4660 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4661 ) { 4662 return propagateFloat128NaN( a, a ); 4663 } 4664 return a; 4665 } 4666 lastBitMask = 1; 4667 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4668 roundBitsMask = lastBitMask - 1; 4669 z = a; 4670 roundingMode = float_rounding_mode(); 4671 if ( roundingMode == float_round_nearest_even ) { 4672 if ( lastBitMask ) { 4673 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4674 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4675 } 4676 else { 4677 if ( (sbits64) z.low < 0 ) { 4678 ++z.high; 4679 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4680 } 4681 } 4682 } 4683 else if ( roundingMode != float_round_to_zero ) { 4684 if ( extractFloat128Sign( z ) 4685 ^ ( roundingMode == float_round_up ) ) { 4686 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4687 } 4688 } 4689 z.low &= ~ roundBitsMask; 4690 } 4691 else { 4692 if ( aExp < 0x3FFF ) { 4693 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4694 float_set_inexact(); 4695 aSign = extractFloat128Sign( a ); 4696 switch ( float_rounding_mode() ) { 4697 case float_round_nearest_even: 4698 if ( ( aExp == 0x3FFE ) 4699 && ( extractFloat128Frac0( a ) 4700 | extractFloat128Frac1( a ) ) 4701 ) { 4702 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4703 } 4704 break; 4705 case float_round_down: 4706 return 4707 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4708 : packFloat128( 0, 0, 0, 0 ); 4709 case float_round_up: 4710 return 4711 aSign ? packFloat128( 1, 0, 0, 0 ) 4712 : packFloat128( 0, 0x3FFF, 0, 0 ); 4713 } 4714 return packFloat128( aSign, 0, 0, 0 ); 4715 } 4716 lastBitMask = 1; 4717 lastBitMask <<= 0x402F - aExp; 4718 roundBitsMask = lastBitMask - 1; 4719 z.low = 0; 4720 z.high = a.high; 4721 roundingMode = float_rounding_mode(); 4722 if ( roundingMode == float_round_nearest_even ) { 4723 z.high += lastBitMask>>1; 4724 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4725 z.high &= ~ lastBitMask; 4726 } 4727 } 4728 else if ( roundingMode != float_round_to_zero ) { 4729 if ( extractFloat128Sign( z ) 4730 ^ ( roundingMode == float_round_up ) ) { 4731 z.high |= ( a.low != 0 ); 4732 z.high += roundBitsMask; 4733 } 4734 } 4735 z.high &= ~ roundBitsMask; 4736 } 4737 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4738 float_set_inexact(); 4739 } 4740 return z; 4741 4742 } 4743 4744 /* 4745 ------------------------------------------------------------------------------- 4746 Returns the result of adding the absolute values of the quadruple-precision 4747 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4748 before being returned. `zSign' is ignored if the result is a NaN. 4749 The addition is performed according to the IEC/IEEE Standard for Binary 4750 Floating-Point Arithmetic. 4751 ------------------------------------------------------------------------------- 4752 */ 4753 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4754 { 4755 int32 aExp, bExp, zExp; 4756 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4757 int32 expDiff; 4758 4759 aSig1 = extractFloat128Frac1( a ); 4760 aSig0 = extractFloat128Frac0( a ); 4761 aExp = extractFloat128Exp( a ); 4762 bSig1 = extractFloat128Frac1( b ); 4763 bSig0 = extractFloat128Frac0( b ); 4764 bExp = extractFloat128Exp( b ); 4765 expDiff = aExp - bExp; 4766 if ( 0 < expDiff ) { 4767 if ( aExp == 0x7FFF ) { 4768 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4769 return a; 4770 } 4771 if ( bExp == 0 ) { 4772 --expDiff; 4773 } 4774 else { 4775 bSig0 |= LIT64( 0x0001000000000000 ); 4776 } 4777 shift128ExtraRightJamming( 4778 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4779 zExp = aExp; 4780 } 4781 else if ( expDiff < 0 ) { 4782 if ( bExp == 0x7FFF ) { 4783 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4784 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4785 } 4786 if ( aExp == 0 ) { 4787 ++expDiff; 4788 } 4789 else { 4790 aSig0 |= LIT64( 0x0001000000000000 ); 4791 } 4792 shift128ExtraRightJamming( 4793 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4794 zExp = bExp; 4795 } 4796 else { 4797 if ( aExp == 0x7FFF ) { 4798 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4799 return propagateFloat128NaN( a, b ); 4800 } 4801 return a; 4802 } 4803 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4804 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4805 zSig2 = 0; 4806 zSig0 |= LIT64( 0x0002000000000000 ); 4807 zExp = aExp; 4808 goto shiftRight1; 4809 } 4810 aSig0 |= LIT64( 0x0001000000000000 ); 4811 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4812 --zExp; 4813 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4814 ++zExp; 4815 shiftRight1: 4816 shift128ExtraRightJamming( 4817 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4818 roundAndPack: 4819 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4820 4821 } 4822 4823 /* 4824 ------------------------------------------------------------------------------- 4825 Returns the result of subtracting the absolute values of the quadruple- 4826 precision floating-point values `a' and `b'. If `zSign' is 1, the 4827 difference is negated before being returned. `zSign' is ignored if the 4828 result is a NaN. The subtraction is performed according to the IEC/IEEE 4829 Standard for Binary Floating-Point Arithmetic. 4830 ------------------------------------------------------------------------------- 4831 */ 4832 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4833 { 4834 int32 aExp, bExp, zExp; 4835 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4836 int32 expDiff; 4837 float128 z; 4838 4839 aSig1 = extractFloat128Frac1( a ); 4840 aSig0 = extractFloat128Frac0( a ); 4841 aExp = extractFloat128Exp( a ); 4842 bSig1 = extractFloat128Frac1( b ); 4843 bSig0 = extractFloat128Frac0( b ); 4844 bExp = extractFloat128Exp( b ); 4845 expDiff = aExp - bExp; 4846 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4847 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4848 if ( 0 < expDiff ) goto aExpBigger; 4849 if ( expDiff < 0 ) goto bExpBigger; 4850 if ( aExp == 0x7FFF ) { 4851 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4852 return propagateFloat128NaN( a, b ); 4853 } 4854 float_raise( float_flag_invalid ); 4855 z.low = float128_default_nan_low; 4856 z.high = float128_default_nan_high; 4857 return z; 4858 } 4859 if ( aExp == 0 ) { 4860 aExp = 1; 4861 bExp = 1; 4862 } 4863 if ( bSig0 < aSig0 ) goto aBigger; 4864 if ( aSig0 < bSig0 ) goto bBigger; 4865 if ( bSig1 < aSig1 ) goto aBigger; 4866 if ( aSig1 < bSig1 ) goto bBigger; 4867 return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 ); 4868 bExpBigger: 4869 if ( bExp == 0x7FFF ) { 4870 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4871 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4872 } 4873 if ( aExp == 0 ) { 4874 ++expDiff; 4875 } 4876 else { 4877 aSig0 |= LIT64( 0x4000000000000000 ); 4878 } 4879 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4880 bSig0 |= LIT64( 0x4000000000000000 ); 4881 bBigger: 4882 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4883 zExp = bExp; 4884 zSign ^= 1; 4885 goto normalizeRoundAndPack; 4886 aExpBigger: 4887 if ( aExp == 0x7FFF ) { 4888 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4889 return a; 4890 } 4891 if ( bExp == 0 ) { 4892 --expDiff; 4893 } 4894 else { 4895 bSig0 |= LIT64( 0x4000000000000000 ); 4896 } 4897 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4898 aSig0 |= LIT64( 0x4000000000000000 ); 4899 aBigger: 4900 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4901 zExp = aExp; 4902 normalizeRoundAndPack: 4903 --zExp; 4904 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4905 4906 } 4907 4908 /* 4909 ------------------------------------------------------------------------------- 4910 Returns the result of adding the quadruple-precision floating-point values 4911 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4912 for Binary Floating-Point Arithmetic. 4913 ------------------------------------------------------------------------------- 4914 */ 4915 float128 float128_add( float128 a, float128 b ) 4916 { 4917 flag aSign, bSign; 4918 4919 aSign = extractFloat128Sign( a ); 4920 bSign = extractFloat128Sign( b ); 4921 if ( aSign == bSign ) { 4922 return addFloat128Sigs( a, b, aSign ); 4923 } 4924 else { 4925 return subFloat128Sigs( a, b, aSign ); 4926 } 4927 4928 } 4929 4930 /* 4931 ------------------------------------------------------------------------------- 4932 Returns the result of subtracting the quadruple-precision floating-point 4933 values `a' and `b'. The operation is performed according to the IEC/IEEE 4934 Standard for Binary Floating-Point Arithmetic. 4935 ------------------------------------------------------------------------------- 4936 */ 4937 float128 float128_sub( float128 a, float128 b ) 4938 { 4939 flag aSign, bSign; 4940 4941 aSign = extractFloat128Sign( a ); 4942 bSign = extractFloat128Sign( b ); 4943 if ( aSign == bSign ) { 4944 return subFloat128Sigs( a, b, aSign ); 4945 } 4946 else { 4947 return addFloat128Sigs( a, b, aSign ); 4948 } 4949 4950 } 4951 4952 /* 4953 ------------------------------------------------------------------------------- 4954 Returns the result of multiplying the quadruple-precision floating-point 4955 values `a' and `b'. The operation is performed according to the IEC/IEEE 4956 Standard for Binary Floating-Point Arithmetic. 4957 ------------------------------------------------------------------------------- 4958 */ 4959 float128 float128_mul( float128 a, float128 b ) 4960 { 4961 flag aSign, bSign, zSign; 4962 int32 aExp, bExp, zExp; 4963 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4964 float128 z; 4965 4966 aSig1 = extractFloat128Frac1( a ); 4967 aSig0 = extractFloat128Frac0( a ); 4968 aExp = extractFloat128Exp( a ); 4969 aSign = extractFloat128Sign( a ); 4970 bSig1 = extractFloat128Frac1( b ); 4971 bSig0 = extractFloat128Frac0( b ); 4972 bExp = extractFloat128Exp( b ); 4973 bSign = extractFloat128Sign( b ); 4974 zSign = aSign ^ bSign; 4975 if ( aExp == 0x7FFF ) { 4976 if ( ( aSig0 | aSig1 ) 4977 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4978 return propagateFloat128NaN( a, b ); 4979 } 4980 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4981 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4982 } 4983 if ( bExp == 0x7FFF ) { 4984 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4985 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4986 invalid: 4987 float_raise( float_flag_invalid ); 4988 z.low = float128_default_nan_low; 4989 z.high = float128_default_nan_high; 4990 return z; 4991 } 4992 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4993 } 4994 if ( aExp == 0 ) { 4995 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4996 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4997 } 4998 if ( bExp == 0 ) { 4999 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5000 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5001 } 5002 zExp = aExp + bExp - 0x4000; 5003 aSig0 |= LIT64( 0x0001000000000000 ); 5004 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 5005 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 5006 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5007 zSig2 |= ( zSig3 != 0 ); 5008 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 5009 shift128ExtraRightJamming( 5010 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5011 ++zExp; 5012 } 5013 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5014 5015 } 5016 5017 /* 5018 ------------------------------------------------------------------------------- 5019 Returns the result of dividing the quadruple-precision floating-point value 5020 `a' by the corresponding value `b'. The operation is performed according to 5021 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5022 ------------------------------------------------------------------------------- 5023 */ 5024 float128 float128_div( float128 a, float128 b ) 5025 { 5026 flag aSign, bSign, zSign; 5027 int32 aExp, bExp, zExp; 5028 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5029 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5030 float128 z; 5031 5032 aSig1 = extractFloat128Frac1( a ); 5033 aSig0 = extractFloat128Frac0( a ); 5034 aExp = extractFloat128Exp( a ); 5035 aSign = extractFloat128Sign( a ); 5036 bSig1 = extractFloat128Frac1( b ); 5037 bSig0 = extractFloat128Frac0( b ); 5038 bExp = extractFloat128Exp( b ); 5039 bSign = extractFloat128Sign( b ); 5040 zSign = aSign ^ bSign; 5041 if ( aExp == 0x7FFF ) { 5042 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 5043 if ( bExp == 0x7FFF ) { 5044 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5045 goto invalid; 5046 } 5047 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5048 } 5049 if ( bExp == 0x7FFF ) { 5050 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5051 return packFloat128( zSign, 0, 0, 0 ); 5052 } 5053 if ( bExp == 0 ) { 5054 if ( ( bSig0 | bSig1 ) == 0 ) { 5055 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5056 invalid: 5057 float_raise( float_flag_invalid ); 5058 z.low = float128_default_nan_low; 5059 z.high = float128_default_nan_high; 5060 return z; 5061 } 5062 float_raise( float_flag_divbyzero ); 5063 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5064 } 5065 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5066 } 5067 if ( aExp == 0 ) { 5068 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5069 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5070 } 5071 zExp = aExp - bExp + 0x3FFD; 5072 shortShift128Left( 5073 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 5074 shortShift128Left( 5075 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5076 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 5077 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 5078 ++zExp; 5079 } 5080 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5081 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 5082 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 5083 while ( (sbits64) rem0 < 0 ) { 5084 --zSig0; 5085 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5086 } 5087 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5088 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5089 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5090 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5091 while ( (sbits64) rem1 < 0 ) { 5092 --zSig1; 5093 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5094 } 5095 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5096 } 5097 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5098 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5099 5100 } 5101 5102 /* 5103 ------------------------------------------------------------------------------- 5104 Returns the remainder of the quadruple-precision floating-point value `a' 5105 with respect to the corresponding value `b'. The operation is performed 5106 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5107 ------------------------------------------------------------------------------- 5108 */ 5109 float128 float128_rem( float128 a, float128 b ) 5110 { 5111 flag aSign, bSign, zSign; 5112 int32 aExp, bExp, expDiff; 5113 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5114 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5115 sbits64 sigMean0; 5116 float128 z; 5117 5118 aSig1 = extractFloat128Frac1( a ); 5119 aSig0 = extractFloat128Frac0( a ); 5120 aExp = extractFloat128Exp( a ); 5121 aSign = extractFloat128Sign( a ); 5122 bSig1 = extractFloat128Frac1( b ); 5123 bSig0 = extractFloat128Frac0( b ); 5124 bExp = extractFloat128Exp( b ); 5125 bSign = extractFloat128Sign( b ); 5126 if ( aExp == 0x7FFF ) { 5127 if ( ( aSig0 | aSig1 ) 5128 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5129 return propagateFloat128NaN( a, b ); 5130 } 5131 goto invalid; 5132 } 5133 if ( bExp == 0x7FFF ) { 5134 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5135 return a; 5136 } 5137 if ( bExp == 0 ) { 5138 if ( ( bSig0 | bSig1 ) == 0 ) { 5139 invalid: 5140 float_raise( float_flag_invalid ); 5141 z.low = float128_default_nan_low; 5142 z.high = float128_default_nan_high; 5143 return z; 5144 } 5145 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5146 } 5147 if ( aExp == 0 ) { 5148 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5149 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5150 } 5151 expDiff = aExp - bExp; 5152 if ( expDiff < -1 ) return a; 5153 shortShift128Left( 5154 aSig0 | LIT64( 0x0001000000000000 ), 5155 aSig1, 5156 15 - ( expDiff < 0 ), 5157 &aSig0, 5158 &aSig1 5159 ); 5160 shortShift128Left( 5161 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5162 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5163 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5164 expDiff -= 64; 5165 while ( 0 < expDiff ) { 5166 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5167 q = ( 4 < q ) ? q - 4 : 0; 5168 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5169 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5170 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5171 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5172 expDiff -= 61; 5173 } 5174 if ( -64 < expDiff ) { 5175 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5176 q = ( 4 < q ) ? q - 4 : 0; 5177 q >>= - expDiff; 5178 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5179 expDiff += 52; 5180 if ( expDiff < 0 ) { 5181 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5182 } 5183 else { 5184 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5185 } 5186 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5187 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5188 } 5189 else { 5190 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5191 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5192 } 5193 do { 5194 alternateASig0 = aSig0; 5195 alternateASig1 = aSig1; 5196 ++q; 5197 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5198 } while ( 0 <= (sbits64) aSig0 ); 5199 add128( 5200 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 ); 5201 if ( ( sigMean0 < 0 ) 5202 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5203 aSig0 = alternateASig0; 5204 aSig1 = alternateASig1; 5205 } 5206 zSign = ( (sbits64) aSig0 < 0 ); 5207 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5208 return 5209 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5210 5211 } 5212 5213 /* 5214 ------------------------------------------------------------------------------- 5215 Returns the square root of the quadruple-precision floating-point value `a'. 5216 The operation is performed according to the IEC/IEEE Standard for Binary 5217 Floating-Point Arithmetic. 5218 ------------------------------------------------------------------------------- 5219 */ 5220 float128 float128_sqrt( float128 a ) 5221 { 5222 flag aSign; 5223 int32 aExp, zExp; 5224 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5225 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5226 float128 z; 5227 5228 aSig1 = extractFloat128Frac1( a ); 5229 aSig0 = extractFloat128Frac0( a ); 5230 aExp = extractFloat128Exp( a ); 5231 aSign = extractFloat128Sign( a ); 5232 if ( aExp == 0x7FFF ) { 5233 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5234 if ( ! aSign ) return a; 5235 goto invalid; 5236 } 5237 if ( aSign ) { 5238 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5239 invalid: 5240 float_raise( float_flag_invalid ); 5241 z.low = float128_default_nan_low; 5242 z.high = float128_default_nan_high; 5243 return z; 5244 } 5245 if ( aExp == 0 ) { 5246 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5247 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5248 } 5249 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5250 aSig0 |= LIT64( 0x0001000000000000 ); 5251 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5252 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5253 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5254 doubleZSig0 = zSig0<<1; 5255 mul64To128( zSig0, zSig0, &term0, &term1 ); 5256 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5257 while ( (sbits64) rem0 < 0 ) { 5258 --zSig0; 5259 doubleZSig0 -= 2; 5260 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5261 } 5262 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5263 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5264 if ( zSig1 == 0 ) zSig1 = 1; 5265 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5266 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5267 mul64To128( zSig1, zSig1, &term2, &term3 ); 5268 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5269 while ( (sbits64) rem1 < 0 ) { 5270 --zSig1; 5271 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5272 term3 |= 1; 5273 term2 |= doubleZSig0; 5274 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5275 } 5276 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5277 } 5278 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5279 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5280 5281 } 5282 5283 /* 5284 ------------------------------------------------------------------------------- 5285 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5286 the corresponding value `b', and 0 otherwise. The comparison is performed 5287 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5288 ------------------------------------------------------------------------------- 5289 */ 5290 flag float128_eq( float128 a, float128 b ) 5291 { 5292 5293 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5294 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5295 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5296 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5297 ) { 5298 if ( float128_is_signaling_nan( a ) 5299 || float128_is_signaling_nan( b ) ) { 5300 float_raise( float_flag_invalid ); 5301 } 5302 return 0; 5303 } 5304 return 5305 ( a.low == b.low ) 5306 && ( ( a.high == b.high ) 5307 || ( ( a.low == 0 ) 5308 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5309 ); 5310 5311 } 5312 5313 /* 5314 ------------------------------------------------------------------------------- 5315 Returns 1 if the quadruple-precision floating-point value `a' is less than 5316 or equal to the corresponding value `b', and 0 otherwise. The comparison 5317 is performed according to the IEC/IEEE Standard for Binary Floating-Point 5318 Arithmetic. 5319 ------------------------------------------------------------------------------- 5320 */ 5321 flag float128_le( float128 a, float128 b ) 5322 { 5323 flag aSign, bSign; 5324 5325 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5326 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5327 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5328 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5329 ) { 5330 float_raise( float_flag_invalid ); 5331 return 0; 5332 } 5333 aSign = extractFloat128Sign( a ); 5334 bSign = extractFloat128Sign( b ); 5335 if ( aSign != bSign ) { 5336 return 5337 aSign 5338 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5339 == 0 ); 5340 } 5341 return 5342 aSign ? le128( b.high, b.low, a.high, a.low ) 5343 : le128( a.high, a.low, b.high, b.low ); 5344 5345 } 5346 5347 /* 5348 ------------------------------------------------------------------------------- 5349 Returns 1 if the quadruple-precision floating-point value `a' is less than 5350 the corresponding value `b', and 0 otherwise. The comparison is performed 5351 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5352 ------------------------------------------------------------------------------- 5353 */ 5354 flag float128_lt( float128 a, float128 b ) 5355 { 5356 flag aSign, bSign; 5357 5358 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5359 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5360 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5361 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5362 ) { 5363 float_raise( float_flag_invalid ); 5364 return 0; 5365 } 5366 aSign = extractFloat128Sign( a ); 5367 bSign = extractFloat128Sign( b ); 5368 if ( aSign != bSign ) { 5369 return 5370 aSign 5371 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5372 != 0 ); 5373 } 5374 return 5375 aSign ? lt128( b.high, b.low, a.high, a.low ) 5376 : lt128( a.high, a.low, b.high, b.low ); 5377 5378 } 5379 5380 /* 5381 ------------------------------------------------------------------------------- 5382 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5383 the corresponding value `b', and 0 otherwise. The invalid exception is 5384 raised if either operand is a NaN. Otherwise, the comparison is performed 5385 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5386 ------------------------------------------------------------------------------- 5387 */ 5388 flag float128_eq_signaling( float128 a, float128 b ) 5389 { 5390 5391 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5392 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5393 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5394 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5395 ) { 5396 float_raise( float_flag_invalid ); 5397 return 0; 5398 } 5399 return 5400 ( a.low == b.low ) 5401 && ( ( a.high == b.high ) 5402 || ( ( a.low == 0 ) 5403 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5404 ); 5405 5406 } 5407 5408 /* 5409 ------------------------------------------------------------------------------- 5410 Returns 1 if the quadruple-precision floating-point value `a' is less than 5411 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5412 cause an exception. Otherwise, the comparison is performed according to the 5413 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5414 ------------------------------------------------------------------------------- 5415 */ 5416 flag float128_le_quiet( float128 a, float128 b ) 5417 { 5418 flag aSign, bSign; 5419 5420 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5421 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5422 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5423 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5424 ) { 5425 if ( float128_is_signaling_nan( a ) 5426 || float128_is_signaling_nan( b ) ) { 5427 float_raise( float_flag_invalid ); 5428 } 5429 return 0; 5430 } 5431 aSign = extractFloat128Sign( a ); 5432 bSign = extractFloat128Sign( b ); 5433 if ( aSign != bSign ) { 5434 return 5435 aSign 5436 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5437 == 0 ); 5438 } 5439 return 5440 aSign ? le128( b.high, b.low, a.high, a.low ) 5441 : le128( a.high, a.low, b.high, b.low ); 5442 5443 } 5444 5445 /* 5446 ------------------------------------------------------------------------------- 5447 Returns 1 if the quadruple-precision floating-point value `a' is less than 5448 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5449 exception. Otherwise, the comparison is performed according to the IEC/IEEE 5450 Standard for Binary Floating-Point Arithmetic. 5451 ------------------------------------------------------------------------------- 5452 */ 5453 flag float128_lt_quiet( float128 a, float128 b ) 5454 { 5455 flag aSign, bSign; 5456 5457 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5458 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5459 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5460 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5461 ) { 5462 if ( float128_is_signaling_nan( a ) 5463 || float128_is_signaling_nan( b ) ) { 5464 float_raise( float_flag_invalid ); 5465 } 5466 return 0; 5467 } 5468 aSign = extractFloat128Sign( a ); 5469 bSign = extractFloat128Sign( b ); 5470 if ( aSign != bSign ) { 5471 return 5472 aSign 5473 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5474 != 0 ); 5475 } 5476 return 5477 aSign ? lt128( b.high, b.low, a.high, a.low ) 5478 : lt128( a.high, a.low, b.high, b.low ); 5479 5480 } 5481 5482 #endif 5483 5484 5485 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5486 5487 /* 5488 * These two routines are not part of the original softfloat distribution. 5489 * 5490 * They are based on the corresponding conversions to integer but return 5491 * unsigned numbers instead since these functions are required by GCC. 5492 * 5493 * Added by Mark Brinicombe <mark@netbsd.org> 27/09/97 5494 * 5495 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5496 */ 5497 5498 /* 5499 ------------------------------------------------------------------------------- 5500 Returns the result of converting the double-precision floating-point value 5501 `a' to the 32-bit unsigned integer format. The conversion is 5502 performed according to the IEC/IEEE Standard for Binary Floating-point 5503 Arithmetic, except that the conversion is always rounded toward zero. If 5504 `a' is a NaN, the largest positive integer is returned. If the conversion 5505 overflows, the largest integer positive is returned. 5506 ------------------------------------------------------------------------------- 5507 */ 5508 uint32 float64_to_uint32_round_to_zero( float64 a ) 5509 { 5510 flag aSign; 5511 int16 aExp, shiftCount; 5512 bits64 aSig, savedASig; 5513 uint32 z; 5514 5515 aSig = extractFloat64Frac( a ); 5516 aExp = extractFloat64Exp( a ); 5517 aSign = extractFloat64Sign( a ); 5518 5519 if (aSign) { 5520 float_raise( float_flag_invalid ); 5521 return(0); 5522 } 5523 5524 if ( 0x41E < aExp ) { 5525 float_raise( float_flag_invalid ); 5526 return 0xffffffff; 5527 } 5528 else if ( aExp < 0x3FF ) { 5529 if ( aExp || aSig ) float_set_inexact(); 5530 return 0; 5531 } 5532 aSig |= LIT64( 0x0010000000000000 ); 5533 shiftCount = 0x433 - aExp; 5534 savedASig = aSig; 5535 aSig >>= shiftCount; 5536 z = aSig; 5537 if ( ( aSig<<shiftCount ) != savedASig ) { 5538 float_set_inexact(); 5539 } 5540 return z; 5541 5542 } 5543 5544 /* 5545 ------------------------------------------------------------------------------- 5546 Returns the result of converting the single-precision floating-point value 5547 `a' to the 32-bit unsigned integer format. The conversion is 5548 performed according to the IEC/IEEE Standard for Binary Floating-point 5549 Arithmetic, except that the conversion is always rounded toward zero. If 5550 `a' is a NaN, the largest positive integer is returned. If the conversion 5551 overflows, the largest positive integer is returned. 5552 ------------------------------------------------------------------------------- 5553 */ 5554 uint32 float32_to_uint32_round_to_zero( float32 a ) 5555 { 5556 flag aSign; 5557 int16 aExp, shiftCount; 5558 bits32 aSig; 5559 uint32 z; 5560 5561 aSig = extractFloat32Frac( a ); 5562 aExp = extractFloat32Exp( a ); 5563 aSign = extractFloat32Sign( a ); 5564 shiftCount = aExp - 0x9E; 5565 5566 if (aSign) { 5567 float_raise( float_flag_invalid ); 5568 return(0); 5569 } 5570 if ( 0 < shiftCount ) { 5571 float_raise( float_flag_invalid ); 5572 return 0xFFFFFFFF; 5573 } 5574 else if ( aExp <= 0x7E ) { 5575 if ( aExp | aSig ) float_set_inexact(); 5576 return 0; 5577 } 5578 aSig = ( aSig | 0x800000 )<<8; 5579 z = aSig>>( - shiftCount ); 5580 if ( aSig<<( shiftCount & 31 ) ) { 5581 float_set_inexact(); 5582 } 5583 return z; 5584 5585 } 5586 5587 #endif 5588