1 /* $OpenBSD: softfloat.c,v 1.5 2012/12/05 23:20:22 deraadt Exp $ */ 2 /* $NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $ */ 3 4 /* 5 * This version hacked for use with gcc -msoft-float by bjh21. 6 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 7 * itself). 8 */ 9 10 /* 11 * Things you may want to define: 12 * 13 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 14 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 15 * properly renamed. 16 */ 17 18 /* 19 =============================================================================== 20 21 This C source file is part of the SoftFloat IEC/IEEE Floating-point 22 Arithmetic Package, Release 2a. 23 24 Written by John R. Hauser. This work was made possible in part by the 25 International Computer Science Institute, located at Suite 600, 1947 Center 26 Street, Berkeley, California 94704. Funding was partially provided by the 27 National Science Foundation under grant MIP-9311980. The original version 28 of this code was written as part of a project to build a fixed-point vector 29 processor in collaboration with the University of California at Berkeley, 30 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 31 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 32 arithmetic/SoftFloat.html'. 33 34 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable 35 effort has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT 36 WILL AT TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS 37 RESTRICTED TO PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL 38 RESPONSIBILITY FOR ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM 39 THEIR OWN USE OF THE SOFTWARE, AND WHO ALSO EFFECTIVELY INDEMNIFY 40 (possibly via similar legal warning) JOHN HAUSER AND THE INTERNATIONAL 41 COMPUTER SCIENCE INSTITUTE AGAINST ALL LOSSES, COSTS, OR OTHER PROBLEMS 42 ARISING FROM THE USE OF THE SOFTWARE BY THEIR CUSTOMERS AND CLIENTS. 43 44 Derivative works are acceptable, even for commercial purposes, so long as 45 (1) they include prominent notice that the work is derivative, and (2) they 46 include prominent notice akin to these four paragraphs for those parts of 47 this code that are retained. 48 49 =============================================================================== 50 */ 51 52 #ifdef SOFTFLOAT_FOR_GCC 53 #include "softfloat-for-gcc.h" 54 #endif 55 56 #include "milieu.h" 57 #include "softfloat.h" 58 59 /* 60 * Conversions between floats as stored in memory and floats as 61 * SoftFloat uses them 62 */ 63 #ifndef FLOAT64_DEMANGLE 64 #define FLOAT64_DEMANGLE(a) (a) 65 #endif 66 #ifndef FLOAT64_MANGLE 67 #define FLOAT64_MANGLE(a) (a) 68 #endif 69 70 /* 71 ------------------------------------------------------------------------------- 72 Floating-point rounding mode, extended double-precision rounding precision, 73 and exception flags. 74 ------------------------------------------------------------------------------- 75 */ 76 77 /* 78 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday. 79 * Right now, it does not. I've removed all other dynamic global 80 * variables. [ross] 81 */ 82 #ifdef FLOATX80 83 int8 floatx80_rounding_precision = 80; 84 #endif 85 86 /* 87 ------------------------------------------------------------------------------- 88 Primitive arithmetic functions, including multi-word arithmetic, and 89 division and square root approximations. (Can be specialized to target if 90 desired.) 91 ------------------------------------------------------------------------------- 92 */ 93 #include "softfloat-macros.h" 94 95 /* 96 ------------------------------------------------------------------------------- 97 Functions and definitions to determine: (1) whether tininess for underflow 98 is detected before or after rounding by default, (2) what (if anything) 99 happens when exceptions are raised, (3) how signaling NaNs are distinguished 100 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 101 are propagated from function inputs to output. These details are target- 102 specific. 103 ------------------------------------------------------------------------------- 104 */ 105 #include "softfloat-specialize.h" 106 107 #ifndef SOFTFLOAT_FOR_GCC /* Not used */ 108 /* 109 ------------------------------------------------------------------------------- 110 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 111 and 7, and returns the properly rounded 32-bit integer corresponding to the 112 input. If `zSign' is 1, the input is negated before being converted to an 113 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 114 is simply rounded to an integer, with the inexact exception raised if the 115 input cannot be represented exactly as an integer. However, if the fixed- 116 point input is too large, the invalid exception is raised and the largest 117 positive or negative integer is returned. 118 ------------------------------------------------------------------------------- 119 */ 120 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 121 { 122 int8 roundingMode; 123 flag roundNearestEven; 124 int8 roundIncrement, roundBits; 125 int32 z; 126 127 roundingMode = float_rounding_mode(); 128 roundNearestEven = ( roundingMode == float_round_nearest_even ); 129 roundIncrement = 0x40; 130 if ( ! roundNearestEven ) { 131 if ( roundingMode == float_round_to_zero ) { 132 roundIncrement = 0; 133 } 134 else { 135 roundIncrement = 0x7F; 136 if ( zSign ) { 137 if ( roundingMode == float_round_up ) roundIncrement = 0; 138 } 139 else { 140 if ( roundingMode == float_round_down ) roundIncrement = 0; 141 } 142 } 143 } 144 roundBits = absZ & 0x7F; 145 absZ = ( absZ + roundIncrement )>>7; 146 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 147 z = absZ; 148 if ( zSign ) z = - z; 149 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 150 float_raise( float_flag_invalid ); 151 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 152 } 153 if ( roundBits ) float_set_inexact(); 154 return z; 155 156 } 157 158 /* 159 ------------------------------------------------------------------------------- 160 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 161 `absZ1', with binary point between bits 63 and 64 (between the input words), 162 and returns the properly rounded 64-bit integer corresponding to the input. 163 If `zSign' is 1, the input is negated before being converted to an integer. 164 Ordinarily, the fixed-point input is simply rounded to an integer, with 165 the inexact exception raised if the input cannot be represented exactly as 166 an integer. However, if the fixed-point input is too large, the invalid 167 exception is raised and the largest positive or negative integer is 168 returned. 169 ------------------------------------------------------------------------------- 170 */ 171 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 172 { 173 int8 roundingMode; 174 flag roundNearestEven, increment; 175 int64 z; 176 177 roundingMode = float_rounding_mode(); 178 roundNearestEven = ( roundingMode == float_round_nearest_even ); 179 increment = ( (sbits64) absZ1 < 0 ); 180 if ( ! roundNearestEven ) { 181 if ( roundingMode == float_round_to_zero ) { 182 increment = 0; 183 } 184 else { 185 if ( zSign ) { 186 increment = ( roundingMode == float_round_down ) && absZ1; 187 } 188 else { 189 increment = ( roundingMode == float_round_up ) && absZ1; 190 } 191 } 192 } 193 if ( increment ) { 194 ++absZ0; 195 if ( absZ0 == 0 ) goto overflow; 196 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 197 } 198 z = absZ0; 199 if ( zSign ) z = - z; 200 if ( z && ( ( z < 0 ) ^ zSign ) ) { 201 overflow: 202 float_raise( float_flag_invalid ); 203 return 204 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 205 : LIT64( 0x7FFFFFFFFFFFFFFF ); 206 } 207 if ( absZ1 ) float_set_inexact(); 208 return z; 209 210 } 211 212 #ifdef __alpha__ 213 /* 214 ------------------------------------------------------------------------------- 215 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 216 `absZ1', with binary point between bits 63 and 64 (between the input words), 217 and returns the properly rounded 64-bit integer corresponding to the input. 218 If `zSign' is 1, the input is negated before being converted to an integer. 219 Ordinarily, the fixed-point input is simply rounded to an integer, with 220 the inexact exception raised if the input cannot be represented exactly as 221 an integer. 222 ------------------------------------------------------------------------------- 223 */ 224 static int64 roundAndPackInt64NoOverflow( flag zSign, bits64 absZ0, 225 bits64 absZ1 ) 226 { 227 int8 roundingMode; 228 flag roundNearestEven, increment; 229 int64 z; 230 231 roundingMode = float_rounding_mode(); 232 roundNearestEven = ( roundingMode == float_round_nearest_even ); 233 increment = ( (sbits64) absZ1 < 0 ); 234 if ( ! roundNearestEven ) { 235 if ( roundingMode == float_round_to_zero ) { 236 increment = 0; 237 } 238 else { 239 if ( zSign ) { 240 increment = ( roundingMode == float_round_down ) && absZ1; 241 } 242 else { 243 increment = ( roundingMode == float_round_up ) && absZ1; 244 } 245 } 246 } 247 if ( increment ) { 248 ++absZ0; 249 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 250 } 251 z = absZ0; 252 if ( zSign ) z = - z; 253 if ( absZ1 ) float_set_inexact(); 254 return z; 255 256 } 257 #endif /* __alpha__ */ 258 #endif 259 260 /* 261 ------------------------------------------------------------------------------- 262 Returns the fraction bits of the single-precision floating-point value `a'. 263 ------------------------------------------------------------------------------- 264 */ 265 INLINE bits32 extractFloat32Frac( float32 a ) 266 { 267 268 return a & 0x007FFFFF; 269 270 } 271 272 /* 273 ------------------------------------------------------------------------------- 274 Returns the exponent bits of the single-precision floating-point value `a'. 275 ------------------------------------------------------------------------------- 276 */ 277 INLINE int16 extractFloat32Exp( float32 a ) 278 { 279 280 return ( a>>23 ) & 0xFF; 281 282 } 283 284 /* 285 ------------------------------------------------------------------------------- 286 Returns the sign bit of the single-precision floating-point value `a'. 287 ------------------------------------------------------------------------------- 288 */ 289 INLINE flag extractFloat32Sign( float32 a ) 290 { 291 292 return a>>31; 293 294 } 295 296 /* 297 ------------------------------------------------------------------------------- 298 Normalizes the subnormal single-precision floating-point value represented 299 by the denormalized significand `aSig'. The normalized exponent and 300 significand are stored at the locations pointed to by `zExpPtr' and 301 `zSigPtr', respectively. 302 ------------------------------------------------------------------------------- 303 */ 304 static void 305 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 306 { 307 int8 shiftCount; 308 309 shiftCount = countLeadingZeros32( aSig ) - 8; 310 *zSigPtr = aSig<<shiftCount; 311 *zExpPtr = 1 - shiftCount; 312 313 } 314 315 /* 316 ------------------------------------------------------------------------------- 317 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 318 single-precision floating-point value, returning the result. After being 319 shifted into the proper positions, the three fields are simply added 320 together to form the result. This means that any integer portion of `zSig' 321 will be added into the exponent. Since a properly normalized significand 322 will have an integer portion equal to 1, the `zExp' input should be 1 less 323 than the desired result exponent whenever `zSig' is a complete, normalized 324 significand. 325 ------------------------------------------------------------------------------- 326 */ 327 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 328 { 329 330 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 331 332 } 333 334 /* 335 ------------------------------------------------------------------------------- 336 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 337 and significand `zSig', and returns the proper single-precision floating- 338 point value corresponding to the abstract input. Ordinarily, the abstract 339 value is simply rounded and packed into the single-precision format, with 340 the inexact exception raised if the abstract input cannot be represented 341 exactly. However, if the abstract value is too large, the overflow and 342 inexact exceptions are raised and an infinity or maximal finite value is 343 returned. If the abstract value is too small, the input value is rounded to 344 a subnormal number, and the underflow and inexact exceptions are raised if 345 the abstract input cannot be represented exactly as a subnormal single- 346 precision floating-point number. 347 The input significand `zSig' has its binary point between bits 30 348 and 29, which is 7 bits to the left of the usual location. This shifted 349 significand must be normalized or smaller. If `zSig' is not normalized, 350 `zExp' must be 0; in that case, the result returned is a subnormal number, 351 and it must not require rounding. In the usual case that `zSig' is 352 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 353 The handling of underflow and overflow follows the IEC/IEEE Standard for 354 Binary Floating-Point Arithmetic. 355 ------------------------------------------------------------------------------- 356 */ 357 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 358 { 359 int8 roundingMode; 360 flag roundNearestEven; 361 int8 roundIncrement, roundBits; 362 flag isTiny; 363 364 roundingMode = float_rounding_mode(); 365 roundNearestEven = ( roundingMode == float_round_nearest_even ); 366 roundIncrement = 0x40; 367 if ( ! roundNearestEven ) { 368 if ( roundingMode == float_round_to_zero ) { 369 roundIncrement = 0; 370 } 371 else { 372 roundIncrement = 0x7F; 373 if ( zSign ) { 374 if ( roundingMode == float_round_up ) roundIncrement = 0; 375 } 376 else { 377 if ( roundingMode == float_round_down ) roundIncrement = 0; 378 } 379 } 380 } 381 roundBits = zSig & 0x7F; 382 if ( 0xFD <= (bits16) zExp ) { 383 if ( ( 0xFD < zExp ) 384 || ( ( zExp == 0xFD ) 385 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 386 ) { 387 float_raise( float_flag_overflow | float_flag_inexact ); 388 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 389 } 390 if ( zExp < 0 ) { 391 isTiny = 392 ( float_detect_tininess == float_tininess_before_rounding ) 393 || ( zExp < -1 ) 394 || ( zSig + roundIncrement < 0x80000000 ); 395 shift32RightJamming( zSig, - zExp, &zSig ); 396 zExp = 0; 397 roundBits = zSig & 0x7F; 398 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 399 } 400 } 401 if ( roundBits ) float_set_inexact(); 402 zSig = ( zSig + roundIncrement )>>7; 403 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 404 if ( zSig == 0 ) zExp = 0; 405 return packFloat32( zSign, zExp, zSig ); 406 407 } 408 409 /* 410 ------------------------------------------------------------------------------- 411 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 412 and significand `zSig', and returns the proper single-precision floating- 413 point value corresponding to the abstract input. This routine is just like 414 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 415 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 416 floating-point exponent. 417 ------------------------------------------------------------------------------- 418 */ 419 static float32 420 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 421 { 422 int8 shiftCount; 423 424 shiftCount = countLeadingZeros32( zSig ) - 1; 425 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 426 427 } 428 429 /* 430 ------------------------------------------------------------------------------- 431 Returns the fraction bits of the double-precision floating-point value `a'. 432 ------------------------------------------------------------------------------- 433 */ 434 INLINE bits64 extractFloat64Frac( float64 a ) 435 { 436 437 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 438 439 } 440 441 /* 442 ------------------------------------------------------------------------------- 443 Returns the exponent bits of the double-precision floating-point value `a'. 444 ------------------------------------------------------------------------------- 445 */ 446 INLINE int16 extractFloat64Exp( float64 a ) 447 { 448 449 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 450 451 } 452 453 /* 454 ------------------------------------------------------------------------------- 455 Returns the sign bit of the double-precision floating-point value `a'. 456 ------------------------------------------------------------------------------- 457 */ 458 INLINE flag extractFloat64Sign( float64 a ) 459 { 460 461 return FLOAT64_DEMANGLE(a)>>63; 462 463 } 464 465 /* 466 ------------------------------------------------------------------------------- 467 Normalizes the subnormal double-precision floating-point value represented 468 by the denormalized significand `aSig'. The normalized exponent and 469 significand are stored at the locations pointed to by `zExpPtr' and 470 `zSigPtr', respectively. 471 ------------------------------------------------------------------------------- 472 */ 473 static void 474 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 475 { 476 int8 shiftCount; 477 478 shiftCount = countLeadingZeros64( aSig ) - 11; 479 *zSigPtr = aSig<<shiftCount; 480 *zExpPtr = 1 - shiftCount; 481 482 } 483 484 /* 485 ------------------------------------------------------------------------------- 486 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 487 double-precision floating-point value, returning the result. After being 488 shifted into the proper positions, the three fields are simply added 489 together to form the result. This means that any integer portion of `zSig' 490 will be added into the exponent. Since a properly normalized significand 491 will have an integer portion equal to 1, the `zExp' input should be 1 less 492 than the desired result exponent whenever `zSig' is a complete, normalized 493 significand. 494 ------------------------------------------------------------------------------- 495 */ 496 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 497 { 498 499 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 500 ( ( (bits64) zExp )<<52 ) + zSig ); 501 502 } 503 504 /* 505 ------------------------------------------------------------------------------- 506 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 507 and significand `zSig', and returns the proper double-precision floating- 508 point value corresponding to the abstract input. Ordinarily, the abstract 509 value is simply rounded and packed into the double-precision format, with 510 the inexact exception raised if the abstract input cannot be represented 511 exactly. However, if the abstract value is too large, the overflow and 512 inexact exceptions are raised and an infinity or maximal finite value is 513 returned. If the abstract value is too small, the input value is rounded to 514 a subnormal number, and the underflow and inexact exceptions are raised if 515 the abstract input cannot be represented exactly as a subnormal double- 516 precision floating-point number. 517 The input significand `zSig' has its binary point between bits 62 518 and 61, which is 10 bits to the left of the usual location. This shifted 519 significand must be normalized or smaller. If `zSig' is not normalized, 520 `zExp' must be 0; in that case, the result returned is a subnormal number, 521 and it must not require rounding. In the usual case that `zSig' is 522 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 523 The handling of underflow and overflow follows the IEC/IEEE Standard for 524 Binary Floating-Point Arithmetic. 525 ------------------------------------------------------------------------------- 526 */ 527 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 528 { 529 int8 roundingMode; 530 flag roundNearestEven; 531 int16 roundIncrement, roundBits; 532 flag isTiny; 533 534 roundingMode = float_rounding_mode(); 535 roundNearestEven = ( roundingMode == float_round_nearest_even ); 536 roundIncrement = 0x200; 537 if ( ! roundNearestEven ) { 538 if ( roundingMode == float_round_to_zero ) { 539 roundIncrement = 0; 540 } 541 else { 542 roundIncrement = 0x3FF; 543 if ( zSign ) { 544 if ( roundingMode == float_round_up ) roundIncrement = 0; 545 } 546 else { 547 if ( roundingMode == float_round_down ) roundIncrement = 0; 548 } 549 } 550 } 551 roundBits = zSig & 0x3FF; 552 if ( 0x7FD <= (bits16) zExp ) { 553 if ( ( 0x7FD < zExp ) 554 || ( ( zExp == 0x7FD ) 555 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 556 ) { 557 float_raise( float_flag_overflow | float_flag_inexact ); 558 return FLOAT64_MANGLE( 559 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 560 ( roundIncrement == 0 )); 561 } 562 if ( zExp < 0 ) { 563 isTiny = 564 ( float_detect_tininess == float_tininess_before_rounding ) 565 || ( zExp < -1 ) 566 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 567 shift64RightJamming( zSig, - zExp, &zSig ); 568 zExp = 0; 569 roundBits = zSig & 0x3FF; 570 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 571 } 572 } 573 if ( roundBits ) float_set_inexact(); 574 zSig = ( zSig + roundIncrement )>>10; 575 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 576 if ( zSig == 0 ) zExp = 0; 577 return packFloat64( zSign, zExp, zSig ); 578 579 } 580 581 /* 582 ------------------------------------------------------------------------------- 583 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 584 and significand `zSig', and returns the proper double-precision floating- 585 point value corresponding to the abstract input. This routine is just like 586 `roundAndPackFloat64' except that `zSig' does not have to be normalized. 587 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 588 floating-point exponent. 589 ------------------------------------------------------------------------------- 590 */ 591 static float64 592 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 593 { 594 int8 shiftCount; 595 596 shiftCount = countLeadingZeros64( zSig ) - 1; 597 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 598 599 } 600 601 #ifdef FLOATX80 602 603 /* 604 ------------------------------------------------------------------------------- 605 Returns the fraction bits of the extended double-precision floating-point 606 value `a'. 607 ------------------------------------------------------------------------------- 608 */ 609 INLINE bits64 extractFloatx80Frac( floatx80 a ) 610 { 611 612 return a.low; 613 614 } 615 616 /* 617 ------------------------------------------------------------------------------- 618 Returns the exponent bits of the extended double-precision floating-point 619 value `a'. 620 ------------------------------------------------------------------------------- 621 */ 622 INLINE int32 extractFloatx80Exp( floatx80 a ) 623 { 624 625 return a.high & 0x7FFF; 626 627 } 628 629 /* 630 ------------------------------------------------------------------------------- 631 Returns the sign bit of the extended double-precision floating-point value 632 `a'. 633 ------------------------------------------------------------------------------- 634 */ 635 INLINE flag extractFloatx80Sign( floatx80 a ) 636 { 637 638 return a.high>>15; 639 640 } 641 642 /* 643 ------------------------------------------------------------------------------- 644 Normalizes the subnormal extended double-precision floating-point value 645 represented by the denormalized significand `aSig'. The normalized exponent 646 and significand are stored at the locations pointed to by `zExpPtr' and 647 `zSigPtr', respectively. 648 ------------------------------------------------------------------------------- 649 */ 650 static void 651 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 652 { 653 int8 shiftCount; 654 655 shiftCount = countLeadingZeros64( aSig ); 656 *zSigPtr = aSig<<shiftCount; 657 *zExpPtr = 1 - shiftCount; 658 659 } 660 661 /* 662 ------------------------------------------------------------------------------- 663 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 664 extended double-precision floating-point value, returning the result. 665 ------------------------------------------------------------------------------- 666 */ 667 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 668 { 669 floatx80 z; 670 671 z.low = zSig; 672 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 673 return z; 674 675 } 676 677 /* 678 ------------------------------------------------------------------------------- 679 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 680 and extended significand formed by the concatenation of `zSig0' and `zSig1', 681 and returns the proper extended double-precision floating-point value 682 corresponding to the abstract input. Ordinarily, the abstract value is 683 rounded and packed into the extended double-precision format, with the 684 inexact exception raised if the abstract input cannot be represented 685 exactly. However, if the abstract value is too large, the overflow and 686 inexact exceptions are raised and an infinity or maximal finite value is 687 returned. If the abstract value is too small, the input value is rounded to 688 a subnormal number, and the underflow and inexact exceptions are raised if 689 the abstract input cannot be represented exactly as a subnormal extended 690 double-precision floating-point number. 691 If `roundingPrecision' is 32 or 64, the result is rounded to the same 692 number of bits as single or double precision, respectively. Otherwise, the 693 result is rounded to the full precision of the extended double-precision 694 format. 695 The input significand must be normalized or smaller. If the input 696 significand is not normalized, `zExp' must be 0; in that case, the result 697 returned is a subnormal number, and it must not require rounding. The 698 handling of underflow and overflow follows the IEC/IEEE Standard for Binary 699 Floating-Point Arithmetic. 700 ------------------------------------------------------------------------------- 701 */ 702 static floatx80 703 roundAndPackFloatx80( 704 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 705 ) 706 { 707 int8 roundingMode; 708 flag roundNearestEven, increment, isTiny; 709 int64 roundIncrement, roundMask, roundBits; 710 711 roundingMode = float_rounding_mode(); 712 roundNearestEven = ( roundingMode == float_round_nearest_even ); 713 if ( roundingPrecision == 80 ) goto precision80; 714 if ( roundingPrecision == 64 ) { 715 roundIncrement = LIT64( 0x0000000000000400 ); 716 roundMask = LIT64( 0x00000000000007FF ); 717 } 718 else if ( roundingPrecision == 32 ) { 719 roundIncrement = LIT64( 0x0000008000000000 ); 720 roundMask = LIT64( 0x000000FFFFFFFFFF ); 721 } 722 else { 723 goto precision80; 724 } 725 zSig0 |= ( zSig1 != 0 ); 726 if ( ! roundNearestEven ) { 727 if ( roundingMode == float_round_to_zero ) { 728 roundIncrement = 0; 729 } 730 else { 731 roundIncrement = roundMask; 732 if ( zSign ) { 733 if ( roundingMode == float_round_up ) roundIncrement = 0; 734 } 735 else { 736 if ( roundingMode == float_round_down ) roundIncrement = 0; 737 } 738 } 739 } 740 roundBits = zSig0 & roundMask; 741 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 742 if ( ( 0x7FFE < zExp ) 743 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 744 ) { 745 goto overflow; 746 } 747 if ( zExp <= 0 ) { 748 isTiny = 749 ( float_detect_tininess == float_tininess_before_rounding ) 750 || ( zExp < 0 ) 751 || ( zSig0 <= zSig0 + roundIncrement ); 752 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 753 zExp = 0; 754 roundBits = zSig0 & roundMask; 755 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 756 if ( roundBits ) float_set_inexact(); 757 zSig0 += roundIncrement; 758 if ( (sbits64) zSig0 < 0 ) zExp = 1; 759 roundIncrement = roundMask + 1; 760 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 761 roundMask |= roundIncrement; 762 } 763 zSig0 &= ~ roundMask; 764 return packFloatx80( zSign, zExp, zSig0 ); 765 } 766 } 767 if ( roundBits ) float_set_inexact(); 768 zSig0 += roundIncrement; 769 if ( zSig0 < roundIncrement ) { 770 ++zExp; 771 zSig0 = LIT64( 0x8000000000000000 ); 772 } 773 roundIncrement = roundMask + 1; 774 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 775 roundMask |= roundIncrement; 776 } 777 zSig0 &= ~ roundMask; 778 if ( zSig0 == 0 ) zExp = 0; 779 return packFloatx80( zSign, zExp, zSig0 ); 780 precision80: 781 increment = ( (sbits64) zSig1 < 0 ); 782 if ( ! roundNearestEven ) { 783 if ( roundingMode == float_round_to_zero ) { 784 increment = 0; 785 } 786 else { 787 if ( zSign ) { 788 increment = ( roundingMode == float_round_down ) && zSig1; 789 } 790 else { 791 increment = ( roundingMode == float_round_up ) && zSig1; 792 } 793 } 794 } 795 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 796 if ( ( 0x7FFE < zExp ) 797 || ( ( zExp == 0x7FFE ) 798 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 799 && increment 800 ) 801 ) { 802 roundMask = 0; 803 overflow: 804 float_raise( float_flag_overflow | float_flag_inexact ); 805 if ( ( roundingMode == float_round_to_zero ) 806 || ( zSign && ( roundingMode == float_round_up ) ) 807 || ( ! zSign && ( roundingMode == float_round_down ) ) 808 ) { 809 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 810 } 811 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 812 } 813 if ( zExp <= 0 ) { 814 isTiny = 815 ( float_detect_tininess == float_tininess_before_rounding ) 816 || ( zExp < 0 ) 817 || ! increment 818 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 819 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 820 zExp = 0; 821 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 822 if ( zSig1 ) float_set_inexact(); 823 if ( roundNearestEven ) { 824 increment = ( (sbits64) zSig1 < 0 ); 825 } 826 else { 827 if ( zSign ) { 828 increment = ( roundingMode == float_round_down ) && zSig1; 829 } 830 else { 831 increment = ( roundingMode == float_round_up ) && zSig1; 832 } 833 } 834 if ( increment ) { 835 ++zSig0; 836 zSig0 &= 837 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 838 if ( (sbits64) zSig0 < 0 ) zExp = 1; 839 } 840 return packFloatx80( zSign, zExp, zSig0 ); 841 } 842 } 843 if ( zSig1 ) float_set_inexact(); 844 if ( increment ) { 845 ++zSig0; 846 if ( zSig0 == 0 ) { 847 ++zExp; 848 zSig0 = LIT64( 0x8000000000000000 ); 849 } 850 else { 851 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 852 } 853 } 854 else { 855 if ( zSig0 == 0 ) zExp = 0; 856 } 857 return packFloatx80( zSign, zExp, zSig0 ); 858 859 } 860 861 /* 862 ------------------------------------------------------------------------------- 863 Takes an abstract floating-point value having sign `zSign', exponent 864 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 865 and returns the proper extended double-precision floating-point value 866 corresponding to the abstract input. This routine is just like 867 `roundAndPackFloatx80' except that the input significand does not have to be 868 normalized. 869 ------------------------------------------------------------------------------- 870 */ 871 static floatx80 872 normalizeRoundAndPackFloatx80( 873 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 874 ) 875 { 876 int8 shiftCount; 877 878 if ( zSig0 == 0 ) { 879 zSig0 = zSig1; 880 zSig1 = 0; 881 zExp -= 64; 882 } 883 shiftCount = countLeadingZeros64( zSig0 ); 884 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 885 zExp -= shiftCount; 886 return 887 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 888 889 } 890 891 #endif 892 893 #ifdef FLOAT128 894 895 /* 896 ------------------------------------------------------------------------------- 897 Returns the least-significant 64 fraction bits of the quadruple-precision 898 floating-point value `a'. 899 ------------------------------------------------------------------------------- 900 */ 901 INLINE bits64 extractFloat128Frac1( float128 a ) 902 { 903 904 return a.low; 905 906 } 907 908 /* 909 ------------------------------------------------------------------------------- 910 Returns the most-significant 48 fraction bits of the quadruple-precision 911 floating-point value `a'. 912 ------------------------------------------------------------------------------- 913 */ 914 INLINE bits64 extractFloat128Frac0( float128 a ) 915 { 916 917 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 918 919 } 920 921 /* 922 ------------------------------------------------------------------------------- 923 Returns the exponent bits of the quadruple-precision floating-point value 924 `a'. 925 ------------------------------------------------------------------------------- 926 */ 927 INLINE int32 extractFloat128Exp( float128 a ) 928 { 929 930 return ( a.high>>48 ) & 0x7FFF; 931 932 } 933 934 /* 935 ------------------------------------------------------------------------------- 936 Returns the sign bit of the quadruple-precision floating-point value `a'. 937 ------------------------------------------------------------------------------- 938 */ 939 INLINE flag extractFloat128Sign( float128 a ) 940 { 941 942 return a.high>>63; 943 944 } 945 946 /* 947 ------------------------------------------------------------------------------- 948 Normalizes the subnormal quadruple-precision floating-point value 949 represented by the denormalized significand formed by the concatenation of 950 `aSig0' and `aSig1'. The normalized exponent is stored at the location 951 pointed to by `zExpPtr'. The most significant 49 bits of the normalized 952 significand are stored at the location pointed to by `zSig0Ptr', and the 953 least significant 64 bits of the normalized significand are stored at the 954 location pointed to by `zSig1Ptr'. 955 ------------------------------------------------------------------------------- 956 */ 957 static void 958 normalizeFloat128Subnormal( 959 bits64 aSig0, 960 bits64 aSig1, 961 int32 *zExpPtr, 962 bits64 *zSig0Ptr, 963 bits64 *zSig1Ptr 964 ) 965 { 966 int8 shiftCount; 967 968 if ( aSig0 == 0 ) { 969 shiftCount = countLeadingZeros64( aSig1 ) - 15; 970 if ( shiftCount < 0 ) { 971 *zSig0Ptr = aSig1>>( - shiftCount ); 972 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 973 } 974 else { 975 *zSig0Ptr = aSig1<<shiftCount; 976 *zSig1Ptr = 0; 977 } 978 *zExpPtr = - shiftCount - 63; 979 } 980 else { 981 shiftCount = countLeadingZeros64( aSig0 ) - 15; 982 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 983 *zExpPtr = 1 - shiftCount; 984 } 985 986 } 987 988 /* 989 ------------------------------------------------------------------------------- 990 Packs the sign `zSign', the exponent `zExp', and the significand formed 991 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 992 floating-point value, returning the result. After being shifted into the 993 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 994 added together to form the most significant 32 bits of the result. This 995 means that any integer portion of `zSig0' will be added into the exponent. 996 Since a properly normalized significand will have an integer portion equal 997 to 1, the `zExp' input should be 1 less than the desired result exponent 998 whenever `zSig0' and `zSig1' concatenated form a complete, normalized 999 significand. 1000 ------------------------------------------------------------------------------- 1001 */ 1002 INLINE float128 1003 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1004 { 1005 float128 z; 1006 1007 z.low = zSig1; 1008 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 1009 return z; 1010 1011 } 1012 1013 /* 1014 ------------------------------------------------------------------------------- 1015 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1016 and extended significand formed by the concatenation of `zSig0', `zSig1', 1017 and `zSig2', and returns the proper quadruple-precision floating-point value 1018 corresponding to the abstract input. Ordinarily, the abstract value is 1019 simply rounded and packed into the quadruple-precision format, with the 1020 inexact exception raised if the abstract input cannot be represented 1021 exactly. However, if the abstract value is too large, the overflow and 1022 inexact exceptions are raised and an infinity or maximal finite value is 1023 returned. If the abstract value is too small, the input value is rounded to 1024 a subnormal number, and the underflow and inexact exceptions are raised if 1025 the abstract input cannot be represented exactly as a subnormal quadruple- 1026 precision floating-point number. 1027 The input significand must be normalized or smaller. If the input 1028 significand is not normalized, `zExp' must be 0; in that case, the result 1029 returned is a subnormal number, and it must not require rounding. In the 1030 usual case that the input significand is normalized, `zExp' must be 1 less 1031 than the ``true'' floating-point exponent. The handling of underflow and 1032 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1033 ------------------------------------------------------------------------------- 1034 */ 1035 static float128 1036 roundAndPackFloat128( 1037 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 1038 { 1039 int8 roundingMode; 1040 flag roundNearestEven, increment, isTiny; 1041 1042 roundingMode = float_rounding_mode(); 1043 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1044 increment = ( (sbits64) zSig2 < 0 ); 1045 if ( ! roundNearestEven ) { 1046 if ( roundingMode == float_round_to_zero ) { 1047 increment = 0; 1048 } 1049 else { 1050 if ( zSign ) { 1051 increment = ( roundingMode == float_round_down ) && zSig2; 1052 } 1053 else { 1054 increment = ( roundingMode == float_round_up ) && zSig2; 1055 } 1056 } 1057 } 1058 if ( 0x7FFD <= (bits32) zExp ) { 1059 if ( ( 0x7FFD < zExp ) 1060 || ( ( zExp == 0x7FFD ) 1061 && eq128( 1062 LIT64( 0x0001FFFFFFFFFFFF ), 1063 LIT64( 0xFFFFFFFFFFFFFFFF ), 1064 zSig0, 1065 zSig1 1066 ) 1067 && increment 1068 ) 1069 ) { 1070 float_raise( float_flag_overflow | float_flag_inexact ); 1071 if ( ( roundingMode == float_round_to_zero ) 1072 || ( zSign && ( roundingMode == float_round_up ) ) 1073 || ( ! zSign && ( roundingMode == float_round_down ) ) 1074 ) { 1075 return 1076 packFloat128( 1077 zSign, 1078 0x7FFE, 1079 LIT64( 0x0000FFFFFFFFFFFF ), 1080 LIT64( 0xFFFFFFFFFFFFFFFF ) 1081 ); 1082 } 1083 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1084 } 1085 if ( zExp < 0 ) { 1086 isTiny = 1087 ( float_detect_tininess == float_tininess_before_rounding ) 1088 || ( zExp < -1 ) 1089 || ! increment 1090 || lt128( 1091 zSig0, 1092 zSig1, 1093 LIT64( 0x0001FFFFFFFFFFFF ), 1094 LIT64( 0xFFFFFFFFFFFFFFFF ) 1095 ); 1096 shift128ExtraRightJamming( 1097 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1098 zExp = 0; 1099 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1100 if ( roundNearestEven ) { 1101 increment = ( (sbits64) zSig2 < 0 ); 1102 } 1103 else { 1104 if ( zSign ) { 1105 increment = ( roundingMode == float_round_down ) && zSig2; 1106 } 1107 else { 1108 increment = ( roundingMode == float_round_up ) && zSig2; 1109 } 1110 } 1111 } 1112 } 1113 if ( zSig2 ) float_set_inexact(); 1114 if ( increment ) { 1115 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1116 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1117 } 1118 else { 1119 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1120 } 1121 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1122 1123 } 1124 1125 /* 1126 ------------------------------------------------------------------------------- 1127 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1128 and significand formed by the concatenation of `zSig0' and `zSig1', and 1129 returns the proper quadruple-precision floating-point value corresponding 1130 to the abstract input. This routine is just like `roundAndPackFloat128' 1131 except that the input significand has fewer bits and does not have to be 1132 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1133 point exponent. 1134 ------------------------------------------------------------------------------- 1135 */ 1136 static float128 1137 normalizeRoundAndPackFloat128( 1138 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1139 { 1140 int8 shiftCount; 1141 bits64 zSig2; 1142 1143 if ( zSig0 == 0 ) { 1144 zSig0 = zSig1; 1145 zSig1 = 0; 1146 zExp -= 64; 1147 } 1148 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1149 if ( 0 <= shiftCount ) { 1150 zSig2 = 0; 1151 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1152 } 1153 else { 1154 shift128ExtraRightJamming( 1155 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1156 } 1157 zExp -= shiftCount; 1158 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1159 1160 } 1161 1162 #endif 1163 1164 /* 1165 ------------------------------------------------------------------------------- 1166 Returns the result of converting the 32-bit two's complement integer `a' 1167 to the single-precision floating-point format. The conversion is performed 1168 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1169 ------------------------------------------------------------------------------- 1170 */ 1171 float32 int32_to_float32( int32 a ) 1172 { 1173 flag zSign; 1174 1175 if ( a == 0 ) return 0; 1176 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1177 zSign = ( a < 0 ); 1178 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 1179 1180 } 1181 1182 /* 1183 ------------------------------------------------------------------------------- 1184 Returns the result of converting the 32-bit two's complement integer `a' 1185 to the double-precision floating-point format. The conversion is performed 1186 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1187 ------------------------------------------------------------------------------- 1188 */ 1189 float64 int32_to_float64( int32 a ) 1190 { 1191 flag zSign; 1192 uint32 absA; 1193 int8 shiftCount; 1194 bits64 zSig; 1195 1196 if ( a == 0 ) return 0; 1197 zSign = ( a < 0 ); 1198 absA = zSign ? - a : a; 1199 shiftCount = countLeadingZeros32( absA ) + 21; 1200 zSig = absA; 1201 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1202 1203 } 1204 1205 #ifdef FLOATX80 1206 1207 /* 1208 ------------------------------------------------------------------------------- 1209 Returns the result of converting the 32-bit two's complement integer `a' 1210 to the extended double-precision floating-point format. The conversion 1211 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1212 Arithmetic. 1213 ------------------------------------------------------------------------------- 1214 */ 1215 floatx80 int32_to_floatx80( int32 a ) 1216 { 1217 flag zSign; 1218 uint32 absA; 1219 int8 shiftCount; 1220 bits64 zSig; 1221 1222 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1223 zSign = ( a < 0 ); 1224 absA = zSign ? - a : a; 1225 shiftCount = countLeadingZeros32( absA ) + 32; 1226 zSig = absA; 1227 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1228 1229 } 1230 1231 #endif 1232 1233 #ifdef FLOAT128 1234 1235 /* 1236 ------------------------------------------------------------------------------- 1237 Returns the result of converting the 32-bit two's complement integer `a' to 1238 the quadruple-precision floating-point format. The conversion is performed 1239 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1240 ------------------------------------------------------------------------------- 1241 */ 1242 float128 int32_to_float128( int32 a ) 1243 { 1244 flag zSign; 1245 uint32 absA; 1246 int8 shiftCount; 1247 bits64 zSig0; 1248 1249 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1250 zSign = ( a < 0 ); 1251 absA = zSign ? - a : a; 1252 shiftCount = countLeadingZeros32( absA ) + 17; 1253 zSig0 = absA; 1254 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1255 1256 } 1257 1258 #endif 1259 1260 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1261 /* 1262 ------------------------------------------------------------------------------- 1263 Returns the result of converting the 64-bit two's complement integer `a' 1264 to the single-precision floating-point format. The conversion is performed 1265 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1266 ------------------------------------------------------------------------------- 1267 */ 1268 float32 int64_to_float32( int64 a ) 1269 { 1270 flag zSign; 1271 uint64 absA; 1272 int8 shiftCount; 1273 1274 if ( a == 0 ) return 0; 1275 zSign = ( a < 0 ); 1276 absA = zSign ? - a : a; 1277 shiftCount = countLeadingZeros64( absA ) - 40; 1278 if ( 0 <= shiftCount ) { 1279 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1280 } 1281 else { 1282 shiftCount += 7; 1283 if ( shiftCount < 0 ) { 1284 shift64RightJamming( absA, - shiftCount, &absA ); 1285 } 1286 else { 1287 absA <<= shiftCount; 1288 } 1289 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1290 } 1291 1292 } 1293 1294 /* 1295 ------------------------------------------------------------------------------- 1296 Returns the result of converting the 64-bit two's complement integer `a' 1297 to the double-precision floating-point format. The conversion is performed 1298 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1299 ------------------------------------------------------------------------------- 1300 */ 1301 float64 int64_to_float64( int64 a ) 1302 { 1303 flag zSign; 1304 1305 if ( a == 0 ) return 0; 1306 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1307 return packFloat64( 1, 0x43E, 0 ); 1308 } 1309 zSign = ( a < 0 ); 1310 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1311 1312 } 1313 1314 #ifdef FLOATX80 1315 1316 /* 1317 ------------------------------------------------------------------------------- 1318 Returns the result of converting the 64-bit two's complement integer `a' 1319 to the extended double-precision floating-point format. The conversion 1320 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1321 Arithmetic. 1322 ------------------------------------------------------------------------------- 1323 */ 1324 floatx80 int64_to_floatx80( int64 a ) 1325 { 1326 flag zSign; 1327 uint64 absA; 1328 int8 shiftCount; 1329 1330 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1331 zSign = ( a < 0 ); 1332 absA = zSign ? - a : a; 1333 shiftCount = countLeadingZeros64( absA ); 1334 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1335 1336 } 1337 1338 #endif 1339 1340 #ifdef FLOAT128 1341 1342 /* 1343 ------------------------------------------------------------------------------- 1344 Returns the result of converting the 64-bit two's complement integer `a' to 1345 the quadruple-precision floating-point format. The conversion is performed 1346 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1347 ------------------------------------------------------------------------------- 1348 */ 1349 float128 int64_to_float128( int64 a ) 1350 { 1351 flag zSign; 1352 uint64 absA; 1353 int8 shiftCount; 1354 int32 zExp; 1355 bits64 zSig0, zSig1; 1356 1357 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1358 zSign = ( a < 0 ); 1359 absA = zSign ? - a : a; 1360 shiftCount = countLeadingZeros64( absA ) + 49; 1361 zExp = 0x406E - shiftCount; 1362 if ( 64 <= shiftCount ) { 1363 zSig1 = 0; 1364 zSig0 = absA; 1365 shiftCount -= 64; 1366 } 1367 else { 1368 zSig1 = absA; 1369 zSig0 = 0; 1370 } 1371 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1372 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1373 1374 } 1375 1376 #endif 1377 #endif /* !SOFTFLOAT_FOR_GCC */ 1378 1379 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1380 /* 1381 ------------------------------------------------------------------------------- 1382 Returns the result of converting the single-precision floating-point value 1383 `a' to the 32-bit two's complement integer format. The conversion is 1384 performed according to the IEC/IEEE Standard for Binary Floating-Point 1385 Arithmetic---which means in particular that the conversion is rounded 1386 according to the current rounding mode. If `a' is a NaN, the largest 1387 positive integer is returned. Otherwise, if the conversion overflows, the 1388 largest integer with the same sign as `a' is returned. 1389 ------------------------------------------------------------------------------- 1390 */ 1391 int32 float32_to_int32( float32 a ) 1392 { 1393 flag aSign; 1394 int16 aExp, shiftCount; 1395 bits32 aSig; 1396 bits64 aSig64; 1397 1398 aSig = extractFloat32Frac( a ); 1399 aExp = extractFloat32Exp( a ); 1400 aSign = extractFloat32Sign( a ); 1401 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1402 if ( aExp ) aSig |= 0x00800000; 1403 shiftCount = 0xAF - aExp; 1404 aSig64 = aSig; 1405 aSig64 <<= 32; 1406 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1407 return roundAndPackInt32( aSign, aSig64 ); 1408 1409 } 1410 #endif /* !SOFTFLOAT_FOR_GCC */ 1411 1412 /* 1413 ------------------------------------------------------------------------------- 1414 Returns the result of converting the single-precision floating-point value 1415 `a' to the 32-bit two's complement integer format. The conversion is 1416 performed according to the IEC/IEEE Standard for Binary Floating-Point 1417 Arithmetic, except that the conversion is always rounded toward zero. 1418 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1419 the conversion overflows, the largest integer with the same sign as `a' is 1420 returned. 1421 ------------------------------------------------------------------------------- 1422 */ 1423 int32 float32_to_int32_round_to_zero( float32 a ) 1424 { 1425 flag aSign; 1426 int16 aExp, shiftCount; 1427 bits32 aSig; 1428 int32 z; 1429 1430 aSig = extractFloat32Frac( a ); 1431 aExp = extractFloat32Exp( a ); 1432 aSign = extractFloat32Sign( a ); 1433 shiftCount = aExp - 0x9E; 1434 if ( 0 <= shiftCount ) { 1435 if ( a != 0xCF000000 ) { 1436 float_raise( float_flag_invalid ); 1437 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1438 } 1439 return (sbits32) 0x80000000; 1440 } 1441 else if ( aExp <= 0x7E ) { 1442 if ( aExp | aSig ) float_set_inexact(); 1443 return 0; 1444 } 1445 aSig = ( aSig | 0x00800000 )<<8; 1446 z = aSig>>( - shiftCount ); 1447 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1448 float_set_inexact(); 1449 } 1450 if ( aSign ) z = - z; 1451 return z; 1452 1453 } 1454 1455 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1456 /* 1457 ------------------------------------------------------------------------------- 1458 Returns the result of converting the single-precision floating-point value 1459 `a' to the 64-bit two's complement integer format. The conversion is 1460 performed according to the IEC/IEEE Standard for Binary Floating-Point 1461 Arithmetic---which means in particular that the conversion is rounded 1462 according to the current rounding mode. If `a' is a NaN, the largest 1463 positive integer is returned. Otherwise, if the conversion overflows, the 1464 largest integer with the same sign as `a' is returned. 1465 ------------------------------------------------------------------------------- 1466 */ 1467 int64 float32_to_int64( float32 a ) 1468 { 1469 flag aSign; 1470 int16 aExp, shiftCount; 1471 bits32 aSig; 1472 bits64 aSig64, aSigExtra; 1473 1474 aSig = extractFloat32Frac( a ); 1475 aExp = extractFloat32Exp( a ); 1476 aSign = extractFloat32Sign( a ); 1477 shiftCount = 0xBE - aExp; 1478 if ( shiftCount < 0 ) { 1479 float_raise( float_flag_invalid ); 1480 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1481 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1482 } 1483 return (sbits64) LIT64( 0x8000000000000000 ); 1484 } 1485 if ( aExp ) aSig |= 0x00800000; 1486 aSig64 = aSig; 1487 aSig64 <<= 40; 1488 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1489 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1490 1491 } 1492 1493 /* 1494 ------------------------------------------------------------------------------- 1495 Returns the result of converting the single-precision floating-point value 1496 `a' to the 64-bit two's complement integer format. The conversion is 1497 performed according to the IEC/IEEE Standard for Binary Floating-Point 1498 Arithmetic, except that the conversion is always rounded toward zero. If 1499 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1500 conversion overflows, the largest integer with the same sign as `a' is 1501 returned. 1502 ------------------------------------------------------------------------------- 1503 */ 1504 int64 float32_to_int64_round_to_zero( float32 a ) 1505 { 1506 flag aSign; 1507 int16 aExp, shiftCount; 1508 bits32 aSig; 1509 bits64 aSig64; 1510 int64 z; 1511 1512 aSig = extractFloat32Frac( a ); 1513 aExp = extractFloat32Exp( a ); 1514 aSign = extractFloat32Sign( a ); 1515 shiftCount = aExp - 0xBE; 1516 if ( 0 <= shiftCount ) { 1517 if ( a != 0xDF000000 ) { 1518 float_raise( float_flag_invalid ); 1519 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1520 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1521 } 1522 } 1523 return (sbits64) LIT64( 0x8000000000000000 ); 1524 } 1525 else if ( aExp <= 0x7E ) { 1526 if ( aExp | aSig ) float_set_inexact(); 1527 return 0; 1528 } 1529 aSig64 = aSig | 0x00800000; 1530 aSig64 <<= 40; 1531 z = aSig64>>( - shiftCount ); 1532 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1533 float_set_inexact(); 1534 } 1535 if ( aSign ) z = - z; 1536 return z; 1537 1538 } 1539 #endif /* !SOFTFLOAT_FOR_GCC */ 1540 1541 /* 1542 ------------------------------------------------------------------------------- 1543 Returns the result of converting the single-precision floating-point value 1544 `a' to the double-precision floating-point format. The conversion is 1545 performed according to the IEC/IEEE Standard for Binary Floating-Point 1546 Arithmetic. 1547 ------------------------------------------------------------------------------- 1548 */ 1549 float64 float32_to_float64( float32 a ) 1550 { 1551 flag aSign; 1552 int16 aExp; 1553 bits32 aSig; 1554 1555 aSig = extractFloat32Frac( a ); 1556 aExp = extractFloat32Exp( a ); 1557 aSign = extractFloat32Sign( a ); 1558 if ( aExp == 0xFF ) { 1559 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1560 return packFloat64( aSign, 0x7FF, 0 ); 1561 } 1562 if ( aExp == 0 ) { 1563 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1564 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1565 --aExp; 1566 } 1567 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1568 1569 } 1570 1571 #ifdef FLOATX80 1572 1573 /* 1574 ------------------------------------------------------------------------------- 1575 Returns the result of converting the single-precision floating-point value 1576 `a' to the extended double-precision floating-point format. The conversion 1577 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1578 Arithmetic. 1579 ------------------------------------------------------------------------------- 1580 */ 1581 floatx80 float32_to_floatx80( float32 a ) 1582 { 1583 flag aSign; 1584 int16 aExp; 1585 bits32 aSig; 1586 1587 aSig = extractFloat32Frac( a ); 1588 aExp = extractFloat32Exp( a ); 1589 aSign = extractFloat32Sign( a ); 1590 if ( aExp == 0xFF ) { 1591 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1592 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1593 } 1594 if ( aExp == 0 ) { 1595 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1596 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1597 } 1598 aSig |= 0x00800000; 1599 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1600 1601 } 1602 1603 #endif 1604 1605 #ifdef FLOAT128 1606 1607 /* 1608 ------------------------------------------------------------------------------- 1609 Returns the result of converting the single-precision floating-point value 1610 `a' to the double-precision floating-point format. The conversion is 1611 performed according to the IEC/IEEE Standard for Binary Floating-Point 1612 Arithmetic. 1613 ------------------------------------------------------------------------------- 1614 */ 1615 float128 float32_to_float128( float32 a ) 1616 { 1617 flag aSign; 1618 int16 aExp; 1619 bits32 aSig; 1620 1621 aSig = extractFloat32Frac( a ); 1622 aExp = extractFloat32Exp( a ); 1623 aSign = extractFloat32Sign( a ); 1624 if ( aExp == 0xFF ) { 1625 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1626 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1627 } 1628 if ( aExp == 0 ) { 1629 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1630 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1631 --aExp; 1632 } 1633 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1634 1635 } 1636 1637 #endif 1638 1639 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1640 /* 1641 ------------------------------------------------------------------------------- 1642 Rounds the single-precision floating-point value `a' to an integer, and 1643 returns the result as a single-precision floating-point value. The 1644 operation is performed according to the IEC/IEEE Standard for Binary 1645 Floating-Point Arithmetic. 1646 ------------------------------------------------------------------------------- 1647 */ 1648 float32 float32_round_to_int( float32 a ) 1649 { 1650 flag aSign; 1651 int16 aExp; 1652 bits32 lastBitMask, roundBitsMask; 1653 int8 roundingMode; 1654 float32 z; 1655 1656 aExp = extractFloat32Exp( a ); 1657 if ( 0x96 <= aExp ) { 1658 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1659 return propagateFloat32NaN( a, a ); 1660 } 1661 return a; 1662 } 1663 if ( aExp <= 0x7E ) { 1664 if ( (bits32) ( a<<1 ) == 0 ) return a; 1665 float_set_inexact(); 1666 aSign = extractFloat32Sign( a ); 1667 switch ( float_rounding_mode() ) { 1668 case float_round_nearest_even: 1669 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1670 return packFloat32( aSign, 0x7F, 0 ); 1671 } 1672 break; 1673 case float_round_down: 1674 return aSign ? 0xBF800000 : 0; 1675 case float_round_up: 1676 return aSign ? 0x80000000 : 0x3F800000; 1677 } 1678 return packFloat32( aSign, 0, 0 ); 1679 } 1680 lastBitMask = 1; 1681 lastBitMask <<= 0x96 - aExp; 1682 roundBitsMask = lastBitMask - 1; 1683 z = a; 1684 roundingMode = float_rounding_mode(); 1685 if ( roundingMode == float_round_nearest_even ) { 1686 z += lastBitMask>>1; 1687 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1688 } 1689 else if ( roundingMode != float_round_to_zero ) { 1690 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1691 z += roundBitsMask; 1692 } 1693 } 1694 z &= ~ roundBitsMask; 1695 if ( z != a ) float_set_inexact(); 1696 return z; 1697 1698 } 1699 #endif /* !SOFTFLOAT_FOR_GCC */ 1700 1701 /* 1702 ------------------------------------------------------------------------------- 1703 Returns the result of adding the absolute values of the single-precision 1704 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1705 before being returned. `zSign' is ignored if the result is a NaN. 1706 The addition is performed according to the IEC/IEEE Standard for Binary 1707 Floating-Point Arithmetic. 1708 ------------------------------------------------------------------------------- 1709 */ 1710 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1711 { 1712 int16 aExp, bExp, zExp; 1713 bits32 aSig, bSig, zSig; 1714 int16 expDiff; 1715 1716 aSig = extractFloat32Frac( a ); 1717 aExp = extractFloat32Exp( a ); 1718 bSig = extractFloat32Frac( b ); 1719 bExp = extractFloat32Exp( b ); 1720 expDiff = aExp - bExp; 1721 aSig <<= 6; 1722 bSig <<= 6; 1723 if ( 0 < expDiff ) { 1724 if ( aExp == 0xFF ) { 1725 if ( aSig ) return propagateFloat32NaN( a, b ); 1726 return a; 1727 } 1728 if ( bExp == 0 ) { 1729 --expDiff; 1730 } 1731 else { 1732 bSig |= 0x20000000; 1733 } 1734 shift32RightJamming( bSig, expDiff, &bSig ); 1735 zExp = aExp; 1736 } 1737 else if ( expDiff < 0 ) { 1738 if ( bExp == 0xFF ) { 1739 if ( bSig ) return propagateFloat32NaN( a, b ); 1740 return packFloat32( zSign, 0xFF, 0 ); 1741 } 1742 if ( aExp == 0 ) { 1743 ++expDiff; 1744 } 1745 else { 1746 aSig |= 0x20000000; 1747 } 1748 shift32RightJamming( aSig, - expDiff, &aSig ); 1749 zExp = bExp; 1750 } 1751 else { 1752 if ( aExp == 0xFF ) { 1753 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1754 return a; 1755 } 1756 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1757 zSig = 0x40000000 + aSig + bSig; 1758 zExp = aExp; 1759 goto roundAndPack; 1760 } 1761 aSig |= 0x20000000; 1762 zSig = ( aSig + bSig )<<1; 1763 --zExp; 1764 if ( (sbits32) zSig < 0 ) { 1765 zSig = aSig + bSig; 1766 ++zExp; 1767 } 1768 roundAndPack: 1769 return roundAndPackFloat32( zSign, zExp, zSig ); 1770 1771 } 1772 1773 /* 1774 ------------------------------------------------------------------------------- 1775 Returns the result of subtracting the absolute values of the single- 1776 precision floating-point values `a' and `b'. If `zSign' is 1, the 1777 difference is negated before being returned. `zSign' is ignored if the 1778 result is a NaN. The subtraction is performed according to the IEC/IEEE 1779 Standard for Binary Floating-Point Arithmetic. 1780 ------------------------------------------------------------------------------- 1781 */ 1782 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1783 { 1784 int16 aExp, bExp, zExp; 1785 bits32 aSig, bSig, zSig; 1786 int16 expDiff; 1787 1788 aSig = extractFloat32Frac( a ); 1789 aExp = extractFloat32Exp( a ); 1790 bSig = extractFloat32Frac( b ); 1791 bExp = extractFloat32Exp( b ); 1792 expDiff = aExp - bExp; 1793 aSig <<= 7; 1794 bSig <<= 7; 1795 if ( 0 < expDiff ) goto aExpBigger; 1796 if ( expDiff < 0 ) goto bExpBigger; 1797 if ( aExp == 0xFF ) { 1798 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1799 float_raise( float_flag_invalid ); 1800 return float32_default_nan; 1801 } 1802 if ( aExp == 0 ) { 1803 aExp = 1; 1804 bExp = 1; 1805 } 1806 if ( bSig < aSig ) goto aBigger; 1807 if ( aSig < bSig ) goto bBigger; 1808 return packFloat32( float_rounding_mode() == float_round_down, 0, 0 ); 1809 bExpBigger: 1810 if ( bExp == 0xFF ) { 1811 if ( bSig ) return propagateFloat32NaN( a, b ); 1812 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1813 } 1814 if ( aExp == 0 ) { 1815 ++expDiff; 1816 } 1817 else { 1818 aSig |= 0x40000000; 1819 } 1820 shift32RightJamming( aSig, - expDiff, &aSig ); 1821 bSig |= 0x40000000; 1822 bBigger: 1823 zSig = bSig - aSig; 1824 zExp = bExp; 1825 zSign ^= 1; 1826 goto normalizeRoundAndPack; 1827 aExpBigger: 1828 if ( aExp == 0xFF ) { 1829 if ( aSig ) return propagateFloat32NaN( a, b ); 1830 return a; 1831 } 1832 if ( bExp == 0 ) { 1833 --expDiff; 1834 } 1835 else { 1836 bSig |= 0x40000000; 1837 } 1838 shift32RightJamming( bSig, expDiff, &bSig ); 1839 aSig |= 0x40000000; 1840 aBigger: 1841 zSig = aSig - bSig; 1842 zExp = aExp; 1843 normalizeRoundAndPack: 1844 --zExp; 1845 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1846 1847 } 1848 1849 /* 1850 ------------------------------------------------------------------------------- 1851 Returns the result of adding the single-precision floating-point values `a' 1852 and `b'. The operation is performed according to the IEC/IEEE Standard for 1853 Binary Floating-Point Arithmetic. 1854 ------------------------------------------------------------------------------- 1855 */ 1856 float32 float32_add( float32 a, float32 b ) 1857 { 1858 flag aSign, bSign; 1859 1860 aSign = extractFloat32Sign( a ); 1861 bSign = extractFloat32Sign( b ); 1862 if ( aSign == bSign ) { 1863 return addFloat32Sigs( a, b, aSign ); 1864 } 1865 else { 1866 return subFloat32Sigs( a, b, aSign ); 1867 } 1868 1869 } 1870 1871 /* 1872 ------------------------------------------------------------------------------- 1873 Returns the result of subtracting the single-precision floating-point values 1874 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1875 for Binary Floating-Point Arithmetic. 1876 ------------------------------------------------------------------------------- 1877 */ 1878 float32 float32_sub( float32 a, float32 b ) 1879 { 1880 flag aSign, bSign; 1881 1882 aSign = extractFloat32Sign( a ); 1883 bSign = extractFloat32Sign( b ); 1884 if ( aSign == bSign ) { 1885 return subFloat32Sigs( a, b, aSign ); 1886 } 1887 else { 1888 return addFloat32Sigs( a, b, aSign ); 1889 } 1890 1891 } 1892 1893 /* 1894 ------------------------------------------------------------------------------- 1895 Returns the result of multiplying the single-precision floating-point values 1896 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1897 for Binary Floating-Point Arithmetic. 1898 ------------------------------------------------------------------------------- 1899 */ 1900 float32 float32_mul( float32 a, float32 b ) 1901 { 1902 flag aSign, bSign, zSign; 1903 int16 aExp, bExp, zExp; 1904 bits32 aSig, bSig; 1905 bits64 zSig64; 1906 bits32 zSig; 1907 1908 aSig = extractFloat32Frac( a ); 1909 aExp = extractFloat32Exp( a ); 1910 aSign = extractFloat32Sign( a ); 1911 bSig = extractFloat32Frac( b ); 1912 bExp = extractFloat32Exp( b ); 1913 bSign = extractFloat32Sign( b ); 1914 zSign = aSign ^ bSign; 1915 if ( aExp == 0xFF ) { 1916 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1917 return propagateFloat32NaN( a, b ); 1918 } 1919 if ( ( bExp | bSig ) == 0 ) { 1920 float_raise( float_flag_invalid ); 1921 return float32_default_nan; 1922 } 1923 return packFloat32( zSign, 0xFF, 0 ); 1924 } 1925 if ( bExp == 0xFF ) { 1926 if ( bSig ) return propagateFloat32NaN( a, b ); 1927 if ( ( aExp | aSig ) == 0 ) { 1928 float_raise( float_flag_invalid ); 1929 return float32_default_nan; 1930 } 1931 return packFloat32( zSign, 0xFF, 0 ); 1932 } 1933 if ( aExp == 0 ) { 1934 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1935 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1936 } 1937 if ( bExp == 0 ) { 1938 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1939 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1940 } 1941 zExp = aExp + bExp - 0x7F; 1942 aSig = ( aSig | 0x00800000 )<<7; 1943 bSig = ( bSig | 0x00800000 )<<8; 1944 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1945 zSig = zSig64; 1946 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1947 zSig <<= 1; 1948 --zExp; 1949 } 1950 return roundAndPackFloat32( zSign, zExp, zSig ); 1951 1952 } 1953 1954 /* 1955 ------------------------------------------------------------------------------- 1956 Returns the result of dividing the single-precision floating-point value `a' 1957 by the corresponding value `b'. The operation is performed according to the 1958 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1959 ------------------------------------------------------------------------------- 1960 */ 1961 float32 float32_div( float32 a, float32 b ) 1962 { 1963 flag aSign, bSign, zSign; 1964 int16 aExp, bExp, zExp; 1965 bits32 aSig, bSig, zSig; 1966 1967 aSig = extractFloat32Frac( a ); 1968 aExp = extractFloat32Exp( a ); 1969 aSign = extractFloat32Sign( a ); 1970 bSig = extractFloat32Frac( b ); 1971 bExp = extractFloat32Exp( b ); 1972 bSign = extractFloat32Sign( b ); 1973 zSign = aSign ^ bSign; 1974 if ( aExp == 0xFF ) { 1975 if ( aSig ) return propagateFloat32NaN( a, b ); 1976 if ( bExp == 0xFF ) { 1977 if ( bSig ) return propagateFloat32NaN( a, b ); 1978 float_raise( float_flag_invalid ); 1979 return float32_default_nan; 1980 } 1981 return packFloat32( zSign, 0xFF, 0 ); 1982 } 1983 if ( bExp == 0xFF ) { 1984 if ( bSig ) return propagateFloat32NaN( a, b ); 1985 return packFloat32( zSign, 0, 0 ); 1986 } 1987 if ( bExp == 0 ) { 1988 if ( bSig == 0 ) { 1989 if ( ( aExp | aSig ) == 0 ) { 1990 float_raise( float_flag_invalid ); 1991 return float32_default_nan; 1992 } 1993 float_raise( float_flag_divbyzero ); 1994 return packFloat32( zSign, 0xFF, 0 ); 1995 } 1996 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1997 } 1998 if ( aExp == 0 ) { 1999 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2000 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2001 } 2002 zExp = aExp - bExp + 0x7D; 2003 aSig = ( aSig | 0x00800000 )<<7; 2004 bSig = ( bSig | 0x00800000 )<<8; 2005 if ( bSig <= ( aSig + aSig ) ) { 2006 aSig >>= 1; 2007 ++zExp; 2008 } 2009 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 2010 if ( ( zSig & 0x3F ) == 0 ) { 2011 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 2012 } 2013 return roundAndPackFloat32( zSign, zExp, zSig ); 2014 2015 } 2016 2017 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2018 /* 2019 ------------------------------------------------------------------------------- 2020 Returns the remainder of the single-precision floating-point value `a' 2021 with respect to the corresponding value `b'. The operation is performed 2022 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2023 ------------------------------------------------------------------------------- 2024 */ 2025 float32 float32_rem( float32 a, float32 b ) 2026 { 2027 flag aSign, bSign, zSign; 2028 int16 aExp, bExp, expDiff; 2029 bits32 aSig, bSig; 2030 bits32 q; 2031 bits64 aSig64, bSig64, q64; 2032 bits32 alternateASig; 2033 sbits32 sigMean; 2034 2035 aSig = extractFloat32Frac( a ); 2036 aExp = extractFloat32Exp( a ); 2037 aSign = extractFloat32Sign( a ); 2038 bSig = extractFloat32Frac( b ); 2039 bExp = extractFloat32Exp( b ); 2040 bSign = extractFloat32Sign( b ); 2041 if ( aExp == 0xFF ) { 2042 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2043 return propagateFloat32NaN( a, b ); 2044 } 2045 float_raise( float_flag_invalid ); 2046 return float32_default_nan; 2047 } 2048 if ( bExp == 0xFF ) { 2049 if ( bSig ) return propagateFloat32NaN( a, b ); 2050 return a; 2051 } 2052 if ( bExp == 0 ) { 2053 if ( bSig == 0 ) { 2054 float_raise( float_flag_invalid ); 2055 return float32_default_nan; 2056 } 2057 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2058 } 2059 if ( aExp == 0 ) { 2060 if ( aSig == 0 ) return a; 2061 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2062 } 2063 expDiff = aExp - bExp; 2064 aSig |= 0x00800000; 2065 bSig |= 0x00800000; 2066 if ( expDiff < 32 ) { 2067 aSig <<= 8; 2068 bSig <<= 8; 2069 if ( expDiff < 0 ) { 2070 if ( expDiff < -1 ) return a; 2071 aSig >>= 1; 2072 } 2073 q = ( bSig <= aSig ); 2074 if ( q ) aSig -= bSig; 2075 if ( 0 < expDiff ) { 2076 q = ( ( (bits64) aSig )<<32 ) / bSig; 2077 q >>= 32 - expDiff; 2078 bSig >>= 2; 2079 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2080 } 2081 else { 2082 aSig >>= 2; 2083 bSig >>= 2; 2084 } 2085 } 2086 else { 2087 if ( bSig <= aSig ) aSig -= bSig; 2088 aSig64 = ( (bits64) aSig )<<40; 2089 bSig64 = ( (bits64) bSig )<<40; 2090 expDiff -= 64; 2091 while ( 0 < expDiff ) { 2092 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2093 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2094 aSig64 = - ( ( bSig * q64 )<<38 ); 2095 expDiff -= 62; 2096 } 2097 expDiff += 64; 2098 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2099 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2100 q = q64>>( 64 - expDiff ); 2101 bSig <<= 6; 2102 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2103 } 2104 do { 2105 alternateASig = aSig; 2106 ++q; 2107 aSig -= bSig; 2108 } while ( 0 <= (sbits32) aSig ); 2109 sigMean = aSig + alternateASig; 2110 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2111 aSig = alternateASig; 2112 } 2113 zSign = ( (sbits32) aSig < 0 ); 2114 if ( zSign ) aSig = - aSig; 2115 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2116 2117 } 2118 #endif /* !SOFTFLOAT_FOR_GCC */ 2119 2120 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2121 /* 2122 ------------------------------------------------------------------------------- 2123 Returns the square root of the single-precision floating-point value `a'. 2124 The operation is performed according to the IEC/IEEE Standard for Binary 2125 Floating-Point Arithmetic. 2126 ------------------------------------------------------------------------------- 2127 */ 2128 float32 float32_sqrt( float32 a ) 2129 { 2130 flag aSign; 2131 int16 aExp, zExp; 2132 bits32 aSig, zSig; 2133 bits64 rem, term; 2134 2135 aSig = extractFloat32Frac( a ); 2136 aExp = extractFloat32Exp( a ); 2137 aSign = extractFloat32Sign( a ); 2138 if ( aExp == 0xFF ) { 2139 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2140 if ( ! aSign ) return a; 2141 float_raise( float_flag_invalid ); 2142 return float32_default_nan; 2143 } 2144 if ( aSign ) { 2145 if ( ( aExp | aSig ) == 0 ) return a; 2146 float_raise( float_flag_invalid ); 2147 return float32_default_nan; 2148 } 2149 if ( aExp == 0 ) { 2150 if ( aSig == 0 ) return 0; 2151 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2152 } 2153 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2154 aSig = ( aSig | 0x00800000 )<<8; 2155 zSig = estimateSqrt32( aExp, aSig ) + 2; 2156 if ( ( zSig & 0x7F ) <= 5 ) { 2157 if ( zSig < 2 ) { 2158 zSig = 0x7FFFFFFF; 2159 goto roundAndPack; 2160 } 2161 aSig >>= aExp & 1; 2162 term = ( (bits64) zSig ) * zSig; 2163 rem = ( ( (bits64) aSig )<<32 ) - term; 2164 while ( (sbits64) rem < 0 ) { 2165 --zSig; 2166 rem += ( ( (bits64) zSig )<<1 ) | 1; 2167 } 2168 zSig |= ( rem != 0 ); 2169 } 2170 shift32RightJamming( zSig, 1, &zSig ); 2171 roundAndPack: 2172 return roundAndPackFloat32( 0, zExp, zSig ); 2173 2174 } 2175 #endif /* !SOFTFLOAT_FOR_GCC */ 2176 2177 /* 2178 ------------------------------------------------------------------------------- 2179 Returns 1 if the single-precision floating-point value `a' is equal to 2180 the corresponding value `b', and 0 otherwise. The comparison is performed 2181 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2182 ------------------------------------------------------------------------------- 2183 */ 2184 flag float32_eq( float32 a, float32 b ) 2185 { 2186 2187 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2188 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2189 ) { 2190 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2191 float_raise( float_flag_invalid ); 2192 } 2193 return 0; 2194 } 2195 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2196 2197 } 2198 2199 /* 2200 ------------------------------------------------------------------------------- 2201 Returns 1 if the single-precision floating-point value `a' is less than 2202 or equal to the corresponding value `b', and 0 otherwise. The comparison 2203 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2204 Arithmetic. 2205 ------------------------------------------------------------------------------- 2206 */ 2207 flag float32_le( float32 a, float32 b ) 2208 { 2209 flag aSign, bSign; 2210 2211 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2212 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2213 ) { 2214 float_raise( float_flag_invalid ); 2215 return 0; 2216 } 2217 aSign = extractFloat32Sign( a ); 2218 bSign = extractFloat32Sign( b ); 2219 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2220 return ( a == b ) || ( aSign ^ ( a < b ) ); 2221 2222 } 2223 2224 /* 2225 ------------------------------------------------------------------------------- 2226 Returns 1 if the single-precision floating-point value `a' is less than 2227 the corresponding value `b', and 0 otherwise. The comparison is performed 2228 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2229 ------------------------------------------------------------------------------- 2230 */ 2231 flag float32_lt( float32 a, float32 b ) 2232 { 2233 flag aSign, bSign; 2234 2235 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2236 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2237 ) { 2238 float_raise( float_flag_invalid ); 2239 return 0; 2240 } 2241 aSign = extractFloat32Sign( a ); 2242 bSign = extractFloat32Sign( b ); 2243 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2244 return ( a != b ) && ( aSign ^ ( a < b ) ); 2245 2246 } 2247 2248 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2249 /* 2250 ------------------------------------------------------------------------------- 2251 Returns 1 if the single-precision floating-point value `a' is equal to 2252 the corresponding value `b', and 0 otherwise. The invalid exception is 2253 raised if either operand is a NaN. Otherwise, the comparison is performed 2254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2255 ------------------------------------------------------------------------------- 2256 */ 2257 flag float32_eq_signaling( float32 a, float32 b ) 2258 { 2259 2260 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2261 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2262 ) { 2263 float_raise( float_flag_invalid ); 2264 return 0; 2265 } 2266 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2267 2268 } 2269 2270 /* 2271 ------------------------------------------------------------------------------- 2272 Returns 1 if the single-precision floating-point value `a' is less than or 2273 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2274 cause an exception. Otherwise, the comparison is performed according to the 2275 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2276 ------------------------------------------------------------------------------- 2277 */ 2278 flag float32_le_quiet( float32 a, float32 b ) 2279 { 2280 flag aSign, bSign; 2281 2282 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2283 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2284 ) { 2285 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2286 float_raise( float_flag_invalid ); 2287 } 2288 return 0; 2289 } 2290 aSign = extractFloat32Sign( a ); 2291 bSign = extractFloat32Sign( b ); 2292 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2293 return ( a == b ) || ( aSign ^ ( a < b ) ); 2294 2295 } 2296 2297 /* 2298 ------------------------------------------------------------------------------- 2299 Returns 1 if the single-precision floating-point value `a' is less than 2300 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2301 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2302 Standard for Binary Floating-Point Arithmetic. 2303 ------------------------------------------------------------------------------- 2304 */ 2305 flag float32_lt_quiet( float32 a, float32 b ) 2306 { 2307 flag aSign, bSign; 2308 2309 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2310 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2311 ) { 2312 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2313 float_raise( float_flag_invalid ); 2314 } 2315 return 0; 2316 } 2317 aSign = extractFloat32Sign( a ); 2318 bSign = extractFloat32Sign( b ); 2319 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2320 return ( a != b ) && ( aSign ^ ( a < b ) ); 2321 2322 } 2323 #endif /* !SOFTFLOAT_FOR_GCC */ 2324 2325 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2326 /* 2327 ------------------------------------------------------------------------------- 2328 Returns the result of converting the double-precision floating-point value 2329 `a' to the 32-bit two's complement integer format. The conversion is 2330 performed according to the IEC/IEEE Standard for Binary Floating-Point 2331 Arithmetic---which means in particular that the conversion is rounded 2332 according to the current rounding mode. If `a' is a NaN, the largest 2333 positive integer is returned. Otherwise, if the conversion overflows, the 2334 largest integer with the same sign as `a' is returned. 2335 ------------------------------------------------------------------------------- 2336 */ 2337 int32 float64_to_int32( float64 a ) 2338 { 2339 flag aSign; 2340 int16 aExp, shiftCount; 2341 bits64 aSig; 2342 2343 aSig = extractFloat64Frac( a ); 2344 aExp = extractFloat64Exp( a ); 2345 aSign = extractFloat64Sign( a ); 2346 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2347 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2348 shiftCount = 0x42C - aExp; 2349 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2350 return roundAndPackInt32( aSign, aSig ); 2351 2352 } 2353 #endif /* !SOFTFLOAT_FOR_GCC */ 2354 2355 /* 2356 ------------------------------------------------------------------------------- 2357 Returns the result of converting the double-precision floating-point value 2358 `a' to the 32-bit two's complement integer format. The conversion is 2359 performed according to the IEC/IEEE Standard for Binary Floating-Point 2360 Arithmetic, except that the conversion is always rounded toward zero. 2361 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2362 the conversion overflows, the largest integer with the same sign as `a' is 2363 returned. 2364 ------------------------------------------------------------------------------- 2365 */ 2366 int32 float64_to_int32_round_to_zero( float64 a ) 2367 { 2368 flag aSign; 2369 int16 aExp, shiftCount; 2370 bits64 aSig, savedASig; 2371 int32 z; 2372 2373 aSig = extractFloat64Frac( a ); 2374 aExp = extractFloat64Exp( a ); 2375 aSign = extractFloat64Sign( a ); 2376 if ( 0x41E < aExp ) { 2377 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2378 goto invalid; 2379 } 2380 else if ( aExp < 0x3FF ) { 2381 if ( aExp || aSig ) float_set_inexact(); 2382 return 0; 2383 } 2384 aSig |= LIT64( 0x0010000000000000 ); 2385 shiftCount = 0x433 - aExp; 2386 savedASig = aSig; 2387 aSig >>= shiftCount; 2388 z = aSig; 2389 if ( aSign ) z = - z; 2390 if ( ( z < 0 ) ^ aSign ) { 2391 invalid: 2392 float_raise( float_flag_invalid ); 2393 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2394 } 2395 if ( ( aSig<<shiftCount ) != savedASig ) { 2396 float_set_inexact(); 2397 } 2398 return z; 2399 2400 } 2401 2402 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2403 /* 2404 ------------------------------------------------------------------------------- 2405 Returns the result of converting the double-precision floating-point value 2406 `a' to the 64-bit two's complement integer format. The conversion is 2407 performed according to the IEC/IEEE Standard for Binary Floating-Point 2408 Arithmetic---which means in particular that the conversion is rounded 2409 according to the current rounding mode. If `a' is a NaN, the largest 2410 positive integer is returned. Otherwise, if the conversion overflows, the 2411 largest integer with the same sign as `a' is returned. 2412 ------------------------------------------------------------------------------- 2413 */ 2414 int64 float64_to_int64( float64 a ) 2415 { 2416 flag aSign; 2417 int16 aExp, shiftCount; 2418 bits64 aSig, aSigExtra; 2419 2420 aSig = extractFloat64Frac( a ); 2421 aExp = extractFloat64Exp( a ); 2422 aSign = extractFloat64Sign( a ); 2423 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2424 shiftCount = 0x433 - aExp; 2425 if ( shiftCount <= 0 ) { 2426 if ( 0x43E < aExp ) { 2427 float_raise( float_flag_invalid ); 2428 if ( ! aSign 2429 || ( ( aExp == 0x7FF ) 2430 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2431 ) { 2432 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2433 } 2434 return (sbits64) LIT64( 0x8000000000000000 ); 2435 } 2436 aSigExtra = 0; 2437 aSig <<= - shiftCount; 2438 } 2439 else { 2440 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2441 } 2442 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2443 2444 } 2445 2446 #ifdef __alpha__ 2447 /* 2448 ------------------------------------------------------------------------------- 2449 Returns the result of converting the double-precision floating-point value 2450 `a' to the 64-bit two's complement integer format. The conversion is 2451 performed according to the IEC/IEEE Standard for Binary Floating-Point 2452 Arithmetic---which means in particular that the conversion is rounded 2453 according to the current rounding mode. If `a' is a NaN, the invalid 2454 exception is raised and zero is returned. 2455 ------------------------------------------------------------------------------- 2456 */ 2457 int64 float64_to_int64_no_overflow( float64 a ) 2458 { 2459 flag aSign; 2460 int16 aExp, shiftCount; 2461 bits64 aSig, aSigExtra; 2462 2463 aSig = extractFloat64Frac( a ); 2464 aExp = extractFloat64Exp( a ); 2465 aSign = extractFloat64Sign( a ); 2466 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2467 shiftCount = 0x433 - aExp; 2468 if ( shiftCount <= 0 ) { 2469 if ( 0x43E < aExp ) { 2470 float_raise( float_flag_invalid ); 2471 return 0; 2472 } 2473 aSigExtra = 0; 2474 aSig <<= - shiftCount; 2475 } 2476 else { 2477 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2478 } 2479 return roundAndPackInt64NoOverflow( aSign, aSig, aSigExtra ); 2480 2481 } 2482 #endif /* __alpha__ */ 2483 2484 /* 2485 ------------------------------------------------------------------------------- 2486 Returns the result of converting the double-precision floating-point value 2487 `a' to the 64-bit two's complement integer format. The conversion is 2488 performed according to the IEC/IEEE Standard for Binary Floating-Point 2489 Arithmetic, except that the conversion is always rounded toward zero. 2490 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2491 the conversion overflows, the largest integer with the same sign as `a' is 2492 returned. 2493 ------------------------------------------------------------------------------- 2494 */ 2495 int64 float64_to_int64_round_to_zero( float64 a ) 2496 { 2497 flag aSign; 2498 int16 aExp, shiftCount; 2499 bits64 aSig; 2500 int64 z; 2501 2502 aSig = extractFloat64Frac( a ); 2503 aExp = extractFloat64Exp( a ); 2504 aSign = extractFloat64Sign( a ); 2505 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2506 shiftCount = aExp - 0x433; 2507 if ( 0 <= shiftCount ) { 2508 if ( 0x43E <= aExp ) { 2509 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2510 float_raise( float_flag_invalid ); 2511 if ( ! aSign 2512 || ( ( aExp == 0x7FF ) 2513 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2514 ) { 2515 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2516 } 2517 } 2518 return (sbits64) LIT64( 0x8000000000000000 ); 2519 } 2520 z = aSig<<shiftCount; 2521 } 2522 else { 2523 if ( aExp < 0x3FE ) { 2524 if ( aExp | aSig ) float_set_inexact(); 2525 return 0; 2526 } 2527 z = aSig>>( - shiftCount ); 2528 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2529 float_set_inexact(); 2530 } 2531 } 2532 if ( aSign ) z = - z; 2533 return z; 2534 2535 } 2536 #endif /* !SOFTFLOAT_FOR_GCC */ 2537 2538 /* 2539 ------------------------------------------------------------------------------- 2540 Returns the result of converting the double-precision floating-point value 2541 `a' to the single-precision floating-point format. The conversion is 2542 performed according to the IEC/IEEE Standard for Binary Floating-Point 2543 Arithmetic. 2544 ------------------------------------------------------------------------------- 2545 */ 2546 float32 float64_to_float32( float64 a ) 2547 { 2548 flag aSign; 2549 int16 aExp; 2550 bits64 aSig; 2551 bits32 zSig; 2552 2553 aSig = extractFloat64Frac( a ); 2554 aExp = extractFloat64Exp( a ); 2555 aSign = extractFloat64Sign( a ); 2556 if ( aExp == 0x7FF ) { 2557 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2558 return packFloat32( aSign, 0xFF, 0 ); 2559 } 2560 shift64RightJamming( aSig, 22, &aSig ); 2561 zSig = aSig; 2562 if ( aExp || zSig ) { 2563 zSig |= 0x40000000; 2564 aExp -= 0x381; 2565 } 2566 return roundAndPackFloat32( aSign, aExp, zSig ); 2567 2568 } 2569 2570 #ifdef FLOATX80 2571 2572 /* 2573 ------------------------------------------------------------------------------- 2574 Returns the result of converting the double-precision floating-point value 2575 `a' to the extended double-precision floating-point format. The conversion 2576 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2577 Arithmetic. 2578 ------------------------------------------------------------------------------- 2579 */ 2580 floatx80 float64_to_floatx80( float64 a ) 2581 { 2582 flag aSign; 2583 int16 aExp; 2584 bits64 aSig; 2585 2586 aSig = extractFloat64Frac( a ); 2587 aExp = extractFloat64Exp( a ); 2588 aSign = extractFloat64Sign( a ); 2589 if ( aExp == 0x7FF ) { 2590 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2591 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2592 } 2593 if ( aExp == 0 ) { 2594 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2595 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2596 } 2597 return 2598 packFloatx80( 2599 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2600 2601 } 2602 2603 #endif 2604 2605 #ifdef FLOAT128 2606 2607 /* 2608 ------------------------------------------------------------------------------- 2609 Returns the result of converting the double-precision floating-point value 2610 `a' to the quadruple-precision floating-point format. The conversion is 2611 performed according to the IEC/IEEE Standard for Binary Floating-Point 2612 Arithmetic. 2613 ------------------------------------------------------------------------------- 2614 */ 2615 float128 float64_to_float128( float64 a ) 2616 { 2617 flag aSign; 2618 int16 aExp; 2619 bits64 aSig, zSig0, zSig1; 2620 2621 aSig = extractFloat64Frac( a ); 2622 aExp = extractFloat64Exp( a ); 2623 aSign = extractFloat64Sign( a ); 2624 if ( aExp == 0x7FF ) { 2625 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2626 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2627 } 2628 if ( aExp == 0 ) { 2629 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2630 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2631 --aExp; 2632 } 2633 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2634 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2635 2636 } 2637 2638 #endif 2639 2640 #ifndef SOFTFLOAT_FOR_GCC 2641 /* 2642 ------------------------------------------------------------------------------- 2643 Rounds the double-precision floating-point value `a' to an integer, and 2644 returns the result as a double-precision floating-point value. The 2645 operation is performed according to the IEC/IEEE Standard for Binary 2646 Floating-Point Arithmetic. 2647 ------------------------------------------------------------------------------- 2648 */ 2649 float64 float64_round_to_int( float64 a ) 2650 { 2651 flag aSign; 2652 int16 aExp; 2653 bits64 lastBitMask, roundBitsMask; 2654 int8 roundingMode; 2655 float64 z; 2656 2657 aExp = extractFloat64Exp( a ); 2658 if ( 0x433 <= aExp ) { 2659 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2660 return propagateFloat64NaN( a, a ); 2661 } 2662 return a; 2663 } 2664 if ( aExp < 0x3FF ) { 2665 if ( (bits64) ( a<<1 ) == 0 ) return a; 2666 float_set_inexact(); 2667 aSign = extractFloat64Sign( a ); 2668 switch ( float_rounding_mode() ) { 2669 case float_round_nearest_even: 2670 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2671 return packFloat64( aSign, 0x3FF, 0 ); 2672 } 2673 break; 2674 case float_round_down: 2675 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2676 case float_round_up: 2677 return 2678 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2679 } 2680 return packFloat64( aSign, 0, 0 ); 2681 } 2682 lastBitMask = 1; 2683 lastBitMask <<= 0x433 - aExp; 2684 roundBitsMask = lastBitMask - 1; 2685 z = a; 2686 roundingMode = float_rounding_mode(); 2687 if ( roundingMode == float_round_nearest_even ) { 2688 z += lastBitMask>>1; 2689 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2690 } 2691 else if ( roundingMode != float_round_to_zero ) { 2692 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2693 z += roundBitsMask; 2694 } 2695 } 2696 z &= ~ roundBitsMask; 2697 if ( z != a ) float_set_inexact(); 2698 return z; 2699 2700 } 2701 #endif 2702 2703 /* 2704 ------------------------------------------------------------------------------- 2705 Returns the result of adding the absolute values of the double-precision 2706 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2707 before being returned. `zSign' is ignored if the result is a NaN. 2708 The addition is performed according to the IEC/IEEE Standard for Binary 2709 Floating-Point Arithmetic. 2710 ------------------------------------------------------------------------------- 2711 */ 2712 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2713 { 2714 int16 aExp, bExp, zExp; 2715 bits64 aSig, bSig, zSig; 2716 int16 expDiff; 2717 2718 aSig = extractFloat64Frac( a ); 2719 aExp = extractFloat64Exp( a ); 2720 bSig = extractFloat64Frac( b ); 2721 bExp = extractFloat64Exp( b ); 2722 expDiff = aExp - bExp; 2723 aSig <<= 9; 2724 bSig <<= 9; 2725 if ( 0 < expDiff ) { 2726 if ( aExp == 0x7FF ) { 2727 if ( aSig ) return propagateFloat64NaN( a, b ); 2728 return a; 2729 } 2730 if ( bExp == 0 ) { 2731 --expDiff; 2732 } 2733 else { 2734 bSig |= LIT64( 0x2000000000000000 ); 2735 } 2736 shift64RightJamming( bSig, expDiff, &bSig ); 2737 zExp = aExp; 2738 } 2739 else if ( expDiff < 0 ) { 2740 if ( bExp == 0x7FF ) { 2741 if ( bSig ) return propagateFloat64NaN( a, b ); 2742 return packFloat64( zSign, 0x7FF, 0 ); 2743 } 2744 if ( aExp == 0 ) { 2745 ++expDiff; 2746 } 2747 else { 2748 aSig |= LIT64( 0x2000000000000000 ); 2749 } 2750 shift64RightJamming( aSig, - expDiff, &aSig ); 2751 zExp = bExp; 2752 } 2753 else { 2754 if ( aExp == 0x7FF ) { 2755 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2756 return a; 2757 } 2758 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2759 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2760 zExp = aExp; 2761 goto roundAndPack; 2762 } 2763 aSig |= LIT64( 0x2000000000000000 ); 2764 zSig = ( aSig + bSig )<<1; 2765 --zExp; 2766 if ( (sbits64) zSig < 0 ) { 2767 zSig = aSig + bSig; 2768 ++zExp; 2769 } 2770 roundAndPack: 2771 return roundAndPackFloat64( zSign, zExp, zSig ); 2772 2773 } 2774 2775 /* 2776 ------------------------------------------------------------------------------- 2777 Returns the result of subtracting the absolute values of the double- 2778 precision floating-point values `a' and `b'. If `zSign' is 1, the 2779 difference is negated before being returned. `zSign' is ignored if the 2780 result is a NaN. The subtraction is performed according to the IEC/IEEE 2781 Standard for Binary Floating-Point Arithmetic. 2782 ------------------------------------------------------------------------------- 2783 */ 2784 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2785 { 2786 int16 aExp, bExp, zExp; 2787 bits64 aSig, bSig, zSig; 2788 int16 expDiff; 2789 2790 aSig = extractFloat64Frac( a ); 2791 aExp = extractFloat64Exp( a ); 2792 bSig = extractFloat64Frac( b ); 2793 bExp = extractFloat64Exp( b ); 2794 expDiff = aExp - bExp; 2795 aSig <<= 10; 2796 bSig <<= 10; 2797 if ( 0 < expDiff ) goto aExpBigger; 2798 if ( expDiff < 0 ) goto bExpBigger; 2799 if ( aExp == 0x7FF ) { 2800 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2801 float_raise( float_flag_invalid ); 2802 return float64_default_nan; 2803 } 2804 if ( aExp == 0 ) { 2805 aExp = 1; 2806 bExp = 1; 2807 } 2808 if ( bSig < aSig ) goto aBigger; 2809 if ( aSig < bSig ) goto bBigger; 2810 return packFloat64( float_rounding_mode() == float_round_down, 0, 0 ); 2811 bExpBigger: 2812 if ( bExp == 0x7FF ) { 2813 if ( bSig ) return propagateFloat64NaN( a, b ); 2814 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2815 } 2816 if ( aExp == 0 ) { 2817 ++expDiff; 2818 } 2819 else { 2820 aSig |= LIT64( 0x4000000000000000 ); 2821 } 2822 shift64RightJamming( aSig, - expDiff, &aSig ); 2823 bSig |= LIT64( 0x4000000000000000 ); 2824 bBigger: 2825 zSig = bSig - aSig; 2826 zExp = bExp; 2827 zSign ^= 1; 2828 goto normalizeRoundAndPack; 2829 aExpBigger: 2830 if ( aExp == 0x7FF ) { 2831 if ( aSig ) return propagateFloat64NaN( a, b ); 2832 return a; 2833 } 2834 if ( bExp == 0 ) { 2835 --expDiff; 2836 } 2837 else { 2838 bSig |= LIT64( 0x4000000000000000 ); 2839 } 2840 shift64RightJamming( bSig, expDiff, &bSig ); 2841 aSig |= LIT64( 0x4000000000000000 ); 2842 aBigger: 2843 zSig = aSig - bSig; 2844 zExp = aExp; 2845 normalizeRoundAndPack: 2846 --zExp; 2847 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2848 2849 } 2850 2851 /* 2852 ------------------------------------------------------------------------------- 2853 Returns the result of adding the double-precision floating-point values `a' 2854 and `b'. The operation is performed according to the IEC/IEEE Standard for 2855 Binary Floating-Point Arithmetic. 2856 ------------------------------------------------------------------------------- 2857 */ 2858 float64 float64_add( float64 a, float64 b ) 2859 { 2860 flag aSign, bSign; 2861 2862 aSign = extractFloat64Sign( a ); 2863 bSign = extractFloat64Sign( b ); 2864 if ( aSign == bSign ) { 2865 return addFloat64Sigs( a, b, aSign ); 2866 } 2867 else { 2868 return subFloat64Sigs( a, b, aSign ); 2869 } 2870 2871 } 2872 2873 /* 2874 ------------------------------------------------------------------------------- 2875 Returns the result of subtracting the double-precision floating-point values 2876 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2877 for Binary Floating-Point Arithmetic. 2878 ------------------------------------------------------------------------------- 2879 */ 2880 float64 float64_sub( float64 a, float64 b ) 2881 { 2882 flag aSign, bSign; 2883 2884 aSign = extractFloat64Sign( a ); 2885 bSign = extractFloat64Sign( b ); 2886 if ( aSign == bSign ) { 2887 return subFloat64Sigs( a, b, aSign ); 2888 } 2889 else { 2890 return addFloat64Sigs( a, b, aSign ); 2891 } 2892 2893 } 2894 2895 /* 2896 ------------------------------------------------------------------------------- 2897 Returns the result of multiplying the double-precision floating-point values 2898 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2899 for Binary Floating-Point Arithmetic. 2900 ------------------------------------------------------------------------------- 2901 */ 2902 float64 float64_mul( float64 a, float64 b ) 2903 { 2904 flag aSign, bSign, zSign; 2905 int16 aExp, bExp, zExp; 2906 bits64 aSig, bSig, zSig0, zSig1; 2907 2908 aSig = extractFloat64Frac( a ); 2909 aExp = extractFloat64Exp( a ); 2910 aSign = extractFloat64Sign( a ); 2911 bSig = extractFloat64Frac( b ); 2912 bExp = extractFloat64Exp( b ); 2913 bSign = extractFloat64Sign( b ); 2914 zSign = aSign ^ bSign; 2915 if ( aExp == 0x7FF ) { 2916 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2917 return propagateFloat64NaN( a, b ); 2918 } 2919 if ( ( bExp | bSig ) == 0 ) { 2920 float_raise( float_flag_invalid ); 2921 return float64_default_nan; 2922 } 2923 return packFloat64( zSign, 0x7FF, 0 ); 2924 } 2925 if ( bExp == 0x7FF ) { 2926 if ( bSig ) return propagateFloat64NaN( a, b ); 2927 if ( ( aExp | aSig ) == 0 ) { 2928 float_raise( float_flag_invalid ); 2929 return float64_default_nan; 2930 } 2931 return packFloat64( zSign, 0x7FF, 0 ); 2932 } 2933 if ( aExp == 0 ) { 2934 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2935 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2936 } 2937 if ( bExp == 0 ) { 2938 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2939 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2940 } 2941 zExp = aExp + bExp - 0x3FF; 2942 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2943 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2944 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2945 zSig0 |= ( zSig1 != 0 ); 2946 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2947 zSig0 <<= 1; 2948 --zExp; 2949 } 2950 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2951 2952 } 2953 2954 /* 2955 ------------------------------------------------------------------------------- 2956 Returns the result of dividing the double-precision floating-point value `a' 2957 by the corresponding value `b'. The operation is performed according to 2958 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2959 ------------------------------------------------------------------------------- 2960 */ 2961 float64 float64_div( float64 a, float64 b ) 2962 { 2963 flag aSign, bSign, zSign; 2964 int16 aExp, bExp, zExp; 2965 bits64 aSig, bSig, zSig; 2966 bits64 rem0, rem1; 2967 bits64 term0, term1; 2968 2969 aSig = extractFloat64Frac( a ); 2970 aExp = extractFloat64Exp( a ); 2971 aSign = extractFloat64Sign( a ); 2972 bSig = extractFloat64Frac( b ); 2973 bExp = extractFloat64Exp( b ); 2974 bSign = extractFloat64Sign( b ); 2975 zSign = aSign ^ bSign; 2976 if ( aExp == 0x7FF ) { 2977 if ( aSig ) return propagateFloat64NaN( a, b ); 2978 if ( bExp == 0x7FF ) { 2979 if ( bSig ) return propagateFloat64NaN( a, b ); 2980 float_raise( float_flag_invalid ); 2981 return float64_default_nan; 2982 } 2983 return packFloat64( zSign, 0x7FF, 0 ); 2984 } 2985 if ( bExp == 0x7FF ) { 2986 if ( bSig ) return propagateFloat64NaN( a, b ); 2987 return packFloat64( zSign, 0, 0 ); 2988 } 2989 if ( bExp == 0 ) { 2990 if ( bSig == 0 ) { 2991 if ( ( aExp | aSig ) == 0 ) { 2992 float_raise( float_flag_invalid ); 2993 return float64_default_nan; 2994 } 2995 float_raise( float_flag_divbyzero ); 2996 return packFloat64( zSign, 0x7FF, 0 ); 2997 } 2998 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2999 } 3000 if ( aExp == 0 ) { 3001 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 3002 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3003 } 3004 zExp = aExp - bExp + 0x3FD; 3005 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 3006 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3007 if ( bSig <= ( aSig + aSig ) ) { 3008 aSig >>= 1; 3009 ++zExp; 3010 } 3011 zSig = estimateDiv128To64( aSig, 0, bSig ); 3012 if ( ( zSig & 0x1FF ) <= 2 ) { 3013 mul64To128( bSig, zSig, &term0, &term1 ); 3014 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3015 while ( (sbits64) rem0 < 0 ) { 3016 --zSig; 3017 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3018 } 3019 zSig |= ( rem1 != 0 ); 3020 } 3021 return roundAndPackFloat64( zSign, zExp, zSig ); 3022 3023 } 3024 3025 #ifndef SOFTFLOAT_FOR_GCC 3026 /* 3027 ------------------------------------------------------------------------------- 3028 Returns the remainder of the double-precision floating-point value `a' 3029 with respect to the corresponding value `b'. The operation is performed 3030 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3031 ------------------------------------------------------------------------------- 3032 */ 3033 float64 float64_rem( float64 a, float64 b ) 3034 { 3035 flag aSign, bSign, zSign; 3036 int16 aExp, bExp, expDiff; 3037 bits64 aSig, bSig; 3038 bits64 q, alternateASig; 3039 sbits64 sigMean; 3040 3041 aSig = extractFloat64Frac( a ); 3042 aExp = extractFloat64Exp( a ); 3043 aSign = extractFloat64Sign( a ); 3044 bSig = extractFloat64Frac( b ); 3045 bExp = extractFloat64Exp( b ); 3046 bSign = extractFloat64Sign( b ); 3047 if ( aExp == 0x7FF ) { 3048 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3049 return propagateFloat64NaN( a, b ); 3050 } 3051 float_raise( float_flag_invalid ); 3052 return float64_default_nan; 3053 } 3054 if ( bExp == 0x7FF ) { 3055 if ( bSig ) return propagateFloat64NaN( a, b ); 3056 return a; 3057 } 3058 if ( bExp == 0 ) { 3059 if ( bSig == 0 ) { 3060 float_raise( float_flag_invalid ); 3061 return float64_default_nan; 3062 } 3063 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3064 } 3065 if ( aExp == 0 ) { 3066 if ( aSig == 0 ) return a; 3067 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3068 } 3069 expDiff = aExp - bExp; 3070 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3071 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3072 if ( expDiff < 0 ) { 3073 if ( expDiff < -1 ) return a; 3074 aSig >>= 1; 3075 } 3076 q = ( bSig <= aSig ); 3077 if ( q ) aSig -= bSig; 3078 expDiff -= 64; 3079 while ( 0 < expDiff ) { 3080 q = estimateDiv128To64( aSig, 0, bSig ); 3081 q = ( 2 < q ) ? q - 2 : 0; 3082 aSig = - ( ( bSig>>2 ) * q ); 3083 expDiff -= 62; 3084 } 3085 expDiff += 64; 3086 if ( 0 < expDiff ) { 3087 q = estimateDiv128To64( aSig, 0, bSig ); 3088 q = ( 2 < q ) ? q - 2 : 0; 3089 q >>= 64 - expDiff; 3090 bSig >>= 2; 3091 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3092 } 3093 else { 3094 aSig >>= 2; 3095 bSig >>= 2; 3096 } 3097 do { 3098 alternateASig = aSig; 3099 ++q; 3100 aSig -= bSig; 3101 } while ( 0 <= (sbits64) aSig ); 3102 sigMean = aSig + alternateASig; 3103 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3104 aSig = alternateASig; 3105 } 3106 zSign = ( (sbits64) aSig < 0 ); 3107 if ( zSign ) aSig = - aSig; 3108 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3109 3110 } 3111 3112 /* 3113 ------------------------------------------------------------------------------- 3114 Returns the square root of the double-precision floating-point value `a'. 3115 The operation is performed according to the IEC/IEEE Standard for Binary 3116 Floating-Point Arithmetic. 3117 ------------------------------------------------------------------------------- 3118 */ 3119 float64 float64_sqrt( float64 a ) 3120 { 3121 flag aSign; 3122 int16 aExp, zExp; 3123 bits64 aSig, zSig, doubleZSig; 3124 bits64 rem0, rem1, term0, term1; 3125 3126 aSig = extractFloat64Frac( a ); 3127 aExp = extractFloat64Exp( a ); 3128 aSign = extractFloat64Sign( a ); 3129 if ( aExp == 0x7FF ) { 3130 if ( aSig ) return propagateFloat64NaN( a, a ); 3131 if ( ! aSign ) return a; 3132 float_raise( float_flag_invalid ); 3133 return float64_default_nan; 3134 } 3135 if ( aSign ) { 3136 if ( ( aExp | aSig ) == 0 ) return a; 3137 float_raise( float_flag_invalid ); 3138 return float64_default_nan; 3139 } 3140 if ( aExp == 0 ) { 3141 if ( aSig == 0 ) return 0; 3142 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3143 } 3144 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3145 aSig |= LIT64( 0x0010000000000000 ); 3146 zSig = estimateSqrt32( aExp, aSig>>21 ); 3147 aSig <<= 9 - ( aExp & 1 ); 3148 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3149 if ( ( zSig & 0x1FF ) <= 5 ) { 3150 doubleZSig = zSig<<1; 3151 mul64To128( zSig, zSig, &term0, &term1 ); 3152 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3153 while ( (sbits64) rem0 < 0 ) { 3154 --zSig; 3155 doubleZSig -= 2; 3156 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3157 } 3158 zSig |= ( ( rem0 | rem1 ) != 0 ); 3159 } 3160 return roundAndPackFloat64( 0, zExp, zSig ); 3161 3162 } 3163 #endif 3164 3165 /* 3166 ------------------------------------------------------------------------------- 3167 Returns 1 if the double-precision floating-point value `a' is equal to the 3168 corresponding value `b', and 0 otherwise. The comparison is performed 3169 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3170 ------------------------------------------------------------------------------- 3171 */ 3172 flag float64_eq( float64 a, float64 b ) 3173 { 3174 3175 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3176 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3177 ) { 3178 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3179 float_raise( float_flag_invalid ); 3180 } 3181 return 0; 3182 } 3183 return ( a == b ) || 3184 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3185 3186 } 3187 3188 /* 3189 ------------------------------------------------------------------------------- 3190 Returns 1 if the double-precision floating-point value `a' is less than or 3191 equal to the corresponding value `b', and 0 otherwise. The comparison is 3192 performed according to the IEC/IEEE Standard for Binary Floating-Point 3193 Arithmetic. 3194 ------------------------------------------------------------------------------- 3195 */ 3196 flag float64_le( float64 a, float64 b ) 3197 { 3198 flag aSign, bSign; 3199 3200 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3201 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3202 ) { 3203 float_raise( float_flag_invalid ); 3204 return 0; 3205 } 3206 aSign = extractFloat64Sign( a ); 3207 bSign = extractFloat64Sign( b ); 3208 if ( aSign != bSign ) 3209 return aSign || 3210 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3211 0 ); 3212 return ( a == b ) || 3213 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3214 3215 } 3216 3217 /* 3218 ------------------------------------------------------------------------------- 3219 Returns 1 if the double-precision floating-point value `a' is less than 3220 the corresponding value `b', and 0 otherwise. The comparison is performed 3221 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3222 ------------------------------------------------------------------------------- 3223 */ 3224 flag float64_lt( float64 a, float64 b ) 3225 { 3226 flag aSign, bSign; 3227 3228 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3229 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3230 ) { 3231 float_raise( float_flag_invalid ); 3232 return 0; 3233 } 3234 aSign = extractFloat64Sign( a ); 3235 bSign = extractFloat64Sign( b ); 3236 if ( aSign != bSign ) 3237 return aSign && 3238 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3239 0 ); 3240 return ( a != b ) && 3241 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3242 3243 } 3244 3245 #ifndef SOFTFLOAT_FOR_GCC 3246 /* 3247 ------------------------------------------------------------------------------- 3248 Returns 1 if the double-precision floating-point value `a' is equal to the 3249 corresponding value `b', and 0 otherwise. The invalid exception is raised 3250 if either operand is a NaN. Otherwise, the comparison is performed 3251 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3252 ------------------------------------------------------------------------------- 3253 */ 3254 flag float64_eq_signaling( float64 a, float64 b ) 3255 { 3256 3257 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3258 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3259 ) { 3260 float_raise( float_flag_invalid ); 3261 return 0; 3262 } 3263 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3264 3265 } 3266 3267 /* 3268 ------------------------------------------------------------------------------- 3269 Returns 1 if the double-precision floating-point value `a' is less than or 3270 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3271 cause an exception. Otherwise, the comparison is performed according to the 3272 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3273 ------------------------------------------------------------------------------- 3274 */ 3275 flag float64_le_quiet( float64 a, float64 b ) 3276 { 3277 flag aSign, bSign; 3278 3279 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3280 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3281 ) { 3282 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3283 float_raise( float_flag_invalid ); 3284 } 3285 return 0; 3286 } 3287 aSign = extractFloat64Sign( a ); 3288 bSign = extractFloat64Sign( b ); 3289 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3290 return ( a == b ) || ( aSign ^ ( a < b ) ); 3291 3292 } 3293 3294 /* 3295 ------------------------------------------------------------------------------- 3296 Returns 1 if the double-precision floating-point value `a' is less than 3297 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3298 exception. Otherwise, the comparison is performed according to the IEC/IEEE 3299 Standard for Binary Floating-Point Arithmetic. 3300 ------------------------------------------------------------------------------- 3301 */ 3302 flag float64_lt_quiet( float64 a, float64 b ) 3303 { 3304 flag aSign, bSign; 3305 3306 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3307 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3308 ) { 3309 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3310 float_raise( float_flag_invalid ); 3311 } 3312 return 0; 3313 } 3314 aSign = extractFloat64Sign( a ); 3315 bSign = extractFloat64Sign( b ); 3316 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3317 return ( a != b ) && ( aSign ^ ( a < b ) ); 3318 3319 } 3320 #endif 3321 3322 #ifdef FLOATX80 3323 3324 /* 3325 ------------------------------------------------------------------------------- 3326 Returns the result of converting the extended double-precision floating- 3327 point value `a' to the 32-bit two's complement integer format. The 3328 conversion is performed according to the IEC/IEEE Standard for Binary 3329 Floating-Point Arithmetic---which means in particular that the conversion 3330 is rounded according to the current rounding mode. If `a' is a NaN, the 3331 largest positive integer is returned. Otherwise, if the conversion 3332 overflows, the largest integer with the same sign as `a' is returned. 3333 ------------------------------------------------------------------------------- 3334 */ 3335 int32 floatx80_to_int32( floatx80 a ) 3336 { 3337 flag aSign; 3338 int32 aExp, shiftCount; 3339 bits64 aSig; 3340 3341 aSig = extractFloatx80Frac( a ); 3342 aExp = extractFloatx80Exp( a ); 3343 aSign = extractFloatx80Sign( a ); 3344 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3345 shiftCount = 0x4037 - aExp; 3346 if ( shiftCount <= 0 ) shiftCount = 1; 3347 shift64RightJamming( aSig, shiftCount, &aSig ); 3348 return roundAndPackInt32( aSign, aSig ); 3349 3350 } 3351 3352 /* 3353 ------------------------------------------------------------------------------- 3354 Returns the result of converting the extended double-precision floating- 3355 point value `a' to the 32-bit two's complement integer format. The 3356 conversion is performed according to the IEC/IEEE Standard for Binary 3357 Floating-Point Arithmetic, except that the conversion is always rounded 3358 toward zero. If `a' is a NaN, the largest positive integer is returned. 3359 Otherwise, if the conversion overflows, the largest integer with the same 3360 sign as `a' is returned. 3361 ------------------------------------------------------------------------------- 3362 */ 3363 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3364 { 3365 flag aSign; 3366 int32 aExp, shiftCount; 3367 bits64 aSig, savedASig; 3368 int32 z; 3369 3370 aSig = extractFloatx80Frac( a ); 3371 aExp = extractFloatx80Exp( a ); 3372 aSign = extractFloatx80Sign( a ); 3373 if ( 0x401E < aExp ) { 3374 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3375 goto invalid; 3376 } 3377 else if ( aExp < 0x3FFF ) { 3378 if ( aExp || aSig ) float_set_inexact(); 3379 return 0; 3380 } 3381 shiftCount = 0x403E - aExp; 3382 savedASig = aSig; 3383 aSig >>= shiftCount; 3384 z = aSig; 3385 if ( aSign ) z = - z; 3386 if ( ( z < 0 ) ^ aSign ) { 3387 invalid: 3388 float_raise( float_flag_invalid ); 3389 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3390 } 3391 if ( ( aSig<<shiftCount ) != savedASig ) { 3392 float_set_inexact(); 3393 } 3394 return z; 3395 3396 } 3397 3398 /* 3399 ------------------------------------------------------------------------------- 3400 Returns the result of converting the extended double-precision floating- 3401 point value `a' to the 64-bit two's complement integer format. The 3402 conversion is performed according to the IEC/IEEE Standard for Binary 3403 Floating-Point Arithmetic---which means in particular that the conversion 3404 is rounded according to the current rounding mode. If `a' is a NaN, 3405 the largest positive integer is returned. Otherwise, if the conversion 3406 overflows, the largest integer with the same sign as `a' is returned. 3407 ------------------------------------------------------------------------------- 3408 */ 3409 int64 floatx80_to_int64( floatx80 a ) 3410 { 3411 flag aSign; 3412 int32 aExp, shiftCount; 3413 bits64 aSig, aSigExtra; 3414 3415 aSig = extractFloatx80Frac( a ); 3416 aExp = extractFloatx80Exp( a ); 3417 aSign = extractFloatx80Sign( a ); 3418 shiftCount = 0x403E - aExp; 3419 if ( shiftCount <= 0 ) { 3420 if ( shiftCount ) { 3421 float_raise( float_flag_invalid ); 3422 if ( ! aSign 3423 || ( ( aExp == 0x7FFF ) 3424 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3425 ) { 3426 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3427 } 3428 return (sbits64) LIT64( 0x8000000000000000 ); 3429 } 3430 aSigExtra = 0; 3431 } 3432 else { 3433 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3434 } 3435 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3436 3437 } 3438 3439 /* 3440 ------------------------------------------------------------------------------- 3441 Returns the result of converting the extended double-precision floating- 3442 point value `a' to the 64-bit two's complement integer format. The 3443 conversion is performed according to the IEC/IEEE Standard for Binary 3444 Floating-Point Arithmetic, except that the conversion is always rounded 3445 toward zero. If `a' is a NaN, the largest positive integer is returned. 3446 Otherwise, if the conversion overflows, the largest integer with the same 3447 sign as `a' is returned. 3448 ------------------------------------------------------------------------------- 3449 */ 3450 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3451 { 3452 flag aSign; 3453 int32 aExp, shiftCount; 3454 bits64 aSig; 3455 int64 z; 3456 3457 aSig = extractFloatx80Frac( a ); 3458 aExp = extractFloatx80Exp( a ); 3459 aSign = extractFloatx80Sign( a ); 3460 shiftCount = aExp - 0x403E; 3461 if ( 0 <= shiftCount ) { 3462 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3463 if ( ( a.high != 0xC03E ) || aSig ) { 3464 float_raise( float_flag_invalid ); 3465 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3466 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3467 } 3468 } 3469 return (sbits64) LIT64( 0x8000000000000000 ); 3470 } 3471 else if ( aExp < 0x3FFF ) { 3472 if ( aExp | aSig ) float_set_inexact(); 3473 return 0; 3474 } 3475 z = aSig>>( - shiftCount ); 3476 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3477 float_set_inexact(); 3478 } 3479 if ( aSign ) z = - z; 3480 return z; 3481 3482 } 3483 3484 /* 3485 ------------------------------------------------------------------------------- 3486 Returns the result of converting the extended double-precision floating- 3487 point value `a' to the single-precision floating-point format. The 3488 conversion is performed according to the IEC/IEEE Standard for Binary 3489 Floating-Point Arithmetic. 3490 ------------------------------------------------------------------------------- 3491 */ 3492 float32 floatx80_to_float32( floatx80 a ) 3493 { 3494 flag aSign; 3495 int32 aExp; 3496 bits64 aSig; 3497 3498 aSig = extractFloatx80Frac( a ); 3499 aExp = extractFloatx80Exp( a ); 3500 aSign = extractFloatx80Sign( a ); 3501 if ( aExp == 0x7FFF ) { 3502 if ( (bits64) ( aSig<<1 ) ) { 3503 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3504 } 3505 return packFloat32( aSign, 0xFF, 0 ); 3506 } 3507 shift64RightJamming( aSig, 33, &aSig ); 3508 if ( aExp || aSig ) aExp -= 0x3F81; 3509 return roundAndPackFloat32( aSign, aExp, aSig ); 3510 3511 } 3512 3513 /* 3514 ------------------------------------------------------------------------------- 3515 Returns the result of converting the extended double-precision floating- 3516 point value `a' to the double-precision floating-point format. The 3517 conversion is performed according to the IEC/IEEE Standard for Binary 3518 Floating-Point Arithmetic. 3519 ------------------------------------------------------------------------------- 3520 */ 3521 float64 floatx80_to_float64( floatx80 a ) 3522 { 3523 flag aSign; 3524 int32 aExp; 3525 bits64 aSig, zSig; 3526 3527 aSig = extractFloatx80Frac( a ); 3528 aExp = extractFloatx80Exp( a ); 3529 aSign = extractFloatx80Sign( a ); 3530 if ( aExp == 0x7FFF ) { 3531 if ( (bits64) ( aSig<<1 ) ) { 3532 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3533 } 3534 return packFloat64( aSign, 0x7FF, 0 ); 3535 } 3536 shift64RightJamming( aSig, 1, &zSig ); 3537 if ( aExp || aSig ) aExp -= 0x3C01; 3538 return roundAndPackFloat64( aSign, aExp, zSig ); 3539 3540 } 3541 3542 #ifdef FLOAT128 3543 3544 /* 3545 ------------------------------------------------------------------------------- 3546 Returns the result of converting the extended double-precision floating- 3547 point value `a' to the quadruple-precision floating-point format. The 3548 conversion is performed according to the IEC/IEEE Standard for Binary 3549 Floating-Point Arithmetic. 3550 ------------------------------------------------------------------------------- 3551 */ 3552 float128 floatx80_to_float128( floatx80 a ) 3553 { 3554 flag aSign; 3555 int16 aExp; 3556 bits64 aSig, zSig0, zSig1; 3557 3558 aSig = extractFloatx80Frac( a ); 3559 aExp = extractFloatx80Exp( a ); 3560 aSign = extractFloatx80Sign( a ); 3561 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3562 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3563 } 3564 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3565 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3566 3567 } 3568 3569 #endif 3570 3571 /* 3572 ------------------------------------------------------------------------------- 3573 Rounds the extended double-precision floating-point value `a' to an integer, 3574 and returns the result as an extended quadruple-precision floating-point 3575 value. The operation is performed according to the IEC/IEEE Standard for 3576 Binary Floating-Point Arithmetic. 3577 ------------------------------------------------------------------------------- 3578 */ 3579 floatx80 floatx80_round_to_int( floatx80 a ) 3580 { 3581 flag aSign; 3582 int32 aExp; 3583 bits64 lastBitMask, roundBitsMask; 3584 int8 roundingMode; 3585 floatx80 z; 3586 3587 aExp = extractFloatx80Exp( a ); 3588 if ( 0x403E <= aExp ) { 3589 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3590 return propagateFloatx80NaN( a, a ); 3591 } 3592 return a; 3593 } 3594 if ( aExp < 0x3FFF ) { 3595 if ( ( aExp == 0 ) 3596 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3597 return a; 3598 } 3599 float_set_inexact(); 3600 aSign = extractFloatx80Sign( a ); 3601 switch ( float_rounding_mode() ) { 3602 case float_round_nearest_even: 3603 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3604 ) { 3605 return 3606 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3607 } 3608 break; 3609 case float_round_down: 3610 return 3611 aSign ? 3612 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3613 : packFloatx80( 0, 0, 0 ); 3614 case float_round_up: 3615 return 3616 aSign ? packFloatx80( 1, 0, 0 ) 3617 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3618 } 3619 return packFloatx80( aSign, 0, 0 ); 3620 } 3621 lastBitMask = 1; 3622 lastBitMask <<= 0x403E - aExp; 3623 roundBitsMask = lastBitMask - 1; 3624 z = a; 3625 roundingMode = float_rounding_mode(); 3626 if ( roundingMode == float_round_nearest_even ) { 3627 z.low += lastBitMask>>1; 3628 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3629 } 3630 else if ( roundingMode != float_round_to_zero ) { 3631 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3632 z.low += roundBitsMask; 3633 } 3634 } 3635 z.low &= ~ roundBitsMask; 3636 if ( z.low == 0 ) { 3637 ++z.high; 3638 z.low = LIT64( 0x8000000000000000 ); 3639 } 3640 if ( z.low != a.low ) float_set_inexact(); 3641 return z; 3642 3643 } 3644 3645 /* 3646 ------------------------------------------------------------------------------- 3647 Returns the result of adding the absolute values of the extended double- 3648 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3649 negated before being returned. `zSign' is ignored if the result is a NaN. 3650 The addition is performed according to the IEC/IEEE Standard for Binary 3651 Floating-Point Arithmetic. 3652 ------------------------------------------------------------------------------- 3653 */ 3654 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3655 { 3656 int32 aExp, bExp, zExp; 3657 bits64 aSig, bSig, zSig0, zSig1; 3658 int32 expDiff; 3659 3660 aSig = extractFloatx80Frac( a ); 3661 aExp = extractFloatx80Exp( a ); 3662 bSig = extractFloatx80Frac( b ); 3663 bExp = extractFloatx80Exp( b ); 3664 expDiff = aExp - bExp; 3665 if ( 0 < expDiff ) { 3666 if ( aExp == 0x7FFF ) { 3667 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3668 return a; 3669 } 3670 if ( bExp == 0 ) --expDiff; 3671 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3672 zExp = aExp; 3673 } 3674 else if ( expDiff < 0 ) { 3675 if ( bExp == 0x7FFF ) { 3676 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3677 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3678 } 3679 if ( aExp == 0 ) ++expDiff; 3680 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3681 zExp = bExp; 3682 } 3683 else { 3684 if ( aExp == 0x7FFF ) { 3685 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3686 return propagateFloatx80NaN( a, b ); 3687 } 3688 return a; 3689 } 3690 zSig1 = 0; 3691 zSig0 = aSig + bSig; 3692 if ( aExp == 0 ) { 3693 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3694 goto roundAndPack; 3695 } 3696 zExp = aExp; 3697 goto shiftRight1; 3698 } 3699 zSig0 = aSig + bSig; 3700 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3701 shiftRight1: 3702 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3703 zSig0 |= LIT64( 0x8000000000000000 ); 3704 ++zExp; 3705 roundAndPack: 3706 return 3707 roundAndPackFloatx80( 3708 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3709 3710 } 3711 3712 /* 3713 ------------------------------------------------------------------------------- 3714 Returns the result of subtracting the absolute values of the extended 3715 double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3716 difference is negated before being returned. `zSign' is ignored if the 3717 result is a NaN. The subtraction is performed according to the IEC/IEEE 3718 Standard for Binary Floating-Point Arithmetic. 3719 ------------------------------------------------------------------------------- 3720 */ 3721 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3722 { 3723 int32 aExp, bExp, zExp; 3724 bits64 aSig, bSig, zSig0, zSig1; 3725 int32 expDiff; 3726 floatx80 z; 3727 3728 aSig = extractFloatx80Frac( a ); 3729 aExp = extractFloatx80Exp( a ); 3730 bSig = extractFloatx80Frac( b ); 3731 bExp = extractFloatx80Exp( b ); 3732 expDiff = aExp - bExp; 3733 if ( 0 < expDiff ) goto aExpBigger; 3734 if ( expDiff < 0 ) goto bExpBigger; 3735 if ( aExp == 0x7FFF ) { 3736 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3737 return propagateFloatx80NaN( a, b ); 3738 } 3739 float_raise( float_flag_invalid ); 3740 z.low = floatx80_default_nan_low; 3741 z.high = floatx80_default_nan_high; 3742 return z; 3743 } 3744 if ( aExp == 0 ) { 3745 aExp = 1; 3746 bExp = 1; 3747 } 3748 zSig1 = 0; 3749 if ( bSig < aSig ) goto aBigger; 3750 if ( aSig < bSig ) goto bBigger; 3751 return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 ); 3752 bExpBigger: 3753 if ( bExp == 0x7FFF ) { 3754 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3755 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3756 } 3757 if ( aExp == 0 ) ++expDiff; 3758 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3759 bBigger: 3760 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3761 zExp = bExp; 3762 zSign ^= 1; 3763 goto normalizeRoundAndPack; 3764 aExpBigger: 3765 if ( aExp == 0x7FFF ) { 3766 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3767 return a; 3768 } 3769 if ( bExp == 0 ) --expDiff; 3770 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3771 aBigger: 3772 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3773 zExp = aExp; 3774 normalizeRoundAndPack: 3775 return 3776 normalizeRoundAndPackFloatx80( 3777 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3778 3779 } 3780 3781 /* 3782 ------------------------------------------------------------------------------- 3783 Returns the result of adding the extended double-precision floating-point 3784 values `a' and `b'. The operation is performed according to the IEC/IEEE 3785 Standard for Binary Floating-Point Arithmetic. 3786 ------------------------------------------------------------------------------- 3787 */ 3788 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3789 { 3790 flag aSign, bSign; 3791 3792 aSign = extractFloatx80Sign( a ); 3793 bSign = extractFloatx80Sign( b ); 3794 if ( aSign == bSign ) { 3795 return addFloatx80Sigs( a, b, aSign ); 3796 } 3797 else { 3798 return subFloatx80Sigs( a, b, aSign ); 3799 } 3800 3801 } 3802 3803 /* 3804 ------------------------------------------------------------------------------- 3805 Returns the result of subtracting the extended double-precision floating- 3806 point values `a' and `b'. The operation is performed according to the 3807 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3808 ------------------------------------------------------------------------------- 3809 */ 3810 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3811 { 3812 flag aSign, bSign; 3813 3814 aSign = extractFloatx80Sign( a ); 3815 bSign = extractFloatx80Sign( b ); 3816 if ( aSign == bSign ) { 3817 return subFloatx80Sigs( a, b, aSign ); 3818 } 3819 else { 3820 return addFloatx80Sigs( a, b, aSign ); 3821 } 3822 3823 } 3824 3825 /* 3826 ------------------------------------------------------------------------------- 3827 Returns the result of multiplying the extended double-precision floating- 3828 point values `a' and `b'. The operation is performed according to the 3829 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3830 ------------------------------------------------------------------------------- 3831 */ 3832 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3833 { 3834 flag aSign, bSign, zSign; 3835 int32 aExp, bExp, zExp; 3836 bits64 aSig, bSig, zSig0, zSig1; 3837 floatx80 z; 3838 3839 aSig = extractFloatx80Frac( a ); 3840 aExp = extractFloatx80Exp( a ); 3841 aSign = extractFloatx80Sign( a ); 3842 bSig = extractFloatx80Frac( b ); 3843 bExp = extractFloatx80Exp( b ); 3844 bSign = extractFloatx80Sign( b ); 3845 zSign = aSign ^ bSign; 3846 if ( aExp == 0x7FFF ) { 3847 if ( (bits64) ( aSig<<1 ) 3848 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3849 return propagateFloatx80NaN( a, b ); 3850 } 3851 if ( ( bExp | bSig ) == 0 ) goto invalid; 3852 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3853 } 3854 if ( bExp == 0x7FFF ) { 3855 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3856 if ( ( aExp | aSig ) == 0 ) { 3857 invalid: 3858 float_raise( float_flag_invalid ); 3859 z.low = floatx80_default_nan_low; 3860 z.high = floatx80_default_nan_high; 3861 return z; 3862 } 3863 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3864 } 3865 if ( aExp == 0 ) { 3866 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3867 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3868 } 3869 if ( bExp == 0 ) { 3870 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3871 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3872 } 3873 zExp = aExp + bExp - 0x3FFE; 3874 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3875 if ( 0 < (sbits64) zSig0 ) { 3876 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3877 --zExp; 3878 } 3879 return 3880 roundAndPackFloatx80( 3881 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3882 3883 } 3884 3885 /* 3886 ------------------------------------------------------------------------------- 3887 Returns the result of dividing the extended double-precision floating-point 3888 value `a' by the corresponding value `b'. The operation is performed 3889 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3890 ------------------------------------------------------------------------------- 3891 */ 3892 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3893 { 3894 flag aSign, bSign, zSign; 3895 int32 aExp, bExp, zExp; 3896 bits64 aSig, bSig, zSig0, zSig1; 3897 bits64 rem0, rem1, rem2, term0, term1, term2; 3898 floatx80 z; 3899 3900 aSig = extractFloatx80Frac( a ); 3901 aExp = extractFloatx80Exp( a ); 3902 aSign = extractFloatx80Sign( a ); 3903 bSig = extractFloatx80Frac( b ); 3904 bExp = extractFloatx80Exp( b ); 3905 bSign = extractFloatx80Sign( b ); 3906 zSign = aSign ^ bSign; 3907 if ( aExp == 0x7FFF ) { 3908 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3909 if ( bExp == 0x7FFF ) { 3910 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3911 goto invalid; 3912 } 3913 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3914 } 3915 if ( bExp == 0x7FFF ) { 3916 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3917 return packFloatx80( zSign, 0, 0 ); 3918 } 3919 if ( bExp == 0 ) { 3920 if ( bSig == 0 ) { 3921 if ( ( aExp | aSig ) == 0 ) { 3922 invalid: 3923 float_raise( float_flag_invalid ); 3924 z.low = floatx80_default_nan_low; 3925 z.high = floatx80_default_nan_high; 3926 return z; 3927 } 3928 float_raise( float_flag_divbyzero ); 3929 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3930 } 3931 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3932 } 3933 if ( aExp == 0 ) { 3934 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3935 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3936 } 3937 zExp = aExp - bExp + 0x3FFE; 3938 rem1 = 0; 3939 if ( bSig <= aSig ) { 3940 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3941 ++zExp; 3942 } 3943 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3944 mul64To128( bSig, zSig0, &term0, &term1 ); 3945 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3946 while ( (sbits64) rem0 < 0 ) { 3947 --zSig0; 3948 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3949 } 3950 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3951 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3952 mul64To128( bSig, zSig1, &term1, &term2 ); 3953 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3954 while ( (sbits64) rem1 < 0 ) { 3955 --zSig1; 3956 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3957 } 3958 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3959 } 3960 return 3961 roundAndPackFloatx80( 3962 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3963 3964 } 3965 3966 /* 3967 ------------------------------------------------------------------------------- 3968 Returns the remainder of the extended double-precision floating-point value 3969 `a' with respect to the corresponding value `b'. The operation is performed 3970 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3971 ------------------------------------------------------------------------------- 3972 */ 3973 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3974 { 3975 flag aSign, bSign, zSign; 3976 int32 aExp, bExp, expDiff; 3977 bits64 aSig0, aSig1, bSig; 3978 bits64 q, term0, term1, alternateASig0, alternateASig1; 3979 floatx80 z; 3980 3981 aSig0 = extractFloatx80Frac( a ); 3982 aExp = extractFloatx80Exp( a ); 3983 aSign = extractFloatx80Sign( a ); 3984 bSig = extractFloatx80Frac( b ); 3985 bExp = extractFloatx80Exp( b ); 3986 bSign = extractFloatx80Sign( b ); 3987 if ( aExp == 0x7FFF ) { 3988 if ( (bits64) ( aSig0<<1 ) 3989 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3990 return propagateFloatx80NaN( a, b ); 3991 } 3992 goto invalid; 3993 } 3994 if ( bExp == 0x7FFF ) { 3995 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3996 return a; 3997 } 3998 if ( bExp == 0 ) { 3999 if ( bSig == 0 ) { 4000 invalid: 4001 float_raise( float_flag_invalid ); 4002 z.low = floatx80_default_nan_low; 4003 z.high = floatx80_default_nan_high; 4004 return z; 4005 } 4006 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4007 } 4008 if ( aExp == 0 ) { 4009 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 4010 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4011 } 4012 bSig |= LIT64( 0x8000000000000000 ); 4013 zSign = aSign; 4014 expDiff = aExp - bExp; 4015 aSig1 = 0; 4016 if ( expDiff < 0 ) { 4017 if ( expDiff < -1 ) return a; 4018 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 4019 expDiff = 0; 4020 } 4021 q = ( bSig <= aSig0 ); 4022 if ( q ) aSig0 -= bSig; 4023 expDiff -= 64; 4024 while ( 0 < expDiff ) { 4025 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4026 q = ( 2 < q ) ? q - 2 : 0; 4027 mul64To128( bSig, q, &term0, &term1 ); 4028 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4029 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 4030 expDiff -= 62; 4031 } 4032 expDiff += 64; 4033 if ( 0 < expDiff ) { 4034 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4035 q = ( 2 < q ) ? q - 2 : 0; 4036 q >>= 64 - expDiff; 4037 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 4038 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4039 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 4040 while ( le128( term0, term1, aSig0, aSig1 ) ) { 4041 ++q; 4042 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4043 } 4044 } 4045 else { 4046 term1 = 0; 4047 term0 = bSig; 4048 } 4049 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 4050 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4051 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4052 && ( q & 1 ) ) 4053 ) { 4054 aSig0 = alternateASig0; 4055 aSig1 = alternateASig1; 4056 zSign = ! zSign; 4057 } 4058 return 4059 normalizeRoundAndPackFloatx80( 4060 80, zSign, bExp + expDiff, aSig0, aSig1 ); 4061 4062 } 4063 4064 /* 4065 ------------------------------------------------------------------------------- 4066 Returns the square root of the extended double-precision floating-point 4067 value `a'. The operation is performed according to the IEC/IEEE Standard 4068 for Binary Floating-Point Arithmetic. 4069 ------------------------------------------------------------------------------- 4070 */ 4071 floatx80 floatx80_sqrt( floatx80 a ) 4072 { 4073 flag aSign; 4074 int32 aExp, zExp; 4075 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 4076 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4077 floatx80 z; 4078 4079 aSig0 = extractFloatx80Frac( a ); 4080 aExp = extractFloatx80Exp( a ); 4081 aSign = extractFloatx80Sign( a ); 4082 if ( aExp == 0x7FFF ) { 4083 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 4084 if ( ! aSign ) return a; 4085 goto invalid; 4086 } 4087 if ( aSign ) { 4088 if ( ( aExp | aSig0 ) == 0 ) return a; 4089 invalid: 4090 float_raise( float_flag_invalid ); 4091 z.low = floatx80_default_nan_low; 4092 z.high = floatx80_default_nan_high; 4093 return z; 4094 } 4095 if ( aExp == 0 ) { 4096 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4097 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4098 } 4099 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4100 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4101 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4102 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4103 doubleZSig0 = zSig0<<1; 4104 mul64To128( zSig0, zSig0, &term0, &term1 ); 4105 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4106 while ( (sbits64) rem0 < 0 ) { 4107 --zSig0; 4108 doubleZSig0 -= 2; 4109 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4110 } 4111 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4112 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4113 if ( zSig1 == 0 ) zSig1 = 1; 4114 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4115 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4116 mul64To128( zSig1, zSig1, &term2, &term3 ); 4117 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4118 while ( (sbits64) rem1 < 0 ) { 4119 --zSig1; 4120 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4121 term3 |= 1; 4122 term2 |= doubleZSig0; 4123 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4124 } 4125 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4126 } 4127 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4128 zSig0 |= doubleZSig0; 4129 return 4130 roundAndPackFloatx80( 4131 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4132 4133 } 4134 4135 /* 4136 ------------------------------------------------------------------------------- 4137 Returns 1 if the extended double-precision floating-point value `a' is 4138 equal to the corresponding value `b', and 0 otherwise. The comparison is 4139 performed according to the IEC/IEEE Standard for Binary Floating-Point 4140 Arithmetic. 4141 ------------------------------------------------------------------------------- 4142 */ 4143 flag floatx80_eq( floatx80 a, floatx80 b ) 4144 { 4145 4146 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4147 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4148 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4149 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4150 ) { 4151 if ( floatx80_is_signaling_nan( a ) 4152 || floatx80_is_signaling_nan( b ) ) { 4153 float_raise( float_flag_invalid ); 4154 } 4155 return 0; 4156 } 4157 return 4158 ( a.low == b.low ) 4159 && ( ( a.high == b.high ) 4160 || ( ( a.low == 0 ) 4161 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4162 ); 4163 4164 } 4165 4166 /* 4167 ------------------------------------------------------------------------------- 4168 Returns 1 if the extended double-precision floating-point value `a' is 4169 less than or equal to the corresponding value `b', and 0 otherwise. The 4170 comparison is performed according to the IEC/IEEE Standard for Binary 4171 Floating-Point Arithmetic. 4172 ------------------------------------------------------------------------------- 4173 */ 4174 flag floatx80_le( floatx80 a, floatx80 b ) 4175 { 4176 flag aSign, bSign; 4177 4178 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4179 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4180 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4181 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4182 ) { 4183 float_raise( float_flag_invalid ); 4184 return 0; 4185 } 4186 aSign = extractFloatx80Sign( a ); 4187 bSign = extractFloatx80Sign( b ); 4188 if ( aSign != bSign ) { 4189 return 4190 aSign 4191 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4192 == 0 ); 4193 } 4194 return 4195 aSign ? le128( b.high, b.low, a.high, a.low ) 4196 : le128( a.high, a.low, b.high, b.low ); 4197 4198 } 4199 4200 /* 4201 ------------------------------------------------------------------------------- 4202 Returns 1 if the extended double-precision floating-point value `a' is 4203 less than the corresponding value `b', and 0 otherwise. The comparison 4204 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4205 Arithmetic. 4206 ------------------------------------------------------------------------------- 4207 */ 4208 flag floatx80_lt( floatx80 a, floatx80 b ) 4209 { 4210 flag aSign, bSign; 4211 4212 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4213 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4214 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4215 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4216 ) { 4217 float_raise( float_flag_invalid ); 4218 return 0; 4219 } 4220 aSign = extractFloatx80Sign( a ); 4221 bSign = extractFloatx80Sign( b ); 4222 if ( aSign != bSign ) { 4223 return 4224 aSign 4225 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4226 != 0 ); 4227 } 4228 return 4229 aSign ? lt128( b.high, b.low, a.high, a.low ) 4230 : lt128( a.high, a.low, b.high, b.low ); 4231 4232 } 4233 4234 /* 4235 ------------------------------------------------------------------------------- 4236 Returns 1 if the extended double-precision floating-point value `a' is equal 4237 to the corresponding value `b', and 0 otherwise. The invalid exception is 4238 raised if either operand is a NaN. Otherwise, the comparison is performed 4239 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4240 ------------------------------------------------------------------------------- 4241 */ 4242 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4243 { 4244 4245 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4246 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4247 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4248 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4249 ) { 4250 float_raise( float_flag_invalid ); 4251 return 0; 4252 } 4253 return 4254 ( a.low == b.low ) 4255 && ( ( a.high == b.high ) 4256 || ( ( a.low == 0 ) 4257 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4258 ); 4259 4260 } 4261 4262 /* 4263 ------------------------------------------------------------------------------- 4264 Returns 1 if the extended double-precision floating-point value `a' is less 4265 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4266 do not cause an exception. Otherwise, the comparison is performed according 4267 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4268 ------------------------------------------------------------------------------- 4269 */ 4270 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4271 { 4272 flag aSign, bSign; 4273 4274 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4275 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4276 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4277 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4278 ) { 4279 if ( floatx80_is_signaling_nan( a ) 4280 || floatx80_is_signaling_nan( b ) ) { 4281 float_raise( float_flag_invalid ); 4282 } 4283 return 0; 4284 } 4285 aSign = extractFloatx80Sign( a ); 4286 bSign = extractFloatx80Sign( b ); 4287 if ( aSign != bSign ) { 4288 return 4289 aSign 4290 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4291 == 0 ); 4292 } 4293 return 4294 aSign ? le128( b.high, b.low, a.high, a.low ) 4295 : le128( a.high, a.low, b.high, b.low ); 4296 4297 } 4298 4299 /* 4300 ------------------------------------------------------------------------------- 4301 Returns 1 if the extended double-precision floating-point value `a' is less 4302 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4303 an exception. Otherwise, the comparison is performed according to the 4304 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4305 ------------------------------------------------------------------------------- 4306 */ 4307 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4308 { 4309 flag aSign, bSign; 4310 4311 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4312 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4313 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4314 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4315 ) { 4316 if ( floatx80_is_signaling_nan( a ) 4317 || floatx80_is_signaling_nan( b ) ) { 4318 float_raise( float_flag_invalid ); 4319 } 4320 return 0; 4321 } 4322 aSign = extractFloatx80Sign( a ); 4323 bSign = extractFloatx80Sign( b ); 4324 if ( aSign != bSign ) { 4325 return 4326 aSign 4327 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4328 != 0 ); 4329 } 4330 return 4331 aSign ? lt128( b.high, b.low, a.high, a.low ) 4332 : lt128( a.high, a.low, b.high, b.low ); 4333 4334 } 4335 4336 #endif 4337 4338 #ifdef FLOAT128 4339 4340 /* 4341 ------------------------------------------------------------------------------- 4342 Returns the result of converting the quadruple-precision floating-point 4343 value `a' to the 32-bit two's complement integer format. The conversion 4344 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4345 Arithmetic---which means in particular that the conversion is rounded 4346 according to the current rounding mode. If `a' is a NaN, the largest 4347 positive integer is returned. Otherwise, if the conversion overflows, the 4348 largest integer with the same sign as `a' is returned. 4349 ------------------------------------------------------------------------------- 4350 */ 4351 int32 float128_to_int32( float128 a ) 4352 { 4353 flag aSign; 4354 int32 aExp, shiftCount; 4355 bits64 aSig0, aSig1; 4356 4357 aSig1 = extractFloat128Frac1( a ); 4358 aSig0 = extractFloat128Frac0( a ); 4359 aExp = extractFloat128Exp( a ); 4360 aSign = extractFloat128Sign( a ); 4361 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4362 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4363 aSig0 |= ( aSig1 != 0 ); 4364 shiftCount = 0x4028 - aExp; 4365 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4366 return roundAndPackInt32( aSign, aSig0 ); 4367 4368 } 4369 4370 /* 4371 ------------------------------------------------------------------------------- 4372 Returns the result of converting the quadruple-precision floating-point 4373 value `a' to the 32-bit two's complement integer format. The conversion 4374 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4375 Arithmetic, except that the conversion is always rounded toward zero. If 4376 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4377 conversion overflows, the largest integer with the same sign as `a' is 4378 returned. 4379 ------------------------------------------------------------------------------- 4380 */ 4381 int32 float128_to_int32_round_to_zero( float128 a ) 4382 { 4383 flag aSign; 4384 int32 aExp, shiftCount; 4385 bits64 aSig0, aSig1, savedASig; 4386 int32 z; 4387 4388 aSig1 = extractFloat128Frac1( a ); 4389 aSig0 = extractFloat128Frac0( a ); 4390 aExp = extractFloat128Exp( a ); 4391 aSign = extractFloat128Sign( a ); 4392 aSig0 |= ( aSig1 != 0 ); 4393 if ( 0x401E < aExp ) { 4394 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4395 goto invalid; 4396 } 4397 else if ( aExp < 0x3FFF ) { 4398 if ( aExp || aSig0 ) float_set_inexact(); 4399 return 0; 4400 } 4401 aSig0 |= LIT64( 0x0001000000000000 ); 4402 shiftCount = 0x402F - aExp; 4403 savedASig = aSig0; 4404 aSig0 >>= shiftCount; 4405 z = aSig0; 4406 if ( aSign ) z = - z; 4407 if ( ( z < 0 ) ^ aSign ) { 4408 invalid: 4409 float_raise( float_flag_invalid ); 4410 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4411 } 4412 if ( ( aSig0<<shiftCount ) != savedASig ) { 4413 float_set_inexact(); 4414 } 4415 return z; 4416 4417 } 4418 4419 /* 4420 ------------------------------------------------------------------------------- 4421 Returns the result of converting the quadruple-precision floating-point 4422 value `a' to the 64-bit two's complement integer format. The conversion 4423 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4424 Arithmetic---which means in particular that the conversion is rounded 4425 according to the current rounding mode. If `a' is a NaN, the largest 4426 positive integer is returned. Otherwise, if the conversion overflows, the 4427 largest integer with the same sign as `a' is returned. 4428 ------------------------------------------------------------------------------- 4429 */ 4430 int64 float128_to_int64( float128 a ) 4431 { 4432 flag aSign; 4433 int32 aExp, shiftCount; 4434 bits64 aSig0, aSig1; 4435 4436 aSig1 = extractFloat128Frac1( a ); 4437 aSig0 = extractFloat128Frac0( a ); 4438 aExp = extractFloat128Exp( a ); 4439 aSign = extractFloat128Sign( a ); 4440 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4441 shiftCount = 0x402F - aExp; 4442 if ( shiftCount <= 0 ) { 4443 if ( 0x403E < aExp ) { 4444 float_raise( float_flag_invalid ); 4445 if ( ! aSign 4446 || ( ( aExp == 0x7FFF ) 4447 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4448 ) 4449 ) { 4450 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4451 } 4452 return (sbits64) LIT64( 0x8000000000000000 ); 4453 } 4454 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4455 } 4456 else { 4457 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4458 } 4459 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4460 4461 } 4462 4463 /* 4464 ------------------------------------------------------------------------------- 4465 Returns the result of converting the quadruple-precision floating-point 4466 value `a' to the 64-bit two's complement integer format. The conversion 4467 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4468 Arithmetic, except that the conversion is always rounded toward zero. 4469 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4470 the conversion overflows, the largest integer with the same sign as `a' is 4471 returned. 4472 ------------------------------------------------------------------------------- 4473 */ 4474 int64 float128_to_int64_round_to_zero( float128 a ) 4475 { 4476 flag aSign; 4477 int32 aExp, shiftCount; 4478 bits64 aSig0, aSig1; 4479 int64 z; 4480 4481 aSig1 = extractFloat128Frac1( a ); 4482 aSig0 = extractFloat128Frac0( a ); 4483 aExp = extractFloat128Exp( a ); 4484 aSign = extractFloat128Sign( a ); 4485 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4486 shiftCount = aExp - 0x402F; 4487 if ( 0 < shiftCount ) { 4488 if ( 0x403E <= aExp ) { 4489 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4490 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4491 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4492 if ( aSig1 ) float_set_inexact(); 4493 } 4494 else { 4495 float_raise( float_flag_invalid ); 4496 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4497 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4498 } 4499 } 4500 return (sbits64) LIT64( 0x8000000000000000 ); 4501 } 4502 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4503 if ( (bits64) ( aSig1<<shiftCount ) ) { 4504 float_set_inexact(); 4505 } 4506 } 4507 else { 4508 if ( aExp < 0x3FFF ) { 4509 if ( aExp | aSig0 | aSig1 ) { 4510 float_set_inexact(); 4511 } 4512 return 0; 4513 } 4514 z = aSig0>>( - shiftCount ); 4515 if ( aSig1 4516 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4517 float_set_inexact(); 4518 } 4519 } 4520 if ( aSign ) z = - z; 4521 return z; 4522 4523 } 4524 4525 /* 4526 ------------------------------------------------------------------------------- 4527 Returns the result of converting the quadruple-precision floating-point 4528 value `a' to the single-precision floating-point format. The conversion 4529 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4530 Arithmetic. 4531 ------------------------------------------------------------------------------- 4532 */ 4533 float32 float128_to_float32( float128 a ) 4534 { 4535 flag aSign; 4536 int32 aExp; 4537 bits64 aSig0, aSig1; 4538 bits32 zSig; 4539 4540 aSig1 = extractFloat128Frac1( a ); 4541 aSig0 = extractFloat128Frac0( a ); 4542 aExp = extractFloat128Exp( a ); 4543 aSign = extractFloat128Sign( a ); 4544 if ( aExp == 0x7FFF ) { 4545 if ( aSig0 | aSig1 ) { 4546 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4547 } 4548 return packFloat32( aSign, 0xFF, 0 ); 4549 } 4550 aSig0 |= ( aSig1 != 0 ); 4551 shift64RightJamming( aSig0, 18, &aSig0 ); 4552 zSig = aSig0; 4553 if ( aExp || zSig ) { 4554 zSig |= 0x40000000; 4555 aExp -= 0x3F81; 4556 } 4557 return roundAndPackFloat32( aSign, aExp, zSig ); 4558 4559 } 4560 4561 /* 4562 ------------------------------------------------------------------------------- 4563 Returns the result of converting the quadruple-precision floating-point 4564 value `a' to the double-precision floating-point format. The conversion 4565 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4566 Arithmetic. 4567 ------------------------------------------------------------------------------- 4568 */ 4569 float64 float128_to_float64( float128 a ) 4570 { 4571 flag aSign; 4572 int32 aExp; 4573 bits64 aSig0, aSig1; 4574 4575 aSig1 = extractFloat128Frac1( a ); 4576 aSig0 = extractFloat128Frac0( a ); 4577 aExp = extractFloat128Exp( a ); 4578 aSign = extractFloat128Sign( a ); 4579 if ( aExp == 0x7FFF ) { 4580 if ( aSig0 | aSig1 ) { 4581 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4582 } 4583 return packFloat64( aSign, 0x7FF, 0 ); 4584 } 4585 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4586 aSig0 |= ( aSig1 != 0 ); 4587 if ( aExp || aSig0 ) { 4588 aSig0 |= LIT64( 0x4000000000000000 ); 4589 aExp -= 0x3C01; 4590 } 4591 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4592 4593 } 4594 4595 #ifdef FLOATX80 4596 4597 /* 4598 ------------------------------------------------------------------------------- 4599 Returns the result of converting the quadruple-precision floating-point 4600 value `a' to the extended double-precision floating-point format. The 4601 conversion is performed according to the IEC/IEEE Standard for Binary 4602 Floating-Point Arithmetic. 4603 ------------------------------------------------------------------------------- 4604 */ 4605 floatx80 float128_to_floatx80( float128 a ) 4606 { 4607 flag aSign; 4608 int32 aExp; 4609 bits64 aSig0, aSig1; 4610 4611 aSig1 = extractFloat128Frac1( a ); 4612 aSig0 = extractFloat128Frac0( a ); 4613 aExp = extractFloat128Exp( a ); 4614 aSign = extractFloat128Sign( a ); 4615 if ( aExp == 0x7FFF ) { 4616 if ( aSig0 | aSig1 ) { 4617 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4618 } 4619 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4620 } 4621 if ( aExp == 0 ) { 4622 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4623 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4624 } 4625 else { 4626 aSig0 |= LIT64( 0x0001000000000000 ); 4627 } 4628 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4629 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4630 4631 } 4632 4633 #endif 4634 4635 /* 4636 ------------------------------------------------------------------------------- 4637 Rounds the quadruple-precision floating-point value `a' to an integer, and 4638 returns the result as a quadruple-precision floating-point value. The 4639 operation is performed according to the IEC/IEEE Standard for Binary 4640 Floating-Point Arithmetic. 4641 ------------------------------------------------------------------------------- 4642 */ 4643 float128 float128_round_to_int( float128 a ) 4644 { 4645 flag aSign; 4646 int32 aExp; 4647 bits64 lastBitMask, roundBitsMask; 4648 int8 roundingMode; 4649 float128 z; 4650 4651 aExp = extractFloat128Exp( a ); 4652 if ( 0x402F <= aExp ) { 4653 if ( 0x406F <= aExp ) { 4654 if ( ( aExp == 0x7FFF ) 4655 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4656 ) { 4657 return propagateFloat128NaN( a, a ); 4658 } 4659 return a; 4660 } 4661 lastBitMask = 1; 4662 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4663 roundBitsMask = lastBitMask - 1; 4664 z = a; 4665 roundingMode = float_rounding_mode(); 4666 if ( roundingMode == float_round_nearest_even ) { 4667 if ( lastBitMask ) { 4668 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4669 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4670 } 4671 else { 4672 if ( (sbits64) z.low < 0 ) { 4673 ++z.high; 4674 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4675 } 4676 } 4677 } 4678 else if ( roundingMode != float_round_to_zero ) { 4679 if ( extractFloat128Sign( z ) 4680 ^ ( roundingMode == float_round_up ) ) { 4681 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4682 } 4683 } 4684 z.low &= ~ roundBitsMask; 4685 } 4686 else { 4687 if ( aExp < 0x3FFF ) { 4688 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4689 float_set_inexact(); 4690 aSign = extractFloat128Sign( a ); 4691 switch ( float_rounding_mode() ) { 4692 case float_round_nearest_even: 4693 if ( ( aExp == 0x3FFE ) 4694 && ( extractFloat128Frac0( a ) 4695 | extractFloat128Frac1( a ) ) 4696 ) { 4697 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4698 } 4699 break; 4700 case float_round_down: 4701 return 4702 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4703 : packFloat128( 0, 0, 0, 0 ); 4704 case float_round_up: 4705 return 4706 aSign ? packFloat128( 1, 0, 0, 0 ) 4707 : packFloat128( 0, 0x3FFF, 0, 0 ); 4708 } 4709 return packFloat128( aSign, 0, 0, 0 ); 4710 } 4711 lastBitMask = 1; 4712 lastBitMask <<= 0x402F - aExp; 4713 roundBitsMask = lastBitMask - 1; 4714 z.low = 0; 4715 z.high = a.high; 4716 roundingMode = float_rounding_mode(); 4717 if ( roundingMode == float_round_nearest_even ) { 4718 z.high += lastBitMask>>1; 4719 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4720 z.high &= ~ lastBitMask; 4721 } 4722 } 4723 else if ( roundingMode != float_round_to_zero ) { 4724 if ( extractFloat128Sign( z ) 4725 ^ ( roundingMode == float_round_up ) ) { 4726 z.high |= ( a.low != 0 ); 4727 z.high += roundBitsMask; 4728 } 4729 } 4730 z.high &= ~ roundBitsMask; 4731 } 4732 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4733 float_set_inexact(); 4734 } 4735 return z; 4736 4737 } 4738 4739 /* 4740 ------------------------------------------------------------------------------- 4741 Returns the result of adding the absolute values of the quadruple-precision 4742 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4743 before being returned. `zSign' is ignored if the result is a NaN. 4744 The addition is performed according to the IEC/IEEE Standard for Binary 4745 Floating-Point Arithmetic. 4746 ------------------------------------------------------------------------------- 4747 */ 4748 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4749 { 4750 int32 aExp, bExp, zExp; 4751 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4752 int32 expDiff; 4753 4754 aSig1 = extractFloat128Frac1( a ); 4755 aSig0 = extractFloat128Frac0( a ); 4756 aExp = extractFloat128Exp( a ); 4757 bSig1 = extractFloat128Frac1( b ); 4758 bSig0 = extractFloat128Frac0( b ); 4759 bExp = extractFloat128Exp( b ); 4760 expDiff = aExp - bExp; 4761 if ( 0 < expDiff ) { 4762 if ( aExp == 0x7FFF ) { 4763 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4764 return a; 4765 } 4766 if ( bExp == 0 ) { 4767 --expDiff; 4768 } 4769 else { 4770 bSig0 |= LIT64( 0x0001000000000000 ); 4771 } 4772 shift128ExtraRightJamming( 4773 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4774 zExp = aExp; 4775 } 4776 else if ( expDiff < 0 ) { 4777 if ( bExp == 0x7FFF ) { 4778 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4779 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4780 } 4781 if ( aExp == 0 ) { 4782 ++expDiff; 4783 } 4784 else { 4785 aSig0 |= LIT64( 0x0001000000000000 ); 4786 } 4787 shift128ExtraRightJamming( 4788 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4789 zExp = bExp; 4790 } 4791 else { 4792 if ( aExp == 0x7FFF ) { 4793 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4794 return propagateFloat128NaN( a, b ); 4795 } 4796 return a; 4797 } 4798 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4799 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4800 zSig2 = 0; 4801 zSig0 |= LIT64( 0x0002000000000000 ); 4802 zExp = aExp; 4803 goto shiftRight1; 4804 } 4805 aSig0 |= LIT64( 0x0001000000000000 ); 4806 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4807 --zExp; 4808 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4809 ++zExp; 4810 shiftRight1: 4811 shift128ExtraRightJamming( 4812 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4813 roundAndPack: 4814 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4815 4816 } 4817 4818 /* 4819 ------------------------------------------------------------------------------- 4820 Returns the result of subtracting the absolute values of the quadruple- 4821 precision floating-point values `a' and `b'. If `zSign' is 1, the 4822 difference is negated before being returned. `zSign' is ignored if the 4823 result is a NaN. The subtraction is performed according to the IEC/IEEE 4824 Standard for Binary Floating-Point Arithmetic. 4825 ------------------------------------------------------------------------------- 4826 */ 4827 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4828 { 4829 int32 aExp, bExp, zExp; 4830 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4831 int32 expDiff; 4832 float128 z; 4833 4834 aSig1 = extractFloat128Frac1( a ); 4835 aSig0 = extractFloat128Frac0( a ); 4836 aExp = extractFloat128Exp( a ); 4837 bSig1 = extractFloat128Frac1( b ); 4838 bSig0 = extractFloat128Frac0( b ); 4839 bExp = extractFloat128Exp( b ); 4840 expDiff = aExp - bExp; 4841 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4842 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4843 if ( 0 < expDiff ) goto aExpBigger; 4844 if ( expDiff < 0 ) goto bExpBigger; 4845 if ( aExp == 0x7FFF ) { 4846 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4847 return propagateFloat128NaN( a, b ); 4848 } 4849 float_raise( float_flag_invalid ); 4850 z.low = float128_default_nan_low; 4851 z.high = float128_default_nan_high; 4852 return z; 4853 } 4854 if ( aExp == 0 ) { 4855 aExp = 1; 4856 bExp = 1; 4857 } 4858 if ( bSig0 < aSig0 ) goto aBigger; 4859 if ( aSig0 < bSig0 ) goto bBigger; 4860 if ( bSig1 < aSig1 ) goto aBigger; 4861 if ( aSig1 < bSig1 ) goto bBigger; 4862 return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 ); 4863 bExpBigger: 4864 if ( bExp == 0x7FFF ) { 4865 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4866 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4867 } 4868 if ( aExp == 0 ) { 4869 ++expDiff; 4870 } 4871 else { 4872 aSig0 |= LIT64( 0x4000000000000000 ); 4873 } 4874 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4875 bSig0 |= LIT64( 0x4000000000000000 ); 4876 bBigger: 4877 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4878 zExp = bExp; 4879 zSign ^= 1; 4880 goto normalizeRoundAndPack; 4881 aExpBigger: 4882 if ( aExp == 0x7FFF ) { 4883 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4884 return a; 4885 } 4886 if ( bExp == 0 ) { 4887 --expDiff; 4888 } 4889 else { 4890 bSig0 |= LIT64( 0x4000000000000000 ); 4891 } 4892 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4893 aSig0 |= LIT64( 0x4000000000000000 ); 4894 aBigger: 4895 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4896 zExp = aExp; 4897 normalizeRoundAndPack: 4898 --zExp; 4899 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4900 4901 } 4902 4903 /* 4904 ------------------------------------------------------------------------------- 4905 Returns the result of adding the quadruple-precision floating-point values 4906 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4907 for Binary Floating-Point Arithmetic. 4908 ------------------------------------------------------------------------------- 4909 */ 4910 float128 float128_add( float128 a, float128 b ) 4911 { 4912 flag aSign, bSign; 4913 4914 aSign = extractFloat128Sign( a ); 4915 bSign = extractFloat128Sign( b ); 4916 if ( aSign == bSign ) { 4917 return addFloat128Sigs( a, b, aSign ); 4918 } 4919 else { 4920 return subFloat128Sigs( a, b, aSign ); 4921 } 4922 4923 } 4924 4925 /* 4926 ------------------------------------------------------------------------------- 4927 Returns the result of subtracting the quadruple-precision floating-point 4928 values `a' and `b'. The operation is performed according to the IEC/IEEE 4929 Standard for Binary Floating-Point Arithmetic. 4930 ------------------------------------------------------------------------------- 4931 */ 4932 float128 float128_sub( float128 a, float128 b ) 4933 { 4934 flag aSign, bSign; 4935 4936 aSign = extractFloat128Sign( a ); 4937 bSign = extractFloat128Sign( b ); 4938 if ( aSign == bSign ) { 4939 return subFloat128Sigs( a, b, aSign ); 4940 } 4941 else { 4942 return addFloat128Sigs( a, b, aSign ); 4943 } 4944 4945 } 4946 4947 /* 4948 ------------------------------------------------------------------------------- 4949 Returns the result of multiplying the quadruple-precision floating-point 4950 values `a' and `b'. The operation is performed according to the IEC/IEEE 4951 Standard for Binary Floating-Point Arithmetic. 4952 ------------------------------------------------------------------------------- 4953 */ 4954 float128 float128_mul( float128 a, float128 b ) 4955 { 4956 flag aSign, bSign, zSign; 4957 int32 aExp, bExp, zExp; 4958 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4959 float128 z; 4960 4961 aSig1 = extractFloat128Frac1( a ); 4962 aSig0 = extractFloat128Frac0( a ); 4963 aExp = extractFloat128Exp( a ); 4964 aSign = extractFloat128Sign( a ); 4965 bSig1 = extractFloat128Frac1( b ); 4966 bSig0 = extractFloat128Frac0( b ); 4967 bExp = extractFloat128Exp( b ); 4968 bSign = extractFloat128Sign( b ); 4969 zSign = aSign ^ bSign; 4970 if ( aExp == 0x7FFF ) { 4971 if ( ( aSig0 | aSig1 ) 4972 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4973 return propagateFloat128NaN( a, b ); 4974 } 4975 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4976 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4977 } 4978 if ( bExp == 0x7FFF ) { 4979 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4980 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4981 invalid: 4982 float_raise( float_flag_invalid ); 4983 z.low = float128_default_nan_low; 4984 z.high = float128_default_nan_high; 4985 return z; 4986 } 4987 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4988 } 4989 if ( aExp == 0 ) { 4990 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4991 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4992 } 4993 if ( bExp == 0 ) { 4994 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4995 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4996 } 4997 zExp = aExp + bExp - 0x4000; 4998 aSig0 |= LIT64( 0x0001000000000000 ); 4999 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 5000 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 5001 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5002 zSig2 |= ( zSig3 != 0 ); 5003 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 5004 shift128ExtraRightJamming( 5005 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5006 ++zExp; 5007 } 5008 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5009 5010 } 5011 5012 /* 5013 ------------------------------------------------------------------------------- 5014 Returns the result of dividing the quadruple-precision floating-point value 5015 `a' by the corresponding value `b'. The operation is performed according to 5016 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5017 ------------------------------------------------------------------------------- 5018 */ 5019 float128 float128_div( float128 a, float128 b ) 5020 { 5021 flag aSign, bSign, zSign; 5022 int32 aExp, bExp, zExp; 5023 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5024 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5025 float128 z; 5026 5027 aSig1 = extractFloat128Frac1( a ); 5028 aSig0 = extractFloat128Frac0( a ); 5029 aExp = extractFloat128Exp( a ); 5030 aSign = extractFloat128Sign( a ); 5031 bSig1 = extractFloat128Frac1( b ); 5032 bSig0 = extractFloat128Frac0( b ); 5033 bExp = extractFloat128Exp( b ); 5034 bSign = extractFloat128Sign( b ); 5035 zSign = aSign ^ bSign; 5036 if ( aExp == 0x7FFF ) { 5037 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 5038 if ( bExp == 0x7FFF ) { 5039 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5040 goto invalid; 5041 } 5042 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5043 } 5044 if ( bExp == 0x7FFF ) { 5045 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5046 return packFloat128( zSign, 0, 0, 0 ); 5047 } 5048 if ( bExp == 0 ) { 5049 if ( ( bSig0 | bSig1 ) == 0 ) { 5050 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5051 invalid: 5052 float_raise( float_flag_invalid ); 5053 z.low = float128_default_nan_low; 5054 z.high = float128_default_nan_high; 5055 return z; 5056 } 5057 float_raise( float_flag_divbyzero ); 5058 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5059 } 5060 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5061 } 5062 if ( aExp == 0 ) { 5063 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5064 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5065 } 5066 zExp = aExp - bExp + 0x3FFD; 5067 shortShift128Left( 5068 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 5069 shortShift128Left( 5070 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5071 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 5072 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 5073 ++zExp; 5074 } 5075 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5076 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 5077 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 5078 while ( (sbits64) rem0 < 0 ) { 5079 --zSig0; 5080 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5081 } 5082 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5083 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5084 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5085 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5086 while ( (sbits64) rem1 < 0 ) { 5087 --zSig1; 5088 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5089 } 5090 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5091 } 5092 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5093 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5094 5095 } 5096 5097 /* 5098 ------------------------------------------------------------------------------- 5099 Returns the remainder of the quadruple-precision floating-point value `a' 5100 with respect to the corresponding value `b'. The operation is performed 5101 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5102 ------------------------------------------------------------------------------- 5103 */ 5104 float128 float128_rem( float128 a, float128 b ) 5105 { 5106 flag aSign, bSign, zSign; 5107 int32 aExp, bExp, expDiff; 5108 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5109 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5110 sbits64 sigMean0; 5111 float128 z; 5112 5113 aSig1 = extractFloat128Frac1( a ); 5114 aSig0 = extractFloat128Frac0( a ); 5115 aExp = extractFloat128Exp( a ); 5116 aSign = extractFloat128Sign( a ); 5117 bSig1 = extractFloat128Frac1( b ); 5118 bSig0 = extractFloat128Frac0( b ); 5119 bExp = extractFloat128Exp( b ); 5120 bSign = extractFloat128Sign( b ); 5121 if ( aExp == 0x7FFF ) { 5122 if ( ( aSig0 | aSig1 ) 5123 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5124 return propagateFloat128NaN( a, b ); 5125 } 5126 goto invalid; 5127 } 5128 if ( bExp == 0x7FFF ) { 5129 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5130 return a; 5131 } 5132 if ( bExp == 0 ) { 5133 if ( ( bSig0 | bSig1 ) == 0 ) { 5134 invalid: 5135 float_raise( float_flag_invalid ); 5136 z.low = float128_default_nan_low; 5137 z.high = float128_default_nan_high; 5138 return z; 5139 } 5140 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5141 } 5142 if ( aExp == 0 ) { 5143 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5144 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5145 } 5146 expDiff = aExp - bExp; 5147 if ( expDiff < -1 ) return a; 5148 shortShift128Left( 5149 aSig0 | LIT64( 0x0001000000000000 ), 5150 aSig1, 5151 15 - ( expDiff < 0 ), 5152 &aSig0, 5153 &aSig1 5154 ); 5155 shortShift128Left( 5156 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5157 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5158 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5159 expDiff -= 64; 5160 while ( 0 < expDiff ) { 5161 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5162 q = ( 4 < q ) ? q - 4 : 0; 5163 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5164 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5165 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5166 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5167 expDiff -= 61; 5168 } 5169 if ( -64 < expDiff ) { 5170 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5171 q = ( 4 < q ) ? q - 4 : 0; 5172 q >>= - expDiff; 5173 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5174 expDiff += 52; 5175 if ( expDiff < 0 ) { 5176 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5177 } 5178 else { 5179 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5180 } 5181 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5182 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5183 } 5184 else { 5185 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5186 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5187 } 5188 do { 5189 alternateASig0 = aSig0; 5190 alternateASig1 = aSig1; 5191 ++q; 5192 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5193 } while ( 0 <= (sbits64) aSig0 ); 5194 add128( 5195 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 ); 5196 if ( ( sigMean0 < 0 ) 5197 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5198 aSig0 = alternateASig0; 5199 aSig1 = alternateASig1; 5200 } 5201 zSign = ( (sbits64) aSig0 < 0 ); 5202 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5203 return 5204 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5205 5206 } 5207 5208 /* 5209 ------------------------------------------------------------------------------- 5210 Returns the square root of the quadruple-precision floating-point value `a'. 5211 The operation is performed according to the IEC/IEEE Standard for Binary 5212 Floating-Point Arithmetic. 5213 ------------------------------------------------------------------------------- 5214 */ 5215 float128 float128_sqrt( float128 a ) 5216 { 5217 flag aSign; 5218 int32 aExp, zExp; 5219 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5220 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5221 float128 z; 5222 5223 aSig1 = extractFloat128Frac1( a ); 5224 aSig0 = extractFloat128Frac0( a ); 5225 aExp = extractFloat128Exp( a ); 5226 aSign = extractFloat128Sign( a ); 5227 if ( aExp == 0x7FFF ) { 5228 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5229 if ( ! aSign ) return a; 5230 goto invalid; 5231 } 5232 if ( aSign ) { 5233 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5234 invalid: 5235 float_raise( float_flag_invalid ); 5236 z.low = float128_default_nan_low; 5237 z.high = float128_default_nan_high; 5238 return z; 5239 } 5240 if ( aExp == 0 ) { 5241 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5242 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5243 } 5244 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5245 aSig0 |= LIT64( 0x0001000000000000 ); 5246 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5247 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5248 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5249 doubleZSig0 = zSig0<<1; 5250 mul64To128( zSig0, zSig0, &term0, &term1 ); 5251 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5252 while ( (sbits64) rem0 < 0 ) { 5253 --zSig0; 5254 doubleZSig0 -= 2; 5255 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5256 } 5257 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5258 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5259 if ( zSig1 == 0 ) zSig1 = 1; 5260 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5261 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5262 mul64To128( zSig1, zSig1, &term2, &term3 ); 5263 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5264 while ( (sbits64) rem1 < 0 ) { 5265 --zSig1; 5266 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5267 term3 |= 1; 5268 term2 |= doubleZSig0; 5269 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5270 } 5271 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5272 } 5273 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5274 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5275 5276 } 5277 5278 /* 5279 ------------------------------------------------------------------------------- 5280 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5281 the corresponding value `b', and 0 otherwise. The comparison is performed 5282 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5283 ------------------------------------------------------------------------------- 5284 */ 5285 flag float128_eq( float128 a, float128 b ) 5286 { 5287 5288 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5289 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5290 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5291 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5292 ) { 5293 if ( float128_is_signaling_nan( a ) 5294 || float128_is_signaling_nan( b ) ) { 5295 float_raise( float_flag_invalid ); 5296 } 5297 return 0; 5298 } 5299 return 5300 ( a.low == b.low ) 5301 && ( ( a.high == b.high ) 5302 || ( ( a.low == 0 ) 5303 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5304 ); 5305 5306 } 5307 5308 /* 5309 ------------------------------------------------------------------------------- 5310 Returns 1 if the quadruple-precision floating-point value `a' is less than 5311 or equal to the corresponding value `b', and 0 otherwise. The comparison 5312 is performed according to the IEC/IEEE Standard for Binary Floating-Point 5313 Arithmetic. 5314 ------------------------------------------------------------------------------- 5315 */ 5316 flag float128_le( float128 a, float128 b ) 5317 { 5318 flag aSign, bSign; 5319 5320 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5321 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5322 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5323 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5324 ) { 5325 float_raise( float_flag_invalid ); 5326 return 0; 5327 } 5328 aSign = extractFloat128Sign( a ); 5329 bSign = extractFloat128Sign( b ); 5330 if ( aSign != bSign ) { 5331 return 5332 aSign 5333 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5334 == 0 ); 5335 } 5336 return 5337 aSign ? le128( b.high, b.low, a.high, a.low ) 5338 : le128( a.high, a.low, b.high, b.low ); 5339 5340 } 5341 5342 /* 5343 ------------------------------------------------------------------------------- 5344 Returns 1 if the quadruple-precision floating-point value `a' is less than 5345 the corresponding value `b', and 0 otherwise. The comparison is performed 5346 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5347 ------------------------------------------------------------------------------- 5348 */ 5349 flag float128_lt( float128 a, float128 b ) 5350 { 5351 flag aSign, bSign; 5352 5353 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5354 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5355 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5356 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5357 ) { 5358 float_raise( float_flag_invalid ); 5359 return 0; 5360 } 5361 aSign = extractFloat128Sign( a ); 5362 bSign = extractFloat128Sign( b ); 5363 if ( aSign != bSign ) { 5364 return 5365 aSign 5366 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5367 != 0 ); 5368 } 5369 return 5370 aSign ? lt128( b.high, b.low, a.high, a.low ) 5371 : lt128( a.high, a.low, b.high, b.low ); 5372 5373 } 5374 5375 /* 5376 ------------------------------------------------------------------------------- 5377 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5378 the corresponding value `b', and 0 otherwise. The invalid exception is 5379 raised if either operand is a NaN. Otherwise, the comparison is performed 5380 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5381 ------------------------------------------------------------------------------- 5382 */ 5383 flag float128_eq_signaling( float128 a, float128 b ) 5384 { 5385 5386 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5387 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5388 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5389 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5390 ) { 5391 float_raise( float_flag_invalid ); 5392 return 0; 5393 } 5394 return 5395 ( a.low == b.low ) 5396 && ( ( a.high == b.high ) 5397 || ( ( a.low == 0 ) 5398 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5399 ); 5400 5401 } 5402 5403 /* 5404 ------------------------------------------------------------------------------- 5405 Returns 1 if the quadruple-precision floating-point value `a' is less than 5406 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5407 cause an exception. Otherwise, the comparison is performed according to the 5408 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5409 ------------------------------------------------------------------------------- 5410 */ 5411 flag float128_le_quiet( float128 a, float128 b ) 5412 { 5413 flag aSign, bSign; 5414 5415 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5416 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5417 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5418 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5419 ) { 5420 if ( float128_is_signaling_nan( a ) 5421 || float128_is_signaling_nan( b ) ) { 5422 float_raise( float_flag_invalid ); 5423 } 5424 return 0; 5425 } 5426 aSign = extractFloat128Sign( a ); 5427 bSign = extractFloat128Sign( b ); 5428 if ( aSign != bSign ) { 5429 return 5430 aSign 5431 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5432 == 0 ); 5433 } 5434 return 5435 aSign ? le128( b.high, b.low, a.high, a.low ) 5436 : le128( a.high, a.low, b.high, b.low ); 5437 5438 } 5439 5440 /* 5441 ------------------------------------------------------------------------------- 5442 Returns 1 if the quadruple-precision floating-point value `a' is less than 5443 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5444 exception. Otherwise, the comparison is performed according to the IEC/IEEE 5445 Standard for Binary Floating-Point Arithmetic. 5446 ------------------------------------------------------------------------------- 5447 */ 5448 flag float128_lt_quiet( float128 a, float128 b ) 5449 { 5450 flag aSign, bSign; 5451 5452 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5453 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5454 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5455 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5456 ) { 5457 if ( float128_is_signaling_nan( a ) 5458 || float128_is_signaling_nan( b ) ) { 5459 float_raise( float_flag_invalid ); 5460 } 5461 return 0; 5462 } 5463 aSign = extractFloat128Sign( a ); 5464 bSign = extractFloat128Sign( b ); 5465 if ( aSign != bSign ) { 5466 return 5467 aSign 5468 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5469 != 0 ); 5470 } 5471 return 5472 aSign ? lt128( b.high, b.low, a.high, a.low ) 5473 : lt128( a.high, a.low, b.high, b.low ); 5474 5475 } 5476 5477 #endif 5478 5479 5480 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5481 5482 /* 5483 * These two routines are not part of the original softfloat distribution. 5484 * 5485 * They are based on the corresponding conversions to integer but return 5486 * unsigned numbers instead since these functions are required by GCC. 5487 * 5488 * Added by Mark Brinicombe <mark@netbsd.org> 27/09/97 5489 * 5490 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5491 */ 5492 5493 /* 5494 ------------------------------------------------------------------------------- 5495 Returns the result of converting the double-precision floating-point value 5496 `a' to the 32-bit unsigned integer format. The conversion is 5497 performed according to the IEC/IEEE Standard for Binary Floating-point 5498 Arithmetic, except that the conversion is always rounded toward zero. If 5499 `a' is a NaN, the largest positive integer is returned. If the conversion 5500 overflows, the largest integer positive is returned. 5501 ------------------------------------------------------------------------------- 5502 */ 5503 uint32 float64_to_uint32_round_to_zero( float64 a ) 5504 { 5505 flag aSign; 5506 int16 aExp, shiftCount; 5507 bits64 aSig, savedASig; 5508 uint32 z; 5509 5510 aSig = extractFloat64Frac( a ); 5511 aExp = extractFloat64Exp( a ); 5512 aSign = extractFloat64Sign( a ); 5513 5514 if (aSign) { 5515 float_raise( float_flag_invalid ); 5516 return(0); 5517 } 5518 5519 if ( 0x41E < aExp ) { 5520 float_raise( float_flag_invalid ); 5521 return 0xffffffff; 5522 } 5523 else if ( aExp < 0x3FF ) { 5524 if ( aExp || aSig ) float_set_inexact(); 5525 return 0; 5526 } 5527 aSig |= LIT64( 0x0010000000000000 ); 5528 shiftCount = 0x433 - aExp; 5529 savedASig = aSig; 5530 aSig >>= shiftCount; 5531 z = aSig; 5532 if ( ( aSig<<shiftCount ) != savedASig ) { 5533 float_set_inexact(); 5534 } 5535 return z; 5536 5537 } 5538 5539 /* 5540 ------------------------------------------------------------------------------- 5541 Returns the result of converting the single-precision floating-point value 5542 `a' to the 32-bit unsigned integer format. The conversion is 5543 performed according to the IEC/IEEE Standard for Binary Floating-point 5544 Arithmetic, except that the conversion is always rounded toward zero. If 5545 `a' is a NaN, the largest positive integer is returned. If the conversion 5546 overflows, the largest positive integer is returned. 5547 ------------------------------------------------------------------------------- 5548 */ 5549 uint32 float32_to_uint32_round_to_zero( float32 a ) 5550 { 5551 flag aSign; 5552 int16 aExp, shiftCount; 5553 bits32 aSig; 5554 uint32 z; 5555 5556 aSig = extractFloat32Frac( a ); 5557 aExp = extractFloat32Exp( a ); 5558 aSign = extractFloat32Sign( a ); 5559 shiftCount = aExp - 0x9E; 5560 5561 if (aSign) { 5562 float_raise( float_flag_invalid ); 5563 return(0); 5564 } 5565 if ( 0 < shiftCount ) { 5566 float_raise( float_flag_invalid ); 5567 return 0xFFFFFFFF; 5568 } 5569 else if ( aExp <= 0x7E ) { 5570 if ( aExp | aSig ) float_set_inexact(); 5571 return 0; 5572 } 5573 aSig = ( aSig | 0x800000 )<<8; 5574 z = aSig>>( - shiftCount ); 5575 if ( aSig<<( shiftCount & 31 ) ) { 5576 float_set_inexact(); 5577 } 5578 return z; 5579 5580 } 5581 5582 #endif 5583