1 /* $OpenBSD: softfloat.c,v 1.4 2009/10/27 23:59:35 deraadt Exp $ */ 2 /* $NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $ */ 3 4 /* 5 * This version hacked for use with gcc -msoft-float by bjh21. 6 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 7 * itself). 8 */ 9 10 /* 11 * Things you may want to define: 12 * 13 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 14 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 15 * properly renamed. 16 */ 17 18 /* 19 =============================================================================== 20 21 This C source file is part of the SoftFloat IEC/IEEE Floating-point 22 Arithmetic Package, Release 2a. 23 24 Written by John R. Hauser. This work was made possible in part by the 25 International Computer Science Institute, located at Suite 600, 1947 Center 26 Street, Berkeley, California 94704. Funding was partially provided by the 27 National Science Foundation under grant MIP-9311980. The original version 28 of this code was written as part of a project to build a fixed-point vector 29 processor in collaboration with the University of California at Berkeley, 30 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 31 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 32 arithmetic/SoftFloat.html'. 33 34 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable 35 effort has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT 36 WILL AT TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS 37 RESTRICTED TO PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL 38 RESPONSIBILITY FOR ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM 39 THEIR OWN USE OF THE SOFTWARE, AND WHO ALSO EFFECTIVELY INDEMNIFY 40 (possibly via similar legal warning) JOHN HAUSER AND THE INTERNATIONAL 41 COMPUTER SCIENCE INSTITUTE AGAINST ALL LOSSES, COSTS, OR OTHER PROBLEMS 42 ARISING FROM THE USE OF THE SOFTWARE BY THEIR CUSTOMERS AND CLIENTS. 43 44 Derivative works are acceptable, even for commercial purposes, so long as 45 (1) they include prominent notice that the work is derivative, and (2) they 46 include prominent notice akin to these four paragraphs for those parts of 47 this code that are retained. 48 49 =============================================================================== 50 */ 51 52 #include <sys/cdefs.h> 53 54 #ifdef SOFTFLOAT_FOR_GCC 55 #include "softfloat-for-gcc.h" 56 #endif 57 58 #include "milieu.h" 59 #include "softfloat.h" 60 61 /* 62 * Conversions between floats as stored in memory and floats as 63 * SoftFloat uses them 64 */ 65 #ifndef FLOAT64_DEMANGLE 66 #define FLOAT64_DEMANGLE(a) (a) 67 #endif 68 #ifndef FLOAT64_MANGLE 69 #define FLOAT64_MANGLE(a) (a) 70 #endif 71 72 /* 73 ------------------------------------------------------------------------------- 74 Floating-point rounding mode, extended double-precision rounding precision, 75 and exception flags. 76 ------------------------------------------------------------------------------- 77 */ 78 79 /* 80 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday. 81 * Right now, it does not. I've removed all other dynamic global 82 * variables. [ross] 83 */ 84 #ifdef FLOATX80 85 int8 floatx80_rounding_precision = 80; 86 #endif 87 88 /* 89 ------------------------------------------------------------------------------- 90 Primitive arithmetic functions, including multi-word arithmetic, and 91 division and square root approximations. (Can be specialized to target if 92 desired.) 93 ------------------------------------------------------------------------------- 94 */ 95 #include "softfloat-macros.h" 96 97 /* 98 ------------------------------------------------------------------------------- 99 Functions and definitions to determine: (1) whether tininess for underflow 100 is detected before or after rounding by default, (2) what (if anything) 101 happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 are propagated from function inputs to output. These details are target- 104 specific. 105 ------------------------------------------------------------------------------- 106 */ 107 #include "softfloat-specialize.h" 108 109 #ifndef SOFTFLOAT_FOR_GCC /* Not used */ 110 /* 111 ------------------------------------------------------------------------------- 112 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 113 and 7, and returns the properly rounded 32-bit integer corresponding to the 114 input. If `zSign' is 1, the input is negated before being converted to an 115 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 116 is simply rounded to an integer, with the inexact exception raised if the 117 input cannot be represented exactly as an integer. However, if the fixed- 118 point input is too large, the invalid exception is raised and the largest 119 positive or negative integer is returned. 120 ------------------------------------------------------------------------------- 121 */ 122 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 123 { 124 int8 roundingMode; 125 flag roundNearestEven; 126 int8 roundIncrement, roundBits; 127 int32 z; 128 129 roundingMode = float_rounding_mode(); 130 roundNearestEven = ( roundingMode == float_round_nearest_even ); 131 roundIncrement = 0x40; 132 if ( ! roundNearestEven ) { 133 if ( roundingMode == float_round_to_zero ) { 134 roundIncrement = 0; 135 } 136 else { 137 roundIncrement = 0x7F; 138 if ( zSign ) { 139 if ( roundingMode == float_round_up ) roundIncrement = 0; 140 } 141 else { 142 if ( roundingMode == float_round_down ) roundIncrement = 0; 143 } 144 } 145 } 146 roundBits = absZ & 0x7F; 147 absZ = ( absZ + roundIncrement )>>7; 148 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 149 z = absZ; 150 if ( zSign ) z = - z; 151 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 152 float_raise( float_flag_invalid ); 153 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 154 } 155 if ( roundBits ) float_set_inexact(); 156 return z; 157 158 } 159 160 /* 161 ------------------------------------------------------------------------------- 162 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 163 `absZ1', with binary point between bits 63 and 64 (between the input words), 164 and returns the properly rounded 64-bit integer corresponding to the input. 165 If `zSign' is 1, the input is negated before being converted to an integer. 166 Ordinarily, the fixed-point input is simply rounded to an integer, with 167 the inexact exception raised if the input cannot be represented exactly as 168 an integer. However, if the fixed-point input is too large, the invalid 169 exception is raised and the largest positive or negative integer is 170 returned. 171 ------------------------------------------------------------------------------- 172 */ 173 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 174 { 175 int8 roundingMode; 176 flag roundNearestEven, increment; 177 int64 z; 178 179 roundingMode = float_rounding_mode(); 180 roundNearestEven = ( roundingMode == float_round_nearest_even ); 181 increment = ( (sbits64) absZ1 < 0 ); 182 if ( ! roundNearestEven ) { 183 if ( roundingMode == float_round_to_zero ) { 184 increment = 0; 185 } 186 else { 187 if ( zSign ) { 188 increment = ( roundingMode == float_round_down ) && absZ1; 189 } 190 else { 191 increment = ( roundingMode == float_round_up ) && absZ1; 192 } 193 } 194 } 195 if ( increment ) { 196 ++absZ0; 197 if ( absZ0 == 0 ) goto overflow; 198 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 199 } 200 z = absZ0; 201 if ( zSign ) z = - z; 202 if ( z && ( ( z < 0 ) ^ zSign ) ) { 203 overflow: 204 float_raise( float_flag_invalid ); 205 return 206 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 207 : LIT64( 0x7FFFFFFFFFFFFFFF ); 208 } 209 if ( absZ1 ) float_set_inexact(); 210 return z; 211 212 } 213 214 #ifdef __alpha__ 215 /* 216 ------------------------------------------------------------------------------- 217 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 218 `absZ1', with binary point between bits 63 and 64 (between the input words), 219 and returns the properly rounded 64-bit integer corresponding to the input. 220 If `zSign' is 1, the input is negated before being converted to an integer. 221 Ordinarily, the fixed-point input is simply rounded to an integer, with 222 the inexact exception raised if the input cannot be represented exactly as 223 an integer. 224 ------------------------------------------------------------------------------- 225 */ 226 static int64 roundAndPackInt64NoOverflow( flag zSign, bits64 absZ0, 227 bits64 absZ1 ) 228 { 229 int8 roundingMode; 230 flag roundNearestEven, increment; 231 int64 z; 232 233 roundingMode = float_rounding_mode(); 234 roundNearestEven = ( roundingMode == float_round_nearest_even ); 235 increment = ( (sbits64) absZ1 < 0 ); 236 if ( ! roundNearestEven ) { 237 if ( roundingMode == float_round_to_zero ) { 238 increment = 0; 239 } 240 else { 241 if ( zSign ) { 242 increment = ( roundingMode == float_round_down ) && absZ1; 243 } 244 else { 245 increment = ( roundingMode == float_round_up ) && absZ1; 246 } 247 } 248 } 249 if ( increment ) { 250 ++absZ0; 251 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 252 } 253 z = absZ0; 254 if ( zSign ) z = - z; 255 if ( absZ1 ) float_set_inexact(); 256 return z; 257 258 } 259 #endif /* __alpha__ */ 260 #endif 261 262 /* 263 ------------------------------------------------------------------------------- 264 Returns the fraction bits of the single-precision floating-point value `a'. 265 ------------------------------------------------------------------------------- 266 */ 267 INLINE bits32 extractFloat32Frac( float32 a ) 268 { 269 270 return a & 0x007FFFFF; 271 272 } 273 274 /* 275 ------------------------------------------------------------------------------- 276 Returns the exponent bits of the single-precision floating-point value `a'. 277 ------------------------------------------------------------------------------- 278 */ 279 INLINE int16 extractFloat32Exp( float32 a ) 280 { 281 282 return ( a>>23 ) & 0xFF; 283 284 } 285 286 /* 287 ------------------------------------------------------------------------------- 288 Returns the sign bit of the single-precision floating-point value `a'. 289 ------------------------------------------------------------------------------- 290 */ 291 INLINE flag extractFloat32Sign( float32 a ) 292 { 293 294 return a>>31; 295 296 } 297 298 /* 299 ------------------------------------------------------------------------------- 300 Normalizes the subnormal single-precision floating-point value represented 301 by the denormalized significand `aSig'. The normalized exponent and 302 significand are stored at the locations pointed to by `zExpPtr' and 303 `zSigPtr', respectively. 304 ------------------------------------------------------------------------------- 305 */ 306 static void 307 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 308 { 309 int8 shiftCount; 310 311 shiftCount = countLeadingZeros32( aSig ) - 8; 312 *zSigPtr = aSig<<shiftCount; 313 *zExpPtr = 1 - shiftCount; 314 315 } 316 317 /* 318 ------------------------------------------------------------------------------- 319 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 320 single-precision floating-point value, returning the result. After being 321 shifted into the proper positions, the three fields are simply added 322 together to form the result. This means that any integer portion of `zSig' 323 will be added into the exponent. Since a properly normalized significand 324 will have an integer portion equal to 1, the `zExp' input should be 1 less 325 than the desired result exponent whenever `zSig' is a complete, normalized 326 significand. 327 ------------------------------------------------------------------------------- 328 */ 329 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 330 { 331 332 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 333 334 } 335 336 /* 337 ------------------------------------------------------------------------------- 338 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 339 and significand `zSig', and returns the proper single-precision floating- 340 point value corresponding to the abstract input. Ordinarily, the abstract 341 value is simply rounded and packed into the single-precision format, with 342 the inexact exception raised if the abstract input cannot be represented 343 exactly. However, if the abstract value is too large, the overflow and 344 inexact exceptions are raised and an infinity or maximal finite value is 345 returned. If the abstract value is too small, the input value is rounded to 346 a subnormal number, and the underflow and inexact exceptions are raised if 347 the abstract input cannot be represented exactly as a subnormal single- 348 precision floating-point number. 349 The input significand `zSig' has its binary point between bits 30 350 and 29, which is 7 bits to the left of the usual location. This shifted 351 significand must be normalized or smaller. If `zSig' is not normalized, 352 `zExp' must be 0; in that case, the result returned is a subnormal number, 353 and it must not require rounding. In the usual case that `zSig' is 354 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 355 The handling of underflow and overflow follows the IEC/IEEE Standard for 356 Binary Floating-Point Arithmetic. 357 ------------------------------------------------------------------------------- 358 */ 359 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 360 { 361 int8 roundingMode; 362 flag roundNearestEven; 363 int8 roundIncrement, roundBits; 364 flag isTiny; 365 366 roundingMode = float_rounding_mode(); 367 roundNearestEven = ( roundingMode == float_round_nearest_even ); 368 roundIncrement = 0x40; 369 if ( ! roundNearestEven ) { 370 if ( roundingMode == float_round_to_zero ) { 371 roundIncrement = 0; 372 } 373 else { 374 roundIncrement = 0x7F; 375 if ( zSign ) { 376 if ( roundingMode == float_round_up ) roundIncrement = 0; 377 } 378 else { 379 if ( roundingMode == float_round_down ) roundIncrement = 0; 380 } 381 } 382 } 383 roundBits = zSig & 0x7F; 384 if ( 0xFD <= (bits16) zExp ) { 385 if ( ( 0xFD < zExp ) 386 || ( ( zExp == 0xFD ) 387 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 388 ) { 389 float_raise( float_flag_overflow | float_flag_inexact ); 390 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 391 } 392 if ( zExp < 0 ) { 393 isTiny = 394 ( float_detect_tininess == float_tininess_before_rounding ) 395 || ( zExp < -1 ) 396 || ( zSig + roundIncrement < 0x80000000 ); 397 shift32RightJamming( zSig, - zExp, &zSig ); 398 zExp = 0; 399 roundBits = zSig & 0x7F; 400 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 401 } 402 } 403 if ( roundBits ) float_set_inexact(); 404 zSig = ( zSig + roundIncrement )>>7; 405 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 406 if ( zSig == 0 ) zExp = 0; 407 return packFloat32( zSign, zExp, zSig ); 408 409 } 410 411 /* 412 ------------------------------------------------------------------------------- 413 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 414 and significand `zSig', and returns the proper single-precision floating- 415 point value corresponding to the abstract input. This routine is just like 416 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 417 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 418 floating-point exponent. 419 ------------------------------------------------------------------------------- 420 */ 421 static float32 422 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 423 { 424 int8 shiftCount; 425 426 shiftCount = countLeadingZeros32( zSig ) - 1; 427 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 428 429 } 430 431 /* 432 ------------------------------------------------------------------------------- 433 Returns the fraction bits of the double-precision floating-point value `a'. 434 ------------------------------------------------------------------------------- 435 */ 436 INLINE bits64 extractFloat64Frac( float64 a ) 437 { 438 439 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 440 441 } 442 443 /* 444 ------------------------------------------------------------------------------- 445 Returns the exponent bits of the double-precision floating-point value `a'. 446 ------------------------------------------------------------------------------- 447 */ 448 INLINE int16 extractFloat64Exp( float64 a ) 449 { 450 451 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 452 453 } 454 455 /* 456 ------------------------------------------------------------------------------- 457 Returns the sign bit of the double-precision floating-point value `a'. 458 ------------------------------------------------------------------------------- 459 */ 460 INLINE flag extractFloat64Sign( float64 a ) 461 { 462 463 return FLOAT64_DEMANGLE(a)>>63; 464 465 } 466 467 /* 468 ------------------------------------------------------------------------------- 469 Normalizes the subnormal double-precision floating-point value represented 470 by the denormalized significand `aSig'. The normalized exponent and 471 significand are stored at the locations pointed to by `zExpPtr' and 472 `zSigPtr', respectively. 473 ------------------------------------------------------------------------------- 474 */ 475 static void 476 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 477 { 478 int8 shiftCount; 479 480 shiftCount = countLeadingZeros64( aSig ) - 11; 481 *zSigPtr = aSig<<shiftCount; 482 *zExpPtr = 1 - shiftCount; 483 484 } 485 486 /* 487 ------------------------------------------------------------------------------- 488 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 489 double-precision floating-point value, returning the result. After being 490 shifted into the proper positions, the three fields are simply added 491 together to form the result. This means that any integer portion of `zSig' 492 will be added into the exponent. Since a properly normalized significand 493 will have an integer portion equal to 1, the `zExp' input should be 1 less 494 than the desired result exponent whenever `zSig' is a complete, normalized 495 significand. 496 ------------------------------------------------------------------------------- 497 */ 498 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 499 { 500 501 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 502 ( ( (bits64) zExp )<<52 ) + zSig ); 503 504 } 505 506 /* 507 ------------------------------------------------------------------------------- 508 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 509 and significand `zSig', and returns the proper double-precision floating- 510 point value corresponding to the abstract input. Ordinarily, the abstract 511 value is simply rounded and packed into the double-precision format, with 512 the inexact exception raised if the abstract input cannot be represented 513 exactly. However, if the abstract value is too large, the overflow and 514 inexact exceptions are raised and an infinity or maximal finite value is 515 returned. If the abstract value is too small, the input value is rounded to 516 a subnormal number, and the underflow and inexact exceptions are raised if 517 the abstract input cannot be represented exactly as a subnormal double- 518 precision floating-point number. 519 The input significand `zSig' has its binary point between bits 62 520 and 61, which is 10 bits to the left of the usual location. This shifted 521 significand must be normalized or smaller. If `zSig' is not normalized, 522 `zExp' must be 0; in that case, the result returned is a subnormal number, 523 and it must not require rounding. In the usual case that `zSig' is 524 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 525 The handling of underflow and overflow follows the IEC/IEEE Standard for 526 Binary Floating-Point Arithmetic. 527 ------------------------------------------------------------------------------- 528 */ 529 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 530 { 531 int8 roundingMode; 532 flag roundNearestEven; 533 int16 roundIncrement, roundBits; 534 flag isTiny; 535 536 roundingMode = float_rounding_mode(); 537 roundNearestEven = ( roundingMode == float_round_nearest_even ); 538 roundIncrement = 0x200; 539 if ( ! roundNearestEven ) { 540 if ( roundingMode == float_round_to_zero ) { 541 roundIncrement = 0; 542 } 543 else { 544 roundIncrement = 0x3FF; 545 if ( zSign ) { 546 if ( roundingMode == float_round_up ) roundIncrement = 0; 547 } 548 else { 549 if ( roundingMode == float_round_down ) roundIncrement = 0; 550 } 551 } 552 } 553 roundBits = zSig & 0x3FF; 554 if ( 0x7FD <= (bits16) zExp ) { 555 if ( ( 0x7FD < zExp ) 556 || ( ( zExp == 0x7FD ) 557 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 558 ) { 559 float_raise( float_flag_overflow | float_flag_inexact ); 560 return FLOAT64_MANGLE( 561 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 562 ( roundIncrement == 0 )); 563 } 564 if ( zExp < 0 ) { 565 isTiny = 566 ( float_detect_tininess == float_tininess_before_rounding ) 567 || ( zExp < -1 ) 568 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 569 shift64RightJamming( zSig, - zExp, &zSig ); 570 zExp = 0; 571 roundBits = zSig & 0x3FF; 572 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 573 } 574 } 575 if ( roundBits ) float_set_inexact(); 576 zSig = ( zSig + roundIncrement )>>10; 577 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 578 if ( zSig == 0 ) zExp = 0; 579 return packFloat64( zSign, zExp, zSig ); 580 581 } 582 583 /* 584 ------------------------------------------------------------------------------- 585 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 586 and significand `zSig', and returns the proper double-precision floating- 587 point value corresponding to the abstract input. This routine is just like 588 `roundAndPackFloat64' except that `zSig' does not have to be normalized. 589 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 590 floating-point exponent. 591 ------------------------------------------------------------------------------- 592 */ 593 static float64 594 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 595 { 596 int8 shiftCount; 597 598 shiftCount = countLeadingZeros64( zSig ) - 1; 599 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 600 601 } 602 603 #ifdef FLOATX80 604 605 /* 606 ------------------------------------------------------------------------------- 607 Returns the fraction bits of the extended double-precision floating-point 608 value `a'. 609 ------------------------------------------------------------------------------- 610 */ 611 INLINE bits64 extractFloatx80Frac( floatx80 a ) 612 { 613 614 return a.low; 615 616 } 617 618 /* 619 ------------------------------------------------------------------------------- 620 Returns the exponent bits of the extended double-precision floating-point 621 value `a'. 622 ------------------------------------------------------------------------------- 623 */ 624 INLINE int32 extractFloatx80Exp( floatx80 a ) 625 { 626 627 return a.high & 0x7FFF; 628 629 } 630 631 /* 632 ------------------------------------------------------------------------------- 633 Returns the sign bit of the extended double-precision floating-point value 634 `a'. 635 ------------------------------------------------------------------------------- 636 */ 637 INLINE flag extractFloatx80Sign( floatx80 a ) 638 { 639 640 return a.high>>15; 641 642 } 643 644 /* 645 ------------------------------------------------------------------------------- 646 Normalizes the subnormal extended double-precision floating-point value 647 represented by the denormalized significand `aSig'. The normalized exponent 648 and significand are stored at the locations pointed to by `zExpPtr' and 649 `zSigPtr', respectively. 650 ------------------------------------------------------------------------------- 651 */ 652 static void 653 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 654 { 655 int8 shiftCount; 656 657 shiftCount = countLeadingZeros64( aSig ); 658 *zSigPtr = aSig<<shiftCount; 659 *zExpPtr = 1 - shiftCount; 660 661 } 662 663 /* 664 ------------------------------------------------------------------------------- 665 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 666 extended double-precision floating-point value, returning the result. 667 ------------------------------------------------------------------------------- 668 */ 669 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 670 { 671 floatx80 z; 672 673 z.low = zSig; 674 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 675 return z; 676 677 } 678 679 /* 680 ------------------------------------------------------------------------------- 681 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 682 and extended significand formed by the concatenation of `zSig0' and `zSig1', 683 and returns the proper extended double-precision floating-point value 684 corresponding to the abstract input. Ordinarily, the abstract value is 685 rounded and packed into the extended double-precision format, with the 686 inexact exception raised if the abstract input cannot be represented 687 exactly. However, if the abstract value is too large, the overflow and 688 inexact exceptions are raised and an infinity or maximal finite value is 689 returned. If the abstract value is too small, the input value is rounded to 690 a subnormal number, and the underflow and inexact exceptions are raised if 691 the abstract input cannot be represented exactly as a subnormal extended 692 double-precision floating-point number. 693 If `roundingPrecision' is 32 or 64, the result is rounded to the same 694 number of bits as single or double precision, respectively. Otherwise, the 695 result is rounded to the full precision of the extended double-precision 696 format. 697 The input significand must be normalized or smaller. If the input 698 significand is not normalized, `zExp' must be 0; in that case, the result 699 returned is a subnormal number, and it must not require rounding. The 700 handling of underflow and overflow follows the IEC/IEEE Standard for Binary 701 Floating-Point Arithmetic. 702 ------------------------------------------------------------------------------- 703 */ 704 static floatx80 705 roundAndPackFloatx80( 706 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 707 ) 708 { 709 int8 roundingMode; 710 flag roundNearestEven, increment, isTiny; 711 int64 roundIncrement, roundMask, roundBits; 712 713 roundingMode = float_rounding_mode(); 714 roundNearestEven = ( roundingMode == float_round_nearest_even ); 715 if ( roundingPrecision == 80 ) goto precision80; 716 if ( roundingPrecision == 64 ) { 717 roundIncrement = LIT64( 0x0000000000000400 ); 718 roundMask = LIT64( 0x00000000000007FF ); 719 } 720 else if ( roundingPrecision == 32 ) { 721 roundIncrement = LIT64( 0x0000008000000000 ); 722 roundMask = LIT64( 0x000000FFFFFFFFFF ); 723 } 724 else { 725 goto precision80; 726 } 727 zSig0 |= ( zSig1 != 0 ); 728 if ( ! roundNearestEven ) { 729 if ( roundingMode == float_round_to_zero ) { 730 roundIncrement = 0; 731 } 732 else { 733 roundIncrement = roundMask; 734 if ( zSign ) { 735 if ( roundingMode == float_round_up ) roundIncrement = 0; 736 } 737 else { 738 if ( roundingMode == float_round_down ) roundIncrement = 0; 739 } 740 } 741 } 742 roundBits = zSig0 & roundMask; 743 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 744 if ( ( 0x7FFE < zExp ) 745 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 746 ) { 747 goto overflow; 748 } 749 if ( zExp <= 0 ) { 750 isTiny = 751 ( float_detect_tininess == float_tininess_before_rounding ) 752 || ( zExp < 0 ) 753 || ( zSig0 <= zSig0 + roundIncrement ); 754 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 755 zExp = 0; 756 roundBits = zSig0 & roundMask; 757 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 758 if ( roundBits ) float_set_inexact(); 759 zSig0 += roundIncrement; 760 if ( (sbits64) zSig0 < 0 ) zExp = 1; 761 roundIncrement = roundMask + 1; 762 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 763 roundMask |= roundIncrement; 764 } 765 zSig0 &= ~ roundMask; 766 return packFloatx80( zSign, zExp, zSig0 ); 767 } 768 } 769 if ( roundBits ) float_set_inexact(); 770 zSig0 += roundIncrement; 771 if ( zSig0 < roundIncrement ) { 772 ++zExp; 773 zSig0 = LIT64( 0x8000000000000000 ); 774 } 775 roundIncrement = roundMask + 1; 776 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 777 roundMask |= roundIncrement; 778 } 779 zSig0 &= ~ roundMask; 780 if ( zSig0 == 0 ) zExp = 0; 781 return packFloatx80( zSign, zExp, zSig0 ); 782 precision80: 783 increment = ( (sbits64) zSig1 < 0 ); 784 if ( ! roundNearestEven ) { 785 if ( roundingMode == float_round_to_zero ) { 786 increment = 0; 787 } 788 else { 789 if ( zSign ) { 790 increment = ( roundingMode == float_round_down ) && zSig1; 791 } 792 else { 793 increment = ( roundingMode == float_round_up ) && zSig1; 794 } 795 } 796 } 797 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 798 if ( ( 0x7FFE < zExp ) 799 || ( ( zExp == 0x7FFE ) 800 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 801 && increment 802 ) 803 ) { 804 roundMask = 0; 805 overflow: 806 float_raise( float_flag_overflow | float_flag_inexact ); 807 if ( ( roundingMode == float_round_to_zero ) 808 || ( zSign && ( roundingMode == float_round_up ) ) 809 || ( ! zSign && ( roundingMode == float_round_down ) ) 810 ) { 811 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 812 } 813 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 814 } 815 if ( zExp <= 0 ) { 816 isTiny = 817 ( float_detect_tininess == float_tininess_before_rounding ) 818 || ( zExp < 0 ) 819 || ! increment 820 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 821 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 822 zExp = 0; 823 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 824 if ( zSig1 ) float_set_inexact(); 825 if ( roundNearestEven ) { 826 increment = ( (sbits64) zSig1 < 0 ); 827 } 828 else { 829 if ( zSign ) { 830 increment = ( roundingMode == float_round_down ) && zSig1; 831 } 832 else { 833 increment = ( roundingMode == float_round_up ) && zSig1; 834 } 835 } 836 if ( increment ) { 837 ++zSig0; 838 zSig0 &= 839 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 840 if ( (sbits64) zSig0 < 0 ) zExp = 1; 841 } 842 return packFloatx80( zSign, zExp, zSig0 ); 843 } 844 } 845 if ( zSig1 ) float_set_inexact(); 846 if ( increment ) { 847 ++zSig0; 848 if ( zSig0 == 0 ) { 849 ++zExp; 850 zSig0 = LIT64( 0x8000000000000000 ); 851 } 852 else { 853 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 854 } 855 } 856 else { 857 if ( zSig0 == 0 ) zExp = 0; 858 } 859 return packFloatx80( zSign, zExp, zSig0 ); 860 861 } 862 863 /* 864 ------------------------------------------------------------------------------- 865 Takes an abstract floating-point value having sign `zSign', exponent 866 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 867 and returns the proper extended double-precision floating-point value 868 corresponding to the abstract input. This routine is just like 869 `roundAndPackFloatx80' except that the input significand does not have to be 870 normalized. 871 ------------------------------------------------------------------------------- 872 */ 873 static floatx80 874 normalizeRoundAndPackFloatx80( 875 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 876 ) 877 { 878 int8 shiftCount; 879 880 if ( zSig0 == 0 ) { 881 zSig0 = zSig1; 882 zSig1 = 0; 883 zExp -= 64; 884 } 885 shiftCount = countLeadingZeros64( zSig0 ); 886 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 887 zExp -= shiftCount; 888 return 889 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 890 891 } 892 893 #endif 894 895 #ifdef FLOAT128 896 897 /* 898 ------------------------------------------------------------------------------- 899 Returns the least-significant 64 fraction bits of the quadruple-precision 900 floating-point value `a'. 901 ------------------------------------------------------------------------------- 902 */ 903 INLINE bits64 extractFloat128Frac1( float128 a ) 904 { 905 906 return a.low; 907 908 } 909 910 /* 911 ------------------------------------------------------------------------------- 912 Returns the most-significant 48 fraction bits of the quadruple-precision 913 floating-point value `a'. 914 ------------------------------------------------------------------------------- 915 */ 916 INLINE bits64 extractFloat128Frac0( float128 a ) 917 { 918 919 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 920 921 } 922 923 /* 924 ------------------------------------------------------------------------------- 925 Returns the exponent bits of the quadruple-precision floating-point value 926 `a'. 927 ------------------------------------------------------------------------------- 928 */ 929 INLINE int32 extractFloat128Exp( float128 a ) 930 { 931 932 return ( a.high>>48 ) & 0x7FFF; 933 934 } 935 936 /* 937 ------------------------------------------------------------------------------- 938 Returns the sign bit of the quadruple-precision floating-point value `a'. 939 ------------------------------------------------------------------------------- 940 */ 941 INLINE flag extractFloat128Sign( float128 a ) 942 { 943 944 return a.high>>63; 945 946 } 947 948 /* 949 ------------------------------------------------------------------------------- 950 Normalizes the subnormal quadruple-precision floating-point value 951 represented by the denormalized significand formed by the concatenation of 952 `aSig0' and `aSig1'. The normalized exponent is stored at the location 953 pointed to by `zExpPtr'. The most significant 49 bits of the normalized 954 significand are stored at the location pointed to by `zSig0Ptr', and the 955 least significant 64 bits of the normalized significand are stored at the 956 location pointed to by `zSig1Ptr'. 957 ------------------------------------------------------------------------------- 958 */ 959 static void 960 normalizeFloat128Subnormal( 961 bits64 aSig0, 962 bits64 aSig1, 963 int32 *zExpPtr, 964 bits64 *zSig0Ptr, 965 bits64 *zSig1Ptr 966 ) 967 { 968 int8 shiftCount; 969 970 if ( aSig0 == 0 ) { 971 shiftCount = countLeadingZeros64( aSig1 ) - 15; 972 if ( shiftCount < 0 ) { 973 *zSig0Ptr = aSig1>>( - shiftCount ); 974 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 975 } 976 else { 977 *zSig0Ptr = aSig1<<shiftCount; 978 *zSig1Ptr = 0; 979 } 980 *zExpPtr = - shiftCount - 63; 981 } 982 else { 983 shiftCount = countLeadingZeros64( aSig0 ) - 15; 984 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 985 *zExpPtr = 1 - shiftCount; 986 } 987 988 } 989 990 /* 991 ------------------------------------------------------------------------------- 992 Packs the sign `zSign', the exponent `zExp', and the significand formed 993 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 994 floating-point value, returning the result. After being shifted into the 995 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 996 added together to form the most significant 32 bits of the result. This 997 means that any integer portion of `zSig0' will be added into the exponent. 998 Since a properly normalized significand will have an integer portion equal 999 to 1, the `zExp' input should be 1 less than the desired result exponent 1000 whenever `zSig0' and `zSig1' concatenated form a complete, normalized 1001 significand. 1002 ------------------------------------------------------------------------------- 1003 */ 1004 INLINE float128 1005 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1006 { 1007 float128 z; 1008 1009 z.low = zSig1; 1010 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 1011 return z; 1012 1013 } 1014 1015 /* 1016 ------------------------------------------------------------------------------- 1017 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1018 and extended significand formed by the concatenation of `zSig0', `zSig1', 1019 and `zSig2', and returns the proper quadruple-precision floating-point value 1020 corresponding to the abstract input. Ordinarily, the abstract value is 1021 simply rounded and packed into the quadruple-precision format, with the 1022 inexact exception raised if the abstract input cannot be represented 1023 exactly. However, if the abstract value is too large, the overflow and 1024 inexact exceptions are raised and an infinity or maximal finite value is 1025 returned. If the abstract value is too small, the input value is rounded to 1026 a subnormal number, and the underflow and inexact exceptions are raised if 1027 the abstract input cannot be represented exactly as a subnormal quadruple- 1028 precision floating-point number. 1029 The input significand must be normalized or smaller. If the input 1030 significand is not normalized, `zExp' must be 0; in that case, the result 1031 returned is a subnormal number, and it must not require rounding. In the 1032 usual case that the input significand is normalized, `zExp' must be 1 less 1033 than the ``true'' floating-point exponent. The handling of underflow and 1034 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1035 ------------------------------------------------------------------------------- 1036 */ 1037 static float128 1038 roundAndPackFloat128( 1039 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 1040 { 1041 int8 roundingMode; 1042 flag roundNearestEven, increment, isTiny; 1043 1044 roundingMode = float_rounding_mode(); 1045 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1046 increment = ( (sbits64) zSig2 < 0 ); 1047 if ( ! roundNearestEven ) { 1048 if ( roundingMode == float_round_to_zero ) { 1049 increment = 0; 1050 } 1051 else { 1052 if ( zSign ) { 1053 increment = ( roundingMode == float_round_down ) && zSig2; 1054 } 1055 else { 1056 increment = ( roundingMode == float_round_up ) && zSig2; 1057 } 1058 } 1059 } 1060 if ( 0x7FFD <= (bits32) zExp ) { 1061 if ( ( 0x7FFD < zExp ) 1062 || ( ( zExp == 0x7FFD ) 1063 && eq128( 1064 LIT64( 0x0001FFFFFFFFFFFF ), 1065 LIT64( 0xFFFFFFFFFFFFFFFF ), 1066 zSig0, 1067 zSig1 1068 ) 1069 && increment 1070 ) 1071 ) { 1072 float_raise( float_flag_overflow | float_flag_inexact ); 1073 if ( ( roundingMode == float_round_to_zero ) 1074 || ( zSign && ( roundingMode == float_round_up ) ) 1075 || ( ! zSign && ( roundingMode == float_round_down ) ) 1076 ) { 1077 return 1078 packFloat128( 1079 zSign, 1080 0x7FFE, 1081 LIT64( 0x0000FFFFFFFFFFFF ), 1082 LIT64( 0xFFFFFFFFFFFFFFFF ) 1083 ); 1084 } 1085 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1086 } 1087 if ( zExp < 0 ) { 1088 isTiny = 1089 ( float_detect_tininess == float_tininess_before_rounding ) 1090 || ( zExp < -1 ) 1091 || ! increment 1092 || lt128( 1093 zSig0, 1094 zSig1, 1095 LIT64( 0x0001FFFFFFFFFFFF ), 1096 LIT64( 0xFFFFFFFFFFFFFFFF ) 1097 ); 1098 shift128ExtraRightJamming( 1099 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1100 zExp = 0; 1101 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1102 if ( roundNearestEven ) { 1103 increment = ( (sbits64) zSig2 < 0 ); 1104 } 1105 else { 1106 if ( zSign ) { 1107 increment = ( roundingMode == float_round_down ) && zSig2; 1108 } 1109 else { 1110 increment = ( roundingMode == float_round_up ) && zSig2; 1111 } 1112 } 1113 } 1114 } 1115 if ( zSig2 ) float_set_inexact(); 1116 if ( increment ) { 1117 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1118 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1119 } 1120 else { 1121 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1122 } 1123 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1124 1125 } 1126 1127 /* 1128 ------------------------------------------------------------------------------- 1129 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1130 and significand formed by the concatenation of `zSig0' and `zSig1', and 1131 returns the proper quadruple-precision floating-point value corresponding 1132 to the abstract input. This routine is just like `roundAndPackFloat128' 1133 except that the input significand has fewer bits and does not have to be 1134 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1135 point exponent. 1136 ------------------------------------------------------------------------------- 1137 */ 1138 static float128 1139 normalizeRoundAndPackFloat128( 1140 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1141 { 1142 int8 shiftCount; 1143 bits64 zSig2; 1144 1145 if ( zSig0 == 0 ) { 1146 zSig0 = zSig1; 1147 zSig1 = 0; 1148 zExp -= 64; 1149 } 1150 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1151 if ( 0 <= shiftCount ) { 1152 zSig2 = 0; 1153 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1154 } 1155 else { 1156 shift128ExtraRightJamming( 1157 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1158 } 1159 zExp -= shiftCount; 1160 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1161 1162 } 1163 1164 #endif 1165 1166 /* 1167 ------------------------------------------------------------------------------- 1168 Returns the result of converting the 32-bit two's complement integer `a' 1169 to the single-precision floating-point format. The conversion is performed 1170 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1171 ------------------------------------------------------------------------------- 1172 */ 1173 float32 int32_to_float32( int32 a ) 1174 { 1175 flag zSign; 1176 1177 if ( a == 0 ) return 0; 1178 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1179 zSign = ( a < 0 ); 1180 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 1181 1182 } 1183 1184 /* 1185 ------------------------------------------------------------------------------- 1186 Returns the result of converting the 32-bit two's complement integer `a' 1187 to the double-precision floating-point format. The conversion is performed 1188 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1189 ------------------------------------------------------------------------------- 1190 */ 1191 float64 int32_to_float64( int32 a ) 1192 { 1193 flag zSign; 1194 uint32 absA; 1195 int8 shiftCount; 1196 bits64 zSig; 1197 1198 if ( a == 0 ) return 0; 1199 zSign = ( a < 0 ); 1200 absA = zSign ? - a : a; 1201 shiftCount = countLeadingZeros32( absA ) + 21; 1202 zSig = absA; 1203 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1204 1205 } 1206 1207 #ifdef FLOATX80 1208 1209 /* 1210 ------------------------------------------------------------------------------- 1211 Returns the result of converting the 32-bit two's complement integer `a' 1212 to the extended double-precision floating-point format. The conversion 1213 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1214 Arithmetic. 1215 ------------------------------------------------------------------------------- 1216 */ 1217 floatx80 int32_to_floatx80( int32 a ) 1218 { 1219 flag zSign; 1220 uint32 absA; 1221 int8 shiftCount; 1222 bits64 zSig; 1223 1224 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1225 zSign = ( a < 0 ); 1226 absA = zSign ? - a : a; 1227 shiftCount = countLeadingZeros32( absA ) + 32; 1228 zSig = absA; 1229 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1230 1231 } 1232 1233 #endif 1234 1235 #ifdef FLOAT128 1236 1237 /* 1238 ------------------------------------------------------------------------------- 1239 Returns the result of converting the 32-bit two's complement integer `a' to 1240 the quadruple-precision floating-point format. The conversion is performed 1241 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1242 ------------------------------------------------------------------------------- 1243 */ 1244 float128 int32_to_float128( int32 a ) 1245 { 1246 flag zSign; 1247 uint32 absA; 1248 int8 shiftCount; 1249 bits64 zSig0; 1250 1251 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1252 zSign = ( a < 0 ); 1253 absA = zSign ? - a : a; 1254 shiftCount = countLeadingZeros32( absA ) + 17; 1255 zSig0 = absA; 1256 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1257 1258 } 1259 1260 #endif 1261 1262 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1263 /* 1264 ------------------------------------------------------------------------------- 1265 Returns the result of converting the 64-bit two's complement integer `a' 1266 to the single-precision floating-point format. The conversion is performed 1267 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1268 ------------------------------------------------------------------------------- 1269 */ 1270 float32 int64_to_float32( int64 a ) 1271 { 1272 flag zSign; 1273 uint64 absA; 1274 int8 shiftCount; 1275 1276 if ( a == 0 ) return 0; 1277 zSign = ( a < 0 ); 1278 absA = zSign ? - a : a; 1279 shiftCount = countLeadingZeros64( absA ) - 40; 1280 if ( 0 <= shiftCount ) { 1281 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1282 } 1283 else { 1284 shiftCount += 7; 1285 if ( shiftCount < 0 ) { 1286 shift64RightJamming( absA, - shiftCount, &absA ); 1287 } 1288 else { 1289 absA <<= shiftCount; 1290 } 1291 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1292 } 1293 1294 } 1295 1296 /* 1297 ------------------------------------------------------------------------------- 1298 Returns the result of converting the 64-bit two's complement integer `a' 1299 to the double-precision floating-point format. The conversion is performed 1300 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1301 ------------------------------------------------------------------------------- 1302 */ 1303 float64 int64_to_float64( int64 a ) 1304 { 1305 flag zSign; 1306 1307 if ( a == 0 ) return 0; 1308 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1309 return packFloat64( 1, 0x43E, 0 ); 1310 } 1311 zSign = ( a < 0 ); 1312 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1313 1314 } 1315 1316 #ifdef FLOATX80 1317 1318 /* 1319 ------------------------------------------------------------------------------- 1320 Returns the result of converting the 64-bit two's complement integer `a' 1321 to the extended double-precision floating-point format. The conversion 1322 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1323 Arithmetic. 1324 ------------------------------------------------------------------------------- 1325 */ 1326 floatx80 int64_to_floatx80( int64 a ) 1327 { 1328 flag zSign; 1329 uint64 absA; 1330 int8 shiftCount; 1331 1332 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1333 zSign = ( a < 0 ); 1334 absA = zSign ? - a : a; 1335 shiftCount = countLeadingZeros64( absA ); 1336 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1337 1338 } 1339 1340 #endif 1341 1342 #ifdef FLOAT128 1343 1344 /* 1345 ------------------------------------------------------------------------------- 1346 Returns the result of converting the 64-bit two's complement integer `a' to 1347 the quadruple-precision floating-point format. The conversion is performed 1348 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1349 ------------------------------------------------------------------------------- 1350 */ 1351 float128 int64_to_float128( int64 a ) 1352 { 1353 flag zSign; 1354 uint64 absA; 1355 int8 shiftCount; 1356 int32 zExp; 1357 bits64 zSig0, zSig1; 1358 1359 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1360 zSign = ( a < 0 ); 1361 absA = zSign ? - a : a; 1362 shiftCount = countLeadingZeros64( absA ) + 49; 1363 zExp = 0x406E - shiftCount; 1364 if ( 64 <= shiftCount ) { 1365 zSig1 = 0; 1366 zSig0 = absA; 1367 shiftCount -= 64; 1368 } 1369 else { 1370 zSig1 = absA; 1371 zSig0 = 0; 1372 } 1373 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1374 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1375 1376 } 1377 1378 #endif 1379 #endif /* !SOFTFLOAT_FOR_GCC */ 1380 1381 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1382 /* 1383 ------------------------------------------------------------------------------- 1384 Returns the result of converting the single-precision floating-point value 1385 `a' to the 32-bit two's complement integer format. The conversion is 1386 performed according to the IEC/IEEE Standard for Binary Floating-Point 1387 Arithmetic---which means in particular that the conversion is rounded 1388 according to the current rounding mode. If `a' is a NaN, the largest 1389 positive integer is returned. Otherwise, if the conversion overflows, the 1390 largest integer with the same sign as `a' is returned. 1391 ------------------------------------------------------------------------------- 1392 */ 1393 int32 float32_to_int32( float32 a ) 1394 { 1395 flag aSign; 1396 int16 aExp, shiftCount; 1397 bits32 aSig; 1398 bits64 aSig64; 1399 1400 aSig = extractFloat32Frac( a ); 1401 aExp = extractFloat32Exp( a ); 1402 aSign = extractFloat32Sign( a ); 1403 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1404 if ( aExp ) aSig |= 0x00800000; 1405 shiftCount = 0xAF - aExp; 1406 aSig64 = aSig; 1407 aSig64 <<= 32; 1408 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1409 return roundAndPackInt32( aSign, aSig64 ); 1410 1411 } 1412 #endif /* !SOFTFLOAT_FOR_GCC */ 1413 1414 /* 1415 ------------------------------------------------------------------------------- 1416 Returns the result of converting the single-precision floating-point value 1417 `a' to the 32-bit two's complement integer format. The conversion is 1418 performed according to the IEC/IEEE Standard for Binary Floating-Point 1419 Arithmetic, except that the conversion is always rounded toward zero. 1420 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1421 the conversion overflows, the largest integer with the same sign as `a' is 1422 returned. 1423 ------------------------------------------------------------------------------- 1424 */ 1425 int32 float32_to_int32_round_to_zero( float32 a ) 1426 { 1427 flag aSign; 1428 int16 aExp, shiftCount; 1429 bits32 aSig; 1430 int32 z; 1431 1432 aSig = extractFloat32Frac( a ); 1433 aExp = extractFloat32Exp( a ); 1434 aSign = extractFloat32Sign( a ); 1435 shiftCount = aExp - 0x9E; 1436 if ( 0 <= shiftCount ) { 1437 if ( a != 0xCF000000 ) { 1438 float_raise( float_flag_invalid ); 1439 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1440 } 1441 return (sbits32) 0x80000000; 1442 } 1443 else if ( aExp <= 0x7E ) { 1444 if ( aExp | aSig ) float_set_inexact(); 1445 return 0; 1446 } 1447 aSig = ( aSig | 0x00800000 )<<8; 1448 z = aSig>>( - shiftCount ); 1449 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1450 float_set_inexact(); 1451 } 1452 if ( aSign ) z = - z; 1453 return z; 1454 1455 } 1456 1457 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1458 /* 1459 ------------------------------------------------------------------------------- 1460 Returns the result of converting the single-precision floating-point value 1461 `a' to the 64-bit two's complement integer format. The conversion is 1462 performed according to the IEC/IEEE Standard for Binary Floating-Point 1463 Arithmetic---which means in particular that the conversion is rounded 1464 according to the current rounding mode. If `a' is a NaN, the largest 1465 positive integer is returned. Otherwise, if the conversion overflows, the 1466 largest integer with the same sign as `a' is returned. 1467 ------------------------------------------------------------------------------- 1468 */ 1469 int64 float32_to_int64( float32 a ) 1470 { 1471 flag aSign; 1472 int16 aExp, shiftCount; 1473 bits32 aSig; 1474 bits64 aSig64, aSigExtra; 1475 1476 aSig = extractFloat32Frac( a ); 1477 aExp = extractFloat32Exp( a ); 1478 aSign = extractFloat32Sign( a ); 1479 shiftCount = 0xBE - aExp; 1480 if ( shiftCount < 0 ) { 1481 float_raise( float_flag_invalid ); 1482 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1483 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1484 } 1485 return (sbits64) LIT64( 0x8000000000000000 ); 1486 } 1487 if ( aExp ) aSig |= 0x00800000; 1488 aSig64 = aSig; 1489 aSig64 <<= 40; 1490 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1491 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1492 1493 } 1494 1495 /* 1496 ------------------------------------------------------------------------------- 1497 Returns the result of converting the single-precision floating-point value 1498 `a' to the 64-bit two's complement integer format. The conversion is 1499 performed according to the IEC/IEEE Standard for Binary Floating-Point 1500 Arithmetic, except that the conversion is always rounded toward zero. If 1501 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1502 conversion overflows, the largest integer with the same sign as `a' is 1503 returned. 1504 ------------------------------------------------------------------------------- 1505 */ 1506 int64 float32_to_int64_round_to_zero( float32 a ) 1507 { 1508 flag aSign; 1509 int16 aExp, shiftCount; 1510 bits32 aSig; 1511 bits64 aSig64; 1512 int64 z; 1513 1514 aSig = extractFloat32Frac( a ); 1515 aExp = extractFloat32Exp( a ); 1516 aSign = extractFloat32Sign( a ); 1517 shiftCount = aExp - 0xBE; 1518 if ( 0 <= shiftCount ) { 1519 if ( a != 0xDF000000 ) { 1520 float_raise( float_flag_invalid ); 1521 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1522 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1523 } 1524 } 1525 return (sbits64) LIT64( 0x8000000000000000 ); 1526 } 1527 else if ( aExp <= 0x7E ) { 1528 if ( aExp | aSig ) float_set_inexact(); 1529 return 0; 1530 } 1531 aSig64 = aSig | 0x00800000; 1532 aSig64 <<= 40; 1533 z = aSig64>>( - shiftCount ); 1534 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1535 float_set_inexact(); 1536 } 1537 if ( aSign ) z = - z; 1538 return z; 1539 1540 } 1541 #endif /* !SOFTFLOAT_FOR_GCC */ 1542 1543 /* 1544 ------------------------------------------------------------------------------- 1545 Returns the result of converting the single-precision floating-point value 1546 `a' to the double-precision floating-point format. The conversion is 1547 performed according to the IEC/IEEE Standard for Binary Floating-Point 1548 Arithmetic. 1549 ------------------------------------------------------------------------------- 1550 */ 1551 float64 float32_to_float64( float32 a ) 1552 { 1553 flag aSign; 1554 int16 aExp; 1555 bits32 aSig; 1556 1557 aSig = extractFloat32Frac( a ); 1558 aExp = extractFloat32Exp( a ); 1559 aSign = extractFloat32Sign( a ); 1560 if ( aExp == 0xFF ) { 1561 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1562 return packFloat64( aSign, 0x7FF, 0 ); 1563 } 1564 if ( aExp == 0 ) { 1565 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1566 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1567 --aExp; 1568 } 1569 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1570 1571 } 1572 1573 #ifdef FLOATX80 1574 1575 /* 1576 ------------------------------------------------------------------------------- 1577 Returns the result of converting the single-precision floating-point value 1578 `a' to the extended double-precision floating-point format. The conversion 1579 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1580 Arithmetic. 1581 ------------------------------------------------------------------------------- 1582 */ 1583 floatx80 float32_to_floatx80( float32 a ) 1584 { 1585 flag aSign; 1586 int16 aExp; 1587 bits32 aSig; 1588 1589 aSig = extractFloat32Frac( a ); 1590 aExp = extractFloat32Exp( a ); 1591 aSign = extractFloat32Sign( a ); 1592 if ( aExp == 0xFF ) { 1593 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1594 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1595 } 1596 if ( aExp == 0 ) { 1597 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1598 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1599 } 1600 aSig |= 0x00800000; 1601 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1602 1603 } 1604 1605 #endif 1606 1607 #ifdef FLOAT128 1608 1609 /* 1610 ------------------------------------------------------------------------------- 1611 Returns the result of converting the single-precision floating-point value 1612 `a' to the double-precision floating-point format. The conversion is 1613 performed according to the IEC/IEEE Standard for Binary Floating-Point 1614 Arithmetic. 1615 ------------------------------------------------------------------------------- 1616 */ 1617 float128 float32_to_float128( float32 a ) 1618 { 1619 flag aSign; 1620 int16 aExp; 1621 bits32 aSig; 1622 1623 aSig = extractFloat32Frac( a ); 1624 aExp = extractFloat32Exp( a ); 1625 aSign = extractFloat32Sign( a ); 1626 if ( aExp == 0xFF ) { 1627 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1628 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1629 } 1630 if ( aExp == 0 ) { 1631 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1632 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1633 --aExp; 1634 } 1635 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1636 1637 } 1638 1639 #endif 1640 1641 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1642 /* 1643 ------------------------------------------------------------------------------- 1644 Rounds the single-precision floating-point value `a' to an integer, and 1645 returns the result as a single-precision floating-point value. The 1646 operation is performed according to the IEC/IEEE Standard for Binary 1647 Floating-Point Arithmetic. 1648 ------------------------------------------------------------------------------- 1649 */ 1650 float32 float32_round_to_int( float32 a ) 1651 { 1652 flag aSign; 1653 int16 aExp; 1654 bits32 lastBitMask, roundBitsMask; 1655 int8 roundingMode; 1656 float32 z; 1657 1658 aExp = extractFloat32Exp( a ); 1659 if ( 0x96 <= aExp ) { 1660 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1661 return propagateFloat32NaN( a, a ); 1662 } 1663 return a; 1664 } 1665 if ( aExp <= 0x7E ) { 1666 if ( (bits32) ( a<<1 ) == 0 ) return a; 1667 float_set_inexact(); 1668 aSign = extractFloat32Sign( a ); 1669 switch ( float_rounding_mode() ) { 1670 case float_round_nearest_even: 1671 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1672 return packFloat32( aSign, 0x7F, 0 ); 1673 } 1674 break; 1675 case float_round_down: 1676 return aSign ? 0xBF800000 : 0; 1677 case float_round_up: 1678 return aSign ? 0x80000000 : 0x3F800000; 1679 } 1680 return packFloat32( aSign, 0, 0 ); 1681 } 1682 lastBitMask = 1; 1683 lastBitMask <<= 0x96 - aExp; 1684 roundBitsMask = lastBitMask - 1; 1685 z = a; 1686 roundingMode = float_rounding_mode(); 1687 if ( roundingMode == float_round_nearest_even ) { 1688 z += lastBitMask>>1; 1689 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1690 } 1691 else if ( roundingMode != float_round_to_zero ) { 1692 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1693 z += roundBitsMask; 1694 } 1695 } 1696 z &= ~ roundBitsMask; 1697 if ( z != a ) float_set_inexact(); 1698 return z; 1699 1700 } 1701 #endif /* !SOFTFLOAT_FOR_GCC */ 1702 1703 /* 1704 ------------------------------------------------------------------------------- 1705 Returns the result of adding the absolute values of the single-precision 1706 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1707 before being returned. `zSign' is ignored if the result is a NaN. 1708 The addition is performed according to the IEC/IEEE Standard for Binary 1709 Floating-Point Arithmetic. 1710 ------------------------------------------------------------------------------- 1711 */ 1712 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1713 { 1714 int16 aExp, bExp, zExp; 1715 bits32 aSig, bSig, zSig; 1716 int16 expDiff; 1717 1718 aSig = extractFloat32Frac( a ); 1719 aExp = extractFloat32Exp( a ); 1720 bSig = extractFloat32Frac( b ); 1721 bExp = extractFloat32Exp( b ); 1722 expDiff = aExp - bExp; 1723 aSig <<= 6; 1724 bSig <<= 6; 1725 if ( 0 < expDiff ) { 1726 if ( aExp == 0xFF ) { 1727 if ( aSig ) return propagateFloat32NaN( a, b ); 1728 return a; 1729 } 1730 if ( bExp == 0 ) { 1731 --expDiff; 1732 } 1733 else { 1734 bSig |= 0x20000000; 1735 } 1736 shift32RightJamming( bSig, expDiff, &bSig ); 1737 zExp = aExp; 1738 } 1739 else if ( expDiff < 0 ) { 1740 if ( bExp == 0xFF ) { 1741 if ( bSig ) return propagateFloat32NaN( a, b ); 1742 return packFloat32( zSign, 0xFF, 0 ); 1743 } 1744 if ( aExp == 0 ) { 1745 ++expDiff; 1746 } 1747 else { 1748 aSig |= 0x20000000; 1749 } 1750 shift32RightJamming( aSig, - expDiff, &aSig ); 1751 zExp = bExp; 1752 } 1753 else { 1754 if ( aExp == 0xFF ) { 1755 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1756 return a; 1757 } 1758 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1759 zSig = 0x40000000 + aSig + bSig; 1760 zExp = aExp; 1761 goto roundAndPack; 1762 } 1763 aSig |= 0x20000000; 1764 zSig = ( aSig + bSig )<<1; 1765 --zExp; 1766 if ( (sbits32) zSig < 0 ) { 1767 zSig = aSig + bSig; 1768 ++zExp; 1769 } 1770 roundAndPack: 1771 return roundAndPackFloat32( zSign, zExp, zSig ); 1772 1773 } 1774 1775 /* 1776 ------------------------------------------------------------------------------- 1777 Returns the result of subtracting the absolute values of the single- 1778 precision floating-point values `a' and `b'. If `zSign' is 1, the 1779 difference is negated before being returned. `zSign' is ignored if the 1780 result is a NaN. The subtraction is performed according to the IEC/IEEE 1781 Standard for Binary Floating-Point Arithmetic. 1782 ------------------------------------------------------------------------------- 1783 */ 1784 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1785 { 1786 int16 aExp, bExp, zExp; 1787 bits32 aSig, bSig, zSig; 1788 int16 expDiff; 1789 1790 aSig = extractFloat32Frac( a ); 1791 aExp = extractFloat32Exp( a ); 1792 bSig = extractFloat32Frac( b ); 1793 bExp = extractFloat32Exp( b ); 1794 expDiff = aExp - bExp; 1795 aSig <<= 7; 1796 bSig <<= 7; 1797 if ( 0 < expDiff ) goto aExpBigger; 1798 if ( expDiff < 0 ) goto bExpBigger; 1799 if ( aExp == 0xFF ) { 1800 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1801 float_raise( float_flag_invalid ); 1802 return float32_default_nan; 1803 } 1804 if ( aExp == 0 ) { 1805 aExp = 1; 1806 bExp = 1; 1807 } 1808 if ( bSig < aSig ) goto aBigger; 1809 if ( aSig < bSig ) goto bBigger; 1810 return packFloat32( float_rounding_mode() == float_round_down, 0, 0 ); 1811 bExpBigger: 1812 if ( bExp == 0xFF ) { 1813 if ( bSig ) return propagateFloat32NaN( a, b ); 1814 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1815 } 1816 if ( aExp == 0 ) { 1817 ++expDiff; 1818 } 1819 else { 1820 aSig |= 0x40000000; 1821 } 1822 shift32RightJamming( aSig, - expDiff, &aSig ); 1823 bSig |= 0x40000000; 1824 bBigger: 1825 zSig = bSig - aSig; 1826 zExp = bExp; 1827 zSign ^= 1; 1828 goto normalizeRoundAndPack; 1829 aExpBigger: 1830 if ( aExp == 0xFF ) { 1831 if ( aSig ) return propagateFloat32NaN( a, b ); 1832 return a; 1833 } 1834 if ( bExp == 0 ) { 1835 --expDiff; 1836 } 1837 else { 1838 bSig |= 0x40000000; 1839 } 1840 shift32RightJamming( bSig, expDiff, &bSig ); 1841 aSig |= 0x40000000; 1842 aBigger: 1843 zSig = aSig - bSig; 1844 zExp = aExp; 1845 normalizeRoundAndPack: 1846 --zExp; 1847 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1848 1849 } 1850 1851 /* 1852 ------------------------------------------------------------------------------- 1853 Returns the result of adding the single-precision floating-point values `a' 1854 and `b'. The operation is performed according to the IEC/IEEE Standard for 1855 Binary Floating-Point Arithmetic. 1856 ------------------------------------------------------------------------------- 1857 */ 1858 float32 float32_add( float32 a, float32 b ) 1859 { 1860 flag aSign, bSign; 1861 1862 aSign = extractFloat32Sign( a ); 1863 bSign = extractFloat32Sign( b ); 1864 if ( aSign == bSign ) { 1865 return addFloat32Sigs( a, b, aSign ); 1866 } 1867 else { 1868 return subFloat32Sigs( a, b, aSign ); 1869 } 1870 1871 } 1872 1873 /* 1874 ------------------------------------------------------------------------------- 1875 Returns the result of subtracting the single-precision floating-point values 1876 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1877 for Binary Floating-Point Arithmetic. 1878 ------------------------------------------------------------------------------- 1879 */ 1880 float32 float32_sub( float32 a, float32 b ) 1881 { 1882 flag aSign, bSign; 1883 1884 aSign = extractFloat32Sign( a ); 1885 bSign = extractFloat32Sign( b ); 1886 if ( aSign == bSign ) { 1887 return subFloat32Sigs( a, b, aSign ); 1888 } 1889 else { 1890 return addFloat32Sigs( a, b, aSign ); 1891 } 1892 1893 } 1894 1895 /* 1896 ------------------------------------------------------------------------------- 1897 Returns the result of multiplying the single-precision floating-point values 1898 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1899 for Binary Floating-Point Arithmetic. 1900 ------------------------------------------------------------------------------- 1901 */ 1902 float32 float32_mul( float32 a, float32 b ) 1903 { 1904 flag aSign, bSign, zSign; 1905 int16 aExp, bExp, zExp; 1906 bits32 aSig, bSig; 1907 bits64 zSig64; 1908 bits32 zSig; 1909 1910 aSig = extractFloat32Frac( a ); 1911 aExp = extractFloat32Exp( a ); 1912 aSign = extractFloat32Sign( a ); 1913 bSig = extractFloat32Frac( b ); 1914 bExp = extractFloat32Exp( b ); 1915 bSign = extractFloat32Sign( b ); 1916 zSign = aSign ^ bSign; 1917 if ( aExp == 0xFF ) { 1918 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1919 return propagateFloat32NaN( a, b ); 1920 } 1921 if ( ( bExp | bSig ) == 0 ) { 1922 float_raise( float_flag_invalid ); 1923 return float32_default_nan; 1924 } 1925 return packFloat32( zSign, 0xFF, 0 ); 1926 } 1927 if ( bExp == 0xFF ) { 1928 if ( bSig ) return propagateFloat32NaN( a, b ); 1929 if ( ( aExp | aSig ) == 0 ) { 1930 float_raise( float_flag_invalid ); 1931 return float32_default_nan; 1932 } 1933 return packFloat32( zSign, 0xFF, 0 ); 1934 } 1935 if ( aExp == 0 ) { 1936 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1937 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1938 } 1939 if ( bExp == 0 ) { 1940 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1941 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1942 } 1943 zExp = aExp + bExp - 0x7F; 1944 aSig = ( aSig | 0x00800000 )<<7; 1945 bSig = ( bSig | 0x00800000 )<<8; 1946 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1947 zSig = zSig64; 1948 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1949 zSig <<= 1; 1950 --zExp; 1951 } 1952 return roundAndPackFloat32( zSign, zExp, zSig ); 1953 1954 } 1955 1956 /* 1957 ------------------------------------------------------------------------------- 1958 Returns the result of dividing the single-precision floating-point value `a' 1959 by the corresponding value `b'. The operation is performed according to the 1960 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1961 ------------------------------------------------------------------------------- 1962 */ 1963 float32 float32_div( float32 a, float32 b ) 1964 { 1965 flag aSign, bSign, zSign; 1966 int16 aExp, bExp, zExp; 1967 bits32 aSig, bSig, zSig; 1968 1969 aSig = extractFloat32Frac( a ); 1970 aExp = extractFloat32Exp( a ); 1971 aSign = extractFloat32Sign( a ); 1972 bSig = extractFloat32Frac( b ); 1973 bExp = extractFloat32Exp( b ); 1974 bSign = extractFloat32Sign( b ); 1975 zSign = aSign ^ bSign; 1976 if ( aExp == 0xFF ) { 1977 if ( aSig ) return propagateFloat32NaN( a, b ); 1978 if ( bExp == 0xFF ) { 1979 if ( bSig ) return propagateFloat32NaN( a, b ); 1980 float_raise( float_flag_invalid ); 1981 return float32_default_nan; 1982 } 1983 return packFloat32( zSign, 0xFF, 0 ); 1984 } 1985 if ( bExp == 0xFF ) { 1986 if ( bSig ) return propagateFloat32NaN( a, b ); 1987 return packFloat32( zSign, 0, 0 ); 1988 } 1989 if ( bExp == 0 ) { 1990 if ( bSig == 0 ) { 1991 if ( ( aExp | aSig ) == 0 ) { 1992 float_raise( float_flag_invalid ); 1993 return float32_default_nan; 1994 } 1995 float_raise( float_flag_divbyzero ); 1996 return packFloat32( zSign, 0xFF, 0 ); 1997 } 1998 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1999 } 2000 if ( aExp == 0 ) { 2001 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2002 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2003 } 2004 zExp = aExp - bExp + 0x7D; 2005 aSig = ( aSig | 0x00800000 )<<7; 2006 bSig = ( bSig | 0x00800000 )<<8; 2007 if ( bSig <= ( aSig + aSig ) ) { 2008 aSig >>= 1; 2009 ++zExp; 2010 } 2011 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 2012 if ( ( zSig & 0x3F ) == 0 ) { 2013 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 2014 } 2015 return roundAndPackFloat32( zSign, zExp, zSig ); 2016 2017 } 2018 2019 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2020 /* 2021 ------------------------------------------------------------------------------- 2022 Returns the remainder of the single-precision floating-point value `a' 2023 with respect to the corresponding value `b'. The operation is performed 2024 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2025 ------------------------------------------------------------------------------- 2026 */ 2027 float32 float32_rem( float32 a, float32 b ) 2028 { 2029 flag aSign, bSign, zSign; 2030 int16 aExp, bExp, expDiff; 2031 bits32 aSig, bSig; 2032 bits32 q; 2033 bits64 aSig64, bSig64, q64; 2034 bits32 alternateASig; 2035 sbits32 sigMean; 2036 2037 aSig = extractFloat32Frac( a ); 2038 aExp = extractFloat32Exp( a ); 2039 aSign = extractFloat32Sign( a ); 2040 bSig = extractFloat32Frac( b ); 2041 bExp = extractFloat32Exp( b ); 2042 bSign = extractFloat32Sign( b ); 2043 if ( aExp == 0xFF ) { 2044 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2045 return propagateFloat32NaN( a, b ); 2046 } 2047 float_raise( float_flag_invalid ); 2048 return float32_default_nan; 2049 } 2050 if ( bExp == 0xFF ) { 2051 if ( bSig ) return propagateFloat32NaN( a, b ); 2052 return a; 2053 } 2054 if ( bExp == 0 ) { 2055 if ( bSig == 0 ) { 2056 float_raise( float_flag_invalid ); 2057 return float32_default_nan; 2058 } 2059 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2060 } 2061 if ( aExp == 0 ) { 2062 if ( aSig == 0 ) return a; 2063 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2064 } 2065 expDiff = aExp - bExp; 2066 aSig |= 0x00800000; 2067 bSig |= 0x00800000; 2068 if ( expDiff < 32 ) { 2069 aSig <<= 8; 2070 bSig <<= 8; 2071 if ( expDiff < 0 ) { 2072 if ( expDiff < -1 ) return a; 2073 aSig >>= 1; 2074 } 2075 q = ( bSig <= aSig ); 2076 if ( q ) aSig -= bSig; 2077 if ( 0 < expDiff ) { 2078 q = ( ( (bits64) aSig )<<32 ) / bSig; 2079 q >>= 32 - expDiff; 2080 bSig >>= 2; 2081 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2082 } 2083 else { 2084 aSig >>= 2; 2085 bSig >>= 2; 2086 } 2087 } 2088 else { 2089 if ( bSig <= aSig ) aSig -= bSig; 2090 aSig64 = ( (bits64) aSig )<<40; 2091 bSig64 = ( (bits64) bSig )<<40; 2092 expDiff -= 64; 2093 while ( 0 < expDiff ) { 2094 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2095 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2096 aSig64 = - ( ( bSig * q64 )<<38 ); 2097 expDiff -= 62; 2098 } 2099 expDiff += 64; 2100 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2101 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2102 q = q64>>( 64 - expDiff ); 2103 bSig <<= 6; 2104 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2105 } 2106 do { 2107 alternateASig = aSig; 2108 ++q; 2109 aSig -= bSig; 2110 } while ( 0 <= (sbits32) aSig ); 2111 sigMean = aSig + alternateASig; 2112 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2113 aSig = alternateASig; 2114 } 2115 zSign = ( (sbits32) aSig < 0 ); 2116 if ( zSign ) aSig = - aSig; 2117 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2118 2119 } 2120 #endif /* !SOFTFLOAT_FOR_GCC */ 2121 2122 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2123 /* 2124 ------------------------------------------------------------------------------- 2125 Returns the square root of the single-precision floating-point value `a'. 2126 The operation is performed according to the IEC/IEEE Standard for Binary 2127 Floating-Point Arithmetic. 2128 ------------------------------------------------------------------------------- 2129 */ 2130 float32 float32_sqrt( float32 a ) 2131 { 2132 flag aSign; 2133 int16 aExp, zExp; 2134 bits32 aSig, zSig; 2135 bits64 rem, term; 2136 2137 aSig = extractFloat32Frac( a ); 2138 aExp = extractFloat32Exp( a ); 2139 aSign = extractFloat32Sign( a ); 2140 if ( aExp == 0xFF ) { 2141 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2142 if ( ! aSign ) return a; 2143 float_raise( float_flag_invalid ); 2144 return float32_default_nan; 2145 } 2146 if ( aSign ) { 2147 if ( ( aExp | aSig ) == 0 ) return a; 2148 float_raise( float_flag_invalid ); 2149 return float32_default_nan; 2150 } 2151 if ( aExp == 0 ) { 2152 if ( aSig == 0 ) return 0; 2153 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2154 } 2155 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2156 aSig = ( aSig | 0x00800000 )<<8; 2157 zSig = estimateSqrt32( aExp, aSig ) + 2; 2158 if ( ( zSig & 0x7F ) <= 5 ) { 2159 if ( zSig < 2 ) { 2160 zSig = 0x7FFFFFFF; 2161 goto roundAndPack; 2162 } 2163 aSig >>= aExp & 1; 2164 term = ( (bits64) zSig ) * zSig; 2165 rem = ( ( (bits64) aSig )<<32 ) - term; 2166 while ( (sbits64) rem < 0 ) { 2167 --zSig; 2168 rem += ( ( (bits64) zSig )<<1 ) | 1; 2169 } 2170 zSig |= ( rem != 0 ); 2171 } 2172 shift32RightJamming( zSig, 1, &zSig ); 2173 roundAndPack: 2174 return roundAndPackFloat32( 0, zExp, zSig ); 2175 2176 } 2177 #endif /* !SOFTFLOAT_FOR_GCC */ 2178 2179 /* 2180 ------------------------------------------------------------------------------- 2181 Returns 1 if the single-precision floating-point value `a' is equal to 2182 the corresponding value `b', and 0 otherwise. The comparison is performed 2183 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2184 ------------------------------------------------------------------------------- 2185 */ 2186 flag float32_eq( float32 a, float32 b ) 2187 { 2188 2189 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2190 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2191 ) { 2192 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2193 float_raise( float_flag_invalid ); 2194 } 2195 return 0; 2196 } 2197 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2198 2199 } 2200 2201 /* 2202 ------------------------------------------------------------------------------- 2203 Returns 1 if the single-precision floating-point value `a' is less than 2204 or equal to the corresponding value `b', and 0 otherwise. The comparison 2205 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2206 Arithmetic. 2207 ------------------------------------------------------------------------------- 2208 */ 2209 flag float32_le( float32 a, float32 b ) 2210 { 2211 flag aSign, bSign; 2212 2213 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2214 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2215 ) { 2216 float_raise( float_flag_invalid ); 2217 return 0; 2218 } 2219 aSign = extractFloat32Sign( a ); 2220 bSign = extractFloat32Sign( b ); 2221 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2222 return ( a == b ) || ( aSign ^ ( a < b ) ); 2223 2224 } 2225 2226 /* 2227 ------------------------------------------------------------------------------- 2228 Returns 1 if the single-precision floating-point value `a' is less than 2229 the corresponding value `b', and 0 otherwise. The comparison is performed 2230 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2231 ------------------------------------------------------------------------------- 2232 */ 2233 flag float32_lt( float32 a, float32 b ) 2234 { 2235 flag aSign, bSign; 2236 2237 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2238 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2239 ) { 2240 float_raise( float_flag_invalid ); 2241 return 0; 2242 } 2243 aSign = extractFloat32Sign( a ); 2244 bSign = extractFloat32Sign( b ); 2245 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2246 return ( a != b ) && ( aSign ^ ( a < b ) ); 2247 2248 } 2249 2250 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2251 /* 2252 ------------------------------------------------------------------------------- 2253 Returns 1 if the single-precision floating-point value `a' is equal to 2254 the corresponding value `b', and 0 otherwise. The invalid exception is 2255 raised if either operand is a NaN. Otherwise, the comparison is performed 2256 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2257 ------------------------------------------------------------------------------- 2258 */ 2259 flag float32_eq_signaling( float32 a, float32 b ) 2260 { 2261 2262 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2263 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2264 ) { 2265 float_raise( float_flag_invalid ); 2266 return 0; 2267 } 2268 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2269 2270 } 2271 2272 /* 2273 ------------------------------------------------------------------------------- 2274 Returns 1 if the single-precision floating-point value `a' is less than or 2275 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2276 cause an exception. Otherwise, the comparison is performed according to the 2277 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2278 ------------------------------------------------------------------------------- 2279 */ 2280 flag float32_le_quiet( float32 a, float32 b ) 2281 { 2282 flag aSign, bSign; 2283 2284 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2285 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2286 ) { 2287 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2288 float_raise( float_flag_invalid ); 2289 } 2290 return 0; 2291 } 2292 aSign = extractFloat32Sign( a ); 2293 bSign = extractFloat32Sign( b ); 2294 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2295 return ( a == b ) || ( aSign ^ ( a < b ) ); 2296 2297 } 2298 2299 /* 2300 ------------------------------------------------------------------------------- 2301 Returns 1 if the single-precision floating-point value `a' is less than 2302 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2303 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2304 Standard for Binary Floating-Point Arithmetic. 2305 ------------------------------------------------------------------------------- 2306 */ 2307 flag float32_lt_quiet( float32 a, float32 b ) 2308 { 2309 flag aSign, bSign; 2310 2311 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2312 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2313 ) { 2314 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2315 float_raise( float_flag_invalid ); 2316 } 2317 return 0; 2318 } 2319 aSign = extractFloat32Sign( a ); 2320 bSign = extractFloat32Sign( b ); 2321 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2322 return ( a != b ) && ( aSign ^ ( a < b ) ); 2323 2324 } 2325 #endif /* !SOFTFLOAT_FOR_GCC */ 2326 2327 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2328 /* 2329 ------------------------------------------------------------------------------- 2330 Returns the result of converting the double-precision floating-point value 2331 `a' to the 32-bit two's complement integer format. The conversion is 2332 performed according to the IEC/IEEE Standard for Binary Floating-Point 2333 Arithmetic---which means in particular that the conversion is rounded 2334 according to the current rounding mode. If `a' is a NaN, the largest 2335 positive integer is returned. Otherwise, if the conversion overflows, the 2336 largest integer with the same sign as `a' is returned. 2337 ------------------------------------------------------------------------------- 2338 */ 2339 int32 float64_to_int32( float64 a ) 2340 { 2341 flag aSign; 2342 int16 aExp, shiftCount; 2343 bits64 aSig; 2344 2345 aSig = extractFloat64Frac( a ); 2346 aExp = extractFloat64Exp( a ); 2347 aSign = extractFloat64Sign( a ); 2348 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2349 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2350 shiftCount = 0x42C - aExp; 2351 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2352 return roundAndPackInt32( aSign, aSig ); 2353 2354 } 2355 #endif /* !SOFTFLOAT_FOR_GCC */ 2356 2357 /* 2358 ------------------------------------------------------------------------------- 2359 Returns the result of converting the double-precision floating-point value 2360 `a' to the 32-bit two's complement integer format. The conversion is 2361 performed according to the IEC/IEEE Standard for Binary Floating-Point 2362 Arithmetic, except that the conversion is always rounded toward zero. 2363 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2364 the conversion overflows, the largest integer with the same sign as `a' is 2365 returned. 2366 ------------------------------------------------------------------------------- 2367 */ 2368 int32 float64_to_int32_round_to_zero( float64 a ) 2369 { 2370 flag aSign; 2371 int16 aExp, shiftCount; 2372 bits64 aSig, savedASig; 2373 int32 z; 2374 2375 aSig = extractFloat64Frac( a ); 2376 aExp = extractFloat64Exp( a ); 2377 aSign = extractFloat64Sign( a ); 2378 if ( 0x41E < aExp ) { 2379 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2380 goto invalid; 2381 } 2382 else if ( aExp < 0x3FF ) { 2383 if ( aExp || aSig ) float_set_inexact(); 2384 return 0; 2385 } 2386 aSig |= LIT64( 0x0010000000000000 ); 2387 shiftCount = 0x433 - aExp; 2388 savedASig = aSig; 2389 aSig >>= shiftCount; 2390 z = aSig; 2391 if ( aSign ) z = - z; 2392 if ( ( z < 0 ) ^ aSign ) { 2393 invalid: 2394 float_raise( float_flag_invalid ); 2395 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2396 } 2397 if ( ( aSig<<shiftCount ) != savedASig ) { 2398 float_set_inexact(); 2399 } 2400 return z; 2401 2402 } 2403 2404 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2405 /* 2406 ------------------------------------------------------------------------------- 2407 Returns the result of converting the double-precision floating-point value 2408 `a' to the 64-bit two's complement integer format. The conversion is 2409 performed according to the IEC/IEEE Standard for Binary Floating-Point 2410 Arithmetic---which means in particular that the conversion is rounded 2411 according to the current rounding mode. If `a' is a NaN, the largest 2412 positive integer is returned. Otherwise, if the conversion overflows, the 2413 largest integer with the same sign as `a' is returned. 2414 ------------------------------------------------------------------------------- 2415 */ 2416 int64 float64_to_int64( float64 a ) 2417 { 2418 flag aSign; 2419 int16 aExp, shiftCount; 2420 bits64 aSig, aSigExtra; 2421 2422 aSig = extractFloat64Frac( a ); 2423 aExp = extractFloat64Exp( a ); 2424 aSign = extractFloat64Sign( a ); 2425 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2426 shiftCount = 0x433 - aExp; 2427 if ( shiftCount <= 0 ) { 2428 if ( 0x43E < aExp ) { 2429 float_raise( float_flag_invalid ); 2430 if ( ! aSign 2431 || ( ( aExp == 0x7FF ) 2432 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2433 ) { 2434 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2435 } 2436 return (sbits64) LIT64( 0x8000000000000000 ); 2437 } 2438 aSigExtra = 0; 2439 aSig <<= - shiftCount; 2440 } 2441 else { 2442 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2443 } 2444 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2445 2446 } 2447 2448 #ifdef __alpha__ 2449 /* 2450 ------------------------------------------------------------------------------- 2451 Returns the result of converting the double-precision floating-point value 2452 `a' to the 64-bit two's complement integer format. The conversion is 2453 performed according to the IEC/IEEE Standard for Binary Floating-Point 2454 Arithmetic---which means in particular that the conversion is rounded 2455 according to the current rounding mode. If `a' is a NaN, the invalid 2456 exception is raised and zero is returned. 2457 ------------------------------------------------------------------------------- 2458 */ 2459 int64 float64_to_int64_no_overflow( float64 a ) 2460 { 2461 flag aSign; 2462 int16 aExp, shiftCount; 2463 bits64 aSig, aSigExtra; 2464 2465 aSig = extractFloat64Frac( a ); 2466 aExp = extractFloat64Exp( a ); 2467 aSign = extractFloat64Sign( a ); 2468 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2469 shiftCount = 0x433 - aExp; 2470 if ( shiftCount <= 0 ) { 2471 if ( 0x43E < aExp ) { 2472 float_raise( float_flag_invalid ); 2473 return 0; 2474 } 2475 aSigExtra = 0; 2476 aSig <<= - shiftCount; 2477 } 2478 else { 2479 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2480 } 2481 return roundAndPackInt64NoOverflow( aSign, aSig, aSigExtra ); 2482 2483 } 2484 #endif /* __alpha__ */ 2485 2486 /* 2487 ------------------------------------------------------------------------------- 2488 Returns the result of converting the double-precision floating-point value 2489 `a' to the 64-bit two's complement integer format. The conversion is 2490 performed according to the IEC/IEEE Standard for Binary Floating-Point 2491 Arithmetic, except that the conversion is always rounded toward zero. 2492 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2493 the conversion overflows, the largest integer with the same sign as `a' is 2494 returned. 2495 ------------------------------------------------------------------------------- 2496 */ 2497 int64 float64_to_int64_round_to_zero( float64 a ) 2498 { 2499 flag aSign; 2500 int16 aExp, shiftCount; 2501 bits64 aSig; 2502 int64 z; 2503 2504 aSig = extractFloat64Frac( a ); 2505 aExp = extractFloat64Exp( a ); 2506 aSign = extractFloat64Sign( a ); 2507 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2508 shiftCount = aExp - 0x433; 2509 if ( 0 <= shiftCount ) { 2510 if ( 0x43E <= aExp ) { 2511 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2512 float_raise( float_flag_invalid ); 2513 if ( ! aSign 2514 || ( ( aExp == 0x7FF ) 2515 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2516 ) { 2517 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2518 } 2519 } 2520 return (sbits64) LIT64( 0x8000000000000000 ); 2521 } 2522 z = aSig<<shiftCount; 2523 } 2524 else { 2525 if ( aExp < 0x3FE ) { 2526 if ( aExp | aSig ) float_set_inexact(); 2527 return 0; 2528 } 2529 z = aSig>>( - shiftCount ); 2530 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2531 float_set_inexact(); 2532 } 2533 } 2534 if ( aSign ) z = - z; 2535 return z; 2536 2537 } 2538 #endif /* !SOFTFLOAT_FOR_GCC */ 2539 2540 /* 2541 ------------------------------------------------------------------------------- 2542 Returns the result of converting the double-precision floating-point value 2543 `a' to the single-precision floating-point format. The conversion is 2544 performed according to the IEC/IEEE Standard for Binary Floating-Point 2545 Arithmetic. 2546 ------------------------------------------------------------------------------- 2547 */ 2548 float32 float64_to_float32( float64 a ) 2549 { 2550 flag aSign; 2551 int16 aExp; 2552 bits64 aSig; 2553 bits32 zSig; 2554 2555 aSig = extractFloat64Frac( a ); 2556 aExp = extractFloat64Exp( a ); 2557 aSign = extractFloat64Sign( a ); 2558 if ( aExp == 0x7FF ) { 2559 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2560 return packFloat32( aSign, 0xFF, 0 ); 2561 } 2562 shift64RightJamming( aSig, 22, &aSig ); 2563 zSig = aSig; 2564 if ( aExp || zSig ) { 2565 zSig |= 0x40000000; 2566 aExp -= 0x381; 2567 } 2568 return roundAndPackFloat32( aSign, aExp, zSig ); 2569 2570 } 2571 2572 #ifdef FLOATX80 2573 2574 /* 2575 ------------------------------------------------------------------------------- 2576 Returns the result of converting the double-precision floating-point value 2577 `a' to the extended double-precision floating-point format. The conversion 2578 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2579 Arithmetic. 2580 ------------------------------------------------------------------------------- 2581 */ 2582 floatx80 float64_to_floatx80( float64 a ) 2583 { 2584 flag aSign; 2585 int16 aExp; 2586 bits64 aSig; 2587 2588 aSig = extractFloat64Frac( a ); 2589 aExp = extractFloat64Exp( a ); 2590 aSign = extractFloat64Sign( a ); 2591 if ( aExp == 0x7FF ) { 2592 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2593 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2594 } 2595 if ( aExp == 0 ) { 2596 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2597 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2598 } 2599 return 2600 packFloatx80( 2601 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2602 2603 } 2604 2605 #endif 2606 2607 #ifdef FLOAT128 2608 2609 /* 2610 ------------------------------------------------------------------------------- 2611 Returns the result of converting the double-precision floating-point value 2612 `a' to the quadruple-precision floating-point format. The conversion is 2613 performed according to the IEC/IEEE Standard for Binary Floating-Point 2614 Arithmetic. 2615 ------------------------------------------------------------------------------- 2616 */ 2617 float128 float64_to_float128( float64 a ) 2618 { 2619 flag aSign; 2620 int16 aExp; 2621 bits64 aSig, zSig0, zSig1; 2622 2623 aSig = extractFloat64Frac( a ); 2624 aExp = extractFloat64Exp( a ); 2625 aSign = extractFloat64Sign( a ); 2626 if ( aExp == 0x7FF ) { 2627 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2628 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2629 } 2630 if ( aExp == 0 ) { 2631 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2632 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2633 --aExp; 2634 } 2635 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2636 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2637 2638 } 2639 2640 #endif 2641 2642 #ifndef SOFTFLOAT_FOR_GCC 2643 /* 2644 ------------------------------------------------------------------------------- 2645 Rounds the double-precision floating-point value `a' to an integer, and 2646 returns the result as a double-precision floating-point value. The 2647 operation is performed according to the IEC/IEEE Standard for Binary 2648 Floating-Point Arithmetic. 2649 ------------------------------------------------------------------------------- 2650 */ 2651 float64 float64_round_to_int( float64 a ) 2652 { 2653 flag aSign; 2654 int16 aExp; 2655 bits64 lastBitMask, roundBitsMask; 2656 int8 roundingMode; 2657 float64 z; 2658 2659 aExp = extractFloat64Exp( a ); 2660 if ( 0x433 <= aExp ) { 2661 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2662 return propagateFloat64NaN( a, a ); 2663 } 2664 return a; 2665 } 2666 if ( aExp < 0x3FF ) { 2667 if ( (bits64) ( a<<1 ) == 0 ) return a; 2668 float_set_inexact(); 2669 aSign = extractFloat64Sign( a ); 2670 switch ( float_rounding_mode() ) { 2671 case float_round_nearest_even: 2672 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2673 return packFloat64( aSign, 0x3FF, 0 ); 2674 } 2675 break; 2676 case float_round_down: 2677 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2678 case float_round_up: 2679 return 2680 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2681 } 2682 return packFloat64( aSign, 0, 0 ); 2683 } 2684 lastBitMask = 1; 2685 lastBitMask <<= 0x433 - aExp; 2686 roundBitsMask = lastBitMask - 1; 2687 z = a; 2688 roundingMode = float_rounding_mode(); 2689 if ( roundingMode == float_round_nearest_even ) { 2690 z += lastBitMask>>1; 2691 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2692 } 2693 else if ( roundingMode != float_round_to_zero ) { 2694 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2695 z += roundBitsMask; 2696 } 2697 } 2698 z &= ~ roundBitsMask; 2699 if ( z != a ) float_set_inexact(); 2700 return z; 2701 2702 } 2703 #endif 2704 2705 /* 2706 ------------------------------------------------------------------------------- 2707 Returns the result of adding the absolute values of the double-precision 2708 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2709 before being returned. `zSign' is ignored if the result is a NaN. 2710 The addition is performed according to the IEC/IEEE Standard for Binary 2711 Floating-Point Arithmetic. 2712 ------------------------------------------------------------------------------- 2713 */ 2714 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2715 { 2716 int16 aExp, bExp, zExp; 2717 bits64 aSig, bSig, zSig; 2718 int16 expDiff; 2719 2720 aSig = extractFloat64Frac( a ); 2721 aExp = extractFloat64Exp( a ); 2722 bSig = extractFloat64Frac( b ); 2723 bExp = extractFloat64Exp( b ); 2724 expDiff = aExp - bExp; 2725 aSig <<= 9; 2726 bSig <<= 9; 2727 if ( 0 < expDiff ) { 2728 if ( aExp == 0x7FF ) { 2729 if ( aSig ) return propagateFloat64NaN( a, b ); 2730 return a; 2731 } 2732 if ( bExp == 0 ) { 2733 --expDiff; 2734 } 2735 else { 2736 bSig |= LIT64( 0x2000000000000000 ); 2737 } 2738 shift64RightJamming( bSig, expDiff, &bSig ); 2739 zExp = aExp; 2740 } 2741 else if ( expDiff < 0 ) { 2742 if ( bExp == 0x7FF ) { 2743 if ( bSig ) return propagateFloat64NaN( a, b ); 2744 return packFloat64( zSign, 0x7FF, 0 ); 2745 } 2746 if ( aExp == 0 ) { 2747 ++expDiff; 2748 } 2749 else { 2750 aSig |= LIT64( 0x2000000000000000 ); 2751 } 2752 shift64RightJamming( aSig, - expDiff, &aSig ); 2753 zExp = bExp; 2754 } 2755 else { 2756 if ( aExp == 0x7FF ) { 2757 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2758 return a; 2759 } 2760 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2761 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2762 zExp = aExp; 2763 goto roundAndPack; 2764 } 2765 aSig |= LIT64( 0x2000000000000000 ); 2766 zSig = ( aSig + bSig )<<1; 2767 --zExp; 2768 if ( (sbits64) zSig < 0 ) { 2769 zSig = aSig + bSig; 2770 ++zExp; 2771 } 2772 roundAndPack: 2773 return roundAndPackFloat64( zSign, zExp, zSig ); 2774 2775 } 2776 2777 /* 2778 ------------------------------------------------------------------------------- 2779 Returns the result of subtracting the absolute values of the double- 2780 precision floating-point values `a' and `b'. If `zSign' is 1, the 2781 difference is negated before being returned. `zSign' is ignored if the 2782 result is a NaN. The subtraction is performed according to the IEC/IEEE 2783 Standard for Binary Floating-Point Arithmetic. 2784 ------------------------------------------------------------------------------- 2785 */ 2786 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2787 { 2788 int16 aExp, bExp, zExp; 2789 bits64 aSig, bSig, zSig; 2790 int16 expDiff; 2791 2792 aSig = extractFloat64Frac( a ); 2793 aExp = extractFloat64Exp( a ); 2794 bSig = extractFloat64Frac( b ); 2795 bExp = extractFloat64Exp( b ); 2796 expDiff = aExp - bExp; 2797 aSig <<= 10; 2798 bSig <<= 10; 2799 if ( 0 < expDiff ) goto aExpBigger; 2800 if ( expDiff < 0 ) goto bExpBigger; 2801 if ( aExp == 0x7FF ) { 2802 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2803 float_raise( float_flag_invalid ); 2804 return float64_default_nan; 2805 } 2806 if ( aExp == 0 ) { 2807 aExp = 1; 2808 bExp = 1; 2809 } 2810 if ( bSig < aSig ) goto aBigger; 2811 if ( aSig < bSig ) goto bBigger; 2812 return packFloat64( float_rounding_mode() == float_round_down, 0, 0 ); 2813 bExpBigger: 2814 if ( bExp == 0x7FF ) { 2815 if ( bSig ) return propagateFloat64NaN( a, b ); 2816 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2817 } 2818 if ( aExp == 0 ) { 2819 ++expDiff; 2820 } 2821 else { 2822 aSig |= LIT64( 0x4000000000000000 ); 2823 } 2824 shift64RightJamming( aSig, - expDiff, &aSig ); 2825 bSig |= LIT64( 0x4000000000000000 ); 2826 bBigger: 2827 zSig = bSig - aSig; 2828 zExp = bExp; 2829 zSign ^= 1; 2830 goto normalizeRoundAndPack; 2831 aExpBigger: 2832 if ( aExp == 0x7FF ) { 2833 if ( aSig ) return propagateFloat64NaN( a, b ); 2834 return a; 2835 } 2836 if ( bExp == 0 ) { 2837 --expDiff; 2838 } 2839 else { 2840 bSig |= LIT64( 0x4000000000000000 ); 2841 } 2842 shift64RightJamming( bSig, expDiff, &bSig ); 2843 aSig |= LIT64( 0x4000000000000000 ); 2844 aBigger: 2845 zSig = aSig - bSig; 2846 zExp = aExp; 2847 normalizeRoundAndPack: 2848 --zExp; 2849 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2850 2851 } 2852 2853 /* 2854 ------------------------------------------------------------------------------- 2855 Returns the result of adding the double-precision floating-point values `a' 2856 and `b'. The operation is performed according to the IEC/IEEE Standard for 2857 Binary Floating-Point Arithmetic. 2858 ------------------------------------------------------------------------------- 2859 */ 2860 float64 float64_add( float64 a, float64 b ) 2861 { 2862 flag aSign, bSign; 2863 2864 aSign = extractFloat64Sign( a ); 2865 bSign = extractFloat64Sign( b ); 2866 if ( aSign == bSign ) { 2867 return addFloat64Sigs( a, b, aSign ); 2868 } 2869 else { 2870 return subFloat64Sigs( a, b, aSign ); 2871 } 2872 2873 } 2874 2875 /* 2876 ------------------------------------------------------------------------------- 2877 Returns the result of subtracting the double-precision floating-point values 2878 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2879 for Binary Floating-Point Arithmetic. 2880 ------------------------------------------------------------------------------- 2881 */ 2882 float64 float64_sub( float64 a, float64 b ) 2883 { 2884 flag aSign, bSign; 2885 2886 aSign = extractFloat64Sign( a ); 2887 bSign = extractFloat64Sign( b ); 2888 if ( aSign == bSign ) { 2889 return subFloat64Sigs( a, b, aSign ); 2890 } 2891 else { 2892 return addFloat64Sigs( a, b, aSign ); 2893 } 2894 2895 } 2896 2897 /* 2898 ------------------------------------------------------------------------------- 2899 Returns the result of multiplying the double-precision floating-point values 2900 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2901 for Binary Floating-Point Arithmetic. 2902 ------------------------------------------------------------------------------- 2903 */ 2904 float64 float64_mul( float64 a, float64 b ) 2905 { 2906 flag aSign, bSign, zSign; 2907 int16 aExp, bExp, zExp; 2908 bits64 aSig, bSig, zSig0, zSig1; 2909 2910 aSig = extractFloat64Frac( a ); 2911 aExp = extractFloat64Exp( a ); 2912 aSign = extractFloat64Sign( a ); 2913 bSig = extractFloat64Frac( b ); 2914 bExp = extractFloat64Exp( b ); 2915 bSign = extractFloat64Sign( b ); 2916 zSign = aSign ^ bSign; 2917 if ( aExp == 0x7FF ) { 2918 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2919 return propagateFloat64NaN( a, b ); 2920 } 2921 if ( ( bExp | bSig ) == 0 ) { 2922 float_raise( float_flag_invalid ); 2923 return float64_default_nan; 2924 } 2925 return packFloat64( zSign, 0x7FF, 0 ); 2926 } 2927 if ( bExp == 0x7FF ) { 2928 if ( bSig ) return propagateFloat64NaN( a, b ); 2929 if ( ( aExp | aSig ) == 0 ) { 2930 float_raise( float_flag_invalid ); 2931 return float64_default_nan; 2932 } 2933 return packFloat64( zSign, 0x7FF, 0 ); 2934 } 2935 if ( aExp == 0 ) { 2936 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2937 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2938 } 2939 if ( bExp == 0 ) { 2940 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2941 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2942 } 2943 zExp = aExp + bExp - 0x3FF; 2944 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2945 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2946 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2947 zSig0 |= ( zSig1 != 0 ); 2948 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2949 zSig0 <<= 1; 2950 --zExp; 2951 } 2952 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2953 2954 } 2955 2956 /* 2957 ------------------------------------------------------------------------------- 2958 Returns the result of dividing the double-precision floating-point value `a' 2959 by the corresponding value `b'. The operation is performed according to 2960 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2961 ------------------------------------------------------------------------------- 2962 */ 2963 float64 float64_div( float64 a, float64 b ) 2964 { 2965 flag aSign, bSign, zSign; 2966 int16 aExp, bExp, zExp; 2967 bits64 aSig, bSig, zSig; 2968 bits64 rem0, rem1; 2969 bits64 term0, term1; 2970 2971 aSig = extractFloat64Frac( a ); 2972 aExp = extractFloat64Exp( a ); 2973 aSign = extractFloat64Sign( a ); 2974 bSig = extractFloat64Frac( b ); 2975 bExp = extractFloat64Exp( b ); 2976 bSign = extractFloat64Sign( b ); 2977 zSign = aSign ^ bSign; 2978 if ( aExp == 0x7FF ) { 2979 if ( aSig ) return propagateFloat64NaN( a, b ); 2980 if ( bExp == 0x7FF ) { 2981 if ( bSig ) return propagateFloat64NaN( a, b ); 2982 float_raise( float_flag_invalid ); 2983 return float64_default_nan; 2984 } 2985 return packFloat64( zSign, 0x7FF, 0 ); 2986 } 2987 if ( bExp == 0x7FF ) { 2988 if ( bSig ) return propagateFloat64NaN( a, b ); 2989 return packFloat64( zSign, 0, 0 ); 2990 } 2991 if ( bExp == 0 ) { 2992 if ( bSig == 0 ) { 2993 if ( ( aExp | aSig ) == 0 ) { 2994 float_raise( float_flag_invalid ); 2995 return float64_default_nan; 2996 } 2997 float_raise( float_flag_divbyzero ); 2998 return packFloat64( zSign, 0x7FF, 0 ); 2999 } 3000 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3001 } 3002 if ( aExp == 0 ) { 3003 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 3004 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3005 } 3006 zExp = aExp - bExp + 0x3FD; 3007 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 3008 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3009 if ( bSig <= ( aSig + aSig ) ) { 3010 aSig >>= 1; 3011 ++zExp; 3012 } 3013 zSig = estimateDiv128To64( aSig, 0, bSig ); 3014 if ( ( zSig & 0x1FF ) <= 2 ) { 3015 mul64To128( bSig, zSig, &term0, &term1 ); 3016 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3017 while ( (sbits64) rem0 < 0 ) { 3018 --zSig; 3019 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3020 } 3021 zSig |= ( rem1 != 0 ); 3022 } 3023 return roundAndPackFloat64( zSign, zExp, zSig ); 3024 3025 } 3026 3027 #ifndef SOFTFLOAT_FOR_GCC 3028 /* 3029 ------------------------------------------------------------------------------- 3030 Returns the remainder of the double-precision floating-point value `a' 3031 with respect to the corresponding value `b'. The operation is performed 3032 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3033 ------------------------------------------------------------------------------- 3034 */ 3035 float64 float64_rem( float64 a, float64 b ) 3036 { 3037 flag aSign, bSign, zSign; 3038 int16 aExp, bExp, expDiff; 3039 bits64 aSig, bSig; 3040 bits64 q, alternateASig; 3041 sbits64 sigMean; 3042 3043 aSig = extractFloat64Frac( a ); 3044 aExp = extractFloat64Exp( a ); 3045 aSign = extractFloat64Sign( a ); 3046 bSig = extractFloat64Frac( b ); 3047 bExp = extractFloat64Exp( b ); 3048 bSign = extractFloat64Sign( b ); 3049 if ( aExp == 0x7FF ) { 3050 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3051 return propagateFloat64NaN( a, b ); 3052 } 3053 float_raise( float_flag_invalid ); 3054 return float64_default_nan; 3055 } 3056 if ( bExp == 0x7FF ) { 3057 if ( bSig ) return propagateFloat64NaN( a, b ); 3058 return a; 3059 } 3060 if ( bExp == 0 ) { 3061 if ( bSig == 0 ) { 3062 float_raise( float_flag_invalid ); 3063 return float64_default_nan; 3064 } 3065 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3066 } 3067 if ( aExp == 0 ) { 3068 if ( aSig == 0 ) return a; 3069 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3070 } 3071 expDiff = aExp - bExp; 3072 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3073 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3074 if ( expDiff < 0 ) { 3075 if ( expDiff < -1 ) return a; 3076 aSig >>= 1; 3077 } 3078 q = ( bSig <= aSig ); 3079 if ( q ) aSig -= bSig; 3080 expDiff -= 64; 3081 while ( 0 < expDiff ) { 3082 q = estimateDiv128To64( aSig, 0, bSig ); 3083 q = ( 2 < q ) ? q - 2 : 0; 3084 aSig = - ( ( bSig>>2 ) * q ); 3085 expDiff -= 62; 3086 } 3087 expDiff += 64; 3088 if ( 0 < expDiff ) { 3089 q = estimateDiv128To64( aSig, 0, bSig ); 3090 q = ( 2 < q ) ? q - 2 : 0; 3091 q >>= 64 - expDiff; 3092 bSig >>= 2; 3093 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3094 } 3095 else { 3096 aSig >>= 2; 3097 bSig >>= 2; 3098 } 3099 do { 3100 alternateASig = aSig; 3101 ++q; 3102 aSig -= bSig; 3103 } while ( 0 <= (sbits64) aSig ); 3104 sigMean = aSig + alternateASig; 3105 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3106 aSig = alternateASig; 3107 } 3108 zSign = ( (sbits64) aSig < 0 ); 3109 if ( zSign ) aSig = - aSig; 3110 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3111 3112 } 3113 3114 /* 3115 ------------------------------------------------------------------------------- 3116 Returns the square root of the double-precision floating-point value `a'. 3117 The operation is performed according to the IEC/IEEE Standard for Binary 3118 Floating-Point Arithmetic. 3119 ------------------------------------------------------------------------------- 3120 */ 3121 float64 float64_sqrt( float64 a ) 3122 { 3123 flag aSign; 3124 int16 aExp, zExp; 3125 bits64 aSig, zSig, doubleZSig; 3126 bits64 rem0, rem1, term0, term1; 3127 3128 aSig = extractFloat64Frac( a ); 3129 aExp = extractFloat64Exp( a ); 3130 aSign = extractFloat64Sign( a ); 3131 if ( aExp == 0x7FF ) { 3132 if ( aSig ) return propagateFloat64NaN( a, a ); 3133 if ( ! aSign ) return a; 3134 float_raise( float_flag_invalid ); 3135 return float64_default_nan; 3136 } 3137 if ( aSign ) { 3138 if ( ( aExp | aSig ) == 0 ) return a; 3139 float_raise( float_flag_invalid ); 3140 return float64_default_nan; 3141 } 3142 if ( aExp == 0 ) { 3143 if ( aSig == 0 ) return 0; 3144 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3145 } 3146 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3147 aSig |= LIT64( 0x0010000000000000 ); 3148 zSig = estimateSqrt32( aExp, aSig>>21 ); 3149 aSig <<= 9 - ( aExp & 1 ); 3150 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3151 if ( ( zSig & 0x1FF ) <= 5 ) { 3152 doubleZSig = zSig<<1; 3153 mul64To128( zSig, zSig, &term0, &term1 ); 3154 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3155 while ( (sbits64) rem0 < 0 ) { 3156 --zSig; 3157 doubleZSig -= 2; 3158 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3159 } 3160 zSig |= ( ( rem0 | rem1 ) != 0 ); 3161 } 3162 return roundAndPackFloat64( 0, zExp, zSig ); 3163 3164 } 3165 #endif 3166 3167 /* 3168 ------------------------------------------------------------------------------- 3169 Returns 1 if the double-precision floating-point value `a' is equal to the 3170 corresponding value `b', and 0 otherwise. The comparison is performed 3171 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3172 ------------------------------------------------------------------------------- 3173 */ 3174 flag float64_eq( float64 a, float64 b ) 3175 { 3176 3177 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3178 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3179 ) { 3180 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3181 float_raise( float_flag_invalid ); 3182 } 3183 return 0; 3184 } 3185 return ( a == b ) || 3186 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3187 3188 } 3189 3190 /* 3191 ------------------------------------------------------------------------------- 3192 Returns 1 if the double-precision floating-point value `a' is less than or 3193 equal to the corresponding value `b', and 0 otherwise. The comparison is 3194 performed according to the IEC/IEEE Standard for Binary Floating-Point 3195 Arithmetic. 3196 ------------------------------------------------------------------------------- 3197 */ 3198 flag float64_le( float64 a, float64 b ) 3199 { 3200 flag aSign, bSign; 3201 3202 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3203 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3204 ) { 3205 float_raise( float_flag_invalid ); 3206 return 0; 3207 } 3208 aSign = extractFloat64Sign( a ); 3209 bSign = extractFloat64Sign( b ); 3210 if ( aSign != bSign ) 3211 return aSign || 3212 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3213 0 ); 3214 return ( a == b ) || 3215 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3216 3217 } 3218 3219 /* 3220 ------------------------------------------------------------------------------- 3221 Returns 1 if the double-precision floating-point value `a' is less than 3222 the corresponding value `b', and 0 otherwise. The comparison is performed 3223 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3224 ------------------------------------------------------------------------------- 3225 */ 3226 flag float64_lt( float64 a, float64 b ) 3227 { 3228 flag aSign, bSign; 3229 3230 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3231 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3232 ) { 3233 float_raise( float_flag_invalid ); 3234 return 0; 3235 } 3236 aSign = extractFloat64Sign( a ); 3237 bSign = extractFloat64Sign( b ); 3238 if ( aSign != bSign ) 3239 return aSign && 3240 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3241 0 ); 3242 return ( a != b ) && 3243 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3244 3245 } 3246 3247 #ifndef SOFTFLOAT_FOR_GCC 3248 /* 3249 ------------------------------------------------------------------------------- 3250 Returns 1 if the double-precision floating-point value `a' is equal to the 3251 corresponding value `b', and 0 otherwise. The invalid exception is raised 3252 if either operand is a NaN. Otherwise, the comparison is performed 3253 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3254 ------------------------------------------------------------------------------- 3255 */ 3256 flag float64_eq_signaling( float64 a, float64 b ) 3257 { 3258 3259 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3260 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3261 ) { 3262 float_raise( float_flag_invalid ); 3263 return 0; 3264 } 3265 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3266 3267 } 3268 3269 /* 3270 ------------------------------------------------------------------------------- 3271 Returns 1 if the double-precision floating-point value `a' is less than or 3272 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3273 cause an exception. Otherwise, the comparison is performed according to the 3274 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3275 ------------------------------------------------------------------------------- 3276 */ 3277 flag float64_le_quiet( float64 a, float64 b ) 3278 { 3279 flag aSign, bSign; 3280 3281 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3282 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3283 ) { 3284 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3285 float_raise( float_flag_invalid ); 3286 } 3287 return 0; 3288 } 3289 aSign = extractFloat64Sign( a ); 3290 bSign = extractFloat64Sign( b ); 3291 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3292 return ( a == b ) || ( aSign ^ ( a < b ) ); 3293 3294 } 3295 3296 /* 3297 ------------------------------------------------------------------------------- 3298 Returns 1 if the double-precision floating-point value `a' is less than 3299 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3300 exception. Otherwise, the comparison is performed according to the IEC/IEEE 3301 Standard for Binary Floating-Point Arithmetic. 3302 ------------------------------------------------------------------------------- 3303 */ 3304 flag float64_lt_quiet( float64 a, float64 b ) 3305 { 3306 flag aSign, bSign; 3307 3308 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3309 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3310 ) { 3311 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3312 float_raise( float_flag_invalid ); 3313 } 3314 return 0; 3315 } 3316 aSign = extractFloat64Sign( a ); 3317 bSign = extractFloat64Sign( b ); 3318 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3319 return ( a != b ) && ( aSign ^ ( a < b ) ); 3320 3321 } 3322 #endif 3323 3324 #ifdef FLOATX80 3325 3326 /* 3327 ------------------------------------------------------------------------------- 3328 Returns the result of converting the extended double-precision floating- 3329 point value `a' to the 32-bit two's complement integer format. The 3330 conversion is performed according to the IEC/IEEE Standard for Binary 3331 Floating-Point Arithmetic---which means in particular that the conversion 3332 is rounded according to the current rounding mode. If `a' is a NaN, the 3333 largest positive integer is returned. Otherwise, if the conversion 3334 overflows, the largest integer with the same sign as `a' is returned. 3335 ------------------------------------------------------------------------------- 3336 */ 3337 int32 floatx80_to_int32( floatx80 a ) 3338 { 3339 flag aSign; 3340 int32 aExp, shiftCount; 3341 bits64 aSig; 3342 3343 aSig = extractFloatx80Frac( a ); 3344 aExp = extractFloatx80Exp( a ); 3345 aSign = extractFloatx80Sign( a ); 3346 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3347 shiftCount = 0x4037 - aExp; 3348 if ( shiftCount <= 0 ) shiftCount = 1; 3349 shift64RightJamming( aSig, shiftCount, &aSig ); 3350 return roundAndPackInt32( aSign, aSig ); 3351 3352 } 3353 3354 /* 3355 ------------------------------------------------------------------------------- 3356 Returns the result of converting the extended double-precision floating- 3357 point value `a' to the 32-bit two's complement integer format. The 3358 conversion is performed according to the IEC/IEEE Standard for Binary 3359 Floating-Point Arithmetic, except that the conversion is always rounded 3360 toward zero. If `a' is a NaN, the largest positive integer is returned. 3361 Otherwise, if the conversion overflows, the largest integer with the same 3362 sign as `a' is returned. 3363 ------------------------------------------------------------------------------- 3364 */ 3365 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3366 { 3367 flag aSign; 3368 int32 aExp, shiftCount; 3369 bits64 aSig, savedASig; 3370 int32 z; 3371 3372 aSig = extractFloatx80Frac( a ); 3373 aExp = extractFloatx80Exp( a ); 3374 aSign = extractFloatx80Sign( a ); 3375 if ( 0x401E < aExp ) { 3376 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3377 goto invalid; 3378 } 3379 else if ( aExp < 0x3FFF ) { 3380 if ( aExp || aSig ) float_set_inexact(); 3381 return 0; 3382 } 3383 shiftCount = 0x403E - aExp; 3384 savedASig = aSig; 3385 aSig >>= shiftCount; 3386 z = aSig; 3387 if ( aSign ) z = - z; 3388 if ( ( z < 0 ) ^ aSign ) { 3389 invalid: 3390 float_raise( float_flag_invalid ); 3391 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3392 } 3393 if ( ( aSig<<shiftCount ) != savedASig ) { 3394 float_set_inexact(); 3395 } 3396 return z; 3397 3398 } 3399 3400 /* 3401 ------------------------------------------------------------------------------- 3402 Returns the result of converting the extended double-precision floating- 3403 point value `a' to the 64-bit two's complement integer format. The 3404 conversion is performed according to the IEC/IEEE Standard for Binary 3405 Floating-Point Arithmetic---which means in particular that the conversion 3406 is rounded according to the current rounding mode. If `a' is a NaN, 3407 the largest positive integer is returned. Otherwise, if the conversion 3408 overflows, the largest integer with the same sign as `a' is returned. 3409 ------------------------------------------------------------------------------- 3410 */ 3411 int64 floatx80_to_int64( floatx80 a ) 3412 { 3413 flag aSign; 3414 int32 aExp, shiftCount; 3415 bits64 aSig, aSigExtra; 3416 3417 aSig = extractFloatx80Frac( a ); 3418 aExp = extractFloatx80Exp( a ); 3419 aSign = extractFloatx80Sign( a ); 3420 shiftCount = 0x403E - aExp; 3421 if ( shiftCount <= 0 ) { 3422 if ( shiftCount ) { 3423 float_raise( float_flag_invalid ); 3424 if ( ! aSign 3425 || ( ( aExp == 0x7FFF ) 3426 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3427 ) { 3428 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3429 } 3430 return (sbits64) LIT64( 0x8000000000000000 ); 3431 } 3432 aSigExtra = 0; 3433 } 3434 else { 3435 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3436 } 3437 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3438 3439 } 3440 3441 /* 3442 ------------------------------------------------------------------------------- 3443 Returns the result of converting the extended double-precision floating- 3444 point value `a' to the 64-bit two's complement integer format. The 3445 conversion is performed according to the IEC/IEEE Standard for Binary 3446 Floating-Point Arithmetic, except that the conversion is always rounded 3447 toward zero. If `a' is a NaN, the largest positive integer is returned. 3448 Otherwise, if the conversion overflows, the largest integer with the same 3449 sign as `a' is returned. 3450 ------------------------------------------------------------------------------- 3451 */ 3452 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3453 { 3454 flag aSign; 3455 int32 aExp, shiftCount; 3456 bits64 aSig; 3457 int64 z; 3458 3459 aSig = extractFloatx80Frac( a ); 3460 aExp = extractFloatx80Exp( a ); 3461 aSign = extractFloatx80Sign( a ); 3462 shiftCount = aExp - 0x403E; 3463 if ( 0 <= shiftCount ) { 3464 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3465 if ( ( a.high != 0xC03E ) || aSig ) { 3466 float_raise( float_flag_invalid ); 3467 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3468 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3469 } 3470 } 3471 return (sbits64) LIT64( 0x8000000000000000 ); 3472 } 3473 else if ( aExp < 0x3FFF ) { 3474 if ( aExp | aSig ) float_set_inexact(); 3475 return 0; 3476 } 3477 z = aSig>>( - shiftCount ); 3478 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3479 float_set_inexact(); 3480 } 3481 if ( aSign ) z = - z; 3482 return z; 3483 3484 } 3485 3486 /* 3487 ------------------------------------------------------------------------------- 3488 Returns the result of converting the extended double-precision floating- 3489 point value `a' to the single-precision floating-point format. The 3490 conversion is performed according to the IEC/IEEE Standard for Binary 3491 Floating-Point Arithmetic. 3492 ------------------------------------------------------------------------------- 3493 */ 3494 float32 floatx80_to_float32( floatx80 a ) 3495 { 3496 flag aSign; 3497 int32 aExp; 3498 bits64 aSig; 3499 3500 aSig = extractFloatx80Frac( a ); 3501 aExp = extractFloatx80Exp( a ); 3502 aSign = extractFloatx80Sign( a ); 3503 if ( aExp == 0x7FFF ) { 3504 if ( (bits64) ( aSig<<1 ) ) { 3505 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3506 } 3507 return packFloat32( aSign, 0xFF, 0 ); 3508 } 3509 shift64RightJamming( aSig, 33, &aSig ); 3510 if ( aExp || aSig ) aExp -= 0x3F81; 3511 return roundAndPackFloat32( aSign, aExp, aSig ); 3512 3513 } 3514 3515 /* 3516 ------------------------------------------------------------------------------- 3517 Returns the result of converting the extended double-precision floating- 3518 point value `a' to the double-precision floating-point format. The 3519 conversion is performed according to the IEC/IEEE Standard for Binary 3520 Floating-Point Arithmetic. 3521 ------------------------------------------------------------------------------- 3522 */ 3523 float64 floatx80_to_float64( floatx80 a ) 3524 { 3525 flag aSign; 3526 int32 aExp; 3527 bits64 aSig, zSig; 3528 3529 aSig = extractFloatx80Frac( a ); 3530 aExp = extractFloatx80Exp( a ); 3531 aSign = extractFloatx80Sign( a ); 3532 if ( aExp == 0x7FFF ) { 3533 if ( (bits64) ( aSig<<1 ) ) { 3534 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3535 } 3536 return packFloat64( aSign, 0x7FF, 0 ); 3537 } 3538 shift64RightJamming( aSig, 1, &zSig ); 3539 if ( aExp || aSig ) aExp -= 0x3C01; 3540 return roundAndPackFloat64( aSign, aExp, zSig ); 3541 3542 } 3543 3544 #ifdef FLOAT128 3545 3546 /* 3547 ------------------------------------------------------------------------------- 3548 Returns the result of converting the extended double-precision floating- 3549 point value `a' to the quadruple-precision floating-point format. The 3550 conversion is performed according to the IEC/IEEE Standard for Binary 3551 Floating-Point Arithmetic. 3552 ------------------------------------------------------------------------------- 3553 */ 3554 float128 floatx80_to_float128( floatx80 a ) 3555 { 3556 flag aSign; 3557 int16 aExp; 3558 bits64 aSig, zSig0, zSig1; 3559 3560 aSig = extractFloatx80Frac( a ); 3561 aExp = extractFloatx80Exp( a ); 3562 aSign = extractFloatx80Sign( a ); 3563 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3564 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3565 } 3566 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3567 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3568 3569 } 3570 3571 #endif 3572 3573 /* 3574 ------------------------------------------------------------------------------- 3575 Rounds the extended double-precision floating-point value `a' to an integer, 3576 and returns the result as an extended quadruple-precision floating-point 3577 value. The operation is performed according to the IEC/IEEE Standard for 3578 Binary Floating-Point Arithmetic. 3579 ------------------------------------------------------------------------------- 3580 */ 3581 floatx80 floatx80_round_to_int( floatx80 a ) 3582 { 3583 flag aSign; 3584 int32 aExp; 3585 bits64 lastBitMask, roundBitsMask; 3586 int8 roundingMode; 3587 floatx80 z; 3588 3589 aExp = extractFloatx80Exp( a ); 3590 if ( 0x403E <= aExp ) { 3591 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3592 return propagateFloatx80NaN( a, a ); 3593 } 3594 return a; 3595 } 3596 if ( aExp < 0x3FFF ) { 3597 if ( ( aExp == 0 ) 3598 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3599 return a; 3600 } 3601 float_set_inexact(); 3602 aSign = extractFloatx80Sign( a ); 3603 switch ( float_rounding_mode() ) { 3604 case float_round_nearest_even: 3605 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3606 ) { 3607 return 3608 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3609 } 3610 break; 3611 case float_round_down: 3612 return 3613 aSign ? 3614 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3615 : packFloatx80( 0, 0, 0 ); 3616 case float_round_up: 3617 return 3618 aSign ? packFloatx80( 1, 0, 0 ) 3619 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3620 } 3621 return packFloatx80( aSign, 0, 0 ); 3622 } 3623 lastBitMask = 1; 3624 lastBitMask <<= 0x403E - aExp; 3625 roundBitsMask = lastBitMask - 1; 3626 z = a; 3627 roundingMode = float_rounding_mode(); 3628 if ( roundingMode == float_round_nearest_even ) { 3629 z.low += lastBitMask>>1; 3630 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3631 } 3632 else if ( roundingMode != float_round_to_zero ) { 3633 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3634 z.low += roundBitsMask; 3635 } 3636 } 3637 z.low &= ~ roundBitsMask; 3638 if ( z.low == 0 ) { 3639 ++z.high; 3640 z.low = LIT64( 0x8000000000000000 ); 3641 } 3642 if ( z.low != a.low ) float_set_inexact(); 3643 return z; 3644 3645 } 3646 3647 /* 3648 ------------------------------------------------------------------------------- 3649 Returns the result of adding the absolute values of the extended double- 3650 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3651 negated before being returned. `zSign' is ignored if the result is a NaN. 3652 The addition is performed according to the IEC/IEEE Standard for Binary 3653 Floating-Point Arithmetic. 3654 ------------------------------------------------------------------------------- 3655 */ 3656 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3657 { 3658 int32 aExp, bExp, zExp; 3659 bits64 aSig, bSig, zSig0, zSig1; 3660 int32 expDiff; 3661 3662 aSig = extractFloatx80Frac( a ); 3663 aExp = extractFloatx80Exp( a ); 3664 bSig = extractFloatx80Frac( b ); 3665 bExp = extractFloatx80Exp( b ); 3666 expDiff = aExp - bExp; 3667 if ( 0 < expDiff ) { 3668 if ( aExp == 0x7FFF ) { 3669 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3670 return a; 3671 } 3672 if ( bExp == 0 ) --expDiff; 3673 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3674 zExp = aExp; 3675 } 3676 else if ( expDiff < 0 ) { 3677 if ( bExp == 0x7FFF ) { 3678 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3679 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3680 } 3681 if ( aExp == 0 ) ++expDiff; 3682 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3683 zExp = bExp; 3684 } 3685 else { 3686 if ( aExp == 0x7FFF ) { 3687 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3688 return propagateFloatx80NaN( a, b ); 3689 } 3690 return a; 3691 } 3692 zSig1 = 0; 3693 zSig0 = aSig + bSig; 3694 if ( aExp == 0 ) { 3695 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3696 goto roundAndPack; 3697 } 3698 zExp = aExp; 3699 goto shiftRight1; 3700 } 3701 zSig0 = aSig + bSig; 3702 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3703 shiftRight1: 3704 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3705 zSig0 |= LIT64( 0x8000000000000000 ); 3706 ++zExp; 3707 roundAndPack: 3708 return 3709 roundAndPackFloatx80( 3710 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3711 3712 } 3713 3714 /* 3715 ------------------------------------------------------------------------------- 3716 Returns the result of subtracting the absolute values of the extended 3717 double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3718 difference is negated before being returned. `zSign' is ignored if the 3719 result is a NaN. The subtraction is performed according to the IEC/IEEE 3720 Standard for Binary Floating-Point Arithmetic. 3721 ------------------------------------------------------------------------------- 3722 */ 3723 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3724 { 3725 int32 aExp, bExp, zExp; 3726 bits64 aSig, bSig, zSig0, zSig1; 3727 int32 expDiff; 3728 floatx80 z; 3729 3730 aSig = extractFloatx80Frac( a ); 3731 aExp = extractFloatx80Exp( a ); 3732 bSig = extractFloatx80Frac( b ); 3733 bExp = extractFloatx80Exp( b ); 3734 expDiff = aExp - bExp; 3735 if ( 0 < expDiff ) goto aExpBigger; 3736 if ( expDiff < 0 ) goto bExpBigger; 3737 if ( aExp == 0x7FFF ) { 3738 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3739 return propagateFloatx80NaN( a, b ); 3740 } 3741 float_raise( float_flag_invalid ); 3742 z.low = floatx80_default_nan_low; 3743 z.high = floatx80_default_nan_high; 3744 return z; 3745 } 3746 if ( aExp == 0 ) { 3747 aExp = 1; 3748 bExp = 1; 3749 } 3750 zSig1 = 0; 3751 if ( bSig < aSig ) goto aBigger; 3752 if ( aSig < bSig ) goto bBigger; 3753 return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 ); 3754 bExpBigger: 3755 if ( bExp == 0x7FFF ) { 3756 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3757 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3758 } 3759 if ( aExp == 0 ) ++expDiff; 3760 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3761 bBigger: 3762 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3763 zExp = bExp; 3764 zSign ^= 1; 3765 goto normalizeRoundAndPack; 3766 aExpBigger: 3767 if ( aExp == 0x7FFF ) { 3768 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3769 return a; 3770 } 3771 if ( bExp == 0 ) --expDiff; 3772 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3773 aBigger: 3774 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3775 zExp = aExp; 3776 normalizeRoundAndPack: 3777 return 3778 normalizeRoundAndPackFloatx80( 3779 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3780 3781 } 3782 3783 /* 3784 ------------------------------------------------------------------------------- 3785 Returns the result of adding the extended double-precision floating-point 3786 values `a' and `b'. The operation is performed according to the IEC/IEEE 3787 Standard for Binary Floating-Point Arithmetic. 3788 ------------------------------------------------------------------------------- 3789 */ 3790 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3791 { 3792 flag aSign, bSign; 3793 3794 aSign = extractFloatx80Sign( a ); 3795 bSign = extractFloatx80Sign( b ); 3796 if ( aSign == bSign ) { 3797 return addFloatx80Sigs( a, b, aSign ); 3798 } 3799 else { 3800 return subFloatx80Sigs( a, b, aSign ); 3801 } 3802 3803 } 3804 3805 /* 3806 ------------------------------------------------------------------------------- 3807 Returns the result of subtracting the extended double-precision floating- 3808 point values `a' and `b'. The operation is performed according to the 3809 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3810 ------------------------------------------------------------------------------- 3811 */ 3812 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3813 { 3814 flag aSign, bSign; 3815 3816 aSign = extractFloatx80Sign( a ); 3817 bSign = extractFloatx80Sign( b ); 3818 if ( aSign == bSign ) { 3819 return subFloatx80Sigs( a, b, aSign ); 3820 } 3821 else { 3822 return addFloatx80Sigs( a, b, aSign ); 3823 } 3824 3825 } 3826 3827 /* 3828 ------------------------------------------------------------------------------- 3829 Returns the result of multiplying the extended double-precision floating- 3830 point values `a' and `b'. The operation is performed according to the 3831 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3832 ------------------------------------------------------------------------------- 3833 */ 3834 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3835 { 3836 flag aSign, bSign, zSign; 3837 int32 aExp, bExp, zExp; 3838 bits64 aSig, bSig, zSig0, zSig1; 3839 floatx80 z; 3840 3841 aSig = extractFloatx80Frac( a ); 3842 aExp = extractFloatx80Exp( a ); 3843 aSign = extractFloatx80Sign( a ); 3844 bSig = extractFloatx80Frac( b ); 3845 bExp = extractFloatx80Exp( b ); 3846 bSign = extractFloatx80Sign( b ); 3847 zSign = aSign ^ bSign; 3848 if ( aExp == 0x7FFF ) { 3849 if ( (bits64) ( aSig<<1 ) 3850 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3851 return propagateFloatx80NaN( a, b ); 3852 } 3853 if ( ( bExp | bSig ) == 0 ) goto invalid; 3854 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3855 } 3856 if ( bExp == 0x7FFF ) { 3857 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3858 if ( ( aExp | aSig ) == 0 ) { 3859 invalid: 3860 float_raise( float_flag_invalid ); 3861 z.low = floatx80_default_nan_low; 3862 z.high = floatx80_default_nan_high; 3863 return z; 3864 } 3865 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3866 } 3867 if ( aExp == 0 ) { 3868 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3869 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3870 } 3871 if ( bExp == 0 ) { 3872 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3873 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3874 } 3875 zExp = aExp + bExp - 0x3FFE; 3876 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3877 if ( 0 < (sbits64) zSig0 ) { 3878 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3879 --zExp; 3880 } 3881 return 3882 roundAndPackFloatx80( 3883 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3884 3885 } 3886 3887 /* 3888 ------------------------------------------------------------------------------- 3889 Returns the result of dividing the extended double-precision floating-point 3890 value `a' by the corresponding value `b'. The operation is performed 3891 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3892 ------------------------------------------------------------------------------- 3893 */ 3894 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3895 { 3896 flag aSign, bSign, zSign; 3897 int32 aExp, bExp, zExp; 3898 bits64 aSig, bSig, zSig0, zSig1; 3899 bits64 rem0, rem1, rem2, term0, term1, term2; 3900 floatx80 z; 3901 3902 aSig = extractFloatx80Frac( a ); 3903 aExp = extractFloatx80Exp( a ); 3904 aSign = extractFloatx80Sign( a ); 3905 bSig = extractFloatx80Frac( b ); 3906 bExp = extractFloatx80Exp( b ); 3907 bSign = extractFloatx80Sign( b ); 3908 zSign = aSign ^ bSign; 3909 if ( aExp == 0x7FFF ) { 3910 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3911 if ( bExp == 0x7FFF ) { 3912 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3913 goto invalid; 3914 } 3915 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3916 } 3917 if ( bExp == 0x7FFF ) { 3918 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3919 return packFloatx80( zSign, 0, 0 ); 3920 } 3921 if ( bExp == 0 ) { 3922 if ( bSig == 0 ) { 3923 if ( ( aExp | aSig ) == 0 ) { 3924 invalid: 3925 float_raise( float_flag_invalid ); 3926 z.low = floatx80_default_nan_low; 3927 z.high = floatx80_default_nan_high; 3928 return z; 3929 } 3930 float_raise( float_flag_divbyzero ); 3931 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3932 } 3933 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3934 } 3935 if ( aExp == 0 ) { 3936 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3937 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3938 } 3939 zExp = aExp - bExp + 0x3FFE; 3940 rem1 = 0; 3941 if ( bSig <= aSig ) { 3942 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3943 ++zExp; 3944 } 3945 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3946 mul64To128( bSig, zSig0, &term0, &term1 ); 3947 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3948 while ( (sbits64) rem0 < 0 ) { 3949 --zSig0; 3950 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3951 } 3952 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3953 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3954 mul64To128( bSig, zSig1, &term1, &term2 ); 3955 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3956 while ( (sbits64) rem1 < 0 ) { 3957 --zSig1; 3958 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3959 } 3960 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3961 } 3962 return 3963 roundAndPackFloatx80( 3964 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3965 3966 } 3967 3968 /* 3969 ------------------------------------------------------------------------------- 3970 Returns the remainder of the extended double-precision floating-point value 3971 `a' with respect to the corresponding value `b'. The operation is performed 3972 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3973 ------------------------------------------------------------------------------- 3974 */ 3975 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3976 { 3977 flag aSign, bSign, zSign; 3978 int32 aExp, bExp, expDiff; 3979 bits64 aSig0, aSig1, bSig; 3980 bits64 q, term0, term1, alternateASig0, alternateASig1; 3981 floatx80 z; 3982 3983 aSig0 = extractFloatx80Frac( a ); 3984 aExp = extractFloatx80Exp( a ); 3985 aSign = extractFloatx80Sign( a ); 3986 bSig = extractFloatx80Frac( b ); 3987 bExp = extractFloatx80Exp( b ); 3988 bSign = extractFloatx80Sign( b ); 3989 if ( aExp == 0x7FFF ) { 3990 if ( (bits64) ( aSig0<<1 ) 3991 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3992 return propagateFloatx80NaN( a, b ); 3993 } 3994 goto invalid; 3995 } 3996 if ( bExp == 0x7FFF ) { 3997 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3998 return a; 3999 } 4000 if ( bExp == 0 ) { 4001 if ( bSig == 0 ) { 4002 invalid: 4003 float_raise( float_flag_invalid ); 4004 z.low = floatx80_default_nan_low; 4005 z.high = floatx80_default_nan_high; 4006 return z; 4007 } 4008 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4009 } 4010 if ( aExp == 0 ) { 4011 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 4012 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4013 } 4014 bSig |= LIT64( 0x8000000000000000 ); 4015 zSign = aSign; 4016 expDiff = aExp - bExp; 4017 aSig1 = 0; 4018 if ( expDiff < 0 ) { 4019 if ( expDiff < -1 ) return a; 4020 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 4021 expDiff = 0; 4022 } 4023 q = ( bSig <= aSig0 ); 4024 if ( q ) aSig0 -= bSig; 4025 expDiff -= 64; 4026 while ( 0 < expDiff ) { 4027 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4028 q = ( 2 < q ) ? q - 2 : 0; 4029 mul64To128( bSig, q, &term0, &term1 ); 4030 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4031 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 4032 expDiff -= 62; 4033 } 4034 expDiff += 64; 4035 if ( 0 < expDiff ) { 4036 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4037 q = ( 2 < q ) ? q - 2 : 0; 4038 q >>= 64 - expDiff; 4039 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 4040 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4041 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 4042 while ( le128( term0, term1, aSig0, aSig1 ) ) { 4043 ++q; 4044 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4045 } 4046 } 4047 else { 4048 term1 = 0; 4049 term0 = bSig; 4050 } 4051 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 4052 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4053 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4054 && ( q & 1 ) ) 4055 ) { 4056 aSig0 = alternateASig0; 4057 aSig1 = alternateASig1; 4058 zSign = ! zSign; 4059 } 4060 return 4061 normalizeRoundAndPackFloatx80( 4062 80, zSign, bExp + expDiff, aSig0, aSig1 ); 4063 4064 } 4065 4066 /* 4067 ------------------------------------------------------------------------------- 4068 Returns the square root of the extended double-precision floating-point 4069 value `a'. The operation is performed according to the IEC/IEEE Standard 4070 for Binary Floating-Point Arithmetic. 4071 ------------------------------------------------------------------------------- 4072 */ 4073 floatx80 floatx80_sqrt( floatx80 a ) 4074 { 4075 flag aSign; 4076 int32 aExp, zExp; 4077 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 4078 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4079 floatx80 z; 4080 4081 aSig0 = extractFloatx80Frac( a ); 4082 aExp = extractFloatx80Exp( a ); 4083 aSign = extractFloatx80Sign( a ); 4084 if ( aExp == 0x7FFF ) { 4085 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 4086 if ( ! aSign ) return a; 4087 goto invalid; 4088 } 4089 if ( aSign ) { 4090 if ( ( aExp | aSig0 ) == 0 ) return a; 4091 invalid: 4092 float_raise( float_flag_invalid ); 4093 z.low = floatx80_default_nan_low; 4094 z.high = floatx80_default_nan_high; 4095 return z; 4096 } 4097 if ( aExp == 0 ) { 4098 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4099 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4100 } 4101 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4102 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4103 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4104 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4105 doubleZSig0 = zSig0<<1; 4106 mul64To128( zSig0, zSig0, &term0, &term1 ); 4107 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4108 while ( (sbits64) rem0 < 0 ) { 4109 --zSig0; 4110 doubleZSig0 -= 2; 4111 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4112 } 4113 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4114 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4115 if ( zSig1 == 0 ) zSig1 = 1; 4116 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4117 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4118 mul64To128( zSig1, zSig1, &term2, &term3 ); 4119 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4120 while ( (sbits64) rem1 < 0 ) { 4121 --zSig1; 4122 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4123 term3 |= 1; 4124 term2 |= doubleZSig0; 4125 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4126 } 4127 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4128 } 4129 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4130 zSig0 |= doubleZSig0; 4131 return 4132 roundAndPackFloatx80( 4133 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4134 4135 } 4136 4137 /* 4138 ------------------------------------------------------------------------------- 4139 Returns 1 if the extended double-precision floating-point value `a' is 4140 equal to the corresponding value `b', and 0 otherwise. The comparison is 4141 performed according to the IEC/IEEE Standard for Binary Floating-Point 4142 Arithmetic. 4143 ------------------------------------------------------------------------------- 4144 */ 4145 flag floatx80_eq( floatx80 a, floatx80 b ) 4146 { 4147 4148 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4149 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4150 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4151 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4152 ) { 4153 if ( floatx80_is_signaling_nan( a ) 4154 || floatx80_is_signaling_nan( b ) ) { 4155 float_raise( float_flag_invalid ); 4156 } 4157 return 0; 4158 } 4159 return 4160 ( a.low == b.low ) 4161 && ( ( a.high == b.high ) 4162 || ( ( a.low == 0 ) 4163 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4164 ); 4165 4166 } 4167 4168 /* 4169 ------------------------------------------------------------------------------- 4170 Returns 1 if the extended double-precision floating-point value `a' is 4171 less than or equal to the corresponding value `b', and 0 otherwise. The 4172 comparison is performed according to the IEC/IEEE Standard for Binary 4173 Floating-Point Arithmetic. 4174 ------------------------------------------------------------------------------- 4175 */ 4176 flag floatx80_le( floatx80 a, floatx80 b ) 4177 { 4178 flag aSign, bSign; 4179 4180 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4181 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4182 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4183 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4184 ) { 4185 float_raise( float_flag_invalid ); 4186 return 0; 4187 } 4188 aSign = extractFloatx80Sign( a ); 4189 bSign = extractFloatx80Sign( b ); 4190 if ( aSign != bSign ) { 4191 return 4192 aSign 4193 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4194 == 0 ); 4195 } 4196 return 4197 aSign ? le128( b.high, b.low, a.high, a.low ) 4198 : le128( a.high, a.low, b.high, b.low ); 4199 4200 } 4201 4202 /* 4203 ------------------------------------------------------------------------------- 4204 Returns 1 if the extended double-precision floating-point value `a' is 4205 less than the corresponding value `b', and 0 otherwise. The comparison 4206 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4207 Arithmetic. 4208 ------------------------------------------------------------------------------- 4209 */ 4210 flag floatx80_lt( floatx80 a, floatx80 b ) 4211 { 4212 flag aSign, bSign; 4213 4214 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4215 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4216 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4217 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4218 ) { 4219 float_raise( float_flag_invalid ); 4220 return 0; 4221 } 4222 aSign = extractFloatx80Sign( a ); 4223 bSign = extractFloatx80Sign( b ); 4224 if ( aSign != bSign ) { 4225 return 4226 aSign 4227 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4228 != 0 ); 4229 } 4230 return 4231 aSign ? lt128( b.high, b.low, a.high, a.low ) 4232 : lt128( a.high, a.low, b.high, b.low ); 4233 4234 } 4235 4236 /* 4237 ------------------------------------------------------------------------------- 4238 Returns 1 if the extended double-precision floating-point value `a' is equal 4239 to the corresponding value `b', and 0 otherwise. The invalid exception is 4240 raised if either operand is a NaN. Otherwise, the comparison is performed 4241 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4242 ------------------------------------------------------------------------------- 4243 */ 4244 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4245 { 4246 4247 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4248 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4249 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4250 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4251 ) { 4252 float_raise( float_flag_invalid ); 4253 return 0; 4254 } 4255 return 4256 ( a.low == b.low ) 4257 && ( ( a.high == b.high ) 4258 || ( ( a.low == 0 ) 4259 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4260 ); 4261 4262 } 4263 4264 /* 4265 ------------------------------------------------------------------------------- 4266 Returns 1 if the extended double-precision floating-point value `a' is less 4267 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4268 do not cause an exception. Otherwise, the comparison is performed according 4269 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4270 ------------------------------------------------------------------------------- 4271 */ 4272 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4273 { 4274 flag aSign, bSign; 4275 4276 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4277 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4278 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4279 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4280 ) { 4281 if ( floatx80_is_signaling_nan( a ) 4282 || floatx80_is_signaling_nan( b ) ) { 4283 float_raise( float_flag_invalid ); 4284 } 4285 return 0; 4286 } 4287 aSign = extractFloatx80Sign( a ); 4288 bSign = extractFloatx80Sign( b ); 4289 if ( aSign != bSign ) { 4290 return 4291 aSign 4292 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4293 == 0 ); 4294 } 4295 return 4296 aSign ? le128( b.high, b.low, a.high, a.low ) 4297 : le128( a.high, a.low, b.high, b.low ); 4298 4299 } 4300 4301 /* 4302 ------------------------------------------------------------------------------- 4303 Returns 1 if the extended double-precision floating-point value `a' is less 4304 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4305 an exception. Otherwise, the comparison is performed according to the 4306 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4307 ------------------------------------------------------------------------------- 4308 */ 4309 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4310 { 4311 flag aSign, bSign; 4312 4313 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4314 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4315 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4316 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4317 ) { 4318 if ( floatx80_is_signaling_nan( a ) 4319 || floatx80_is_signaling_nan( b ) ) { 4320 float_raise( float_flag_invalid ); 4321 } 4322 return 0; 4323 } 4324 aSign = extractFloatx80Sign( a ); 4325 bSign = extractFloatx80Sign( b ); 4326 if ( aSign != bSign ) { 4327 return 4328 aSign 4329 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4330 != 0 ); 4331 } 4332 return 4333 aSign ? lt128( b.high, b.low, a.high, a.low ) 4334 : lt128( a.high, a.low, b.high, b.low ); 4335 4336 } 4337 4338 #endif 4339 4340 #ifdef FLOAT128 4341 4342 /* 4343 ------------------------------------------------------------------------------- 4344 Returns the result of converting the quadruple-precision floating-point 4345 value `a' to the 32-bit two's complement integer format. The conversion 4346 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4347 Arithmetic---which means in particular that the conversion is rounded 4348 according to the current rounding mode. If `a' is a NaN, the largest 4349 positive integer is returned. Otherwise, if the conversion overflows, the 4350 largest integer with the same sign as `a' is returned. 4351 ------------------------------------------------------------------------------- 4352 */ 4353 int32 float128_to_int32( float128 a ) 4354 { 4355 flag aSign; 4356 int32 aExp, shiftCount; 4357 bits64 aSig0, aSig1; 4358 4359 aSig1 = extractFloat128Frac1( a ); 4360 aSig0 = extractFloat128Frac0( a ); 4361 aExp = extractFloat128Exp( a ); 4362 aSign = extractFloat128Sign( a ); 4363 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4364 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4365 aSig0 |= ( aSig1 != 0 ); 4366 shiftCount = 0x4028 - aExp; 4367 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4368 return roundAndPackInt32( aSign, aSig0 ); 4369 4370 } 4371 4372 /* 4373 ------------------------------------------------------------------------------- 4374 Returns the result of converting the quadruple-precision floating-point 4375 value `a' to the 32-bit two's complement integer format. The conversion 4376 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4377 Arithmetic, except that the conversion is always rounded toward zero. If 4378 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4379 conversion overflows, the largest integer with the same sign as `a' is 4380 returned. 4381 ------------------------------------------------------------------------------- 4382 */ 4383 int32 float128_to_int32_round_to_zero( float128 a ) 4384 { 4385 flag aSign; 4386 int32 aExp, shiftCount; 4387 bits64 aSig0, aSig1, savedASig; 4388 int32 z; 4389 4390 aSig1 = extractFloat128Frac1( a ); 4391 aSig0 = extractFloat128Frac0( a ); 4392 aExp = extractFloat128Exp( a ); 4393 aSign = extractFloat128Sign( a ); 4394 aSig0 |= ( aSig1 != 0 ); 4395 if ( 0x401E < aExp ) { 4396 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4397 goto invalid; 4398 } 4399 else if ( aExp < 0x3FFF ) { 4400 if ( aExp || aSig0 ) float_set_inexact(); 4401 return 0; 4402 } 4403 aSig0 |= LIT64( 0x0001000000000000 ); 4404 shiftCount = 0x402F - aExp; 4405 savedASig = aSig0; 4406 aSig0 >>= shiftCount; 4407 z = aSig0; 4408 if ( aSign ) z = - z; 4409 if ( ( z < 0 ) ^ aSign ) { 4410 invalid: 4411 float_raise( float_flag_invalid ); 4412 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4413 } 4414 if ( ( aSig0<<shiftCount ) != savedASig ) { 4415 float_set_inexact(); 4416 } 4417 return z; 4418 4419 } 4420 4421 /* 4422 ------------------------------------------------------------------------------- 4423 Returns the result of converting the quadruple-precision floating-point 4424 value `a' to the 64-bit two's complement integer format. The conversion 4425 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4426 Arithmetic---which means in particular that the conversion is rounded 4427 according to the current rounding mode. If `a' is a NaN, the largest 4428 positive integer is returned. Otherwise, if the conversion overflows, the 4429 largest integer with the same sign as `a' is returned. 4430 ------------------------------------------------------------------------------- 4431 */ 4432 int64 float128_to_int64( float128 a ) 4433 { 4434 flag aSign; 4435 int32 aExp, shiftCount; 4436 bits64 aSig0, aSig1; 4437 4438 aSig1 = extractFloat128Frac1( a ); 4439 aSig0 = extractFloat128Frac0( a ); 4440 aExp = extractFloat128Exp( a ); 4441 aSign = extractFloat128Sign( a ); 4442 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4443 shiftCount = 0x402F - aExp; 4444 if ( shiftCount <= 0 ) { 4445 if ( 0x403E < aExp ) { 4446 float_raise( float_flag_invalid ); 4447 if ( ! aSign 4448 || ( ( aExp == 0x7FFF ) 4449 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4450 ) 4451 ) { 4452 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4453 } 4454 return (sbits64) LIT64( 0x8000000000000000 ); 4455 } 4456 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4457 } 4458 else { 4459 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4460 } 4461 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4462 4463 } 4464 4465 /* 4466 ------------------------------------------------------------------------------- 4467 Returns the result of converting the quadruple-precision floating-point 4468 value `a' to the 64-bit two's complement integer format. The conversion 4469 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4470 Arithmetic, except that the conversion is always rounded toward zero. 4471 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4472 the conversion overflows, the largest integer with the same sign as `a' is 4473 returned. 4474 ------------------------------------------------------------------------------- 4475 */ 4476 int64 float128_to_int64_round_to_zero( float128 a ) 4477 { 4478 flag aSign; 4479 int32 aExp, shiftCount; 4480 bits64 aSig0, aSig1; 4481 int64 z; 4482 4483 aSig1 = extractFloat128Frac1( a ); 4484 aSig0 = extractFloat128Frac0( a ); 4485 aExp = extractFloat128Exp( a ); 4486 aSign = extractFloat128Sign( a ); 4487 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4488 shiftCount = aExp - 0x402F; 4489 if ( 0 < shiftCount ) { 4490 if ( 0x403E <= aExp ) { 4491 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4492 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4493 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4494 if ( aSig1 ) float_set_inexact(); 4495 } 4496 else { 4497 float_raise( float_flag_invalid ); 4498 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4499 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4500 } 4501 } 4502 return (sbits64) LIT64( 0x8000000000000000 ); 4503 } 4504 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4505 if ( (bits64) ( aSig1<<shiftCount ) ) { 4506 float_set_inexact(); 4507 } 4508 } 4509 else { 4510 if ( aExp < 0x3FFF ) { 4511 if ( aExp | aSig0 | aSig1 ) { 4512 float_set_inexact(); 4513 } 4514 return 0; 4515 } 4516 z = aSig0>>( - shiftCount ); 4517 if ( aSig1 4518 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4519 float_set_inexact(); 4520 } 4521 } 4522 if ( aSign ) z = - z; 4523 return z; 4524 4525 } 4526 4527 /* 4528 ------------------------------------------------------------------------------- 4529 Returns the result of converting the quadruple-precision floating-point 4530 value `a' to the single-precision floating-point format. The conversion 4531 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4532 Arithmetic. 4533 ------------------------------------------------------------------------------- 4534 */ 4535 float32 float128_to_float32( float128 a ) 4536 { 4537 flag aSign; 4538 int32 aExp; 4539 bits64 aSig0, aSig1; 4540 bits32 zSig; 4541 4542 aSig1 = extractFloat128Frac1( a ); 4543 aSig0 = extractFloat128Frac0( a ); 4544 aExp = extractFloat128Exp( a ); 4545 aSign = extractFloat128Sign( a ); 4546 if ( aExp == 0x7FFF ) { 4547 if ( aSig0 | aSig1 ) { 4548 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4549 } 4550 return packFloat32( aSign, 0xFF, 0 ); 4551 } 4552 aSig0 |= ( aSig1 != 0 ); 4553 shift64RightJamming( aSig0, 18, &aSig0 ); 4554 zSig = aSig0; 4555 if ( aExp || zSig ) { 4556 zSig |= 0x40000000; 4557 aExp -= 0x3F81; 4558 } 4559 return roundAndPackFloat32( aSign, aExp, zSig ); 4560 4561 } 4562 4563 /* 4564 ------------------------------------------------------------------------------- 4565 Returns the result of converting the quadruple-precision floating-point 4566 value `a' to the double-precision floating-point format. The conversion 4567 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4568 Arithmetic. 4569 ------------------------------------------------------------------------------- 4570 */ 4571 float64 float128_to_float64( float128 a ) 4572 { 4573 flag aSign; 4574 int32 aExp; 4575 bits64 aSig0, aSig1; 4576 4577 aSig1 = extractFloat128Frac1( a ); 4578 aSig0 = extractFloat128Frac0( a ); 4579 aExp = extractFloat128Exp( a ); 4580 aSign = extractFloat128Sign( a ); 4581 if ( aExp == 0x7FFF ) { 4582 if ( aSig0 | aSig1 ) { 4583 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4584 } 4585 return packFloat64( aSign, 0x7FF, 0 ); 4586 } 4587 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4588 aSig0 |= ( aSig1 != 0 ); 4589 if ( aExp || aSig0 ) { 4590 aSig0 |= LIT64( 0x4000000000000000 ); 4591 aExp -= 0x3C01; 4592 } 4593 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4594 4595 } 4596 4597 #ifdef FLOATX80 4598 4599 /* 4600 ------------------------------------------------------------------------------- 4601 Returns the result of converting the quadruple-precision floating-point 4602 value `a' to the extended double-precision floating-point format. The 4603 conversion is performed according to the IEC/IEEE Standard for Binary 4604 Floating-Point Arithmetic. 4605 ------------------------------------------------------------------------------- 4606 */ 4607 floatx80 float128_to_floatx80( float128 a ) 4608 { 4609 flag aSign; 4610 int32 aExp; 4611 bits64 aSig0, aSig1; 4612 4613 aSig1 = extractFloat128Frac1( a ); 4614 aSig0 = extractFloat128Frac0( a ); 4615 aExp = extractFloat128Exp( a ); 4616 aSign = extractFloat128Sign( a ); 4617 if ( aExp == 0x7FFF ) { 4618 if ( aSig0 | aSig1 ) { 4619 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4620 } 4621 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4622 } 4623 if ( aExp == 0 ) { 4624 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4625 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4626 } 4627 else { 4628 aSig0 |= LIT64( 0x0001000000000000 ); 4629 } 4630 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4631 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4632 4633 } 4634 4635 #endif 4636 4637 /* 4638 ------------------------------------------------------------------------------- 4639 Rounds the quadruple-precision floating-point value `a' to an integer, and 4640 returns the result as a quadruple-precision floating-point value. The 4641 operation is performed according to the IEC/IEEE Standard for Binary 4642 Floating-Point Arithmetic. 4643 ------------------------------------------------------------------------------- 4644 */ 4645 float128 float128_round_to_int( float128 a ) 4646 { 4647 flag aSign; 4648 int32 aExp; 4649 bits64 lastBitMask, roundBitsMask; 4650 int8 roundingMode; 4651 float128 z; 4652 4653 aExp = extractFloat128Exp( a ); 4654 if ( 0x402F <= aExp ) { 4655 if ( 0x406F <= aExp ) { 4656 if ( ( aExp == 0x7FFF ) 4657 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4658 ) { 4659 return propagateFloat128NaN( a, a ); 4660 } 4661 return a; 4662 } 4663 lastBitMask = 1; 4664 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4665 roundBitsMask = lastBitMask - 1; 4666 z = a; 4667 roundingMode = float_rounding_mode(); 4668 if ( roundingMode == float_round_nearest_even ) { 4669 if ( lastBitMask ) { 4670 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4671 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4672 } 4673 else { 4674 if ( (sbits64) z.low < 0 ) { 4675 ++z.high; 4676 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4677 } 4678 } 4679 } 4680 else if ( roundingMode != float_round_to_zero ) { 4681 if ( extractFloat128Sign( z ) 4682 ^ ( roundingMode == float_round_up ) ) { 4683 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4684 } 4685 } 4686 z.low &= ~ roundBitsMask; 4687 } 4688 else { 4689 if ( aExp < 0x3FFF ) { 4690 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4691 float_set_inexact(); 4692 aSign = extractFloat128Sign( a ); 4693 switch ( float_rounding_mode() ) { 4694 case float_round_nearest_even: 4695 if ( ( aExp == 0x3FFE ) 4696 && ( extractFloat128Frac0( a ) 4697 | extractFloat128Frac1( a ) ) 4698 ) { 4699 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4700 } 4701 break; 4702 case float_round_down: 4703 return 4704 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4705 : packFloat128( 0, 0, 0, 0 ); 4706 case float_round_up: 4707 return 4708 aSign ? packFloat128( 1, 0, 0, 0 ) 4709 : packFloat128( 0, 0x3FFF, 0, 0 ); 4710 } 4711 return packFloat128( aSign, 0, 0, 0 ); 4712 } 4713 lastBitMask = 1; 4714 lastBitMask <<= 0x402F - aExp; 4715 roundBitsMask = lastBitMask - 1; 4716 z.low = 0; 4717 z.high = a.high; 4718 roundingMode = float_rounding_mode(); 4719 if ( roundingMode == float_round_nearest_even ) { 4720 z.high += lastBitMask>>1; 4721 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4722 z.high &= ~ lastBitMask; 4723 } 4724 } 4725 else if ( roundingMode != float_round_to_zero ) { 4726 if ( extractFloat128Sign( z ) 4727 ^ ( roundingMode == float_round_up ) ) { 4728 z.high |= ( a.low != 0 ); 4729 z.high += roundBitsMask; 4730 } 4731 } 4732 z.high &= ~ roundBitsMask; 4733 } 4734 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4735 float_set_inexact(); 4736 } 4737 return z; 4738 4739 } 4740 4741 /* 4742 ------------------------------------------------------------------------------- 4743 Returns the result of adding the absolute values of the quadruple-precision 4744 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4745 before being returned. `zSign' is ignored if the result is a NaN. 4746 The addition is performed according to the IEC/IEEE Standard for Binary 4747 Floating-Point Arithmetic. 4748 ------------------------------------------------------------------------------- 4749 */ 4750 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4751 { 4752 int32 aExp, bExp, zExp; 4753 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4754 int32 expDiff; 4755 4756 aSig1 = extractFloat128Frac1( a ); 4757 aSig0 = extractFloat128Frac0( a ); 4758 aExp = extractFloat128Exp( a ); 4759 bSig1 = extractFloat128Frac1( b ); 4760 bSig0 = extractFloat128Frac0( b ); 4761 bExp = extractFloat128Exp( b ); 4762 expDiff = aExp - bExp; 4763 if ( 0 < expDiff ) { 4764 if ( aExp == 0x7FFF ) { 4765 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4766 return a; 4767 } 4768 if ( bExp == 0 ) { 4769 --expDiff; 4770 } 4771 else { 4772 bSig0 |= LIT64( 0x0001000000000000 ); 4773 } 4774 shift128ExtraRightJamming( 4775 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4776 zExp = aExp; 4777 } 4778 else if ( expDiff < 0 ) { 4779 if ( bExp == 0x7FFF ) { 4780 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4781 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4782 } 4783 if ( aExp == 0 ) { 4784 ++expDiff; 4785 } 4786 else { 4787 aSig0 |= LIT64( 0x0001000000000000 ); 4788 } 4789 shift128ExtraRightJamming( 4790 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4791 zExp = bExp; 4792 } 4793 else { 4794 if ( aExp == 0x7FFF ) { 4795 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4796 return propagateFloat128NaN( a, b ); 4797 } 4798 return a; 4799 } 4800 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4801 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4802 zSig2 = 0; 4803 zSig0 |= LIT64( 0x0002000000000000 ); 4804 zExp = aExp; 4805 goto shiftRight1; 4806 } 4807 aSig0 |= LIT64( 0x0001000000000000 ); 4808 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4809 --zExp; 4810 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4811 ++zExp; 4812 shiftRight1: 4813 shift128ExtraRightJamming( 4814 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4815 roundAndPack: 4816 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4817 4818 } 4819 4820 /* 4821 ------------------------------------------------------------------------------- 4822 Returns the result of subtracting the absolute values of the quadruple- 4823 precision floating-point values `a' and `b'. If `zSign' is 1, the 4824 difference is negated before being returned. `zSign' is ignored if the 4825 result is a NaN. The subtraction is performed according to the IEC/IEEE 4826 Standard for Binary Floating-Point Arithmetic. 4827 ------------------------------------------------------------------------------- 4828 */ 4829 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4830 { 4831 int32 aExp, bExp, zExp; 4832 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4833 int32 expDiff; 4834 float128 z; 4835 4836 aSig1 = extractFloat128Frac1( a ); 4837 aSig0 = extractFloat128Frac0( a ); 4838 aExp = extractFloat128Exp( a ); 4839 bSig1 = extractFloat128Frac1( b ); 4840 bSig0 = extractFloat128Frac0( b ); 4841 bExp = extractFloat128Exp( b ); 4842 expDiff = aExp - bExp; 4843 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4844 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4845 if ( 0 < expDiff ) goto aExpBigger; 4846 if ( expDiff < 0 ) goto bExpBigger; 4847 if ( aExp == 0x7FFF ) { 4848 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4849 return propagateFloat128NaN( a, b ); 4850 } 4851 float_raise( float_flag_invalid ); 4852 z.low = float128_default_nan_low; 4853 z.high = float128_default_nan_high; 4854 return z; 4855 } 4856 if ( aExp == 0 ) { 4857 aExp = 1; 4858 bExp = 1; 4859 } 4860 if ( bSig0 < aSig0 ) goto aBigger; 4861 if ( aSig0 < bSig0 ) goto bBigger; 4862 if ( bSig1 < aSig1 ) goto aBigger; 4863 if ( aSig1 < bSig1 ) goto bBigger; 4864 return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 ); 4865 bExpBigger: 4866 if ( bExp == 0x7FFF ) { 4867 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4868 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4869 } 4870 if ( aExp == 0 ) { 4871 ++expDiff; 4872 } 4873 else { 4874 aSig0 |= LIT64( 0x4000000000000000 ); 4875 } 4876 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4877 bSig0 |= LIT64( 0x4000000000000000 ); 4878 bBigger: 4879 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4880 zExp = bExp; 4881 zSign ^= 1; 4882 goto normalizeRoundAndPack; 4883 aExpBigger: 4884 if ( aExp == 0x7FFF ) { 4885 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4886 return a; 4887 } 4888 if ( bExp == 0 ) { 4889 --expDiff; 4890 } 4891 else { 4892 bSig0 |= LIT64( 0x4000000000000000 ); 4893 } 4894 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4895 aSig0 |= LIT64( 0x4000000000000000 ); 4896 aBigger: 4897 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4898 zExp = aExp; 4899 normalizeRoundAndPack: 4900 --zExp; 4901 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4902 4903 } 4904 4905 /* 4906 ------------------------------------------------------------------------------- 4907 Returns the result of adding the quadruple-precision floating-point values 4908 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4909 for Binary Floating-Point Arithmetic. 4910 ------------------------------------------------------------------------------- 4911 */ 4912 float128 float128_add( float128 a, float128 b ) 4913 { 4914 flag aSign, bSign; 4915 4916 aSign = extractFloat128Sign( a ); 4917 bSign = extractFloat128Sign( b ); 4918 if ( aSign == bSign ) { 4919 return addFloat128Sigs( a, b, aSign ); 4920 } 4921 else { 4922 return subFloat128Sigs( a, b, aSign ); 4923 } 4924 4925 } 4926 4927 /* 4928 ------------------------------------------------------------------------------- 4929 Returns the result of subtracting the quadruple-precision floating-point 4930 values `a' and `b'. The operation is performed according to the IEC/IEEE 4931 Standard for Binary Floating-Point Arithmetic. 4932 ------------------------------------------------------------------------------- 4933 */ 4934 float128 float128_sub( float128 a, float128 b ) 4935 { 4936 flag aSign, bSign; 4937 4938 aSign = extractFloat128Sign( a ); 4939 bSign = extractFloat128Sign( b ); 4940 if ( aSign == bSign ) { 4941 return subFloat128Sigs( a, b, aSign ); 4942 } 4943 else { 4944 return addFloat128Sigs( a, b, aSign ); 4945 } 4946 4947 } 4948 4949 /* 4950 ------------------------------------------------------------------------------- 4951 Returns the result of multiplying the quadruple-precision floating-point 4952 values `a' and `b'. The operation is performed according to the IEC/IEEE 4953 Standard for Binary Floating-Point Arithmetic. 4954 ------------------------------------------------------------------------------- 4955 */ 4956 float128 float128_mul( float128 a, float128 b ) 4957 { 4958 flag aSign, bSign, zSign; 4959 int32 aExp, bExp, zExp; 4960 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4961 float128 z; 4962 4963 aSig1 = extractFloat128Frac1( a ); 4964 aSig0 = extractFloat128Frac0( a ); 4965 aExp = extractFloat128Exp( a ); 4966 aSign = extractFloat128Sign( a ); 4967 bSig1 = extractFloat128Frac1( b ); 4968 bSig0 = extractFloat128Frac0( b ); 4969 bExp = extractFloat128Exp( b ); 4970 bSign = extractFloat128Sign( b ); 4971 zSign = aSign ^ bSign; 4972 if ( aExp == 0x7FFF ) { 4973 if ( ( aSig0 | aSig1 ) 4974 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4975 return propagateFloat128NaN( a, b ); 4976 } 4977 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4978 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4979 } 4980 if ( bExp == 0x7FFF ) { 4981 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4982 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4983 invalid: 4984 float_raise( float_flag_invalid ); 4985 z.low = float128_default_nan_low; 4986 z.high = float128_default_nan_high; 4987 return z; 4988 } 4989 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4990 } 4991 if ( aExp == 0 ) { 4992 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4993 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4994 } 4995 if ( bExp == 0 ) { 4996 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4997 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4998 } 4999 zExp = aExp + bExp - 0x4000; 5000 aSig0 |= LIT64( 0x0001000000000000 ); 5001 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 5002 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 5003 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5004 zSig2 |= ( zSig3 != 0 ); 5005 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 5006 shift128ExtraRightJamming( 5007 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5008 ++zExp; 5009 } 5010 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5011 5012 } 5013 5014 /* 5015 ------------------------------------------------------------------------------- 5016 Returns the result of dividing the quadruple-precision floating-point value 5017 `a' by the corresponding value `b'. The operation is performed according to 5018 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5019 ------------------------------------------------------------------------------- 5020 */ 5021 float128 float128_div( float128 a, float128 b ) 5022 { 5023 flag aSign, bSign, zSign; 5024 int32 aExp, bExp, zExp; 5025 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5026 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5027 float128 z; 5028 5029 aSig1 = extractFloat128Frac1( a ); 5030 aSig0 = extractFloat128Frac0( a ); 5031 aExp = extractFloat128Exp( a ); 5032 aSign = extractFloat128Sign( a ); 5033 bSig1 = extractFloat128Frac1( b ); 5034 bSig0 = extractFloat128Frac0( b ); 5035 bExp = extractFloat128Exp( b ); 5036 bSign = extractFloat128Sign( b ); 5037 zSign = aSign ^ bSign; 5038 if ( aExp == 0x7FFF ) { 5039 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 5040 if ( bExp == 0x7FFF ) { 5041 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5042 goto invalid; 5043 } 5044 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5045 } 5046 if ( bExp == 0x7FFF ) { 5047 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5048 return packFloat128( zSign, 0, 0, 0 ); 5049 } 5050 if ( bExp == 0 ) { 5051 if ( ( bSig0 | bSig1 ) == 0 ) { 5052 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5053 invalid: 5054 float_raise( float_flag_invalid ); 5055 z.low = float128_default_nan_low; 5056 z.high = float128_default_nan_high; 5057 return z; 5058 } 5059 float_raise( float_flag_divbyzero ); 5060 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5061 } 5062 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5063 } 5064 if ( aExp == 0 ) { 5065 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5066 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5067 } 5068 zExp = aExp - bExp + 0x3FFD; 5069 shortShift128Left( 5070 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 5071 shortShift128Left( 5072 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5073 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 5074 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 5075 ++zExp; 5076 } 5077 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5078 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 5079 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 5080 while ( (sbits64) rem0 < 0 ) { 5081 --zSig0; 5082 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5083 } 5084 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5085 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5086 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5087 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5088 while ( (sbits64) rem1 < 0 ) { 5089 --zSig1; 5090 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5091 } 5092 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5093 } 5094 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5095 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5096 5097 } 5098 5099 /* 5100 ------------------------------------------------------------------------------- 5101 Returns the remainder of the quadruple-precision floating-point value `a' 5102 with respect to the corresponding value `b'. The operation is performed 5103 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5104 ------------------------------------------------------------------------------- 5105 */ 5106 float128 float128_rem( float128 a, float128 b ) 5107 { 5108 flag aSign, bSign, zSign; 5109 int32 aExp, bExp, expDiff; 5110 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5111 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5112 sbits64 sigMean0; 5113 float128 z; 5114 5115 aSig1 = extractFloat128Frac1( a ); 5116 aSig0 = extractFloat128Frac0( a ); 5117 aExp = extractFloat128Exp( a ); 5118 aSign = extractFloat128Sign( a ); 5119 bSig1 = extractFloat128Frac1( b ); 5120 bSig0 = extractFloat128Frac0( b ); 5121 bExp = extractFloat128Exp( b ); 5122 bSign = extractFloat128Sign( b ); 5123 if ( aExp == 0x7FFF ) { 5124 if ( ( aSig0 | aSig1 ) 5125 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5126 return propagateFloat128NaN( a, b ); 5127 } 5128 goto invalid; 5129 } 5130 if ( bExp == 0x7FFF ) { 5131 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5132 return a; 5133 } 5134 if ( bExp == 0 ) { 5135 if ( ( bSig0 | bSig1 ) == 0 ) { 5136 invalid: 5137 float_raise( float_flag_invalid ); 5138 z.low = float128_default_nan_low; 5139 z.high = float128_default_nan_high; 5140 return z; 5141 } 5142 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5143 } 5144 if ( aExp == 0 ) { 5145 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5146 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5147 } 5148 expDiff = aExp - bExp; 5149 if ( expDiff < -1 ) return a; 5150 shortShift128Left( 5151 aSig0 | LIT64( 0x0001000000000000 ), 5152 aSig1, 5153 15 - ( expDiff < 0 ), 5154 &aSig0, 5155 &aSig1 5156 ); 5157 shortShift128Left( 5158 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5159 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5160 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5161 expDiff -= 64; 5162 while ( 0 < expDiff ) { 5163 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5164 q = ( 4 < q ) ? q - 4 : 0; 5165 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5166 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5167 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5168 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5169 expDiff -= 61; 5170 } 5171 if ( -64 < expDiff ) { 5172 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5173 q = ( 4 < q ) ? q - 4 : 0; 5174 q >>= - expDiff; 5175 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5176 expDiff += 52; 5177 if ( expDiff < 0 ) { 5178 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5179 } 5180 else { 5181 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5182 } 5183 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5184 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5185 } 5186 else { 5187 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5188 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5189 } 5190 do { 5191 alternateASig0 = aSig0; 5192 alternateASig1 = aSig1; 5193 ++q; 5194 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5195 } while ( 0 <= (sbits64) aSig0 ); 5196 add128( 5197 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 ); 5198 if ( ( sigMean0 < 0 ) 5199 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5200 aSig0 = alternateASig0; 5201 aSig1 = alternateASig1; 5202 } 5203 zSign = ( (sbits64) aSig0 < 0 ); 5204 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5205 return 5206 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5207 5208 } 5209 5210 /* 5211 ------------------------------------------------------------------------------- 5212 Returns the square root of the quadruple-precision floating-point value `a'. 5213 The operation is performed according to the IEC/IEEE Standard for Binary 5214 Floating-Point Arithmetic. 5215 ------------------------------------------------------------------------------- 5216 */ 5217 float128 float128_sqrt( float128 a ) 5218 { 5219 flag aSign; 5220 int32 aExp, zExp; 5221 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5222 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5223 float128 z; 5224 5225 aSig1 = extractFloat128Frac1( a ); 5226 aSig0 = extractFloat128Frac0( a ); 5227 aExp = extractFloat128Exp( a ); 5228 aSign = extractFloat128Sign( a ); 5229 if ( aExp == 0x7FFF ) { 5230 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5231 if ( ! aSign ) return a; 5232 goto invalid; 5233 } 5234 if ( aSign ) { 5235 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5236 invalid: 5237 float_raise( float_flag_invalid ); 5238 z.low = float128_default_nan_low; 5239 z.high = float128_default_nan_high; 5240 return z; 5241 } 5242 if ( aExp == 0 ) { 5243 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5244 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5245 } 5246 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5247 aSig0 |= LIT64( 0x0001000000000000 ); 5248 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5249 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5250 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5251 doubleZSig0 = zSig0<<1; 5252 mul64To128( zSig0, zSig0, &term0, &term1 ); 5253 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5254 while ( (sbits64) rem0 < 0 ) { 5255 --zSig0; 5256 doubleZSig0 -= 2; 5257 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5258 } 5259 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5260 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5261 if ( zSig1 == 0 ) zSig1 = 1; 5262 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5263 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5264 mul64To128( zSig1, zSig1, &term2, &term3 ); 5265 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5266 while ( (sbits64) rem1 < 0 ) { 5267 --zSig1; 5268 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5269 term3 |= 1; 5270 term2 |= doubleZSig0; 5271 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5272 } 5273 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5274 } 5275 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5276 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5277 5278 } 5279 5280 /* 5281 ------------------------------------------------------------------------------- 5282 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5283 the corresponding value `b', and 0 otherwise. The comparison is performed 5284 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5285 ------------------------------------------------------------------------------- 5286 */ 5287 flag float128_eq( float128 a, float128 b ) 5288 { 5289 5290 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5291 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5292 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5293 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5294 ) { 5295 if ( float128_is_signaling_nan( a ) 5296 || float128_is_signaling_nan( b ) ) { 5297 float_raise( float_flag_invalid ); 5298 } 5299 return 0; 5300 } 5301 return 5302 ( a.low == b.low ) 5303 && ( ( a.high == b.high ) 5304 || ( ( a.low == 0 ) 5305 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5306 ); 5307 5308 } 5309 5310 /* 5311 ------------------------------------------------------------------------------- 5312 Returns 1 if the quadruple-precision floating-point value `a' is less than 5313 or equal to the corresponding value `b', and 0 otherwise. The comparison 5314 is performed according to the IEC/IEEE Standard for Binary Floating-Point 5315 Arithmetic. 5316 ------------------------------------------------------------------------------- 5317 */ 5318 flag float128_le( float128 a, float128 b ) 5319 { 5320 flag aSign, bSign; 5321 5322 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5323 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5324 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5325 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5326 ) { 5327 float_raise( float_flag_invalid ); 5328 return 0; 5329 } 5330 aSign = extractFloat128Sign( a ); 5331 bSign = extractFloat128Sign( b ); 5332 if ( aSign != bSign ) { 5333 return 5334 aSign 5335 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5336 == 0 ); 5337 } 5338 return 5339 aSign ? le128( b.high, b.low, a.high, a.low ) 5340 : le128( a.high, a.low, b.high, b.low ); 5341 5342 } 5343 5344 /* 5345 ------------------------------------------------------------------------------- 5346 Returns 1 if the quadruple-precision floating-point value `a' is less than 5347 the corresponding value `b', and 0 otherwise. The comparison is performed 5348 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5349 ------------------------------------------------------------------------------- 5350 */ 5351 flag float128_lt( float128 a, float128 b ) 5352 { 5353 flag aSign, bSign; 5354 5355 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5356 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5357 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5358 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5359 ) { 5360 float_raise( float_flag_invalid ); 5361 return 0; 5362 } 5363 aSign = extractFloat128Sign( a ); 5364 bSign = extractFloat128Sign( b ); 5365 if ( aSign != bSign ) { 5366 return 5367 aSign 5368 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5369 != 0 ); 5370 } 5371 return 5372 aSign ? lt128( b.high, b.low, a.high, a.low ) 5373 : lt128( a.high, a.low, b.high, b.low ); 5374 5375 } 5376 5377 /* 5378 ------------------------------------------------------------------------------- 5379 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5380 the corresponding value `b', and 0 otherwise. The invalid exception is 5381 raised if either operand is a NaN. Otherwise, the comparison is performed 5382 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5383 ------------------------------------------------------------------------------- 5384 */ 5385 flag float128_eq_signaling( float128 a, float128 b ) 5386 { 5387 5388 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5389 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5390 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5391 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5392 ) { 5393 float_raise( float_flag_invalid ); 5394 return 0; 5395 } 5396 return 5397 ( a.low == b.low ) 5398 && ( ( a.high == b.high ) 5399 || ( ( a.low == 0 ) 5400 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5401 ); 5402 5403 } 5404 5405 /* 5406 ------------------------------------------------------------------------------- 5407 Returns 1 if the quadruple-precision floating-point value `a' is less than 5408 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5409 cause an exception. Otherwise, the comparison is performed according to the 5410 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5411 ------------------------------------------------------------------------------- 5412 */ 5413 flag float128_le_quiet( float128 a, float128 b ) 5414 { 5415 flag aSign, bSign; 5416 5417 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5418 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5419 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5420 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5421 ) { 5422 if ( float128_is_signaling_nan( a ) 5423 || float128_is_signaling_nan( b ) ) { 5424 float_raise( float_flag_invalid ); 5425 } 5426 return 0; 5427 } 5428 aSign = extractFloat128Sign( a ); 5429 bSign = extractFloat128Sign( b ); 5430 if ( aSign != bSign ) { 5431 return 5432 aSign 5433 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5434 == 0 ); 5435 } 5436 return 5437 aSign ? le128( b.high, b.low, a.high, a.low ) 5438 : le128( a.high, a.low, b.high, b.low ); 5439 5440 } 5441 5442 /* 5443 ------------------------------------------------------------------------------- 5444 Returns 1 if the quadruple-precision floating-point value `a' is less than 5445 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5446 exception. Otherwise, the comparison is performed according to the IEC/IEEE 5447 Standard for Binary Floating-Point Arithmetic. 5448 ------------------------------------------------------------------------------- 5449 */ 5450 flag float128_lt_quiet( float128 a, float128 b ) 5451 { 5452 flag aSign, bSign; 5453 5454 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5455 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5456 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5457 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5458 ) { 5459 if ( float128_is_signaling_nan( a ) 5460 || float128_is_signaling_nan( b ) ) { 5461 float_raise( float_flag_invalid ); 5462 } 5463 return 0; 5464 } 5465 aSign = extractFloat128Sign( a ); 5466 bSign = extractFloat128Sign( b ); 5467 if ( aSign != bSign ) { 5468 return 5469 aSign 5470 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5471 != 0 ); 5472 } 5473 return 5474 aSign ? lt128( b.high, b.low, a.high, a.low ) 5475 : lt128( a.high, a.low, b.high, b.low ); 5476 5477 } 5478 5479 #endif 5480 5481 5482 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5483 5484 /* 5485 * These two routines are not part of the original softfloat distribution. 5486 * 5487 * They are based on the corresponding conversions to integer but return 5488 * unsigned numbers instead since these functions are required by GCC. 5489 * 5490 * Added by Mark Brinicombe <mark@netbsd.org> 27/09/97 5491 * 5492 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5493 */ 5494 5495 /* 5496 ------------------------------------------------------------------------------- 5497 Returns the result of converting the double-precision floating-point value 5498 `a' to the 32-bit unsigned integer format. The conversion is 5499 performed according to the IEC/IEEE Standard for Binary Floating-point 5500 Arithmetic, except that the conversion is always rounded toward zero. If 5501 `a' is a NaN, the largest positive integer is returned. If the conversion 5502 overflows, the largest integer positive is returned. 5503 ------------------------------------------------------------------------------- 5504 */ 5505 uint32 float64_to_uint32_round_to_zero( float64 a ) 5506 { 5507 flag aSign; 5508 int16 aExp, shiftCount; 5509 bits64 aSig, savedASig; 5510 uint32 z; 5511 5512 aSig = extractFloat64Frac( a ); 5513 aExp = extractFloat64Exp( a ); 5514 aSign = extractFloat64Sign( a ); 5515 5516 if (aSign) { 5517 float_raise( float_flag_invalid ); 5518 return(0); 5519 } 5520 5521 if ( 0x41E < aExp ) { 5522 float_raise( float_flag_invalid ); 5523 return 0xffffffff; 5524 } 5525 else if ( aExp < 0x3FF ) { 5526 if ( aExp || aSig ) float_set_inexact(); 5527 return 0; 5528 } 5529 aSig |= LIT64( 0x0010000000000000 ); 5530 shiftCount = 0x433 - aExp; 5531 savedASig = aSig; 5532 aSig >>= shiftCount; 5533 z = aSig; 5534 if ( ( aSig<<shiftCount ) != savedASig ) { 5535 float_set_inexact(); 5536 } 5537 return z; 5538 5539 } 5540 5541 /* 5542 ------------------------------------------------------------------------------- 5543 Returns the result of converting the single-precision floating-point value 5544 `a' to the 32-bit unsigned integer format. The conversion is 5545 performed according to the IEC/IEEE Standard for Binary Floating-point 5546 Arithmetic, except that the conversion is always rounded toward zero. If 5547 `a' is a NaN, the largest positive integer is returned. If the conversion 5548 overflows, the largest positive integer is returned. 5549 ------------------------------------------------------------------------------- 5550 */ 5551 uint32 float32_to_uint32_round_to_zero( float32 a ) 5552 { 5553 flag aSign; 5554 int16 aExp, shiftCount; 5555 bits32 aSig; 5556 uint32 z; 5557 5558 aSig = extractFloat32Frac( a ); 5559 aExp = extractFloat32Exp( a ); 5560 aSign = extractFloat32Sign( a ); 5561 shiftCount = aExp - 0x9E; 5562 5563 if (aSign) { 5564 float_raise( float_flag_invalid ); 5565 return(0); 5566 } 5567 if ( 0 < shiftCount ) { 5568 float_raise( float_flag_invalid ); 5569 return 0xFFFFFFFF; 5570 } 5571 else if ( aExp <= 0x7E ) { 5572 if ( aExp | aSig ) float_set_inexact(); 5573 return 0; 5574 } 5575 aSig = ( aSig | 0x800000 )<<8; 5576 z = aSig>>( - shiftCount ); 5577 if ( aSig<<( shiftCount & 31 ) ) { 5578 float_set_inexact(); 5579 } 5580 return z; 5581 5582 } 5583 5584 #endif 5585