1 /* $NetBSD: softfloat.c,v 1.6 2017/12/31 11:43:42 martin Exp $ */ 2 3 /* 4 * This version hacked for use with gcc -msoft-float by bjh21. 5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 6 * itself). 7 */ 8 9 /* 10 * Things you may want to define: 11 * 12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 14 * properly renamed. 15 */ 16 17 /* 18 =============================================================================== 19 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* If you need this in a boot program, you have bigger problems... */ 48 #ifndef _STANDALONE 49 50 #include <sys/cdefs.h> 51 #if defined(LIBC_SCCS) && !defined(lint) 52 __RCSID("$NetBSD: softfloat.c,v 1.6 2017/12/31 11:43:42 martin Exp $"); 53 #endif /* LIBC_SCCS and not lint */ 54 55 #ifdef SOFTFLOAT_FOR_GCC 56 #include "softfloat-for-gcc.h" 57 #endif 58 59 #include "milieu.h" 60 #include "softfloat.h" 61 62 /* 63 * Conversions between floats as stored in memory and floats as 64 * SoftFloat uses them 65 */ 66 #ifndef FLOAT64_DEMANGLE 67 #define FLOAT64_DEMANGLE(a) (a) 68 #endif 69 #ifndef FLOAT64_MANGLE 70 #define FLOAT64_MANGLE(a) (a) 71 #endif 72 73 /* 74 ------------------------------------------------------------------------------- 75 Floating-point rounding mode, extended double-precision rounding precision, 76 and exception flags. 77 ------------------------------------------------------------------------------- 78 */ 79 80 /* 81 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday. 82 * Right now, it does not. I've removed all other dynamic global 83 * variables. [ross] 84 */ 85 #ifdef FLOATX80 86 int8 floatx80_rounding_precision = 80; 87 #endif 88 89 /* 90 ------------------------------------------------------------------------------- 91 Primitive arithmetic functions, including multi-word arithmetic, and 92 division and square root approximations. (Can be specialized to target if 93 desired.) 94 ------------------------------------------------------------------------------- 95 */ 96 #include "softfloat-macros.h" 97 98 /* 99 ------------------------------------------------------------------------------- 100 Functions and definitions to determine: (1) whether tininess for underflow 101 is detected before or after rounding by default, (2) what (if anything) 102 happens when exceptions are raised, (3) how signaling NaNs are distinguished 103 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 104 are propagated from function inputs to output. These details are target- 105 specific. 106 ------------------------------------------------------------------------------- 107 */ 108 #include "softfloat-specialize.h" 109 110 #ifndef SOFTFLOAT_FOR_GCC /* Not used */ 111 /* 112 ------------------------------------------------------------------------------- 113 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 114 and 7, and returns the properly rounded 32-bit integer corresponding to the 115 input. If `zSign' is 1, the input is negated before being converted to an 116 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 117 is simply rounded to an integer, with the inexact exception raised if the 118 input cannot be represented exactly as an integer. However, if the fixed- 119 point input is too large, the invalid exception is raised and the largest 120 positive or negative integer is returned. 121 ------------------------------------------------------------------------------- 122 */ 123 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 124 { 125 int8 roundingMode; 126 flag roundNearestEven; 127 int8 roundIncrement, roundBits; 128 int32 z; 129 130 roundingMode = float_rounding_mode(); 131 roundNearestEven = ( roundingMode == float_round_nearest_even ); 132 roundIncrement = 0x40; 133 if ( ! roundNearestEven ) { 134 if ( roundingMode == float_round_to_zero ) { 135 roundIncrement = 0; 136 } 137 else { 138 roundIncrement = 0x7F; 139 if ( zSign ) { 140 if ( roundingMode == float_round_up ) roundIncrement = 0; 141 } 142 else { 143 if ( roundingMode == float_round_down ) roundIncrement = 0; 144 } 145 } 146 } 147 roundBits = absZ & 0x7F; 148 absZ = ( absZ + roundIncrement )>>7; 149 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 150 z = absZ; 151 if ( zSign ) z = - z; 152 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 153 float_raise( float_flag_invalid ); 154 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 155 } 156 if ( roundBits ) float_set_inexact(); 157 return z; 158 159 } 160 161 /* 162 ------------------------------------------------------------------------------- 163 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 164 `absZ1', with binary point between bits 63 and 64 (between the input words), 165 and returns the properly rounded 64-bit integer corresponding to the input. 166 If `zSign' is 1, the input is negated before being converted to an integer. 167 Ordinarily, the fixed-point input is simply rounded to an integer, with 168 the inexact exception raised if the input cannot be represented exactly as 169 an integer. However, if the fixed-point input is too large, the invalid 170 exception is raised and the largest positive or negative integer is 171 returned. 172 ------------------------------------------------------------------------------- 173 */ 174 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 175 { 176 int8 roundingMode; 177 flag roundNearestEven, increment; 178 int64 z; 179 180 roundingMode = float_rounding_mode(); 181 roundNearestEven = ( roundingMode == float_round_nearest_even ); 182 increment = ( (sbits64) absZ1 < 0 ); 183 if ( ! roundNearestEven ) { 184 if ( roundingMode == float_round_to_zero ) { 185 increment = 0; 186 } 187 else { 188 if ( zSign ) { 189 increment = ( roundingMode == float_round_down ) && absZ1; 190 } 191 else { 192 increment = ( roundingMode == float_round_up ) && absZ1; 193 } 194 } 195 } 196 if ( increment ) { 197 ++absZ0; 198 if ( absZ0 == 0 ) goto overflow; 199 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 200 } 201 z = absZ0; 202 if ( zSign ) z = - z; 203 if ( z && ( ( z < 0 ) ^ zSign ) ) { 204 overflow: 205 float_raise( float_flag_invalid ); 206 return 207 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 208 : LIT64( 0x7FFFFFFFFFFFFFFF ); 209 } 210 if ( absZ1 ) float_set_inexact(); 211 return z; 212 213 } 214 215 /* same as above, but for unsigned values */ 216 static uint64 roundAndPackUInt64( bits64 absZ0, bits64 absZ1 ) 217 { 218 int8 roundingMode; 219 flag roundNearestEven, increment; 220 uint64 z; 221 222 roundingMode = float_rounding_mode(); 223 roundNearestEven = ( roundingMode == float_round_nearest_even ); 224 increment = ( (sbits64) absZ1 < 0 ); 225 if ( ! roundNearestEven ) { 226 if ( roundingMode == float_round_to_zero ) { 227 increment = 0; 228 } 229 else { 230 increment = ( roundingMode == float_round_up ) && absZ1; 231 } 232 } 233 if ( increment ) { 234 ++absZ0; 235 if ( absZ0 == 0 ) { 236 float_raise( float_flag_invalid ); 237 return LIT64( 0x7FFFFFFFFFFFFFFF ); 238 } 239 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 240 } 241 z = absZ0; 242 if ( absZ1 ) float_set_inexact(); 243 return z; 244 245 } 246 #endif 247 248 /* 249 ------------------------------------------------------------------------------- 250 Returns the fraction bits of the single-precision floating-point value `a'. 251 ------------------------------------------------------------------------------- 252 */ 253 INLINE bits32 extractFloat32Frac( float32 a ) 254 { 255 256 return a & 0x007FFFFF; 257 258 } 259 260 /* 261 ------------------------------------------------------------------------------- 262 Returns the exponent bits of the single-precision floating-point value `a'. 263 ------------------------------------------------------------------------------- 264 */ 265 INLINE int16 extractFloat32Exp( float32 a ) 266 { 267 268 return ( a>>23 ) & 0xFF; 269 270 } 271 272 /* 273 ------------------------------------------------------------------------------- 274 Returns the sign bit of the single-precision floating-point value `a'. 275 ------------------------------------------------------------------------------- 276 */ 277 INLINE flag extractFloat32Sign( float32 a ) 278 { 279 280 return a>>31; 281 282 } 283 284 /* 285 ------------------------------------------------------------------------------- 286 Normalizes the subnormal single-precision floating-point value represented 287 by the denormalized significand `aSig'. The normalized exponent and 288 significand are stored at the locations pointed to by `zExpPtr' and 289 `zSigPtr', respectively. 290 ------------------------------------------------------------------------------- 291 */ 292 static void 293 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 294 { 295 int8 shiftCount; 296 297 shiftCount = countLeadingZeros32( aSig ) - 8; 298 *zSigPtr = aSig<<shiftCount; 299 *zExpPtr = 1 - shiftCount; 300 301 } 302 303 /* 304 ------------------------------------------------------------------------------- 305 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 306 single-precision floating-point value, returning the result. After being 307 shifted into the proper positions, the three fields are simply added 308 together to form the result. This means that any integer portion of `zSig' 309 will be added into the exponent. Since a properly normalized significand 310 will have an integer portion equal to 1, the `zExp' input should be 1 less 311 than the desired result exponent whenever `zSig' is a complete, normalized 312 significand. 313 ------------------------------------------------------------------------------- 314 */ 315 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 316 { 317 318 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 319 320 } 321 322 /* 323 ------------------------------------------------------------------------------- 324 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 325 and significand `zSig', and returns the proper single-precision floating- 326 point value corresponding to the abstract input. Ordinarily, the abstract 327 value is simply rounded and packed into the single-precision format, with 328 the inexact exception raised if the abstract input cannot be represented 329 exactly. However, if the abstract value is too large, the overflow and 330 inexact exceptions are raised and an infinity or maximal finite value is 331 returned. If the abstract value is too small, the input value is rounded to 332 a subnormal number, and the underflow and inexact exceptions are raised if 333 the abstract input cannot be represented exactly as a subnormal single- 334 precision floating-point number. 335 The input significand `zSig' has its binary point between bits 30 336 and 29, which is 7 bits to the left of the usual location. This shifted 337 significand must be normalized or smaller. If `zSig' is not normalized, 338 `zExp' must be 0; in that case, the result returned is a subnormal number, 339 and it must not require rounding. In the usual case that `zSig' is 340 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 341 The handling of underflow and overflow follows the IEC/IEEE Standard for 342 Binary Floating-Point Arithmetic. 343 ------------------------------------------------------------------------------- 344 */ 345 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 346 { 347 int8 roundingMode; 348 flag roundNearestEven; 349 int8 roundIncrement, roundBits; 350 flag isTiny; 351 352 roundingMode = float_rounding_mode(); 353 roundNearestEven = ( roundingMode == float_round_nearest_even ); 354 roundIncrement = 0x40; 355 if ( ! roundNearestEven ) { 356 if ( roundingMode == float_round_to_zero ) { 357 roundIncrement = 0; 358 } 359 else { 360 roundIncrement = 0x7F; 361 if ( zSign ) { 362 if ( roundingMode == float_round_up ) roundIncrement = 0; 363 } 364 else { 365 if ( roundingMode == float_round_down ) roundIncrement = 0; 366 } 367 } 368 } 369 roundBits = zSig & 0x7F; 370 if ( 0xFD <= (bits16) zExp ) { 371 if ( ( 0xFD < zExp ) 372 || ( ( zExp == 0xFD ) 373 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 374 ) { 375 float_raise( float_flag_overflow | float_flag_inexact ); 376 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 377 } 378 if ( zExp < 0 ) { 379 isTiny = 380 ( float_detect_tininess == float_tininess_before_rounding ) 381 || ( zExp < -1 ) 382 || ( zSig + roundIncrement < 0x80000000 ); 383 shift32RightJamming( zSig, - zExp, &zSig ); 384 zExp = 0; 385 roundBits = zSig & 0x7F; 386 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 387 } 388 } 389 if ( roundBits ) float_set_inexact(); 390 zSig = ( zSig + roundIncrement )>>7; 391 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 392 if ( zSig == 0 ) zExp = 0; 393 return packFloat32( zSign, zExp, zSig ); 394 395 } 396 397 /* 398 ------------------------------------------------------------------------------- 399 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 400 and significand `zSig', and returns the proper single-precision floating- 401 point value corresponding to the abstract input. This routine is just like 402 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 403 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 404 floating-point exponent. 405 ------------------------------------------------------------------------------- 406 */ 407 static float32 408 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 409 { 410 int8 shiftCount; 411 412 shiftCount = countLeadingZeros32( zSig ) - 1; 413 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 414 415 } 416 417 /* 418 ------------------------------------------------------------------------------- 419 Returns the fraction bits of the double-precision floating-point value `a'. 420 ------------------------------------------------------------------------------- 421 */ 422 INLINE bits64 extractFloat64Frac( float64 a ) 423 { 424 425 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 426 427 } 428 429 /* 430 ------------------------------------------------------------------------------- 431 Returns the exponent bits of the double-precision floating-point value `a'. 432 ------------------------------------------------------------------------------- 433 */ 434 INLINE int16 extractFloat64Exp( float64 a ) 435 { 436 437 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 438 439 } 440 441 /* 442 ------------------------------------------------------------------------------- 443 Returns the sign bit of the double-precision floating-point value `a'. 444 ------------------------------------------------------------------------------- 445 */ 446 INLINE flag extractFloat64Sign( float64 a ) 447 { 448 449 return FLOAT64_DEMANGLE(a)>>63; 450 451 } 452 453 /* 454 ------------------------------------------------------------------------------- 455 Normalizes the subnormal double-precision floating-point value represented 456 by the denormalized significand `aSig'. The normalized exponent and 457 significand are stored at the locations pointed to by `zExpPtr' and 458 `zSigPtr', respectively. 459 ------------------------------------------------------------------------------- 460 */ 461 static void 462 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 463 { 464 int8 shiftCount; 465 466 shiftCount = countLeadingZeros64( aSig ) - 11; 467 *zSigPtr = aSig<<shiftCount; 468 *zExpPtr = 1 - shiftCount; 469 470 } 471 472 /* 473 ------------------------------------------------------------------------------- 474 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 475 double-precision floating-point value, returning the result. After being 476 shifted into the proper positions, the three fields are simply added 477 together to form the result. This means that any integer portion of `zSig' 478 will be added into the exponent. Since a properly normalized significand 479 will have an integer portion equal to 1, the `zExp' input should be 1 less 480 than the desired result exponent whenever `zSig' is a complete, normalized 481 significand. 482 ------------------------------------------------------------------------------- 483 */ 484 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 485 { 486 487 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 488 ( ( (bits64) zExp )<<52 ) + zSig ); 489 490 } 491 492 /* 493 ------------------------------------------------------------------------------- 494 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 495 and significand `zSig', and returns the proper double-precision floating- 496 point value corresponding to the abstract input. Ordinarily, the abstract 497 value is simply rounded and packed into the double-precision format, with 498 the inexact exception raised if the abstract input cannot be represented 499 exactly. However, if the abstract value is too large, the overflow and 500 inexact exceptions are raised and an infinity or maximal finite value is 501 returned. If the abstract value is too small, the input value is rounded to 502 a subnormal number, and the underflow and inexact exceptions are raised if 503 the abstract input cannot be represented exactly as a subnormal double- 504 precision floating-point number. 505 The input significand `zSig' has its binary point between bits 62 506 and 61, which is 10 bits to the left of the usual location. This shifted 507 significand must be normalized or smaller. If `zSig' is not normalized, 508 `zExp' must be 0; in that case, the result returned is a subnormal number, 509 and it must not require rounding. In the usual case that `zSig' is 510 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 511 The handling of underflow and overflow follows the IEC/IEEE Standard for 512 Binary Floating-Point Arithmetic. 513 ------------------------------------------------------------------------------- 514 */ 515 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 516 { 517 int8 roundingMode; 518 flag roundNearestEven; 519 int16 roundIncrement, roundBits; 520 flag isTiny; 521 522 roundingMode = float_rounding_mode(); 523 roundNearestEven = ( roundingMode == float_round_nearest_even ); 524 roundIncrement = 0x200; 525 if ( ! roundNearestEven ) { 526 if ( roundingMode == float_round_to_zero ) { 527 roundIncrement = 0; 528 } 529 else { 530 roundIncrement = 0x3FF; 531 if ( zSign ) { 532 if ( roundingMode == float_round_up ) roundIncrement = 0; 533 } 534 else { 535 if ( roundingMode == float_round_down ) roundIncrement = 0; 536 } 537 } 538 } 539 roundBits = zSig & 0x3FF; 540 if ( 0x7FD <= (bits16) zExp ) { 541 if ( ( 0x7FD < zExp ) 542 || ( ( zExp == 0x7FD ) 543 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 544 ) { 545 float_raise( float_flag_overflow | float_flag_inexact ); 546 return FLOAT64_MANGLE( 547 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 548 ( roundIncrement == 0 )); 549 } 550 if ( zExp < 0 ) { 551 isTiny = 552 ( float_detect_tininess == float_tininess_before_rounding ) 553 || ( zExp < -1 ) 554 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 555 shift64RightJamming( zSig, - zExp, &zSig ); 556 zExp = 0; 557 roundBits = zSig & 0x3FF; 558 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 559 } 560 } 561 if ( roundBits ) float_set_inexact(); 562 zSig = ( zSig + roundIncrement )>>10; 563 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 564 if ( zSig == 0 ) zExp = 0; 565 return packFloat64( zSign, zExp, zSig ); 566 567 } 568 569 /* 570 ------------------------------------------------------------------------------- 571 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 572 and significand `zSig', and returns the proper double-precision floating- 573 point value corresponding to the abstract input. This routine is just like 574 `roundAndPackFloat64' except that `zSig' does not have to be normalized. 575 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 576 floating-point exponent. 577 ------------------------------------------------------------------------------- 578 */ 579 static float64 580 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 581 { 582 int8 shiftCount; 583 584 shiftCount = countLeadingZeros64( zSig ) - 1; 585 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 586 587 } 588 589 #ifdef FLOATX80 590 591 /* 592 ------------------------------------------------------------------------------- 593 Returns the fraction bits of the extended double-precision floating-point 594 value `a'. 595 ------------------------------------------------------------------------------- 596 */ 597 INLINE bits64 extractFloatx80Frac( floatx80 a ) 598 { 599 600 return a.low; 601 602 } 603 604 /* 605 ------------------------------------------------------------------------------- 606 Returns the exponent bits of the extended double-precision floating-point 607 value `a'. 608 ------------------------------------------------------------------------------- 609 */ 610 INLINE int32 extractFloatx80Exp( floatx80 a ) 611 { 612 613 return a.high & 0x7FFF; 614 615 } 616 617 /* 618 ------------------------------------------------------------------------------- 619 Returns the sign bit of the extended double-precision floating-point value 620 `a'. 621 ------------------------------------------------------------------------------- 622 */ 623 INLINE flag extractFloatx80Sign( floatx80 a ) 624 { 625 626 return a.high>>15; 627 628 } 629 630 /* 631 ------------------------------------------------------------------------------- 632 Normalizes the subnormal extended double-precision floating-point value 633 represented by the denormalized significand `aSig'. The normalized exponent 634 and significand are stored at the locations pointed to by `zExpPtr' and 635 `zSigPtr', respectively. 636 ------------------------------------------------------------------------------- 637 */ 638 static void 639 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 640 { 641 int8 shiftCount; 642 643 shiftCount = countLeadingZeros64( aSig ); 644 *zSigPtr = aSig<<shiftCount; 645 *zExpPtr = 1 - shiftCount; 646 647 } 648 649 /* 650 ------------------------------------------------------------------------------- 651 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 652 extended double-precision floating-point value, returning the result. 653 ------------------------------------------------------------------------------- 654 */ 655 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 656 { 657 floatx80 z; 658 659 z.low = zSig; 660 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 661 return z; 662 663 } 664 665 /* 666 ------------------------------------------------------------------------------- 667 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 668 and extended significand formed by the concatenation of `zSig0' and `zSig1', 669 and returns the proper extended double-precision floating-point value 670 corresponding to the abstract input. Ordinarily, the abstract value is 671 rounded and packed into the extended double-precision format, with the 672 inexact exception raised if the abstract input cannot be represented 673 exactly. However, if the abstract value is too large, the overflow and 674 inexact exceptions are raised and an infinity or maximal finite value is 675 returned. If the abstract value is too small, the input value is rounded to 676 a subnormal number, and the underflow and inexact exceptions are raised if 677 the abstract input cannot be represented exactly as a subnormal extended 678 double-precision floating-point number. 679 If `roundingPrecision' is 32 or 64, the result is rounded to the same 680 number of bits as single or double precision, respectively. Otherwise, the 681 result is rounded to the full precision of the extended double-precision 682 format. 683 The input significand must be normalized or smaller. If the input 684 significand is not normalized, `zExp' must be 0; in that case, the result 685 returned is a subnormal number, and it must not require rounding. The 686 handling of underflow and overflow follows the IEC/IEEE Standard for Binary 687 Floating-Point Arithmetic. 688 ------------------------------------------------------------------------------- 689 */ 690 static floatx80 691 roundAndPackFloatx80( 692 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 693 ) 694 { 695 int8 roundingMode; 696 flag roundNearestEven, increment, isTiny; 697 int64 roundIncrement, roundMask, roundBits; 698 699 roundingMode = float_rounding_mode(); 700 roundNearestEven = ( roundingMode == float_round_nearest_even ); 701 if ( roundingPrecision == 80 ) goto precision80; 702 if ( roundingPrecision == 64 ) { 703 roundIncrement = LIT64( 0x0000000000000400 ); 704 roundMask = LIT64( 0x00000000000007FF ); 705 } 706 else if ( roundingPrecision == 32 ) { 707 roundIncrement = LIT64( 0x0000008000000000 ); 708 roundMask = LIT64( 0x000000FFFFFFFFFF ); 709 } 710 else { 711 goto precision80; 712 } 713 zSig0 |= ( zSig1 != 0 ); 714 if ( ! roundNearestEven ) { 715 if ( roundingMode == float_round_to_zero ) { 716 roundIncrement = 0; 717 } 718 else { 719 roundIncrement = roundMask; 720 if ( zSign ) { 721 if ( roundingMode == float_round_up ) roundIncrement = 0; 722 } 723 else { 724 if ( roundingMode == float_round_down ) roundIncrement = 0; 725 } 726 } 727 } 728 roundBits = zSig0 & roundMask; 729 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 730 if ( ( 0x7FFE < zExp ) 731 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 732 ) { 733 goto overflow; 734 } 735 if ( zExp <= 0 ) { 736 isTiny = 737 ( float_detect_tininess == float_tininess_before_rounding ) 738 || ( zExp < 0 ) 739 || ( zSig0 <= zSig0 + roundIncrement ); 740 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 741 zExp = 0; 742 roundBits = zSig0 & roundMask; 743 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 744 if ( roundBits ) float_set_inexact(); 745 zSig0 += roundIncrement; 746 if ( (sbits64) zSig0 < 0 ) zExp = 1; 747 roundIncrement = roundMask + 1; 748 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 749 roundMask |= roundIncrement; 750 } 751 zSig0 &= ~ roundMask; 752 return packFloatx80( zSign, zExp, zSig0 ); 753 } 754 } 755 if ( roundBits ) float_set_inexact(); 756 zSig0 += roundIncrement; 757 if ( zSig0 < roundIncrement ) { 758 ++zExp; 759 zSig0 = LIT64( 0x8000000000000000 ); 760 } 761 roundIncrement = roundMask + 1; 762 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 763 roundMask |= roundIncrement; 764 } 765 zSig0 &= ~ roundMask; 766 if ( zSig0 == 0 ) zExp = 0; 767 return packFloatx80( zSign, zExp, zSig0 ); 768 precision80: 769 increment = ( (sbits64) zSig1 < 0 ); 770 if ( ! roundNearestEven ) { 771 if ( roundingMode == float_round_to_zero ) { 772 increment = 0; 773 } 774 else { 775 if ( zSign ) { 776 increment = ( roundingMode == float_round_down ) && zSig1; 777 } 778 else { 779 increment = ( roundingMode == float_round_up ) && zSig1; 780 } 781 } 782 } 783 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 784 if ( ( 0x7FFE < zExp ) 785 || ( ( zExp == 0x7FFE ) 786 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 787 && increment 788 ) 789 ) { 790 roundMask = 0; 791 overflow: 792 float_raise( float_flag_overflow | float_flag_inexact ); 793 if ( ( roundingMode == float_round_to_zero ) 794 || ( zSign && ( roundingMode == float_round_up ) ) 795 || ( ! zSign && ( roundingMode == float_round_down ) ) 796 ) { 797 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 798 } 799 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 800 } 801 if ( zExp <= 0 ) { 802 isTiny = 803 ( float_detect_tininess == float_tininess_before_rounding ) 804 || ( zExp < 0 ) 805 || ! increment 806 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 807 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 808 zExp = 0; 809 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 810 if ( zSig1 ) float_set_inexact(); 811 if ( roundNearestEven ) { 812 increment = ( (sbits64) zSig1 < 0 ); 813 } 814 else { 815 if ( zSign ) { 816 increment = ( roundingMode == float_round_down ) && zSig1; 817 } 818 else { 819 increment = ( roundingMode == float_round_up ) && zSig1; 820 } 821 } 822 if ( increment ) { 823 ++zSig0; 824 zSig0 &= 825 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 826 if ( (sbits64) zSig0 < 0 ) zExp = 1; 827 } 828 return packFloatx80( zSign, zExp, zSig0 ); 829 } 830 } 831 if ( zSig1 ) float_set_inexact(); 832 if ( increment ) { 833 ++zSig0; 834 if ( zSig0 == 0 ) { 835 ++zExp; 836 zSig0 = LIT64( 0x8000000000000000 ); 837 } 838 else { 839 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 840 } 841 } 842 else { 843 if ( zSig0 == 0 ) zExp = 0; 844 } 845 return packFloatx80( zSign, zExp, zSig0 ); 846 847 } 848 849 /* 850 ------------------------------------------------------------------------------- 851 Takes an abstract floating-point value having sign `zSign', exponent 852 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 853 and returns the proper extended double-precision floating-point value 854 corresponding to the abstract input. This routine is just like 855 `roundAndPackFloatx80' except that the input significand does not have to be 856 normalized. 857 ------------------------------------------------------------------------------- 858 */ 859 static floatx80 860 normalizeRoundAndPackFloatx80( 861 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 862 ) 863 { 864 int8 shiftCount; 865 866 if ( zSig0 == 0 ) { 867 zSig0 = zSig1; 868 zSig1 = 0; 869 zExp -= 64; 870 } 871 shiftCount = countLeadingZeros64( zSig0 ); 872 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 873 zExp -= shiftCount; 874 return 875 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 876 877 } 878 879 #endif 880 881 #ifdef FLOAT128 882 883 /* 884 ------------------------------------------------------------------------------- 885 Returns the least-significant 64 fraction bits of the quadruple-precision 886 floating-point value `a'. 887 ------------------------------------------------------------------------------- 888 */ 889 INLINE bits64 extractFloat128Frac1( float128 a ) 890 { 891 892 return a.low; 893 894 } 895 896 /* 897 ------------------------------------------------------------------------------- 898 Returns the most-significant 48 fraction bits of the quadruple-precision 899 floating-point value `a'. 900 ------------------------------------------------------------------------------- 901 */ 902 INLINE bits64 extractFloat128Frac0( float128 a ) 903 { 904 905 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 906 907 } 908 909 /* 910 ------------------------------------------------------------------------------- 911 Returns the exponent bits of the quadruple-precision floating-point value 912 `a'. 913 ------------------------------------------------------------------------------- 914 */ 915 INLINE int32 extractFloat128Exp( float128 a ) 916 { 917 918 return ( a.high>>48 ) & 0x7FFF; 919 920 } 921 922 /* 923 ------------------------------------------------------------------------------- 924 Returns the sign bit of the quadruple-precision floating-point value `a'. 925 ------------------------------------------------------------------------------- 926 */ 927 INLINE flag extractFloat128Sign( float128 a ) 928 { 929 930 return a.high>>63; 931 932 } 933 934 /* 935 ------------------------------------------------------------------------------- 936 Normalizes the subnormal quadruple-precision floating-point value 937 represented by the denormalized significand formed by the concatenation of 938 `aSig0' and `aSig1'. The normalized exponent is stored at the location 939 pointed to by `zExpPtr'. The most significant 49 bits of the normalized 940 significand are stored at the location pointed to by `zSig0Ptr', and the 941 least significant 64 bits of the normalized significand are stored at the 942 location pointed to by `zSig1Ptr'. 943 ------------------------------------------------------------------------------- 944 */ 945 static void 946 normalizeFloat128Subnormal( 947 bits64 aSig0, 948 bits64 aSig1, 949 int32 *zExpPtr, 950 bits64 *zSig0Ptr, 951 bits64 *zSig1Ptr 952 ) 953 { 954 int8 shiftCount; 955 956 if ( aSig0 == 0 ) { 957 shiftCount = countLeadingZeros64( aSig1 ) - 15; 958 if ( shiftCount < 0 ) { 959 *zSig0Ptr = aSig1>>( - shiftCount ); 960 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 961 } 962 else { 963 *zSig0Ptr = aSig1<<shiftCount; 964 *zSig1Ptr = 0; 965 } 966 *zExpPtr = - shiftCount - 63; 967 } 968 else { 969 shiftCount = countLeadingZeros64( aSig0 ) - 15; 970 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 971 *zExpPtr = 1 - shiftCount; 972 } 973 974 } 975 976 /* 977 ------------------------------------------------------------------------------- 978 Packs the sign `zSign', the exponent `zExp', and the significand formed 979 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 980 floating-point value, returning the result. After being shifted into the 981 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 982 added together to form the most significant 32 bits of the result. This 983 means that any integer portion of `zSig0' will be added into the exponent. 984 Since a properly normalized significand will have an integer portion equal 985 to 1, the `zExp' input should be 1 less than the desired result exponent 986 whenever `zSig0' and `zSig1' concatenated form a complete, normalized 987 significand. 988 ------------------------------------------------------------------------------- 989 */ 990 INLINE float128 991 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 992 { 993 float128 z; 994 995 z.low = zSig1; 996 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 997 return z; 998 999 } 1000 1001 /* 1002 ------------------------------------------------------------------------------- 1003 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1004 and extended significand formed by the concatenation of `zSig0', `zSig1', 1005 and `zSig2', and returns the proper quadruple-precision floating-point value 1006 corresponding to the abstract input. Ordinarily, the abstract value is 1007 simply rounded and packed into the quadruple-precision format, with the 1008 inexact exception raised if the abstract input cannot be represented 1009 exactly. However, if the abstract value is too large, the overflow and 1010 inexact exceptions are raised and an infinity or maximal finite value is 1011 returned. If the abstract value is too small, the input value is rounded to 1012 a subnormal number, and the underflow and inexact exceptions are raised if 1013 the abstract input cannot be represented exactly as a subnormal quadruple- 1014 precision floating-point number. 1015 The input significand must be normalized or smaller. If the input 1016 significand is not normalized, `zExp' must be 0; in that case, the result 1017 returned is a subnormal number, and it must not require rounding. In the 1018 usual case that the input significand is normalized, `zExp' must be 1 less 1019 than the ``true'' floating-point exponent. The handling of underflow and 1020 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1021 ------------------------------------------------------------------------------- 1022 */ 1023 static float128 1024 roundAndPackFloat128( 1025 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 1026 { 1027 int8 roundingMode; 1028 flag roundNearestEven, increment, isTiny; 1029 1030 roundingMode = float_rounding_mode(); 1031 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1032 increment = ( (sbits64) zSig2 < 0 ); 1033 if ( ! roundNearestEven ) { 1034 if ( roundingMode == float_round_to_zero ) { 1035 increment = 0; 1036 } 1037 else { 1038 if ( zSign ) { 1039 increment = ( roundingMode == float_round_down ) && zSig2; 1040 } 1041 else { 1042 increment = ( roundingMode == float_round_up ) && zSig2; 1043 } 1044 } 1045 } 1046 if ( 0x7FFD <= (bits32) zExp ) { 1047 if ( ( 0x7FFD < zExp ) 1048 || ( ( zExp == 0x7FFD ) 1049 && eq128( 1050 LIT64( 0x0001FFFFFFFFFFFF ), 1051 LIT64( 0xFFFFFFFFFFFFFFFF ), 1052 zSig0, 1053 zSig1 1054 ) 1055 && increment 1056 ) 1057 ) { 1058 float_raise( float_flag_overflow | float_flag_inexact ); 1059 if ( ( roundingMode == float_round_to_zero ) 1060 || ( zSign && ( roundingMode == float_round_up ) ) 1061 || ( ! zSign && ( roundingMode == float_round_down ) ) 1062 ) { 1063 return 1064 packFloat128( 1065 zSign, 1066 0x7FFE, 1067 LIT64( 0x0000FFFFFFFFFFFF ), 1068 LIT64( 0xFFFFFFFFFFFFFFFF ) 1069 ); 1070 } 1071 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1072 } 1073 if ( zExp < 0 ) { 1074 isTiny = 1075 ( float_detect_tininess == float_tininess_before_rounding ) 1076 || ( zExp < -1 ) 1077 || ! increment 1078 || lt128( 1079 zSig0, 1080 zSig1, 1081 LIT64( 0x0001FFFFFFFFFFFF ), 1082 LIT64( 0xFFFFFFFFFFFFFFFF ) 1083 ); 1084 shift128ExtraRightJamming( 1085 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1086 zExp = 0; 1087 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1088 if ( roundNearestEven ) { 1089 increment = ( (sbits64) zSig2 < 0 ); 1090 } 1091 else { 1092 if ( zSign ) { 1093 increment = ( roundingMode == float_round_down ) && zSig2; 1094 } 1095 else { 1096 increment = ( roundingMode == float_round_up ) && zSig2; 1097 } 1098 } 1099 } 1100 } 1101 if ( zSig2 ) float_set_inexact(); 1102 if ( increment ) { 1103 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1104 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1105 } 1106 else { 1107 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1108 } 1109 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1110 1111 } 1112 1113 /* 1114 ------------------------------------------------------------------------------- 1115 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1116 and significand formed by the concatenation of `zSig0' and `zSig1', and 1117 returns the proper quadruple-precision floating-point value corresponding 1118 to the abstract input. This routine is just like `roundAndPackFloat128' 1119 except that the input significand has fewer bits and does not have to be 1120 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1121 point exponent. 1122 ------------------------------------------------------------------------------- 1123 */ 1124 static float128 1125 normalizeRoundAndPackFloat128( 1126 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1127 { 1128 int8 shiftCount; 1129 bits64 zSig2; 1130 1131 if ( zSig0 == 0 ) { 1132 zSig0 = zSig1; 1133 zSig1 = 0; 1134 zExp -= 64; 1135 } 1136 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1137 if ( 0 <= shiftCount ) { 1138 zSig2 = 0; 1139 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1140 } 1141 else { 1142 shift128ExtraRightJamming( 1143 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1144 } 1145 zExp -= shiftCount; 1146 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1147 1148 } 1149 1150 #endif 1151 1152 /* 1153 ------------------------------------------------------------------------------- 1154 Returns the result of converting the 32-bit two's complement integer `a' 1155 to the single-precision floating-point format. The conversion is performed 1156 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1157 ------------------------------------------------------------------------------- 1158 */ 1159 float32 int32_to_float32( int32 a ) 1160 { 1161 flag zSign; 1162 1163 if ( a == 0 ) return 0; 1164 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1165 zSign = ( a < 0 ); 1166 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 1167 1168 } 1169 1170 /* 1171 ------------------------------------------------------------------------------- 1172 Returns the result of converting the 32-bit two's complement integer `a' 1173 to the double-precision floating-point format. The conversion is performed 1174 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1175 ------------------------------------------------------------------------------- 1176 */ 1177 float64 int32_to_float64( int32 a ) 1178 { 1179 flag zSign; 1180 uint32 absA; 1181 int8 shiftCount; 1182 bits64 zSig; 1183 1184 if ( a == 0 ) return 0; 1185 zSign = ( a < 0 ); 1186 absA = zSign ? - a : a; 1187 shiftCount = countLeadingZeros32( absA ) + 21; 1188 zSig = absA; 1189 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1190 1191 } 1192 1193 #ifdef FLOATX80 1194 1195 /* 1196 ------------------------------------------------------------------------------- 1197 Returns the result of converting the 32-bit two's complement integer `a' 1198 to the extended double-precision floating-point format. The conversion 1199 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1200 Arithmetic. 1201 ------------------------------------------------------------------------------- 1202 */ 1203 floatx80 int32_to_floatx80( int32 a ) 1204 { 1205 flag zSign; 1206 uint32 absA; 1207 int8 shiftCount; 1208 bits64 zSig; 1209 1210 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1211 zSign = ( a < 0 ); 1212 absA = zSign ? - a : a; 1213 shiftCount = countLeadingZeros32( absA ) + 32; 1214 zSig = absA; 1215 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1216 1217 } 1218 1219 #endif 1220 1221 #ifdef FLOAT128 1222 1223 /* 1224 ------------------------------------------------------------------------------- 1225 Returns the result of converting the 32-bit two's complement integer `a' to 1226 the quadruple-precision floating-point format. The conversion is performed 1227 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1228 ------------------------------------------------------------------------------- 1229 */ 1230 float128 int32_to_float128( int32 a ) 1231 { 1232 flag zSign; 1233 uint32 absA; 1234 int8 shiftCount; 1235 bits64 zSig0; 1236 1237 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1238 zSign = ( a < 0 ); 1239 absA = zSign ? - a : a; 1240 shiftCount = countLeadingZeros32( absA ) + 17; 1241 zSig0 = absA; 1242 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1243 1244 } 1245 1246 #endif 1247 1248 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1249 /* 1250 ------------------------------------------------------------------------------- 1251 Returns the result of converting the 64-bit two's complement integer `a' 1252 to the single-precision floating-point format. The conversion is performed 1253 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1254 ------------------------------------------------------------------------------- 1255 */ 1256 float32 int64_to_float32( int64 a ) 1257 { 1258 flag zSign; 1259 uint64 absA; 1260 int8 shiftCount; 1261 1262 if ( a == 0 ) return 0; 1263 zSign = ( a < 0 ); 1264 absA = zSign ? - a : a; 1265 shiftCount = countLeadingZeros64( absA ) - 40; 1266 if ( 0 <= shiftCount ) { 1267 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1268 } 1269 else { 1270 shiftCount += 7; 1271 if ( shiftCount < 0 ) { 1272 shift64RightJamming( absA, - shiftCount, &absA ); 1273 } 1274 else { 1275 absA <<= shiftCount; 1276 } 1277 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1278 } 1279 1280 } 1281 1282 /* 1283 ------------------------------------------------------------------------------- 1284 Returns the result of converting the 64-bit two's complement integer `a' 1285 to the double-precision floating-point format. The conversion is performed 1286 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1287 ------------------------------------------------------------------------------- 1288 */ 1289 float64 int64_to_float64( int64 a ) 1290 { 1291 flag zSign; 1292 1293 if ( a == 0 ) return 0; 1294 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1295 return packFloat64( 1, 0x43E, 0 ); 1296 } 1297 zSign = ( a < 0 ); 1298 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1299 1300 } 1301 1302 #ifdef FLOATX80 1303 1304 /* 1305 ------------------------------------------------------------------------------- 1306 Returns the result of converting the 64-bit two's complement integer `a' 1307 to the extended double-precision floating-point format. The conversion 1308 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1309 Arithmetic. 1310 ------------------------------------------------------------------------------- 1311 */ 1312 floatx80 int64_to_floatx80( int64 a ) 1313 { 1314 flag zSign; 1315 uint64 absA; 1316 int8 shiftCount; 1317 1318 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1319 zSign = ( a < 0 ); 1320 absA = zSign ? - a : a; 1321 shiftCount = countLeadingZeros64( absA ); 1322 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1323 1324 } 1325 1326 #endif 1327 1328 #ifdef FLOAT128 1329 1330 /* 1331 ------------------------------------------------------------------------------- 1332 Returns the result of converting the 64-bit two's complement integer `a' to 1333 the quadruple-precision floating-point format. The conversion is performed 1334 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1335 ------------------------------------------------------------------------------- 1336 */ 1337 float128 int64_to_float128( int64 a ) 1338 { 1339 flag zSign; 1340 uint64 absA; 1341 int8 shiftCount; 1342 int32 zExp; 1343 bits64 zSig0, zSig1; 1344 1345 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1346 zSign = ( a < 0 ); 1347 absA = zSign ? - a : a; 1348 shiftCount = countLeadingZeros64( absA ) + 49; 1349 zExp = 0x406E - shiftCount; 1350 if ( 64 <= shiftCount ) { 1351 zSig1 = 0; 1352 zSig0 = absA; 1353 shiftCount -= 64; 1354 } 1355 else { 1356 zSig1 = absA; 1357 zSig0 = 0; 1358 } 1359 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1360 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1361 1362 } 1363 1364 #endif 1365 #endif /* !SOFTFLOAT_FOR_GCC */ 1366 1367 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1368 /* 1369 ------------------------------------------------------------------------------- 1370 Returns the result of converting the single-precision floating-point value 1371 `a' to the 32-bit two's complement integer format. The conversion is 1372 performed according to the IEC/IEEE Standard for Binary Floating-Point 1373 Arithmetic---which means in particular that the conversion is rounded 1374 according to the current rounding mode. If `a' is a NaN, the largest 1375 positive integer is returned. Otherwise, if the conversion overflows, the 1376 largest integer with the same sign as `a' is returned. 1377 ------------------------------------------------------------------------------- 1378 */ 1379 int32 float32_to_int32( float32 a ) 1380 { 1381 flag aSign; 1382 int16 aExp, shiftCount; 1383 bits32 aSig; 1384 bits64 aSig64; 1385 1386 aSig = extractFloat32Frac( a ); 1387 aExp = extractFloat32Exp( a ); 1388 aSign = extractFloat32Sign( a ); 1389 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1390 if ( aExp ) aSig |= 0x00800000; 1391 shiftCount = 0xAF - aExp; 1392 aSig64 = aSig; 1393 aSig64 <<= 32; 1394 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1395 return roundAndPackInt32( aSign, aSig64 ); 1396 1397 } 1398 #endif /* !SOFTFLOAT_FOR_GCC */ 1399 1400 /* 1401 ------------------------------------------------------------------------------- 1402 Returns the result of converting the single-precision floating-point value 1403 `a' to the 32-bit two's complement integer format. The conversion is 1404 performed according to the IEC/IEEE Standard for Binary Floating-Point 1405 Arithmetic, except that the conversion is always rounded toward zero. 1406 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1407 the conversion overflows, the largest integer with the same sign as `a' is 1408 returned. 1409 ------------------------------------------------------------------------------- 1410 */ 1411 int32 float32_to_int32_round_to_zero( float32 a ) 1412 { 1413 flag aSign; 1414 int16 aExp, shiftCount; 1415 bits32 aSig; 1416 int32 z; 1417 1418 aSig = extractFloat32Frac( a ); 1419 aExp = extractFloat32Exp( a ); 1420 aSign = extractFloat32Sign( a ); 1421 shiftCount = aExp - 0x9E; 1422 if ( 0 <= shiftCount ) { 1423 if ( a != 0xCF000000 ) { 1424 float_raise( float_flag_invalid ); 1425 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1426 } 1427 return (sbits32) 0x80000000; 1428 } 1429 else if ( aExp <= 0x7E ) { 1430 if ( aExp | aSig ) float_set_inexact(); 1431 return 0; 1432 } 1433 aSig = ( aSig | 0x00800000 )<<8; 1434 z = aSig>>( - shiftCount ); 1435 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1436 float_set_inexact(); 1437 } 1438 if ( aSign ) z = - z; 1439 return z; 1440 1441 } 1442 1443 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1444 /* 1445 ------------------------------------------------------------------------------- 1446 Returns the result of converting the single-precision floating-point value 1447 `a' to the 64-bit two's complement integer format. The conversion is 1448 performed according to the IEC/IEEE Standard for Binary Floating-Point 1449 Arithmetic---which means in particular that the conversion is rounded 1450 according to the current rounding mode. If `a' is a NaN, the largest 1451 positive integer is returned. Otherwise, if the conversion overflows, the 1452 largest integer with the same sign as `a' is returned. 1453 ------------------------------------------------------------------------------- 1454 */ 1455 int64 float32_to_int64( float32 a ) 1456 { 1457 flag aSign; 1458 int16 aExp, shiftCount; 1459 bits32 aSig; 1460 bits64 aSig64, aSigExtra; 1461 1462 aSig = extractFloat32Frac( a ); 1463 aExp = extractFloat32Exp( a ); 1464 aSign = extractFloat32Sign( a ); 1465 shiftCount = 0xBE - aExp; 1466 if ( shiftCount < 0 ) { 1467 float_raise( float_flag_invalid ); 1468 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1469 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1470 } 1471 return (sbits64) LIT64( 0x8000000000000000 ); 1472 } 1473 if ( aExp ) aSig |= 0x00800000; 1474 aSig64 = aSig; 1475 aSig64 <<= 40; 1476 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1477 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1478 1479 } 1480 1481 /* 1482 ------------------------------------------------------------------------------- 1483 Returns the result of converting the single-precision floating-point value 1484 `a' to the 64-bit two's complement integer format. The conversion is 1485 performed according to the IEC/IEEE Standard for Binary Floating-Point 1486 Arithmetic, except that the conversion is always rounded toward zero. If 1487 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1488 conversion overflows, the largest integer with the same sign as `a' is 1489 returned. 1490 ------------------------------------------------------------------------------- 1491 */ 1492 int64 float32_to_int64_round_to_zero( float32 a ) 1493 { 1494 flag aSign; 1495 int16 aExp, shiftCount; 1496 bits32 aSig; 1497 bits64 aSig64; 1498 int64 z; 1499 1500 aSig = extractFloat32Frac( a ); 1501 aExp = extractFloat32Exp( a ); 1502 aSign = extractFloat32Sign( a ); 1503 shiftCount = aExp - 0xBE; 1504 if ( 0 <= shiftCount ) { 1505 if ( a != 0xDF000000 ) { 1506 float_raise( float_flag_invalid ); 1507 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1508 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1509 } 1510 } 1511 return (sbits64) LIT64( 0x8000000000000000 ); 1512 } 1513 else if ( aExp <= 0x7E ) { 1514 if ( aExp | aSig ) float_set_inexact(); 1515 return 0; 1516 } 1517 aSig64 = aSig | 0x00800000; 1518 aSig64 <<= 40; 1519 z = aSig64>>( - shiftCount ); 1520 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1521 float_set_inexact(); 1522 } 1523 if ( aSign ) z = - z; 1524 return z; 1525 1526 } 1527 #endif /* !SOFTFLOAT_FOR_GCC */ 1528 1529 /* 1530 ------------------------------------------------------------------------------- 1531 Returns the result of converting the single-precision floating-point value 1532 `a' to the double-precision floating-point format. The conversion is 1533 performed according to the IEC/IEEE Standard for Binary Floating-Point 1534 Arithmetic. 1535 ------------------------------------------------------------------------------- 1536 */ 1537 float64 float32_to_float64( float32 a ) 1538 { 1539 flag aSign; 1540 int16 aExp; 1541 bits32 aSig; 1542 1543 aSig = extractFloat32Frac( a ); 1544 aExp = extractFloat32Exp( a ); 1545 aSign = extractFloat32Sign( a ); 1546 if ( aExp == 0xFF ) { 1547 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1548 return packFloat64( aSign, 0x7FF, 0 ); 1549 } 1550 if ( aExp == 0 ) { 1551 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1552 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1553 --aExp; 1554 } 1555 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1556 1557 } 1558 1559 #ifdef FLOATX80 1560 1561 /* 1562 ------------------------------------------------------------------------------- 1563 Returns the result of converting the single-precision floating-point value 1564 `a' to the extended double-precision floating-point format. The conversion 1565 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1566 Arithmetic. 1567 ------------------------------------------------------------------------------- 1568 */ 1569 floatx80 float32_to_floatx80( float32 a ) 1570 { 1571 flag aSign; 1572 int16 aExp; 1573 bits32 aSig; 1574 1575 aSig = extractFloat32Frac( a ); 1576 aExp = extractFloat32Exp( a ); 1577 aSign = extractFloat32Sign( a ); 1578 if ( aExp == 0xFF ) { 1579 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1580 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1581 } 1582 if ( aExp == 0 ) { 1583 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1584 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1585 } 1586 aSig |= 0x00800000; 1587 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1588 1589 } 1590 1591 #endif 1592 1593 #ifdef FLOAT128 1594 1595 /* 1596 ------------------------------------------------------------------------------- 1597 Returns the result of converting the single-precision floating-point value 1598 `a' to the double-precision floating-point format. The conversion is 1599 performed according to the IEC/IEEE Standard for Binary Floating-Point 1600 Arithmetic. 1601 ------------------------------------------------------------------------------- 1602 */ 1603 float128 float32_to_float128( float32 a ) 1604 { 1605 flag aSign; 1606 int16 aExp; 1607 bits32 aSig; 1608 1609 aSig = extractFloat32Frac( a ); 1610 aExp = extractFloat32Exp( a ); 1611 aSign = extractFloat32Sign( a ); 1612 if ( aExp == 0xFF ) { 1613 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1614 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1615 } 1616 if ( aExp == 0 ) { 1617 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1618 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1619 --aExp; 1620 } 1621 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1622 1623 } 1624 1625 #endif 1626 1627 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1628 /* 1629 ------------------------------------------------------------------------------- 1630 Rounds the single-precision floating-point value `a' to an integer, and 1631 returns the result as a single-precision floating-point value. The 1632 operation is performed according to the IEC/IEEE Standard for Binary 1633 Floating-Point Arithmetic. 1634 ------------------------------------------------------------------------------- 1635 */ 1636 float32 float32_round_to_int( float32 a ) 1637 { 1638 flag aSign; 1639 int16 aExp; 1640 bits32 lastBitMask, roundBitsMask; 1641 int8 roundingMode; 1642 float32 z; 1643 1644 aExp = extractFloat32Exp( a ); 1645 if ( 0x96 <= aExp ) { 1646 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1647 return propagateFloat32NaN( a, a ); 1648 } 1649 return a; 1650 } 1651 if ( aExp <= 0x7E ) { 1652 if ( (bits32) ( a<<1 ) == 0 ) return a; 1653 float_set_inexact(); 1654 aSign = extractFloat32Sign( a ); 1655 switch ( float_rounding_mode() ) { 1656 case float_round_nearest_even: 1657 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1658 return packFloat32( aSign, 0x7F, 0 ); 1659 } 1660 break; 1661 case float_round_down: 1662 return aSign ? 0xBF800000 : 0; 1663 case float_round_up: 1664 return aSign ? 0x80000000 : 0x3F800000; 1665 } 1666 return packFloat32( aSign, 0, 0 ); 1667 } 1668 lastBitMask = 1; 1669 lastBitMask <<= 0x96 - aExp; 1670 roundBitsMask = lastBitMask - 1; 1671 z = a; 1672 roundingMode = float_rounding_mode(); 1673 if ( roundingMode == float_round_nearest_even ) { 1674 z += lastBitMask>>1; 1675 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1676 } 1677 else if ( roundingMode != float_round_to_zero ) { 1678 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1679 z += roundBitsMask; 1680 } 1681 } 1682 z &= ~ roundBitsMask; 1683 if ( z != a ) float_set_inexact(); 1684 return z; 1685 1686 } 1687 #endif /* !SOFTFLOAT_FOR_GCC */ 1688 1689 /* 1690 ------------------------------------------------------------------------------- 1691 Returns the result of adding the absolute values of the single-precision 1692 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1693 before being returned. `zSign' is ignored if the result is a NaN. 1694 The addition is performed according to the IEC/IEEE Standard for Binary 1695 Floating-Point Arithmetic. 1696 ------------------------------------------------------------------------------- 1697 */ 1698 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1699 { 1700 int16 aExp, bExp, zExp; 1701 bits32 aSig, bSig, zSig; 1702 int16 expDiff; 1703 1704 aSig = extractFloat32Frac( a ); 1705 aExp = extractFloat32Exp( a ); 1706 bSig = extractFloat32Frac( b ); 1707 bExp = extractFloat32Exp( b ); 1708 expDiff = aExp - bExp; 1709 aSig <<= 6; 1710 bSig <<= 6; 1711 if ( 0 < expDiff ) { 1712 if ( aExp == 0xFF ) { 1713 if ( aSig ) return propagateFloat32NaN( a, b ); 1714 return a; 1715 } 1716 if ( bExp == 0 ) { 1717 --expDiff; 1718 } 1719 else { 1720 bSig |= 0x20000000; 1721 } 1722 shift32RightJamming( bSig, expDiff, &bSig ); 1723 zExp = aExp; 1724 } 1725 else if ( expDiff < 0 ) { 1726 if ( bExp == 0xFF ) { 1727 if ( bSig ) return propagateFloat32NaN( a, b ); 1728 return packFloat32( zSign, 0xFF, 0 ); 1729 } 1730 if ( aExp == 0 ) { 1731 ++expDiff; 1732 } 1733 else { 1734 aSig |= 0x20000000; 1735 } 1736 shift32RightJamming( aSig, - expDiff, &aSig ); 1737 zExp = bExp; 1738 } 1739 else { 1740 if ( aExp == 0xFF ) { 1741 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1742 return a; 1743 } 1744 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1745 zSig = 0x40000000 + aSig + bSig; 1746 zExp = aExp; 1747 goto roundAndPack; 1748 } 1749 aSig |= 0x20000000; 1750 zSig = ( aSig + bSig )<<1; 1751 --zExp; 1752 if ( (sbits32) zSig < 0 ) { 1753 zSig = aSig + bSig; 1754 ++zExp; 1755 } 1756 roundAndPack: 1757 return roundAndPackFloat32( zSign, zExp, zSig ); 1758 1759 } 1760 1761 /* 1762 ------------------------------------------------------------------------------- 1763 Returns the result of subtracting the absolute values of the single- 1764 precision floating-point values `a' and `b'. If `zSign' is 1, the 1765 difference is negated before being returned. `zSign' is ignored if the 1766 result is a NaN. The subtraction is performed according to the IEC/IEEE 1767 Standard for Binary Floating-Point Arithmetic. 1768 ------------------------------------------------------------------------------- 1769 */ 1770 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1771 { 1772 int16 aExp, bExp, zExp; 1773 bits32 aSig, bSig, zSig; 1774 int16 expDiff; 1775 1776 aSig = extractFloat32Frac( a ); 1777 aExp = extractFloat32Exp( a ); 1778 bSig = extractFloat32Frac( b ); 1779 bExp = extractFloat32Exp( b ); 1780 expDiff = aExp - bExp; 1781 aSig <<= 7; 1782 bSig <<= 7; 1783 if ( 0 < expDiff ) goto aExpBigger; 1784 if ( expDiff < 0 ) goto bExpBigger; 1785 if ( aExp == 0xFF ) { 1786 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1787 float_raise( float_flag_invalid ); 1788 return float32_default_nan; 1789 } 1790 if ( aExp == 0 ) { 1791 aExp = 1; 1792 bExp = 1; 1793 } 1794 if ( bSig < aSig ) goto aBigger; 1795 if ( aSig < bSig ) goto bBigger; 1796 return packFloat32( float_rounding_mode() == float_round_down, 0, 0 ); 1797 bExpBigger: 1798 if ( bExp == 0xFF ) { 1799 if ( bSig ) return propagateFloat32NaN( a, b ); 1800 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1801 } 1802 if ( aExp == 0 ) { 1803 ++expDiff; 1804 } 1805 else { 1806 aSig |= 0x40000000; 1807 } 1808 shift32RightJamming( aSig, - expDiff, &aSig ); 1809 bSig |= 0x40000000; 1810 bBigger: 1811 zSig = bSig - aSig; 1812 zExp = bExp; 1813 zSign ^= 1; 1814 goto normalizeRoundAndPack; 1815 aExpBigger: 1816 if ( aExp == 0xFF ) { 1817 if ( aSig ) return propagateFloat32NaN( a, b ); 1818 return a; 1819 } 1820 if ( bExp == 0 ) { 1821 --expDiff; 1822 } 1823 else { 1824 bSig |= 0x40000000; 1825 } 1826 shift32RightJamming( bSig, expDiff, &bSig ); 1827 aSig |= 0x40000000; 1828 aBigger: 1829 zSig = aSig - bSig; 1830 zExp = aExp; 1831 normalizeRoundAndPack: 1832 --zExp; 1833 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1834 1835 } 1836 1837 /* 1838 ------------------------------------------------------------------------------- 1839 Returns the result of adding the single-precision floating-point values `a' 1840 and `b'. The operation is performed according to the IEC/IEEE Standard for 1841 Binary Floating-Point Arithmetic. 1842 ------------------------------------------------------------------------------- 1843 */ 1844 float32 float32_add( float32 a, float32 b ) 1845 { 1846 flag aSign, bSign; 1847 1848 aSign = extractFloat32Sign( a ); 1849 bSign = extractFloat32Sign( b ); 1850 if ( aSign == bSign ) { 1851 return addFloat32Sigs( a, b, aSign ); 1852 } 1853 else { 1854 return subFloat32Sigs( a, b, aSign ); 1855 } 1856 1857 } 1858 1859 /* 1860 ------------------------------------------------------------------------------- 1861 Returns the result of subtracting the single-precision floating-point values 1862 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1863 for Binary Floating-Point Arithmetic. 1864 ------------------------------------------------------------------------------- 1865 */ 1866 float32 float32_sub( float32 a, float32 b ) 1867 { 1868 flag aSign, bSign; 1869 1870 aSign = extractFloat32Sign( a ); 1871 bSign = extractFloat32Sign( b ); 1872 if ( aSign == bSign ) { 1873 return subFloat32Sigs( a, b, aSign ); 1874 } 1875 else { 1876 return addFloat32Sigs( a, b, aSign ); 1877 } 1878 1879 } 1880 1881 /* 1882 ------------------------------------------------------------------------------- 1883 Returns the result of multiplying the single-precision floating-point values 1884 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1885 for Binary Floating-Point Arithmetic. 1886 ------------------------------------------------------------------------------- 1887 */ 1888 float32 float32_mul( float32 a, float32 b ) 1889 { 1890 flag aSign, bSign, zSign; 1891 int16 aExp, bExp, zExp; 1892 bits32 aSig, bSig; 1893 bits64 zSig64; 1894 bits32 zSig; 1895 1896 aSig = extractFloat32Frac( a ); 1897 aExp = extractFloat32Exp( a ); 1898 aSign = extractFloat32Sign( a ); 1899 bSig = extractFloat32Frac( b ); 1900 bExp = extractFloat32Exp( b ); 1901 bSign = extractFloat32Sign( b ); 1902 zSign = aSign ^ bSign; 1903 if ( aExp == 0xFF ) { 1904 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1905 return propagateFloat32NaN( a, b ); 1906 } 1907 if ( ( bExp | bSig ) == 0 ) { 1908 float_raise( float_flag_invalid ); 1909 return float32_default_nan; 1910 } 1911 return packFloat32( zSign, 0xFF, 0 ); 1912 } 1913 if ( bExp == 0xFF ) { 1914 if ( bSig ) return propagateFloat32NaN( a, b ); 1915 if ( ( aExp | aSig ) == 0 ) { 1916 float_raise( float_flag_invalid ); 1917 return float32_default_nan; 1918 } 1919 return packFloat32( zSign, 0xFF, 0 ); 1920 } 1921 if ( aExp == 0 ) { 1922 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1923 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1924 } 1925 if ( bExp == 0 ) { 1926 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1927 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1928 } 1929 zExp = aExp + bExp - 0x7F; 1930 aSig = ( aSig | 0x00800000 )<<7; 1931 bSig = ( bSig | 0x00800000 )<<8; 1932 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1933 zSig = zSig64; 1934 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1935 zSig <<= 1; 1936 --zExp; 1937 } 1938 return roundAndPackFloat32( zSign, zExp, zSig ); 1939 1940 } 1941 1942 /* 1943 ------------------------------------------------------------------------------- 1944 Returns the result of dividing the single-precision floating-point value `a' 1945 by the corresponding value `b'. The operation is performed according to the 1946 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1947 ------------------------------------------------------------------------------- 1948 */ 1949 float32 float32_div( float32 a, float32 b ) 1950 { 1951 flag aSign, bSign, zSign; 1952 int16 aExp, bExp, zExp; 1953 bits32 aSig, bSig, zSig; 1954 1955 aSig = extractFloat32Frac( a ); 1956 aExp = extractFloat32Exp( a ); 1957 aSign = extractFloat32Sign( a ); 1958 bSig = extractFloat32Frac( b ); 1959 bExp = extractFloat32Exp( b ); 1960 bSign = extractFloat32Sign( b ); 1961 zSign = aSign ^ bSign; 1962 if ( aExp == 0xFF ) { 1963 if ( aSig ) return propagateFloat32NaN( a, b ); 1964 if ( bExp == 0xFF ) { 1965 if ( bSig ) return propagateFloat32NaN( a, b ); 1966 float_raise( float_flag_invalid ); 1967 return float32_default_nan; 1968 } 1969 return packFloat32( zSign, 0xFF, 0 ); 1970 } 1971 if ( bExp == 0xFF ) { 1972 if ( bSig ) return propagateFloat32NaN( a, b ); 1973 return packFloat32( zSign, 0, 0 ); 1974 } 1975 if ( bExp == 0 ) { 1976 if ( bSig == 0 ) { 1977 if ( ( aExp | aSig ) == 0 ) { 1978 float_raise( float_flag_invalid ); 1979 return float32_default_nan; 1980 } 1981 float_raise( float_flag_divbyzero ); 1982 return packFloat32( zSign, 0xFF, 0 ); 1983 } 1984 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1985 } 1986 if ( aExp == 0 ) { 1987 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1988 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1989 } 1990 zExp = aExp - bExp + 0x7D; 1991 aSig = ( aSig | 0x00800000 )<<7; 1992 bSig = ( bSig | 0x00800000 )<<8; 1993 if ( bSig <= ( aSig + aSig ) ) { 1994 aSig >>= 1; 1995 ++zExp; 1996 } 1997 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 1998 if ( ( zSig & 0x3F ) == 0 ) { 1999 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 2000 } 2001 return roundAndPackFloat32( zSign, zExp, zSig ); 2002 2003 } 2004 2005 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2006 /* 2007 ------------------------------------------------------------------------------- 2008 Returns the remainder of the single-precision floating-point value `a' 2009 with respect to the corresponding value `b'. The operation is performed 2010 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2011 ------------------------------------------------------------------------------- 2012 */ 2013 float32 float32_rem( float32 a, float32 b ) 2014 { 2015 flag aSign, bSign __unused, zSign; 2016 int16 aExp, bExp, expDiff; 2017 bits32 aSig, bSig; 2018 bits32 q; 2019 bits64 aSig64, bSig64, q64; 2020 bits32 alternateASig; 2021 sbits32 sigMean; 2022 2023 aSig = extractFloat32Frac( a ); 2024 aExp = extractFloat32Exp( a ); 2025 aSign = extractFloat32Sign( a ); 2026 bSig = extractFloat32Frac( b ); 2027 bExp = extractFloat32Exp( b ); 2028 bSign = extractFloat32Sign( b ); 2029 if ( aExp == 0xFF ) { 2030 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2031 return propagateFloat32NaN( a, b ); 2032 } 2033 float_raise( float_flag_invalid ); 2034 return float32_default_nan; 2035 } 2036 if ( bExp == 0xFF ) { 2037 if ( bSig ) return propagateFloat32NaN( a, b ); 2038 return a; 2039 } 2040 if ( bExp == 0 ) { 2041 if ( bSig == 0 ) { 2042 float_raise( float_flag_invalid ); 2043 return float32_default_nan; 2044 } 2045 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2046 } 2047 if ( aExp == 0 ) { 2048 if ( aSig == 0 ) return a; 2049 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2050 } 2051 expDiff = aExp - bExp; 2052 aSig |= 0x00800000; 2053 bSig |= 0x00800000; 2054 if ( expDiff < 32 ) { 2055 aSig <<= 8; 2056 bSig <<= 8; 2057 if ( expDiff < 0 ) { 2058 if ( expDiff < -1 ) return a; 2059 aSig >>= 1; 2060 } 2061 q = ( bSig <= aSig ); 2062 if ( q ) aSig -= bSig; 2063 if ( 0 < expDiff ) { 2064 q = ( ( (bits64) aSig )<<32 ) / bSig; 2065 q >>= 32 - expDiff; 2066 bSig >>= 2; 2067 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2068 } 2069 else { 2070 aSig >>= 2; 2071 bSig >>= 2; 2072 } 2073 } 2074 else { 2075 if ( bSig <= aSig ) aSig -= bSig; 2076 aSig64 = ( (bits64) aSig )<<40; 2077 bSig64 = ( (bits64) bSig )<<40; 2078 expDiff -= 64; 2079 while ( 0 < expDiff ) { 2080 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2081 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2082 aSig64 = - ( ( bSig * q64 )<<38 ); 2083 expDiff -= 62; 2084 } 2085 expDiff += 64; 2086 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2087 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2088 q = q64>>( 64 - expDiff ); 2089 bSig <<= 6; 2090 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2091 } 2092 do { 2093 alternateASig = aSig; 2094 ++q; 2095 aSig -= bSig; 2096 } while ( 0 <= (sbits32) aSig ); 2097 sigMean = aSig + alternateASig; 2098 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2099 aSig = alternateASig; 2100 } 2101 zSign = ( (sbits32) aSig < 0 ); 2102 if ( zSign ) aSig = - aSig; 2103 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2104 2105 } 2106 #endif /* !SOFTFLOAT_FOR_GCC */ 2107 2108 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2109 /* 2110 ------------------------------------------------------------------------------- 2111 Returns the square root of the single-precision floating-point value `a'. 2112 The operation is performed according to the IEC/IEEE Standard for Binary 2113 Floating-Point Arithmetic. 2114 ------------------------------------------------------------------------------- 2115 */ 2116 float32 float32_sqrt( float32 a ) 2117 { 2118 flag aSign; 2119 int16 aExp, zExp; 2120 bits32 aSig, zSig; 2121 bits64 rem, term; 2122 2123 aSig = extractFloat32Frac( a ); 2124 aExp = extractFloat32Exp( a ); 2125 aSign = extractFloat32Sign( a ); 2126 if ( aExp == 0xFF ) { 2127 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2128 if ( ! aSign ) return a; 2129 float_raise( float_flag_invalid ); 2130 return float32_default_nan; 2131 } 2132 if ( aSign ) { 2133 if ( ( aExp | aSig ) == 0 ) return a; 2134 float_raise( float_flag_invalid ); 2135 return float32_default_nan; 2136 } 2137 if ( aExp == 0 ) { 2138 if ( aSig == 0 ) return 0; 2139 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2140 } 2141 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2142 aSig = ( aSig | 0x00800000 )<<8; 2143 zSig = estimateSqrt32( aExp, aSig ) + 2; 2144 if ( ( zSig & 0x7F ) <= 5 ) { 2145 if ( zSig < 2 ) { 2146 zSig = 0x7FFFFFFF; 2147 goto roundAndPack; 2148 } 2149 aSig >>= aExp & 1; 2150 term = ( (bits64) zSig ) * zSig; 2151 rem = ( ( (bits64) aSig )<<32 ) - term; 2152 while ( (sbits64) rem < 0 ) { 2153 --zSig; 2154 rem += ( ( (bits64) zSig )<<1 ) | 1; 2155 } 2156 zSig |= ( rem != 0 ); 2157 } 2158 shift32RightJamming( zSig, 1, &zSig ); 2159 roundAndPack: 2160 return roundAndPackFloat32( 0, zExp, zSig ); 2161 2162 } 2163 #endif /* !SOFTFLOAT_FOR_GCC */ 2164 2165 /* 2166 ------------------------------------------------------------------------------- 2167 Returns 1 if the single-precision floating-point value `a' is equal to 2168 the corresponding value `b', and 0 otherwise. The comparison is performed 2169 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2170 ------------------------------------------------------------------------------- 2171 */ 2172 flag float32_eq( float32 a, float32 b ) 2173 { 2174 2175 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2176 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2177 ) { 2178 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2179 float_raise( float_flag_invalid ); 2180 } 2181 return 0; 2182 } 2183 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2184 2185 } 2186 2187 /* 2188 ------------------------------------------------------------------------------- 2189 Returns 1 if the single-precision floating-point value `a' is less than 2190 or equal to the corresponding value `b', and 0 otherwise. The comparison 2191 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2192 Arithmetic. 2193 ------------------------------------------------------------------------------- 2194 */ 2195 flag float32_le( float32 a, float32 b ) 2196 { 2197 flag aSign, bSign; 2198 2199 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2200 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2201 ) { 2202 float_raise( float_flag_invalid ); 2203 return 0; 2204 } 2205 aSign = extractFloat32Sign( a ); 2206 bSign = extractFloat32Sign( b ); 2207 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2208 return ( a == b ) || ( aSign ^ ( a < b ) ); 2209 2210 } 2211 2212 /* 2213 ------------------------------------------------------------------------------- 2214 Returns 1 if the single-precision floating-point value `a' is less than 2215 the corresponding value `b', and 0 otherwise. The comparison is performed 2216 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2217 ------------------------------------------------------------------------------- 2218 */ 2219 flag float32_lt( float32 a, float32 b ) 2220 { 2221 flag aSign, bSign; 2222 2223 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2224 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2225 ) { 2226 float_raise( float_flag_invalid ); 2227 return 0; 2228 } 2229 aSign = extractFloat32Sign( a ); 2230 bSign = extractFloat32Sign( b ); 2231 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2232 return ( a != b ) && ( aSign ^ ( a < b ) ); 2233 2234 } 2235 2236 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2237 /* 2238 ------------------------------------------------------------------------------- 2239 Returns 1 if the single-precision floating-point value `a' is equal to 2240 the corresponding value `b', and 0 otherwise. The invalid exception is 2241 raised if either operand is a NaN. Otherwise, the comparison is performed 2242 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2243 ------------------------------------------------------------------------------- 2244 */ 2245 flag float32_eq_signaling( float32 a, float32 b ) 2246 { 2247 2248 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2249 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2250 ) { 2251 float_raise( float_flag_invalid ); 2252 return 0; 2253 } 2254 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2255 2256 } 2257 2258 /* 2259 ------------------------------------------------------------------------------- 2260 Returns 1 if the single-precision floating-point value `a' is less than or 2261 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2262 cause an exception. Otherwise, the comparison is performed according to the 2263 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2264 ------------------------------------------------------------------------------- 2265 */ 2266 flag float32_le_quiet( float32 a, float32 b ) 2267 { 2268 flag aSign, bSign; 2269 2270 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2271 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2272 ) { 2273 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2274 float_raise( float_flag_invalid ); 2275 } 2276 return 0; 2277 } 2278 aSign = extractFloat32Sign( a ); 2279 bSign = extractFloat32Sign( b ); 2280 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2281 return ( a == b ) || ( aSign ^ ( a < b ) ); 2282 2283 } 2284 2285 /* 2286 ------------------------------------------------------------------------------- 2287 Returns 1 if the single-precision floating-point value `a' is less than 2288 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2289 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2290 Standard for Binary Floating-Point Arithmetic. 2291 ------------------------------------------------------------------------------- 2292 */ 2293 flag float32_lt_quiet( float32 a, float32 b ) 2294 { 2295 flag aSign, bSign; 2296 2297 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2298 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2299 ) { 2300 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2301 float_raise( float_flag_invalid ); 2302 } 2303 return 0; 2304 } 2305 aSign = extractFloat32Sign( a ); 2306 bSign = extractFloat32Sign( b ); 2307 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2308 return ( a != b ) && ( aSign ^ ( a < b ) ); 2309 2310 } 2311 #endif /* !SOFTFLOAT_FOR_GCC */ 2312 2313 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2314 /* 2315 ------------------------------------------------------------------------------- 2316 Returns the result of converting the double-precision floating-point value 2317 `a' to the 32-bit two's complement integer format. The conversion is 2318 performed according to the IEC/IEEE Standard for Binary Floating-Point 2319 Arithmetic---which means in particular that the conversion is rounded 2320 according to the current rounding mode. If `a' is a NaN, the largest 2321 positive integer is returned. Otherwise, if the conversion overflows, the 2322 largest integer with the same sign as `a' is returned. 2323 ------------------------------------------------------------------------------- 2324 */ 2325 int32 float64_to_int32( float64 a ) 2326 { 2327 flag aSign; 2328 int16 aExp, shiftCount; 2329 bits64 aSig; 2330 2331 aSig = extractFloat64Frac( a ); 2332 aExp = extractFloat64Exp( a ); 2333 aSign = extractFloat64Sign( a ); 2334 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2335 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2336 shiftCount = 0x42C - aExp; 2337 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2338 return roundAndPackInt32( aSign, aSig ); 2339 2340 } 2341 #endif /* !SOFTFLOAT_FOR_GCC */ 2342 2343 /* 2344 ------------------------------------------------------------------------------- 2345 Returns the result of converting the double-precision floating-point value 2346 `a' to the 32-bit two's complement integer format. The conversion is 2347 performed according to the IEC/IEEE Standard for Binary Floating-Point 2348 Arithmetic, except that the conversion is always rounded toward zero. 2349 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2350 the conversion overflows, the largest integer with the same sign as `a' is 2351 returned. 2352 ------------------------------------------------------------------------------- 2353 */ 2354 int32 float64_to_int32_round_to_zero( float64 a ) 2355 { 2356 flag aSign; 2357 int16 aExp, shiftCount; 2358 bits64 aSig, savedASig; 2359 int32 z; 2360 2361 aSig = extractFloat64Frac( a ); 2362 aExp = extractFloat64Exp( a ); 2363 aSign = extractFloat64Sign( a ); 2364 if ( 0x41E < aExp ) { 2365 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2366 goto invalid; 2367 } 2368 else if ( aExp < 0x3FF ) { 2369 if ( aExp || aSig ) float_set_inexact(); 2370 return 0; 2371 } 2372 aSig |= LIT64( 0x0010000000000000 ); 2373 shiftCount = 0x433 - aExp; 2374 savedASig = aSig; 2375 aSig >>= shiftCount; 2376 z = aSig; 2377 if ( aSign ) z = - z; 2378 if ( ( z < 0 ) ^ aSign ) { 2379 invalid: 2380 float_raise( float_flag_invalid ); 2381 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2382 } 2383 if ( ( aSig<<shiftCount ) != savedASig ) { 2384 float_set_inexact(); 2385 } 2386 return z; 2387 2388 } 2389 2390 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2391 /* 2392 ------------------------------------------------------------------------------- 2393 Returns the result of converting the double-precision floating-point value 2394 `a' to the 64-bit two's complement integer format. The conversion is 2395 performed according to the IEC/IEEE Standard for Binary Floating-Point 2396 Arithmetic---which means in particular that the conversion is rounded 2397 according to the current rounding mode. If `a' is a NaN, the largest 2398 positive integer is returned. Otherwise, if the conversion overflows, the 2399 largest integer with the same sign as `a' is returned. 2400 ------------------------------------------------------------------------------- 2401 */ 2402 int64 float64_to_int64( float64 a ) 2403 { 2404 flag aSign; 2405 int16 aExp, shiftCount; 2406 bits64 aSig, aSigExtra; 2407 2408 aSig = extractFloat64Frac( a ); 2409 aExp = extractFloat64Exp( a ); 2410 aSign = extractFloat64Sign( a ); 2411 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2412 shiftCount = 0x433 - aExp; 2413 if ( shiftCount <= 0 ) { 2414 if ( 0x43E < aExp ) { 2415 float_raise( float_flag_invalid ); 2416 if ( ! aSign 2417 || ( ( aExp == 0x7FF ) 2418 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2419 ) { 2420 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2421 } 2422 return (sbits64) LIT64( 0x8000000000000000 ); 2423 } 2424 aSigExtra = 0; 2425 aSig <<= - shiftCount; 2426 } 2427 else { 2428 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2429 } 2430 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2431 2432 } 2433 2434 /* like above, but result is unsigned */ 2435 uint64 float64_to_uint64( float64 a ) 2436 { 2437 flag aSign; 2438 int16 aExp, shiftCount; 2439 bits64 aSig, aSigExtra; 2440 2441 aSig = extractFloat64Frac( a ); 2442 aExp = extractFloat64Exp( a ); 2443 aSign = extractFloat64Sign( a ); 2444 2445 if (aSign) { 2446 return float64_to_int64(a); 2447 } 2448 2449 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2450 shiftCount = 0x433 - aExp; 2451 if ( shiftCount <= 0 ) { 2452 if ( 0x43E < aExp ) { 2453 float_raise( float_flag_invalid ); 2454 if ( ! aSign 2455 || ( ( aExp == 0x7FF ) 2456 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2457 ) { 2458 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2459 } 2460 return (sbits64) LIT64( 0x8000000000000000 ); 2461 } 2462 aSigExtra = 0; 2463 aSig <<= - shiftCount; 2464 } 2465 else { 2466 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2467 } 2468 return roundAndPackUInt64( aSig, aSigExtra ); 2469 2470 } 2471 2472 /* 2473 ------------------------------------------------------------------------------- 2474 Returns the result of converting the double-precision floating-point value 2475 `a' to the 64-bit two's complement integer format. The conversion is 2476 performed according to the IEC/IEEE Standard for Binary Floating-Point 2477 Arithmetic, except that the conversion is always rounded toward zero. 2478 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2479 the conversion overflows, the largest integer with the same sign as `a' is 2480 returned. 2481 ------------------------------------------------------------------------------- 2482 */ 2483 int64 float64_to_int64_round_to_zero( float64 a ) 2484 { 2485 flag aSign; 2486 int16 aExp, shiftCount; 2487 bits64 aSig; 2488 int64 z; 2489 2490 aSig = extractFloat64Frac( a ); 2491 aExp = extractFloat64Exp( a ); 2492 aSign = extractFloat64Sign( a ); 2493 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2494 shiftCount = aExp - 0x433; 2495 if ( 0 <= shiftCount ) { 2496 if ( 0x43E <= aExp ) { 2497 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2498 float_raise( float_flag_invalid ); 2499 if ( ! aSign 2500 || ( ( aExp == 0x7FF ) 2501 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2502 ) { 2503 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2504 } 2505 } 2506 return (sbits64) LIT64( 0x8000000000000000 ); 2507 } 2508 z = aSig<<shiftCount; 2509 } 2510 else { 2511 if ( aExp < 0x3FE ) { 2512 if ( aExp | aSig ) float_set_inexact(); 2513 return 0; 2514 } 2515 z = aSig>>( - shiftCount ); 2516 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2517 float_set_inexact(); 2518 } 2519 } 2520 if ( aSign ) z = - z; 2521 return z; 2522 2523 } 2524 #endif /* !SOFTFLOAT_FOR_GCC */ 2525 2526 /* 2527 ------------------------------------------------------------------------------- 2528 Returns the result of converting the double-precision floating-point value 2529 `a' to the single-precision floating-point format. The conversion is 2530 performed according to the IEC/IEEE Standard for Binary Floating-Point 2531 Arithmetic. 2532 ------------------------------------------------------------------------------- 2533 */ 2534 float32 float64_to_float32( float64 a ) 2535 { 2536 flag aSign; 2537 int16 aExp; 2538 bits64 aSig; 2539 bits32 zSig; 2540 2541 aSig = extractFloat64Frac( a ); 2542 aExp = extractFloat64Exp( a ); 2543 aSign = extractFloat64Sign( a ); 2544 if ( aExp == 0x7FF ) { 2545 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2546 return packFloat32( aSign, 0xFF, 0 ); 2547 } 2548 shift64RightJamming( aSig, 22, &aSig ); 2549 zSig = aSig; 2550 if ( aExp || zSig ) { 2551 zSig |= 0x40000000; 2552 aExp -= 0x381; 2553 } 2554 return roundAndPackFloat32( aSign, aExp, zSig ); 2555 2556 } 2557 2558 #ifdef FLOATX80 2559 2560 /* 2561 ------------------------------------------------------------------------------- 2562 Returns the result of converting the double-precision floating-point value 2563 `a' to the extended double-precision floating-point format. The conversion 2564 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2565 Arithmetic. 2566 ------------------------------------------------------------------------------- 2567 */ 2568 floatx80 float64_to_floatx80( float64 a ) 2569 { 2570 flag aSign; 2571 int16 aExp; 2572 bits64 aSig; 2573 2574 aSig = extractFloat64Frac( a ); 2575 aExp = extractFloat64Exp( a ); 2576 aSign = extractFloat64Sign( a ); 2577 if ( aExp == 0x7FF ) { 2578 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2579 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2580 } 2581 if ( aExp == 0 ) { 2582 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2583 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2584 } 2585 return 2586 packFloatx80( 2587 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2588 2589 } 2590 2591 #endif 2592 2593 #ifdef FLOAT128 2594 2595 /* 2596 ------------------------------------------------------------------------------- 2597 Returns the result of converting the double-precision floating-point value 2598 `a' to the quadruple-precision floating-point format. The conversion is 2599 performed according to the IEC/IEEE Standard for Binary Floating-Point 2600 Arithmetic. 2601 ------------------------------------------------------------------------------- 2602 */ 2603 float128 float64_to_float128( float64 a ) 2604 { 2605 flag aSign; 2606 int16 aExp; 2607 bits64 aSig, zSig0, zSig1; 2608 2609 aSig = extractFloat64Frac( a ); 2610 aExp = extractFloat64Exp( a ); 2611 aSign = extractFloat64Sign( a ); 2612 if ( aExp == 0x7FF ) { 2613 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2614 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2615 } 2616 if ( aExp == 0 ) { 2617 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2618 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2619 --aExp; 2620 } 2621 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2622 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2623 2624 } 2625 2626 #endif 2627 2628 #ifndef SOFTFLOAT_FOR_GCC 2629 /* 2630 ------------------------------------------------------------------------------- 2631 Rounds the double-precision floating-point value `a' to an integer, and 2632 returns the result as a double-precision floating-point value. The 2633 operation is performed according to the IEC/IEEE Standard for Binary 2634 Floating-Point Arithmetic. 2635 ------------------------------------------------------------------------------- 2636 */ 2637 float64 float64_round_to_int( float64 a ) 2638 { 2639 flag aSign; 2640 int16 aExp; 2641 bits64 lastBitMask, roundBitsMask; 2642 int8 roundingMode; 2643 float64 z; 2644 2645 aExp = extractFloat64Exp( a ); 2646 if ( 0x433 <= aExp ) { 2647 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2648 return propagateFloat64NaN( a, a ); 2649 } 2650 return a; 2651 } 2652 if ( aExp < 0x3FF ) { 2653 if ( (bits64) ( a<<1 ) == 0 ) return a; 2654 float_set_inexact(); 2655 aSign = extractFloat64Sign( a ); 2656 switch ( float_rounding_mode() ) { 2657 case float_round_nearest_even: 2658 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2659 return packFloat64( aSign, 0x3FF, 0 ); 2660 } 2661 break; 2662 case float_round_down: 2663 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2664 case float_round_up: 2665 return 2666 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2667 } 2668 return packFloat64( aSign, 0, 0 ); 2669 } 2670 lastBitMask = 1; 2671 lastBitMask <<= 0x433 - aExp; 2672 roundBitsMask = lastBitMask - 1; 2673 z = a; 2674 roundingMode = float_rounding_mode(); 2675 if ( roundingMode == float_round_nearest_even ) { 2676 z += lastBitMask>>1; 2677 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2678 } 2679 else if ( roundingMode != float_round_to_zero ) { 2680 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2681 z += roundBitsMask; 2682 } 2683 } 2684 z &= ~ roundBitsMask; 2685 if ( z != a ) float_set_inexact(); 2686 return z; 2687 2688 } 2689 #endif 2690 2691 /* 2692 ------------------------------------------------------------------------------- 2693 Returns the result of adding the absolute values of the double-precision 2694 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2695 before being returned. `zSign' is ignored if the result is a NaN. 2696 The addition is performed according to the IEC/IEEE Standard for Binary 2697 Floating-Point Arithmetic. 2698 ------------------------------------------------------------------------------- 2699 */ 2700 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2701 { 2702 int16 aExp, bExp, zExp; 2703 bits64 aSig, bSig, zSig; 2704 int16 expDiff; 2705 2706 aSig = extractFloat64Frac( a ); 2707 aExp = extractFloat64Exp( a ); 2708 bSig = extractFloat64Frac( b ); 2709 bExp = extractFloat64Exp( b ); 2710 expDiff = aExp - bExp; 2711 aSig <<= 9; 2712 bSig <<= 9; 2713 if ( 0 < expDiff ) { 2714 if ( aExp == 0x7FF ) { 2715 if ( aSig ) return propagateFloat64NaN( a, b ); 2716 return a; 2717 } 2718 if ( bExp == 0 ) { 2719 --expDiff; 2720 } 2721 else { 2722 bSig |= LIT64( 0x2000000000000000 ); 2723 } 2724 shift64RightJamming( bSig, expDiff, &bSig ); 2725 zExp = aExp; 2726 } 2727 else if ( expDiff < 0 ) { 2728 if ( bExp == 0x7FF ) { 2729 if ( bSig ) return propagateFloat64NaN( a, b ); 2730 return packFloat64( zSign, 0x7FF, 0 ); 2731 } 2732 if ( aExp == 0 ) { 2733 ++expDiff; 2734 } 2735 else { 2736 aSig |= LIT64( 0x2000000000000000 ); 2737 } 2738 shift64RightJamming( aSig, - expDiff, &aSig ); 2739 zExp = bExp; 2740 } 2741 else { 2742 if ( aExp == 0x7FF ) { 2743 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2744 return a; 2745 } 2746 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2747 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2748 zExp = aExp; 2749 goto roundAndPack; 2750 } 2751 aSig |= LIT64( 0x2000000000000000 ); 2752 zSig = ( aSig + bSig )<<1; 2753 --zExp; 2754 if ( (sbits64) zSig < 0 ) { 2755 zSig = aSig + bSig; 2756 ++zExp; 2757 } 2758 roundAndPack: 2759 return roundAndPackFloat64( zSign, zExp, zSig ); 2760 2761 } 2762 2763 /* 2764 ------------------------------------------------------------------------------- 2765 Returns the result of subtracting the absolute values of the double- 2766 precision floating-point values `a' and `b'. If `zSign' is 1, the 2767 difference is negated before being returned. `zSign' is ignored if the 2768 result is a NaN. The subtraction is performed according to the IEC/IEEE 2769 Standard for Binary Floating-Point Arithmetic. 2770 ------------------------------------------------------------------------------- 2771 */ 2772 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2773 { 2774 int16 aExp, bExp, zExp; 2775 bits64 aSig, bSig, zSig; 2776 int16 expDiff; 2777 2778 aSig = extractFloat64Frac( a ); 2779 aExp = extractFloat64Exp( a ); 2780 bSig = extractFloat64Frac( b ); 2781 bExp = extractFloat64Exp( b ); 2782 expDiff = aExp - bExp; 2783 aSig <<= 10; 2784 bSig <<= 10; 2785 if ( 0 < expDiff ) goto aExpBigger; 2786 if ( expDiff < 0 ) goto bExpBigger; 2787 if ( aExp == 0x7FF ) { 2788 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2789 float_raise( float_flag_invalid ); 2790 return float64_default_nan; 2791 } 2792 if ( aExp == 0 ) { 2793 aExp = 1; 2794 bExp = 1; 2795 } 2796 if ( bSig < aSig ) goto aBigger; 2797 if ( aSig < bSig ) goto bBigger; 2798 return packFloat64( float_rounding_mode() == float_round_down, 0, 0 ); 2799 bExpBigger: 2800 if ( bExp == 0x7FF ) { 2801 if ( bSig ) return propagateFloat64NaN( a, b ); 2802 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2803 } 2804 if ( aExp == 0 ) { 2805 ++expDiff; 2806 } 2807 else { 2808 aSig |= LIT64( 0x4000000000000000 ); 2809 } 2810 shift64RightJamming( aSig, - expDiff, &aSig ); 2811 bSig |= LIT64( 0x4000000000000000 ); 2812 bBigger: 2813 zSig = bSig - aSig; 2814 zExp = bExp; 2815 zSign ^= 1; 2816 goto normalizeRoundAndPack; 2817 aExpBigger: 2818 if ( aExp == 0x7FF ) { 2819 if ( aSig ) return propagateFloat64NaN( a, b ); 2820 return a; 2821 } 2822 if ( bExp == 0 ) { 2823 --expDiff; 2824 } 2825 else { 2826 bSig |= LIT64( 0x4000000000000000 ); 2827 } 2828 shift64RightJamming( bSig, expDiff, &bSig ); 2829 aSig |= LIT64( 0x4000000000000000 ); 2830 aBigger: 2831 zSig = aSig - bSig; 2832 zExp = aExp; 2833 normalizeRoundAndPack: 2834 --zExp; 2835 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2836 2837 } 2838 2839 /* 2840 ------------------------------------------------------------------------------- 2841 Returns the result of adding the double-precision floating-point values `a' 2842 and `b'. The operation is performed according to the IEC/IEEE Standard for 2843 Binary Floating-Point Arithmetic. 2844 ------------------------------------------------------------------------------- 2845 */ 2846 float64 float64_add( float64 a, float64 b ) 2847 { 2848 flag aSign, bSign; 2849 2850 aSign = extractFloat64Sign( a ); 2851 bSign = extractFloat64Sign( b ); 2852 if ( aSign == bSign ) { 2853 return addFloat64Sigs( a, b, aSign ); 2854 } 2855 else { 2856 return subFloat64Sigs( a, b, aSign ); 2857 } 2858 2859 } 2860 2861 /* 2862 ------------------------------------------------------------------------------- 2863 Returns the result of subtracting the double-precision floating-point values 2864 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2865 for Binary Floating-Point Arithmetic. 2866 ------------------------------------------------------------------------------- 2867 */ 2868 float64 float64_sub( float64 a, float64 b ) 2869 { 2870 flag aSign, bSign; 2871 2872 aSign = extractFloat64Sign( a ); 2873 bSign = extractFloat64Sign( b ); 2874 if ( aSign == bSign ) { 2875 return subFloat64Sigs( a, b, aSign ); 2876 } 2877 else { 2878 return addFloat64Sigs( a, b, aSign ); 2879 } 2880 2881 } 2882 2883 /* 2884 ------------------------------------------------------------------------------- 2885 Returns the result of multiplying the double-precision floating-point values 2886 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2887 for Binary Floating-Point Arithmetic. 2888 ------------------------------------------------------------------------------- 2889 */ 2890 float64 float64_mul( float64 a, float64 b ) 2891 { 2892 flag aSign, bSign, zSign; 2893 int16 aExp, bExp, zExp; 2894 bits64 aSig, bSig, zSig0, zSig1; 2895 2896 aSig = extractFloat64Frac( a ); 2897 aExp = extractFloat64Exp( a ); 2898 aSign = extractFloat64Sign( a ); 2899 bSig = extractFloat64Frac( b ); 2900 bExp = extractFloat64Exp( b ); 2901 bSign = extractFloat64Sign( b ); 2902 zSign = aSign ^ bSign; 2903 if ( aExp == 0x7FF ) { 2904 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2905 return propagateFloat64NaN( a, b ); 2906 } 2907 if ( ( bExp | bSig ) == 0 ) { 2908 float_raise( float_flag_invalid ); 2909 return float64_default_nan; 2910 } 2911 return packFloat64( zSign, 0x7FF, 0 ); 2912 } 2913 if ( bExp == 0x7FF ) { 2914 if ( bSig ) return propagateFloat64NaN( a, b ); 2915 if ( ( aExp | aSig ) == 0 ) { 2916 float_raise( float_flag_invalid ); 2917 return float64_default_nan; 2918 } 2919 return packFloat64( zSign, 0x7FF, 0 ); 2920 } 2921 if ( aExp == 0 ) { 2922 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2923 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2924 } 2925 if ( bExp == 0 ) { 2926 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2927 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2928 } 2929 zExp = aExp + bExp - 0x3FF; 2930 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2931 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2932 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2933 zSig0 |= ( zSig1 != 0 ); 2934 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2935 zSig0 <<= 1; 2936 --zExp; 2937 } 2938 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2939 2940 } 2941 2942 /* 2943 ------------------------------------------------------------------------------- 2944 Returns the result of dividing the double-precision floating-point value `a' 2945 by the corresponding value `b'. The operation is performed according to 2946 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2947 ------------------------------------------------------------------------------- 2948 */ 2949 float64 float64_div( float64 a, float64 b ) 2950 { 2951 flag aSign, bSign, zSign; 2952 int16 aExp, bExp, zExp; 2953 bits64 aSig, bSig, zSig; 2954 bits64 rem0, rem1; 2955 bits64 term0, term1; 2956 2957 aSig = extractFloat64Frac( a ); 2958 aExp = extractFloat64Exp( a ); 2959 aSign = extractFloat64Sign( a ); 2960 bSig = extractFloat64Frac( b ); 2961 bExp = extractFloat64Exp( b ); 2962 bSign = extractFloat64Sign( b ); 2963 zSign = aSign ^ bSign; 2964 if ( aExp == 0x7FF ) { 2965 if ( aSig ) return propagateFloat64NaN( a, b ); 2966 if ( bExp == 0x7FF ) { 2967 if ( bSig ) return propagateFloat64NaN( a, b ); 2968 float_raise( float_flag_invalid ); 2969 return float64_default_nan; 2970 } 2971 return packFloat64( zSign, 0x7FF, 0 ); 2972 } 2973 if ( bExp == 0x7FF ) { 2974 if ( bSig ) return propagateFloat64NaN( a, b ); 2975 return packFloat64( zSign, 0, 0 ); 2976 } 2977 if ( bExp == 0 ) { 2978 if ( bSig == 0 ) { 2979 if ( ( aExp | aSig ) == 0 ) { 2980 float_raise( float_flag_invalid ); 2981 return float64_default_nan; 2982 } 2983 float_raise( float_flag_divbyzero ); 2984 return packFloat64( zSign, 0x7FF, 0 ); 2985 } 2986 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2987 } 2988 if ( aExp == 0 ) { 2989 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2990 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2991 } 2992 zExp = aExp - bExp + 0x3FD; 2993 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2994 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2995 if ( bSig <= ( aSig + aSig ) ) { 2996 aSig >>= 1; 2997 ++zExp; 2998 } 2999 zSig = estimateDiv128To64( aSig, 0, bSig ); 3000 if ( ( zSig & 0x1FF ) <= 2 ) { 3001 mul64To128( bSig, zSig, &term0, &term1 ); 3002 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3003 while ( (sbits64) rem0 < 0 ) { 3004 --zSig; 3005 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3006 } 3007 zSig |= ( rem1 != 0 ); 3008 } 3009 return roundAndPackFloat64( zSign, zExp, zSig ); 3010 3011 } 3012 3013 #ifndef SOFTFLOAT_FOR_GCC 3014 /* 3015 ------------------------------------------------------------------------------- 3016 Returns the remainder of the double-precision floating-point value `a' 3017 with respect to the corresponding value `b'. The operation is performed 3018 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3019 ------------------------------------------------------------------------------- 3020 */ 3021 float64 float64_rem( float64 a, float64 b ) 3022 { 3023 flag aSign, bSign __unused, zSign; 3024 int16 aExp, bExp, expDiff; 3025 bits64 aSig, bSig; 3026 bits64 q, alternateASig; 3027 sbits64 sigMean; 3028 3029 aSig = extractFloat64Frac( a ); 3030 aExp = extractFloat64Exp( a ); 3031 aSign = extractFloat64Sign( a ); 3032 bSig = extractFloat64Frac( b ); 3033 bExp = extractFloat64Exp( b ); 3034 bSign = extractFloat64Sign( b ); 3035 if ( aExp == 0x7FF ) { 3036 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3037 return propagateFloat64NaN( a, b ); 3038 } 3039 float_raise( float_flag_invalid ); 3040 return float64_default_nan; 3041 } 3042 if ( bExp == 0x7FF ) { 3043 if ( bSig ) return propagateFloat64NaN( a, b ); 3044 return a; 3045 } 3046 if ( bExp == 0 ) { 3047 if ( bSig == 0 ) { 3048 float_raise( float_flag_invalid ); 3049 return float64_default_nan; 3050 } 3051 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3052 } 3053 if ( aExp == 0 ) { 3054 if ( aSig == 0 ) return a; 3055 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3056 } 3057 expDiff = aExp - bExp; 3058 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3059 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3060 if ( expDiff < 0 ) { 3061 if ( expDiff < -1 ) return a; 3062 aSig >>= 1; 3063 } 3064 q = ( bSig <= aSig ); 3065 if ( q ) aSig -= bSig; 3066 expDiff -= 64; 3067 while ( 0 < expDiff ) { 3068 q = estimateDiv128To64( aSig, 0, bSig ); 3069 q = ( 2 < q ) ? q - 2 : 0; 3070 aSig = - ( ( bSig>>2 ) * q ); 3071 expDiff -= 62; 3072 } 3073 expDiff += 64; 3074 if ( 0 < expDiff ) { 3075 q = estimateDiv128To64( aSig, 0, bSig ); 3076 q = ( 2 < q ) ? q - 2 : 0; 3077 q >>= 64 - expDiff; 3078 bSig >>= 2; 3079 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3080 } 3081 else { 3082 aSig >>= 2; 3083 bSig >>= 2; 3084 } 3085 do { 3086 alternateASig = aSig; 3087 ++q; 3088 aSig -= bSig; 3089 } while ( 0 <= (sbits64) aSig ); 3090 sigMean = aSig + alternateASig; 3091 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3092 aSig = alternateASig; 3093 } 3094 zSign = ( (sbits64) aSig < 0 ); 3095 if ( zSign ) aSig = - aSig; 3096 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3097 3098 } 3099 3100 /* 3101 ------------------------------------------------------------------------------- 3102 Returns the square root of the double-precision floating-point value `a'. 3103 The operation is performed according to the IEC/IEEE Standard for Binary 3104 Floating-Point Arithmetic. 3105 ------------------------------------------------------------------------------- 3106 */ 3107 float64 float64_sqrt( float64 a ) 3108 { 3109 flag aSign; 3110 int16 aExp, zExp; 3111 bits64 aSig, zSig, doubleZSig; 3112 bits64 rem0, rem1, term0, term1; 3113 3114 aSig = extractFloat64Frac( a ); 3115 aExp = extractFloat64Exp( a ); 3116 aSign = extractFloat64Sign( a ); 3117 if ( aExp == 0x7FF ) { 3118 if ( aSig ) return propagateFloat64NaN( a, a ); 3119 if ( ! aSign ) return a; 3120 float_raise( float_flag_invalid ); 3121 return float64_default_nan; 3122 } 3123 if ( aSign ) { 3124 if ( ( aExp | aSig ) == 0 ) return a; 3125 float_raise( float_flag_invalid ); 3126 return float64_default_nan; 3127 } 3128 if ( aExp == 0 ) { 3129 if ( aSig == 0 ) return 0; 3130 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3131 } 3132 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3133 aSig |= LIT64( 0x0010000000000000 ); 3134 zSig = estimateSqrt32( aExp, aSig>>21 ); 3135 aSig <<= 9 - ( aExp & 1 ); 3136 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3137 if ( ( zSig & 0x1FF ) <= 5 ) { 3138 doubleZSig = zSig<<1; 3139 mul64To128( zSig, zSig, &term0, &term1 ); 3140 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3141 while ( (sbits64) rem0 < 0 ) { 3142 --zSig; 3143 doubleZSig -= 2; 3144 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3145 } 3146 zSig |= ( ( rem0 | rem1 ) != 0 ); 3147 } 3148 return roundAndPackFloat64( 0, zExp, zSig ); 3149 3150 } 3151 #endif 3152 3153 /* 3154 ------------------------------------------------------------------------------- 3155 Returns 1 if the double-precision floating-point value `a' is equal to the 3156 corresponding value `b', and 0 otherwise. The comparison is performed 3157 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3158 ------------------------------------------------------------------------------- 3159 */ 3160 flag float64_eq( float64 a, float64 b ) 3161 { 3162 3163 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3164 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3165 ) { 3166 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3167 float_raise( float_flag_invalid ); 3168 } 3169 return 0; 3170 } 3171 return ( a == b ) || 3172 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3173 3174 } 3175 3176 /* 3177 ------------------------------------------------------------------------------- 3178 Returns 1 if the double-precision floating-point value `a' is less than or 3179 equal to the corresponding value `b', and 0 otherwise. The comparison is 3180 performed according to the IEC/IEEE Standard for Binary Floating-Point 3181 Arithmetic. 3182 ------------------------------------------------------------------------------- 3183 */ 3184 flag float64_le( float64 a, float64 b ) 3185 { 3186 flag aSign, bSign; 3187 3188 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3189 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3190 ) { 3191 float_raise( float_flag_invalid ); 3192 return 0; 3193 } 3194 aSign = extractFloat64Sign( a ); 3195 bSign = extractFloat64Sign( b ); 3196 if ( aSign != bSign ) 3197 return aSign || 3198 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3199 0 ); 3200 return ( a == b ) || 3201 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3202 3203 } 3204 3205 /* 3206 ------------------------------------------------------------------------------- 3207 Returns 1 if the double-precision floating-point value `a' is less than 3208 the corresponding value `b', and 0 otherwise. The comparison is performed 3209 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3210 ------------------------------------------------------------------------------- 3211 */ 3212 flag float64_lt( float64 a, float64 b ) 3213 { 3214 flag aSign, bSign; 3215 3216 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3217 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3218 ) { 3219 float_raise( float_flag_invalid ); 3220 return 0; 3221 } 3222 aSign = extractFloat64Sign( a ); 3223 bSign = extractFloat64Sign( b ); 3224 if ( aSign != bSign ) 3225 return aSign && 3226 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3227 0 ); 3228 return ( a != b ) && 3229 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3230 3231 } 3232 3233 #ifndef SOFTFLOAT_FOR_GCC 3234 /* 3235 ------------------------------------------------------------------------------- 3236 Returns 1 if the double-precision floating-point value `a' is equal to the 3237 corresponding value `b', and 0 otherwise. The invalid exception is raised 3238 if either operand is a NaN. Otherwise, the comparison is performed 3239 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3240 ------------------------------------------------------------------------------- 3241 */ 3242 flag float64_eq_signaling( float64 a, float64 b ) 3243 { 3244 3245 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3246 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3247 ) { 3248 float_raise( float_flag_invalid ); 3249 return 0; 3250 } 3251 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3252 3253 } 3254 3255 /* 3256 ------------------------------------------------------------------------------- 3257 Returns 1 if the double-precision floating-point value `a' is less than or 3258 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3259 cause an exception. Otherwise, the comparison is performed according to the 3260 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3261 ------------------------------------------------------------------------------- 3262 */ 3263 flag float64_le_quiet( float64 a, float64 b ) 3264 { 3265 flag aSign, bSign; 3266 3267 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3268 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3269 ) { 3270 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3271 float_raise( float_flag_invalid ); 3272 } 3273 return 0; 3274 } 3275 aSign = extractFloat64Sign( a ); 3276 bSign = extractFloat64Sign( b ); 3277 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3278 return ( a == b ) || ( aSign ^ ( a < b ) ); 3279 3280 } 3281 3282 /* 3283 ------------------------------------------------------------------------------- 3284 Returns 1 if the double-precision floating-point value `a' is less than 3285 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3286 exception. Otherwise, the comparison is performed according to the IEC/IEEE 3287 Standard for Binary Floating-Point Arithmetic. 3288 ------------------------------------------------------------------------------- 3289 */ 3290 flag float64_lt_quiet( float64 a, float64 b ) 3291 { 3292 flag aSign, bSign; 3293 3294 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3295 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3296 ) { 3297 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3298 float_raise( float_flag_invalid ); 3299 } 3300 return 0; 3301 } 3302 aSign = extractFloat64Sign( a ); 3303 bSign = extractFloat64Sign( b ); 3304 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3305 return ( a != b ) && ( aSign ^ ( a < b ) ); 3306 3307 } 3308 #endif 3309 3310 #ifdef FLOATX80 3311 3312 /* 3313 ------------------------------------------------------------------------------- 3314 Returns the result of converting the extended double-precision floating- 3315 point value `a' to the 32-bit two's complement integer format. The 3316 conversion is performed according to the IEC/IEEE Standard for Binary 3317 Floating-Point Arithmetic---which means in particular that the conversion 3318 is rounded according to the current rounding mode. If `a' is a NaN, the 3319 largest positive integer is returned. Otherwise, if the conversion 3320 overflows, the largest integer with the same sign as `a' is returned. 3321 ------------------------------------------------------------------------------- 3322 */ 3323 int32 floatx80_to_int32( floatx80 a ) 3324 { 3325 flag aSign; 3326 int32 aExp, shiftCount; 3327 bits64 aSig; 3328 3329 aSig = extractFloatx80Frac( a ); 3330 aExp = extractFloatx80Exp( a ); 3331 aSign = extractFloatx80Sign( a ); 3332 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3333 shiftCount = 0x4037 - aExp; 3334 if ( shiftCount <= 0 ) shiftCount = 1; 3335 shift64RightJamming( aSig, shiftCount, &aSig ); 3336 return roundAndPackInt32( aSign, aSig ); 3337 3338 } 3339 3340 /* 3341 ------------------------------------------------------------------------------- 3342 Returns the result of converting the extended double-precision floating- 3343 point value `a' to the 32-bit two's complement integer format. The 3344 conversion is performed according to the IEC/IEEE Standard for Binary 3345 Floating-Point Arithmetic, except that the conversion is always rounded 3346 toward zero. If `a' is a NaN, the largest positive integer is returned. 3347 Otherwise, if the conversion overflows, the largest integer with the same 3348 sign as `a' is returned. 3349 ------------------------------------------------------------------------------- 3350 */ 3351 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3352 { 3353 flag aSign; 3354 int32 aExp, shiftCount; 3355 bits64 aSig, savedASig; 3356 int32 z; 3357 3358 aSig = extractFloatx80Frac( a ); 3359 aExp = extractFloatx80Exp( a ); 3360 aSign = extractFloatx80Sign( a ); 3361 if ( 0x401E < aExp ) { 3362 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3363 goto invalid; 3364 } 3365 else if ( aExp < 0x3FFF ) { 3366 if ( aExp || aSig ) float_set_inexact(); 3367 return 0; 3368 } 3369 shiftCount = 0x403E - aExp; 3370 savedASig = aSig; 3371 aSig >>= shiftCount; 3372 z = aSig; 3373 if ( aSign ) z = - z; 3374 if ( ( z < 0 ) ^ aSign ) { 3375 invalid: 3376 float_raise( float_flag_invalid ); 3377 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3378 } 3379 if ( ( aSig<<shiftCount ) != savedASig ) { 3380 float_set_inexact(); 3381 } 3382 return z; 3383 3384 } 3385 3386 /* 3387 ------------------------------------------------------------------------------- 3388 Returns the result of converting the extended double-precision floating- 3389 point value `a' to the 64-bit two's complement integer format. The 3390 conversion is performed according to the IEC/IEEE Standard for Binary 3391 Floating-Point Arithmetic---which means in particular that the conversion 3392 is rounded according to the current rounding mode. If `a' is a NaN, 3393 the largest positive integer is returned. Otherwise, if the conversion 3394 overflows, the largest integer with the same sign as `a' is returned. 3395 ------------------------------------------------------------------------------- 3396 */ 3397 int64 floatx80_to_int64( floatx80 a ) 3398 { 3399 flag aSign; 3400 int32 aExp, shiftCount; 3401 bits64 aSig, aSigExtra; 3402 3403 aSig = extractFloatx80Frac( a ); 3404 aExp = extractFloatx80Exp( a ); 3405 aSign = extractFloatx80Sign( a ); 3406 shiftCount = 0x403E - aExp; 3407 if ( shiftCount <= 0 ) { 3408 if ( shiftCount ) { 3409 float_raise( float_flag_invalid ); 3410 if ( ! aSign 3411 || ( ( aExp == 0x7FFF ) 3412 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3413 ) { 3414 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3415 } 3416 return (sbits64) LIT64( 0x8000000000000000 ); 3417 } 3418 aSigExtra = 0; 3419 } 3420 else { 3421 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3422 } 3423 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3424 3425 } 3426 3427 /* 3428 ------------------------------------------------------------------------------- 3429 Returns the result of converting the extended double-precision floating- 3430 point value `a' to the 64-bit two's complement integer format. The 3431 conversion is performed according to the IEC/IEEE Standard for Binary 3432 Floating-Point Arithmetic, except that the conversion is always rounded 3433 toward zero. If `a' is a NaN, the largest positive integer is returned. 3434 Otherwise, if the conversion overflows, the largest integer with the same 3435 sign as `a' is returned. 3436 ------------------------------------------------------------------------------- 3437 */ 3438 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3439 { 3440 flag aSign; 3441 int32 aExp, shiftCount; 3442 bits64 aSig; 3443 int64 z; 3444 3445 aSig = extractFloatx80Frac( a ); 3446 aExp = extractFloatx80Exp( a ); 3447 aSign = extractFloatx80Sign( a ); 3448 shiftCount = aExp - 0x403E; 3449 if ( 0 <= shiftCount ) { 3450 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3451 if ( ( a.high != 0xC03E ) || aSig ) { 3452 float_raise( float_flag_invalid ); 3453 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3454 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3455 } 3456 } 3457 return (sbits64) LIT64( 0x8000000000000000 ); 3458 } 3459 else if ( aExp < 0x3FFF ) { 3460 if ( aExp | aSig ) float_set_inexact(); 3461 return 0; 3462 } 3463 z = aSig>>( - shiftCount ); 3464 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3465 float_set_inexact(); 3466 } 3467 if ( aSign ) z = - z; 3468 return z; 3469 3470 } 3471 3472 /* 3473 ------------------------------------------------------------------------------- 3474 Returns the result of converting the extended double-precision floating- 3475 point value `a' to the single-precision floating-point format. The 3476 conversion is performed according to the IEC/IEEE Standard for Binary 3477 Floating-Point Arithmetic. 3478 ------------------------------------------------------------------------------- 3479 */ 3480 float32 floatx80_to_float32( floatx80 a ) 3481 { 3482 flag aSign; 3483 int32 aExp; 3484 bits64 aSig; 3485 3486 aSig = extractFloatx80Frac( a ); 3487 aExp = extractFloatx80Exp( a ); 3488 aSign = extractFloatx80Sign( a ); 3489 if ( aExp == 0x7FFF ) { 3490 if ( (bits64) ( aSig<<1 ) ) { 3491 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3492 } 3493 return packFloat32( aSign, 0xFF, 0 ); 3494 } 3495 shift64RightJamming( aSig, 33, &aSig ); 3496 if ( aExp || aSig ) aExp -= 0x3F81; 3497 return roundAndPackFloat32( aSign, aExp, aSig ); 3498 3499 } 3500 3501 /* 3502 ------------------------------------------------------------------------------- 3503 Returns the result of converting the extended double-precision floating- 3504 point value `a' to the double-precision floating-point format. The 3505 conversion is performed according to the IEC/IEEE Standard for Binary 3506 Floating-Point Arithmetic. 3507 ------------------------------------------------------------------------------- 3508 */ 3509 float64 floatx80_to_float64( floatx80 a ) 3510 { 3511 flag aSign; 3512 int32 aExp; 3513 bits64 aSig, zSig; 3514 3515 aSig = extractFloatx80Frac( a ); 3516 aExp = extractFloatx80Exp( a ); 3517 aSign = extractFloatx80Sign( a ); 3518 if ( aExp == 0x7FFF ) { 3519 if ( (bits64) ( aSig<<1 ) ) { 3520 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3521 } 3522 return packFloat64( aSign, 0x7FF, 0 ); 3523 } 3524 shift64RightJamming( aSig, 1, &zSig ); 3525 if ( aExp || aSig ) aExp -= 0x3C01; 3526 return roundAndPackFloat64( aSign, aExp, zSig ); 3527 3528 } 3529 3530 #ifdef FLOAT128 3531 3532 /* 3533 ------------------------------------------------------------------------------- 3534 Returns the result of converting the extended double-precision floating- 3535 point value `a' to the quadruple-precision floating-point format. The 3536 conversion is performed according to the IEC/IEEE Standard for Binary 3537 Floating-Point Arithmetic. 3538 ------------------------------------------------------------------------------- 3539 */ 3540 float128 floatx80_to_float128( floatx80 a ) 3541 { 3542 flag aSign; 3543 int16 aExp; 3544 bits64 aSig, zSig0, zSig1; 3545 3546 aSig = extractFloatx80Frac( a ); 3547 aExp = extractFloatx80Exp( a ); 3548 aSign = extractFloatx80Sign( a ); 3549 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3550 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3551 } 3552 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3553 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3554 3555 } 3556 3557 #endif 3558 3559 /* 3560 ------------------------------------------------------------------------------- 3561 Rounds the extended double-precision floating-point value `a' to an integer, 3562 and returns the result as an extended quadruple-precision floating-point 3563 value. The operation is performed according to the IEC/IEEE Standard for 3564 Binary Floating-Point Arithmetic. 3565 ------------------------------------------------------------------------------- 3566 */ 3567 floatx80 floatx80_round_to_int( floatx80 a ) 3568 { 3569 flag aSign; 3570 int32 aExp; 3571 bits64 lastBitMask, roundBitsMask; 3572 int8 roundingMode; 3573 floatx80 z; 3574 3575 aExp = extractFloatx80Exp( a ); 3576 if ( 0x403E <= aExp ) { 3577 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3578 return propagateFloatx80NaN( a, a ); 3579 } 3580 return a; 3581 } 3582 if ( aExp < 0x3FFF ) { 3583 if ( ( aExp == 0 ) 3584 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3585 return a; 3586 } 3587 float_set_inexact(); 3588 aSign = extractFloatx80Sign( a ); 3589 switch ( float_rounding_mode() ) { 3590 case float_round_nearest_even: 3591 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3592 ) { 3593 return 3594 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3595 } 3596 break; 3597 case float_round_down: 3598 return 3599 aSign ? 3600 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3601 : packFloatx80( 0, 0, 0 ); 3602 case float_round_up: 3603 return 3604 aSign ? packFloatx80( 1, 0, 0 ) 3605 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3606 } 3607 return packFloatx80( aSign, 0, 0 ); 3608 } 3609 lastBitMask = 1; 3610 lastBitMask <<= 0x403E - aExp; 3611 roundBitsMask = lastBitMask - 1; 3612 z = a; 3613 roundingMode = float_rounding_mode(); 3614 if ( roundingMode == float_round_nearest_even ) { 3615 z.low += lastBitMask>>1; 3616 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3617 } 3618 else if ( roundingMode != float_round_to_zero ) { 3619 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3620 z.low += roundBitsMask; 3621 } 3622 } 3623 z.low &= ~ roundBitsMask; 3624 if ( z.low == 0 ) { 3625 ++z.high; 3626 z.low = LIT64( 0x8000000000000000 ); 3627 } 3628 if ( z.low != a.low ) float_set_inexact(); 3629 return z; 3630 3631 } 3632 3633 /* 3634 ------------------------------------------------------------------------------- 3635 Returns the result of adding the absolute values of the extended double- 3636 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3637 negated before being returned. `zSign' is ignored if the result is a NaN. 3638 The addition is performed according to the IEC/IEEE Standard for Binary 3639 Floating-Point Arithmetic. 3640 ------------------------------------------------------------------------------- 3641 */ 3642 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3643 { 3644 int32 aExp, bExp, zExp; 3645 bits64 aSig, bSig, zSig0, zSig1; 3646 int32 expDiff; 3647 3648 aSig = extractFloatx80Frac( a ); 3649 aExp = extractFloatx80Exp( a ); 3650 bSig = extractFloatx80Frac( b ); 3651 bExp = extractFloatx80Exp( b ); 3652 expDiff = aExp - bExp; 3653 if ( 0 < expDiff ) { 3654 if ( aExp == 0x7FFF ) { 3655 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3656 return a; 3657 } 3658 if ( bExp == 0 ) --expDiff; 3659 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3660 zExp = aExp; 3661 } 3662 else if ( expDiff < 0 ) { 3663 if ( bExp == 0x7FFF ) { 3664 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3665 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3666 } 3667 if ( aExp == 0 ) ++expDiff; 3668 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3669 zExp = bExp; 3670 } 3671 else { 3672 if ( aExp == 0x7FFF ) { 3673 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3674 return propagateFloatx80NaN( a, b ); 3675 } 3676 return a; 3677 } 3678 zSig1 = 0; 3679 zSig0 = aSig + bSig; 3680 if ( aExp == 0 ) { 3681 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3682 goto roundAndPack; 3683 } 3684 zExp = aExp; 3685 goto shiftRight1; 3686 } 3687 zSig0 = aSig + bSig; 3688 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3689 shiftRight1: 3690 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3691 zSig0 |= LIT64( 0x8000000000000000 ); 3692 ++zExp; 3693 roundAndPack: 3694 return 3695 roundAndPackFloatx80( 3696 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3697 3698 } 3699 3700 /* 3701 ------------------------------------------------------------------------------- 3702 Returns the result of subtracting the absolute values of the extended 3703 double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3704 difference is negated before being returned. `zSign' is ignored if the 3705 result is a NaN. The subtraction is performed according to the IEC/IEEE 3706 Standard for Binary Floating-Point Arithmetic. 3707 ------------------------------------------------------------------------------- 3708 */ 3709 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3710 { 3711 int32 aExp, bExp, zExp; 3712 bits64 aSig, bSig, zSig0, zSig1; 3713 int32 expDiff; 3714 floatx80 z; 3715 3716 aSig = extractFloatx80Frac( a ); 3717 aExp = extractFloatx80Exp( a ); 3718 bSig = extractFloatx80Frac( b ); 3719 bExp = extractFloatx80Exp( b ); 3720 expDiff = aExp - bExp; 3721 if ( 0 < expDiff ) goto aExpBigger; 3722 if ( expDiff < 0 ) goto bExpBigger; 3723 if ( aExp == 0x7FFF ) { 3724 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3725 return propagateFloatx80NaN( a, b ); 3726 } 3727 float_raise( float_flag_invalid ); 3728 z.low = floatx80_default_nan_low; 3729 z.high = floatx80_default_nan_high; 3730 return z; 3731 } 3732 if ( aExp == 0 ) { 3733 aExp = 1; 3734 bExp = 1; 3735 } 3736 zSig1 = 0; 3737 if ( bSig < aSig ) goto aBigger; 3738 if ( aSig < bSig ) goto bBigger; 3739 return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 ); 3740 bExpBigger: 3741 if ( bExp == 0x7FFF ) { 3742 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3743 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3744 } 3745 if ( aExp == 0 ) ++expDiff; 3746 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3747 bBigger: 3748 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3749 zExp = bExp; 3750 zSign ^= 1; 3751 goto normalizeRoundAndPack; 3752 aExpBigger: 3753 if ( aExp == 0x7FFF ) { 3754 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3755 return a; 3756 } 3757 if ( bExp == 0 ) --expDiff; 3758 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3759 aBigger: 3760 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3761 zExp = aExp; 3762 normalizeRoundAndPack: 3763 return 3764 normalizeRoundAndPackFloatx80( 3765 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3766 3767 } 3768 3769 /* 3770 ------------------------------------------------------------------------------- 3771 Returns the result of adding the extended double-precision floating-point 3772 values `a' and `b'. The operation is performed according to the IEC/IEEE 3773 Standard for Binary Floating-Point Arithmetic. 3774 ------------------------------------------------------------------------------- 3775 */ 3776 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3777 { 3778 flag aSign, bSign; 3779 3780 aSign = extractFloatx80Sign( a ); 3781 bSign = extractFloatx80Sign( b ); 3782 if ( aSign == bSign ) { 3783 return addFloatx80Sigs( a, b, aSign ); 3784 } 3785 else { 3786 return subFloatx80Sigs( a, b, aSign ); 3787 } 3788 3789 } 3790 3791 /* 3792 ------------------------------------------------------------------------------- 3793 Returns the result of subtracting the extended double-precision floating- 3794 point values `a' and `b'. The operation is performed according to the 3795 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3796 ------------------------------------------------------------------------------- 3797 */ 3798 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3799 { 3800 flag aSign, bSign; 3801 3802 aSign = extractFloatx80Sign( a ); 3803 bSign = extractFloatx80Sign( b ); 3804 if ( aSign == bSign ) { 3805 return subFloatx80Sigs( a, b, aSign ); 3806 } 3807 else { 3808 return addFloatx80Sigs( a, b, aSign ); 3809 } 3810 3811 } 3812 3813 /* 3814 ------------------------------------------------------------------------------- 3815 Returns the result of multiplying the extended double-precision floating- 3816 point values `a' and `b'. The operation is performed according to the 3817 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3818 ------------------------------------------------------------------------------- 3819 */ 3820 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3821 { 3822 flag aSign, bSign, zSign; 3823 int32 aExp, bExp, zExp; 3824 bits64 aSig, bSig, zSig0, zSig1; 3825 floatx80 z; 3826 3827 aSig = extractFloatx80Frac( a ); 3828 aExp = extractFloatx80Exp( a ); 3829 aSign = extractFloatx80Sign( a ); 3830 bSig = extractFloatx80Frac( b ); 3831 bExp = extractFloatx80Exp( b ); 3832 bSign = extractFloatx80Sign( b ); 3833 zSign = aSign ^ bSign; 3834 if ( aExp == 0x7FFF ) { 3835 if ( (bits64) ( aSig<<1 ) 3836 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3837 return propagateFloatx80NaN( a, b ); 3838 } 3839 if ( ( bExp | bSig ) == 0 ) goto invalid; 3840 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3841 } 3842 if ( bExp == 0x7FFF ) { 3843 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3844 if ( ( aExp | aSig ) == 0 ) { 3845 invalid: 3846 float_raise( float_flag_invalid ); 3847 z.low = floatx80_default_nan_low; 3848 z.high = floatx80_default_nan_high; 3849 return z; 3850 } 3851 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3852 } 3853 if ( aExp == 0 ) { 3854 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3855 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3856 } 3857 if ( bExp == 0 ) { 3858 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3859 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3860 } 3861 zExp = aExp + bExp - 0x3FFE; 3862 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3863 if ( 0 < (sbits64) zSig0 ) { 3864 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3865 --zExp; 3866 } 3867 return 3868 roundAndPackFloatx80( 3869 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3870 3871 } 3872 3873 /* 3874 ------------------------------------------------------------------------------- 3875 Returns the result of dividing the extended double-precision floating-point 3876 value `a' by the corresponding value `b'. The operation is performed 3877 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3878 ------------------------------------------------------------------------------- 3879 */ 3880 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3881 { 3882 flag aSign, bSign, zSign; 3883 int32 aExp, bExp, zExp; 3884 bits64 aSig, bSig, zSig0, zSig1; 3885 bits64 rem0, rem1, rem2, term0, term1, term2; 3886 floatx80 z; 3887 3888 aSig = extractFloatx80Frac( a ); 3889 aExp = extractFloatx80Exp( a ); 3890 aSign = extractFloatx80Sign( a ); 3891 bSig = extractFloatx80Frac( b ); 3892 bExp = extractFloatx80Exp( b ); 3893 bSign = extractFloatx80Sign( b ); 3894 zSign = aSign ^ bSign; 3895 if ( aExp == 0x7FFF ) { 3896 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3897 if ( bExp == 0x7FFF ) { 3898 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3899 goto invalid; 3900 } 3901 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3902 } 3903 if ( bExp == 0x7FFF ) { 3904 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3905 return packFloatx80( zSign, 0, 0 ); 3906 } 3907 if ( bExp == 0 ) { 3908 if ( bSig == 0 ) { 3909 if ( ( aExp | aSig ) == 0 ) { 3910 invalid: 3911 float_raise( float_flag_invalid ); 3912 z.low = floatx80_default_nan_low; 3913 z.high = floatx80_default_nan_high; 3914 return z; 3915 } 3916 float_raise( float_flag_divbyzero ); 3917 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3918 } 3919 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3920 } 3921 if ( aExp == 0 ) { 3922 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3923 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3924 } 3925 zExp = aExp - bExp + 0x3FFE; 3926 rem1 = 0; 3927 if ( bSig <= aSig ) { 3928 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3929 ++zExp; 3930 } 3931 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3932 mul64To128( bSig, zSig0, &term0, &term1 ); 3933 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3934 while ( (sbits64) rem0 < 0 ) { 3935 --zSig0; 3936 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3937 } 3938 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3939 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3940 mul64To128( bSig, zSig1, &term1, &term2 ); 3941 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3942 while ( (sbits64) rem1 < 0 ) { 3943 --zSig1; 3944 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3945 } 3946 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3947 } 3948 return 3949 roundAndPackFloatx80( 3950 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3951 3952 } 3953 3954 /* 3955 ------------------------------------------------------------------------------- 3956 Returns the remainder of the extended double-precision floating-point value 3957 `a' with respect to the corresponding value `b'. The operation is performed 3958 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3959 ------------------------------------------------------------------------------- 3960 */ 3961 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3962 { 3963 flag aSign, bSign, zSign; 3964 int32 aExp, bExp, expDiff; 3965 bits64 aSig0, aSig1, bSig; 3966 bits64 q, term0, term1, alternateASig0, alternateASig1; 3967 floatx80 z; 3968 3969 aSig0 = extractFloatx80Frac( a ); 3970 aExp = extractFloatx80Exp( a ); 3971 aSign = extractFloatx80Sign( a ); 3972 bSig = extractFloatx80Frac( b ); 3973 bExp = extractFloatx80Exp( b ); 3974 bSign = extractFloatx80Sign( b ); 3975 if ( aExp == 0x7FFF ) { 3976 if ( (bits64) ( aSig0<<1 ) 3977 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3978 return propagateFloatx80NaN( a, b ); 3979 } 3980 goto invalid; 3981 } 3982 if ( bExp == 0x7FFF ) { 3983 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3984 return a; 3985 } 3986 if ( bExp == 0 ) { 3987 if ( bSig == 0 ) { 3988 invalid: 3989 float_raise( float_flag_invalid ); 3990 z.low = floatx80_default_nan_low; 3991 z.high = floatx80_default_nan_high; 3992 return z; 3993 } 3994 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3995 } 3996 if ( aExp == 0 ) { 3997 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3998 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3999 } 4000 bSig |= LIT64( 0x8000000000000000 ); 4001 zSign = aSign; 4002 expDiff = aExp - bExp; 4003 aSig1 = 0; 4004 if ( expDiff < 0 ) { 4005 if ( expDiff < -1 ) return a; 4006 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 4007 expDiff = 0; 4008 } 4009 q = ( bSig <= aSig0 ); 4010 if ( q ) aSig0 -= bSig; 4011 expDiff -= 64; 4012 while ( 0 < expDiff ) { 4013 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4014 q = ( 2 < q ) ? q - 2 : 0; 4015 mul64To128( bSig, q, &term0, &term1 ); 4016 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4017 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 4018 expDiff -= 62; 4019 } 4020 expDiff += 64; 4021 if ( 0 < expDiff ) { 4022 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4023 q = ( 2 < q ) ? q - 2 : 0; 4024 q >>= 64 - expDiff; 4025 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 4026 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4027 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 4028 while ( le128( term0, term1, aSig0, aSig1 ) ) { 4029 ++q; 4030 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4031 } 4032 } 4033 else { 4034 term1 = 0; 4035 term0 = bSig; 4036 } 4037 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 4038 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4039 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4040 && ( q & 1 ) ) 4041 ) { 4042 aSig0 = alternateASig0; 4043 aSig1 = alternateASig1; 4044 zSign = ! zSign; 4045 } 4046 return 4047 normalizeRoundAndPackFloatx80( 4048 80, zSign, bExp + expDiff, aSig0, aSig1 ); 4049 4050 } 4051 4052 /* 4053 ------------------------------------------------------------------------------- 4054 Returns the square root of the extended double-precision floating-point 4055 value `a'. The operation is performed according to the IEC/IEEE Standard 4056 for Binary Floating-Point Arithmetic. 4057 ------------------------------------------------------------------------------- 4058 */ 4059 floatx80 floatx80_sqrt( floatx80 a ) 4060 { 4061 flag aSign; 4062 int32 aExp, zExp; 4063 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 4064 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4065 floatx80 z; 4066 4067 aSig0 = extractFloatx80Frac( a ); 4068 aExp = extractFloatx80Exp( a ); 4069 aSign = extractFloatx80Sign( a ); 4070 if ( aExp == 0x7FFF ) { 4071 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 4072 if ( ! aSign ) return a; 4073 goto invalid; 4074 } 4075 if ( aSign ) { 4076 if ( ( aExp | aSig0 ) == 0 ) return a; 4077 invalid: 4078 float_raise( float_flag_invalid ); 4079 z.low = floatx80_default_nan_low; 4080 z.high = floatx80_default_nan_high; 4081 return z; 4082 } 4083 if ( aExp == 0 ) { 4084 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4085 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4086 } 4087 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4088 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4089 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4090 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4091 doubleZSig0 = zSig0<<1; 4092 mul64To128( zSig0, zSig0, &term0, &term1 ); 4093 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4094 while ( (sbits64) rem0 < 0 ) { 4095 --zSig0; 4096 doubleZSig0 -= 2; 4097 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4098 } 4099 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4100 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4101 if ( zSig1 == 0 ) zSig1 = 1; 4102 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4103 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4104 mul64To128( zSig1, zSig1, &term2, &term3 ); 4105 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4106 while ( (sbits64) rem1 < 0 ) { 4107 --zSig1; 4108 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4109 term3 |= 1; 4110 term2 |= doubleZSig0; 4111 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4112 } 4113 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4114 } 4115 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4116 zSig0 |= doubleZSig0; 4117 return 4118 roundAndPackFloatx80( 4119 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4120 4121 } 4122 4123 /* 4124 ------------------------------------------------------------------------------- 4125 Returns 1 if the extended double-precision floating-point value `a' is 4126 equal to the corresponding value `b', and 0 otherwise. The comparison is 4127 performed according to the IEC/IEEE Standard for Binary Floating-Point 4128 Arithmetic. 4129 ------------------------------------------------------------------------------- 4130 */ 4131 flag floatx80_eq( floatx80 a, floatx80 b ) 4132 { 4133 4134 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4135 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4136 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4137 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4138 ) { 4139 if ( floatx80_is_signaling_nan( a ) 4140 || floatx80_is_signaling_nan( b ) ) { 4141 float_raise( float_flag_invalid ); 4142 } 4143 return 0; 4144 } 4145 return 4146 ( a.low == b.low ) 4147 && ( ( a.high == b.high ) 4148 || ( ( a.low == 0 ) 4149 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4150 ); 4151 4152 } 4153 4154 /* 4155 ------------------------------------------------------------------------------- 4156 Returns 1 if the extended double-precision floating-point value `a' is 4157 less than or equal to the corresponding value `b', and 0 otherwise. The 4158 comparison is performed according to the IEC/IEEE Standard for Binary 4159 Floating-Point Arithmetic. 4160 ------------------------------------------------------------------------------- 4161 */ 4162 flag floatx80_le( floatx80 a, floatx80 b ) 4163 { 4164 flag aSign, bSign; 4165 4166 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4167 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4168 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4169 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4170 ) { 4171 float_raise( float_flag_invalid ); 4172 return 0; 4173 } 4174 aSign = extractFloatx80Sign( a ); 4175 bSign = extractFloatx80Sign( b ); 4176 if ( aSign != bSign ) { 4177 return 4178 aSign 4179 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4180 == 0 ); 4181 } 4182 return 4183 aSign ? le128( b.high, b.low, a.high, a.low ) 4184 : le128( a.high, a.low, b.high, b.low ); 4185 4186 } 4187 4188 /* 4189 ------------------------------------------------------------------------------- 4190 Returns 1 if the extended double-precision floating-point value `a' is 4191 less than the corresponding value `b', and 0 otherwise. The comparison 4192 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4193 Arithmetic. 4194 ------------------------------------------------------------------------------- 4195 */ 4196 flag floatx80_lt( floatx80 a, floatx80 b ) 4197 { 4198 flag aSign, bSign; 4199 4200 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4201 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4202 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4203 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4204 ) { 4205 float_raise( float_flag_invalid ); 4206 return 0; 4207 } 4208 aSign = extractFloatx80Sign( a ); 4209 bSign = extractFloatx80Sign( b ); 4210 if ( aSign != bSign ) { 4211 return 4212 aSign 4213 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4214 != 0 ); 4215 } 4216 return 4217 aSign ? lt128( b.high, b.low, a.high, a.low ) 4218 : lt128( a.high, a.low, b.high, b.low ); 4219 4220 } 4221 4222 /* 4223 ------------------------------------------------------------------------------- 4224 Returns 1 if the extended double-precision floating-point value `a' is equal 4225 to the corresponding value `b', and 0 otherwise. The invalid exception is 4226 raised if either operand is a NaN. Otherwise, the comparison is performed 4227 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4228 ------------------------------------------------------------------------------- 4229 */ 4230 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4231 { 4232 4233 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4234 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4235 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4236 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4237 ) { 4238 float_raise( float_flag_invalid ); 4239 return 0; 4240 } 4241 return 4242 ( a.low == b.low ) 4243 && ( ( a.high == b.high ) 4244 || ( ( a.low == 0 ) 4245 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4246 ); 4247 4248 } 4249 4250 /* 4251 ------------------------------------------------------------------------------- 4252 Returns 1 if the extended double-precision floating-point value `a' is less 4253 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4254 do not cause an exception. Otherwise, the comparison is performed according 4255 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4256 ------------------------------------------------------------------------------- 4257 */ 4258 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4259 { 4260 flag aSign, bSign; 4261 4262 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4263 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4264 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4265 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4266 ) { 4267 if ( floatx80_is_signaling_nan( a ) 4268 || floatx80_is_signaling_nan( b ) ) { 4269 float_raise( float_flag_invalid ); 4270 } 4271 return 0; 4272 } 4273 aSign = extractFloatx80Sign( a ); 4274 bSign = extractFloatx80Sign( b ); 4275 if ( aSign != bSign ) { 4276 return 4277 aSign 4278 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4279 == 0 ); 4280 } 4281 return 4282 aSign ? le128( b.high, b.low, a.high, a.low ) 4283 : le128( a.high, a.low, b.high, b.low ); 4284 4285 } 4286 4287 /* 4288 ------------------------------------------------------------------------------- 4289 Returns 1 if the extended double-precision floating-point value `a' is less 4290 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4291 an exception. Otherwise, the comparison is performed according to the 4292 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4293 ------------------------------------------------------------------------------- 4294 */ 4295 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4296 { 4297 flag aSign, bSign; 4298 4299 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4300 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4301 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4302 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4303 ) { 4304 if ( floatx80_is_signaling_nan( a ) 4305 || floatx80_is_signaling_nan( b ) ) { 4306 float_raise( float_flag_invalid ); 4307 } 4308 return 0; 4309 } 4310 aSign = extractFloatx80Sign( a ); 4311 bSign = extractFloatx80Sign( b ); 4312 if ( aSign != bSign ) { 4313 return 4314 aSign 4315 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4316 != 0 ); 4317 } 4318 return 4319 aSign ? lt128( b.high, b.low, a.high, a.low ) 4320 : lt128( a.high, a.low, b.high, b.low ); 4321 4322 } 4323 4324 #endif 4325 4326 #ifdef FLOAT128 4327 4328 /* 4329 ------------------------------------------------------------------------------- 4330 Returns the result of converting the quadruple-precision floating-point 4331 value `a' to the 32-bit two's complement integer format. The conversion 4332 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4333 Arithmetic---which means in particular that the conversion is rounded 4334 according to the current rounding mode. If `a' is a NaN, the largest 4335 positive integer is returned. Otherwise, if the conversion overflows, the 4336 largest integer with the same sign as `a' is returned. 4337 ------------------------------------------------------------------------------- 4338 */ 4339 int32 float128_to_int32( float128 a ) 4340 { 4341 flag aSign; 4342 int32 aExp, shiftCount; 4343 bits64 aSig0, aSig1; 4344 4345 aSig1 = extractFloat128Frac1( a ); 4346 aSig0 = extractFloat128Frac0( a ); 4347 aExp = extractFloat128Exp( a ); 4348 aSign = extractFloat128Sign( a ); 4349 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4350 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4351 aSig0 |= ( aSig1 != 0 ); 4352 shiftCount = 0x4028 - aExp; 4353 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4354 return roundAndPackInt32( aSign, aSig0 ); 4355 4356 } 4357 4358 /* 4359 ------------------------------------------------------------------------------- 4360 Returns the result of converting the quadruple-precision floating-point 4361 value `a' to the 32-bit two's complement integer format. The conversion 4362 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4363 Arithmetic, except that the conversion is always rounded toward zero. If 4364 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4365 conversion overflows, the largest integer with the same sign as `a' is 4366 returned. 4367 ------------------------------------------------------------------------------- 4368 */ 4369 int32 float128_to_int32_round_to_zero( float128 a ) 4370 { 4371 flag aSign; 4372 int32 aExp, shiftCount; 4373 bits64 aSig0, aSig1, savedASig; 4374 int32 z; 4375 4376 aSig1 = extractFloat128Frac1( a ); 4377 aSig0 = extractFloat128Frac0( a ); 4378 aExp = extractFloat128Exp( a ); 4379 aSign = extractFloat128Sign( a ); 4380 aSig0 |= ( aSig1 != 0 ); 4381 if ( 0x401E < aExp ) { 4382 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4383 goto invalid; 4384 } 4385 else if ( aExp < 0x3FFF ) { 4386 if ( aExp || aSig0 ) float_set_inexact(); 4387 return 0; 4388 } 4389 aSig0 |= LIT64( 0x0001000000000000 ); 4390 shiftCount = 0x402F - aExp; 4391 savedASig = aSig0; 4392 aSig0 >>= shiftCount; 4393 z = aSig0; 4394 if ( aSign ) z = - z; 4395 if ( ( z < 0 ) ^ aSign ) { 4396 invalid: 4397 float_raise( float_flag_invalid ); 4398 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4399 } 4400 if ( ( aSig0<<shiftCount ) != savedASig ) { 4401 float_set_inexact(); 4402 } 4403 return z; 4404 4405 } 4406 4407 /* 4408 ------------------------------------------------------------------------------- 4409 Returns the result of converting the quadruple-precision floating-point 4410 value `a' to the 64-bit two's complement integer format. The conversion 4411 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4412 Arithmetic---which means in particular that the conversion is rounded 4413 according to the current rounding mode. If `a' is a NaN, the largest 4414 positive integer is returned. Otherwise, if the conversion overflows, the 4415 largest integer with the same sign as `a' is returned. 4416 ------------------------------------------------------------------------------- 4417 */ 4418 int64 float128_to_int64( float128 a ) 4419 { 4420 flag aSign; 4421 int32 aExp, shiftCount; 4422 bits64 aSig0, aSig1; 4423 4424 aSig1 = extractFloat128Frac1( a ); 4425 aSig0 = extractFloat128Frac0( a ); 4426 aExp = extractFloat128Exp( a ); 4427 aSign = extractFloat128Sign( a ); 4428 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4429 shiftCount = 0x402F - aExp; 4430 if ( shiftCount <= 0 ) { 4431 if ( 0x403E < aExp ) { 4432 float_raise( float_flag_invalid ); 4433 if ( ! aSign 4434 || ( ( aExp == 0x7FFF ) 4435 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4436 ) 4437 ) { 4438 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4439 } 4440 return (sbits64) LIT64( 0x8000000000000000 ); 4441 } 4442 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4443 } 4444 else { 4445 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4446 } 4447 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4448 4449 } 4450 4451 /* 4452 ------------------------------------------------------------------------------- 4453 Returns the result of converting the quadruple-precision floating-point 4454 value `a' to the 64-bit two's complement integer format. The conversion 4455 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4456 Arithmetic, except that the conversion is always rounded toward zero. 4457 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4458 the conversion overflows, the largest integer with the same sign as `a' is 4459 returned. 4460 ------------------------------------------------------------------------------- 4461 */ 4462 int64 float128_to_int64_round_to_zero( float128 a ) 4463 { 4464 flag aSign; 4465 int32 aExp, shiftCount; 4466 bits64 aSig0, aSig1; 4467 int64 z; 4468 4469 aSig1 = extractFloat128Frac1( a ); 4470 aSig0 = extractFloat128Frac0( a ); 4471 aExp = extractFloat128Exp( a ); 4472 aSign = extractFloat128Sign( a ); 4473 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4474 shiftCount = aExp - 0x402F; 4475 if ( 0 < shiftCount ) { 4476 if ( 0x403E <= aExp ) { 4477 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4478 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4479 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4480 if ( aSig1 ) float_set_inexact(); 4481 } 4482 else { 4483 float_raise( float_flag_invalid ); 4484 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4485 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4486 } 4487 } 4488 return (sbits64) LIT64( 0x8000000000000000 ); 4489 } 4490 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4491 if ( (bits64) ( aSig1<<shiftCount ) ) { 4492 float_set_inexact(); 4493 } 4494 } 4495 else { 4496 if ( aExp < 0x3FFF ) { 4497 if ( aExp | aSig0 | aSig1 ) { 4498 float_set_inexact(); 4499 } 4500 return 0; 4501 } 4502 z = aSig0>>( - shiftCount ); 4503 if ( aSig1 4504 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4505 float_set_inexact(); 4506 } 4507 } 4508 if ( aSign ) z = - z; 4509 return z; 4510 4511 } 4512 4513 /* 4514 ------------------------------------------------------------------------------- 4515 Returns the result of converting the quadruple-precision floating-point 4516 value `a' to the single-precision floating-point format. The conversion 4517 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4518 Arithmetic. 4519 ------------------------------------------------------------------------------- 4520 */ 4521 float32 float128_to_float32( float128 a ) 4522 { 4523 flag aSign; 4524 int32 aExp; 4525 bits64 aSig0, aSig1; 4526 bits32 zSig; 4527 4528 aSig1 = extractFloat128Frac1( a ); 4529 aSig0 = extractFloat128Frac0( a ); 4530 aExp = extractFloat128Exp( a ); 4531 aSign = extractFloat128Sign( a ); 4532 if ( aExp == 0x7FFF ) { 4533 if ( aSig0 | aSig1 ) { 4534 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4535 } 4536 return packFloat32( aSign, 0xFF, 0 ); 4537 } 4538 aSig0 |= ( aSig1 != 0 ); 4539 shift64RightJamming( aSig0, 18, &aSig0 ); 4540 zSig = aSig0; 4541 if ( aExp || zSig ) { 4542 zSig |= 0x40000000; 4543 aExp -= 0x3F81; 4544 } 4545 return roundAndPackFloat32( aSign, aExp, zSig ); 4546 4547 } 4548 4549 /* 4550 ------------------------------------------------------------------------------- 4551 Returns the result of converting the quadruple-precision floating-point 4552 value `a' to the double-precision floating-point format. The conversion 4553 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4554 Arithmetic. 4555 ------------------------------------------------------------------------------- 4556 */ 4557 float64 float128_to_float64( float128 a ) 4558 { 4559 flag aSign; 4560 int32 aExp; 4561 bits64 aSig0, aSig1; 4562 4563 aSig1 = extractFloat128Frac1( a ); 4564 aSig0 = extractFloat128Frac0( a ); 4565 aExp = extractFloat128Exp( a ); 4566 aSign = extractFloat128Sign( a ); 4567 if ( aExp == 0x7FFF ) { 4568 if ( aSig0 | aSig1 ) { 4569 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4570 } 4571 return packFloat64( aSign, 0x7FF, 0 ); 4572 } 4573 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4574 aSig0 |= ( aSig1 != 0 ); 4575 if ( aExp || aSig0 ) { 4576 aSig0 |= LIT64( 0x4000000000000000 ); 4577 aExp -= 0x3C01; 4578 } 4579 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4580 4581 } 4582 4583 #ifdef FLOATX80 4584 4585 /* 4586 ------------------------------------------------------------------------------- 4587 Returns the result of converting the quadruple-precision floating-point 4588 value `a' to the extended double-precision floating-point format. The 4589 conversion is performed according to the IEC/IEEE Standard for Binary 4590 Floating-Point Arithmetic. 4591 ------------------------------------------------------------------------------- 4592 */ 4593 floatx80 float128_to_floatx80( float128 a ) 4594 { 4595 flag aSign; 4596 int32 aExp; 4597 bits64 aSig0, aSig1; 4598 4599 aSig1 = extractFloat128Frac1( a ); 4600 aSig0 = extractFloat128Frac0( a ); 4601 aExp = extractFloat128Exp( a ); 4602 aSign = extractFloat128Sign( a ); 4603 if ( aExp == 0x7FFF ) { 4604 if ( aSig0 | aSig1 ) { 4605 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4606 } 4607 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4608 } 4609 if ( aExp == 0 ) { 4610 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4611 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4612 } 4613 else { 4614 aSig0 |= LIT64( 0x0001000000000000 ); 4615 } 4616 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4617 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4618 4619 } 4620 4621 #endif 4622 4623 /* 4624 ------------------------------------------------------------------------------- 4625 Rounds the quadruple-precision floating-point value `a' to an integer, and 4626 returns the result as a quadruple-precision floating-point value. The 4627 operation is performed according to the IEC/IEEE Standard for Binary 4628 Floating-Point Arithmetic. 4629 ------------------------------------------------------------------------------- 4630 */ 4631 float128 float128_round_to_int( float128 a ) 4632 { 4633 flag aSign; 4634 int32 aExp; 4635 bits64 lastBitMask, roundBitsMask; 4636 int8 roundingMode; 4637 float128 z; 4638 4639 aExp = extractFloat128Exp( a ); 4640 if ( 0x402F <= aExp ) { 4641 if ( 0x406F <= aExp ) { 4642 if ( ( aExp == 0x7FFF ) 4643 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4644 ) { 4645 return propagateFloat128NaN( a, a ); 4646 } 4647 return a; 4648 } 4649 lastBitMask = 1; 4650 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4651 roundBitsMask = lastBitMask - 1; 4652 z = a; 4653 roundingMode = float_rounding_mode(); 4654 if ( roundingMode == float_round_nearest_even ) { 4655 if ( lastBitMask ) { 4656 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4657 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4658 } 4659 else { 4660 if ( (sbits64) z.low < 0 ) { 4661 ++z.high; 4662 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4663 } 4664 } 4665 } 4666 else if ( roundingMode != float_round_to_zero ) { 4667 if ( extractFloat128Sign( z ) 4668 ^ ( roundingMode == float_round_up ) ) { 4669 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4670 } 4671 } 4672 z.low &= ~ roundBitsMask; 4673 } 4674 else { 4675 if ( aExp < 0x3FFF ) { 4676 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4677 float_set_inexact(); 4678 aSign = extractFloat128Sign( a ); 4679 switch ( float_rounding_mode() ) { 4680 case float_round_nearest_even: 4681 if ( ( aExp == 0x3FFE ) 4682 && ( extractFloat128Frac0( a ) 4683 | extractFloat128Frac1( a ) ) 4684 ) { 4685 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4686 } 4687 break; 4688 case float_round_down: 4689 return 4690 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4691 : packFloat128( 0, 0, 0, 0 ); 4692 case float_round_up: 4693 return 4694 aSign ? packFloat128( 1, 0, 0, 0 ) 4695 : packFloat128( 0, 0x3FFF, 0, 0 ); 4696 } 4697 return packFloat128( aSign, 0, 0, 0 ); 4698 } 4699 lastBitMask = 1; 4700 lastBitMask <<= 0x402F - aExp; 4701 roundBitsMask = lastBitMask - 1; 4702 z.low = 0; 4703 z.high = a.high; 4704 roundingMode = float_rounding_mode(); 4705 if ( roundingMode == float_round_nearest_even ) { 4706 z.high += lastBitMask>>1; 4707 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4708 z.high &= ~ lastBitMask; 4709 } 4710 } 4711 else if ( roundingMode != float_round_to_zero ) { 4712 if ( extractFloat128Sign( z ) 4713 ^ ( roundingMode == float_round_up ) ) { 4714 z.high |= ( a.low != 0 ); 4715 z.high += roundBitsMask; 4716 } 4717 } 4718 z.high &= ~ roundBitsMask; 4719 } 4720 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4721 float_set_inexact(); 4722 } 4723 return z; 4724 4725 } 4726 4727 /* 4728 ------------------------------------------------------------------------------- 4729 Returns the result of adding the absolute values of the quadruple-precision 4730 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4731 before being returned. `zSign' is ignored if the result is a NaN. 4732 The addition is performed according to the IEC/IEEE Standard for Binary 4733 Floating-Point Arithmetic. 4734 ------------------------------------------------------------------------------- 4735 */ 4736 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4737 { 4738 int32 aExp, bExp, zExp; 4739 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4740 int32 expDiff; 4741 4742 aSig1 = extractFloat128Frac1( a ); 4743 aSig0 = extractFloat128Frac0( a ); 4744 aExp = extractFloat128Exp( a ); 4745 bSig1 = extractFloat128Frac1( b ); 4746 bSig0 = extractFloat128Frac0( b ); 4747 bExp = extractFloat128Exp( b ); 4748 expDiff = aExp - bExp; 4749 if ( 0 < expDiff ) { 4750 if ( aExp == 0x7FFF ) { 4751 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4752 return a; 4753 } 4754 if ( bExp == 0 ) { 4755 --expDiff; 4756 } 4757 else { 4758 bSig0 |= LIT64( 0x0001000000000000 ); 4759 } 4760 shift128ExtraRightJamming( 4761 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4762 zExp = aExp; 4763 } 4764 else if ( expDiff < 0 ) { 4765 if ( bExp == 0x7FFF ) { 4766 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4767 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4768 } 4769 if ( aExp == 0 ) { 4770 ++expDiff; 4771 } 4772 else { 4773 aSig0 |= LIT64( 0x0001000000000000 ); 4774 } 4775 shift128ExtraRightJamming( 4776 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4777 zExp = bExp; 4778 } 4779 else { 4780 if ( aExp == 0x7FFF ) { 4781 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4782 return propagateFloat128NaN( a, b ); 4783 } 4784 return a; 4785 } 4786 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4787 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4788 zSig2 = 0; 4789 zSig0 |= LIT64( 0x0002000000000000 ); 4790 zExp = aExp; 4791 goto shiftRight1; 4792 } 4793 aSig0 |= LIT64( 0x0001000000000000 ); 4794 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4795 --zExp; 4796 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4797 ++zExp; 4798 shiftRight1: 4799 shift128ExtraRightJamming( 4800 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4801 roundAndPack: 4802 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4803 4804 } 4805 4806 /* 4807 ------------------------------------------------------------------------------- 4808 Returns the result of subtracting the absolute values of the quadruple- 4809 precision floating-point values `a' and `b'. If `zSign' is 1, the 4810 difference is negated before being returned. `zSign' is ignored if the 4811 result is a NaN. The subtraction is performed according to the IEC/IEEE 4812 Standard for Binary Floating-Point Arithmetic. 4813 ------------------------------------------------------------------------------- 4814 */ 4815 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4816 { 4817 int32 aExp, bExp, zExp; 4818 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4819 int32 expDiff; 4820 float128 z; 4821 4822 aSig1 = extractFloat128Frac1( a ); 4823 aSig0 = extractFloat128Frac0( a ); 4824 aExp = extractFloat128Exp( a ); 4825 bSig1 = extractFloat128Frac1( b ); 4826 bSig0 = extractFloat128Frac0( b ); 4827 bExp = extractFloat128Exp( b ); 4828 expDiff = aExp - bExp; 4829 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4830 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4831 if ( 0 < expDiff ) goto aExpBigger; 4832 if ( expDiff < 0 ) goto bExpBigger; 4833 if ( aExp == 0x7FFF ) { 4834 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4835 return propagateFloat128NaN( a, b ); 4836 } 4837 float_raise( float_flag_invalid ); 4838 z.low = float128_default_nan_low; 4839 z.high = float128_default_nan_high; 4840 return z; 4841 } 4842 if ( aExp == 0 ) { 4843 aExp = 1; 4844 bExp = 1; 4845 } 4846 if ( bSig0 < aSig0 ) goto aBigger; 4847 if ( aSig0 < bSig0 ) goto bBigger; 4848 if ( bSig1 < aSig1 ) goto aBigger; 4849 if ( aSig1 < bSig1 ) goto bBigger; 4850 return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 ); 4851 bExpBigger: 4852 if ( bExp == 0x7FFF ) { 4853 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4854 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4855 } 4856 if ( aExp == 0 ) { 4857 ++expDiff; 4858 } 4859 else { 4860 aSig0 |= LIT64( 0x4000000000000000 ); 4861 } 4862 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4863 bSig0 |= LIT64( 0x4000000000000000 ); 4864 bBigger: 4865 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4866 zExp = bExp; 4867 zSign ^= 1; 4868 goto normalizeRoundAndPack; 4869 aExpBigger: 4870 if ( aExp == 0x7FFF ) { 4871 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4872 return a; 4873 } 4874 if ( bExp == 0 ) { 4875 --expDiff; 4876 } 4877 else { 4878 bSig0 |= LIT64( 0x4000000000000000 ); 4879 } 4880 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4881 aSig0 |= LIT64( 0x4000000000000000 ); 4882 aBigger: 4883 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4884 zExp = aExp; 4885 normalizeRoundAndPack: 4886 --zExp; 4887 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4888 4889 } 4890 4891 /* 4892 ------------------------------------------------------------------------------- 4893 Returns the result of adding the quadruple-precision floating-point values 4894 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4895 for Binary Floating-Point Arithmetic. 4896 ------------------------------------------------------------------------------- 4897 */ 4898 float128 float128_add( float128 a, float128 b ) 4899 { 4900 flag aSign, bSign; 4901 4902 aSign = extractFloat128Sign( a ); 4903 bSign = extractFloat128Sign( b ); 4904 if ( aSign == bSign ) { 4905 return addFloat128Sigs( a, b, aSign ); 4906 } 4907 else { 4908 return subFloat128Sigs( a, b, aSign ); 4909 } 4910 4911 } 4912 4913 /* 4914 ------------------------------------------------------------------------------- 4915 Returns the result of subtracting the quadruple-precision floating-point 4916 values `a' and `b'. The operation is performed according to the IEC/IEEE 4917 Standard for Binary Floating-Point Arithmetic. 4918 ------------------------------------------------------------------------------- 4919 */ 4920 float128 float128_sub( float128 a, float128 b ) 4921 { 4922 flag aSign, bSign; 4923 4924 aSign = extractFloat128Sign( a ); 4925 bSign = extractFloat128Sign( b ); 4926 if ( aSign == bSign ) { 4927 return subFloat128Sigs( a, b, aSign ); 4928 } 4929 else { 4930 return addFloat128Sigs( a, b, aSign ); 4931 } 4932 4933 } 4934 4935 /* 4936 ------------------------------------------------------------------------------- 4937 Returns the result of multiplying the quadruple-precision floating-point 4938 values `a' and `b'. The operation is performed according to the IEC/IEEE 4939 Standard for Binary Floating-Point Arithmetic. 4940 ------------------------------------------------------------------------------- 4941 */ 4942 float128 float128_mul( float128 a, float128 b ) 4943 { 4944 flag aSign, bSign, zSign; 4945 int32 aExp, bExp, zExp; 4946 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4947 float128 z; 4948 4949 aSig1 = extractFloat128Frac1( a ); 4950 aSig0 = extractFloat128Frac0( a ); 4951 aExp = extractFloat128Exp( a ); 4952 aSign = extractFloat128Sign( a ); 4953 bSig1 = extractFloat128Frac1( b ); 4954 bSig0 = extractFloat128Frac0( b ); 4955 bExp = extractFloat128Exp( b ); 4956 bSign = extractFloat128Sign( b ); 4957 zSign = aSign ^ bSign; 4958 if ( aExp == 0x7FFF ) { 4959 if ( ( aSig0 | aSig1 ) 4960 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4961 return propagateFloat128NaN( a, b ); 4962 } 4963 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4964 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4965 } 4966 if ( bExp == 0x7FFF ) { 4967 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4968 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4969 invalid: 4970 float_raise( float_flag_invalid ); 4971 z.low = float128_default_nan_low; 4972 z.high = float128_default_nan_high; 4973 return z; 4974 } 4975 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4976 } 4977 if ( aExp == 0 ) { 4978 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4979 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4980 } 4981 if ( bExp == 0 ) { 4982 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4983 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4984 } 4985 zExp = aExp + bExp - 0x4000; 4986 aSig0 |= LIT64( 0x0001000000000000 ); 4987 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 4988 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 4989 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4990 zSig2 |= ( zSig3 != 0 ); 4991 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 4992 shift128ExtraRightJamming( 4993 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4994 ++zExp; 4995 } 4996 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4997 4998 } 4999 5000 /* 5001 ------------------------------------------------------------------------------- 5002 Returns the result of dividing the quadruple-precision floating-point value 5003 `a' by the corresponding value `b'. The operation is performed according to 5004 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5005 ------------------------------------------------------------------------------- 5006 */ 5007 float128 float128_div( float128 a, float128 b ) 5008 { 5009 flag aSign, bSign, zSign; 5010 int32 aExp, bExp, zExp; 5011 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5012 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5013 float128 z; 5014 5015 aSig1 = extractFloat128Frac1( a ); 5016 aSig0 = extractFloat128Frac0( a ); 5017 aExp = extractFloat128Exp( a ); 5018 aSign = extractFloat128Sign( a ); 5019 bSig1 = extractFloat128Frac1( b ); 5020 bSig0 = extractFloat128Frac0( b ); 5021 bExp = extractFloat128Exp( b ); 5022 bSign = extractFloat128Sign( b ); 5023 zSign = aSign ^ bSign; 5024 if ( aExp == 0x7FFF ) { 5025 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 5026 if ( bExp == 0x7FFF ) { 5027 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5028 goto invalid; 5029 } 5030 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5031 } 5032 if ( bExp == 0x7FFF ) { 5033 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5034 return packFloat128( zSign, 0, 0, 0 ); 5035 } 5036 if ( bExp == 0 ) { 5037 if ( ( bSig0 | bSig1 ) == 0 ) { 5038 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5039 invalid: 5040 float_raise( float_flag_invalid ); 5041 z.low = float128_default_nan_low; 5042 z.high = float128_default_nan_high; 5043 return z; 5044 } 5045 float_raise( float_flag_divbyzero ); 5046 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5047 } 5048 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5049 } 5050 if ( aExp == 0 ) { 5051 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5052 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5053 } 5054 zExp = aExp - bExp + 0x3FFD; 5055 shortShift128Left( 5056 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 5057 shortShift128Left( 5058 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5059 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 5060 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 5061 ++zExp; 5062 } 5063 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5064 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 5065 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 5066 while ( (sbits64) rem0 < 0 ) { 5067 --zSig0; 5068 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5069 } 5070 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5071 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5072 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5073 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5074 while ( (sbits64) rem1 < 0 ) { 5075 --zSig1; 5076 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5077 } 5078 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5079 } 5080 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5081 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5082 5083 } 5084 5085 /* 5086 ------------------------------------------------------------------------------- 5087 Returns the remainder of the quadruple-precision floating-point value `a' 5088 with respect to the corresponding value `b'. The operation is performed 5089 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5090 ------------------------------------------------------------------------------- 5091 */ 5092 float128 float128_rem( float128 a, float128 b ) 5093 { 5094 flag aSign, bSign, zSign; 5095 int32 aExp, bExp, expDiff; 5096 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5097 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5098 sbits64 sigMean0; 5099 float128 z; 5100 5101 aSig1 = extractFloat128Frac1( a ); 5102 aSig0 = extractFloat128Frac0( a ); 5103 aExp = extractFloat128Exp( a ); 5104 aSign = extractFloat128Sign( a ); 5105 bSig1 = extractFloat128Frac1( b ); 5106 bSig0 = extractFloat128Frac0( b ); 5107 bExp = extractFloat128Exp( b ); 5108 bSign = extractFloat128Sign( b ); 5109 if ( aExp == 0x7FFF ) { 5110 if ( ( aSig0 | aSig1 ) 5111 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5112 return propagateFloat128NaN( a, b ); 5113 } 5114 goto invalid; 5115 } 5116 if ( bExp == 0x7FFF ) { 5117 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5118 return a; 5119 } 5120 if ( bExp == 0 ) { 5121 if ( ( bSig0 | bSig1 ) == 0 ) { 5122 invalid: 5123 float_raise( float_flag_invalid ); 5124 z.low = float128_default_nan_low; 5125 z.high = float128_default_nan_high; 5126 return z; 5127 } 5128 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5129 } 5130 if ( aExp == 0 ) { 5131 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5132 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5133 } 5134 expDiff = aExp - bExp; 5135 if ( expDiff < -1 ) return a; 5136 shortShift128Left( 5137 aSig0 | LIT64( 0x0001000000000000 ), 5138 aSig1, 5139 15 - ( expDiff < 0 ), 5140 &aSig0, 5141 &aSig1 5142 ); 5143 shortShift128Left( 5144 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5145 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5146 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5147 expDiff -= 64; 5148 while ( 0 < expDiff ) { 5149 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5150 q = ( 4 < q ) ? q - 4 : 0; 5151 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5152 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5153 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5154 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5155 expDiff -= 61; 5156 } 5157 if ( -64 < expDiff ) { 5158 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5159 q = ( 4 < q ) ? q - 4 : 0; 5160 q >>= - expDiff; 5161 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5162 expDiff += 52; 5163 if ( expDiff < 0 ) { 5164 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5165 } 5166 else { 5167 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5168 } 5169 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5170 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5171 } 5172 else { 5173 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5174 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5175 } 5176 do { 5177 alternateASig0 = aSig0; 5178 alternateASig1 = aSig1; 5179 ++q; 5180 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5181 } while ( 0 <= (sbits64) aSig0 ); 5182 add128( 5183 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 ); 5184 if ( ( sigMean0 < 0 ) 5185 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5186 aSig0 = alternateASig0; 5187 aSig1 = alternateASig1; 5188 } 5189 zSign = ( (sbits64) aSig0 < 0 ); 5190 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5191 return 5192 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5193 5194 } 5195 5196 /* 5197 ------------------------------------------------------------------------------- 5198 Returns the square root of the quadruple-precision floating-point value `a'. 5199 The operation is performed according to the IEC/IEEE Standard for Binary 5200 Floating-Point Arithmetic. 5201 ------------------------------------------------------------------------------- 5202 */ 5203 float128 float128_sqrt( float128 a ) 5204 { 5205 flag aSign; 5206 int32 aExp, zExp; 5207 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5208 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5209 float128 z; 5210 5211 aSig1 = extractFloat128Frac1( a ); 5212 aSig0 = extractFloat128Frac0( a ); 5213 aExp = extractFloat128Exp( a ); 5214 aSign = extractFloat128Sign( a ); 5215 if ( aExp == 0x7FFF ) { 5216 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5217 if ( ! aSign ) return a; 5218 goto invalid; 5219 } 5220 if ( aSign ) { 5221 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5222 invalid: 5223 float_raise( float_flag_invalid ); 5224 z.low = float128_default_nan_low; 5225 z.high = float128_default_nan_high; 5226 return z; 5227 } 5228 if ( aExp == 0 ) { 5229 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5230 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5231 } 5232 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5233 aSig0 |= LIT64( 0x0001000000000000 ); 5234 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5235 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5236 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5237 doubleZSig0 = zSig0<<1; 5238 mul64To128( zSig0, zSig0, &term0, &term1 ); 5239 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5240 while ( (sbits64) rem0 < 0 ) { 5241 --zSig0; 5242 doubleZSig0 -= 2; 5243 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5244 } 5245 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5246 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5247 if ( zSig1 == 0 ) zSig1 = 1; 5248 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5249 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5250 mul64To128( zSig1, zSig1, &term2, &term3 ); 5251 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5252 while ( (sbits64) rem1 < 0 ) { 5253 --zSig1; 5254 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5255 term3 |= 1; 5256 term2 |= doubleZSig0; 5257 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5258 } 5259 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5260 } 5261 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5262 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5263 5264 } 5265 5266 /* 5267 ------------------------------------------------------------------------------- 5268 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5269 the corresponding value `b', and 0 otherwise. The comparison is performed 5270 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5271 ------------------------------------------------------------------------------- 5272 */ 5273 flag float128_eq( float128 a, float128 b ) 5274 { 5275 5276 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5277 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5278 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5279 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5280 ) { 5281 if ( float128_is_signaling_nan( a ) 5282 || float128_is_signaling_nan( b ) ) { 5283 float_raise( float_flag_invalid ); 5284 } 5285 return 0; 5286 } 5287 return 5288 ( a.low == b.low ) 5289 && ( ( a.high == b.high ) 5290 || ( ( a.low == 0 ) 5291 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5292 ); 5293 5294 } 5295 5296 /* 5297 ------------------------------------------------------------------------------- 5298 Returns 1 if the quadruple-precision floating-point value `a' is less than 5299 or equal to the corresponding value `b', and 0 otherwise. The comparison 5300 is performed according to the IEC/IEEE Standard for Binary Floating-Point 5301 Arithmetic. 5302 ------------------------------------------------------------------------------- 5303 */ 5304 flag float128_le( float128 a, float128 b ) 5305 { 5306 flag aSign, bSign; 5307 5308 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5309 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5310 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5311 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5312 ) { 5313 float_raise( float_flag_invalid ); 5314 return 0; 5315 } 5316 aSign = extractFloat128Sign( a ); 5317 bSign = extractFloat128Sign( b ); 5318 if ( aSign != bSign ) { 5319 return 5320 aSign 5321 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5322 == 0 ); 5323 } 5324 return 5325 aSign ? le128( b.high, b.low, a.high, a.low ) 5326 : le128( a.high, a.low, b.high, b.low ); 5327 5328 } 5329 5330 /* 5331 ------------------------------------------------------------------------------- 5332 Returns 1 if the quadruple-precision floating-point value `a' is less than 5333 the corresponding value `b', and 0 otherwise. The comparison is performed 5334 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5335 ------------------------------------------------------------------------------- 5336 */ 5337 flag float128_lt( float128 a, float128 b ) 5338 { 5339 flag aSign, bSign; 5340 5341 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5342 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5343 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5344 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5345 ) { 5346 float_raise( float_flag_invalid ); 5347 return 0; 5348 } 5349 aSign = extractFloat128Sign( a ); 5350 bSign = extractFloat128Sign( b ); 5351 if ( aSign != bSign ) { 5352 return 5353 aSign 5354 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5355 != 0 ); 5356 } 5357 return 5358 aSign ? lt128( b.high, b.low, a.high, a.low ) 5359 : lt128( a.high, a.low, b.high, b.low ); 5360 5361 } 5362 5363 /* 5364 ------------------------------------------------------------------------------- 5365 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5366 the corresponding value `b', and 0 otherwise. The invalid exception is 5367 raised if either operand is a NaN. Otherwise, the comparison is performed 5368 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5369 ------------------------------------------------------------------------------- 5370 */ 5371 flag float128_eq_signaling( float128 a, float128 b ) 5372 { 5373 5374 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5375 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5376 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5377 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5378 ) { 5379 float_raise( float_flag_invalid ); 5380 return 0; 5381 } 5382 return 5383 ( a.low == b.low ) 5384 && ( ( a.high == b.high ) 5385 || ( ( a.low == 0 ) 5386 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5387 ); 5388 5389 } 5390 5391 /* 5392 ------------------------------------------------------------------------------- 5393 Returns 1 if the quadruple-precision floating-point value `a' is less than 5394 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5395 cause an exception. Otherwise, the comparison is performed according to the 5396 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5397 ------------------------------------------------------------------------------- 5398 */ 5399 flag float128_le_quiet( float128 a, float128 b ) 5400 { 5401 flag aSign, bSign; 5402 5403 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5404 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5405 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5406 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5407 ) { 5408 if ( float128_is_signaling_nan( a ) 5409 || float128_is_signaling_nan( b ) ) { 5410 float_raise( float_flag_invalid ); 5411 } 5412 return 0; 5413 } 5414 aSign = extractFloat128Sign( a ); 5415 bSign = extractFloat128Sign( b ); 5416 if ( aSign != bSign ) { 5417 return 5418 aSign 5419 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5420 == 0 ); 5421 } 5422 return 5423 aSign ? le128( b.high, b.low, a.high, a.low ) 5424 : le128( a.high, a.low, b.high, b.low ); 5425 5426 } 5427 5428 /* 5429 ------------------------------------------------------------------------------- 5430 Returns 1 if the quadruple-precision floating-point value `a' is less than 5431 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5432 exception. Otherwise, the comparison is performed according to the IEC/IEEE 5433 Standard for Binary Floating-Point Arithmetic. 5434 ------------------------------------------------------------------------------- 5435 */ 5436 flag float128_lt_quiet( float128 a, float128 b ) 5437 { 5438 flag aSign, bSign; 5439 5440 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5441 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5442 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5443 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5444 ) { 5445 if ( float128_is_signaling_nan( a ) 5446 || float128_is_signaling_nan( b ) ) { 5447 float_raise( float_flag_invalid ); 5448 } 5449 return 0; 5450 } 5451 aSign = extractFloat128Sign( a ); 5452 bSign = extractFloat128Sign( b ); 5453 if ( aSign != bSign ) { 5454 return 5455 aSign 5456 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5457 != 0 ); 5458 } 5459 return 5460 aSign ? lt128( b.high, b.low, a.high, a.low ) 5461 : lt128( a.high, a.low, b.high, b.low ); 5462 5463 } 5464 5465 #endif 5466 5467 5468 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5469 5470 /* 5471 * These two routines are not part of the original softfloat distribution. 5472 * 5473 * They are based on the corresponding conversions to integer but return 5474 * unsigned numbers instead since these functions are required by GCC. 5475 * 5476 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97 5477 * 5478 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5479 */ 5480 5481 /* 5482 ------------------------------------------------------------------------------- 5483 Returns the result of converting the double-precision floating-point value 5484 `a' to the 32-bit unsigned integer format. The conversion is 5485 performed according to the IEC/IEEE Standard for Binary Floating-point 5486 Arithmetic, except that the conversion is always rounded toward zero. If 5487 `a' is a NaN, the largest positive integer is returned. If the conversion 5488 overflows, the largest integer positive is returned. 5489 ------------------------------------------------------------------------------- 5490 */ 5491 uint32 float64_to_uint32_round_to_zero( float64 a ) 5492 { 5493 flag aSign; 5494 int16 aExp, shiftCount; 5495 bits64 aSig, savedASig; 5496 uint32 z; 5497 5498 aSig = extractFloat64Frac( a ); 5499 aExp = extractFloat64Exp( a ); 5500 aSign = extractFloat64Sign( a ); 5501 5502 if (aSign) { 5503 float_raise( float_flag_invalid ); 5504 return(0); 5505 } 5506 5507 if ( 0x41E < aExp ) { 5508 float_raise( float_flag_invalid ); 5509 return 0xffffffff; 5510 } 5511 else if ( aExp < 0x3FF ) { 5512 if ( aExp || aSig ) float_set_inexact(); 5513 return 0; 5514 } 5515 aSig |= LIT64( 0x0010000000000000 ); 5516 shiftCount = 0x433 - aExp; 5517 savedASig = aSig; 5518 aSig >>= shiftCount; 5519 z = aSig; 5520 if ( ( aSig<<shiftCount ) != savedASig ) { 5521 float_set_inexact(); 5522 } 5523 return z; 5524 5525 } 5526 5527 /* 5528 ------------------------------------------------------------------------------- 5529 Returns the result of converting the single-precision floating-point value 5530 `a' to the 32-bit unsigned integer format. The conversion is 5531 performed according to the IEC/IEEE Standard for Binary Floating-point 5532 Arithmetic, except that the conversion is always rounded toward zero. If 5533 `a' is a NaN, the largest positive integer is returned. If the conversion 5534 overflows, the largest positive integer is returned. 5535 ------------------------------------------------------------------------------- 5536 */ 5537 uint32 float32_to_uint32_round_to_zero( float32 a ) 5538 { 5539 flag aSign; 5540 int16 aExp, shiftCount; 5541 bits32 aSig; 5542 uint32 z; 5543 5544 aSig = extractFloat32Frac( a ); 5545 aExp = extractFloat32Exp( a ); 5546 aSign = extractFloat32Sign( a ); 5547 shiftCount = aExp - 0x9E; 5548 5549 if (aSign) { 5550 float_raise( float_flag_invalid ); 5551 return(0); 5552 } 5553 if ( 0 < shiftCount ) { 5554 float_raise( float_flag_invalid ); 5555 return 0xFFFFFFFF; 5556 } 5557 else if ( aExp <= 0x7E ) { 5558 if ( aExp | aSig ) float_set_inexact(); 5559 return 0; 5560 } 5561 aSig = ( aSig | 0x800000 )<<8; 5562 z = aSig>>( - shiftCount ); 5563 if ( aSig<<( shiftCount & 31 ) ) { 5564 float_set_inexact(); 5565 } 5566 return z; 5567 5568 } 5569 5570 #endif 5571 5572 #endif /* _STANDALONE */ 5573