1 /* $OpenBSD: softfloat.c,v 1.2 2007/12/29 16:59:16 miod Exp $ */ 2 /* $NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $ */ 3 4 /* 5 * This version hacked for use with gcc -msoft-float by bjh21. 6 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 7 * itself). 8 */ 9 10 /* 11 * Things you may want to define: 12 * 13 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 14 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 15 * properly renamed. 16 */ 17 18 /* 19 =============================================================================== 20 21 This C source file is part of the SoftFloat IEC/IEEE Floating-point 22 Arithmetic Package, Release 2a. 23 24 Written by John R. Hauser. This work was made possible in part by the 25 International Computer Science Institute, located at Suite 600, 1947 Center 26 Street, Berkeley, California 94704. Funding was partially provided by the 27 National Science Foundation under grant MIP-9311980. The original version 28 of this code was written as part of a project to build a fixed-point vector 29 processor in collaboration with the University of California at Berkeley, 30 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 31 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 32 arithmetic/SoftFloat.html'. 33 34 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable 35 effort has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT 36 WILL AT TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS 37 RESTRICTED TO PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL 38 RESPONSIBILITY FOR ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM 39 THEIR OWN USE OF THE SOFTWARE, AND WHO ALSO EFFECTIVELY INDEMNIFY 40 (possibly via similar legal warning) JOHN HAUSER AND THE INTERNATIONAL 41 COMPUTER SCIENCE INSTITUTE AGAINST ALL LOSSES, COSTS, OR OTHER PROBLEMS 42 ARISING FROM THE USE OF THE SOFTWARE BY THEIR CUSTOMERS AND CLIENTS. 43 44 Derivative works are acceptable, even for commercial purposes, so long as 45 (1) they include prominent notice that the work is derivative, and (2) they 46 include prominent notice akin to these four paragraphs for those parts of 47 this code that are retained. 48 49 =============================================================================== 50 */ 51 52 #include <sys/cdefs.h> 53 #if defined(LIBC_SCCS) && !defined(lint) 54 __RCSID("$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $"); 55 #endif /* LIBC_SCCS and not lint */ 56 57 #ifdef SOFTFLOAT_FOR_GCC 58 #include "softfloat-for-gcc.h" 59 #endif 60 61 #include "milieu.h" 62 #include "softfloat.h" 63 64 /* 65 * Conversions between floats as stored in memory and floats as 66 * SoftFloat uses them 67 */ 68 #ifndef FLOAT64_DEMANGLE 69 #define FLOAT64_DEMANGLE(a) (a) 70 #endif 71 #ifndef FLOAT64_MANGLE 72 #define FLOAT64_MANGLE(a) (a) 73 #endif 74 75 /* 76 ------------------------------------------------------------------------------- 77 Floating-point rounding mode, extended double-precision rounding precision, 78 and exception flags. 79 ------------------------------------------------------------------------------- 80 */ 81 82 /* 83 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday. 84 * Right now, it does not. I've removed all other dynamic global 85 * variables. [ross] 86 */ 87 #ifdef FLOATX80 88 int8 floatx80_rounding_precision = 80; 89 #endif 90 91 /* 92 ------------------------------------------------------------------------------- 93 Primitive arithmetic functions, including multi-word arithmetic, and 94 division and square root approximations. (Can be specialized to target if 95 desired.) 96 ------------------------------------------------------------------------------- 97 */ 98 #include "softfloat-macros.h" 99 100 /* 101 ------------------------------------------------------------------------------- 102 Functions and definitions to determine: (1) whether tininess for underflow 103 is detected before or after rounding by default, (2) what (if anything) 104 happens when exceptions are raised, (3) how signaling NaNs are distinguished 105 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 106 are propagated from function inputs to output. These details are target- 107 specific. 108 ------------------------------------------------------------------------------- 109 */ 110 #include "softfloat-specialize.h" 111 112 #ifndef SOFTFLOAT_FOR_GCC /* Not used */ 113 /* 114 ------------------------------------------------------------------------------- 115 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 116 and 7, and returns the properly rounded 32-bit integer corresponding to the 117 input. If `zSign' is 1, the input is negated before being converted to an 118 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 119 is simply rounded to an integer, with the inexact exception raised if the 120 input cannot be represented exactly as an integer. However, if the fixed- 121 point input is too large, the invalid exception is raised and the largest 122 positive or negative integer is returned. 123 ------------------------------------------------------------------------------- 124 */ 125 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 126 { 127 int8 roundingMode; 128 flag roundNearestEven; 129 int8 roundIncrement, roundBits; 130 int32 z; 131 132 roundingMode = float_rounding_mode(); 133 roundNearestEven = ( roundingMode == float_round_nearest_even ); 134 roundIncrement = 0x40; 135 if ( ! roundNearestEven ) { 136 if ( roundingMode == float_round_to_zero ) { 137 roundIncrement = 0; 138 } 139 else { 140 roundIncrement = 0x7F; 141 if ( zSign ) { 142 if ( roundingMode == float_round_up ) roundIncrement = 0; 143 } 144 else { 145 if ( roundingMode == float_round_down ) roundIncrement = 0; 146 } 147 } 148 } 149 roundBits = absZ & 0x7F; 150 absZ = ( absZ + roundIncrement )>>7; 151 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 152 z = absZ; 153 if ( zSign ) z = - z; 154 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 155 float_raise( float_flag_invalid ); 156 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 157 } 158 if ( roundBits ) float_set_inexact(); 159 return z; 160 161 } 162 163 /* 164 ------------------------------------------------------------------------------- 165 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 166 `absZ1', with binary point between bits 63 and 64 (between the input words), 167 and returns the properly rounded 64-bit integer corresponding to the input. 168 If `zSign' is 1, the input is negated before being converted to an integer. 169 Ordinarily, the fixed-point input is simply rounded to an integer, with 170 the inexact exception raised if the input cannot be represented exactly as 171 an integer. However, if the fixed-point input is too large, the invalid 172 exception is raised and the largest positive or negative integer is 173 returned. 174 ------------------------------------------------------------------------------- 175 */ 176 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 177 { 178 int8 roundingMode; 179 flag roundNearestEven, increment; 180 int64 z; 181 182 roundingMode = float_rounding_mode(); 183 roundNearestEven = ( roundingMode == float_round_nearest_even ); 184 increment = ( (sbits64) absZ1 < 0 ); 185 if ( ! roundNearestEven ) { 186 if ( roundingMode == float_round_to_zero ) { 187 increment = 0; 188 } 189 else { 190 if ( zSign ) { 191 increment = ( roundingMode == float_round_down ) && absZ1; 192 } 193 else { 194 increment = ( roundingMode == float_round_up ) && absZ1; 195 } 196 } 197 } 198 if ( increment ) { 199 ++absZ0; 200 if ( absZ0 == 0 ) goto overflow; 201 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 202 } 203 z = absZ0; 204 if ( zSign ) z = - z; 205 if ( z && ( ( z < 0 ) ^ zSign ) ) { 206 overflow: 207 float_raise( float_flag_invalid ); 208 return 209 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 210 : LIT64( 0x7FFFFFFFFFFFFFFF ); 211 } 212 if ( absZ1 ) float_set_inexact(); 213 return z; 214 215 } 216 #endif 217 218 /* 219 ------------------------------------------------------------------------------- 220 Returns the fraction bits of the single-precision floating-point value `a'. 221 ------------------------------------------------------------------------------- 222 */ 223 INLINE bits32 extractFloat32Frac( float32 a ) 224 { 225 226 return a & 0x007FFFFF; 227 228 } 229 230 /* 231 ------------------------------------------------------------------------------- 232 Returns the exponent bits of the single-precision floating-point value `a'. 233 ------------------------------------------------------------------------------- 234 */ 235 INLINE int16 extractFloat32Exp( float32 a ) 236 { 237 238 return ( a>>23 ) & 0xFF; 239 240 } 241 242 /* 243 ------------------------------------------------------------------------------- 244 Returns the sign bit of the single-precision floating-point value `a'. 245 ------------------------------------------------------------------------------- 246 */ 247 INLINE flag extractFloat32Sign( float32 a ) 248 { 249 250 return a>>31; 251 252 } 253 254 /* 255 ------------------------------------------------------------------------------- 256 Normalizes the subnormal single-precision floating-point value represented 257 by the denormalized significand `aSig'. The normalized exponent and 258 significand are stored at the locations pointed to by `zExpPtr' and 259 `zSigPtr', respectively. 260 ------------------------------------------------------------------------------- 261 */ 262 static void 263 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 264 { 265 int8 shiftCount; 266 267 shiftCount = countLeadingZeros32( aSig ) - 8; 268 *zSigPtr = aSig<<shiftCount; 269 *zExpPtr = 1 - shiftCount; 270 271 } 272 273 /* 274 ------------------------------------------------------------------------------- 275 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 276 single-precision floating-point value, returning the result. After being 277 shifted into the proper positions, the three fields are simply added 278 together to form the result. This means that any integer portion of `zSig' 279 will be added into the exponent. Since a properly normalized significand 280 will have an integer portion equal to 1, the `zExp' input should be 1 less 281 than the desired result exponent whenever `zSig' is a complete, normalized 282 significand. 283 ------------------------------------------------------------------------------- 284 */ 285 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 286 { 287 288 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 289 290 } 291 292 /* 293 ------------------------------------------------------------------------------- 294 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 295 and significand `zSig', and returns the proper single-precision floating- 296 point value corresponding to the abstract input. Ordinarily, the abstract 297 value is simply rounded and packed into the single-precision format, with 298 the inexact exception raised if the abstract input cannot be represented 299 exactly. However, if the abstract value is too large, the overflow and 300 inexact exceptions are raised and an infinity or maximal finite value is 301 returned. If the abstract value is too small, the input value is rounded to 302 a subnormal number, and the underflow and inexact exceptions are raised if 303 the abstract input cannot be represented exactly as a subnormal single- 304 precision floating-point number. 305 The input significand `zSig' has its binary point between bits 30 306 and 29, which is 7 bits to the left of the usual location. This shifted 307 significand must be normalized or smaller. If `zSig' is not normalized, 308 `zExp' must be 0; in that case, the result returned is a subnormal number, 309 and it must not require rounding. In the usual case that `zSig' is 310 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 311 The handling of underflow and overflow follows the IEC/IEEE Standard for 312 Binary Floating-Point Arithmetic. 313 ------------------------------------------------------------------------------- 314 */ 315 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 316 { 317 int8 roundingMode; 318 flag roundNearestEven; 319 int8 roundIncrement, roundBits; 320 flag isTiny; 321 322 roundingMode = float_rounding_mode(); 323 roundNearestEven = ( roundingMode == float_round_nearest_even ); 324 roundIncrement = 0x40; 325 if ( ! roundNearestEven ) { 326 if ( roundingMode == float_round_to_zero ) { 327 roundIncrement = 0; 328 } 329 else { 330 roundIncrement = 0x7F; 331 if ( zSign ) { 332 if ( roundingMode == float_round_up ) roundIncrement = 0; 333 } 334 else { 335 if ( roundingMode == float_round_down ) roundIncrement = 0; 336 } 337 } 338 } 339 roundBits = zSig & 0x7F; 340 if ( 0xFD <= (bits16) zExp ) { 341 if ( ( 0xFD < zExp ) 342 || ( ( zExp == 0xFD ) 343 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 344 ) { 345 float_raise( float_flag_overflow | float_flag_inexact ); 346 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 347 } 348 if ( zExp < 0 ) { 349 isTiny = 350 ( float_detect_tininess == float_tininess_before_rounding ) 351 || ( zExp < -1 ) 352 || ( zSig + roundIncrement < 0x80000000 ); 353 shift32RightJamming( zSig, - zExp, &zSig ); 354 zExp = 0; 355 roundBits = zSig & 0x7F; 356 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 357 } 358 } 359 if ( roundBits ) float_set_inexact(); 360 zSig = ( zSig + roundIncrement )>>7; 361 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 362 if ( zSig == 0 ) zExp = 0; 363 return packFloat32( zSign, zExp, zSig ); 364 365 } 366 367 /* 368 ------------------------------------------------------------------------------- 369 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 370 and significand `zSig', and returns the proper single-precision floating- 371 point value corresponding to the abstract input. This routine is just like 372 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 373 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 374 floating-point exponent. 375 ------------------------------------------------------------------------------- 376 */ 377 static float32 378 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 379 { 380 int8 shiftCount; 381 382 shiftCount = countLeadingZeros32( zSig ) - 1; 383 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 384 385 } 386 387 /* 388 ------------------------------------------------------------------------------- 389 Returns the fraction bits of the double-precision floating-point value `a'. 390 ------------------------------------------------------------------------------- 391 */ 392 INLINE bits64 extractFloat64Frac( float64 a ) 393 { 394 395 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 396 397 } 398 399 /* 400 ------------------------------------------------------------------------------- 401 Returns the exponent bits of the double-precision floating-point value `a'. 402 ------------------------------------------------------------------------------- 403 */ 404 INLINE int16 extractFloat64Exp( float64 a ) 405 { 406 407 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 408 409 } 410 411 /* 412 ------------------------------------------------------------------------------- 413 Returns the sign bit of the double-precision floating-point value `a'. 414 ------------------------------------------------------------------------------- 415 */ 416 INLINE flag extractFloat64Sign( float64 a ) 417 { 418 419 return FLOAT64_DEMANGLE(a)>>63; 420 421 } 422 423 /* 424 ------------------------------------------------------------------------------- 425 Normalizes the subnormal double-precision floating-point value represented 426 by the denormalized significand `aSig'. The normalized exponent and 427 significand are stored at the locations pointed to by `zExpPtr' and 428 `zSigPtr', respectively. 429 ------------------------------------------------------------------------------- 430 */ 431 static void 432 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 433 { 434 int8 shiftCount; 435 436 shiftCount = countLeadingZeros64( aSig ) - 11; 437 *zSigPtr = aSig<<shiftCount; 438 *zExpPtr = 1 - shiftCount; 439 440 } 441 442 /* 443 ------------------------------------------------------------------------------- 444 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 445 double-precision floating-point value, returning the result. After being 446 shifted into the proper positions, the three fields are simply added 447 together to form the result. This means that any integer portion of `zSig' 448 will be added into the exponent. Since a properly normalized significand 449 will have an integer portion equal to 1, the `zExp' input should be 1 less 450 than the desired result exponent whenever `zSig' is a complete, normalized 451 significand. 452 ------------------------------------------------------------------------------- 453 */ 454 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 455 { 456 457 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 458 ( ( (bits64) zExp )<<52 ) + zSig ); 459 460 } 461 462 /* 463 ------------------------------------------------------------------------------- 464 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 465 and significand `zSig', and returns the proper double-precision floating- 466 point value corresponding to the abstract input. Ordinarily, the abstract 467 value is simply rounded and packed into the double-precision format, with 468 the inexact exception raised if the abstract input cannot be represented 469 exactly. However, if the abstract value is too large, the overflow and 470 inexact exceptions are raised and an infinity or maximal finite value is 471 returned. If the abstract value is too small, the input value is rounded to 472 a subnormal number, and the underflow and inexact exceptions are raised if 473 the abstract input cannot be represented exactly as a subnormal double- 474 precision floating-point number. 475 The input significand `zSig' has its binary point between bits 62 476 and 61, which is 10 bits to the left of the usual location. This shifted 477 significand must be normalized or smaller. If `zSig' is not normalized, 478 `zExp' must be 0; in that case, the result returned is a subnormal number, 479 and it must not require rounding. In the usual case that `zSig' is 480 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 481 The handling of underflow and overflow follows the IEC/IEEE Standard for 482 Binary Floating-Point Arithmetic. 483 ------------------------------------------------------------------------------- 484 */ 485 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 486 { 487 int8 roundingMode; 488 flag roundNearestEven; 489 int16 roundIncrement, roundBits; 490 flag isTiny; 491 492 roundingMode = float_rounding_mode(); 493 roundNearestEven = ( roundingMode == float_round_nearest_even ); 494 roundIncrement = 0x200; 495 if ( ! roundNearestEven ) { 496 if ( roundingMode == float_round_to_zero ) { 497 roundIncrement = 0; 498 } 499 else { 500 roundIncrement = 0x3FF; 501 if ( zSign ) { 502 if ( roundingMode == float_round_up ) roundIncrement = 0; 503 } 504 else { 505 if ( roundingMode == float_round_down ) roundIncrement = 0; 506 } 507 } 508 } 509 roundBits = zSig & 0x3FF; 510 if ( 0x7FD <= (bits16) zExp ) { 511 if ( ( 0x7FD < zExp ) 512 || ( ( zExp == 0x7FD ) 513 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 514 ) { 515 float_raise( float_flag_overflow | float_flag_inexact ); 516 return FLOAT64_MANGLE( 517 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 518 ( roundIncrement == 0 )); 519 } 520 if ( zExp < 0 ) { 521 isTiny = 522 ( float_detect_tininess == float_tininess_before_rounding ) 523 || ( zExp < -1 ) 524 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 525 shift64RightJamming( zSig, - zExp, &zSig ); 526 zExp = 0; 527 roundBits = zSig & 0x3FF; 528 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 529 } 530 } 531 if ( roundBits ) float_set_inexact(); 532 zSig = ( zSig + roundIncrement )>>10; 533 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 534 if ( zSig == 0 ) zExp = 0; 535 return packFloat64( zSign, zExp, zSig ); 536 537 } 538 539 /* 540 ------------------------------------------------------------------------------- 541 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 542 and significand `zSig', and returns the proper double-precision floating- 543 point value corresponding to the abstract input. This routine is just like 544 `roundAndPackFloat64' except that `zSig' does not have to be normalized. 545 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 546 floating-point exponent. 547 ------------------------------------------------------------------------------- 548 */ 549 static float64 550 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 551 { 552 int8 shiftCount; 553 554 shiftCount = countLeadingZeros64( zSig ) - 1; 555 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 556 557 } 558 559 #ifdef FLOATX80 560 561 /* 562 ------------------------------------------------------------------------------- 563 Returns the fraction bits of the extended double-precision floating-point 564 value `a'. 565 ------------------------------------------------------------------------------- 566 */ 567 INLINE bits64 extractFloatx80Frac( floatx80 a ) 568 { 569 570 return a.low; 571 572 } 573 574 /* 575 ------------------------------------------------------------------------------- 576 Returns the exponent bits of the extended double-precision floating-point 577 value `a'. 578 ------------------------------------------------------------------------------- 579 */ 580 INLINE int32 extractFloatx80Exp( floatx80 a ) 581 { 582 583 return a.high & 0x7FFF; 584 585 } 586 587 /* 588 ------------------------------------------------------------------------------- 589 Returns the sign bit of the extended double-precision floating-point value 590 `a'. 591 ------------------------------------------------------------------------------- 592 */ 593 INLINE flag extractFloatx80Sign( floatx80 a ) 594 { 595 596 return a.high>>15; 597 598 } 599 600 /* 601 ------------------------------------------------------------------------------- 602 Normalizes the subnormal extended double-precision floating-point value 603 represented by the denormalized significand `aSig'. The normalized exponent 604 and significand are stored at the locations pointed to by `zExpPtr' and 605 `zSigPtr', respectively. 606 ------------------------------------------------------------------------------- 607 */ 608 static void 609 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 610 { 611 int8 shiftCount; 612 613 shiftCount = countLeadingZeros64( aSig ); 614 *zSigPtr = aSig<<shiftCount; 615 *zExpPtr = 1 - shiftCount; 616 617 } 618 619 /* 620 ------------------------------------------------------------------------------- 621 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 622 extended double-precision floating-point value, returning the result. 623 ------------------------------------------------------------------------------- 624 */ 625 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 626 { 627 floatx80 z; 628 629 z.low = zSig; 630 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 631 return z; 632 633 } 634 635 /* 636 ------------------------------------------------------------------------------- 637 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 638 and extended significand formed by the concatenation of `zSig0' and `zSig1', 639 and returns the proper extended double-precision floating-point value 640 corresponding to the abstract input. Ordinarily, the abstract value is 641 rounded and packed into the extended double-precision format, with the 642 inexact exception raised if the abstract input cannot be represented 643 exactly. However, if the abstract value is too large, the overflow and 644 inexact exceptions are raised and an infinity or maximal finite value is 645 returned. If the abstract value is too small, the input value is rounded to 646 a subnormal number, and the underflow and inexact exceptions are raised if 647 the abstract input cannot be represented exactly as a subnormal extended 648 double-precision floating-point number. 649 If `roundingPrecision' is 32 or 64, the result is rounded to the same 650 number of bits as single or double precision, respectively. Otherwise, the 651 result is rounded to the full precision of the extended double-precision 652 format. 653 The input significand must be normalized or smaller. If the input 654 significand is not normalized, `zExp' must be 0; in that case, the result 655 returned is a subnormal number, and it must not require rounding. The 656 handling of underflow and overflow follows the IEC/IEEE Standard for Binary 657 Floating-Point Arithmetic. 658 ------------------------------------------------------------------------------- 659 */ 660 static floatx80 661 roundAndPackFloatx80( 662 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 663 ) 664 { 665 int8 roundingMode; 666 flag roundNearestEven, increment, isTiny; 667 int64 roundIncrement, roundMask, roundBits; 668 669 roundingMode = float_rounding_mode(); 670 roundNearestEven = ( roundingMode == float_round_nearest_even ); 671 if ( roundingPrecision == 80 ) goto precision80; 672 if ( roundingPrecision == 64 ) { 673 roundIncrement = LIT64( 0x0000000000000400 ); 674 roundMask = LIT64( 0x00000000000007FF ); 675 } 676 else if ( roundingPrecision == 32 ) { 677 roundIncrement = LIT64( 0x0000008000000000 ); 678 roundMask = LIT64( 0x000000FFFFFFFFFF ); 679 } 680 else { 681 goto precision80; 682 } 683 zSig0 |= ( zSig1 != 0 ); 684 if ( ! roundNearestEven ) { 685 if ( roundingMode == float_round_to_zero ) { 686 roundIncrement = 0; 687 } 688 else { 689 roundIncrement = roundMask; 690 if ( zSign ) { 691 if ( roundingMode == float_round_up ) roundIncrement = 0; 692 } 693 else { 694 if ( roundingMode == float_round_down ) roundIncrement = 0; 695 } 696 } 697 } 698 roundBits = zSig0 & roundMask; 699 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 700 if ( ( 0x7FFE < zExp ) 701 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 702 ) { 703 goto overflow; 704 } 705 if ( zExp <= 0 ) { 706 isTiny = 707 ( float_detect_tininess == float_tininess_before_rounding ) 708 || ( zExp < 0 ) 709 || ( zSig0 <= zSig0 + roundIncrement ); 710 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 711 zExp = 0; 712 roundBits = zSig0 & roundMask; 713 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 714 if ( roundBits ) float_set_inexact(); 715 zSig0 += roundIncrement; 716 if ( (sbits64) zSig0 < 0 ) zExp = 1; 717 roundIncrement = roundMask + 1; 718 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 719 roundMask |= roundIncrement; 720 } 721 zSig0 &= ~ roundMask; 722 return packFloatx80( zSign, zExp, zSig0 ); 723 } 724 } 725 if ( roundBits ) float_set_inexact(); 726 zSig0 += roundIncrement; 727 if ( zSig0 < roundIncrement ) { 728 ++zExp; 729 zSig0 = LIT64( 0x8000000000000000 ); 730 } 731 roundIncrement = roundMask + 1; 732 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 733 roundMask |= roundIncrement; 734 } 735 zSig0 &= ~ roundMask; 736 if ( zSig0 == 0 ) zExp = 0; 737 return packFloatx80( zSign, zExp, zSig0 ); 738 precision80: 739 increment = ( (sbits64) zSig1 < 0 ); 740 if ( ! roundNearestEven ) { 741 if ( roundingMode == float_round_to_zero ) { 742 increment = 0; 743 } 744 else { 745 if ( zSign ) { 746 increment = ( roundingMode == float_round_down ) && zSig1; 747 } 748 else { 749 increment = ( roundingMode == float_round_up ) && zSig1; 750 } 751 } 752 } 753 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 754 if ( ( 0x7FFE < zExp ) 755 || ( ( zExp == 0x7FFE ) 756 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 757 && increment 758 ) 759 ) { 760 roundMask = 0; 761 overflow: 762 float_raise( float_flag_overflow | float_flag_inexact ); 763 if ( ( roundingMode == float_round_to_zero ) 764 || ( zSign && ( roundingMode == float_round_up ) ) 765 || ( ! zSign && ( roundingMode == float_round_down ) ) 766 ) { 767 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 768 } 769 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 770 } 771 if ( zExp <= 0 ) { 772 isTiny = 773 ( float_detect_tininess == float_tininess_before_rounding ) 774 || ( zExp < 0 ) 775 || ! increment 776 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 777 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 778 zExp = 0; 779 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 780 if ( zSig1 ) float_set_inexact(); 781 if ( roundNearestEven ) { 782 increment = ( (sbits64) zSig1 < 0 ); 783 } 784 else { 785 if ( zSign ) { 786 increment = ( roundingMode == float_round_down ) && zSig1; 787 } 788 else { 789 increment = ( roundingMode == float_round_up ) && zSig1; 790 } 791 } 792 if ( increment ) { 793 ++zSig0; 794 zSig0 &= 795 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 796 if ( (sbits64) zSig0 < 0 ) zExp = 1; 797 } 798 return packFloatx80( zSign, zExp, zSig0 ); 799 } 800 } 801 if ( zSig1 ) float_set_inexact(); 802 if ( increment ) { 803 ++zSig0; 804 if ( zSig0 == 0 ) { 805 ++zExp; 806 zSig0 = LIT64( 0x8000000000000000 ); 807 } 808 else { 809 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 810 } 811 } 812 else { 813 if ( zSig0 == 0 ) zExp = 0; 814 } 815 return packFloatx80( zSign, zExp, zSig0 ); 816 817 } 818 819 /* 820 ------------------------------------------------------------------------------- 821 Takes an abstract floating-point value having sign `zSign', exponent 822 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 823 and returns the proper extended double-precision floating-point value 824 corresponding to the abstract input. This routine is just like 825 `roundAndPackFloatx80' except that the input significand does not have to be 826 normalized. 827 ------------------------------------------------------------------------------- 828 */ 829 static floatx80 830 normalizeRoundAndPackFloatx80( 831 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 832 ) 833 { 834 int8 shiftCount; 835 836 if ( zSig0 == 0 ) { 837 zSig0 = zSig1; 838 zSig1 = 0; 839 zExp -= 64; 840 } 841 shiftCount = countLeadingZeros64( zSig0 ); 842 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 843 zExp -= shiftCount; 844 return 845 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 846 847 } 848 849 #endif 850 851 #ifdef FLOAT128 852 853 /* 854 ------------------------------------------------------------------------------- 855 Returns the least-significant 64 fraction bits of the quadruple-precision 856 floating-point value `a'. 857 ------------------------------------------------------------------------------- 858 */ 859 INLINE bits64 extractFloat128Frac1( float128 a ) 860 { 861 862 return a.low; 863 864 } 865 866 /* 867 ------------------------------------------------------------------------------- 868 Returns the most-significant 48 fraction bits of the quadruple-precision 869 floating-point value `a'. 870 ------------------------------------------------------------------------------- 871 */ 872 INLINE bits64 extractFloat128Frac0( float128 a ) 873 { 874 875 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 876 877 } 878 879 /* 880 ------------------------------------------------------------------------------- 881 Returns the exponent bits of the quadruple-precision floating-point value 882 `a'. 883 ------------------------------------------------------------------------------- 884 */ 885 INLINE int32 extractFloat128Exp( float128 a ) 886 { 887 888 return ( a.high>>48 ) & 0x7FFF; 889 890 } 891 892 /* 893 ------------------------------------------------------------------------------- 894 Returns the sign bit of the quadruple-precision floating-point value `a'. 895 ------------------------------------------------------------------------------- 896 */ 897 INLINE flag extractFloat128Sign( float128 a ) 898 { 899 900 return a.high>>63; 901 902 } 903 904 /* 905 ------------------------------------------------------------------------------- 906 Normalizes the subnormal quadruple-precision floating-point value 907 represented by the denormalized significand formed by the concatenation of 908 `aSig0' and `aSig1'. The normalized exponent is stored at the location 909 pointed to by `zExpPtr'. The most significant 49 bits of the normalized 910 significand are stored at the location pointed to by `zSig0Ptr', and the 911 least significant 64 bits of the normalized significand are stored at the 912 location pointed to by `zSig1Ptr'. 913 ------------------------------------------------------------------------------- 914 */ 915 static void 916 normalizeFloat128Subnormal( 917 bits64 aSig0, 918 bits64 aSig1, 919 int32 *zExpPtr, 920 bits64 *zSig0Ptr, 921 bits64 *zSig1Ptr 922 ) 923 { 924 int8 shiftCount; 925 926 if ( aSig0 == 0 ) { 927 shiftCount = countLeadingZeros64( aSig1 ) - 15; 928 if ( shiftCount < 0 ) { 929 *zSig0Ptr = aSig1>>( - shiftCount ); 930 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 931 } 932 else { 933 *zSig0Ptr = aSig1<<shiftCount; 934 *zSig1Ptr = 0; 935 } 936 *zExpPtr = - shiftCount - 63; 937 } 938 else { 939 shiftCount = countLeadingZeros64( aSig0 ) - 15; 940 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 941 *zExpPtr = 1 - shiftCount; 942 } 943 944 } 945 946 /* 947 ------------------------------------------------------------------------------- 948 Packs the sign `zSign', the exponent `zExp', and the significand formed 949 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 950 floating-point value, returning the result. After being shifted into the 951 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 952 added together to form the most significant 32 bits of the result. This 953 means that any integer portion of `zSig0' will be added into the exponent. 954 Since a properly normalized significand will have an integer portion equal 955 to 1, the `zExp' input should be 1 less than the desired result exponent 956 whenever `zSig0' and `zSig1' concatenated form a complete, normalized 957 significand. 958 ------------------------------------------------------------------------------- 959 */ 960 INLINE float128 961 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 962 { 963 float128 z; 964 965 z.low = zSig1; 966 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 967 return z; 968 969 } 970 971 /* 972 ------------------------------------------------------------------------------- 973 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 974 and extended significand formed by the concatenation of `zSig0', `zSig1', 975 and `zSig2', and returns the proper quadruple-precision floating-point value 976 corresponding to the abstract input. Ordinarily, the abstract value is 977 simply rounded and packed into the quadruple-precision format, with the 978 inexact exception raised if the abstract input cannot be represented 979 exactly. However, if the abstract value is too large, the overflow and 980 inexact exceptions are raised and an infinity or maximal finite value is 981 returned. If the abstract value is too small, the input value is rounded to 982 a subnormal number, and the underflow and inexact exceptions are raised if 983 the abstract input cannot be represented exactly as a subnormal quadruple- 984 precision floating-point number. 985 The input significand must be normalized or smaller. If the input 986 significand is not normalized, `zExp' must be 0; in that case, the result 987 returned is a subnormal number, and it must not require rounding. In the 988 usual case that the input significand is normalized, `zExp' must be 1 less 989 than the ``true'' floating-point exponent. The handling of underflow and 990 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 991 ------------------------------------------------------------------------------- 992 */ 993 static float128 994 roundAndPackFloat128( 995 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 996 { 997 int8 roundingMode; 998 flag roundNearestEven, increment, isTiny; 999 1000 roundingMode = float_rounding_mode(); 1001 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1002 increment = ( (sbits64) zSig2 < 0 ); 1003 if ( ! roundNearestEven ) { 1004 if ( roundingMode == float_round_to_zero ) { 1005 increment = 0; 1006 } 1007 else { 1008 if ( zSign ) { 1009 increment = ( roundingMode == float_round_down ) && zSig2; 1010 } 1011 else { 1012 increment = ( roundingMode == float_round_up ) && zSig2; 1013 } 1014 } 1015 } 1016 if ( 0x7FFD <= (bits32) zExp ) { 1017 if ( ( 0x7FFD < zExp ) 1018 || ( ( zExp == 0x7FFD ) 1019 && eq128( 1020 LIT64( 0x0001FFFFFFFFFFFF ), 1021 LIT64( 0xFFFFFFFFFFFFFFFF ), 1022 zSig0, 1023 zSig1 1024 ) 1025 && increment 1026 ) 1027 ) { 1028 float_raise( float_flag_overflow | float_flag_inexact ); 1029 if ( ( roundingMode == float_round_to_zero ) 1030 || ( zSign && ( roundingMode == float_round_up ) ) 1031 || ( ! zSign && ( roundingMode == float_round_down ) ) 1032 ) { 1033 return 1034 packFloat128( 1035 zSign, 1036 0x7FFE, 1037 LIT64( 0x0000FFFFFFFFFFFF ), 1038 LIT64( 0xFFFFFFFFFFFFFFFF ) 1039 ); 1040 } 1041 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1042 } 1043 if ( zExp < 0 ) { 1044 isTiny = 1045 ( float_detect_tininess == float_tininess_before_rounding ) 1046 || ( zExp < -1 ) 1047 || ! increment 1048 || lt128( 1049 zSig0, 1050 zSig1, 1051 LIT64( 0x0001FFFFFFFFFFFF ), 1052 LIT64( 0xFFFFFFFFFFFFFFFF ) 1053 ); 1054 shift128ExtraRightJamming( 1055 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1056 zExp = 0; 1057 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1058 if ( roundNearestEven ) { 1059 increment = ( (sbits64) zSig2 < 0 ); 1060 } 1061 else { 1062 if ( zSign ) { 1063 increment = ( roundingMode == float_round_down ) && zSig2; 1064 } 1065 else { 1066 increment = ( roundingMode == float_round_up ) && zSig2; 1067 } 1068 } 1069 } 1070 } 1071 if ( zSig2 ) float_set_inexact(); 1072 if ( increment ) { 1073 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1074 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1075 } 1076 else { 1077 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1078 } 1079 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1080 1081 } 1082 1083 /* 1084 ------------------------------------------------------------------------------- 1085 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1086 and significand formed by the concatenation of `zSig0' and `zSig1', and 1087 returns the proper quadruple-precision floating-point value corresponding 1088 to the abstract input. This routine is just like `roundAndPackFloat128' 1089 except that the input significand has fewer bits and does not have to be 1090 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1091 point exponent. 1092 ------------------------------------------------------------------------------- 1093 */ 1094 static float128 1095 normalizeRoundAndPackFloat128( 1096 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1097 { 1098 int8 shiftCount; 1099 bits64 zSig2; 1100 1101 if ( zSig0 == 0 ) { 1102 zSig0 = zSig1; 1103 zSig1 = 0; 1104 zExp -= 64; 1105 } 1106 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1107 if ( 0 <= shiftCount ) { 1108 zSig2 = 0; 1109 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1110 } 1111 else { 1112 shift128ExtraRightJamming( 1113 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1114 } 1115 zExp -= shiftCount; 1116 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1117 1118 } 1119 1120 #endif 1121 1122 /* 1123 ------------------------------------------------------------------------------- 1124 Returns the result of converting the 32-bit two's complement integer `a' 1125 to the single-precision floating-point format. The conversion is performed 1126 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1127 ------------------------------------------------------------------------------- 1128 */ 1129 float32 int32_to_float32( int32 a ) 1130 { 1131 flag zSign; 1132 1133 if ( a == 0 ) return 0; 1134 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1135 zSign = ( a < 0 ); 1136 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 1137 1138 } 1139 1140 /* 1141 ------------------------------------------------------------------------------- 1142 Returns the result of converting the 32-bit two's complement integer `a' 1143 to the double-precision floating-point format. The conversion is performed 1144 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1145 ------------------------------------------------------------------------------- 1146 */ 1147 float64 int32_to_float64( int32 a ) 1148 { 1149 flag zSign; 1150 uint32 absA; 1151 int8 shiftCount; 1152 bits64 zSig; 1153 1154 if ( a == 0 ) return 0; 1155 zSign = ( a < 0 ); 1156 absA = zSign ? - a : a; 1157 shiftCount = countLeadingZeros32( absA ) + 21; 1158 zSig = absA; 1159 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1160 1161 } 1162 1163 #ifdef FLOATX80 1164 1165 /* 1166 ------------------------------------------------------------------------------- 1167 Returns the result of converting the 32-bit two's complement integer `a' 1168 to the extended double-precision floating-point format. The conversion 1169 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1170 Arithmetic. 1171 ------------------------------------------------------------------------------- 1172 */ 1173 floatx80 int32_to_floatx80( int32 a ) 1174 { 1175 flag zSign; 1176 uint32 absA; 1177 int8 shiftCount; 1178 bits64 zSig; 1179 1180 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1181 zSign = ( a < 0 ); 1182 absA = zSign ? - a : a; 1183 shiftCount = countLeadingZeros32( absA ) + 32; 1184 zSig = absA; 1185 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1186 1187 } 1188 1189 #endif 1190 1191 #ifdef FLOAT128 1192 1193 /* 1194 ------------------------------------------------------------------------------- 1195 Returns the result of converting the 32-bit two's complement integer `a' to 1196 the quadruple-precision floating-point format. The conversion is performed 1197 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1198 ------------------------------------------------------------------------------- 1199 */ 1200 float128 int32_to_float128( int32 a ) 1201 { 1202 flag zSign; 1203 uint32 absA; 1204 int8 shiftCount; 1205 bits64 zSig0; 1206 1207 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1208 zSign = ( a < 0 ); 1209 absA = zSign ? - a : a; 1210 shiftCount = countLeadingZeros32( absA ) + 17; 1211 zSig0 = absA; 1212 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1213 1214 } 1215 1216 #endif 1217 1218 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1219 /* 1220 ------------------------------------------------------------------------------- 1221 Returns the result of converting the 64-bit two's complement integer `a' 1222 to the single-precision floating-point format. The conversion is performed 1223 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1224 ------------------------------------------------------------------------------- 1225 */ 1226 float32 int64_to_float32( int64 a ) 1227 { 1228 flag zSign; 1229 uint64 absA; 1230 int8 shiftCount; 1231 1232 if ( a == 0 ) return 0; 1233 zSign = ( a < 0 ); 1234 absA = zSign ? - a : a; 1235 shiftCount = countLeadingZeros64( absA ) - 40; 1236 if ( 0 <= shiftCount ) { 1237 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1238 } 1239 else { 1240 shiftCount += 7; 1241 if ( shiftCount < 0 ) { 1242 shift64RightJamming( absA, - shiftCount, &absA ); 1243 } 1244 else { 1245 absA <<= shiftCount; 1246 } 1247 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1248 } 1249 1250 } 1251 1252 /* 1253 ------------------------------------------------------------------------------- 1254 Returns the result of converting the 64-bit two's complement integer `a' 1255 to the double-precision floating-point format. The conversion is performed 1256 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1257 ------------------------------------------------------------------------------- 1258 */ 1259 float64 int64_to_float64( int64 a ) 1260 { 1261 flag zSign; 1262 1263 if ( a == 0 ) return 0; 1264 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1265 return packFloat64( 1, 0x43E, 0 ); 1266 } 1267 zSign = ( a < 0 ); 1268 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1269 1270 } 1271 1272 #ifdef FLOATX80 1273 1274 /* 1275 ------------------------------------------------------------------------------- 1276 Returns the result of converting the 64-bit two's complement integer `a' 1277 to the extended double-precision floating-point format. The conversion 1278 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1279 Arithmetic. 1280 ------------------------------------------------------------------------------- 1281 */ 1282 floatx80 int64_to_floatx80( int64 a ) 1283 { 1284 flag zSign; 1285 uint64 absA; 1286 int8 shiftCount; 1287 1288 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1289 zSign = ( a < 0 ); 1290 absA = zSign ? - a : a; 1291 shiftCount = countLeadingZeros64( absA ); 1292 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1293 1294 } 1295 1296 #endif 1297 1298 #ifdef FLOAT128 1299 1300 /* 1301 ------------------------------------------------------------------------------- 1302 Returns the result of converting the 64-bit two's complement integer `a' to 1303 the quadruple-precision floating-point format. The conversion is performed 1304 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1305 ------------------------------------------------------------------------------- 1306 */ 1307 float128 int64_to_float128( int64 a ) 1308 { 1309 flag zSign; 1310 uint64 absA; 1311 int8 shiftCount; 1312 int32 zExp; 1313 bits64 zSig0, zSig1; 1314 1315 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1316 zSign = ( a < 0 ); 1317 absA = zSign ? - a : a; 1318 shiftCount = countLeadingZeros64( absA ) + 49; 1319 zExp = 0x406E - shiftCount; 1320 if ( 64 <= shiftCount ) { 1321 zSig1 = 0; 1322 zSig0 = absA; 1323 shiftCount -= 64; 1324 } 1325 else { 1326 zSig1 = absA; 1327 zSig0 = 0; 1328 } 1329 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1330 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1331 1332 } 1333 1334 #endif 1335 #endif /* !SOFTFLOAT_FOR_GCC */ 1336 1337 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1338 /* 1339 ------------------------------------------------------------------------------- 1340 Returns the result of converting the single-precision floating-point value 1341 `a' to the 32-bit two's complement integer format. The conversion is 1342 performed according to the IEC/IEEE Standard for Binary Floating-Point 1343 Arithmetic---which means in particular that the conversion is rounded 1344 according to the current rounding mode. If `a' is a NaN, the largest 1345 positive integer is returned. Otherwise, if the conversion overflows, the 1346 largest integer with the same sign as `a' is returned. 1347 ------------------------------------------------------------------------------- 1348 */ 1349 int32 float32_to_int32( float32 a ) 1350 { 1351 flag aSign; 1352 int16 aExp, shiftCount; 1353 bits32 aSig; 1354 bits64 aSig64; 1355 1356 aSig = extractFloat32Frac( a ); 1357 aExp = extractFloat32Exp( a ); 1358 aSign = extractFloat32Sign( a ); 1359 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1360 if ( aExp ) aSig |= 0x00800000; 1361 shiftCount = 0xAF - aExp; 1362 aSig64 = aSig; 1363 aSig64 <<= 32; 1364 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1365 return roundAndPackInt32( aSign, aSig64 ); 1366 1367 } 1368 #endif /* !SOFTFLOAT_FOR_GCC */ 1369 1370 /* 1371 ------------------------------------------------------------------------------- 1372 Returns the result of converting the single-precision floating-point value 1373 `a' to the 32-bit two's complement integer format. The conversion is 1374 performed according to the IEC/IEEE Standard for Binary Floating-Point 1375 Arithmetic, except that the conversion is always rounded toward zero. 1376 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1377 the conversion overflows, the largest integer with the same sign as `a' is 1378 returned. 1379 ------------------------------------------------------------------------------- 1380 */ 1381 int32 float32_to_int32_round_to_zero( float32 a ) 1382 { 1383 flag aSign; 1384 int16 aExp, shiftCount; 1385 bits32 aSig; 1386 int32 z; 1387 1388 aSig = extractFloat32Frac( a ); 1389 aExp = extractFloat32Exp( a ); 1390 aSign = extractFloat32Sign( a ); 1391 shiftCount = aExp - 0x9E; 1392 if ( 0 <= shiftCount ) { 1393 if ( a != 0xCF000000 ) { 1394 float_raise( float_flag_invalid ); 1395 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1396 } 1397 return (sbits32) 0x80000000; 1398 } 1399 else if ( aExp <= 0x7E ) { 1400 if ( aExp | aSig ) float_set_inexact(); 1401 return 0; 1402 } 1403 aSig = ( aSig | 0x00800000 )<<8; 1404 z = aSig>>( - shiftCount ); 1405 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1406 float_set_inexact(); 1407 } 1408 if ( aSign ) z = - z; 1409 return z; 1410 1411 } 1412 1413 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1414 /* 1415 ------------------------------------------------------------------------------- 1416 Returns the result of converting the single-precision floating-point value 1417 `a' to the 64-bit two's complement integer format. The conversion is 1418 performed according to the IEC/IEEE Standard for Binary Floating-Point 1419 Arithmetic---which means in particular that the conversion is rounded 1420 according to the current rounding mode. If `a' is a NaN, the largest 1421 positive integer is returned. Otherwise, if the conversion overflows, the 1422 largest integer with the same sign as `a' is returned. 1423 ------------------------------------------------------------------------------- 1424 */ 1425 int64 float32_to_int64( float32 a ) 1426 { 1427 flag aSign; 1428 int16 aExp, shiftCount; 1429 bits32 aSig; 1430 bits64 aSig64, aSigExtra; 1431 1432 aSig = extractFloat32Frac( a ); 1433 aExp = extractFloat32Exp( a ); 1434 aSign = extractFloat32Sign( a ); 1435 shiftCount = 0xBE - aExp; 1436 if ( shiftCount < 0 ) { 1437 float_raise( float_flag_invalid ); 1438 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1439 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1440 } 1441 return (sbits64) LIT64( 0x8000000000000000 ); 1442 } 1443 if ( aExp ) aSig |= 0x00800000; 1444 aSig64 = aSig; 1445 aSig64 <<= 40; 1446 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1447 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1448 1449 } 1450 1451 /* 1452 ------------------------------------------------------------------------------- 1453 Returns the result of converting the single-precision floating-point value 1454 `a' to the 64-bit two's complement integer format. The conversion is 1455 performed according to the IEC/IEEE Standard for Binary Floating-Point 1456 Arithmetic, except that the conversion is always rounded toward zero. If 1457 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1458 conversion overflows, the largest integer with the same sign as `a' is 1459 returned. 1460 ------------------------------------------------------------------------------- 1461 */ 1462 int64 float32_to_int64_round_to_zero( float32 a ) 1463 { 1464 flag aSign; 1465 int16 aExp, shiftCount; 1466 bits32 aSig; 1467 bits64 aSig64; 1468 int64 z; 1469 1470 aSig = extractFloat32Frac( a ); 1471 aExp = extractFloat32Exp( a ); 1472 aSign = extractFloat32Sign( a ); 1473 shiftCount = aExp - 0xBE; 1474 if ( 0 <= shiftCount ) { 1475 if ( a != 0xDF000000 ) { 1476 float_raise( float_flag_invalid ); 1477 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1478 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1479 } 1480 } 1481 return (sbits64) LIT64( 0x8000000000000000 ); 1482 } 1483 else if ( aExp <= 0x7E ) { 1484 if ( aExp | aSig ) float_set_inexact(); 1485 return 0; 1486 } 1487 aSig64 = aSig | 0x00800000; 1488 aSig64 <<= 40; 1489 z = aSig64>>( - shiftCount ); 1490 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1491 float_set_inexact(); 1492 } 1493 if ( aSign ) z = - z; 1494 return z; 1495 1496 } 1497 #endif /* !SOFTFLOAT_FOR_GCC */ 1498 1499 /* 1500 ------------------------------------------------------------------------------- 1501 Returns the result of converting the single-precision floating-point value 1502 `a' to the double-precision floating-point format. The conversion is 1503 performed according to the IEC/IEEE Standard for Binary Floating-Point 1504 Arithmetic. 1505 ------------------------------------------------------------------------------- 1506 */ 1507 float64 float32_to_float64( float32 a ) 1508 { 1509 flag aSign; 1510 int16 aExp; 1511 bits32 aSig; 1512 1513 aSig = extractFloat32Frac( a ); 1514 aExp = extractFloat32Exp( a ); 1515 aSign = extractFloat32Sign( a ); 1516 if ( aExp == 0xFF ) { 1517 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1518 return packFloat64( aSign, 0x7FF, 0 ); 1519 } 1520 if ( aExp == 0 ) { 1521 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1522 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1523 --aExp; 1524 } 1525 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1526 1527 } 1528 1529 #ifdef FLOATX80 1530 1531 /* 1532 ------------------------------------------------------------------------------- 1533 Returns the result of converting the single-precision floating-point value 1534 `a' to the extended double-precision floating-point format. The conversion 1535 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1536 Arithmetic. 1537 ------------------------------------------------------------------------------- 1538 */ 1539 floatx80 float32_to_floatx80( float32 a ) 1540 { 1541 flag aSign; 1542 int16 aExp; 1543 bits32 aSig; 1544 1545 aSig = extractFloat32Frac( a ); 1546 aExp = extractFloat32Exp( a ); 1547 aSign = extractFloat32Sign( a ); 1548 if ( aExp == 0xFF ) { 1549 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1550 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1551 } 1552 if ( aExp == 0 ) { 1553 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1554 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1555 } 1556 aSig |= 0x00800000; 1557 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1558 1559 } 1560 1561 #endif 1562 1563 #ifdef FLOAT128 1564 1565 /* 1566 ------------------------------------------------------------------------------- 1567 Returns the result of converting the single-precision floating-point value 1568 `a' to the double-precision floating-point format. The conversion is 1569 performed according to the IEC/IEEE Standard for Binary Floating-Point 1570 Arithmetic. 1571 ------------------------------------------------------------------------------- 1572 */ 1573 float128 float32_to_float128( float32 a ) 1574 { 1575 flag aSign; 1576 int16 aExp; 1577 bits32 aSig; 1578 1579 aSig = extractFloat32Frac( a ); 1580 aExp = extractFloat32Exp( a ); 1581 aSign = extractFloat32Sign( a ); 1582 if ( aExp == 0xFF ) { 1583 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1584 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1585 } 1586 if ( aExp == 0 ) { 1587 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1588 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1589 --aExp; 1590 } 1591 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1592 1593 } 1594 1595 #endif 1596 1597 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1598 /* 1599 ------------------------------------------------------------------------------- 1600 Rounds the single-precision floating-point value `a' to an integer, and 1601 returns the result as a single-precision floating-point value. The 1602 operation is performed according to the IEC/IEEE Standard for Binary 1603 Floating-Point Arithmetic. 1604 ------------------------------------------------------------------------------- 1605 */ 1606 float32 float32_round_to_int( float32 a ) 1607 { 1608 flag aSign; 1609 int16 aExp; 1610 bits32 lastBitMask, roundBitsMask; 1611 int8 roundingMode; 1612 float32 z; 1613 1614 aExp = extractFloat32Exp( a ); 1615 if ( 0x96 <= aExp ) { 1616 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1617 return propagateFloat32NaN( a, a ); 1618 } 1619 return a; 1620 } 1621 if ( aExp <= 0x7E ) { 1622 if ( (bits32) ( a<<1 ) == 0 ) return a; 1623 float_set_inexact(); 1624 aSign = extractFloat32Sign( a ); 1625 switch ( float_rounding_mode() ) { 1626 case float_round_nearest_even: 1627 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1628 return packFloat32( aSign, 0x7F, 0 ); 1629 } 1630 break; 1631 case float_round_down: 1632 return aSign ? 0xBF800000 : 0; 1633 case float_round_up: 1634 return aSign ? 0x80000000 : 0x3F800000; 1635 } 1636 return packFloat32( aSign, 0, 0 ); 1637 } 1638 lastBitMask = 1; 1639 lastBitMask <<= 0x96 - aExp; 1640 roundBitsMask = lastBitMask - 1; 1641 z = a; 1642 roundingMode = float_rounding_mode(); 1643 if ( roundingMode == float_round_nearest_even ) { 1644 z += lastBitMask>>1; 1645 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1646 } 1647 else if ( roundingMode != float_round_to_zero ) { 1648 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1649 z += roundBitsMask; 1650 } 1651 } 1652 z &= ~ roundBitsMask; 1653 if ( z != a ) float_set_inexact(); 1654 return z; 1655 1656 } 1657 #endif /* !SOFTFLOAT_FOR_GCC */ 1658 1659 /* 1660 ------------------------------------------------------------------------------- 1661 Returns the result of adding the absolute values of the single-precision 1662 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1663 before being returned. `zSign' is ignored if the result is a NaN. 1664 The addition is performed according to the IEC/IEEE Standard for Binary 1665 Floating-Point Arithmetic. 1666 ------------------------------------------------------------------------------- 1667 */ 1668 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1669 { 1670 int16 aExp, bExp, zExp; 1671 bits32 aSig, bSig, zSig; 1672 int16 expDiff; 1673 1674 aSig = extractFloat32Frac( a ); 1675 aExp = extractFloat32Exp( a ); 1676 bSig = extractFloat32Frac( b ); 1677 bExp = extractFloat32Exp( b ); 1678 expDiff = aExp - bExp; 1679 aSig <<= 6; 1680 bSig <<= 6; 1681 if ( 0 < expDiff ) { 1682 if ( aExp == 0xFF ) { 1683 if ( aSig ) return propagateFloat32NaN( a, b ); 1684 return a; 1685 } 1686 if ( bExp == 0 ) { 1687 --expDiff; 1688 } 1689 else { 1690 bSig |= 0x20000000; 1691 } 1692 shift32RightJamming( bSig, expDiff, &bSig ); 1693 zExp = aExp; 1694 } 1695 else if ( expDiff < 0 ) { 1696 if ( bExp == 0xFF ) { 1697 if ( bSig ) return propagateFloat32NaN( a, b ); 1698 return packFloat32( zSign, 0xFF, 0 ); 1699 } 1700 if ( aExp == 0 ) { 1701 ++expDiff; 1702 } 1703 else { 1704 aSig |= 0x20000000; 1705 } 1706 shift32RightJamming( aSig, - expDiff, &aSig ); 1707 zExp = bExp; 1708 } 1709 else { 1710 if ( aExp == 0xFF ) { 1711 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1712 return a; 1713 } 1714 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1715 zSig = 0x40000000 + aSig + bSig; 1716 zExp = aExp; 1717 goto roundAndPack; 1718 } 1719 aSig |= 0x20000000; 1720 zSig = ( aSig + bSig )<<1; 1721 --zExp; 1722 if ( (sbits32) zSig < 0 ) { 1723 zSig = aSig + bSig; 1724 ++zExp; 1725 } 1726 roundAndPack: 1727 return roundAndPackFloat32( zSign, zExp, zSig ); 1728 1729 } 1730 1731 /* 1732 ------------------------------------------------------------------------------- 1733 Returns the result of subtracting the absolute values of the single- 1734 precision floating-point values `a' and `b'. If `zSign' is 1, the 1735 difference is negated before being returned. `zSign' is ignored if the 1736 result is a NaN. The subtraction is performed according to the IEC/IEEE 1737 Standard for Binary Floating-Point Arithmetic. 1738 ------------------------------------------------------------------------------- 1739 */ 1740 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1741 { 1742 int16 aExp, bExp, zExp; 1743 bits32 aSig, bSig, zSig; 1744 int16 expDiff; 1745 1746 aSig = extractFloat32Frac( a ); 1747 aExp = extractFloat32Exp( a ); 1748 bSig = extractFloat32Frac( b ); 1749 bExp = extractFloat32Exp( b ); 1750 expDiff = aExp - bExp; 1751 aSig <<= 7; 1752 bSig <<= 7; 1753 if ( 0 < expDiff ) goto aExpBigger; 1754 if ( expDiff < 0 ) goto bExpBigger; 1755 if ( aExp == 0xFF ) { 1756 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1757 float_raise( float_flag_invalid ); 1758 return float32_default_nan; 1759 } 1760 if ( aExp == 0 ) { 1761 aExp = 1; 1762 bExp = 1; 1763 } 1764 if ( bSig < aSig ) goto aBigger; 1765 if ( aSig < bSig ) goto bBigger; 1766 return packFloat32( float_rounding_mode() == float_round_down, 0, 0 ); 1767 bExpBigger: 1768 if ( bExp == 0xFF ) { 1769 if ( bSig ) return propagateFloat32NaN( a, b ); 1770 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1771 } 1772 if ( aExp == 0 ) { 1773 ++expDiff; 1774 } 1775 else { 1776 aSig |= 0x40000000; 1777 } 1778 shift32RightJamming( aSig, - expDiff, &aSig ); 1779 bSig |= 0x40000000; 1780 bBigger: 1781 zSig = bSig - aSig; 1782 zExp = bExp; 1783 zSign ^= 1; 1784 goto normalizeRoundAndPack; 1785 aExpBigger: 1786 if ( aExp == 0xFF ) { 1787 if ( aSig ) return propagateFloat32NaN( a, b ); 1788 return a; 1789 } 1790 if ( bExp == 0 ) { 1791 --expDiff; 1792 } 1793 else { 1794 bSig |= 0x40000000; 1795 } 1796 shift32RightJamming( bSig, expDiff, &bSig ); 1797 aSig |= 0x40000000; 1798 aBigger: 1799 zSig = aSig - bSig; 1800 zExp = aExp; 1801 normalizeRoundAndPack: 1802 --zExp; 1803 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1804 1805 } 1806 1807 /* 1808 ------------------------------------------------------------------------------- 1809 Returns the result of adding the single-precision floating-point values `a' 1810 and `b'. The operation is performed according to the IEC/IEEE Standard for 1811 Binary Floating-Point Arithmetic. 1812 ------------------------------------------------------------------------------- 1813 */ 1814 float32 float32_add( float32 a, float32 b ) 1815 { 1816 flag aSign, bSign; 1817 1818 aSign = extractFloat32Sign( a ); 1819 bSign = extractFloat32Sign( b ); 1820 if ( aSign == bSign ) { 1821 return addFloat32Sigs( a, b, aSign ); 1822 } 1823 else { 1824 return subFloat32Sigs( a, b, aSign ); 1825 } 1826 1827 } 1828 1829 /* 1830 ------------------------------------------------------------------------------- 1831 Returns the result of subtracting the single-precision floating-point values 1832 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1833 for Binary Floating-Point Arithmetic. 1834 ------------------------------------------------------------------------------- 1835 */ 1836 float32 float32_sub( float32 a, float32 b ) 1837 { 1838 flag aSign, bSign; 1839 1840 aSign = extractFloat32Sign( a ); 1841 bSign = extractFloat32Sign( b ); 1842 if ( aSign == bSign ) { 1843 return subFloat32Sigs( a, b, aSign ); 1844 } 1845 else { 1846 return addFloat32Sigs( a, b, aSign ); 1847 } 1848 1849 } 1850 1851 /* 1852 ------------------------------------------------------------------------------- 1853 Returns the result of multiplying the single-precision floating-point values 1854 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1855 for Binary Floating-Point Arithmetic. 1856 ------------------------------------------------------------------------------- 1857 */ 1858 float32 float32_mul( float32 a, float32 b ) 1859 { 1860 flag aSign, bSign, zSign; 1861 int16 aExp, bExp, zExp; 1862 bits32 aSig, bSig; 1863 bits64 zSig64; 1864 bits32 zSig; 1865 1866 aSig = extractFloat32Frac( a ); 1867 aExp = extractFloat32Exp( a ); 1868 aSign = extractFloat32Sign( a ); 1869 bSig = extractFloat32Frac( b ); 1870 bExp = extractFloat32Exp( b ); 1871 bSign = extractFloat32Sign( b ); 1872 zSign = aSign ^ bSign; 1873 if ( aExp == 0xFF ) { 1874 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1875 return propagateFloat32NaN( a, b ); 1876 } 1877 if ( ( bExp | bSig ) == 0 ) { 1878 float_raise( float_flag_invalid ); 1879 return float32_default_nan; 1880 } 1881 return packFloat32( zSign, 0xFF, 0 ); 1882 } 1883 if ( bExp == 0xFF ) { 1884 if ( bSig ) return propagateFloat32NaN( a, b ); 1885 if ( ( aExp | aSig ) == 0 ) { 1886 float_raise( float_flag_invalid ); 1887 return float32_default_nan; 1888 } 1889 return packFloat32( zSign, 0xFF, 0 ); 1890 } 1891 if ( aExp == 0 ) { 1892 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1893 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1894 } 1895 if ( bExp == 0 ) { 1896 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1897 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1898 } 1899 zExp = aExp + bExp - 0x7F; 1900 aSig = ( aSig | 0x00800000 )<<7; 1901 bSig = ( bSig | 0x00800000 )<<8; 1902 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1903 zSig = zSig64; 1904 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1905 zSig <<= 1; 1906 --zExp; 1907 } 1908 return roundAndPackFloat32( zSign, zExp, zSig ); 1909 1910 } 1911 1912 /* 1913 ------------------------------------------------------------------------------- 1914 Returns the result of dividing the single-precision floating-point value `a' 1915 by the corresponding value `b'. The operation is performed according to the 1916 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1917 ------------------------------------------------------------------------------- 1918 */ 1919 float32 float32_div( float32 a, float32 b ) 1920 { 1921 flag aSign, bSign, zSign; 1922 int16 aExp, bExp, zExp; 1923 bits32 aSig, bSig, zSig; 1924 1925 aSig = extractFloat32Frac( a ); 1926 aExp = extractFloat32Exp( a ); 1927 aSign = extractFloat32Sign( a ); 1928 bSig = extractFloat32Frac( b ); 1929 bExp = extractFloat32Exp( b ); 1930 bSign = extractFloat32Sign( b ); 1931 zSign = aSign ^ bSign; 1932 if ( aExp == 0xFF ) { 1933 if ( aSig ) return propagateFloat32NaN( a, b ); 1934 if ( bExp == 0xFF ) { 1935 if ( bSig ) return propagateFloat32NaN( a, b ); 1936 float_raise( float_flag_invalid ); 1937 return float32_default_nan; 1938 } 1939 return packFloat32( zSign, 0xFF, 0 ); 1940 } 1941 if ( bExp == 0xFF ) { 1942 if ( bSig ) return propagateFloat32NaN( a, b ); 1943 return packFloat32( zSign, 0, 0 ); 1944 } 1945 if ( bExp == 0 ) { 1946 if ( bSig == 0 ) { 1947 if ( ( aExp | aSig ) == 0 ) { 1948 float_raise( float_flag_invalid ); 1949 return float32_default_nan; 1950 } 1951 float_raise( float_flag_divbyzero ); 1952 return packFloat32( zSign, 0xFF, 0 ); 1953 } 1954 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1955 } 1956 if ( aExp == 0 ) { 1957 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1958 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1959 } 1960 zExp = aExp - bExp + 0x7D; 1961 aSig = ( aSig | 0x00800000 )<<7; 1962 bSig = ( bSig | 0x00800000 )<<8; 1963 if ( bSig <= ( aSig + aSig ) ) { 1964 aSig >>= 1; 1965 ++zExp; 1966 } 1967 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 1968 if ( ( zSig & 0x3F ) == 0 ) { 1969 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 1970 } 1971 return roundAndPackFloat32( zSign, zExp, zSig ); 1972 1973 } 1974 1975 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1976 /* 1977 ------------------------------------------------------------------------------- 1978 Returns the remainder of the single-precision floating-point value `a' 1979 with respect to the corresponding value `b'. The operation is performed 1980 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1981 ------------------------------------------------------------------------------- 1982 */ 1983 float32 float32_rem( float32 a, float32 b ) 1984 { 1985 flag aSign, bSign, zSign; 1986 int16 aExp, bExp, expDiff; 1987 bits32 aSig, bSig; 1988 bits32 q; 1989 bits64 aSig64, bSig64, q64; 1990 bits32 alternateASig; 1991 sbits32 sigMean; 1992 1993 aSig = extractFloat32Frac( a ); 1994 aExp = extractFloat32Exp( a ); 1995 aSign = extractFloat32Sign( a ); 1996 bSig = extractFloat32Frac( b ); 1997 bExp = extractFloat32Exp( b ); 1998 bSign = extractFloat32Sign( b ); 1999 if ( aExp == 0xFF ) { 2000 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2001 return propagateFloat32NaN( a, b ); 2002 } 2003 float_raise( float_flag_invalid ); 2004 return float32_default_nan; 2005 } 2006 if ( bExp == 0xFF ) { 2007 if ( bSig ) return propagateFloat32NaN( a, b ); 2008 return a; 2009 } 2010 if ( bExp == 0 ) { 2011 if ( bSig == 0 ) { 2012 float_raise( float_flag_invalid ); 2013 return float32_default_nan; 2014 } 2015 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2016 } 2017 if ( aExp == 0 ) { 2018 if ( aSig == 0 ) return a; 2019 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2020 } 2021 expDiff = aExp - bExp; 2022 aSig |= 0x00800000; 2023 bSig |= 0x00800000; 2024 if ( expDiff < 32 ) { 2025 aSig <<= 8; 2026 bSig <<= 8; 2027 if ( expDiff < 0 ) { 2028 if ( expDiff < -1 ) return a; 2029 aSig >>= 1; 2030 } 2031 q = ( bSig <= aSig ); 2032 if ( q ) aSig -= bSig; 2033 if ( 0 < expDiff ) { 2034 q = ( ( (bits64) aSig )<<32 ) / bSig; 2035 q >>= 32 - expDiff; 2036 bSig >>= 2; 2037 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2038 } 2039 else { 2040 aSig >>= 2; 2041 bSig >>= 2; 2042 } 2043 } 2044 else { 2045 if ( bSig <= aSig ) aSig -= bSig; 2046 aSig64 = ( (bits64) aSig )<<40; 2047 bSig64 = ( (bits64) bSig )<<40; 2048 expDiff -= 64; 2049 while ( 0 < expDiff ) { 2050 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2051 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2052 aSig64 = - ( ( bSig * q64 )<<38 ); 2053 expDiff -= 62; 2054 } 2055 expDiff += 64; 2056 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2057 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2058 q = q64>>( 64 - expDiff ); 2059 bSig <<= 6; 2060 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2061 } 2062 do { 2063 alternateASig = aSig; 2064 ++q; 2065 aSig -= bSig; 2066 } while ( 0 <= (sbits32) aSig ); 2067 sigMean = aSig + alternateASig; 2068 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2069 aSig = alternateASig; 2070 } 2071 zSign = ( (sbits32) aSig < 0 ); 2072 if ( zSign ) aSig = - aSig; 2073 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2074 2075 } 2076 #endif /* !SOFTFLOAT_FOR_GCC */ 2077 2078 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2079 /* 2080 ------------------------------------------------------------------------------- 2081 Returns the square root of the single-precision floating-point value `a'. 2082 The operation is performed according to the IEC/IEEE Standard for Binary 2083 Floating-Point Arithmetic. 2084 ------------------------------------------------------------------------------- 2085 */ 2086 float32 float32_sqrt( float32 a ) 2087 { 2088 flag aSign; 2089 int16 aExp, zExp; 2090 bits32 aSig, zSig; 2091 bits64 rem, term; 2092 2093 aSig = extractFloat32Frac( a ); 2094 aExp = extractFloat32Exp( a ); 2095 aSign = extractFloat32Sign( a ); 2096 if ( aExp == 0xFF ) { 2097 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2098 if ( ! aSign ) return a; 2099 float_raise( float_flag_invalid ); 2100 return float32_default_nan; 2101 } 2102 if ( aSign ) { 2103 if ( ( aExp | aSig ) == 0 ) return a; 2104 float_raise( float_flag_invalid ); 2105 return float32_default_nan; 2106 } 2107 if ( aExp == 0 ) { 2108 if ( aSig == 0 ) return 0; 2109 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2110 } 2111 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2112 aSig = ( aSig | 0x00800000 )<<8; 2113 zSig = estimateSqrt32( aExp, aSig ) + 2; 2114 if ( ( zSig & 0x7F ) <= 5 ) { 2115 if ( zSig < 2 ) { 2116 zSig = 0x7FFFFFFF; 2117 goto roundAndPack; 2118 } 2119 aSig >>= aExp & 1; 2120 term = ( (bits64) zSig ) * zSig; 2121 rem = ( ( (bits64) aSig )<<32 ) - term; 2122 while ( (sbits64) rem < 0 ) { 2123 --zSig; 2124 rem += ( ( (bits64) zSig )<<1 ) | 1; 2125 } 2126 zSig |= ( rem != 0 ); 2127 } 2128 shift32RightJamming( zSig, 1, &zSig ); 2129 roundAndPack: 2130 return roundAndPackFloat32( 0, zExp, zSig ); 2131 2132 } 2133 #endif /* !SOFTFLOAT_FOR_GCC */ 2134 2135 /* 2136 ------------------------------------------------------------------------------- 2137 Returns 1 if the single-precision floating-point value `a' is equal to 2138 the corresponding value `b', and 0 otherwise. The comparison is performed 2139 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2140 ------------------------------------------------------------------------------- 2141 */ 2142 flag float32_eq( float32 a, float32 b ) 2143 { 2144 2145 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2146 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2147 ) { 2148 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2149 float_raise( float_flag_invalid ); 2150 } 2151 return 0; 2152 } 2153 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2154 2155 } 2156 2157 /* 2158 ------------------------------------------------------------------------------- 2159 Returns 1 if the single-precision floating-point value `a' is less than 2160 or equal to the corresponding value `b', and 0 otherwise. The comparison 2161 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2162 Arithmetic. 2163 ------------------------------------------------------------------------------- 2164 */ 2165 flag float32_le( float32 a, float32 b ) 2166 { 2167 flag aSign, bSign; 2168 2169 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2170 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2171 ) { 2172 float_raise( float_flag_invalid ); 2173 return 0; 2174 } 2175 aSign = extractFloat32Sign( a ); 2176 bSign = extractFloat32Sign( b ); 2177 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2178 return ( a == b ) || ( aSign ^ ( a < b ) ); 2179 2180 } 2181 2182 /* 2183 ------------------------------------------------------------------------------- 2184 Returns 1 if the single-precision floating-point value `a' is less than 2185 the corresponding value `b', and 0 otherwise. The comparison is performed 2186 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2187 ------------------------------------------------------------------------------- 2188 */ 2189 flag float32_lt( float32 a, float32 b ) 2190 { 2191 flag aSign, bSign; 2192 2193 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2194 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2195 ) { 2196 float_raise( float_flag_invalid ); 2197 return 0; 2198 } 2199 aSign = extractFloat32Sign( a ); 2200 bSign = extractFloat32Sign( b ); 2201 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2202 return ( a != b ) && ( aSign ^ ( a < b ) ); 2203 2204 } 2205 2206 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2207 /* 2208 ------------------------------------------------------------------------------- 2209 Returns 1 if the single-precision floating-point value `a' is equal to 2210 the corresponding value `b', and 0 otherwise. The invalid exception is 2211 raised if either operand is a NaN. Otherwise, the comparison is performed 2212 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2213 ------------------------------------------------------------------------------- 2214 */ 2215 flag float32_eq_signaling( float32 a, float32 b ) 2216 { 2217 2218 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2219 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2220 ) { 2221 float_raise( float_flag_invalid ); 2222 return 0; 2223 } 2224 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2225 2226 } 2227 2228 /* 2229 ------------------------------------------------------------------------------- 2230 Returns 1 if the single-precision floating-point value `a' is less than or 2231 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2232 cause an exception. Otherwise, the comparison is performed according to the 2233 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2234 ------------------------------------------------------------------------------- 2235 */ 2236 flag float32_le_quiet( float32 a, float32 b ) 2237 { 2238 flag aSign, bSign; 2239 2240 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2241 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2242 ) { 2243 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2244 float_raise( float_flag_invalid ); 2245 } 2246 return 0; 2247 } 2248 aSign = extractFloat32Sign( a ); 2249 bSign = extractFloat32Sign( b ); 2250 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2251 return ( a == b ) || ( aSign ^ ( a < b ) ); 2252 2253 } 2254 2255 /* 2256 ------------------------------------------------------------------------------- 2257 Returns 1 if the single-precision floating-point value `a' is less than 2258 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2259 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2260 Standard for Binary Floating-Point Arithmetic. 2261 ------------------------------------------------------------------------------- 2262 */ 2263 flag float32_lt_quiet( float32 a, float32 b ) 2264 { 2265 flag aSign, bSign; 2266 2267 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2268 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2269 ) { 2270 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2271 float_raise( float_flag_invalid ); 2272 } 2273 return 0; 2274 } 2275 aSign = extractFloat32Sign( a ); 2276 bSign = extractFloat32Sign( b ); 2277 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2278 return ( a != b ) && ( aSign ^ ( a < b ) ); 2279 2280 } 2281 #endif /* !SOFTFLOAT_FOR_GCC */ 2282 2283 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2284 /* 2285 ------------------------------------------------------------------------------- 2286 Returns the result of converting the double-precision floating-point value 2287 `a' to the 32-bit two's complement integer format. The conversion is 2288 performed according to the IEC/IEEE Standard for Binary Floating-Point 2289 Arithmetic---which means in particular that the conversion is rounded 2290 according to the current rounding mode. If `a' is a NaN, the largest 2291 positive integer is returned. Otherwise, if the conversion overflows, the 2292 largest integer with the same sign as `a' is returned. 2293 ------------------------------------------------------------------------------- 2294 */ 2295 int32 float64_to_int32( float64 a ) 2296 { 2297 flag aSign; 2298 int16 aExp, shiftCount; 2299 bits64 aSig; 2300 2301 aSig = extractFloat64Frac( a ); 2302 aExp = extractFloat64Exp( a ); 2303 aSign = extractFloat64Sign( a ); 2304 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2305 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2306 shiftCount = 0x42C - aExp; 2307 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2308 return roundAndPackInt32( aSign, aSig ); 2309 2310 } 2311 #endif /* !SOFTFLOAT_FOR_GCC */ 2312 2313 /* 2314 ------------------------------------------------------------------------------- 2315 Returns the result of converting the double-precision floating-point value 2316 `a' to the 32-bit two's complement integer format. The conversion is 2317 performed according to the IEC/IEEE Standard for Binary Floating-Point 2318 Arithmetic, except that the conversion is always rounded toward zero. 2319 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2320 the conversion overflows, the largest integer with the same sign as `a' is 2321 returned. 2322 ------------------------------------------------------------------------------- 2323 */ 2324 int32 float64_to_int32_round_to_zero( float64 a ) 2325 { 2326 flag aSign; 2327 int16 aExp, shiftCount; 2328 bits64 aSig, savedASig; 2329 int32 z; 2330 2331 aSig = extractFloat64Frac( a ); 2332 aExp = extractFloat64Exp( a ); 2333 aSign = extractFloat64Sign( a ); 2334 if ( 0x41E < aExp ) { 2335 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2336 goto invalid; 2337 } 2338 else if ( aExp < 0x3FF ) { 2339 if ( aExp || aSig ) float_set_inexact(); 2340 return 0; 2341 } 2342 aSig |= LIT64( 0x0010000000000000 ); 2343 shiftCount = 0x433 - aExp; 2344 savedASig = aSig; 2345 aSig >>= shiftCount; 2346 z = aSig; 2347 if ( aSign ) z = - z; 2348 if ( ( z < 0 ) ^ aSign ) { 2349 invalid: 2350 float_raise( float_flag_invalid ); 2351 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2352 } 2353 if ( ( aSig<<shiftCount ) != savedASig ) { 2354 float_set_inexact(); 2355 } 2356 return z; 2357 2358 } 2359 2360 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2361 /* 2362 ------------------------------------------------------------------------------- 2363 Returns the result of converting the double-precision floating-point value 2364 `a' to the 64-bit two's complement integer format. The conversion is 2365 performed according to the IEC/IEEE Standard for Binary Floating-Point 2366 Arithmetic---which means in particular that the conversion is rounded 2367 according to the current rounding mode. If `a' is a NaN, the largest 2368 positive integer is returned. Otherwise, if the conversion overflows, the 2369 largest integer with the same sign as `a' is returned. 2370 ------------------------------------------------------------------------------- 2371 */ 2372 int64 float64_to_int64( float64 a ) 2373 { 2374 flag aSign; 2375 int16 aExp, shiftCount; 2376 bits64 aSig, aSigExtra; 2377 2378 aSig = extractFloat64Frac( a ); 2379 aExp = extractFloat64Exp( a ); 2380 aSign = extractFloat64Sign( a ); 2381 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2382 shiftCount = 0x433 - aExp; 2383 if ( shiftCount <= 0 ) { 2384 if ( 0x43E < aExp ) { 2385 float_raise( float_flag_invalid ); 2386 if ( ! aSign 2387 || ( ( aExp == 0x7FF ) 2388 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2389 ) { 2390 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2391 } 2392 return (sbits64) LIT64( 0x8000000000000000 ); 2393 } 2394 aSigExtra = 0; 2395 aSig <<= - shiftCount; 2396 } 2397 else { 2398 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2399 } 2400 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2401 2402 } 2403 2404 /* 2405 ------------------------------------------------------------------------------- 2406 Returns the result of converting the double-precision floating-point value 2407 `a' to the 64-bit two's complement integer format. The conversion is 2408 performed according to the IEC/IEEE Standard for Binary Floating-Point 2409 Arithmetic, except that the conversion is always rounded toward zero. 2410 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2411 the conversion overflows, the largest integer with the same sign as `a' is 2412 returned. 2413 ------------------------------------------------------------------------------- 2414 */ 2415 int64 float64_to_int64_round_to_zero( float64 a ) 2416 { 2417 flag aSign; 2418 int16 aExp, shiftCount; 2419 bits64 aSig; 2420 int64 z; 2421 2422 aSig = extractFloat64Frac( a ); 2423 aExp = extractFloat64Exp( a ); 2424 aSign = extractFloat64Sign( a ); 2425 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2426 shiftCount = aExp - 0x433; 2427 if ( 0 <= shiftCount ) { 2428 if ( 0x43E <= aExp ) { 2429 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2430 float_raise( float_flag_invalid ); 2431 if ( ! aSign 2432 || ( ( aExp == 0x7FF ) 2433 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2434 ) { 2435 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2436 } 2437 } 2438 return (sbits64) LIT64( 0x8000000000000000 ); 2439 } 2440 z = aSig<<shiftCount; 2441 } 2442 else { 2443 if ( aExp < 0x3FE ) { 2444 if ( aExp | aSig ) float_set_inexact(); 2445 return 0; 2446 } 2447 z = aSig>>( - shiftCount ); 2448 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2449 float_set_inexact(); 2450 } 2451 } 2452 if ( aSign ) z = - z; 2453 return z; 2454 2455 } 2456 #endif /* !SOFTFLOAT_FOR_GCC */ 2457 2458 /* 2459 ------------------------------------------------------------------------------- 2460 Returns the result of converting the double-precision floating-point value 2461 `a' to the single-precision floating-point format. The conversion is 2462 performed according to the IEC/IEEE Standard for Binary Floating-Point 2463 Arithmetic. 2464 ------------------------------------------------------------------------------- 2465 */ 2466 float32 float64_to_float32( float64 a ) 2467 { 2468 flag aSign; 2469 int16 aExp; 2470 bits64 aSig; 2471 bits32 zSig; 2472 2473 aSig = extractFloat64Frac( a ); 2474 aExp = extractFloat64Exp( a ); 2475 aSign = extractFloat64Sign( a ); 2476 if ( aExp == 0x7FF ) { 2477 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2478 return packFloat32( aSign, 0xFF, 0 ); 2479 } 2480 shift64RightJamming( aSig, 22, &aSig ); 2481 zSig = aSig; 2482 if ( aExp || zSig ) { 2483 zSig |= 0x40000000; 2484 aExp -= 0x381; 2485 } 2486 return roundAndPackFloat32( aSign, aExp, zSig ); 2487 2488 } 2489 2490 #ifdef FLOATX80 2491 2492 /* 2493 ------------------------------------------------------------------------------- 2494 Returns the result of converting the double-precision floating-point value 2495 `a' to the extended double-precision floating-point format. The conversion 2496 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2497 Arithmetic. 2498 ------------------------------------------------------------------------------- 2499 */ 2500 floatx80 float64_to_floatx80( float64 a ) 2501 { 2502 flag aSign; 2503 int16 aExp; 2504 bits64 aSig; 2505 2506 aSig = extractFloat64Frac( a ); 2507 aExp = extractFloat64Exp( a ); 2508 aSign = extractFloat64Sign( a ); 2509 if ( aExp == 0x7FF ) { 2510 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2511 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2512 } 2513 if ( aExp == 0 ) { 2514 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2515 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2516 } 2517 return 2518 packFloatx80( 2519 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2520 2521 } 2522 2523 #endif 2524 2525 #ifdef FLOAT128 2526 2527 /* 2528 ------------------------------------------------------------------------------- 2529 Returns the result of converting the double-precision floating-point value 2530 `a' to the quadruple-precision floating-point format. The conversion is 2531 performed according to the IEC/IEEE Standard for Binary Floating-Point 2532 Arithmetic. 2533 ------------------------------------------------------------------------------- 2534 */ 2535 float128 float64_to_float128( float64 a ) 2536 { 2537 flag aSign; 2538 int16 aExp; 2539 bits64 aSig, zSig0, zSig1; 2540 2541 aSig = extractFloat64Frac( a ); 2542 aExp = extractFloat64Exp( a ); 2543 aSign = extractFloat64Sign( a ); 2544 if ( aExp == 0x7FF ) { 2545 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2546 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2547 } 2548 if ( aExp == 0 ) { 2549 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2550 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2551 --aExp; 2552 } 2553 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2554 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2555 2556 } 2557 2558 #endif 2559 2560 #ifndef SOFTFLOAT_FOR_GCC 2561 /* 2562 ------------------------------------------------------------------------------- 2563 Rounds the double-precision floating-point value `a' to an integer, and 2564 returns the result as a double-precision floating-point value. The 2565 operation is performed according to the IEC/IEEE Standard for Binary 2566 Floating-Point Arithmetic. 2567 ------------------------------------------------------------------------------- 2568 */ 2569 float64 float64_round_to_int( float64 a ) 2570 { 2571 flag aSign; 2572 int16 aExp; 2573 bits64 lastBitMask, roundBitsMask; 2574 int8 roundingMode; 2575 float64 z; 2576 2577 aExp = extractFloat64Exp( a ); 2578 if ( 0x433 <= aExp ) { 2579 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2580 return propagateFloat64NaN( a, a ); 2581 } 2582 return a; 2583 } 2584 if ( aExp < 0x3FF ) { 2585 if ( (bits64) ( a<<1 ) == 0 ) return a; 2586 float_set_inexact(); 2587 aSign = extractFloat64Sign( a ); 2588 switch ( float_rounding_mode() ) { 2589 case float_round_nearest_even: 2590 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2591 return packFloat64( aSign, 0x3FF, 0 ); 2592 } 2593 break; 2594 case float_round_down: 2595 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2596 case float_round_up: 2597 return 2598 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2599 } 2600 return packFloat64( aSign, 0, 0 ); 2601 } 2602 lastBitMask = 1; 2603 lastBitMask <<= 0x433 - aExp; 2604 roundBitsMask = lastBitMask - 1; 2605 z = a; 2606 roundingMode = float_rounding_mode(); 2607 if ( roundingMode == float_round_nearest_even ) { 2608 z += lastBitMask>>1; 2609 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2610 } 2611 else if ( roundingMode != float_round_to_zero ) { 2612 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2613 z += roundBitsMask; 2614 } 2615 } 2616 z &= ~ roundBitsMask; 2617 if ( z != a ) float_set_inexact(); 2618 return z; 2619 2620 } 2621 #endif 2622 2623 /* 2624 ------------------------------------------------------------------------------- 2625 Returns the result of adding the absolute values of the double-precision 2626 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2627 before being returned. `zSign' is ignored if the result is a NaN. 2628 The addition is performed according to the IEC/IEEE Standard for Binary 2629 Floating-Point Arithmetic. 2630 ------------------------------------------------------------------------------- 2631 */ 2632 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2633 { 2634 int16 aExp, bExp, zExp; 2635 bits64 aSig, bSig, zSig; 2636 int16 expDiff; 2637 2638 aSig = extractFloat64Frac( a ); 2639 aExp = extractFloat64Exp( a ); 2640 bSig = extractFloat64Frac( b ); 2641 bExp = extractFloat64Exp( b ); 2642 expDiff = aExp - bExp; 2643 aSig <<= 9; 2644 bSig <<= 9; 2645 if ( 0 < expDiff ) { 2646 if ( aExp == 0x7FF ) { 2647 if ( aSig ) return propagateFloat64NaN( a, b ); 2648 return a; 2649 } 2650 if ( bExp == 0 ) { 2651 --expDiff; 2652 } 2653 else { 2654 bSig |= LIT64( 0x2000000000000000 ); 2655 } 2656 shift64RightJamming( bSig, expDiff, &bSig ); 2657 zExp = aExp; 2658 } 2659 else if ( expDiff < 0 ) { 2660 if ( bExp == 0x7FF ) { 2661 if ( bSig ) return propagateFloat64NaN( a, b ); 2662 return packFloat64( zSign, 0x7FF, 0 ); 2663 } 2664 if ( aExp == 0 ) { 2665 ++expDiff; 2666 } 2667 else { 2668 aSig |= LIT64( 0x2000000000000000 ); 2669 } 2670 shift64RightJamming( aSig, - expDiff, &aSig ); 2671 zExp = bExp; 2672 } 2673 else { 2674 if ( aExp == 0x7FF ) { 2675 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2676 return a; 2677 } 2678 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2679 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2680 zExp = aExp; 2681 goto roundAndPack; 2682 } 2683 aSig |= LIT64( 0x2000000000000000 ); 2684 zSig = ( aSig + bSig )<<1; 2685 --zExp; 2686 if ( (sbits64) zSig < 0 ) { 2687 zSig = aSig + bSig; 2688 ++zExp; 2689 } 2690 roundAndPack: 2691 return roundAndPackFloat64( zSign, zExp, zSig ); 2692 2693 } 2694 2695 /* 2696 ------------------------------------------------------------------------------- 2697 Returns the result of subtracting the absolute values of the double- 2698 precision floating-point values `a' and `b'. If `zSign' is 1, the 2699 difference is negated before being returned. `zSign' is ignored if the 2700 result is a NaN. The subtraction is performed according to the IEC/IEEE 2701 Standard for Binary Floating-Point Arithmetic. 2702 ------------------------------------------------------------------------------- 2703 */ 2704 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2705 { 2706 int16 aExp, bExp, zExp; 2707 bits64 aSig, bSig, zSig; 2708 int16 expDiff; 2709 2710 aSig = extractFloat64Frac( a ); 2711 aExp = extractFloat64Exp( a ); 2712 bSig = extractFloat64Frac( b ); 2713 bExp = extractFloat64Exp( b ); 2714 expDiff = aExp - bExp; 2715 aSig <<= 10; 2716 bSig <<= 10; 2717 if ( 0 < expDiff ) goto aExpBigger; 2718 if ( expDiff < 0 ) goto bExpBigger; 2719 if ( aExp == 0x7FF ) { 2720 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2721 float_raise( float_flag_invalid ); 2722 return float64_default_nan; 2723 } 2724 if ( aExp == 0 ) { 2725 aExp = 1; 2726 bExp = 1; 2727 } 2728 if ( bSig < aSig ) goto aBigger; 2729 if ( aSig < bSig ) goto bBigger; 2730 return packFloat64( float_rounding_mode() == float_round_down, 0, 0 ); 2731 bExpBigger: 2732 if ( bExp == 0x7FF ) { 2733 if ( bSig ) return propagateFloat64NaN( a, b ); 2734 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2735 } 2736 if ( aExp == 0 ) { 2737 ++expDiff; 2738 } 2739 else { 2740 aSig |= LIT64( 0x4000000000000000 ); 2741 } 2742 shift64RightJamming( aSig, - expDiff, &aSig ); 2743 bSig |= LIT64( 0x4000000000000000 ); 2744 bBigger: 2745 zSig = bSig - aSig; 2746 zExp = bExp; 2747 zSign ^= 1; 2748 goto normalizeRoundAndPack; 2749 aExpBigger: 2750 if ( aExp == 0x7FF ) { 2751 if ( aSig ) return propagateFloat64NaN( a, b ); 2752 return a; 2753 } 2754 if ( bExp == 0 ) { 2755 --expDiff; 2756 } 2757 else { 2758 bSig |= LIT64( 0x4000000000000000 ); 2759 } 2760 shift64RightJamming( bSig, expDiff, &bSig ); 2761 aSig |= LIT64( 0x4000000000000000 ); 2762 aBigger: 2763 zSig = aSig - bSig; 2764 zExp = aExp; 2765 normalizeRoundAndPack: 2766 --zExp; 2767 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2768 2769 } 2770 2771 /* 2772 ------------------------------------------------------------------------------- 2773 Returns the result of adding the double-precision floating-point values `a' 2774 and `b'. The operation is performed according to the IEC/IEEE Standard for 2775 Binary Floating-Point Arithmetic. 2776 ------------------------------------------------------------------------------- 2777 */ 2778 float64 float64_add( float64 a, float64 b ) 2779 { 2780 flag aSign, bSign; 2781 2782 aSign = extractFloat64Sign( a ); 2783 bSign = extractFloat64Sign( b ); 2784 if ( aSign == bSign ) { 2785 return addFloat64Sigs( a, b, aSign ); 2786 } 2787 else { 2788 return subFloat64Sigs( a, b, aSign ); 2789 } 2790 2791 } 2792 2793 /* 2794 ------------------------------------------------------------------------------- 2795 Returns the result of subtracting the double-precision floating-point values 2796 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2797 for Binary Floating-Point Arithmetic. 2798 ------------------------------------------------------------------------------- 2799 */ 2800 float64 float64_sub( float64 a, float64 b ) 2801 { 2802 flag aSign, bSign; 2803 2804 aSign = extractFloat64Sign( a ); 2805 bSign = extractFloat64Sign( b ); 2806 if ( aSign == bSign ) { 2807 return subFloat64Sigs( a, b, aSign ); 2808 } 2809 else { 2810 return addFloat64Sigs( a, b, aSign ); 2811 } 2812 2813 } 2814 2815 /* 2816 ------------------------------------------------------------------------------- 2817 Returns the result of multiplying the double-precision floating-point values 2818 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2819 for Binary Floating-Point Arithmetic. 2820 ------------------------------------------------------------------------------- 2821 */ 2822 float64 float64_mul( float64 a, float64 b ) 2823 { 2824 flag aSign, bSign, zSign; 2825 int16 aExp, bExp, zExp; 2826 bits64 aSig, bSig, zSig0, zSig1; 2827 2828 aSig = extractFloat64Frac( a ); 2829 aExp = extractFloat64Exp( a ); 2830 aSign = extractFloat64Sign( a ); 2831 bSig = extractFloat64Frac( b ); 2832 bExp = extractFloat64Exp( b ); 2833 bSign = extractFloat64Sign( b ); 2834 zSign = aSign ^ bSign; 2835 if ( aExp == 0x7FF ) { 2836 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2837 return propagateFloat64NaN( a, b ); 2838 } 2839 if ( ( bExp | bSig ) == 0 ) { 2840 float_raise( float_flag_invalid ); 2841 return float64_default_nan; 2842 } 2843 return packFloat64( zSign, 0x7FF, 0 ); 2844 } 2845 if ( bExp == 0x7FF ) { 2846 if ( bSig ) return propagateFloat64NaN( a, b ); 2847 if ( ( aExp | aSig ) == 0 ) { 2848 float_raise( float_flag_invalid ); 2849 return float64_default_nan; 2850 } 2851 return packFloat64( zSign, 0x7FF, 0 ); 2852 } 2853 if ( aExp == 0 ) { 2854 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2855 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2856 } 2857 if ( bExp == 0 ) { 2858 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2859 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2860 } 2861 zExp = aExp + bExp - 0x3FF; 2862 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2863 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2864 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2865 zSig0 |= ( zSig1 != 0 ); 2866 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2867 zSig0 <<= 1; 2868 --zExp; 2869 } 2870 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2871 2872 } 2873 2874 /* 2875 ------------------------------------------------------------------------------- 2876 Returns the result of dividing the double-precision floating-point value `a' 2877 by the corresponding value `b'. The operation is performed according to 2878 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2879 ------------------------------------------------------------------------------- 2880 */ 2881 float64 float64_div( float64 a, float64 b ) 2882 { 2883 flag aSign, bSign, zSign; 2884 int16 aExp, bExp, zExp; 2885 bits64 aSig, bSig, zSig; 2886 bits64 rem0, rem1; 2887 bits64 term0, term1; 2888 2889 aSig = extractFloat64Frac( a ); 2890 aExp = extractFloat64Exp( a ); 2891 aSign = extractFloat64Sign( a ); 2892 bSig = extractFloat64Frac( b ); 2893 bExp = extractFloat64Exp( b ); 2894 bSign = extractFloat64Sign( b ); 2895 zSign = aSign ^ bSign; 2896 if ( aExp == 0x7FF ) { 2897 if ( aSig ) return propagateFloat64NaN( a, b ); 2898 if ( bExp == 0x7FF ) { 2899 if ( bSig ) return propagateFloat64NaN( a, b ); 2900 float_raise( float_flag_invalid ); 2901 return float64_default_nan; 2902 } 2903 return packFloat64( zSign, 0x7FF, 0 ); 2904 } 2905 if ( bExp == 0x7FF ) { 2906 if ( bSig ) return propagateFloat64NaN( a, b ); 2907 return packFloat64( zSign, 0, 0 ); 2908 } 2909 if ( bExp == 0 ) { 2910 if ( bSig == 0 ) { 2911 if ( ( aExp | aSig ) == 0 ) { 2912 float_raise( float_flag_invalid ); 2913 return float64_default_nan; 2914 } 2915 float_raise( float_flag_divbyzero ); 2916 return packFloat64( zSign, 0x7FF, 0 ); 2917 } 2918 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2919 } 2920 if ( aExp == 0 ) { 2921 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2922 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2923 } 2924 zExp = aExp - bExp + 0x3FD; 2925 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2926 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2927 if ( bSig <= ( aSig + aSig ) ) { 2928 aSig >>= 1; 2929 ++zExp; 2930 } 2931 zSig = estimateDiv128To64( aSig, 0, bSig ); 2932 if ( ( zSig & 0x1FF ) <= 2 ) { 2933 mul64To128( bSig, zSig, &term0, &term1 ); 2934 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2935 while ( (sbits64) rem0 < 0 ) { 2936 --zSig; 2937 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2938 } 2939 zSig |= ( rem1 != 0 ); 2940 } 2941 return roundAndPackFloat64( zSign, zExp, zSig ); 2942 2943 } 2944 2945 #ifndef SOFTFLOAT_FOR_GCC 2946 /* 2947 ------------------------------------------------------------------------------- 2948 Returns the remainder of the double-precision floating-point value `a' 2949 with respect to the corresponding value `b'. The operation is performed 2950 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2951 ------------------------------------------------------------------------------- 2952 */ 2953 float64 float64_rem( float64 a, float64 b ) 2954 { 2955 flag aSign, bSign, zSign; 2956 int16 aExp, bExp, expDiff; 2957 bits64 aSig, bSig; 2958 bits64 q, alternateASig; 2959 sbits64 sigMean; 2960 2961 aSig = extractFloat64Frac( a ); 2962 aExp = extractFloat64Exp( a ); 2963 aSign = extractFloat64Sign( a ); 2964 bSig = extractFloat64Frac( b ); 2965 bExp = extractFloat64Exp( b ); 2966 bSign = extractFloat64Sign( b ); 2967 if ( aExp == 0x7FF ) { 2968 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2969 return propagateFloat64NaN( a, b ); 2970 } 2971 float_raise( float_flag_invalid ); 2972 return float64_default_nan; 2973 } 2974 if ( bExp == 0x7FF ) { 2975 if ( bSig ) return propagateFloat64NaN( a, b ); 2976 return a; 2977 } 2978 if ( bExp == 0 ) { 2979 if ( bSig == 0 ) { 2980 float_raise( float_flag_invalid ); 2981 return float64_default_nan; 2982 } 2983 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2984 } 2985 if ( aExp == 0 ) { 2986 if ( aSig == 0 ) return a; 2987 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2988 } 2989 expDiff = aExp - bExp; 2990 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 2991 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2992 if ( expDiff < 0 ) { 2993 if ( expDiff < -1 ) return a; 2994 aSig >>= 1; 2995 } 2996 q = ( bSig <= aSig ); 2997 if ( q ) aSig -= bSig; 2998 expDiff -= 64; 2999 while ( 0 < expDiff ) { 3000 q = estimateDiv128To64( aSig, 0, bSig ); 3001 q = ( 2 < q ) ? q - 2 : 0; 3002 aSig = - ( ( bSig>>2 ) * q ); 3003 expDiff -= 62; 3004 } 3005 expDiff += 64; 3006 if ( 0 < expDiff ) { 3007 q = estimateDiv128To64( aSig, 0, bSig ); 3008 q = ( 2 < q ) ? q - 2 : 0; 3009 q >>= 64 - expDiff; 3010 bSig >>= 2; 3011 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3012 } 3013 else { 3014 aSig >>= 2; 3015 bSig >>= 2; 3016 } 3017 do { 3018 alternateASig = aSig; 3019 ++q; 3020 aSig -= bSig; 3021 } while ( 0 <= (sbits64) aSig ); 3022 sigMean = aSig + alternateASig; 3023 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3024 aSig = alternateASig; 3025 } 3026 zSign = ( (sbits64) aSig < 0 ); 3027 if ( zSign ) aSig = - aSig; 3028 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3029 3030 } 3031 3032 /* 3033 ------------------------------------------------------------------------------- 3034 Returns the square root of the double-precision floating-point value `a'. 3035 The operation is performed according to the IEC/IEEE Standard for Binary 3036 Floating-Point Arithmetic. 3037 ------------------------------------------------------------------------------- 3038 */ 3039 float64 float64_sqrt( float64 a ) 3040 { 3041 flag aSign; 3042 int16 aExp, zExp; 3043 bits64 aSig, zSig, doubleZSig; 3044 bits64 rem0, rem1, term0, term1; 3045 3046 aSig = extractFloat64Frac( a ); 3047 aExp = extractFloat64Exp( a ); 3048 aSign = extractFloat64Sign( a ); 3049 if ( aExp == 0x7FF ) { 3050 if ( aSig ) return propagateFloat64NaN( a, a ); 3051 if ( ! aSign ) return a; 3052 float_raise( float_flag_invalid ); 3053 return float64_default_nan; 3054 } 3055 if ( aSign ) { 3056 if ( ( aExp | aSig ) == 0 ) return a; 3057 float_raise( float_flag_invalid ); 3058 return float64_default_nan; 3059 } 3060 if ( aExp == 0 ) { 3061 if ( aSig == 0 ) return 0; 3062 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3063 } 3064 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3065 aSig |= LIT64( 0x0010000000000000 ); 3066 zSig = estimateSqrt32( aExp, aSig>>21 ); 3067 aSig <<= 9 - ( aExp & 1 ); 3068 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3069 if ( ( zSig & 0x1FF ) <= 5 ) { 3070 doubleZSig = zSig<<1; 3071 mul64To128( zSig, zSig, &term0, &term1 ); 3072 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3073 while ( (sbits64) rem0 < 0 ) { 3074 --zSig; 3075 doubleZSig -= 2; 3076 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3077 } 3078 zSig |= ( ( rem0 | rem1 ) != 0 ); 3079 } 3080 return roundAndPackFloat64( 0, zExp, zSig ); 3081 3082 } 3083 #endif 3084 3085 /* 3086 ------------------------------------------------------------------------------- 3087 Returns 1 if the double-precision floating-point value `a' is equal to the 3088 corresponding value `b', and 0 otherwise. The comparison is performed 3089 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3090 ------------------------------------------------------------------------------- 3091 */ 3092 flag float64_eq( float64 a, float64 b ) 3093 { 3094 3095 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3096 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3097 ) { 3098 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3099 float_raise( float_flag_invalid ); 3100 } 3101 return 0; 3102 } 3103 return ( a == b ) || 3104 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3105 3106 } 3107 3108 /* 3109 ------------------------------------------------------------------------------- 3110 Returns 1 if the double-precision floating-point value `a' is less than or 3111 equal to the corresponding value `b', and 0 otherwise. The comparison is 3112 performed according to the IEC/IEEE Standard for Binary Floating-Point 3113 Arithmetic. 3114 ------------------------------------------------------------------------------- 3115 */ 3116 flag float64_le( float64 a, float64 b ) 3117 { 3118 flag aSign, bSign; 3119 3120 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3121 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3122 ) { 3123 float_raise( float_flag_invalid ); 3124 return 0; 3125 } 3126 aSign = extractFloat64Sign( a ); 3127 bSign = extractFloat64Sign( b ); 3128 if ( aSign != bSign ) 3129 return aSign || 3130 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3131 0 ); 3132 return ( a == b ) || 3133 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3134 3135 } 3136 3137 /* 3138 ------------------------------------------------------------------------------- 3139 Returns 1 if the double-precision floating-point value `a' is less than 3140 the corresponding value `b', and 0 otherwise. The comparison is performed 3141 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3142 ------------------------------------------------------------------------------- 3143 */ 3144 flag float64_lt( float64 a, float64 b ) 3145 { 3146 flag aSign, bSign; 3147 3148 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3149 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3150 ) { 3151 float_raise( float_flag_invalid ); 3152 return 0; 3153 } 3154 aSign = extractFloat64Sign( a ); 3155 bSign = extractFloat64Sign( b ); 3156 if ( aSign != bSign ) 3157 return aSign && 3158 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3159 0 ); 3160 return ( a != b ) && 3161 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3162 3163 } 3164 3165 #ifndef SOFTFLOAT_FOR_GCC 3166 /* 3167 ------------------------------------------------------------------------------- 3168 Returns 1 if the double-precision floating-point value `a' is equal to the 3169 corresponding value `b', and 0 otherwise. The invalid exception is raised 3170 if either operand is a NaN. Otherwise, the comparison is performed 3171 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3172 ------------------------------------------------------------------------------- 3173 */ 3174 flag float64_eq_signaling( float64 a, float64 b ) 3175 { 3176 3177 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3178 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3179 ) { 3180 float_raise( float_flag_invalid ); 3181 return 0; 3182 } 3183 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3184 3185 } 3186 3187 /* 3188 ------------------------------------------------------------------------------- 3189 Returns 1 if the double-precision floating-point value `a' is less than or 3190 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3191 cause an exception. Otherwise, the comparison is performed according to the 3192 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3193 ------------------------------------------------------------------------------- 3194 */ 3195 flag float64_le_quiet( float64 a, float64 b ) 3196 { 3197 flag aSign, bSign; 3198 3199 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3200 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3201 ) { 3202 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3203 float_raise( float_flag_invalid ); 3204 } 3205 return 0; 3206 } 3207 aSign = extractFloat64Sign( a ); 3208 bSign = extractFloat64Sign( b ); 3209 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3210 return ( a == b ) || ( aSign ^ ( a < b ) ); 3211 3212 } 3213 3214 /* 3215 ------------------------------------------------------------------------------- 3216 Returns 1 if the double-precision floating-point value `a' is less than 3217 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3218 exception. Otherwise, the comparison is performed according to the IEC/IEEE 3219 Standard for Binary Floating-Point Arithmetic. 3220 ------------------------------------------------------------------------------- 3221 */ 3222 flag float64_lt_quiet( float64 a, float64 b ) 3223 { 3224 flag aSign, bSign; 3225 3226 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3227 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3228 ) { 3229 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3230 float_raise( float_flag_invalid ); 3231 } 3232 return 0; 3233 } 3234 aSign = extractFloat64Sign( a ); 3235 bSign = extractFloat64Sign( b ); 3236 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3237 return ( a != b ) && ( aSign ^ ( a < b ) ); 3238 3239 } 3240 #endif 3241 3242 #ifdef FLOATX80 3243 3244 /* 3245 ------------------------------------------------------------------------------- 3246 Returns the result of converting the extended double-precision floating- 3247 point value `a' to the 32-bit two's complement integer format. The 3248 conversion is performed according to the IEC/IEEE Standard for Binary 3249 Floating-Point Arithmetic---which means in particular that the conversion 3250 is rounded according to the current rounding mode. If `a' is a NaN, the 3251 largest positive integer is returned. Otherwise, if the conversion 3252 overflows, the largest integer with the same sign as `a' is returned. 3253 ------------------------------------------------------------------------------- 3254 */ 3255 int32 floatx80_to_int32( floatx80 a ) 3256 { 3257 flag aSign; 3258 int32 aExp, shiftCount; 3259 bits64 aSig; 3260 3261 aSig = extractFloatx80Frac( a ); 3262 aExp = extractFloatx80Exp( a ); 3263 aSign = extractFloatx80Sign( a ); 3264 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3265 shiftCount = 0x4037 - aExp; 3266 if ( shiftCount <= 0 ) shiftCount = 1; 3267 shift64RightJamming( aSig, shiftCount, &aSig ); 3268 return roundAndPackInt32( aSign, aSig ); 3269 3270 } 3271 3272 /* 3273 ------------------------------------------------------------------------------- 3274 Returns the result of converting the extended double-precision floating- 3275 point value `a' to the 32-bit two's complement integer format. The 3276 conversion is performed according to the IEC/IEEE Standard for Binary 3277 Floating-Point Arithmetic, except that the conversion is always rounded 3278 toward zero. If `a' is a NaN, the largest positive integer is returned. 3279 Otherwise, if the conversion overflows, the largest integer with the same 3280 sign as `a' is returned. 3281 ------------------------------------------------------------------------------- 3282 */ 3283 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3284 { 3285 flag aSign; 3286 int32 aExp, shiftCount; 3287 bits64 aSig, savedASig; 3288 int32 z; 3289 3290 aSig = extractFloatx80Frac( a ); 3291 aExp = extractFloatx80Exp( a ); 3292 aSign = extractFloatx80Sign( a ); 3293 if ( 0x401E < aExp ) { 3294 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3295 goto invalid; 3296 } 3297 else if ( aExp < 0x3FFF ) { 3298 if ( aExp || aSig ) float_set_inexact(); 3299 return 0; 3300 } 3301 shiftCount = 0x403E - aExp; 3302 savedASig = aSig; 3303 aSig >>= shiftCount; 3304 z = aSig; 3305 if ( aSign ) z = - z; 3306 if ( ( z < 0 ) ^ aSign ) { 3307 invalid: 3308 float_raise( float_flag_invalid ); 3309 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3310 } 3311 if ( ( aSig<<shiftCount ) != savedASig ) { 3312 float_set_inexact(); 3313 } 3314 return z; 3315 3316 } 3317 3318 /* 3319 ------------------------------------------------------------------------------- 3320 Returns the result of converting the extended double-precision floating- 3321 point value `a' to the 64-bit two's complement integer format. The 3322 conversion is performed according to the IEC/IEEE Standard for Binary 3323 Floating-Point Arithmetic---which means in particular that the conversion 3324 is rounded according to the current rounding mode. If `a' is a NaN, 3325 the largest positive integer is returned. Otherwise, if the conversion 3326 overflows, the largest integer with the same sign as `a' is returned. 3327 ------------------------------------------------------------------------------- 3328 */ 3329 int64 floatx80_to_int64( floatx80 a ) 3330 { 3331 flag aSign; 3332 int32 aExp, shiftCount; 3333 bits64 aSig, aSigExtra; 3334 3335 aSig = extractFloatx80Frac( a ); 3336 aExp = extractFloatx80Exp( a ); 3337 aSign = extractFloatx80Sign( a ); 3338 shiftCount = 0x403E - aExp; 3339 if ( shiftCount <= 0 ) { 3340 if ( shiftCount ) { 3341 float_raise( float_flag_invalid ); 3342 if ( ! aSign 3343 || ( ( aExp == 0x7FFF ) 3344 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3345 ) { 3346 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3347 } 3348 return (sbits64) LIT64( 0x8000000000000000 ); 3349 } 3350 aSigExtra = 0; 3351 } 3352 else { 3353 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3354 } 3355 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3356 3357 } 3358 3359 /* 3360 ------------------------------------------------------------------------------- 3361 Returns the result of converting the extended double-precision floating- 3362 point value `a' to the 64-bit two's complement integer format. The 3363 conversion is performed according to the IEC/IEEE Standard for Binary 3364 Floating-Point Arithmetic, except that the conversion is always rounded 3365 toward zero. If `a' is a NaN, the largest positive integer is returned. 3366 Otherwise, if the conversion overflows, the largest integer with the same 3367 sign as `a' is returned. 3368 ------------------------------------------------------------------------------- 3369 */ 3370 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3371 { 3372 flag aSign; 3373 int32 aExp, shiftCount; 3374 bits64 aSig; 3375 int64 z; 3376 3377 aSig = extractFloatx80Frac( a ); 3378 aExp = extractFloatx80Exp( a ); 3379 aSign = extractFloatx80Sign( a ); 3380 shiftCount = aExp - 0x403E; 3381 if ( 0 <= shiftCount ) { 3382 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3383 if ( ( a.high != 0xC03E ) || aSig ) { 3384 float_raise( float_flag_invalid ); 3385 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3386 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3387 } 3388 } 3389 return (sbits64) LIT64( 0x8000000000000000 ); 3390 } 3391 else if ( aExp < 0x3FFF ) { 3392 if ( aExp | aSig ) float_set_inexact(); 3393 return 0; 3394 } 3395 z = aSig>>( - shiftCount ); 3396 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3397 float_set_inexact(); 3398 } 3399 if ( aSign ) z = - z; 3400 return z; 3401 3402 } 3403 3404 /* 3405 ------------------------------------------------------------------------------- 3406 Returns the result of converting the extended double-precision floating- 3407 point value `a' to the single-precision floating-point format. The 3408 conversion is performed according to the IEC/IEEE Standard for Binary 3409 Floating-Point Arithmetic. 3410 ------------------------------------------------------------------------------- 3411 */ 3412 float32 floatx80_to_float32( floatx80 a ) 3413 { 3414 flag aSign; 3415 int32 aExp; 3416 bits64 aSig; 3417 3418 aSig = extractFloatx80Frac( a ); 3419 aExp = extractFloatx80Exp( a ); 3420 aSign = extractFloatx80Sign( a ); 3421 if ( aExp == 0x7FFF ) { 3422 if ( (bits64) ( aSig<<1 ) ) { 3423 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3424 } 3425 return packFloat32( aSign, 0xFF, 0 ); 3426 } 3427 shift64RightJamming( aSig, 33, &aSig ); 3428 if ( aExp || aSig ) aExp -= 0x3F81; 3429 return roundAndPackFloat32( aSign, aExp, aSig ); 3430 3431 } 3432 3433 /* 3434 ------------------------------------------------------------------------------- 3435 Returns the result of converting the extended double-precision floating- 3436 point value `a' to the double-precision floating-point format. The 3437 conversion is performed according to the IEC/IEEE Standard for Binary 3438 Floating-Point Arithmetic. 3439 ------------------------------------------------------------------------------- 3440 */ 3441 float64 floatx80_to_float64( floatx80 a ) 3442 { 3443 flag aSign; 3444 int32 aExp; 3445 bits64 aSig, zSig; 3446 3447 aSig = extractFloatx80Frac( a ); 3448 aExp = extractFloatx80Exp( a ); 3449 aSign = extractFloatx80Sign( a ); 3450 if ( aExp == 0x7FFF ) { 3451 if ( (bits64) ( aSig<<1 ) ) { 3452 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3453 } 3454 return packFloat64( aSign, 0x7FF, 0 ); 3455 } 3456 shift64RightJamming( aSig, 1, &zSig ); 3457 if ( aExp || aSig ) aExp -= 0x3C01; 3458 return roundAndPackFloat64( aSign, aExp, zSig ); 3459 3460 } 3461 3462 #ifdef FLOAT128 3463 3464 /* 3465 ------------------------------------------------------------------------------- 3466 Returns the result of converting the extended double-precision floating- 3467 point value `a' to the quadruple-precision floating-point format. The 3468 conversion is performed according to the IEC/IEEE Standard for Binary 3469 Floating-Point Arithmetic. 3470 ------------------------------------------------------------------------------- 3471 */ 3472 float128 floatx80_to_float128( floatx80 a ) 3473 { 3474 flag aSign; 3475 int16 aExp; 3476 bits64 aSig, zSig0, zSig1; 3477 3478 aSig = extractFloatx80Frac( a ); 3479 aExp = extractFloatx80Exp( a ); 3480 aSign = extractFloatx80Sign( a ); 3481 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3482 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3483 } 3484 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3485 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3486 3487 } 3488 3489 #endif 3490 3491 /* 3492 ------------------------------------------------------------------------------- 3493 Rounds the extended double-precision floating-point value `a' to an integer, 3494 and returns the result as an extended quadruple-precision floating-point 3495 value. The operation is performed according to the IEC/IEEE Standard for 3496 Binary Floating-Point Arithmetic. 3497 ------------------------------------------------------------------------------- 3498 */ 3499 floatx80 floatx80_round_to_int( floatx80 a ) 3500 { 3501 flag aSign; 3502 int32 aExp; 3503 bits64 lastBitMask, roundBitsMask; 3504 int8 roundingMode; 3505 floatx80 z; 3506 3507 aExp = extractFloatx80Exp( a ); 3508 if ( 0x403E <= aExp ) { 3509 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3510 return propagateFloatx80NaN( a, a ); 3511 } 3512 return a; 3513 } 3514 if ( aExp < 0x3FFF ) { 3515 if ( ( aExp == 0 ) 3516 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3517 return a; 3518 } 3519 float_set_inexact(); 3520 aSign = extractFloatx80Sign( a ); 3521 switch ( float_rounding_mode() ) { 3522 case float_round_nearest_even: 3523 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3524 ) { 3525 return 3526 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3527 } 3528 break; 3529 case float_round_down: 3530 return 3531 aSign ? 3532 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3533 : packFloatx80( 0, 0, 0 ); 3534 case float_round_up: 3535 return 3536 aSign ? packFloatx80( 1, 0, 0 ) 3537 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3538 } 3539 return packFloatx80( aSign, 0, 0 ); 3540 } 3541 lastBitMask = 1; 3542 lastBitMask <<= 0x403E - aExp; 3543 roundBitsMask = lastBitMask - 1; 3544 z = a; 3545 roundingMode = float_rounding_mode(); 3546 if ( roundingMode == float_round_nearest_even ) { 3547 z.low += lastBitMask>>1; 3548 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3549 } 3550 else if ( roundingMode != float_round_to_zero ) { 3551 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3552 z.low += roundBitsMask; 3553 } 3554 } 3555 z.low &= ~ roundBitsMask; 3556 if ( z.low == 0 ) { 3557 ++z.high; 3558 z.low = LIT64( 0x8000000000000000 ); 3559 } 3560 if ( z.low != a.low ) float_set_inexact(); 3561 return z; 3562 3563 } 3564 3565 /* 3566 ------------------------------------------------------------------------------- 3567 Returns the result of adding the absolute values of the extended double- 3568 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3569 negated before being returned. `zSign' is ignored if the result is a NaN. 3570 The addition is performed according to the IEC/IEEE Standard for Binary 3571 Floating-Point Arithmetic. 3572 ------------------------------------------------------------------------------- 3573 */ 3574 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3575 { 3576 int32 aExp, bExp, zExp; 3577 bits64 aSig, bSig, zSig0, zSig1; 3578 int32 expDiff; 3579 3580 aSig = extractFloatx80Frac( a ); 3581 aExp = extractFloatx80Exp( a ); 3582 bSig = extractFloatx80Frac( b ); 3583 bExp = extractFloatx80Exp( b ); 3584 expDiff = aExp - bExp; 3585 if ( 0 < expDiff ) { 3586 if ( aExp == 0x7FFF ) { 3587 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3588 return a; 3589 } 3590 if ( bExp == 0 ) --expDiff; 3591 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3592 zExp = aExp; 3593 } 3594 else if ( expDiff < 0 ) { 3595 if ( bExp == 0x7FFF ) { 3596 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3597 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3598 } 3599 if ( aExp == 0 ) ++expDiff; 3600 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3601 zExp = bExp; 3602 } 3603 else { 3604 if ( aExp == 0x7FFF ) { 3605 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3606 return propagateFloatx80NaN( a, b ); 3607 } 3608 return a; 3609 } 3610 zSig1 = 0; 3611 zSig0 = aSig + bSig; 3612 if ( aExp == 0 ) { 3613 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3614 goto roundAndPack; 3615 } 3616 zExp = aExp; 3617 goto shiftRight1; 3618 } 3619 zSig0 = aSig + bSig; 3620 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3621 shiftRight1: 3622 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3623 zSig0 |= LIT64( 0x8000000000000000 ); 3624 ++zExp; 3625 roundAndPack: 3626 return 3627 roundAndPackFloatx80( 3628 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3629 3630 } 3631 3632 /* 3633 ------------------------------------------------------------------------------- 3634 Returns the result of subtracting the absolute values of the extended 3635 double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3636 difference is negated before being returned. `zSign' is ignored if the 3637 result is a NaN. The subtraction is performed according to the IEC/IEEE 3638 Standard for Binary Floating-Point Arithmetic. 3639 ------------------------------------------------------------------------------- 3640 */ 3641 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3642 { 3643 int32 aExp, bExp, zExp; 3644 bits64 aSig, bSig, zSig0, zSig1; 3645 int32 expDiff; 3646 floatx80 z; 3647 3648 aSig = extractFloatx80Frac( a ); 3649 aExp = extractFloatx80Exp( a ); 3650 bSig = extractFloatx80Frac( b ); 3651 bExp = extractFloatx80Exp( b ); 3652 expDiff = aExp - bExp; 3653 if ( 0 < expDiff ) goto aExpBigger; 3654 if ( expDiff < 0 ) goto bExpBigger; 3655 if ( aExp == 0x7FFF ) { 3656 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3657 return propagateFloatx80NaN( a, b ); 3658 } 3659 float_raise( float_flag_invalid ); 3660 z.low = floatx80_default_nan_low; 3661 z.high = floatx80_default_nan_high; 3662 return z; 3663 } 3664 if ( aExp == 0 ) { 3665 aExp = 1; 3666 bExp = 1; 3667 } 3668 zSig1 = 0; 3669 if ( bSig < aSig ) goto aBigger; 3670 if ( aSig < bSig ) goto bBigger; 3671 return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 ); 3672 bExpBigger: 3673 if ( bExp == 0x7FFF ) { 3674 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3675 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3676 } 3677 if ( aExp == 0 ) ++expDiff; 3678 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3679 bBigger: 3680 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3681 zExp = bExp; 3682 zSign ^= 1; 3683 goto normalizeRoundAndPack; 3684 aExpBigger: 3685 if ( aExp == 0x7FFF ) { 3686 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3687 return a; 3688 } 3689 if ( bExp == 0 ) --expDiff; 3690 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3691 aBigger: 3692 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3693 zExp = aExp; 3694 normalizeRoundAndPack: 3695 return 3696 normalizeRoundAndPackFloatx80( 3697 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3698 3699 } 3700 3701 /* 3702 ------------------------------------------------------------------------------- 3703 Returns the result of adding the extended double-precision floating-point 3704 values `a' and `b'. The operation is performed according to the IEC/IEEE 3705 Standard for Binary Floating-Point Arithmetic. 3706 ------------------------------------------------------------------------------- 3707 */ 3708 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3709 { 3710 flag aSign, bSign; 3711 3712 aSign = extractFloatx80Sign( a ); 3713 bSign = extractFloatx80Sign( b ); 3714 if ( aSign == bSign ) { 3715 return addFloatx80Sigs( a, b, aSign ); 3716 } 3717 else { 3718 return subFloatx80Sigs( a, b, aSign ); 3719 } 3720 3721 } 3722 3723 /* 3724 ------------------------------------------------------------------------------- 3725 Returns the result of subtracting the extended double-precision floating- 3726 point values `a' and `b'. The operation is performed according to the 3727 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3728 ------------------------------------------------------------------------------- 3729 */ 3730 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3731 { 3732 flag aSign, bSign; 3733 3734 aSign = extractFloatx80Sign( a ); 3735 bSign = extractFloatx80Sign( b ); 3736 if ( aSign == bSign ) { 3737 return subFloatx80Sigs( a, b, aSign ); 3738 } 3739 else { 3740 return addFloatx80Sigs( a, b, aSign ); 3741 } 3742 3743 } 3744 3745 /* 3746 ------------------------------------------------------------------------------- 3747 Returns the result of multiplying the extended double-precision floating- 3748 point values `a' and `b'. The operation is performed according to the 3749 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3750 ------------------------------------------------------------------------------- 3751 */ 3752 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3753 { 3754 flag aSign, bSign, zSign; 3755 int32 aExp, bExp, zExp; 3756 bits64 aSig, bSig, zSig0, zSig1; 3757 floatx80 z; 3758 3759 aSig = extractFloatx80Frac( a ); 3760 aExp = extractFloatx80Exp( a ); 3761 aSign = extractFloatx80Sign( a ); 3762 bSig = extractFloatx80Frac( b ); 3763 bExp = extractFloatx80Exp( b ); 3764 bSign = extractFloatx80Sign( b ); 3765 zSign = aSign ^ bSign; 3766 if ( aExp == 0x7FFF ) { 3767 if ( (bits64) ( aSig<<1 ) 3768 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3769 return propagateFloatx80NaN( a, b ); 3770 } 3771 if ( ( bExp | bSig ) == 0 ) goto invalid; 3772 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3773 } 3774 if ( bExp == 0x7FFF ) { 3775 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3776 if ( ( aExp | aSig ) == 0 ) { 3777 invalid: 3778 float_raise( float_flag_invalid ); 3779 z.low = floatx80_default_nan_low; 3780 z.high = floatx80_default_nan_high; 3781 return z; 3782 } 3783 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3784 } 3785 if ( aExp == 0 ) { 3786 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3787 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3788 } 3789 if ( bExp == 0 ) { 3790 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3791 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3792 } 3793 zExp = aExp + bExp - 0x3FFE; 3794 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3795 if ( 0 < (sbits64) zSig0 ) { 3796 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3797 --zExp; 3798 } 3799 return 3800 roundAndPackFloatx80( 3801 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3802 3803 } 3804 3805 /* 3806 ------------------------------------------------------------------------------- 3807 Returns the result of dividing the extended double-precision floating-point 3808 value `a' by the corresponding value `b'. The operation is performed 3809 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3810 ------------------------------------------------------------------------------- 3811 */ 3812 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3813 { 3814 flag aSign, bSign, zSign; 3815 int32 aExp, bExp, zExp; 3816 bits64 aSig, bSig, zSig0, zSig1; 3817 bits64 rem0, rem1, rem2, term0, term1, term2; 3818 floatx80 z; 3819 3820 aSig = extractFloatx80Frac( a ); 3821 aExp = extractFloatx80Exp( a ); 3822 aSign = extractFloatx80Sign( a ); 3823 bSig = extractFloatx80Frac( b ); 3824 bExp = extractFloatx80Exp( b ); 3825 bSign = extractFloatx80Sign( b ); 3826 zSign = aSign ^ bSign; 3827 if ( aExp == 0x7FFF ) { 3828 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3829 if ( bExp == 0x7FFF ) { 3830 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3831 goto invalid; 3832 } 3833 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3834 } 3835 if ( bExp == 0x7FFF ) { 3836 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3837 return packFloatx80( zSign, 0, 0 ); 3838 } 3839 if ( bExp == 0 ) { 3840 if ( bSig == 0 ) { 3841 if ( ( aExp | aSig ) == 0 ) { 3842 invalid: 3843 float_raise( float_flag_invalid ); 3844 z.low = floatx80_default_nan_low; 3845 z.high = floatx80_default_nan_high; 3846 return z; 3847 } 3848 float_raise( float_flag_divbyzero ); 3849 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3850 } 3851 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3852 } 3853 if ( aExp == 0 ) { 3854 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3855 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3856 } 3857 zExp = aExp - bExp + 0x3FFE; 3858 rem1 = 0; 3859 if ( bSig <= aSig ) { 3860 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3861 ++zExp; 3862 } 3863 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3864 mul64To128( bSig, zSig0, &term0, &term1 ); 3865 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3866 while ( (sbits64) rem0 < 0 ) { 3867 --zSig0; 3868 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3869 } 3870 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3871 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3872 mul64To128( bSig, zSig1, &term1, &term2 ); 3873 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3874 while ( (sbits64) rem1 < 0 ) { 3875 --zSig1; 3876 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3877 } 3878 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3879 } 3880 return 3881 roundAndPackFloatx80( 3882 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3883 3884 } 3885 3886 /* 3887 ------------------------------------------------------------------------------- 3888 Returns the remainder of the extended double-precision floating-point value 3889 `a' with respect to the corresponding value `b'. The operation is performed 3890 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3891 ------------------------------------------------------------------------------- 3892 */ 3893 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3894 { 3895 flag aSign, bSign, zSign; 3896 int32 aExp, bExp, expDiff; 3897 bits64 aSig0, aSig1, bSig; 3898 bits64 q, term0, term1, alternateASig0, alternateASig1; 3899 floatx80 z; 3900 3901 aSig0 = extractFloatx80Frac( a ); 3902 aExp = extractFloatx80Exp( a ); 3903 aSign = extractFloatx80Sign( a ); 3904 bSig = extractFloatx80Frac( b ); 3905 bExp = extractFloatx80Exp( b ); 3906 bSign = extractFloatx80Sign( b ); 3907 if ( aExp == 0x7FFF ) { 3908 if ( (bits64) ( aSig0<<1 ) 3909 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3910 return propagateFloatx80NaN( a, b ); 3911 } 3912 goto invalid; 3913 } 3914 if ( bExp == 0x7FFF ) { 3915 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3916 return a; 3917 } 3918 if ( bExp == 0 ) { 3919 if ( bSig == 0 ) { 3920 invalid: 3921 float_raise( float_flag_invalid ); 3922 z.low = floatx80_default_nan_low; 3923 z.high = floatx80_default_nan_high; 3924 return z; 3925 } 3926 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3927 } 3928 if ( aExp == 0 ) { 3929 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3930 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3931 } 3932 bSig |= LIT64( 0x8000000000000000 ); 3933 zSign = aSign; 3934 expDiff = aExp - bExp; 3935 aSig1 = 0; 3936 if ( expDiff < 0 ) { 3937 if ( expDiff < -1 ) return a; 3938 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3939 expDiff = 0; 3940 } 3941 q = ( bSig <= aSig0 ); 3942 if ( q ) aSig0 -= bSig; 3943 expDiff -= 64; 3944 while ( 0 < expDiff ) { 3945 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3946 q = ( 2 < q ) ? q - 2 : 0; 3947 mul64To128( bSig, q, &term0, &term1 ); 3948 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3949 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3950 expDiff -= 62; 3951 } 3952 expDiff += 64; 3953 if ( 0 < expDiff ) { 3954 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3955 q = ( 2 < q ) ? q - 2 : 0; 3956 q >>= 64 - expDiff; 3957 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 3958 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3959 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 3960 while ( le128( term0, term1, aSig0, aSig1 ) ) { 3961 ++q; 3962 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3963 } 3964 } 3965 else { 3966 term1 = 0; 3967 term0 = bSig; 3968 } 3969 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 3970 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3971 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3972 && ( q & 1 ) ) 3973 ) { 3974 aSig0 = alternateASig0; 3975 aSig1 = alternateASig1; 3976 zSign = ! zSign; 3977 } 3978 return 3979 normalizeRoundAndPackFloatx80( 3980 80, zSign, bExp + expDiff, aSig0, aSig1 ); 3981 3982 } 3983 3984 /* 3985 ------------------------------------------------------------------------------- 3986 Returns the square root of the extended double-precision floating-point 3987 value `a'. The operation is performed according to the IEC/IEEE Standard 3988 for Binary Floating-Point Arithmetic. 3989 ------------------------------------------------------------------------------- 3990 */ 3991 floatx80 floatx80_sqrt( floatx80 a ) 3992 { 3993 flag aSign; 3994 int32 aExp, zExp; 3995 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 3996 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 3997 floatx80 z; 3998 3999 aSig0 = extractFloatx80Frac( a ); 4000 aExp = extractFloatx80Exp( a ); 4001 aSign = extractFloatx80Sign( a ); 4002 if ( aExp == 0x7FFF ) { 4003 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 4004 if ( ! aSign ) return a; 4005 goto invalid; 4006 } 4007 if ( aSign ) { 4008 if ( ( aExp | aSig0 ) == 0 ) return a; 4009 invalid: 4010 float_raise( float_flag_invalid ); 4011 z.low = floatx80_default_nan_low; 4012 z.high = floatx80_default_nan_high; 4013 return z; 4014 } 4015 if ( aExp == 0 ) { 4016 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4017 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4018 } 4019 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4020 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4021 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4022 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4023 doubleZSig0 = zSig0<<1; 4024 mul64To128( zSig0, zSig0, &term0, &term1 ); 4025 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4026 while ( (sbits64) rem0 < 0 ) { 4027 --zSig0; 4028 doubleZSig0 -= 2; 4029 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4030 } 4031 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4032 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4033 if ( zSig1 == 0 ) zSig1 = 1; 4034 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4035 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4036 mul64To128( zSig1, zSig1, &term2, &term3 ); 4037 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4038 while ( (sbits64) rem1 < 0 ) { 4039 --zSig1; 4040 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4041 term3 |= 1; 4042 term2 |= doubleZSig0; 4043 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4044 } 4045 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4046 } 4047 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4048 zSig0 |= doubleZSig0; 4049 return 4050 roundAndPackFloatx80( 4051 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4052 4053 } 4054 4055 /* 4056 ------------------------------------------------------------------------------- 4057 Returns 1 if the extended double-precision floating-point value `a' is 4058 equal to the corresponding value `b', and 0 otherwise. The comparison is 4059 performed according to the IEC/IEEE Standard for Binary Floating-Point 4060 Arithmetic. 4061 ------------------------------------------------------------------------------- 4062 */ 4063 flag floatx80_eq( floatx80 a, floatx80 b ) 4064 { 4065 4066 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4067 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4068 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4069 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4070 ) { 4071 if ( floatx80_is_signaling_nan( a ) 4072 || floatx80_is_signaling_nan( b ) ) { 4073 float_raise( float_flag_invalid ); 4074 } 4075 return 0; 4076 } 4077 return 4078 ( a.low == b.low ) 4079 && ( ( a.high == b.high ) 4080 || ( ( a.low == 0 ) 4081 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4082 ); 4083 4084 } 4085 4086 /* 4087 ------------------------------------------------------------------------------- 4088 Returns 1 if the extended double-precision floating-point value `a' is 4089 less than or equal to the corresponding value `b', and 0 otherwise. The 4090 comparison is performed according to the IEC/IEEE Standard for Binary 4091 Floating-Point Arithmetic. 4092 ------------------------------------------------------------------------------- 4093 */ 4094 flag floatx80_le( floatx80 a, floatx80 b ) 4095 { 4096 flag aSign, bSign; 4097 4098 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4099 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4100 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4101 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4102 ) { 4103 float_raise( float_flag_invalid ); 4104 return 0; 4105 } 4106 aSign = extractFloatx80Sign( a ); 4107 bSign = extractFloatx80Sign( b ); 4108 if ( aSign != bSign ) { 4109 return 4110 aSign 4111 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4112 == 0 ); 4113 } 4114 return 4115 aSign ? le128( b.high, b.low, a.high, a.low ) 4116 : le128( a.high, a.low, b.high, b.low ); 4117 4118 } 4119 4120 /* 4121 ------------------------------------------------------------------------------- 4122 Returns 1 if the extended double-precision floating-point value `a' is 4123 less than the corresponding value `b', and 0 otherwise. The comparison 4124 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4125 Arithmetic. 4126 ------------------------------------------------------------------------------- 4127 */ 4128 flag floatx80_lt( floatx80 a, floatx80 b ) 4129 { 4130 flag aSign, bSign; 4131 4132 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4133 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4134 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4135 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4136 ) { 4137 float_raise( float_flag_invalid ); 4138 return 0; 4139 } 4140 aSign = extractFloatx80Sign( a ); 4141 bSign = extractFloatx80Sign( b ); 4142 if ( aSign != bSign ) { 4143 return 4144 aSign 4145 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4146 != 0 ); 4147 } 4148 return 4149 aSign ? lt128( b.high, b.low, a.high, a.low ) 4150 : lt128( a.high, a.low, b.high, b.low ); 4151 4152 } 4153 4154 /* 4155 ------------------------------------------------------------------------------- 4156 Returns 1 if the extended double-precision floating-point value `a' is equal 4157 to the corresponding value `b', and 0 otherwise. The invalid exception is 4158 raised if either operand is a NaN. Otherwise, the comparison is performed 4159 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4160 ------------------------------------------------------------------------------- 4161 */ 4162 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4163 { 4164 4165 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4166 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4167 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4168 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4169 ) { 4170 float_raise( float_flag_invalid ); 4171 return 0; 4172 } 4173 return 4174 ( a.low == b.low ) 4175 && ( ( a.high == b.high ) 4176 || ( ( a.low == 0 ) 4177 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4178 ); 4179 4180 } 4181 4182 /* 4183 ------------------------------------------------------------------------------- 4184 Returns 1 if the extended double-precision floating-point value `a' is less 4185 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4186 do not cause an exception. Otherwise, the comparison is performed according 4187 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4188 ------------------------------------------------------------------------------- 4189 */ 4190 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4191 { 4192 flag aSign, bSign; 4193 4194 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4195 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4196 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4197 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4198 ) { 4199 if ( floatx80_is_signaling_nan( a ) 4200 || floatx80_is_signaling_nan( b ) ) { 4201 float_raise( float_flag_invalid ); 4202 } 4203 return 0; 4204 } 4205 aSign = extractFloatx80Sign( a ); 4206 bSign = extractFloatx80Sign( b ); 4207 if ( aSign != bSign ) { 4208 return 4209 aSign 4210 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4211 == 0 ); 4212 } 4213 return 4214 aSign ? le128( b.high, b.low, a.high, a.low ) 4215 : le128( a.high, a.low, b.high, b.low ); 4216 4217 } 4218 4219 /* 4220 ------------------------------------------------------------------------------- 4221 Returns 1 if the extended double-precision floating-point value `a' is less 4222 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4223 an exception. Otherwise, the comparison is performed according to the 4224 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4225 ------------------------------------------------------------------------------- 4226 */ 4227 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4228 { 4229 flag aSign, bSign; 4230 4231 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4232 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4233 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4234 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4235 ) { 4236 if ( floatx80_is_signaling_nan( a ) 4237 || floatx80_is_signaling_nan( b ) ) { 4238 float_raise( float_flag_invalid ); 4239 } 4240 return 0; 4241 } 4242 aSign = extractFloatx80Sign( a ); 4243 bSign = extractFloatx80Sign( b ); 4244 if ( aSign != bSign ) { 4245 return 4246 aSign 4247 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4248 != 0 ); 4249 } 4250 return 4251 aSign ? lt128( b.high, b.low, a.high, a.low ) 4252 : lt128( a.high, a.low, b.high, b.low ); 4253 4254 } 4255 4256 #endif 4257 4258 #ifdef FLOAT128 4259 4260 /* 4261 ------------------------------------------------------------------------------- 4262 Returns the result of converting the quadruple-precision floating-point 4263 value `a' to the 32-bit two's complement integer format. The conversion 4264 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4265 Arithmetic---which means in particular that the conversion is rounded 4266 according to the current rounding mode. If `a' is a NaN, the largest 4267 positive integer is returned. Otherwise, if the conversion overflows, the 4268 largest integer with the same sign as `a' is returned. 4269 ------------------------------------------------------------------------------- 4270 */ 4271 int32 float128_to_int32( float128 a ) 4272 { 4273 flag aSign; 4274 int32 aExp, shiftCount; 4275 bits64 aSig0, aSig1; 4276 4277 aSig1 = extractFloat128Frac1( a ); 4278 aSig0 = extractFloat128Frac0( a ); 4279 aExp = extractFloat128Exp( a ); 4280 aSign = extractFloat128Sign( a ); 4281 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4282 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4283 aSig0 |= ( aSig1 != 0 ); 4284 shiftCount = 0x4028 - aExp; 4285 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4286 return roundAndPackInt32( aSign, aSig0 ); 4287 4288 } 4289 4290 /* 4291 ------------------------------------------------------------------------------- 4292 Returns the result of converting the quadruple-precision floating-point 4293 value `a' to the 32-bit two's complement integer format. The conversion 4294 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4295 Arithmetic, except that the conversion is always rounded toward zero. If 4296 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4297 conversion overflows, the largest integer with the same sign as `a' is 4298 returned. 4299 ------------------------------------------------------------------------------- 4300 */ 4301 int32 float128_to_int32_round_to_zero( float128 a ) 4302 { 4303 flag aSign; 4304 int32 aExp, shiftCount; 4305 bits64 aSig0, aSig1, savedASig; 4306 int32 z; 4307 4308 aSig1 = extractFloat128Frac1( a ); 4309 aSig0 = extractFloat128Frac0( a ); 4310 aExp = extractFloat128Exp( a ); 4311 aSign = extractFloat128Sign( a ); 4312 aSig0 |= ( aSig1 != 0 ); 4313 if ( 0x401E < aExp ) { 4314 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4315 goto invalid; 4316 } 4317 else if ( aExp < 0x3FFF ) { 4318 if ( aExp || aSig0 ) float_set_inexact(); 4319 return 0; 4320 } 4321 aSig0 |= LIT64( 0x0001000000000000 ); 4322 shiftCount = 0x402F - aExp; 4323 savedASig = aSig0; 4324 aSig0 >>= shiftCount; 4325 z = aSig0; 4326 if ( aSign ) z = - z; 4327 if ( ( z < 0 ) ^ aSign ) { 4328 invalid: 4329 float_raise( float_flag_invalid ); 4330 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4331 } 4332 if ( ( aSig0<<shiftCount ) != savedASig ) { 4333 float_set_inexact(); 4334 } 4335 return z; 4336 4337 } 4338 4339 /* 4340 ------------------------------------------------------------------------------- 4341 Returns the result of converting the quadruple-precision floating-point 4342 value `a' to the 64-bit two's complement integer format. The conversion 4343 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4344 Arithmetic---which means in particular that the conversion is rounded 4345 according to the current rounding mode. If `a' is a NaN, the largest 4346 positive integer is returned. Otherwise, if the conversion overflows, the 4347 largest integer with the same sign as `a' is returned. 4348 ------------------------------------------------------------------------------- 4349 */ 4350 int64 float128_to_int64( float128 a ) 4351 { 4352 flag aSign; 4353 int32 aExp, shiftCount; 4354 bits64 aSig0, aSig1; 4355 4356 aSig1 = extractFloat128Frac1( a ); 4357 aSig0 = extractFloat128Frac0( a ); 4358 aExp = extractFloat128Exp( a ); 4359 aSign = extractFloat128Sign( a ); 4360 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4361 shiftCount = 0x402F - aExp; 4362 if ( shiftCount <= 0 ) { 4363 if ( 0x403E < aExp ) { 4364 float_raise( float_flag_invalid ); 4365 if ( ! aSign 4366 || ( ( aExp == 0x7FFF ) 4367 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4368 ) 4369 ) { 4370 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4371 } 4372 return (sbits64) LIT64( 0x8000000000000000 ); 4373 } 4374 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4375 } 4376 else { 4377 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4378 } 4379 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4380 4381 } 4382 4383 /* 4384 ------------------------------------------------------------------------------- 4385 Returns the result of converting the quadruple-precision floating-point 4386 value `a' to the 64-bit two's complement integer format. The conversion 4387 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4388 Arithmetic, except that the conversion is always rounded toward zero. 4389 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4390 the conversion overflows, the largest integer with the same sign as `a' is 4391 returned. 4392 ------------------------------------------------------------------------------- 4393 */ 4394 int64 float128_to_int64_round_to_zero( float128 a ) 4395 { 4396 flag aSign; 4397 int32 aExp, shiftCount; 4398 bits64 aSig0, aSig1; 4399 int64 z; 4400 4401 aSig1 = extractFloat128Frac1( a ); 4402 aSig0 = extractFloat128Frac0( a ); 4403 aExp = extractFloat128Exp( a ); 4404 aSign = extractFloat128Sign( a ); 4405 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4406 shiftCount = aExp - 0x402F; 4407 if ( 0 < shiftCount ) { 4408 if ( 0x403E <= aExp ) { 4409 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4410 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4411 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4412 if ( aSig1 ) float_set_inexact(); 4413 } 4414 else { 4415 float_raise( float_flag_invalid ); 4416 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4417 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4418 } 4419 } 4420 return (sbits64) LIT64( 0x8000000000000000 ); 4421 } 4422 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4423 if ( (bits64) ( aSig1<<shiftCount ) ) { 4424 float_set_inexact(); 4425 } 4426 } 4427 else { 4428 if ( aExp < 0x3FFF ) { 4429 if ( aExp | aSig0 | aSig1 ) { 4430 float_set_inexact(); 4431 } 4432 return 0; 4433 } 4434 z = aSig0>>( - shiftCount ); 4435 if ( aSig1 4436 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4437 float_set_inexact(); 4438 } 4439 } 4440 if ( aSign ) z = - z; 4441 return z; 4442 4443 } 4444 4445 /* 4446 ------------------------------------------------------------------------------- 4447 Returns the result of converting the quadruple-precision floating-point 4448 value `a' to the single-precision floating-point format. The conversion 4449 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4450 Arithmetic. 4451 ------------------------------------------------------------------------------- 4452 */ 4453 float32 float128_to_float32( float128 a ) 4454 { 4455 flag aSign; 4456 int32 aExp; 4457 bits64 aSig0, aSig1; 4458 bits32 zSig; 4459 4460 aSig1 = extractFloat128Frac1( a ); 4461 aSig0 = extractFloat128Frac0( a ); 4462 aExp = extractFloat128Exp( a ); 4463 aSign = extractFloat128Sign( a ); 4464 if ( aExp == 0x7FFF ) { 4465 if ( aSig0 | aSig1 ) { 4466 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4467 } 4468 return packFloat32( aSign, 0xFF, 0 ); 4469 } 4470 aSig0 |= ( aSig1 != 0 ); 4471 shift64RightJamming( aSig0, 18, &aSig0 ); 4472 zSig = aSig0; 4473 if ( aExp || zSig ) { 4474 zSig |= 0x40000000; 4475 aExp -= 0x3F81; 4476 } 4477 return roundAndPackFloat32( aSign, aExp, zSig ); 4478 4479 } 4480 4481 /* 4482 ------------------------------------------------------------------------------- 4483 Returns the result of converting the quadruple-precision floating-point 4484 value `a' to the double-precision floating-point format. The conversion 4485 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4486 Arithmetic. 4487 ------------------------------------------------------------------------------- 4488 */ 4489 float64 float128_to_float64( float128 a ) 4490 { 4491 flag aSign; 4492 int32 aExp; 4493 bits64 aSig0, aSig1; 4494 4495 aSig1 = extractFloat128Frac1( a ); 4496 aSig0 = extractFloat128Frac0( a ); 4497 aExp = extractFloat128Exp( a ); 4498 aSign = extractFloat128Sign( a ); 4499 if ( aExp == 0x7FFF ) { 4500 if ( aSig0 | aSig1 ) { 4501 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4502 } 4503 return packFloat64( aSign, 0x7FF, 0 ); 4504 } 4505 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4506 aSig0 |= ( aSig1 != 0 ); 4507 if ( aExp || aSig0 ) { 4508 aSig0 |= LIT64( 0x4000000000000000 ); 4509 aExp -= 0x3C01; 4510 } 4511 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4512 4513 } 4514 4515 #ifdef FLOATX80 4516 4517 /* 4518 ------------------------------------------------------------------------------- 4519 Returns the result of converting the quadruple-precision floating-point 4520 value `a' to the extended double-precision floating-point format. The 4521 conversion is performed according to the IEC/IEEE Standard for Binary 4522 Floating-Point Arithmetic. 4523 ------------------------------------------------------------------------------- 4524 */ 4525 floatx80 float128_to_floatx80( float128 a ) 4526 { 4527 flag aSign; 4528 int32 aExp; 4529 bits64 aSig0, aSig1; 4530 4531 aSig1 = extractFloat128Frac1( a ); 4532 aSig0 = extractFloat128Frac0( a ); 4533 aExp = extractFloat128Exp( a ); 4534 aSign = extractFloat128Sign( a ); 4535 if ( aExp == 0x7FFF ) { 4536 if ( aSig0 | aSig1 ) { 4537 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4538 } 4539 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4540 } 4541 if ( aExp == 0 ) { 4542 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4543 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4544 } 4545 else { 4546 aSig0 |= LIT64( 0x0001000000000000 ); 4547 } 4548 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4549 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4550 4551 } 4552 4553 #endif 4554 4555 /* 4556 ------------------------------------------------------------------------------- 4557 Rounds the quadruple-precision floating-point value `a' to an integer, and 4558 returns the result as a quadruple-precision floating-point value. The 4559 operation is performed according to the IEC/IEEE Standard for Binary 4560 Floating-Point Arithmetic. 4561 ------------------------------------------------------------------------------- 4562 */ 4563 float128 float128_round_to_int( float128 a ) 4564 { 4565 flag aSign; 4566 int32 aExp; 4567 bits64 lastBitMask, roundBitsMask; 4568 int8 roundingMode; 4569 float128 z; 4570 4571 aExp = extractFloat128Exp( a ); 4572 if ( 0x402F <= aExp ) { 4573 if ( 0x406F <= aExp ) { 4574 if ( ( aExp == 0x7FFF ) 4575 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4576 ) { 4577 return propagateFloat128NaN( a, a ); 4578 } 4579 return a; 4580 } 4581 lastBitMask = 1; 4582 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4583 roundBitsMask = lastBitMask - 1; 4584 z = a; 4585 roundingMode = float_rounding_mode(); 4586 if ( roundingMode == float_round_nearest_even ) { 4587 if ( lastBitMask ) { 4588 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4589 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4590 } 4591 else { 4592 if ( (sbits64) z.low < 0 ) { 4593 ++z.high; 4594 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4595 } 4596 } 4597 } 4598 else if ( roundingMode != float_round_to_zero ) { 4599 if ( extractFloat128Sign( z ) 4600 ^ ( roundingMode == float_round_up ) ) { 4601 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4602 } 4603 } 4604 z.low &= ~ roundBitsMask; 4605 } 4606 else { 4607 if ( aExp < 0x3FFF ) { 4608 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4609 float_set_inexact(); 4610 aSign = extractFloat128Sign( a ); 4611 switch ( float_rounding_mode() ) { 4612 case float_round_nearest_even: 4613 if ( ( aExp == 0x3FFE ) 4614 && ( extractFloat128Frac0( a ) 4615 | extractFloat128Frac1( a ) ) 4616 ) { 4617 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4618 } 4619 break; 4620 case float_round_down: 4621 return 4622 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4623 : packFloat128( 0, 0, 0, 0 ); 4624 case float_round_up: 4625 return 4626 aSign ? packFloat128( 1, 0, 0, 0 ) 4627 : packFloat128( 0, 0x3FFF, 0, 0 ); 4628 } 4629 return packFloat128( aSign, 0, 0, 0 ); 4630 } 4631 lastBitMask = 1; 4632 lastBitMask <<= 0x402F - aExp; 4633 roundBitsMask = lastBitMask - 1; 4634 z.low = 0; 4635 z.high = a.high; 4636 roundingMode = float_rounding_mode(); 4637 if ( roundingMode == float_round_nearest_even ) { 4638 z.high += lastBitMask>>1; 4639 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4640 z.high &= ~ lastBitMask; 4641 } 4642 } 4643 else if ( roundingMode != float_round_to_zero ) { 4644 if ( extractFloat128Sign( z ) 4645 ^ ( roundingMode == float_round_up ) ) { 4646 z.high |= ( a.low != 0 ); 4647 z.high += roundBitsMask; 4648 } 4649 } 4650 z.high &= ~ roundBitsMask; 4651 } 4652 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4653 float_set_inexact(); 4654 } 4655 return z; 4656 4657 } 4658 4659 /* 4660 ------------------------------------------------------------------------------- 4661 Returns the result of adding the absolute values of the quadruple-precision 4662 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4663 before being returned. `zSign' is ignored if the result is a NaN. 4664 The addition is performed according to the IEC/IEEE Standard for Binary 4665 Floating-Point Arithmetic. 4666 ------------------------------------------------------------------------------- 4667 */ 4668 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4669 { 4670 int32 aExp, bExp, zExp; 4671 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4672 int32 expDiff; 4673 4674 aSig1 = extractFloat128Frac1( a ); 4675 aSig0 = extractFloat128Frac0( a ); 4676 aExp = extractFloat128Exp( a ); 4677 bSig1 = extractFloat128Frac1( b ); 4678 bSig0 = extractFloat128Frac0( b ); 4679 bExp = extractFloat128Exp( b ); 4680 expDiff = aExp - bExp; 4681 if ( 0 < expDiff ) { 4682 if ( aExp == 0x7FFF ) { 4683 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4684 return a; 4685 } 4686 if ( bExp == 0 ) { 4687 --expDiff; 4688 } 4689 else { 4690 bSig0 |= LIT64( 0x0001000000000000 ); 4691 } 4692 shift128ExtraRightJamming( 4693 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4694 zExp = aExp; 4695 } 4696 else if ( expDiff < 0 ) { 4697 if ( bExp == 0x7FFF ) { 4698 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4699 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4700 } 4701 if ( aExp == 0 ) { 4702 ++expDiff; 4703 } 4704 else { 4705 aSig0 |= LIT64( 0x0001000000000000 ); 4706 } 4707 shift128ExtraRightJamming( 4708 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4709 zExp = bExp; 4710 } 4711 else { 4712 if ( aExp == 0x7FFF ) { 4713 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4714 return propagateFloat128NaN( a, b ); 4715 } 4716 return a; 4717 } 4718 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4719 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4720 zSig2 = 0; 4721 zSig0 |= LIT64( 0x0002000000000000 ); 4722 zExp = aExp; 4723 goto shiftRight1; 4724 } 4725 aSig0 |= LIT64( 0x0001000000000000 ); 4726 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4727 --zExp; 4728 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4729 ++zExp; 4730 shiftRight1: 4731 shift128ExtraRightJamming( 4732 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4733 roundAndPack: 4734 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4735 4736 } 4737 4738 /* 4739 ------------------------------------------------------------------------------- 4740 Returns the result of subtracting the absolute values of the quadruple- 4741 precision floating-point values `a' and `b'. If `zSign' is 1, the 4742 difference is negated before being returned. `zSign' is ignored if the 4743 result is a NaN. The subtraction is performed according to the IEC/IEEE 4744 Standard for Binary Floating-Point Arithmetic. 4745 ------------------------------------------------------------------------------- 4746 */ 4747 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4748 { 4749 int32 aExp, bExp, zExp; 4750 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4751 int32 expDiff; 4752 float128 z; 4753 4754 aSig1 = extractFloat128Frac1( a ); 4755 aSig0 = extractFloat128Frac0( a ); 4756 aExp = extractFloat128Exp( a ); 4757 bSig1 = extractFloat128Frac1( b ); 4758 bSig0 = extractFloat128Frac0( b ); 4759 bExp = extractFloat128Exp( b ); 4760 expDiff = aExp - bExp; 4761 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4762 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4763 if ( 0 < expDiff ) goto aExpBigger; 4764 if ( expDiff < 0 ) goto bExpBigger; 4765 if ( aExp == 0x7FFF ) { 4766 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4767 return propagateFloat128NaN( a, b ); 4768 } 4769 float_raise( float_flag_invalid ); 4770 z.low = float128_default_nan_low; 4771 z.high = float128_default_nan_high; 4772 return z; 4773 } 4774 if ( aExp == 0 ) { 4775 aExp = 1; 4776 bExp = 1; 4777 } 4778 if ( bSig0 < aSig0 ) goto aBigger; 4779 if ( aSig0 < bSig0 ) goto bBigger; 4780 if ( bSig1 < aSig1 ) goto aBigger; 4781 if ( aSig1 < bSig1 ) goto bBigger; 4782 return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 ); 4783 bExpBigger: 4784 if ( bExp == 0x7FFF ) { 4785 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4786 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4787 } 4788 if ( aExp == 0 ) { 4789 ++expDiff; 4790 } 4791 else { 4792 aSig0 |= LIT64( 0x4000000000000000 ); 4793 } 4794 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4795 bSig0 |= LIT64( 0x4000000000000000 ); 4796 bBigger: 4797 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4798 zExp = bExp; 4799 zSign ^= 1; 4800 goto normalizeRoundAndPack; 4801 aExpBigger: 4802 if ( aExp == 0x7FFF ) { 4803 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4804 return a; 4805 } 4806 if ( bExp == 0 ) { 4807 --expDiff; 4808 } 4809 else { 4810 bSig0 |= LIT64( 0x4000000000000000 ); 4811 } 4812 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4813 aSig0 |= LIT64( 0x4000000000000000 ); 4814 aBigger: 4815 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4816 zExp = aExp; 4817 normalizeRoundAndPack: 4818 --zExp; 4819 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4820 4821 } 4822 4823 /* 4824 ------------------------------------------------------------------------------- 4825 Returns the result of adding the quadruple-precision floating-point values 4826 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4827 for Binary Floating-Point Arithmetic. 4828 ------------------------------------------------------------------------------- 4829 */ 4830 float128 float128_add( float128 a, float128 b ) 4831 { 4832 flag aSign, bSign; 4833 4834 aSign = extractFloat128Sign( a ); 4835 bSign = extractFloat128Sign( b ); 4836 if ( aSign == bSign ) { 4837 return addFloat128Sigs( a, b, aSign ); 4838 } 4839 else { 4840 return subFloat128Sigs( a, b, aSign ); 4841 } 4842 4843 } 4844 4845 /* 4846 ------------------------------------------------------------------------------- 4847 Returns the result of subtracting the quadruple-precision floating-point 4848 values `a' and `b'. The operation is performed according to the IEC/IEEE 4849 Standard for Binary Floating-Point Arithmetic. 4850 ------------------------------------------------------------------------------- 4851 */ 4852 float128 float128_sub( float128 a, float128 b ) 4853 { 4854 flag aSign, bSign; 4855 4856 aSign = extractFloat128Sign( a ); 4857 bSign = extractFloat128Sign( b ); 4858 if ( aSign == bSign ) { 4859 return subFloat128Sigs( a, b, aSign ); 4860 } 4861 else { 4862 return addFloat128Sigs( a, b, aSign ); 4863 } 4864 4865 } 4866 4867 /* 4868 ------------------------------------------------------------------------------- 4869 Returns the result of multiplying the quadruple-precision floating-point 4870 values `a' and `b'. The operation is performed according to the IEC/IEEE 4871 Standard for Binary Floating-Point Arithmetic. 4872 ------------------------------------------------------------------------------- 4873 */ 4874 float128 float128_mul( float128 a, float128 b ) 4875 { 4876 flag aSign, bSign, zSign; 4877 int32 aExp, bExp, zExp; 4878 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4879 float128 z; 4880 4881 aSig1 = extractFloat128Frac1( a ); 4882 aSig0 = extractFloat128Frac0( a ); 4883 aExp = extractFloat128Exp( a ); 4884 aSign = extractFloat128Sign( a ); 4885 bSig1 = extractFloat128Frac1( b ); 4886 bSig0 = extractFloat128Frac0( b ); 4887 bExp = extractFloat128Exp( b ); 4888 bSign = extractFloat128Sign( b ); 4889 zSign = aSign ^ bSign; 4890 if ( aExp == 0x7FFF ) { 4891 if ( ( aSig0 | aSig1 ) 4892 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4893 return propagateFloat128NaN( a, b ); 4894 } 4895 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4896 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4897 } 4898 if ( bExp == 0x7FFF ) { 4899 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4900 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4901 invalid: 4902 float_raise( float_flag_invalid ); 4903 z.low = float128_default_nan_low; 4904 z.high = float128_default_nan_high; 4905 return z; 4906 } 4907 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4908 } 4909 if ( aExp == 0 ) { 4910 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4911 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4912 } 4913 if ( bExp == 0 ) { 4914 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4915 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4916 } 4917 zExp = aExp + bExp - 0x4000; 4918 aSig0 |= LIT64( 0x0001000000000000 ); 4919 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 4920 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 4921 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4922 zSig2 |= ( zSig3 != 0 ); 4923 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 4924 shift128ExtraRightJamming( 4925 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4926 ++zExp; 4927 } 4928 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4929 4930 } 4931 4932 /* 4933 ------------------------------------------------------------------------------- 4934 Returns the result of dividing the quadruple-precision floating-point value 4935 `a' by the corresponding value `b'. The operation is performed according to 4936 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4937 ------------------------------------------------------------------------------- 4938 */ 4939 float128 float128_div( float128 a, float128 b ) 4940 { 4941 flag aSign, bSign, zSign; 4942 int32 aExp, bExp, zExp; 4943 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4944 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4945 float128 z; 4946 4947 aSig1 = extractFloat128Frac1( a ); 4948 aSig0 = extractFloat128Frac0( a ); 4949 aExp = extractFloat128Exp( a ); 4950 aSign = extractFloat128Sign( a ); 4951 bSig1 = extractFloat128Frac1( b ); 4952 bSig0 = extractFloat128Frac0( b ); 4953 bExp = extractFloat128Exp( b ); 4954 bSign = extractFloat128Sign( b ); 4955 zSign = aSign ^ bSign; 4956 if ( aExp == 0x7FFF ) { 4957 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4958 if ( bExp == 0x7FFF ) { 4959 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4960 goto invalid; 4961 } 4962 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4963 } 4964 if ( bExp == 0x7FFF ) { 4965 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4966 return packFloat128( zSign, 0, 0, 0 ); 4967 } 4968 if ( bExp == 0 ) { 4969 if ( ( bSig0 | bSig1 ) == 0 ) { 4970 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4971 invalid: 4972 float_raise( float_flag_invalid ); 4973 z.low = float128_default_nan_low; 4974 z.high = float128_default_nan_high; 4975 return z; 4976 } 4977 float_raise( float_flag_divbyzero ); 4978 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4979 } 4980 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4981 } 4982 if ( aExp == 0 ) { 4983 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4984 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4985 } 4986 zExp = aExp - bExp + 0x3FFD; 4987 shortShift128Left( 4988 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 4989 shortShift128Left( 4990 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 4991 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 4992 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 4993 ++zExp; 4994 } 4995 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 4996 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 4997 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 4998 while ( (sbits64) rem0 < 0 ) { 4999 --zSig0; 5000 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5001 } 5002 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5003 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5004 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5005 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5006 while ( (sbits64) rem1 < 0 ) { 5007 --zSig1; 5008 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5009 } 5010 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5011 } 5012 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5013 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5014 5015 } 5016 5017 /* 5018 ------------------------------------------------------------------------------- 5019 Returns the remainder of the quadruple-precision floating-point value `a' 5020 with respect to the corresponding value `b'. The operation is performed 5021 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5022 ------------------------------------------------------------------------------- 5023 */ 5024 float128 float128_rem( float128 a, float128 b ) 5025 { 5026 flag aSign, bSign, zSign; 5027 int32 aExp, bExp, expDiff; 5028 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5029 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5030 sbits64 sigMean0; 5031 float128 z; 5032 5033 aSig1 = extractFloat128Frac1( a ); 5034 aSig0 = extractFloat128Frac0( a ); 5035 aExp = extractFloat128Exp( a ); 5036 aSign = extractFloat128Sign( a ); 5037 bSig1 = extractFloat128Frac1( b ); 5038 bSig0 = extractFloat128Frac0( b ); 5039 bExp = extractFloat128Exp( b ); 5040 bSign = extractFloat128Sign( b ); 5041 if ( aExp == 0x7FFF ) { 5042 if ( ( aSig0 | aSig1 ) 5043 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5044 return propagateFloat128NaN( a, b ); 5045 } 5046 goto invalid; 5047 } 5048 if ( bExp == 0x7FFF ) { 5049 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5050 return a; 5051 } 5052 if ( bExp == 0 ) { 5053 if ( ( bSig0 | bSig1 ) == 0 ) { 5054 invalid: 5055 float_raise( float_flag_invalid ); 5056 z.low = float128_default_nan_low; 5057 z.high = float128_default_nan_high; 5058 return z; 5059 } 5060 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5061 } 5062 if ( aExp == 0 ) { 5063 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5064 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5065 } 5066 expDiff = aExp - bExp; 5067 if ( expDiff < -1 ) return a; 5068 shortShift128Left( 5069 aSig0 | LIT64( 0x0001000000000000 ), 5070 aSig1, 5071 15 - ( expDiff < 0 ), 5072 &aSig0, 5073 &aSig1 5074 ); 5075 shortShift128Left( 5076 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5077 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5078 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5079 expDiff -= 64; 5080 while ( 0 < expDiff ) { 5081 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5082 q = ( 4 < q ) ? q - 4 : 0; 5083 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5084 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5085 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5086 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5087 expDiff -= 61; 5088 } 5089 if ( -64 < expDiff ) { 5090 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5091 q = ( 4 < q ) ? q - 4 : 0; 5092 q >>= - expDiff; 5093 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5094 expDiff += 52; 5095 if ( expDiff < 0 ) { 5096 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5097 } 5098 else { 5099 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5100 } 5101 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5102 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5103 } 5104 else { 5105 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5106 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5107 } 5108 do { 5109 alternateASig0 = aSig0; 5110 alternateASig1 = aSig1; 5111 ++q; 5112 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5113 } while ( 0 <= (sbits64) aSig0 ); 5114 add128( 5115 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 ); 5116 if ( ( sigMean0 < 0 ) 5117 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5118 aSig0 = alternateASig0; 5119 aSig1 = alternateASig1; 5120 } 5121 zSign = ( (sbits64) aSig0 < 0 ); 5122 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5123 return 5124 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5125 5126 } 5127 5128 /* 5129 ------------------------------------------------------------------------------- 5130 Returns the square root of the quadruple-precision floating-point value `a'. 5131 The operation is performed according to the IEC/IEEE Standard for Binary 5132 Floating-Point Arithmetic. 5133 ------------------------------------------------------------------------------- 5134 */ 5135 float128 float128_sqrt( float128 a ) 5136 { 5137 flag aSign; 5138 int32 aExp, zExp; 5139 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5140 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5141 float128 z; 5142 5143 aSig1 = extractFloat128Frac1( a ); 5144 aSig0 = extractFloat128Frac0( a ); 5145 aExp = extractFloat128Exp( a ); 5146 aSign = extractFloat128Sign( a ); 5147 if ( aExp == 0x7FFF ) { 5148 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5149 if ( ! aSign ) return a; 5150 goto invalid; 5151 } 5152 if ( aSign ) { 5153 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5154 invalid: 5155 float_raise( float_flag_invalid ); 5156 z.low = float128_default_nan_low; 5157 z.high = float128_default_nan_high; 5158 return z; 5159 } 5160 if ( aExp == 0 ) { 5161 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5162 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5163 } 5164 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5165 aSig0 |= LIT64( 0x0001000000000000 ); 5166 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5167 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5168 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5169 doubleZSig0 = zSig0<<1; 5170 mul64To128( zSig0, zSig0, &term0, &term1 ); 5171 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5172 while ( (sbits64) rem0 < 0 ) { 5173 --zSig0; 5174 doubleZSig0 -= 2; 5175 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5176 } 5177 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5178 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5179 if ( zSig1 == 0 ) zSig1 = 1; 5180 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5181 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5182 mul64To128( zSig1, zSig1, &term2, &term3 ); 5183 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5184 while ( (sbits64) rem1 < 0 ) { 5185 --zSig1; 5186 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5187 term3 |= 1; 5188 term2 |= doubleZSig0; 5189 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5190 } 5191 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5192 } 5193 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5194 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5195 5196 } 5197 5198 /* 5199 ------------------------------------------------------------------------------- 5200 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5201 the corresponding value `b', and 0 otherwise. The comparison is performed 5202 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5203 ------------------------------------------------------------------------------- 5204 */ 5205 flag float128_eq( float128 a, float128 b ) 5206 { 5207 5208 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5209 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5210 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5211 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5212 ) { 5213 if ( float128_is_signaling_nan( a ) 5214 || float128_is_signaling_nan( b ) ) { 5215 float_raise( float_flag_invalid ); 5216 } 5217 return 0; 5218 } 5219 return 5220 ( a.low == b.low ) 5221 && ( ( a.high == b.high ) 5222 || ( ( a.low == 0 ) 5223 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5224 ); 5225 5226 } 5227 5228 /* 5229 ------------------------------------------------------------------------------- 5230 Returns 1 if the quadruple-precision floating-point value `a' is less than 5231 or equal to the corresponding value `b', and 0 otherwise. The comparison 5232 is performed according to the IEC/IEEE Standard for Binary Floating-Point 5233 Arithmetic. 5234 ------------------------------------------------------------------------------- 5235 */ 5236 flag float128_le( float128 a, float128 b ) 5237 { 5238 flag aSign, bSign; 5239 5240 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5241 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5242 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5243 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5244 ) { 5245 float_raise( float_flag_invalid ); 5246 return 0; 5247 } 5248 aSign = extractFloat128Sign( a ); 5249 bSign = extractFloat128Sign( b ); 5250 if ( aSign != bSign ) { 5251 return 5252 aSign 5253 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5254 == 0 ); 5255 } 5256 return 5257 aSign ? le128( b.high, b.low, a.high, a.low ) 5258 : le128( a.high, a.low, b.high, b.low ); 5259 5260 } 5261 5262 /* 5263 ------------------------------------------------------------------------------- 5264 Returns 1 if the quadruple-precision floating-point value `a' is less than 5265 the corresponding value `b', and 0 otherwise. The comparison is performed 5266 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5267 ------------------------------------------------------------------------------- 5268 */ 5269 flag float128_lt( float128 a, float128 b ) 5270 { 5271 flag aSign, bSign; 5272 5273 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5274 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5275 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5276 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5277 ) { 5278 float_raise( float_flag_invalid ); 5279 return 0; 5280 } 5281 aSign = extractFloat128Sign( a ); 5282 bSign = extractFloat128Sign( b ); 5283 if ( aSign != bSign ) { 5284 return 5285 aSign 5286 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5287 != 0 ); 5288 } 5289 return 5290 aSign ? lt128( b.high, b.low, a.high, a.low ) 5291 : lt128( a.high, a.low, b.high, b.low ); 5292 5293 } 5294 5295 /* 5296 ------------------------------------------------------------------------------- 5297 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5298 the corresponding value `b', and 0 otherwise. The invalid exception is 5299 raised if either operand is a NaN. Otherwise, the comparison is performed 5300 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5301 ------------------------------------------------------------------------------- 5302 */ 5303 flag float128_eq_signaling( float128 a, float128 b ) 5304 { 5305 5306 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5307 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5308 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5309 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5310 ) { 5311 float_raise( float_flag_invalid ); 5312 return 0; 5313 } 5314 return 5315 ( a.low == b.low ) 5316 && ( ( a.high == b.high ) 5317 || ( ( a.low == 0 ) 5318 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5319 ); 5320 5321 } 5322 5323 /* 5324 ------------------------------------------------------------------------------- 5325 Returns 1 if the quadruple-precision floating-point value `a' is less than 5326 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5327 cause an exception. Otherwise, the comparison is performed according to the 5328 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5329 ------------------------------------------------------------------------------- 5330 */ 5331 flag float128_le_quiet( float128 a, float128 b ) 5332 { 5333 flag aSign, bSign; 5334 5335 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5336 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5337 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5338 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5339 ) { 5340 if ( float128_is_signaling_nan( a ) 5341 || float128_is_signaling_nan( b ) ) { 5342 float_raise( float_flag_invalid ); 5343 } 5344 return 0; 5345 } 5346 aSign = extractFloat128Sign( a ); 5347 bSign = extractFloat128Sign( b ); 5348 if ( aSign != bSign ) { 5349 return 5350 aSign 5351 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5352 == 0 ); 5353 } 5354 return 5355 aSign ? le128( b.high, b.low, a.high, a.low ) 5356 : le128( a.high, a.low, b.high, b.low ); 5357 5358 } 5359 5360 /* 5361 ------------------------------------------------------------------------------- 5362 Returns 1 if the quadruple-precision floating-point value `a' is less than 5363 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5364 exception. Otherwise, the comparison is performed according to the IEC/IEEE 5365 Standard for Binary Floating-Point Arithmetic. 5366 ------------------------------------------------------------------------------- 5367 */ 5368 flag float128_lt_quiet( float128 a, float128 b ) 5369 { 5370 flag aSign, bSign; 5371 5372 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5373 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5374 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5375 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5376 ) { 5377 if ( float128_is_signaling_nan( a ) 5378 || float128_is_signaling_nan( b ) ) { 5379 float_raise( float_flag_invalid ); 5380 } 5381 return 0; 5382 } 5383 aSign = extractFloat128Sign( a ); 5384 bSign = extractFloat128Sign( b ); 5385 if ( aSign != bSign ) { 5386 return 5387 aSign 5388 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5389 != 0 ); 5390 } 5391 return 5392 aSign ? lt128( b.high, b.low, a.high, a.low ) 5393 : lt128( a.high, a.low, b.high, b.low ); 5394 5395 } 5396 5397 #endif 5398 5399 5400 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5401 5402 /* 5403 * These two routines are not part of the original softfloat distribution. 5404 * 5405 * They are based on the corresponding conversions to integer but return 5406 * unsigned numbers instead since these functions are required by GCC. 5407 * 5408 * Added by Mark Brinicombe <mark@netbsd.org> 27/09/97 5409 * 5410 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5411 */ 5412 5413 /* 5414 ------------------------------------------------------------------------------- 5415 Returns the result of converting the double-precision floating-point value 5416 `a' to the 32-bit unsigned integer format. The conversion is 5417 performed according to the IEC/IEEE Standard for Binary Floating-point 5418 Arithmetic, except that the conversion is always rounded toward zero. If 5419 `a' is a NaN, the largest positive integer is returned. If the conversion 5420 overflows, the largest integer positive is returned. 5421 ------------------------------------------------------------------------------- 5422 */ 5423 uint32 float64_to_uint32_round_to_zero( float64 a ) 5424 { 5425 flag aSign; 5426 int16 aExp, shiftCount; 5427 bits64 aSig, savedASig; 5428 uint32 z; 5429 5430 aSig = extractFloat64Frac( a ); 5431 aExp = extractFloat64Exp( a ); 5432 aSign = extractFloat64Sign( a ); 5433 5434 if (aSign) { 5435 float_raise( float_flag_invalid ); 5436 return(0); 5437 } 5438 5439 if ( 0x41E < aExp ) { 5440 float_raise( float_flag_invalid ); 5441 return 0xffffffff; 5442 } 5443 else if ( aExp < 0x3FF ) { 5444 if ( aExp || aSig ) float_set_inexact(); 5445 return 0; 5446 } 5447 aSig |= LIT64( 0x0010000000000000 ); 5448 shiftCount = 0x433 - aExp; 5449 savedASig = aSig; 5450 aSig >>= shiftCount; 5451 z = aSig; 5452 if ( ( aSig<<shiftCount ) != savedASig ) { 5453 float_set_inexact(); 5454 } 5455 return z; 5456 5457 } 5458 5459 /* 5460 ------------------------------------------------------------------------------- 5461 Returns the result of converting the single-precision floating-point value 5462 `a' to the 32-bit unsigned integer format. The conversion is 5463 performed according to the IEC/IEEE Standard for Binary Floating-point 5464 Arithmetic, except that the conversion is always rounded toward zero. If 5465 `a' is a NaN, the largest positive integer is returned. If the conversion 5466 overflows, the largest positive integer is returned. 5467 ------------------------------------------------------------------------------- 5468 */ 5469 uint32 float32_to_uint32_round_to_zero( float32 a ) 5470 { 5471 flag aSign; 5472 int16 aExp, shiftCount; 5473 bits32 aSig; 5474 uint32 z; 5475 5476 aSig = extractFloat32Frac( a ); 5477 aExp = extractFloat32Exp( a ); 5478 aSign = extractFloat32Sign( a ); 5479 shiftCount = aExp - 0x9E; 5480 5481 if (aSign) { 5482 float_raise( float_flag_invalid ); 5483 return(0); 5484 } 5485 if ( 0 < shiftCount ) { 5486 float_raise( float_flag_invalid ); 5487 return 0xFFFFFFFF; 5488 } 5489 else if ( aExp <= 0x7E ) { 5490 if ( aExp | aSig ) float_set_inexact(); 5491 return 0; 5492 } 5493 aSig = ( aSig | 0x800000 )<<8; 5494 z = aSig>>( - shiftCount ); 5495 if ( aSig<<( shiftCount & 31 ) ) { 5496 float_set_inexact(); 5497 } 5498 return z; 5499 5500 } 5501 5502 #endif 5503