1 /* $NetBSD: softfloat.c,v 1.8 2020/09/02 04:06:43 thorpej Exp $ */ 2 3 /* 4 * This version hacked for use with gcc -msoft-float by bjh21. 5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 6 * itself). 7 */ 8 9 /* 10 * Things you may want to define: 11 * 12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 14 * properly renamed. 15 */ 16 17 /*============================================================================ 18 19 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic 20 Package, Release 2b. 21 22 Written by John R. Hauser. This work was made possible in part by the 23 International Computer Science Institute, located at Suite 600, 1947 Center 24 Street, Berkeley, California 94704. Funding was partially provided by the 25 National Science Foundation under grant MIP-9311980. The original version 26 of this code was written as part of a project to build a fixed-point vector 27 processor in collaboration with the University of California at Berkeley, 28 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 29 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ 30 arithmetic/SoftFloat.html'. 31 32 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has 33 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES 34 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS 35 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, 36 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE 37 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE 38 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR 39 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. 40 41 Derivative works are acceptable, even for commercial purposes, so long as 42 (1) the source code for the derivative work includes prominent notice that 43 the work is derivative, and (2) the source code includes prominent notice with 44 these four paragraphs for those parts of this code that are retained. 45 46 =============================================================================*/ 47 48 /* If you need this in a boot program, you have bigger problems... */ 49 #ifndef _STANDALONE 50 51 #include <sys/cdefs.h> 52 #if defined(LIBC_SCCS) && !defined(lint) 53 __RCSID("$NetBSD: softfloat.c,v 1.8 2020/09/02 04:06:43 thorpej Exp $"); 54 #endif /* LIBC_SCCS and not lint */ 55 56 #ifdef SOFTFLOAT_FOR_GCC 57 #include "softfloat-for-gcc.h" 58 #endif 59 60 #include "milieu.h" 61 #include "softfloat.h" 62 63 /*---------------------------------------------------------------------------- 64 | Floating-point rounding mode, extended double-precision rounding precision, 65 | and exception flags. 66 *----------------------------------------------------------------------------*/ 67 /* 68 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday. 69 * Right now, it does not. I've removed all other dynamic global 70 * variables. [ross] 71 */ 72 #ifdef FLOATX80 73 int8 floatx80_rounding_precision = 80; 74 #endif 75 76 /*---------------------------------------------------------------------------- 77 | Primitive arithmetic functions, including multi-word arithmetic, and 78 | division and square root approximations. (Can be specialized to target if 79 | desired.) 80 *----------------------------------------------------------------------------*/ 81 #include "softfloat-macros.h" 82 83 /*---------------------------------------------------------------------------- 84 | Functions and definitions to determine: (1) whether tininess for underflow 85 | is detected before or after rounding by default, (2) what (if anything) 86 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 87 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 88 | are propagated from function inputs to output. These details are target- 89 | specific. 90 *----------------------------------------------------------------------------*/ 91 #include "softfloat-specialize.h" 92 93 #ifndef SOFTFLOAT_FOR_GCC /* Not used */ 94 /*---------------------------------------------------------------------------- 95 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 96 | and 7, and returns the properly rounded 32-bit integer corresponding to the 97 | input. If `zSign' is 1, the input is negated before being converted to an 98 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 99 | is simply rounded to an integer, with the inexact exception raised if the 100 | input cannot be represented exactly as an integer. However, if the fixed- 101 | point input is too large, the invalid exception is raised and the largest 102 | positive or negative integer is returned. 103 *----------------------------------------------------------------------------*/ 104 105 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 106 { 107 int8 roundingMode; 108 flag roundNearestEven; 109 int8 roundIncrement, roundBits; 110 int32 z; 111 112 roundingMode = float_rounding_mode(); 113 roundNearestEven = ( roundingMode == float_round_nearest_even ); 114 roundIncrement = 0x40; 115 if ( ! roundNearestEven ) { 116 if ( roundingMode == float_round_to_zero ) { 117 roundIncrement = 0; 118 } 119 else { 120 roundIncrement = 0x7F; 121 if ( zSign ) { 122 if ( roundingMode == float_round_up ) roundIncrement = 0; 123 } 124 else { 125 if ( roundingMode == float_round_down ) roundIncrement = 0; 126 } 127 } 128 } 129 roundBits = absZ & 0x7F; 130 absZ = ( absZ + roundIncrement )>>7; 131 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 132 z = absZ; 133 if ( zSign ) z = - z; 134 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 135 float_raise( float_flag_invalid ); 136 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 137 } 138 if ( roundBits ) float_set_inexact(); 139 return z; 140 141 } 142 143 /*---------------------------------------------------------------------------- 144 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 145 | `absZ1', with binary point between bits 63 and 64 (between the input words), 146 | and returns the properly rounded 64-bit integer corresponding to the input. 147 | If `zSign' is 1, the input is negated before being converted to an integer. 148 | Ordinarily, the fixed-point input is simply rounded to an integer, with 149 | the inexact exception raised if the input cannot be represented exactly as 150 | an integer. However, if the fixed-point input is too large, the invalid 151 | exception is raised and the largest positive or negative integer is 152 | returned. 153 *----------------------------------------------------------------------------*/ 154 155 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 156 { 157 int8 roundingMode; 158 flag roundNearestEven, increment; 159 int64 z; 160 161 roundingMode = float_rounding_mode(); 162 roundNearestEven = ( roundingMode == float_round_nearest_even ); 163 increment = ( (sbits64) absZ1 < 0 ); 164 if ( ! roundNearestEven ) { 165 if ( roundingMode == float_round_to_zero ) { 166 increment = 0; 167 } 168 else { 169 if ( zSign ) { 170 increment = ( roundingMode == float_round_down ) && absZ1; 171 } 172 else { 173 increment = ( roundingMode == float_round_up ) && absZ1; 174 } 175 } 176 } 177 if ( increment ) { 178 ++absZ0; 179 if ( absZ0 == 0 ) goto overflow; 180 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 181 } 182 z = absZ0; 183 if ( zSign ) z = - z; 184 if ( z && ( ( z < 0 ) ^ zSign ) ) { 185 overflow: 186 float_raise( float_flag_invalid ); 187 return 188 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 189 : LIT64( 0x7FFFFFFFFFFFFFFF ); 190 } 191 if ( absZ1 ) float_set_inexact(); 192 return z; 193 194 } 195 196 /* same as above, but for unsigned values */ 197 static uint64 roundAndPackUInt64( bits64 absZ0, bits64 absZ1 ) 198 { 199 int8 roundingMode; 200 flag roundNearestEven, increment; 201 uint64 z; 202 203 roundingMode = float_rounding_mode(); 204 roundNearestEven = ( roundingMode == float_round_nearest_even ); 205 increment = ( (sbits64) absZ1 < 0 ); 206 if ( ! roundNearestEven ) { 207 if ( roundingMode == float_round_to_zero ) { 208 increment = 0; 209 } 210 else { 211 increment = ( roundingMode == float_round_up ) && absZ1; 212 } 213 } 214 if ( increment ) { 215 ++absZ0; 216 if ( absZ0 == 0 ) { 217 float_raise( float_flag_invalid ); 218 return LIT64( 0x7FFFFFFFFFFFFFFF ); 219 } 220 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 221 } 222 z = absZ0; 223 if ( absZ1 ) float_set_inexact(); 224 return z; 225 226 } 227 #endif /* SOFTFLOAT_FOR_GCC */ 228 229 /*---------------------------------------------------------------------------- 230 | Returns the fraction bits of the single-precision floating-point value `a'. 231 *----------------------------------------------------------------------------*/ 232 233 INLINE bits32 extractFloat32Frac( float32 a ) 234 { 235 236 return a & 0x007FFFFF; 237 238 } 239 240 /*---------------------------------------------------------------------------- 241 | Returns the exponent bits of the single-precision floating-point value `a'. 242 *----------------------------------------------------------------------------*/ 243 244 INLINE int16 extractFloat32Exp( float32 a ) 245 { 246 247 return ( a>>23 ) & 0xFF; 248 249 } 250 251 /*---------------------------------------------------------------------------- 252 | Returns the sign bit of the single-precision floating-point value `a'. 253 *----------------------------------------------------------------------------*/ 254 255 INLINE flag extractFloat32Sign( float32 a ) 256 { 257 258 return a>>31; 259 260 } 261 262 /*---------------------------------------------------------------------------- 263 | Normalizes the subnormal single-precision floating-point value represented 264 | by the denormalized significand `aSig'. The normalized exponent and 265 | significand are stored at the locations pointed to by `zExpPtr' and 266 | `zSigPtr', respectively. 267 *----------------------------------------------------------------------------*/ 268 269 static void 270 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 271 { 272 int8 shiftCount; 273 274 shiftCount = countLeadingZeros32( aSig ) - 8; 275 *zSigPtr = aSig<<shiftCount; 276 *zExpPtr = 1 - shiftCount; 277 278 } 279 280 /*---------------------------------------------------------------------------- 281 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 282 | single-precision floating-point value, returning the result. After being 283 | shifted into the proper positions, the three fields are simply added 284 | together to form the result. This means that any integer portion of `zSig' 285 | will be added into the exponent. Since a properly normalized significand 286 | will have an integer portion equal to 1, the `zExp' input should be 1 less 287 | than the desired result exponent whenever `zSig' is a complete, normalized 288 | significand. 289 *----------------------------------------------------------------------------*/ 290 291 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 292 { 293 294 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 295 296 } 297 298 /*---------------------------------------------------------------------------- 299 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 300 | and significand `zSig', and returns the proper single-precision floating- 301 | point value corresponding to the abstract input. Ordinarily, the abstract 302 | value is simply rounded and packed into the single-precision format, with 303 | the inexact exception raised if the abstract input cannot be represented 304 | exactly. However, if the abstract value is too large, the overflow and 305 | inexact exceptions are raised and an infinity or maximal finite value is 306 | returned. If the abstract value is too small, the input value is rounded to 307 | a subnormal number, and the underflow and inexact exceptions are raised if 308 | the abstract input cannot be represented exactly as a subnormal single- 309 | precision floating-point number. 310 | The input significand `zSig' has its binary point between bits 30 311 | and 29, which is 7 bits to the left of the usual location. This shifted 312 | significand must be normalized or smaller. If `zSig' is not normalized, 313 | `zExp' must be 0; in that case, the result returned is a subnormal number, 314 | and it must not require rounding. In the usual case that `zSig' is 315 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 316 | The handling of underflow and overflow follows the IEC/IEEE Standard for 317 | Binary Floating-Point Arithmetic. 318 *----------------------------------------------------------------------------*/ 319 320 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 321 { 322 int8 roundingMode; 323 flag roundNearestEven; 324 int8 roundIncrement, roundBits; 325 flag isTiny; 326 327 roundingMode = float_rounding_mode(); 328 roundNearestEven = ( roundingMode == float_round_nearest_even ); 329 roundIncrement = 0x40; 330 if ( ! roundNearestEven ) { 331 if ( roundingMode == float_round_to_zero ) { 332 roundIncrement = 0; 333 } 334 else { 335 roundIncrement = 0x7F; 336 if ( zSign ) { 337 if ( roundingMode == float_round_up ) roundIncrement = 0; 338 } 339 else { 340 if ( roundingMode == float_round_down ) roundIncrement = 0; 341 } 342 } 343 } 344 roundBits = zSig & 0x7F; 345 if ( 0xFD <= (bits16) zExp ) { 346 if ( ( 0xFD < zExp ) 347 || ( ( zExp == 0xFD ) 348 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 349 ) { 350 float_raise( float_flag_overflow | float_flag_inexact ); 351 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 352 } 353 if ( zExp < 0 ) { 354 isTiny = 355 ( float_detect_tininess == float_tininess_before_rounding ) 356 || ( zExp < -1 ) 357 || ( zSig + roundIncrement < 0x80000000 ); 358 shift32RightJamming( zSig, - zExp, &zSig ); 359 zExp = 0; 360 roundBits = zSig & 0x7F; 361 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 362 } 363 } 364 if ( roundBits ) float_set_inexact(); 365 zSig = ( zSig + roundIncrement )>>7; 366 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 367 if ( zSig == 0 ) zExp = 0; 368 return packFloat32( zSign, zExp, zSig ); 369 370 } 371 372 /*---------------------------------------------------------------------------- 373 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 374 | and significand `zSig', and returns the proper single-precision floating- 375 | point value corresponding to the abstract input. This routine is just like 376 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 377 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 378 | floating-point exponent. 379 *----------------------------------------------------------------------------*/ 380 381 static float32 382 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 383 { 384 int8 shiftCount; 385 386 shiftCount = countLeadingZeros32( zSig ) - 1; 387 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 388 389 } 390 391 /*---------------------------------------------------------------------------- 392 | Returns the fraction bits of the double-precision floating-point value `a'. 393 *----------------------------------------------------------------------------*/ 394 395 INLINE bits64 extractFloat64Frac( float64 a ) 396 { 397 398 return a & LIT64( 0x000FFFFFFFFFFFFF ); 399 400 } 401 402 /*---------------------------------------------------------------------------- 403 | Returns the exponent bits of the double-precision floating-point value `a'. 404 *----------------------------------------------------------------------------*/ 405 406 INLINE int16 extractFloat64Exp( float64 a ) 407 { 408 409 return ( a>>52 ) & 0x7FF; 410 411 } 412 /*---------------------------------------------------------------------------- 413 | Returns the sign bit of the double-precision floating-point value `a'. 414 *----------------------------------------------------------------------------*/ 415 416 INLINE flag extractFloat64Sign( float64 a ) 417 { 418 419 return a>>63; 420 421 } 422 423 /*---------------------------------------------------------------------------- 424 | Normalizes the subnormal double-precision floating-point value represented 425 | by the denormalized significand `aSig'. The normalized exponent and 426 | significand are stored at the locations pointed to by `zExpPtr' and 427 | `zSigPtr', respectively. 428 *----------------------------------------------------------------------------*/ 429 430 static void 431 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 432 { 433 int8 shiftCount; 434 435 shiftCount = countLeadingZeros64( aSig ) - 11; 436 *zSigPtr = aSig<<shiftCount; 437 *zExpPtr = 1 - shiftCount; 438 439 } 440 441 /*---------------------------------------------------------------------------- 442 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 443 | double-precision floating-point value, returning the result. After being 444 | shifted into the proper positions, the three fields are simply added 445 | together to form the result. This means that any integer portion of `zSig' 446 | will be added into the exponent. Since a properly normalized significand 447 | will have an integer portion equal to 1, the `zExp' input should be 1 less 448 | than the desired result exponent whenever `zSig' is a complete, normalized 449 | significand. 450 *----------------------------------------------------------------------------*/ 451 452 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 453 { 454 455 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig; 456 457 } 458 459 /*---------------------------------------------------------------------------- 460 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 461 | and significand `zSig', and returns the proper double-precision floating- 462 | point value corresponding to the abstract input. Ordinarily, the abstract 463 | value is simply rounded and packed into the double-precision format, with 464 | the inexact exception raised if the abstract input cannot be represented 465 | exactly. However, if the abstract value is too large, the overflow and 466 | inexact exceptions are raised and an infinity or maximal finite value is 467 | returned. If the abstract value is too small, the input value is rounded 468 | to a subnormal number, and the underflow and inexact exceptions are raised 469 | if the abstract input cannot be represented exactly as a subnormal double- 470 | precision floating-point number. 471 | The input significand `zSig' has its binary point between bits 62 472 | and 61, which is 10 bits to the left of the usual location. This shifted 473 | significand must be normalized or smaller. If `zSig' is not normalized, 474 | `zExp' must be 0; in that case, the result returned is a subnormal number, 475 | and it must not require rounding. In the usual case that `zSig' is 476 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 477 | The handling of underflow and overflow follows the IEC/IEEE Standard for 478 | Binary Floating-Point Arithmetic. 479 *----------------------------------------------------------------------------*/ 480 481 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 482 { 483 int8 roundingMode; 484 flag roundNearestEven; 485 int16 roundIncrement, roundBits; 486 flag isTiny; 487 488 roundingMode = float_rounding_mode(); 489 roundNearestEven = ( roundingMode == float_round_nearest_even ); 490 roundIncrement = 0x200; 491 if ( ! roundNearestEven ) { 492 if ( roundingMode == float_round_to_zero ) { 493 roundIncrement = 0; 494 } 495 else { 496 roundIncrement = 0x3FF; 497 if ( zSign ) { 498 if ( roundingMode == float_round_up ) roundIncrement = 0; 499 } 500 else { 501 if ( roundingMode == float_round_down ) roundIncrement = 0; 502 } 503 } 504 } 505 roundBits = zSig & 0x3FF; 506 if ( 0x7FD <= (bits16) zExp ) { 507 if ( ( 0x7FD < zExp ) 508 || ( ( zExp == 0x7FD ) 509 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 510 ) { 511 float_raise( float_flag_overflow | float_flag_inexact ); 512 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 ); 513 } 514 if ( zExp < 0 ) { 515 isTiny = 516 ( float_detect_tininess == float_tininess_before_rounding ) 517 || ( zExp < -1 ) 518 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 519 shift64RightJamming( zSig, - zExp, &zSig ); 520 zExp = 0; 521 roundBits = zSig & 0x3FF; 522 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 523 } 524 } 525 if ( roundBits ) float_set_inexact(); 526 zSig = ( zSig + roundIncrement )>>10; 527 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 528 if ( zSig == 0 ) zExp = 0; 529 return packFloat64( zSign, zExp, zSig ); 530 531 } 532 533 /*---------------------------------------------------------------------------- 534 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 535 | and significand `zSig', and returns the proper double-precision floating- 536 | point value corresponding to the abstract input. This routine is just like 537 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 538 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 539 | floating-point exponent. 540 *----------------------------------------------------------------------------*/ 541 542 static float64 543 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 544 { 545 int8 shiftCount; 546 547 shiftCount = countLeadingZeros64( zSig ) - 1; 548 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 549 550 } 551 552 #ifdef FLOATX80 553 554 /*---------------------------------------------------------------------------- 555 | Returns the fraction bits of the extended double-precision floating-point 556 | value `a'. 557 *----------------------------------------------------------------------------*/ 558 559 INLINE bits64 extractFloatx80Frac( floatx80 a ) 560 { 561 562 return a.low; 563 564 } 565 566 /*---------------------------------------------------------------------------- 567 | Returns the exponent bits of the extended double-precision floating-point 568 | value `a'. 569 *----------------------------------------------------------------------------*/ 570 571 INLINE int32 extractFloatx80Exp( floatx80 a ) 572 { 573 574 return a.high & 0x7FFF; 575 576 } 577 578 /*---------------------------------------------------------------------------- 579 | Returns the sign bit of the extended double-precision floating-point value 580 | `a'. 581 *----------------------------------------------------------------------------*/ 582 583 INLINE flag extractFloatx80Sign( floatx80 a ) 584 { 585 586 return a.high>>15; 587 588 } 589 590 /*---------------------------------------------------------------------------- 591 | Normalizes the subnormal extended double-precision floating-point value 592 | represented by the denormalized significand `aSig'. The normalized exponent 593 | and significand are stored at the locations pointed to by `zExpPtr' and 594 | `zSigPtr', respectively. 595 *----------------------------------------------------------------------------*/ 596 597 static void 598 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 599 { 600 int8 shiftCount; 601 602 shiftCount = countLeadingZeros64( aSig ); 603 *zSigPtr = aSig<<shiftCount; 604 *zExpPtr = 1 - shiftCount; 605 606 } 607 608 /*---------------------------------------------------------------------------- 609 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 610 | extended double-precision floating-point value, returning the result. 611 *----------------------------------------------------------------------------*/ 612 613 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 614 { 615 floatx80 z; 616 617 z.low = zSig; 618 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 619 return z; 620 621 } 622 623 /*---------------------------------------------------------------------------- 624 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 625 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 626 | and returns the proper extended double-precision floating-point value 627 | corresponding to the abstract input. Ordinarily, the abstract value is 628 | rounded and packed into the extended double-precision format, with the 629 | inexact exception raised if the abstract input cannot be represented 630 | exactly. However, if the abstract value is too large, the overflow and 631 | inexact exceptions are raised and an infinity or maximal finite value is 632 | returned. If the abstract value is too small, the input value is rounded to 633 | a subnormal number, and the underflow and inexact exceptions are raised if 634 | the abstract input cannot be represented exactly as a subnormal extended 635 | double-precision floating-point number. 636 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 637 | number of bits as single or double precision, respectively. Otherwise, the 638 | result is rounded to the full precision of the extended double-precision 639 | format. 640 | The input significand must be normalized or smaller. If the input 641 | significand is not normalized, `zExp' must be 0; in that case, the result 642 | returned is a subnormal number, and it must not require rounding. The 643 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 644 | Floating-Point Arithmetic. 645 *----------------------------------------------------------------------------*/ 646 647 static floatx80 648 roundAndPackFloatx80( 649 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 650 ) 651 { 652 int8 roundingMode; 653 flag roundNearestEven, increment, isTiny; 654 int64 roundIncrement, roundMask, roundBits; 655 656 roundingMode = float_rounding_mode(); 657 roundNearestEven = ( roundingMode == float_round_nearest_even ); 658 if ( roundingPrecision == 80 ) goto precision80; 659 if ( roundingPrecision == 64 ) { 660 roundIncrement = LIT64( 0x0000000000000400 ); 661 roundMask = LIT64( 0x00000000000007FF ); 662 } 663 else if ( roundingPrecision == 32 ) { 664 roundIncrement = LIT64( 0x0000008000000000 ); 665 roundMask = LIT64( 0x000000FFFFFFFFFF ); 666 } 667 else { 668 goto precision80; 669 } 670 zSig0 |= ( zSig1 != 0 ); 671 if ( ! roundNearestEven ) { 672 if ( roundingMode == float_round_to_zero ) { 673 roundIncrement = 0; 674 } 675 else { 676 roundIncrement = roundMask; 677 if ( zSign ) { 678 if ( roundingMode == float_round_up ) roundIncrement = 0; 679 } 680 else { 681 if ( roundingMode == float_round_down ) roundIncrement = 0; 682 } 683 } 684 } 685 roundBits = zSig0 & roundMask; 686 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 687 if ( ( 0x7FFE < zExp ) 688 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 689 ) { 690 goto overflow; 691 } 692 if ( zExp <= 0 ) { 693 isTiny = 694 ( float_detect_tininess == float_tininess_before_rounding ) 695 || ( zExp < 0 ) 696 || ( zSig0 <= zSig0 + roundIncrement ); 697 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 698 zExp = 0; 699 roundBits = zSig0 & roundMask; 700 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 701 if ( roundBits ) float_set_inexact(); 702 zSig0 += roundIncrement; 703 if ( (sbits64) zSig0 < 0 ) zExp = 1; 704 roundIncrement = roundMask + 1; 705 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 706 roundMask |= roundIncrement; 707 } 708 zSig0 &= ~ roundMask; 709 return packFloatx80( zSign, zExp, zSig0 ); 710 } 711 } 712 if ( roundBits ) float_set_inexact(); 713 zSig0 += roundIncrement; 714 if ( zSig0 < roundIncrement ) { 715 ++zExp; 716 zSig0 = LIT64( 0x8000000000000000 ); 717 } 718 roundIncrement = roundMask + 1; 719 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 720 roundMask |= roundIncrement; 721 } 722 zSig0 &= ~ roundMask; 723 if ( zSig0 == 0 ) zExp = 0; 724 return packFloatx80( zSign, zExp, zSig0 ); 725 precision80: 726 increment = ( (sbits64) zSig1 < 0 ); 727 if ( ! roundNearestEven ) { 728 if ( roundingMode == float_round_to_zero ) { 729 increment = 0; 730 } 731 else { 732 if ( zSign ) { 733 increment = ( roundingMode == float_round_down ) && zSig1; 734 } 735 else { 736 increment = ( roundingMode == float_round_up ) && zSig1; 737 } 738 } 739 } 740 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 741 if ( ( 0x7FFE < zExp ) 742 || ( ( zExp == 0x7FFE ) 743 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 744 && increment 745 ) 746 ) { 747 roundMask = 0; 748 overflow: 749 float_raise( float_flag_overflow | float_flag_inexact ); 750 if ( ( roundingMode == float_round_to_zero ) 751 || ( zSign && ( roundingMode == float_round_up ) ) 752 || ( ! zSign && ( roundingMode == float_round_down ) ) 753 ) { 754 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 755 } 756 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 757 } 758 if ( zExp <= 0 ) { 759 isTiny = 760 ( float_detect_tininess == float_tininess_before_rounding ) 761 || ( zExp < 0 ) 762 || ! increment 763 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 764 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 765 zExp = 0; 766 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 767 if ( zSig1 ) float_set_inexact(); 768 if ( roundNearestEven ) { 769 increment = ( (sbits64) zSig1 < 0 ); 770 } 771 else { 772 if ( zSign ) { 773 increment = ( roundingMode == float_round_down ) && zSig1; 774 } 775 else { 776 increment = ( roundingMode == float_round_up ) && zSig1; 777 } 778 } 779 if ( increment ) { 780 ++zSig0; 781 zSig0 &= 782 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 783 if ( (sbits64) zSig0 < 0 ) zExp = 1; 784 } 785 return packFloatx80( zSign, zExp, zSig0 ); 786 } 787 } 788 if ( zSig1 ) float_set_inexact(); 789 if ( increment ) { 790 ++zSig0; 791 if ( zSig0 == 0 ) { 792 ++zExp; 793 zSig0 = LIT64( 0x8000000000000000 ); 794 } 795 else { 796 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 797 } 798 } 799 else { 800 if ( zSig0 == 0 ) zExp = 0; 801 } 802 return packFloatx80( zSign, zExp, zSig0 ); 803 804 } 805 806 /*---------------------------------------------------------------------------- 807 | Takes an abstract floating-point value having sign `zSign', exponent 808 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 809 | and returns the proper extended double-precision floating-point value 810 | corresponding to the abstract input. This routine is just like 811 | `roundAndPackFloatx80' except that the input significand does not have to be 812 | normalized. 813 *----------------------------------------------------------------------------*/ 814 815 static floatx80 816 normalizeRoundAndPackFloatx80( 817 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 818 ) 819 { 820 int8 shiftCount; 821 822 if ( zSig0 == 0 ) { 823 zSig0 = zSig1; 824 zSig1 = 0; 825 zExp -= 64; 826 } 827 shiftCount = countLeadingZeros64( zSig0 ); 828 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 829 zExp -= shiftCount; 830 return 831 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 832 833 } 834 835 #endif 836 837 #ifdef FLOAT128 838 839 /*---------------------------------------------------------------------------- 840 | Returns the least-significant 64 fraction bits of the quadruple-precision 841 | floating-point value `a'. 842 *----------------------------------------------------------------------------*/ 843 844 INLINE bits64 extractFloat128Frac1( float128 a ) 845 { 846 847 return a.low; 848 849 } 850 851 /*---------------------------------------------------------------------------- 852 | Returns the most-significant 48 fraction bits of the quadruple-precision 853 | floating-point value `a'. 854 *----------------------------------------------------------------------------*/ 855 856 INLINE bits64 extractFloat128Frac0( float128 a ) 857 { 858 859 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 860 861 } 862 863 /*---------------------------------------------------------------------------- 864 | Returns the exponent bits of the quadruple-precision floating-point value 865 | `a'. 866 *----------------------------------------------------------------------------*/ 867 868 INLINE int32 extractFloat128Exp( float128 a ) 869 { 870 871 return ( a.high>>48 ) & 0x7FFF; 872 873 } 874 875 /*---------------------------------------------------------------------------- 876 | Returns the sign bit of the quadruple-precision floating-point value `a'. 877 *----------------------------------------------------------------------------*/ 878 879 INLINE flag extractFloat128Sign( float128 a ) 880 { 881 882 return a.high>>63; 883 884 } 885 886 /*---------------------------------------------------------------------------- 887 | Normalizes the subnormal quadruple-precision floating-point value 888 | represented by the denormalized significand formed by the concatenation of 889 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 890 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 891 | significand are stored at the location pointed to by `zSig0Ptr', and the 892 | least significant 64 bits of the normalized significand are stored at the 893 | location pointed to by `zSig1Ptr'. 894 *----------------------------------------------------------------------------*/ 895 896 static void 897 normalizeFloat128Subnormal( 898 bits64 aSig0, 899 bits64 aSig1, 900 int32 *zExpPtr, 901 bits64 *zSig0Ptr, 902 bits64 *zSig1Ptr 903 ) 904 { 905 int8 shiftCount; 906 907 if ( aSig0 == 0 ) { 908 shiftCount = countLeadingZeros64( aSig1 ) - 15; 909 if ( shiftCount < 0 ) { 910 *zSig0Ptr = aSig1>>( - shiftCount ); 911 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 912 } 913 else { 914 *zSig0Ptr = aSig1<<shiftCount; 915 *zSig1Ptr = 0; 916 } 917 *zExpPtr = - shiftCount - 63; 918 } 919 else { 920 shiftCount = countLeadingZeros64( aSig0 ) - 15; 921 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 922 *zExpPtr = 1 - shiftCount; 923 } 924 925 } 926 927 /*---------------------------------------------------------------------------- 928 | Packs the sign `zSign', the exponent `zExp', and the significand formed 929 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 930 | floating-point value, returning the result. After being shifted into the 931 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 932 | added together to form the most significant 32 bits of the result. This 933 | means that any integer portion of `zSig0' will be added into the exponent. 934 | Since a properly normalized significand will have an integer portion equal 935 | to 1, the `zExp' input should be 1 less than the desired result exponent 936 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 937 | significand. 938 *----------------------------------------------------------------------------*/ 939 940 INLINE float128 941 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 942 { 943 float128 z; 944 945 z.low = zSig1; 946 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 947 return z; 948 949 } 950 951 /*---------------------------------------------------------------------------- 952 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 953 | and extended significand formed by the concatenation of `zSig0', `zSig1', 954 | and `zSig2', and returns the proper quadruple-precision floating-point value 955 | corresponding to the abstract input. Ordinarily, the abstract value is 956 | simply rounded and packed into the quadruple-precision format, with the 957 | inexact exception raised if the abstract input cannot be represented 958 | exactly. However, if the abstract value is too large, the overflow and 959 | inexact exceptions are raised and an infinity or maximal finite value is 960 | returned. If the abstract value is too small, the input value is rounded to 961 | a subnormal number, and the underflow and inexact exceptions are raised if 962 | the abstract input cannot be represented exactly as a subnormal quadruple- 963 | precision floating-point number. 964 | The input significand must be normalized or smaller. If the input 965 | significand is not normalized, `zExp' must be 0; in that case, the result 966 | returned is a subnormal number, and it must not require rounding. In the 967 | usual case that the input significand is normalized, `zExp' must be 1 less 968 | than the ``true'' floating-point exponent. The handling of underflow and 969 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 970 *----------------------------------------------------------------------------*/ 971 972 static float128 973 roundAndPackFloat128( 974 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 975 { 976 int8 roundingMode; 977 flag roundNearestEven, increment, isTiny; 978 979 roundingMode = float_rounding_mode(); 980 roundNearestEven = ( roundingMode == float_round_nearest_even ); 981 increment = ( (sbits64) zSig2 < 0 ); 982 if ( ! roundNearestEven ) { 983 if ( roundingMode == float_round_to_zero ) { 984 increment = 0; 985 } 986 else { 987 if ( zSign ) { 988 increment = ( roundingMode == float_round_down ) && zSig2; 989 } 990 else { 991 increment = ( roundingMode == float_round_up ) && zSig2; 992 } 993 } 994 } 995 if ( 0x7FFD <= (bits32) zExp ) { 996 if ( ( 0x7FFD < zExp ) 997 || ( ( zExp == 0x7FFD ) 998 && eq128( 999 LIT64( 0x0001FFFFFFFFFFFF ), 1000 LIT64( 0xFFFFFFFFFFFFFFFF ), 1001 zSig0, 1002 zSig1 1003 ) 1004 && increment 1005 ) 1006 ) { 1007 float_raise( float_flag_overflow | float_flag_inexact ); 1008 if ( ( roundingMode == float_round_to_zero ) 1009 || ( zSign && ( roundingMode == float_round_up ) ) 1010 || ( ! zSign && ( roundingMode == float_round_down ) ) 1011 ) { 1012 return 1013 packFloat128( 1014 zSign, 1015 0x7FFE, 1016 LIT64( 0x0000FFFFFFFFFFFF ), 1017 LIT64( 0xFFFFFFFFFFFFFFFF ) 1018 ); 1019 } 1020 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1021 } 1022 if ( zExp < 0 ) { 1023 isTiny = 1024 ( float_detect_tininess == float_tininess_before_rounding ) 1025 || ( zExp < -1 ) 1026 || ! increment 1027 || lt128( 1028 zSig0, 1029 zSig1, 1030 LIT64( 0x0001FFFFFFFFFFFF ), 1031 LIT64( 0xFFFFFFFFFFFFFFFF ) 1032 ); 1033 shift128ExtraRightJamming( 1034 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1035 zExp = 0; 1036 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1037 if ( roundNearestEven ) { 1038 increment = ( (sbits64) zSig2 < 0 ); 1039 } 1040 else { 1041 if ( zSign ) { 1042 increment = ( roundingMode == float_round_down ) && zSig2; 1043 } 1044 else { 1045 increment = ( roundingMode == float_round_up ) && zSig2; 1046 } 1047 } 1048 } 1049 } 1050 if ( zSig2 ) float_set_inexact(); 1051 if ( increment ) { 1052 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1053 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1054 } 1055 else { 1056 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1057 } 1058 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1059 1060 } 1061 1062 /*---------------------------------------------------------------------------- 1063 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1064 | and significand formed by the concatenation of `zSig0' and `zSig1', and 1065 | returns the proper quadruple-precision floating-point value corresponding 1066 | to the abstract input. This routine is just like `roundAndPackFloat128' 1067 | except that the input significand has fewer bits and does not have to be 1068 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1069 | point exponent. 1070 *----------------------------------------------------------------------------*/ 1071 1072 static float128 1073 normalizeRoundAndPackFloat128( 1074 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1075 { 1076 int8 shiftCount; 1077 bits64 zSig2; 1078 1079 if ( zSig0 == 0 ) { 1080 zSig0 = zSig1; 1081 zSig1 = 0; 1082 zExp -= 64; 1083 } 1084 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1085 if ( 0 <= shiftCount ) { 1086 zSig2 = 0; 1087 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1088 } 1089 else { 1090 shift128ExtraRightJamming( 1091 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1092 } 1093 zExp -= shiftCount; 1094 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1095 1096 } 1097 1098 #endif 1099 1100 /*---------------------------------------------------------------------------- 1101 | Returns the result of converting the 32-bit two's complement integer `a' 1102 | to the single-precision floating-point format. The conversion is performed 1103 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1104 *----------------------------------------------------------------------------*/ 1105 1106 float32 int32_to_float32( int32 a ) 1107 { 1108 flag zSign; 1109 1110 if ( a == 0 ) return 0; 1111 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1112 zSign = ( a < 0 ); 1113 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 1114 1115 } 1116 1117 /*---------------------------------------------------------------------------- 1118 | Returns the result of converting the 32-bit two's complement integer `a' 1119 | to the double-precision floating-point format. The conversion is performed 1120 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1121 *----------------------------------------------------------------------------*/ 1122 1123 float64 int32_to_float64( int32 a ) 1124 { 1125 flag zSign; 1126 uint32 absA; 1127 int8 shiftCount; 1128 bits64 zSig; 1129 1130 if ( a == 0 ) return 0; 1131 zSign = ( a < 0 ); 1132 absA = zSign ? - a : a; 1133 shiftCount = countLeadingZeros32( absA ) + 21; 1134 zSig = absA; 1135 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1136 1137 } 1138 1139 #ifdef FLOATX80 1140 1141 /*---------------------------------------------------------------------------- 1142 | Returns the result of converting the 32-bit two's complement integer `a' 1143 | to the extended double-precision floating-point format. The conversion 1144 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1145 | Arithmetic. 1146 *----------------------------------------------------------------------------*/ 1147 1148 floatx80 int32_to_floatx80( int32 a ) 1149 { 1150 flag zSign; 1151 uint32 absA; 1152 int8 shiftCount; 1153 bits64 zSig; 1154 1155 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1156 zSign = ( a < 0 ); 1157 absA = zSign ? - a : a; 1158 shiftCount = countLeadingZeros32( absA ) + 32; 1159 zSig = absA; 1160 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1161 1162 } 1163 1164 #endif 1165 1166 #ifdef FLOAT128 1167 1168 /*---------------------------------------------------------------------------- 1169 | Returns the result of converting the 32-bit two's complement integer `a' to 1170 | the quadruple-precision floating-point format. The conversion is performed 1171 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1172 *----------------------------------------------------------------------------*/ 1173 1174 float128 int32_to_float128( int32 a ) 1175 { 1176 flag zSign; 1177 uint32 absA; 1178 int8 shiftCount; 1179 bits64 zSig0; 1180 1181 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1182 zSign = ( a < 0 ); 1183 absA = zSign ? - a : a; 1184 shiftCount = countLeadingZeros32( absA ) + 17; 1185 zSig0 = absA; 1186 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1187 1188 } 1189 1190 #endif 1191 1192 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1193 /*---------------------------------------------------------------------------- 1194 | Returns the result of converting the 64-bit two's complement integer `a' 1195 | to the single-precision floating-point format. The conversion is performed 1196 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1197 *----------------------------------------------------------------------------*/ 1198 1199 float32 int64_to_float32( int64 a ) 1200 { 1201 flag zSign; 1202 uint64 absA; 1203 int8 shiftCount; 1204 1205 if ( a == 0 ) return 0; 1206 zSign = ( a < 0 ); 1207 absA = zSign ? - a : a; 1208 shiftCount = countLeadingZeros64( absA ) - 40; 1209 if ( 0 <= shiftCount ) { 1210 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1211 } 1212 else { 1213 shiftCount += 7; 1214 if ( shiftCount < 0 ) { 1215 shift64RightJamming( absA, - shiftCount, &absA ); 1216 } 1217 else { 1218 absA <<= shiftCount; 1219 } 1220 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1221 } 1222 1223 } 1224 1225 /*---------------------------------------------------------------------------- 1226 | Returns the result of converting the 64-bit two's complement integer `a' 1227 | to the double-precision floating-point format. The conversion is performed 1228 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1229 *----------------------------------------------------------------------------*/ 1230 1231 float64 int64_to_float64( int64 a ) 1232 { 1233 flag zSign; 1234 1235 if ( a == 0 ) return 0; 1236 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1237 return packFloat64( 1, 0x43E, 0 ); 1238 } 1239 zSign = ( a < 0 ); 1240 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1241 1242 } 1243 1244 #ifdef FLOATX80 1245 1246 /*---------------------------------------------------------------------------- 1247 | Returns the result of converting the 64-bit two's complement integer `a' 1248 | to the extended double-precision floating-point format. The conversion 1249 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1250 | Arithmetic. 1251 *----------------------------------------------------------------------------*/ 1252 1253 floatx80 int64_to_floatx80( int64 a ) 1254 { 1255 flag zSign; 1256 uint64 absA; 1257 int8 shiftCount; 1258 1259 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1260 zSign = ( a < 0 ); 1261 absA = zSign ? - a : a; 1262 shiftCount = countLeadingZeros64( absA ); 1263 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1264 1265 } 1266 1267 #endif 1268 1269 #ifdef FLOAT128 1270 1271 /*---------------------------------------------------------------------------- 1272 | Returns the result of converting the 64-bit two's complement integer `a' to 1273 | the quadruple-precision floating-point format. The conversion is performed 1274 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1275 *----------------------------------------------------------------------------*/ 1276 1277 float128 int64_to_float128( int64 a ) 1278 { 1279 flag zSign; 1280 uint64 absA; 1281 int8 shiftCount; 1282 int32 zExp; 1283 bits64 zSig0, zSig1; 1284 1285 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1286 zSign = ( a < 0 ); 1287 absA = zSign ? - a : a; 1288 shiftCount = countLeadingZeros64( absA ) + 49; 1289 zExp = 0x406E - shiftCount; 1290 if ( 64 <= shiftCount ) { 1291 zSig1 = 0; 1292 zSig0 = absA; 1293 shiftCount -= 64; 1294 } 1295 else { 1296 zSig1 = absA; 1297 zSig0 = 0; 1298 } 1299 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1300 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1301 1302 } 1303 1304 #endif 1305 #endif /* !SOFTFLOAT_FOR_GCC */ 1306 1307 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1308 /*---------------------------------------------------------------------------- 1309 | Returns the result of converting the single-precision floating-point value 1310 | `a' to the 32-bit two's complement integer format. The conversion is 1311 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1312 | Arithmetic---which means in particular that the conversion is rounded 1313 | according to the current rounding mode. If `a' is a NaN, the largest 1314 | positive integer is returned. Otherwise, if the conversion overflows, the 1315 | largest integer with the same sign as `a' is returned. 1316 *----------------------------------------------------------------------------*/ 1317 1318 int32 float32_to_int32( float32 a ) 1319 { 1320 flag aSign; 1321 int16 aExp, shiftCount; 1322 bits32 aSig; 1323 bits64 aSig64; 1324 1325 aSig = extractFloat32Frac( a ); 1326 aExp = extractFloat32Exp( a ); 1327 aSign = extractFloat32Sign( a ); 1328 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1329 if ( aExp ) aSig |= 0x00800000; 1330 shiftCount = 0xAF - aExp; 1331 aSig64 = aSig; 1332 aSig64 <<= 32; 1333 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1334 return roundAndPackInt32( aSign, aSig64 ); 1335 1336 } 1337 #endif /* !SOFTFLOAT_FOR_GCC */ 1338 1339 /*---------------------------------------------------------------------------- 1340 | Returns the result of converting the single-precision floating-point value 1341 | `a' to the 32-bit two's complement integer format. The conversion is 1342 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1343 | Arithmetic, except that the conversion is always rounded toward zero. 1344 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1345 | the conversion overflows, the largest integer with the same sign as `a' is 1346 | returned. 1347 *----------------------------------------------------------------------------*/ 1348 1349 int32 float32_to_int32_round_to_zero( float32 a ) 1350 { 1351 flag aSign; 1352 int16 aExp, shiftCount; 1353 bits32 aSig; 1354 int32 z; 1355 1356 aSig = extractFloat32Frac( a ); 1357 aExp = extractFloat32Exp( a ); 1358 aSign = extractFloat32Sign( a ); 1359 shiftCount = aExp - 0x9E; 1360 if ( 0 <= shiftCount ) { 1361 if ( a != 0xCF000000 ) { 1362 float_raise( float_flag_invalid ); 1363 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1364 } 1365 return (sbits32) 0x80000000; 1366 } 1367 else if ( aExp <= 0x7E ) { 1368 if ( aExp | aSig ) float_set_inexact(); 1369 return 0; 1370 } 1371 aSig = ( aSig | 0x00800000 )<<8; 1372 z = aSig>>( - shiftCount ); 1373 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1374 float_set_inexact(); 1375 } 1376 if ( aSign ) z = - z; 1377 return z; 1378 1379 } 1380 1381 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1382 /*---------------------------------------------------------------------------- 1383 | Returns the result of converting the single-precision floating-point value 1384 | `a' to the 64-bit two's complement integer format. The conversion is 1385 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1386 | Arithmetic---which means in particular that the conversion is rounded 1387 | according to the current rounding mode. If `a' is a NaN, the largest 1388 | positive integer is returned. Otherwise, if the conversion overflows, the 1389 | largest integer with the same sign as `a' is returned. 1390 *----------------------------------------------------------------------------*/ 1391 1392 int64 float32_to_int64( float32 a ) 1393 { 1394 flag aSign; 1395 int16 aExp, shiftCount; 1396 bits32 aSig; 1397 bits64 aSig64, aSigExtra; 1398 1399 aSig = extractFloat32Frac( a ); 1400 aExp = extractFloat32Exp( a ); 1401 aSign = extractFloat32Sign( a ); 1402 shiftCount = 0xBE - aExp; 1403 if ( shiftCount < 0 ) { 1404 float_raise( float_flag_invalid ); 1405 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1406 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1407 } 1408 return (sbits64) LIT64( 0x8000000000000000 ); 1409 } 1410 if ( aExp ) aSig |= 0x00800000; 1411 aSig64 = aSig; 1412 aSig64 <<= 40; 1413 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1414 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1415 1416 } 1417 1418 /*---------------------------------------------------------------------------- 1419 | Returns the result of converting the single-precision floating-point value 1420 | `a' to the 64-bit two's complement integer format. The conversion is 1421 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1422 | Arithmetic, except that the conversion is always rounded toward zero. If 1423 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1424 | conversion overflows, the largest integer with the same sign as `a' is 1425 | returned. 1426 *----------------------------------------------------------------------------*/ 1427 1428 int64 float32_to_int64_round_to_zero( float32 a ) 1429 { 1430 flag aSign; 1431 int16 aExp, shiftCount; 1432 bits32 aSig; 1433 bits64 aSig64; 1434 int64 z; 1435 1436 aSig = extractFloat32Frac( a ); 1437 aExp = extractFloat32Exp( a ); 1438 aSign = extractFloat32Sign( a ); 1439 shiftCount = aExp - 0xBE; 1440 if ( 0 <= shiftCount ) { 1441 if ( a != 0xDF000000 ) { 1442 float_raise( float_flag_invalid ); 1443 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1444 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1445 } 1446 } 1447 return (sbits64) LIT64( 0x8000000000000000 ); 1448 } 1449 else if ( aExp <= 0x7E ) { 1450 if ( aExp | aSig ) float_set_inexact(); 1451 return 0; 1452 } 1453 aSig64 = aSig | 0x00800000; 1454 aSig64 <<= 40; 1455 z = aSig64>>( - shiftCount ); 1456 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1457 float_set_inexact(); 1458 } 1459 if ( aSign ) z = - z; 1460 return z; 1461 1462 } 1463 #endif /* !SOFTFLOAT_FOR_GCC */ 1464 1465 /*---------------------------------------------------------------------------- 1466 | Returns the result of converting the single-precision floating-point value 1467 | `a' to the double-precision floating-point format. The conversion is 1468 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1469 | Arithmetic. 1470 *----------------------------------------------------------------------------*/ 1471 1472 float64 float32_to_float64( float32 a ) 1473 { 1474 flag aSign; 1475 int16 aExp; 1476 bits32 aSig; 1477 1478 aSig = extractFloat32Frac( a ); 1479 aExp = extractFloat32Exp( a ); 1480 aSign = extractFloat32Sign( a ); 1481 if ( aExp == 0xFF ) { 1482 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1483 return packFloat64( aSign, 0x7FF, 0 ); 1484 } 1485 if ( aExp == 0 ) { 1486 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1487 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1488 --aExp; 1489 } 1490 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1491 1492 } 1493 1494 #ifdef FLOATX80 1495 1496 /*---------------------------------------------------------------------------- 1497 | Returns the result of converting the single-precision floating-point value 1498 | `a' to the extended double-precision floating-point format. The conversion 1499 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1500 | Arithmetic. 1501 *----------------------------------------------------------------------------*/ 1502 1503 floatx80 float32_to_floatx80( float32 a ) 1504 { 1505 flag aSign; 1506 int16 aExp; 1507 bits32 aSig; 1508 1509 aSig = extractFloat32Frac( a ); 1510 aExp = extractFloat32Exp( a ); 1511 aSign = extractFloat32Sign( a ); 1512 if ( aExp == 0xFF ) { 1513 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1514 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1515 } 1516 if ( aExp == 0 ) { 1517 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1518 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1519 } 1520 aSig |= 0x00800000; 1521 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1522 1523 } 1524 1525 #endif 1526 1527 #ifdef FLOAT128 1528 1529 /*---------------------------------------------------------------------------- 1530 | Returns the result of converting the single-precision floating-point value 1531 | `a' to the double-precision floating-point format. The conversion is 1532 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1533 | Arithmetic. 1534 *----------------------------------------------------------------------------*/ 1535 1536 float128 float32_to_float128( float32 a ) 1537 { 1538 flag aSign; 1539 int16 aExp; 1540 bits32 aSig; 1541 1542 aSig = extractFloat32Frac( a ); 1543 aExp = extractFloat32Exp( a ); 1544 aSign = extractFloat32Sign( a ); 1545 if ( aExp == 0xFF ) { 1546 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1547 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1548 } 1549 if ( aExp == 0 ) { 1550 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1551 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1552 --aExp; 1553 } 1554 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1555 1556 } 1557 1558 #endif 1559 1560 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1561 /*---------------------------------------------------------------------------- 1562 | Rounds the single-precision floating-point value `a' to an integer, and 1563 | returns the result as a single-precision floating-point value. The 1564 | operation is performed according to the IEC/IEEE Standard for Binary 1565 | Floating-Point Arithmetic. 1566 *----------------------------------------------------------------------------*/ 1567 1568 float32 float32_round_to_int( float32 a ) 1569 { 1570 flag aSign; 1571 int16 aExp; 1572 bits32 lastBitMask, roundBitsMask; 1573 int8 roundingMode; 1574 float32 z; 1575 1576 aExp = extractFloat32Exp( a ); 1577 if ( 0x96 <= aExp ) { 1578 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1579 return propagateFloat32NaN( a, a ); 1580 } 1581 return a; 1582 } 1583 if ( aExp <= 0x7E ) { 1584 if ( (bits32) ( a<<1 ) == 0 ) return a; 1585 float_set_inexact(); 1586 aSign = extractFloat32Sign( a ); 1587 switch ( float_rounding_mode() ) { 1588 case float_round_nearest_even: 1589 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1590 return packFloat32( aSign, 0x7F, 0 ); 1591 } 1592 break; 1593 case float_round_down: 1594 return aSign ? 0xBF800000 : 0; 1595 case float_round_up: 1596 return aSign ? 0x80000000 : 0x3F800000; 1597 } 1598 return packFloat32( aSign, 0, 0 ); 1599 } 1600 lastBitMask = 1; 1601 lastBitMask <<= 0x96 - aExp; 1602 roundBitsMask = lastBitMask - 1; 1603 z = a; 1604 roundingMode = float_rounding_mode(); 1605 if ( roundingMode == float_round_nearest_even ) { 1606 z += lastBitMask>>1; 1607 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1608 } 1609 else if ( roundingMode != float_round_to_zero ) { 1610 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1611 z += roundBitsMask; 1612 } 1613 } 1614 z &= ~ roundBitsMask; 1615 if ( z != a ) float_set_inexact(); 1616 return z; 1617 1618 } 1619 #endif /* !SOFTFLOAT_FOR_GCC */ 1620 1621 /*---------------------------------------------------------------------------- 1622 | Returns the result of adding the absolute values of the single-precision 1623 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1624 | before being returned. `zSign' is ignored if the result is a NaN. 1625 | The addition is performed according to the IEC/IEEE Standard for Binary 1626 | Floating-Point Arithmetic. 1627 *----------------------------------------------------------------------------*/ 1628 1629 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1630 { 1631 int16 aExp, bExp, zExp; 1632 bits32 aSig, bSig, zSig; 1633 int16 expDiff; 1634 1635 aSig = extractFloat32Frac( a ); 1636 aExp = extractFloat32Exp( a ); 1637 bSig = extractFloat32Frac( b ); 1638 bExp = extractFloat32Exp( b ); 1639 expDiff = aExp - bExp; 1640 aSig <<= 6; 1641 bSig <<= 6; 1642 if ( 0 < expDiff ) { 1643 if ( aExp == 0xFF ) { 1644 if ( aSig ) return propagateFloat32NaN( a, b ); 1645 return a; 1646 } 1647 if ( bExp == 0 ) { 1648 --expDiff; 1649 } 1650 else { 1651 bSig |= 0x20000000; 1652 } 1653 shift32RightJamming( bSig, expDiff, &bSig ); 1654 zExp = aExp; 1655 } 1656 else if ( expDiff < 0 ) { 1657 if ( bExp == 0xFF ) { 1658 if ( bSig ) return propagateFloat32NaN( a, b ); 1659 return packFloat32( zSign, 0xFF, 0 ); 1660 } 1661 if ( aExp == 0 ) { 1662 ++expDiff; 1663 } 1664 else { 1665 aSig |= 0x20000000; 1666 } 1667 shift32RightJamming( aSig, - expDiff, &aSig ); 1668 zExp = bExp; 1669 } 1670 else { 1671 if ( aExp == 0xFF ) { 1672 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1673 return a; 1674 } 1675 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1676 zSig = 0x40000000 + aSig + bSig; 1677 zExp = aExp; 1678 goto roundAndPack; 1679 } 1680 aSig |= 0x20000000; 1681 zSig = ( aSig + bSig )<<1; 1682 --zExp; 1683 if ( (sbits32) zSig < 0 ) { 1684 zSig = aSig + bSig; 1685 ++zExp; 1686 } 1687 roundAndPack: 1688 return roundAndPackFloat32( zSign, zExp, zSig ); 1689 1690 } 1691 1692 /*---------------------------------------------------------------------------- 1693 | Returns the result of subtracting the absolute values of the single- 1694 | precision floating-point values `a' and `b'. If `zSign' is 1, the 1695 | difference is negated before being returned. `zSign' is ignored if the 1696 | result is a NaN. The subtraction is performed according to the IEC/IEEE 1697 | Standard for Binary Floating-Point Arithmetic. 1698 *----------------------------------------------------------------------------*/ 1699 1700 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1701 { 1702 int16 aExp, bExp, zExp; 1703 bits32 aSig, bSig, zSig; 1704 int16 expDiff; 1705 1706 aSig = extractFloat32Frac( a ); 1707 aExp = extractFloat32Exp( a ); 1708 bSig = extractFloat32Frac( b ); 1709 bExp = extractFloat32Exp( b ); 1710 expDiff = aExp - bExp; 1711 aSig <<= 7; 1712 bSig <<= 7; 1713 if ( 0 < expDiff ) goto aExpBigger; 1714 if ( expDiff < 0 ) goto bExpBigger; 1715 if ( aExp == 0xFF ) { 1716 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1717 float_raise( float_flag_invalid ); 1718 return float32_default_nan; 1719 } 1720 if ( aExp == 0 ) { 1721 aExp = 1; 1722 bExp = 1; 1723 } 1724 if ( bSig < aSig ) goto aBigger; 1725 if ( aSig < bSig ) goto bBigger; 1726 return packFloat32( float_rounding_mode() == float_round_down, 0, 0 ); 1727 bExpBigger: 1728 if ( bExp == 0xFF ) { 1729 if ( bSig ) return propagateFloat32NaN( a, b ); 1730 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1731 } 1732 if ( aExp == 0 ) { 1733 ++expDiff; 1734 } 1735 else { 1736 aSig |= 0x40000000; 1737 } 1738 shift32RightJamming( aSig, - expDiff, &aSig ); 1739 bSig |= 0x40000000; 1740 bBigger: 1741 zSig = bSig - aSig; 1742 zExp = bExp; 1743 zSign ^= 1; 1744 goto normalizeRoundAndPack; 1745 aExpBigger: 1746 if ( aExp == 0xFF ) { 1747 if ( aSig ) return propagateFloat32NaN( a, b ); 1748 return a; 1749 } 1750 if ( bExp == 0 ) { 1751 --expDiff; 1752 } 1753 else { 1754 bSig |= 0x40000000; 1755 } 1756 shift32RightJamming( bSig, expDiff, &bSig ); 1757 aSig |= 0x40000000; 1758 aBigger: 1759 zSig = aSig - bSig; 1760 zExp = aExp; 1761 normalizeRoundAndPack: 1762 --zExp; 1763 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1764 1765 } 1766 1767 /*---------------------------------------------------------------------------- 1768 | Returns the result of adding the single-precision floating-point values `a' 1769 | and `b'. The operation is performed according to the IEC/IEEE Standard for 1770 | Binary Floating-Point Arithmetic. 1771 *----------------------------------------------------------------------------*/ 1772 1773 float32 float32_add( float32 a, float32 b ) 1774 { 1775 flag aSign, bSign; 1776 1777 aSign = extractFloat32Sign( a ); 1778 bSign = extractFloat32Sign( b ); 1779 if ( aSign == bSign ) { 1780 return addFloat32Sigs( a, b, aSign ); 1781 } 1782 else { 1783 return subFloat32Sigs( a, b, aSign ); 1784 } 1785 1786 } 1787 1788 /*---------------------------------------------------------------------------- 1789 | Returns the result of subtracting the single-precision floating-point values 1790 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1791 | for Binary Floating-Point Arithmetic. 1792 *----------------------------------------------------------------------------*/ 1793 1794 float32 float32_sub( float32 a, float32 b ) 1795 { 1796 flag aSign, bSign; 1797 1798 aSign = extractFloat32Sign( a ); 1799 bSign = extractFloat32Sign( b ); 1800 if ( aSign == bSign ) { 1801 return subFloat32Sigs( a, b, aSign ); 1802 } 1803 else { 1804 return addFloat32Sigs( a, b, aSign ); 1805 } 1806 1807 } 1808 1809 /*---------------------------------------------------------------------------- 1810 | Returns the result of multiplying the single-precision floating-point values 1811 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1812 | for Binary Floating-Point Arithmetic. 1813 *----------------------------------------------------------------------------*/ 1814 1815 float32 float32_mul( float32 a, float32 b ) 1816 { 1817 flag aSign, bSign, zSign; 1818 int16 aExp, bExp, zExp; 1819 bits32 aSig, bSig; 1820 bits64 zSig64; 1821 bits32 zSig; 1822 1823 aSig = extractFloat32Frac( a ); 1824 aExp = extractFloat32Exp( a ); 1825 aSign = extractFloat32Sign( a ); 1826 bSig = extractFloat32Frac( b ); 1827 bExp = extractFloat32Exp( b ); 1828 bSign = extractFloat32Sign( b ); 1829 zSign = aSign ^ bSign; 1830 if ( aExp == 0xFF ) { 1831 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1832 return propagateFloat32NaN( a, b ); 1833 } 1834 if ( ( bExp | bSig ) == 0 ) { 1835 float_raise( float_flag_invalid ); 1836 return float32_default_nan; 1837 } 1838 return packFloat32( zSign, 0xFF, 0 ); 1839 } 1840 if ( bExp == 0xFF ) { 1841 if ( bSig ) return propagateFloat32NaN( a, b ); 1842 if ( ( aExp | aSig ) == 0 ) { 1843 float_raise( float_flag_invalid ); 1844 return float32_default_nan; 1845 } 1846 return packFloat32( zSign, 0xFF, 0 ); 1847 } 1848 if ( aExp == 0 ) { 1849 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1850 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1851 } 1852 if ( bExp == 0 ) { 1853 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1854 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1855 } 1856 zExp = aExp + bExp - 0x7F; 1857 aSig = ( aSig | 0x00800000 )<<7; 1858 bSig = ( bSig | 0x00800000 )<<8; 1859 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1860 zSig = zSig64; 1861 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1862 zSig <<= 1; 1863 --zExp; 1864 } 1865 return roundAndPackFloat32( zSign, zExp, zSig ); 1866 1867 } 1868 1869 /*---------------------------------------------------------------------------- 1870 | Returns the result of dividing the single-precision floating-point value `a' 1871 | by the corresponding value `b'. The operation is performed according to the 1872 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1873 *----------------------------------------------------------------------------*/ 1874 1875 float32 float32_div( float32 a, float32 b ) 1876 { 1877 flag aSign, bSign, zSign; 1878 int16 aExp, bExp, zExp; 1879 bits32 aSig, bSig, zSig; 1880 1881 aSig = extractFloat32Frac( a ); 1882 aExp = extractFloat32Exp( a ); 1883 aSign = extractFloat32Sign( a ); 1884 bSig = extractFloat32Frac( b ); 1885 bExp = extractFloat32Exp( b ); 1886 bSign = extractFloat32Sign( b ); 1887 zSign = aSign ^ bSign; 1888 if ( aExp == 0xFF ) { 1889 if ( aSig ) return propagateFloat32NaN( a, b ); 1890 if ( bExp == 0xFF ) { 1891 if ( bSig ) return propagateFloat32NaN( a, b ); 1892 float_raise( float_flag_invalid ); 1893 return float32_default_nan; 1894 } 1895 return packFloat32( zSign, 0xFF, 0 ); 1896 } 1897 if ( bExp == 0xFF ) { 1898 if ( bSig ) return propagateFloat32NaN( a, b ); 1899 return packFloat32( zSign, 0, 0 ); 1900 } 1901 if ( bExp == 0 ) { 1902 if ( bSig == 0 ) { 1903 if ( ( aExp | aSig ) == 0 ) { 1904 float_raise( float_flag_invalid ); 1905 return float32_default_nan; 1906 } 1907 float_raise( float_flag_divbyzero ); 1908 return packFloat32( zSign, 0xFF, 0 ); 1909 } 1910 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1911 } 1912 if ( aExp == 0 ) { 1913 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1914 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1915 } 1916 zExp = aExp - bExp + 0x7D; 1917 aSig = ( aSig | 0x00800000 )<<7; 1918 bSig = ( bSig | 0x00800000 )<<8; 1919 if ( bSig <= ( aSig + aSig ) ) { 1920 aSig >>= 1; 1921 ++zExp; 1922 } 1923 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 1924 if ( ( zSig & 0x3F ) == 0 ) { 1925 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 1926 } 1927 return roundAndPackFloat32( zSign, zExp, zSig ); 1928 1929 } 1930 1931 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1932 /*---------------------------------------------------------------------------- 1933 | Returns the remainder of the single-precision floating-point value `a' 1934 | with respect to the corresponding value `b'. The operation is performed 1935 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1936 *----------------------------------------------------------------------------*/ 1937 1938 float32 float32_rem( float32 a, float32 b ) 1939 { 1940 flag aSign, bSign __unused, zSign; 1941 int16 aExp, bExp, expDiff; 1942 bits32 aSig, bSig; 1943 bits32 q; 1944 bits64 aSig64, bSig64, q64; 1945 bits32 alternateASig; 1946 sbits32 sigMean; 1947 1948 aSig = extractFloat32Frac( a ); 1949 aExp = extractFloat32Exp( a ); 1950 aSign = extractFloat32Sign( a ); 1951 bSig = extractFloat32Frac( b ); 1952 bExp = extractFloat32Exp( b ); 1953 bSign = extractFloat32Sign( b ); 1954 if ( aExp == 0xFF ) { 1955 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1956 return propagateFloat32NaN( a, b ); 1957 } 1958 float_raise( float_flag_invalid ); 1959 return float32_default_nan; 1960 } 1961 if ( bExp == 0xFF ) { 1962 if ( bSig ) return propagateFloat32NaN( a, b ); 1963 return a; 1964 } 1965 if ( bExp == 0 ) { 1966 if ( bSig == 0 ) { 1967 float_raise( float_flag_invalid ); 1968 return float32_default_nan; 1969 } 1970 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1971 } 1972 if ( aExp == 0 ) { 1973 if ( aSig == 0 ) return a; 1974 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1975 } 1976 expDiff = aExp - bExp; 1977 aSig |= 0x00800000; 1978 bSig |= 0x00800000; 1979 if ( expDiff < 32 ) { 1980 aSig <<= 8; 1981 bSig <<= 8; 1982 if ( expDiff < 0 ) { 1983 if ( expDiff < -1 ) return a; 1984 aSig >>= 1; 1985 } 1986 q = ( bSig <= aSig ); 1987 if ( q ) aSig -= bSig; 1988 if ( 0 < expDiff ) { 1989 q = ( ( (bits64) aSig )<<32 ) / bSig; 1990 q >>= 32 - expDiff; 1991 bSig >>= 2; 1992 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 1993 } 1994 else { 1995 aSig >>= 2; 1996 bSig >>= 2; 1997 } 1998 } 1999 else { 2000 if ( bSig <= aSig ) aSig -= bSig; 2001 aSig64 = ( (bits64) aSig )<<40; 2002 bSig64 = ( (bits64) bSig )<<40; 2003 expDiff -= 64; 2004 while ( 0 < expDiff ) { 2005 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2006 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2007 aSig64 = - ( ( bSig * q64 )<<38 ); 2008 expDiff -= 62; 2009 } 2010 expDiff += 64; 2011 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2012 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2013 q = q64>>( 64 - expDiff ); 2014 bSig <<= 6; 2015 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2016 } 2017 do { 2018 alternateASig = aSig; 2019 ++q; 2020 aSig -= bSig; 2021 } while ( 0 <= (sbits32) aSig ); 2022 sigMean = aSig + alternateASig; 2023 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2024 aSig = alternateASig; 2025 } 2026 zSign = ( (sbits32) aSig < 0 ); 2027 if ( zSign ) aSig = - aSig; 2028 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2029 2030 } 2031 #endif /* !SOFTFLOAT_FOR_GCC */ 2032 2033 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2034 2035 /*---------------------------------------------------------------------------- 2036 | Returns the square root of the single-precision floating-point value `a'. 2037 | The operation is performed according to the IEC/IEEE Standard for Binary 2038 | Floating-Point Arithmetic. 2039 *----------------------------------------------------------------------------*/ 2040 2041 float32 float32_sqrt( float32 a ) 2042 { 2043 flag aSign; 2044 int16 aExp, zExp; 2045 bits32 aSig, zSig; 2046 bits64 rem, term; 2047 2048 aSig = extractFloat32Frac( a ); 2049 aExp = extractFloat32Exp( a ); 2050 aSign = extractFloat32Sign( a ); 2051 if ( aExp == 0xFF ) { 2052 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2053 if ( ! aSign ) return a; 2054 float_raise( float_flag_invalid ); 2055 return float32_default_nan; 2056 } 2057 if ( aSign ) { 2058 if ( ( aExp | aSig ) == 0 ) return a; 2059 float_raise( float_flag_invalid ); 2060 return float32_default_nan; 2061 } 2062 if ( aExp == 0 ) { 2063 if ( aSig == 0 ) return 0; 2064 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2065 } 2066 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2067 aSig = ( aSig | 0x00800000 )<<8; 2068 zSig = estimateSqrt32( aExp, aSig ) + 2; 2069 if ( ( zSig & 0x7F ) <= 5 ) { 2070 if ( zSig < 2 ) { 2071 zSig = 0x7FFFFFFF; 2072 goto roundAndPack; 2073 } 2074 aSig >>= aExp & 1; 2075 term = ( (bits64) zSig ) * zSig; 2076 rem = ( ( (bits64) aSig )<<32 ) - term; 2077 while ( (sbits64) rem < 0 ) { 2078 --zSig; 2079 rem += ( ( (bits64) zSig )<<1 ) | 1; 2080 } 2081 zSig |= ( rem != 0 ); 2082 } 2083 shift32RightJamming( zSig, 1, &zSig ); 2084 roundAndPack: 2085 return roundAndPackFloat32( 0, zExp, zSig ); 2086 2087 } 2088 #endif /* !SOFTFLOAT_FOR_GCC */ 2089 2090 /*---------------------------------------------------------------------------- 2091 | Returns 1 if the single-precision floating-point value `a' is equal to 2092 | the corresponding value `b', and 0 otherwise. The comparison is performed 2093 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2094 *----------------------------------------------------------------------------*/ 2095 2096 flag float32_eq( float32 a, float32 b ) 2097 { 2098 2099 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2100 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2101 ) { 2102 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2103 float_raise( float_flag_invalid ); 2104 } 2105 return 0; 2106 } 2107 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2108 2109 } 2110 2111 /*---------------------------------------------------------------------------- 2112 | Returns 1 if the single-precision floating-point value `a' is less than 2113 | or equal to the corresponding value `b', and 0 otherwise. The comparison 2114 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2115 | Arithmetic. 2116 *----------------------------------------------------------------------------*/ 2117 2118 flag float32_le( float32 a, float32 b ) 2119 { 2120 flag aSign, bSign; 2121 2122 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2123 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2124 ) { 2125 float_raise( float_flag_invalid ); 2126 return 0; 2127 } 2128 aSign = extractFloat32Sign( a ); 2129 bSign = extractFloat32Sign( b ); 2130 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2131 return ( a == b ) || ( aSign ^ ( a < b ) ); 2132 2133 } 2134 2135 /*---------------------------------------------------------------------------- 2136 | Returns 1 if the single-precision floating-point value `a' is less than 2137 | the corresponding value `b', and 0 otherwise. The comparison is performed 2138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2139 *----------------------------------------------------------------------------*/ 2140 2141 flag float32_lt( float32 a, float32 b ) 2142 { 2143 flag aSign, bSign; 2144 2145 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2146 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2147 ) { 2148 float_raise( float_flag_invalid ); 2149 return 0; 2150 } 2151 aSign = extractFloat32Sign( a ); 2152 bSign = extractFloat32Sign( b ); 2153 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2154 return ( a != b ) && ( aSign ^ ( a < b ) ); 2155 2156 } 2157 2158 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2159 /*---------------------------------------------------------------------------- 2160 | Returns 1 if the single-precision floating-point value `a' is equal to 2161 | the corresponding value `b', and 0 otherwise. The invalid exception is 2162 | raised if either operand is a NaN. Otherwise, the comparison is performed 2163 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2164 *----------------------------------------------------------------------------*/ 2165 2166 flag float32_eq_signaling( float32 a, float32 b ) 2167 { 2168 2169 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2170 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2171 ) { 2172 float_raise( float_flag_invalid ); 2173 return 0; 2174 } 2175 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2176 2177 } 2178 2179 /*---------------------------------------------------------------------------- 2180 | Returns 1 if the single-precision floating-point value `a' is less than or 2181 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2182 | cause an exception. Otherwise, the comparison is performed according to the 2183 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2184 *----------------------------------------------------------------------------*/ 2185 2186 flag float32_le_quiet( float32 a, float32 b ) 2187 { 2188 flag aSign, bSign; 2189 2190 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2191 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2192 ) { 2193 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2194 float_raise( float_flag_invalid ); 2195 } 2196 return 0; 2197 } 2198 aSign = extractFloat32Sign( a ); 2199 bSign = extractFloat32Sign( b ); 2200 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2201 return ( a == b ) || ( aSign ^ ( a < b ) ); 2202 2203 } 2204 2205 /*---------------------------------------------------------------------------- 2206 | Returns 1 if the single-precision floating-point value `a' is less than 2207 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2208 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 2209 | Standard for Binary Floating-Point Arithmetic. 2210 *----------------------------------------------------------------------------*/ 2211 2212 flag float32_lt_quiet( float32 a, float32 b ) 2213 { 2214 flag aSign, bSign; 2215 2216 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2217 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2218 ) { 2219 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2220 float_raise( float_flag_invalid ); 2221 } 2222 return 0; 2223 } 2224 aSign = extractFloat32Sign( a ); 2225 bSign = extractFloat32Sign( b ); 2226 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2227 return ( a != b ) && ( aSign ^ ( a < b ) ); 2228 2229 } 2230 #endif /* !SOFTFLOAT_FOR_GCC */ 2231 2232 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2233 /*---------------------------------------------------------------------------- 2234 | Returns the result of converting the double-precision floating-point value 2235 | `a' to the 32-bit two's complement integer format. The conversion is 2236 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2237 | Arithmetic---which means in particular that the conversion is rounded 2238 | according to the current rounding mode. If `a' is a NaN, the largest 2239 | positive integer is returned. Otherwise, if the conversion overflows, the 2240 | largest integer with the same sign as `a' is returned. 2241 *----------------------------------------------------------------------------*/ 2242 2243 int32 float64_to_int32( float64 a ) 2244 { 2245 flag aSign; 2246 int16 aExp, shiftCount; 2247 bits64 aSig; 2248 2249 aSig = extractFloat64Frac( a ); 2250 aExp = extractFloat64Exp( a ); 2251 aSign = extractFloat64Sign( a ); 2252 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2253 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2254 shiftCount = 0x42C - aExp; 2255 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2256 return roundAndPackInt32( aSign, aSig ); 2257 2258 } 2259 #endif /* !SOFTFLOAT_FOR_GCC */ 2260 2261 /*---------------------------------------------------------------------------- 2262 | Returns the result of converting the double-precision floating-point value 2263 | `a' to the 32-bit two's complement integer format. The conversion is 2264 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2265 | Arithmetic, except that the conversion is always rounded toward zero. 2266 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2267 | the conversion overflows, the largest integer with the same sign as `a' is 2268 | returned. 2269 *----------------------------------------------------------------------------*/ 2270 2271 int32 float64_to_int32_round_to_zero( float64 a ) 2272 { 2273 flag aSign; 2274 int16 aExp, shiftCount; 2275 bits64 aSig, savedASig; 2276 int32 z; 2277 2278 aSig = extractFloat64Frac( a ); 2279 aExp = extractFloat64Exp( a ); 2280 aSign = extractFloat64Sign( a ); 2281 if ( 0x41E < aExp ) { 2282 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2283 goto invalid; 2284 } 2285 else if ( aExp < 0x3FF ) { 2286 if ( aExp || aSig ) float_set_inexact(); 2287 return 0; 2288 } 2289 aSig |= LIT64( 0x0010000000000000 ); 2290 shiftCount = 0x433 - aExp; 2291 savedASig = aSig; 2292 aSig >>= shiftCount; 2293 z = aSig; 2294 if ( aSign ) z = - z; 2295 if ( ( z < 0 ) ^ aSign ) { 2296 invalid: 2297 float_raise( float_flag_invalid ); 2298 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2299 } 2300 if ( ( aSig<<shiftCount ) != savedASig ) { 2301 float_set_inexact(); 2302 } 2303 return z; 2304 2305 } 2306 2307 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2308 /*---------------------------------------------------------------------------- 2309 | Returns the result of converting the double-precision floating-point value 2310 | `a' to the 64-bit two's complement integer format. The conversion is 2311 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2312 | Arithmetic---which means in particular that the conversion is rounded 2313 | according to the current rounding mode. If `a' is a NaN, the largest 2314 | positive integer is returned. Otherwise, if the conversion overflows, the 2315 | largest integer with the same sign as `a' is returned. 2316 *----------------------------------------------------------------------------*/ 2317 2318 int64 float64_to_int64( float64 a ) 2319 { 2320 flag aSign; 2321 int16 aExp, shiftCount; 2322 bits64 aSig, aSigExtra; 2323 2324 aSig = extractFloat64Frac( a ); 2325 aExp = extractFloat64Exp( a ); 2326 aSign = extractFloat64Sign( a ); 2327 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2328 shiftCount = 0x433 - aExp; 2329 if ( shiftCount <= 0 ) { 2330 if ( 0x43E < aExp ) { 2331 float_raise( float_flag_invalid ); 2332 if ( ! aSign 2333 || ( ( aExp == 0x7FF ) 2334 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2335 ) { 2336 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2337 } 2338 return (sbits64) LIT64( 0x8000000000000000 ); 2339 } 2340 aSigExtra = 0; 2341 aSig <<= - shiftCount; 2342 } 2343 else { 2344 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2345 } 2346 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2347 2348 } 2349 2350 /* like above, but result is unsigned */ 2351 uint64 float64_to_uint64( float64 a ) 2352 { 2353 flag aSign; 2354 int16 aExp, shiftCount; 2355 bits64 aSig, aSigExtra; 2356 2357 aSig = extractFloat64Frac( a ); 2358 aExp = extractFloat64Exp( a ); 2359 aSign = extractFloat64Sign( a ); 2360 2361 if (aSign) { 2362 return float64_to_int64(a); 2363 } 2364 2365 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2366 shiftCount = 0x433 - aExp; 2367 if ( shiftCount <= 0 ) { 2368 if ( 0x43E < aExp ) { 2369 float_raise( float_flag_invalid ); 2370 if ( ! aSign 2371 || ( ( aExp == 0x7FF ) 2372 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2373 ) { 2374 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2375 } 2376 return (sbits64) LIT64( 0x8000000000000000 ); 2377 } 2378 aSigExtra = 0; 2379 aSig <<= - shiftCount; 2380 } 2381 else { 2382 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2383 } 2384 return roundAndPackUInt64( aSig, aSigExtra ); 2385 2386 } 2387 2388 /*---------------------------------------------------------------------------- 2389 | Returns the result of converting the double-precision floating-point value 2390 | `a' to the 64-bit two's complement integer format. The conversion is 2391 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2392 | Arithmetic, except that the conversion is always rounded toward zero. 2393 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2394 | the conversion overflows, the largest integer with the same sign as `a' is 2395 | returned. 2396 *----------------------------------------------------------------------------*/ 2397 2398 int64 float64_to_int64_round_to_zero( float64 a ) 2399 { 2400 flag aSign; 2401 int16 aExp, shiftCount; 2402 bits64 aSig; 2403 int64 z; 2404 2405 aSig = extractFloat64Frac( a ); 2406 aExp = extractFloat64Exp( a ); 2407 aSign = extractFloat64Sign( a ); 2408 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2409 shiftCount = aExp - 0x433; 2410 if ( 0 <= shiftCount ) { 2411 if ( 0x43E <= aExp ) { 2412 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2413 float_raise( float_flag_invalid ); 2414 if ( ! aSign 2415 || ( ( aExp == 0x7FF ) 2416 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2417 ) { 2418 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2419 } 2420 } 2421 return (sbits64) LIT64( 0x8000000000000000 ); 2422 } 2423 z = aSig<<shiftCount; 2424 } 2425 else { 2426 if ( aExp < 0x3FE ) { 2427 if ( aExp | aSig ) float_set_inexact(); 2428 return 0; 2429 } 2430 z = aSig>>( - shiftCount ); 2431 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2432 float_set_inexact(); 2433 } 2434 } 2435 if ( aSign ) z = - z; 2436 return z; 2437 2438 } 2439 #endif /* !SOFTFLOAT_FOR_GCC */ 2440 2441 /*---------------------------------------------------------------------------- 2442 | Returns the result of converting the double-precision floating-point value 2443 | `a' to the single-precision floating-point format. The conversion is 2444 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2445 | Arithmetic. 2446 *----------------------------------------------------------------------------*/ 2447 2448 float32 float64_to_float32( float64 a ) 2449 { 2450 flag aSign; 2451 int16 aExp; 2452 bits64 aSig; 2453 bits32 zSig; 2454 2455 aSig = extractFloat64Frac( a ); 2456 aExp = extractFloat64Exp( a ); 2457 aSign = extractFloat64Sign( a ); 2458 if ( aExp == 0x7FF ) { 2459 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2460 return packFloat32( aSign, 0xFF, 0 ); 2461 } 2462 shift64RightJamming( aSig, 22, &aSig ); 2463 zSig = aSig; 2464 if ( aExp || zSig ) { 2465 zSig |= 0x40000000; 2466 aExp -= 0x381; 2467 } 2468 return roundAndPackFloat32( aSign, aExp, zSig ); 2469 2470 } 2471 2472 #ifdef FLOATX80 2473 2474 /*---------------------------------------------------------------------------- 2475 | Returns the result of converting the double-precision floating-point value 2476 | `a' to the extended double-precision floating-point format. The conversion 2477 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2478 | Arithmetic. 2479 *----------------------------------------------------------------------------*/ 2480 2481 floatx80 float64_to_floatx80( float64 a ) 2482 { 2483 flag aSign; 2484 int16 aExp; 2485 bits64 aSig; 2486 2487 aSig = extractFloat64Frac( a ); 2488 aExp = extractFloat64Exp( a ); 2489 aSign = extractFloat64Sign( a ); 2490 if ( aExp == 0x7FF ) { 2491 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2492 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2493 } 2494 if ( aExp == 0 ) { 2495 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2496 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2497 } 2498 return 2499 packFloatx80( 2500 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2501 2502 } 2503 2504 #endif 2505 2506 #ifdef FLOAT128 2507 2508 /*---------------------------------------------------------------------------- 2509 | Returns the result of converting the double-precision floating-point value 2510 | `a' to the quadruple-precision floating-point format. The conversion is 2511 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2512 | Arithmetic. 2513 *----------------------------------------------------------------------------*/ 2514 2515 float128 float64_to_float128( float64 a ) 2516 { 2517 flag aSign; 2518 int16 aExp; 2519 bits64 aSig, zSig0, zSig1; 2520 2521 aSig = extractFloat64Frac( a ); 2522 aExp = extractFloat64Exp( a ); 2523 aSign = extractFloat64Sign( a ); 2524 if ( aExp == 0x7FF ) { 2525 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2526 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2527 } 2528 if ( aExp == 0 ) { 2529 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2530 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2531 --aExp; 2532 } 2533 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2534 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2535 2536 } 2537 2538 #endif 2539 2540 #ifndef SOFTFLOAT_FOR_GCC 2541 /*---------------------------------------------------------------------------- 2542 | Rounds the double-precision floating-point value `a' to an integer, and 2543 | returns the result as a double-precision floating-point value. The 2544 | operation is performed according to the IEC/IEEE Standard for Binary 2545 | Floating-Point Arithmetic. 2546 *----------------------------------------------------------------------------*/ 2547 2548 float64 float64_round_to_int( float64 a ) 2549 { 2550 flag aSign; 2551 int16 aExp; 2552 bits64 lastBitMask, roundBitsMask; 2553 int8 roundingMode; 2554 float64 z; 2555 2556 aExp = extractFloat64Exp( a ); 2557 if ( 0x433 <= aExp ) { 2558 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2559 return propagateFloat64NaN( a, a ); 2560 } 2561 return a; 2562 } 2563 if ( aExp < 0x3FF ) { 2564 if ( (bits64) ( a<<1 ) == 0 ) return a; 2565 float_set_inexact(); 2566 aSign = extractFloat64Sign( a ); 2567 switch ( float_rounding_mode() ) { 2568 case float_round_nearest_even: 2569 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2570 return packFloat64( aSign, 0x3FF, 0 ); 2571 } 2572 break; 2573 case float_round_down: 2574 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2575 case float_round_up: 2576 return 2577 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2578 } 2579 return packFloat64( aSign, 0, 0 ); 2580 } 2581 lastBitMask = 1; 2582 lastBitMask <<= 0x433 - aExp; 2583 roundBitsMask = lastBitMask - 1; 2584 z = a; 2585 roundingMode = float_rounding_mode(); 2586 if ( roundingMode == float_round_nearest_even ) { 2587 z += lastBitMask>>1; 2588 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2589 } 2590 else if ( roundingMode != float_round_to_zero ) { 2591 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2592 z += roundBitsMask; 2593 } 2594 } 2595 z &= ~ roundBitsMask; 2596 if ( z != a ) float_set_inexact(); 2597 return z; 2598 2599 } 2600 #endif 2601 2602 /*---------------------------------------------------------------------------- 2603 | Returns the result of adding the absolute values of the double-precision 2604 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2605 | before being returned. `zSign' is ignored if the result is a NaN. 2606 | The addition is performed according to the IEC/IEEE Standard for Binary 2607 | Floating-Point Arithmetic. 2608 *----------------------------------------------------------------------------*/ 2609 2610 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2611 { 2612 int16 aExp, bExp, zExp; 2613 bits64 aSig, bSig, zSig; 2614 int16 expDiff; 2615 2616 aSig = extractFloat64Frac( a ); 2617 aExp = extractFloat64Exp( a ); 2618 bSig = extractFloat64Frac( b ); 2619 bExp = extractFloat64Exp( b ); 2620 expDiff = aExp - bExp; 2621 aSig <<= 9; 2622 bSig <<= 9; 2623 if ( 0 < expDiff ) { 2624 if ( aExp == 0x7FF ) { 2625 if ( aSig ) return propagateFloat64NaN( a, b ); 2626 return a; 2627 } 2628 if ( bExp == 0 ) { 2629 --expDiff; 2630 } 2631 else { 2632 bSig |= LIT64( 0x2000000000000000 ); 2633 } 2634 shift64RightJamming( bSig, expDiff, &bSig ); 2635 zExp = aExp; 2636 } 2637 else if ( expDiff < 0 ) { 2638 if ( bExp == 0x7FF ) { 2639 if ( bSig ) return propagateFloat64NaN( a, b ); 2640 return packFloat64( zSign, 0x7FF, 0 ); 2641 } 2642 if ( aExp == 0 ) { 2643 ++expDiff; 2644 } 2645 else { 2646 aSig |= LIT64( 0x2000000000000000 ); 2647 } 2648 shift64RightJamming( aSig, - expDiff, &aSig ); 2649 zExp = bExp; 2650 } 2651 else { 2652 if ( aExp == 0x7FF ) { 2653 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2654 return a; 2655 } 2656 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2657 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2658 zExp = aExp; 2659 goto roundAndPack; 2660 } 2661 aSig |= LIT64( 0x2000000000000000 ); 2662 zSig = ( aSig + bSig )<<1; 2663 --zExp; 2664 if ( (sbits64) zSig < 0 ) { 2665 zSig = aSig + bSig; 2666 ++zExp; 2667 } 2668 roundAndPack: 2669 return roundAndPackFloat64( zSign, zExp, zSig ); 2670 2671 } 2672 2673 /*---------------------------------------------------------------------------- 2674 | Returns the result of subtracting the absolute values of the double- 2675 | precision floating-point values `a' and `b'. If `zSign' is 1, the 2676 | difference is negated before being returned. `zSign' is ignored if the 2677 | result is a NaN. The subtraction is performed according to the IEC/IEEE 2678 | Standard for Binary Floating-Point Arithmetic. 2679 *----------------------------------------------------------------------------*/ 2680 2681 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2682 { 2683 int16 aExp, bExp, zExp; 2684 bits64 aSig, bSig, zSig; 2685 int16 expDiff; 2686 2687 aSig = extractFloat64Frac( a ); 2688 aExp = extractFloat64Exp( a ); 2689 bSig = extractFloat64Frac( b ); 2690 bExp = extractFloat64Exp( b ); 2691 expDiff = aExp - bExp; 2692 aSig <<= 10; 2693 bSig <<= 10; 2694 if ( 0 < expDiff ) goto aExpBigger; 2695 if ( expDiff < 0 ) goto bExpBigger; 2696 if ( aExp == 0x7FF ) { 2697 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2698 float_raise( float_flag_invalid ); 2699 return float64_default_nan; 2700 } 2701 if ( aExp == 0 ) { 2702 aExp = 1; 2703 bExp = 1; 2704 } 2705 if ( bSig < aSig ) goto aBigger; 2706 if ( aSig < bSig ) goto bBigger; 2707 return packFloat64( float_rounding_mode() == float_round_down, 0, 0 ); 2708 bExpBigger: 2709 if ( bExp == 0x7FF ) { 2710 if ( bSig ) return propagateFloat64NaN( a, b ); 2711 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2712 } 2713 if ( aExp == 0 ) { 2714 ++expDiff; 2715 } 2716 else { 2717 aSig |= LIT64( 0x4000000000000000 ); 2718 } 2719 shift64RightJamming( aSig, - expDiff, &aSig ); 2720 bSig |= LIT64( 0x4000000000000000 ); 2721 bBigger: 2722 zSig = bSig - aSig; 2723 zExp = bExp; 2724 zSign ^= 1; 2725 goto normalizeRoundAndPack; 2726 aExpBigger: 2727 if ( aExp == 0x7FF ) { 2728 if ( aSig ) return propagateFloat64NaN( a, b ); 2729 return a; 2730 } 2731 if ( bExp == 0 ) { 2732 --expDiff; 2733 } 2734 else { 2735 bSig |= LIT64( 0x4000000000000000 ); 2736 } 2737 shift64RightJamming( bSig, expDiff, &bSig ); 2738 aSig |= LIT64( 0x4000000000000000 ); 2739 aBigger: 2740 zSig = aSig - bSig; 2741 zExp = aExp; 2742 normalizeRoundAndPack: 2743 --zExp; 2744 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2745 2746 } 2747 2748 /*---------------------------------------------------------------------------- 2749 | Returns the result of adding the double-precision floating-point values `a' 2750 | and `b'. The operation is performed according to the IEC/IEEE Standard for 2751 | Binary Floating-Point Arithmetic. 2752 *----------------------------------------------------------------------------*/ 2753 2754 float64 float64_add( float64 a, float64 b ) 2755 { 2756 flag aSign, bSign; 2757 2758 aSign = extractFloat64Sign( a ); 2759 bSign = extractFloat64Sign( b ); 2760 if ( aSign == bSign ) { 2761 return addFloat64Sigs( a, b, aSign ); 2762 } 2763 else { 2764 return subFloat64Sigs( a, b, aSign ); 2765 } 2766 2767 } 2768 2769 /*---------------------------------------------------------------------------- 2770 | Returns the result of subtracting the double-precision floating-point values 2771 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2772 | for Binary Floating-Point Arithmetic. 2773 *----------------------------------------------------------------------------*/ 2774 2775 float64 float64_sub( float64 a, float64 b ) 2776 { 2777 flag aSign, bSign; 2778 2779 aSign = extractFloat64Sign( a ); 2780 bSign = extractFloat64Sign( b ); 2781 if ( aSign == bSign ) { 2782 return subFloat64Sigs( a, b, aSign ); 2783 } 2784 else { 2785 return addFloat64Sigs( a, b, aSign ); 2786 } 2787 2788 } 2789 2790 /*---------------------------------------------------------------------------- 2791 | Returns the result of multiplying the double-precision floating-point values 2792 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2793 | for Binary Floating-Point Arithmetic. 2794 *----------------------------------------------------------------------------*/ 2795 2796 float64 float64_mul( float64 a, float64 b ) 2797 { 2798 flag aSign, bSign, zSign; 2799 int16 aExp, bExp, zExp; 2800 bits64 aSig, bSig, zSig0, zSig1; 2801 2802 aSig = extractFloat64Frac( a ); 2803 aExp = extractFloat64Exp( a ); 2804 aSign = extractFloat64Sign( a ); 2805 bSig = extractFloat64Frac( b ); 2806 bExp = extractFloat64Exp( b ); 2807 bSign = extractFloat64Sign( b ); 2808 zSign = aSign ^ bSign; 2809 if ( aExp == 0x7FF ) { 2810 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2811 return propagateFloat64NaN( a, b ); 2812 } 2813 if ( ( bExp | bSig ) == 0 ) { 2814 float_raise( float_flag_invalid ); 2815 return float64_default_nan; 2816 } 2817 return packFloat64( zSign, 0x7FF, 0 ); 2818 } 2819 if ( bExp == 0x7FF ) { 2820 if ( bSig ) return propagateFloat64NaN( a, b ); 2821 if ( ( aExp | aSig ) == 0 ) { 2822 float_raise( float_flag_invalid ); 2823 return float64_default_nan; 2824 } 2825 return packFloat64( zSign, 0x7FF, 0 ); 2826 } 2827 if ( aExp == 0 ) { 2828 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2829 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2830 } 2831 if ( bExp == 0 ) { 2832 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2833 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2834 } 2835 zExp = aExp + bExp - 0x3FF; 2836 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2837 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2838 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2839 zSig0 |= ( zSig1 != 0 ); 2840 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2841 zSig0 <<= 1; 2842 --zExp; 2843 } 2844 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2845 2846 } 2847 2848 /*---------------------------------------------------------------------------- 2849 | Returns the result of dividing the double-precision floating-point value `a' 2850 | by the corresponding value `b'. The operation is performed according to 2851 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2852 *----------------------------------------------------------------------------*/ 2853 2854 float64 float64_div( float64 a, float64 b ) 2855 { 2856 flag aSign, bSign, zSign; 2857 int16 aExp, bExp, zExp; 2858 bits64 aSig, bSig, zSig; 2859 bits64 rem0, rem1; 2860 bits64 term0, term1; 2861 2862 aSig = extractFloat64Frac( a ); 2863 aExp = extractFloat64Exp( a ); 2864 aSign = extractFloat64Sign( a ); 2865 bSig = extractFloat64Frac( b ); 2866 bExp = extractFloat64Exp( b ); 2867 bSign = extractFloat64Sign( b ); 2868 zSign = aSign ^ bSign; 2869 if ( aExp == 0x7FF ) { 2870 if ( aSig ) return propagateFloat64NaN( a, b ); 2871 if ( bExp == 0x7FF ) { 2872 if ( bSig ) return propagateFloat64NaN( a, b ); 2873 float_raise( float_flag_invalid ); 2874 return float64_default_nan; 2875 } 2876 return packFloat64( zSign, 0x7FF, 0 ); 2877 } 2878 if ( bExp == 0x7FF ) { 2879 if ( bSig ) return propagateFloat64NaN( a, b ); 2880 return packFloat64( zSign, 0, 0 ); 2881 } 2882 if ( bExp == 0 ) { 2883 if ( bSig == 0 ) { 2884 if ( ( aExp | aSig ) == 0 ) { 2885 float_raise( float_flag_invalid ); 2886 return float64_default_nan; 2887 } 2888 float_raise( float_flag_divbyzero ); 2889 return packFloat64( zSign, 0x7FF, 0 ); 2890 } 2891 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2892 } 2893 if ( aExp == 0 ) { 2894 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2895 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2896 } 2897 zExp = aExp - bExp + 0x3FD; 2898 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2899 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2900 if ( bSig <= ( aSig + aSig ) ) { 2901 aSig >>= 1; 2902 ++zExp; 2903 } 2904 zSig = estimateDiv128To64( aSig, 0, bSig ); 2905 if ( ( zSig & 0x1FF ) <= 2 ) { 2906 mul64To128( bSig, zSig, &term0, &term1 ); 2907 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2908 while ( (sbits64) rem0 < 0 ) { 2909 --zSig; 2910 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2911 } 2912 zSig |= ( rem1 != 0 ); 2913 } 2914 return roundAndPackFloat64( zSign, zExp, zSig ); 2915 2916 } 2917 2918 #ifndef SOFTFLOAT_FOR_GCC 2919 /*---------------------------------------------------------------------------- 2920 | Returns the remainder of the double-precision floating-point value `a' 2921 | with respect to the corresponding value `b'. The operation is performed 2922 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2923 *----------------------------------------------------------------------------*/ 2924 2925 float64 float64_rem( float64 a, float64 b ) 2926 { 2927 flag aSign, bSign __unused, zSign; 2928 int16 aExp, bExp, expDiff; 2929 bits64 aSig, bSig; 2930 bits64 q, alternateASig; 2931 sbits64 sigMean; 2932 2933 aSig = extractFloat64Frac( a ); 2934 aExp = extractFloat64Exp( a ); 2935 aSign = extractFloat64Sign( a ); 2936 bSig = extractFloat64Frac( b ); 2937 bExp = extractFloat64Exp( b ); 2938 bSign = extractFloat64Sign( b ); 2939 if ( aExp == 0x7FF ) { 2940 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2941 return propagateFloat64NaN( a, b ); 2942 } 2943 float_raise( float_flag_invalid ); 2944 return float64_default_nan; 2945 } 2946 if ( bExp == 0x7FF ) { 2947 if ( bSig ) return propagateFloat64NaN( a, b ); 2948 return a; 2949 } 2950 if ( bExp == 0 ) { 2951 if ( bSig == 0 ) { 2952 float_raise( float_flag_invalid ); 2953 return float64_default_nan; 2954 } 2955 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2956 } 2957 if ( aExp == 0 ) { 2958 if ( aSig == 0 ) return a; 2959 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2960 } 2961 expDiff = aExp - bExp; 2962 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 2963 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2964 if ( expDiff < 0 ) { 2965 if ( expDiff < -1 ) return a; 2966 aSig >>= 1; 2967 } 2968 q = ( bSig <= aSig ); 2969 if ( q ) aSig -= bSig; 2970 expDiff -= 64; 2971 while ( 0 < expDiff ) { 2972 q = estimateDiv128To64( aSig, 0, bSig ); 2973 q = ( 2 < q ) ? q - 2 : 0; 2974 aSig = - ( ( bSig>>2 ) * q ); 2975 expDiff -= 62; 2976 } 2977 expDiff += 64; 2978 if ( 0 < expDiff ) { 2979 q = estimateDiv128To64( aSig, 0, bSig ); 2980 q = ( 2 < q ) ? q - 2 : 0; 2981 q >>= 64 - expDiff; 2982 bSig >>= 2; 2983 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2984 } 2985 else { 2986 aSig >>= 2; 2987 bSig >>= 2; 2988 } 2989 do { 2990 alternateASig = aSig; 2991 ++q; 2992 aSig -= bSig; 2993 } while ( 0 <= (sbits64) aSig ); 2994 sigMean = aSig + alternateASig; 2995 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2996 aSig = alternateASig; 2997 } 2998 zSign = ( (sbits64) aSig < 0 ); 2999 if ( zSign ) aSig = - aSig; 3000 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3001 3002 } 3003 3004 /*---------------------------------------------------------------------------- 3005 | Returns the square root of the double-precision floating-point value `a'. 3006 | The operation is performed according to the IEC/IEEE Standard for Binary 3007 | Floating-Point Arithmetic. 3008 *----------------------------------------------------------------------------*/ 3009 3010 float64 float64_sqrt( float64 a ) 3011 { 3012 flag aSign; 3013 int16 aExp, zExp; 3014 bits64 aSig, zSig, doubleZSig; 3015 bits64 rem0, rem1, term0, term1; 3016 3017 aSig = extractFloat64Frac( a ); 3018 aExp = extractFloat64Exp( a ); 3019 aSign = extractFloat64Sign( a ); 3020 if ( aExp == 0x7FF ) { 3021 if ( aSig ) return propagateFloat64NaN( a, a ); 3022 if ( ! aSign ) return a; 3023 float_raise( float_flag_invalid ); 3024 return float64_default_nan; 3025 } 3026 if ( aSign ) { 3027 if ( ( aExp | aSig ) == 0 ) return a; 3028 float_raise( float_flag_invalid ); 3029 return float64_default_nan; 3030 } 3031 if ( aExp == 0 ) { 3032 if ( aSig == 0 ) return 0; 3033 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3034 } 3035 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3036 aSig |= LIT64( 0x0010000000000000 ); 3037 zSig = estimateSqrt32( aExp, aSig>>21 ); 3038 aSig <<= 9 - ( aExp & 1 ); 3039 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3040 if ( ( zSig & 0x1FF ) <= 5 ) { 3041 doubleZSig = zSig<<1; 3042 mul64To128( zSig, zSig, &term0, &term1 ); 3043 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3044 while ( (sbits64) rem0 < 0 ) { 3045 --zSig; 3046 doubleZSig -= 2; 3047 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3048 } 3049 zSig |= ( ( rem0 | rem1 ) != 0 ); 3050 } 3051 return roundAndPackFloat64( 0, zExp, zSig ); 3052 3053 } 3054 #endif 3055 3056 /*---------------------------------------------------------------------------- 3057 | Returns 1 if the double-precision floating-point value `a' is equal to the 3058 | corresponding value `b', and 0 otherwise. The comparison is performed 3059 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3060 *----------------------------------------------------------------------------*/ 3061 3062 flag float64_eq( float64 a, float64 b ) 3063 { 3064 3065 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3066 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3067 ) { 3068 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3069 float_raise( float_flag_invalid ); 3070 } 3071 return 0; 3072 } 3073 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3074 3075 } 3076 3077 /*---------------------------------------------------------------------------- 3078 | Returns 1 if the double-precision floating-point value `a' is less than or 3079 | equal to the corresponding value `b', and 0 otherwise. The comparison is 3080 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3081 | Arithmetic. 3082 *----------------------------------------------------------------------------*/ 3083 3084 flag float64_le( float64 a, float64 b ) 3085 { 3086 flag aSign, bSign; 3087 3088 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3089 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3090 ) { 3091 float_raise( float_flag_invalid ); 3092 return 0; 3093 } 3094 aSign = extractFloat64Sign( a ); 3095 bSign = extractFloat64Sign( b ); 3096 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3097 return ( a == b ) || ( aSign ^ ( a < b ) ); 3098 3099 } 3100 3101 /*---------------------------------------------------------------------------- 3102 | Returns 1 if the double-precision floating-point value `a' is less than 3103 | the corresponding value `b', and 0 otherwise. The comparison is performed 3104 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3105 *----------------------------------------------------------------------------*/ 3106 3107 flag float64_lt( float64 a, float64 b ) 3108 { 3109 flag aSign, bSign; 3110 3111 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3112 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3113 ) { 3114 float_raise( float_flag_invalid ); 3115 return 0; 3116 } 3117 aSign = extractFloat64Sign( a ); 3118 bSign = extractFloat64Sign( b ); 3119 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3120 return ( a != b ) && ( aSign ^ ( a < b ) ); 3121 3122 } 3123 3124 #ifndef SOFTFLOAT_FOR_GCC 3125 /*---------------------------------------------------------------------------- 3126 | Returns 1 if the double-precision floating-point value `a' is equal to the 3127 | corresponding value `b', and 0 otherwise. The invalid exception is raised 3128 | if either operand is a NaN. Otherwise, the comparison is performed 3129 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3130 *----------------------------------------------------------------------------*/ 3131 3132 flag float64_eq_signaling( float64 a, float64 b ) 3133 { 3134 3135 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3136 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3137 ) { 3138 float_raise( float_flag_invalid ); 3139 return 0; 3140 } 3141 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3142 3143 } 3144 3145 /*---------------------------------------------------------------------------- 3146 | Returns 1 if the double-precision floating-point value `a' is less than or 3147 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3148 | cause an exception. Otherwise, the comparison is performed according to the 3149 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3150 *----------------------------------------------------------------------------*/ 3151 3152 flag float64_le_quiet( float64 a, float64 b ) 3153 { 3154 flag aSign, bSign; 3155 3156 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3157 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3158 ) { 3159 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3160 float_raise( float_flag_invalid ); 3161 } 3162 return 0; 3163 } 3164 aSign = extractFloat64Sign( a ); 3165 bSign = extractFloat64Sign( b ); 3166 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3167 return ( a == b ) || ( aSign ^ ( a < b ) ); 3168 3169 } 3170 3171 /*---------------------------------------------------------------------------- 3172 | Returns 1 if the double-precision floating-point value `a' is less than 3173 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3174 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3175 | Standard for Binary Floating-Point Arithmetic. 3176 *----------------------------------------------------------------------------*/ 3177 3178 flag float64_lt_quiet( float64 a, float64 b ) 3179 { 3180 flag aSign, bSign; 3181 3182 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3183 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3184 ) { 3185 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3186 float_raise( float_flag_invalid ); 3187 } 3188 return 0; 3189 } 3190 aSign = extractFloat64Sign( a ); 3191 bSign = extractFloat64Sign( b ); 3192 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3193 return ( a != b ) && ( aSign ^ ( a < b ) ); 3194 3195 } 3196 #endif 3197 3198 #ifdef FLOATX80 3199 3200 /*---------------------------------------------------------------------------- 3201 | Returns the result of converting the extended double-precision floating- 3202 | point value `a' to the 32-bit two's complement integer format. The 3203 | conversion is performed according to the IEC/IEEE Standard for Binary 3204 | Floating-Point Arithmetic---which means in particular that the conversion 3205 | is rounded according to the current rounding mode. If `a' is a NaN, the 3206 | largest positive integer is returned. Otherwise, if the conversion 3207 | overflows, the largest integer with the same sign as `a' is returned. 3208 *----------------------------------------------------------------------------*/ 3209 3210 int32 floatx80_to_int32( floatx80 a ) 3211 { 3212 flag aSign; 3213 int32 aExp, shiftCount; 3214 bits64 aSig; 3215 3216 aSig = extractFloatx80Frac( a ); 3217 aExp = extractFloatx80Exp( a ); 3218 aSign = extractFloatx80Sign( a ); 3219 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3220 shiftCount = 0x4037 - aExp; 3221 if ( shiftCount <= 0 ) shiftCount = 1; 3222 shift64RightJamming( aSig, shiftCount, &aSig ); 3223 return roundAndPackInt32( aSign, aSig ); 3224 3225 } 3226 3227 /*---------------------------------------------------------------------------- 3228 | Returns the result of converting the extended double-precision floating- 3229 | point value `a' to the 32-bit two's complement integer format. The 3230 | conversion is performed according to the IEC/IEEE Standard for Binary 3231 | Floating-Point Arithmetic, except that the conversion is always rounded 3232 | toward zero. If `a' is a NaN, the largest positive integer is returned. 3233 | Otherwise, if the conversion overflows, the largest integer with the same 3234 | sign as `a' is returned. 3235 *----------------------------------------------------------------------------*/ 3236 3237 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3238 { 3239 flag aSign; 3240 int32 aExp, shiftCount; 3241 bits64 aSig, savedASig; 3242 int32 z; 3243 3244 aSig = extractFloatx80Frac( a ); 3245 aExp = extractFloatx80Exp( a ); 3246 aSign = extractFloatx80Sign( a ); 3247 if ( 0x401E < aExp ) { 3248 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3249 goto invalid; 3250 } 3251 else if ( aExp < 0x3FFF ) { 3252 if ( aExp || aSig ) float_set_inexact(); 3253 return 0; 3254 } 3255 shiftCount = 0x403E - aExp; 3256 savedASig = aSig; 3257 aSig >>= shiftCount; 3258 z = aSig; 3259 if ( aSign ) z = - z; 3260 if ( ( z < 0 ) ^ aSign ) { 3261 invalid: 3262 float_raise( float_flag_invalid ); 3263 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3264 } 3265 if ( ( aSig<<shiftCount ) != savedASig ) { 3266 float_set_inexact(); 3267 } 3268 return z; 3269 3270 } 3271 3272 /*---------------------------------------------------------------------------- 3273 | Returns the result of converting the extended double-precision floating- 3274 | point value `a' to the 64-bit two's complement integer format. The 3275 | conversion is performed according to the IEC/IEEE Standard for Binary 3276 | Floating-Point Arithmetic---which means in particular that the conversion 3277 | is rounded according to the current rounding mode. If `a' is a NaN, 3278 | the largest positive integer is returned. Otherwise, if the conversion 3279 | overflows, the largest integer with the same sign as `a' is returned. 3280 *----------------------------------------------------------------------------*/ 3281 3282 int64 floatx80_to_int64( floatx80 a ) 3283 { 3284 flag aSign; 3285 int32 aExp, shiftCount; 3286 bits64 aSig, aSigExtra; 3287 3288 aSig = extractFloatx80Frac( a ); 3289 aExp = extractFloatx80Exp( a ); 3290 aSign = extractFloatx80Sign( a ); 3291 shiftCount = 0x403E - aExp; 3292 if ( shiftCount <= 0 ) { 3293 if ( shiftCount ) { 3294 float_raise( float_flag_invalid ); 3295 if ( ! aSign 3296 || ( ( aExp == 0x7FFF ) 3297 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3298 ) { 3299 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3300 } 3301 return (sbits64) LIT64( 0x8000000000000000 ); 3302 } 3303 aSigExtra = 0; 3304 } 3305 else { 3306 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3307 } 3308 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3309 3310 } 3311 3312 /*---------------------------------------------------------------------------- 3313 | Returns the result of converting the extended double-precision floating- 3314 | point value `a' to the 64-bit two's complement integer format. The 3315 | conversion is performed according to the IEC/IEEE Standard for Binary 3316 | Floating-Point Arithmetic, except that the conversion is always rounded 3317 | toward zero. If `a' is a NaN, the largest positive integer is returned. 3318 | Otherwise, if the conversion overflows, the largest integer with the same 3319 | sign as `a' is returned. 3320 *----------------------------------------------------------------------------*/ 3321 3322 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3323 { 3324 flag aSign; 3325 int32 aExp, shiftCount; 3326 bits64 aSig; 3327 int64 z; 3328 3329 aSig = extractFloatx80Frac( a ); 3330 aExp = extractFloatx80Exp( a ); 3331 aSign = extractFloatx80Sign( a ); 3332 shiftCount = aExp - 0x403E; 3333 if ( 0 <= shiftCount ) { 3334 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3335 if ( ( a.high != 0xC03E ) || aSig ) { 3336 float_raise( float_flag_invalid ); 3337 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3338 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3339 } 3340 } 3341 return (sbits64) LIT64( 0x8000000000000000 ); 3342 } 3343 else if ( aExp < 0x3FFF ) { 3344 if ( aExp | aSig ) float_set_inexact(); 3345 return 0; 3346 } 3347 z = aSig>>( - shiftCount ); 3348 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3349 float_set_inexact(); 3350 } 3351 if ( aSign ) z = - z; 3352 return z; 3353 3354 } 3355 3356 /*---------------------------------------------------------------------------- 3357 | Returns the result of converting the extended double-precision floating- 3358 | point value `a' to the single-precision floating-point format. The 3359 | conversion is performed according to the IEC/IEEE Standard for Binary 3360 | Floating-Point Arithmetic. 3361 *----------------------------------------------------------------------------*/ 3362 3363 float32 floatx80_to_float32( floatx80 a ) 3364 { 3365 flag aSign; 3366 int32 aExp; 3367 bits64 aSig; 3368 3369 aSig = extractFloatx80Frac( a ); 3370 aExp = extractFloatx80Exp( a ); 3371 aSign = extractFloatx80Sign( a ); 3372 if ( aExp == 0x7FFF ) { 3373 if ( (bits64) ( aSig<<1 ) ) { 3374 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3375 } 3376 return packFloat32( aSign, 0xFF, 0 ); 3377 } 3378 shift64RightJamming( aSig, 33, &aSig ); 3379 if ( aExp || aSig ) aExp -= 0x3F81; 3380 return roundAndPackFloat32( aSign, aExp, aSig ); 3381 3382 } 3383 3384 /*---------------------------------------------------------------------------- 3385 | Returns the result of converting the extended double-precision floating- 3386 | point value `a' to the double-precision floating-point format. The 3387 | conversion is performed according to the IEC/IEEE Standard for Binary 3388 | Floating-Point Arithmetic. 3389 *----------------------------------------------------------------------------*/ 3390 3391 float64 floatx80_to_float64( floatx80 a ) 3392 { 3393 flag aSign; 3394 int32 aExp; 3395 bits64 aSig, zSig; 3396 3397 aSig = extractFloatx80Frac( a ); 3398 aExp = extractFloatx80Exp( a ); 3399 aSign = extractFloatx80Sign( a ); 3400 if ( aExp == 0x7FFF ) { 3401 if ( (bits64) ( aSig<<1 ) ) { 3402 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3403 } 3404 return packFloat64( aSign, 0x7FF, 0 ); 3405 } 3406 shift64RightJamming( aSig, 1, &zSig ); 3407 if ( aExp || aSig ) aExp -= 0x3C01; 3408 return roundAndPackFloat64( aSign, aExp, zSig ); 3409 3410 } 3411 3412 #ifdef FLOAT128 3413 3414 /*---------------------------------------------------------------------------- 3415 | Returns the result of converting the extended double-precision floating- 3416 | point value `a' to the quadruple-precision floating-point format. The 3417 | conversion is performed according to the IEC/IEEE Standard for Binary 3418 | Floating-Point Arithmetic. 3419 *----------------------------------------------------------------------------*/ 3420 3421 float128 floatx80_to_float128( floatx80 a ) 3422 { 3423 flag aSign; 3424 int16 aExp; 3425 bits64 aSig, zSig0, zSig1; 3426 3427 aSig = extractFloatx80Frac( a ); 3428 aExp = extractFloatx80Exp( a ); 3429 aSign = extractFloatx80Sign( a ); 3430 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3431 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3432 } 3433 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3434 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3435 3436 } 3437 3438 #endif 3439 3440 /*---------------------------------------------------------------------------- 3441 | Rounds the extended double-precision floating-point value `a' to an integer, 3442 | and returns the result as an extended quadruple-precision floating-point 3443 | value. The operation is performed according to the IEC/IEEE Standard for 3444 | Binary Floating-Point Arithmetic. 3445 *----------------------------------------------------------------------------*/ 3446 3447 floatx80 floatx80_round_to_int( floatx80 a ) 3448 { 3449 flag aSign; 3450 int32 aExp; 3451 bits64 lastBitMask, roundBitsMask; 3452 int8 roundingMode; 3453 floatx80 z; 3454 3455 aExp = extractFloatx80Exp( a ); 3456 if ( 0x403E <= aExp ) { 3457 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3458 return propagateFloatx80NaN( a, a ); 3459 } 3460 return a; 3461 } 3462 if ( aExp < 0x3FFF ) { 3463 if ( ( aExp == 0 ) 3464 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3465 return a; 3466 } 3467 float_set_inexact(); 3468 aSign = extractFloatx80Sign( a ); 3469 switch ( float_rounding_mode() ) { 3470 case float_round_nearest_even: 3471 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3472 ) { 3473 return 3474 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3475 } 3476 break; 3477 case float_round_down: 3478 return 3479 aSign ? 3480 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3481 : packFloatx80( 0, 0, 0 ); 3482 case float_round_up: 3483 return 3484 aSign ? packFloatx80( 1, 0, 0 ) 3485 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3486 } 3487 return packFloatx80( aSign, 0, 0 ); 3488 } 3489 lastBitMask = 1; 3490 lastBitMask <<= 0x403E - aExp; 3491 roundBitsMask = lastBitMask - 1; 3492 z = a; 3493 roundingMode = float_rounding_mode(); 3494 if ( roundingMode == float_round_nearest_even ) { 3495 z.low += lastBitMask>>1; 3496 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3497 } 3498 else if ( roundingMode != float_round_to_zero ) { 3499 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3500 z.low += roundBitsMask; 3501 } 3502 } 3503 z.low &= ~ roundBitsMask; 3504 if ( z.low == 0 ) { 3505 ++z.high; 3506 z.low = LIT64( 0x8000000000000000 ); 3507 } 3508 if ( z.low != a.low ) float_set_inexact(); 3509 return z; 3510 3511 } 3512 3513 /*---------------------------------------------------------------------------- 3514 | Returns the result of adding the absolute values of the extended double- 3515 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3516 | negated before being returned. `zSign' is ignored if the result is a NaN. 3517 | The addition is performed according to the IEC/IEEE Standard for Binary 3518 | Floating-Point Arithmetic. 3519 *----------------------------------------------------------------------------*/ 3520 3521 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3522 { 3523 int32 aExp, bExp, zExp; 3524 bits64 aSig, bSig, zSig0, zSig1; 3525 int32 expDiff; 3526 3527 aSig = extractFloatx80Frac( a ); 3528 aExp = extractFloatx80Exp( a ); 3529 bSig = extractFloatx80Frac( b ); 3530 bExp = extractFloatx80Exp( b ); 3531 expDiff = aExp - bExp; 3532 if ( 0 < expDiff ) { 3533 if ( aExp == 0x7FFF ) { 3534 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3535 return a; 3536 } 3537 if ( bExp == 0 ) --expDiff; 3538 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3539 zExp = aExp; 3540 } 3541 else if ( expDiff < 0 ) { 3542 if ( bExp == 0x7FFF ) { 3543 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3544 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3545 } 3546 if ( aExp == 0 ) ++expDiff; 3547 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3548 zExp = bExp; 3549 } 3550 else { 3551 if ( aExp == 0x7FFF ) { 3552 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3553 return propagateFloatx80NaN( a, b ); 3554 } 3555 return a; 3556 } 3557 zSig1 = 0; 3558 zSig0 = aSig + bSig; 3559 if ( aExp == 0 ) { 3560 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3561 goto roundAndPack; 3562 } 3563 zExp = aExp; 3564 goto shiftRight1; 3565 } 3566 zSig0 = aSig + bSig; 3567 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3568 shiftRight1: 3569 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3570 zSig0 |= LIT64( 0x8000000000000000 ); 3571 ++zExp; 3572 roundAndPack: 3573 return 3574 roundAndPackFloatx80( 3575 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3576 3577 } 3578 3579 /*---------------------------------------------------------------------------- 3580 | Returns the result of subtracting the absolute values of the extended 3581 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3582 | difference is negated before being returned. `zSign' is ignored if the 3583 | result is a NaN. The subtraction is performed according to the IEC/IEEE 3584 | Standard for Binary Floating-Point Arithmetic. 3585 *----------------------------------------------------------------------------*/ 3586 3587 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3588 { 3589 int32 aExp, bExp, zExp; 3590 bits64 aSig, bSig, zSig0, zSig1; 3591 int32 expDiff; 3592 floatx80 z; 3593 3594 aSig = extractFloatx80Frac( a ); 3595 aExp = extractFloatx80Exp( a ); 3596 bSig = extractFloatx80Frac( b ); 3597 bExp = extractFloatx80Exp( b ); 3598 expDiff = aExp - bExp; 3599 if ( 0 < expDiff ) goto aExpBigger; 3600 if ( expDiff < 0 ) goto bExpBigger; 3601 if ( aExp == 0x7FFF ) { 3602 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3603 return propagateFloatx80NaN( a, b ); 3604 } 3605 float_raise( float_flag_invalid ); 3606 z.low = floatx80_default_nan_low; 3607 z.high = floatx80_default_nan_high; 3608 return z; 3609 } 3610 if ( aExp == 0 ) { 3611 aExp = 1; 3612 bExp = 1; 3613 } 3614 zSig1 = 0; 3615 if ( bSig < aSig ) goto aBigger; 3616 if ( aSig < bSig ) goto bBigger; 3617 return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 ); 3618 bExpBigger: 3619 if ( bExp == 0x7FFF ) { 3620 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3621 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3622 } 3623 if ( aExp == 0 ) ++expDiff; 3624 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3625 bBigger: 3626 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3627 zExp = bExp; 3628 zSign ^= 1; 3629 goto normalizeRoundAndPack; 3630 aExpBigger: 3631 if ( aExp == 0x7FFF ) { 3632 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3633 return a; 3634 } 3635 if ( bExp == 0 ) --expDiff; 3636 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3637 aBigger: 3638 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3639 zExp = aExp; 3640 normalizeRoundAndPack: 3641 return 3642 normalizeRoundAndPackFloatx80( 3643 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3644 3645 } 3646 3647 /*---------------------------------------------------------------------------- 3648 | Returns the result of adding the extended double-precision floating-point 3649 | values `a' and `b'. The operation is performed according to the IEC/IEEE 3650 | Standard for Binary Floating-Point Arithmetic. 3651 *----------------------------------------------------------------------------*/ 3652 3653 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3654 { 3655 flag aSign, bSign; 3656 3657 aSign = extractFloatx80Sign( a ); 3658 bSign = extractFloatx80Sign( b ); 3659 if ( aSign == bSign ) { 3660 return addFloatx80Sigs( a, b, aSign ); 3661 } 3662 else { 3663 return subFloatx80Sigs( a, b, aSign ); 3664 } 3665 3666 } 3667 3668 /*---------------------------------------------------------------------------- 3669 | Returns the result of subtracting the extended double-precision floating- 3670 | point values `a' and `b'. The operation is performed according to the 3671 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3672 *----------------------------------------------------------------------------*/ 3673 3674 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3675 { 3676 flag aSign, bSign; 3677 3678 aSign = extractFloatx80Sign( a ); 3679 bSign = extractFloatx80Sign( b ); 3680 if ( aSign == bSign ) { 3681 return subFloatx80Sigs( a, b, aSign ); 3682 } 3683 else { 3684 return addFloatx80Sigs( a, b, aSign ); 3685 } 3686 3687 } 3688 3689 /*---------------------------------------------------------------------------- 3690 | Returns the result of multiplying the extended double-precision floating- 3691 | point values `a' and `b'. The operation is performed according to the 3692 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3693 *----------------------------------------------------------------------------*/ 3694 3695 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3696 { 3697 flag aSign, bSign, zSign; 3698 int32 aExp, bExp, zExp; 3699 bits64 aSig, bSig, zSig0, zSig1; 3700 floatx80 z; 3701 3702 aSig = extractFloatx80Frac( a ); 3703 aExp = extractFloatx80Exp( a ); 3704 aSign = extractFloatx80Sign( a ); 3705 bSig = extractFloatx80Frac( b ); 3706 bExp = extractFloatx80Exp( b ); 3707 bSign = extractFloatx80Sign( b ); 3708 zSign = aSign ^ bSign; 3709 if ( aExp == 0x7FFF ) { 3710 if ( (bits64) ( aSig<<1 ) 3711 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3712 return propagateFloatx80NaN( a, b ); 3713 } 3714 if ( ( bExp | bSig ) == 0 ) goto invalid; 3715 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3716 } 3717 if ( bExp == 0x7FFF ) { 3718 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3719 if ( ( aExp | aSig ) == 0 ) { 3720 invalid: 3721 float_raise( float_flag_invalid ); 3722 z.low = floatx80_default_nan_low; 3723 z.high = floatx80_default_nan_high; 3724 return z; 3725 } 3726 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3727 } 3728 if ( aExp == 0 ) { 3729 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3730 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3731 } 3732 if ( bExp == 0 ) { 3733 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3734 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3735 } 3736 zExp = aExp + bExp - 0x3FFE; 3737 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3738 if ( 0 < (sbits64) zSig0 ) { 3739 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3740 --zExp; 3741 } 3742 return 3743 roundAndPackFloatx80( 3744 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3745 3746 } 3747 3748 /*---------------------------------------------------------------------------- 3749 | Returns the result of dividing the extended double-precision floating-point 3750 | value `a' by the corresponding value `b'. The operation is performed 3751 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3752 *----------------------------------------------------------------------------*/ 3753 3754 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3755 { 3756 flag aSign, bSign, zSign; 3757 int32 aExp, bExp, zExp; 3758 bits64 aSig, bSig, zSig0, zSig1; 3759 bits64 rem0, rem1, rem2, term0, term1, term2; 3760 floatx80 z; 3761 3762 aSig = extractFloatx80Frac( a ); 3763 aExp = extractFloatx80Exp( a ); 3764 aSign = extractFloatx80Sign( a ); 3765 bSig = extractFloatx80Frac( b ); 3766 bExp = extractFloatx80Exp( b ); 3767 bSign = extractFloatx80Sign( b ); 3768 zSign = aSign ^ bSign; 3769 if ( aExp == 0x7FFF ) { 3770 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3771 if ( bExp == 0x7FFF ) { 3772 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3773 goto invalid; 3774 } 3775 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3776 } 3777 if ( bExp == 0x7FFF ) { 3778 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3779 return packFloatx80( zSign, 0, 0 ); 3780 } 3781 if ( bExp == 0 ) { 3782 if ( bSig == 0 ) { 3783 if ( ( aExp | aSig ) == 0 ) { 3784 invalid: 3785 float_raise( float_flag_invalid ); 3786 z.low = floatx80_default_nan_low; 3787 z.high = floatx80_default_nan_high; 3788 return z; 3789 } 3790 float_raise( float_flag_divbyzero ); 3791 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3792 } 3793 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3794 } 3795 if ( aExp == 0 ) { 3796 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3797 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3798 } 3799 zExp = aExp - bExp + 0x3FFE; 3800 rem1 = 0; 3801 if ( bSig <= aSig ) { 3802 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3803 ++zExp; 3804 } 3805 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3806 mul64To128( bSig, zSig0, &term0, &term1 ); 3807 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3808 while ( (sbits64) rem0 < 0 ) { 3809 --zSig0; 3810 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3811 } 3812 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3813 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3814 mul64To128( bSig, zSig1, &term1, &term2 ); 3815 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3816 while ( (sbits64) rem1 < 0 ) { 3817 --zSig1; 3818 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3819 } 3820 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3821 } 3822 return 3823 roundAndPackFloatx80( 3824 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3825 3826 } 3827 3828 /*---------------------------------------------------------------------------- 3829 | Returns the remainder of the extended double-precision floating-point value 3830 | `a' with respect to the corresponding value `b'. The operation is performed 3831 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3832 *----------------------------------------------------------------------------*/ 3833 3834 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3835 { 3836 flag aSign, bSign, zSign; 3837 int32 aExp, bExp, expDiff; 3838 bits64 aSig0, aSig1, bSig; 3839 bits64 q, term0, term1, alternateASig0, alternateASig1; 3840 floatx80 z; 3841 3842 aSig0 = extractFloatx80Frac( a ); 3843 aExp = extractFloatx80Exp( a ); 3844 aSign = extractFloatx80Sign( a ); 3845 bSig = extractFloatx80Frac( b ); 3846 bExp = extractFloatx80Exp( b ); 3847 bSign = extractFloatx80Sign( b ); 3848 if ( aExp == 0x7FFF ) { 3849 if ( (bits64) ( aSig0<<1 ) 3850 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3851 return propagateFloatx80NaN( a, b ); 3852 } 3853 goto invalid; 3854 } 3855 if ( bExp == 0x7FFF ) { 3856 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3857 return a; 3858 } 3859 if ( bExp == 0 ) { 3860 if ( bSig == 0 ) { 3861 invalid: 3862 float_raise( float_flag_invalid ); 3863 z.low = floatx80_default_nan_low; 3864 z.high = floatx80_default_nan_high; 3865 return z; 3866 } 3867 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3868 } 3869 if ( aExp == 0 ) { 3870 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3871 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3872 } 3873 bSig |= LIT64( 0x8000000000000000 ); 3874 zSign = aSign; 3875 expDiff = aExp - bExp; 3876 aSig1 = 0; 3877 if ( expDiff < 0 ) { 3878 if ( expDiff < -1 ) return a; 3879 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3880 expDiff = 0; 3881 } 3882 q = ( bSig <= aSig0 ); 3883 if ( q ) aSig0 -= bSig; 3884 expDiff -= 64; 3885 while ( 0 < expDiff ) { 3886 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3887 q = ( 2 < q ) ? q - 2 : 0; 3888 mul64To128( bSig, q, &term0, &term1 ); 3889 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3890 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3891 expDiff -= 62; 3892 } 3893 expDiff += 64; 3894 if ( 0 < expDiff ) { 3895 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3896 q = ( 2 < q ) ? q - 2 : 0; 3897 q >>= 64 - expDiff; 3898 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 3899 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3900 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 3901 while ( le128( term0, term1, aSig0, aSig1 ) ) { 3902 ++q; 3903 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3904 } 3905 } 3906 else { 3907 term1 = 0; 3908 term0 = bSig; 3909 } 3910 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 3911 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3912 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3913 && ( q & 1 ) ) 3914 ) { 3915 aSig0 = alternateASig0; 3916 aSig1 = alternateASig1; 3917 zSign = ! zSign; 3918 } 3919 return 3920 normalizeRoundAndPackFloatx80( 3921 80, zSign, bExp + expDiff, aSig0, aSig1 ); 3922 3923 } 3924 3925 /*---------------------------------------------------------------------------- 3926 | Returns the square root of the extended double-precision floating-point 3927 | value `a'. The operation is performed according to the IEC/IEEE Standard 3928 | for Binary Floating-Point Arithmetic. 3929 *----------------------------------------------------------------------------*/ 3930 3931 floatx80 floatx80_sqrt( floatx80 a ) 3932 { 3933 flag aSign; 3934 int32 aExp, zExp; 3935 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 3936 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 3937 floatx80 z; 3938 3939 aSig0 = extractFloatx80Frac( a ); 3940 aExp = extractFloatx80Exp( a ); 3941 aSign = extractFloatx80Sign( a ); 3942 if ( aExp == 0x7FFF ) { 3943 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 3944 if ( ! aSign ) return a; 3945 goto invalid; 3946 } 3947 if ( aSign ) { 3948 if ( ( aExp | aSig0 ) == 0 ) return a; 3949 invalid: 3950 float_raise( float_flag_invalid ); 3951 z.low = floatx80_default_nan_low; 3952 z.high = floatx80_default_nan_high; 3953 return z; 3954 } 3955 if ( aExp == 0 ) { 3956 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 3957 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3958 } 3959 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 3960 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 3961 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 3962 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 3963 doubleZSig0 = zSig0<<1; 3964 mul64To128( zSig0, zSig0, &term0, &term1 ); 3965 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 3966 while ( (sbits64) rem0 < 0 ) { 3967 --zSig0; 3968 doubleZSig0 -= 2; 3969 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 3970 } 3971 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 3972 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 3973 if ( zSig1 == 0 ) zSig1 = 1; 3974 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 3975 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3976 mul64To128( zSig1, zSig1, &term2, &term3 ); 3977 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 3978 while ( (sbits64) rem1 < 0 ) { 3979 --zSig1; 3980 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 3981 term3 |= 1; 3982 term2 |= doubleZSig0; 3983 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 3984 } 3985 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 3986 } 3987 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 3988 zSig0 |= doubleZSig0; 3989 return 3990 roundAndPackFloatx80( 3991 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 3992 3993 } 3994 3995 /*---------------------------------------------------------------------------- 3996 | Returns 1 if the extended double-precision floating-point value `a' is 3997 | equal to the corresponding value `b', and 0 otherwise. The comparison is 3998 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3999 | Arithmetic. 4000 *----------------------------------------------------------------------------*/ 4001 4002 flag floatx80_eq( floatx80 a, floatx80 b ) 4003 { 4004 4005 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4006 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4007 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4008 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4009 ) { 4010 if ( floatx80_is_signaling_nan( a ) 4011 || floatx80_is_signaling_nan( b ) ) { 4012 float_raise( float_flag_invalid ); 4013 } 4014 return 0; 4015 } 4016 return 4017 ( a.low == b.low ) 4018 && ( ( a.high == b.high ) 4019 || ( ( a.low == 0 ) 4020 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4021 ); 4022 4023 } 4024 4025 /*---------------------------------------------------------------------------- 4026 | Returns 1 if the extended double-precision floating-point value `a' is 4027 | less than or equal to the corresponding value `b', and 0 otherwise. The 4028 | comparison is performed according to the IEC/IEEE Standard for Binary 4029 | Floating-Point Arithmetic. 4030 *----------------------------------------------------------------------------*/ 4031 4032 flag floatx80_le( floatx80 a, floatx80 b ) 4033 { 4034 flag aSign, bSign; 4035 4036 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4037 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4038 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4039 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4040 ) { 4041 float_raise( float_flag_invalid ); 4042 return 0; 4043 } 4044 aSign = extractFloatx80Sign( a ); 4045 bSign = extractFloatx80Sign( b ); 4046 if ( aSign != bSign ) { 4047 return 4048 aSign 4049 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4050 == 0 ); 4051 } 4052 return 4053 aSign ? le128( b.high, b.low, a.high, a.low ) 4054 : le128( a.high, a.low, b.high, b.low ); 4055 4056 } 4057 4058 /*---------------------------------------------------------------------------- 4059 | Returns 1 if the extended double-precision floating-point value `a' is 4060 | less than the corresponding value `b', and 0 otherwise. The comparison 4061 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4062 | Arithmetic. 4063 *----------------------------------------------------------------------------*/ 4064 4065 flag floatx80_lt( floatx80 a, floatx80 b ) 4066 { 4067 flag aSign, bSign; 4068 4069 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4070 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4071 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4072 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4073 ) { 4074 float_raise( float_flag_invalid ); 4075 return 0; 4076 } 4077 aSign = extractFloatx80Sign( a ); 4078 bSign = extractFloatx80Sign( b ); 4079 if ( aSign != bSign ) { 4080 return 4081 aSign 4082 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4083 != 0 ); 4084 } 4085 return 4086 aSign ? lt128( b.high, b.low, a.high, a.low ) 4087 : lt128( a.high, a.low, b.high, b.low ); 4088 4089 } 4090 4091 /*---------------------------------------------------------------------------- 4092 | Returns 1 if the extended double-precision floating-point value `a' is equal 4093 | to the corresponding value `b', and 0 otherwise. The invalid exception is 4094 | raised if either operand is a NaN. Otherwise, the comparison is performed 4095 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4096 *----------------------------------------------------------------------------*/ 4097 4098 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4099 { 4100 4101 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4102 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4103 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4104 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4105 ) { 4106 float_raise( float_flag_invalid ); 4107 return 0; 4108 } 4109 return 4110 ( a.low == b.low ) 4111 && ( ( a.high == b.high ) 4112 || ( ( a.low == 0 ) 4113 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4114 ); 4115 4116 } 4117 4118 /*---------------------------------------------------------------------------- 4119 | Returns 1 if the extended double-precision floating-point value `a' is less 4120 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4121 | do not cause an exception. Otherwise, the comparison is performed according 4122 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4123 *----------------------------------------------------------------------------*/ 4124 4125 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4126 { 4127 flag aSign, bSign; 4128 4129 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4130 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4131 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4132 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4133 ) { 4134 if ( floatx80_is_signaling_nan( a ) 4135 || floatx80_is_signaling_nan( b ) ) { 4136 float_raise( float_flag_invalid ); 4137 } 4138 return 0; 4139 } 4140 aSign = extractFloatx80Sign( a ); 4141 bSign = extractFloatx80Sign( b ); 4142 if ( aSign != bSign ) { 4143 return 4144 aSign 4145 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4146 == 0 ); 4147 } 4148 return 4149 aSign ? le128( b.high, b.low, a.high, a.low ) 4150 : le128( a.high, a.low, b.high, b.low ); 4151 4152 } 4153 4154 /*---------------------------------------------------------------------------- 4155 | Returns 1 if the extended double-precision floating-point value `a' is less 4156 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4157 | an exception. Otherwise, the comparison is performed according to the 4158 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4159 *----------------------------------------------------------------------------*/ 4160 4161 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4162 { 4163 flag aSign, bSign; 4164 4165 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4166 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4167 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4168 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4169 ) { 4170 if ( floatx80_is_signaling_nan( a ) 4171 || floatx80_is_signaling_nan( b ) ) { 4172 float_raise( float_flag_invalid ); 4173 } 4174 return 0; 4175 } 4176 aSign = extractFloatx80Sign( a ); 4177 bSign = extractFloatx80Sign( b ); 4178 if ( aSign != bSign ) { 4179 return 4180 aSign 4181 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4182 != 0 ); 4183 } 4184 return 4185 aSign ? lt128( b.high, b.low, a.high, a.low ) 4186 : lt128( a.high, a.low, b.high, b.low ); 4187 4188 } 4189 4190 #endif 4191 4192 #ifdef FLOAT128 4193 4194 /*---------------------------------------------------------------------------- 4195 | Returns the result of converting the quadruple-precision floating-point 4196 | value `a' to the 32-bit two's complement integer format. The conversion 4197 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4198 | Arithmetic---which means in particular that the conversion is rounded 4199 | according to the current rounding mode. If `a' is a NaN, the largest 4200 | positive integer is returned. Otherwise, if the conversion overflows, the 4201 | largest integer with the same sign as `a' is returned. 4202 *----------------------------------------------------------------------------*/ 4203 4204 int32 float128_to_int32( float128 a ) 4205 { 4206 flag aSign; 4207 int32 aExp, shiftCount; 4208 bits64 aSig0, aSig1; 4209 4210 aSig1 = extractFloat128Frac1( a ); 4211 aSig0 = extractFloat128Frac0( a ); 4212 aExp = extractFloat128Exp( a ); 4213 aSign = extractFloat128Sign( a ); 4214 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4215 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4216 aSig0 |= ( aSig1 != 0 ); 4217 shiftCount = 0x4028 - aExp; 4218 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4219 return roundAndPackInt32( aSign, aSig0 ); 4220 4221 } 4222 4223 /*---------------------------------------------------------------------------- 4224 | Returns the result of converting the quadruple-precision floating-point 4225 | value `a' to the 32-bit two's complement integer format. The conversion 4226 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4227 | Arithmetic, except that the conversion is always rounded toward zero. If 4228 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4229 | conversion overflows, the largest integer with the same sign as `a' is 4230 | returned. 4231 *----------------------------------------------------------------------------*/ 4232 4233 int32 float128_to_int32_round_to_zero( float128 a ) 4234 { 4235 flag aSign; 4236 int32 aExp, shiftCount; 4237 bits64 aSig0, aSig1, savedASig; 4238 int32 z; 4239 4240 aSig1 = extractFloat128Frac1( a ); 4241 aSig0 = extractFloat128Frac0( a ); 4242 aExp = extractFloat128Exp( a ); 4243 aSign = extractFloat128Sign( a ); 4244 aSig0 |= ( aSig1 != 0 ); 4245 if ( 0x401E < aExp ) { 4246 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4247 goto invalid; 4248 } 4249 else if ( aExp < 0x3FFF ) { 4250 if ( aExp || aSig0 ) float_set_inexact(); 4251 return 0; 4252 } 4253 aSig0 |= LIT64( 0x0001000000000000 ); 4254 shiftCount = 0x402F - aExp; 4255 savedASig = aSig0; 4256 aSig0 >>= shiftCount; 4257 z = aSig0; 4258 if ( aSign ) z = - z; 4259 if ( ( z < 0 ) ^ aSign ) { 4260 invalid: 4261 float_raise( float_flag_invalid ); 4262 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4263 } 4264 if ( ( aSig0<<shiftCount ) != savedASig ) { 4265 float_set_inexact(); 4266 } 4267 return z; 4268 4269 } 4270 4271 /*---------------------------------------------------------------------------- 4272 | Returns the result of converting the quadruple-precision floating-point 4273 | value `a' to the 64-bit two's complement integer format. The conversion 4274 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4275 | Arithmetic---which means in particular that the conversion is rounded 4276 | according to the current rounding mode. If `a' is a NaN, the largest 4277 | positive integer is returned. Otherwise, if the conversion overflows, the 4278 | largest integer with the same sign as `a' is returned. 4279 *----------------------------------------------------------------------------*/ 4280 4281 int64 float128_to_int64( float128 a ) 4282 { 4283 flag aSign; 4284 int32 aExp, shiftCount; 4285 bits64 aSig0, aSig1; 4286 4287 aSig1 = extractFloat128Frac1( a ); 4288 aSig0 = extractFloat128Frac0( a ); 4289 aExp = extractFloat128Exp( a ); 4290 aSign = extractFloat128Sign( a ); 4291 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4292 shiftCount = 0x402F - aExp; 4293 if ( shiftCount <= 0 ) { 4294 if ( 0x403E < aExp ) { 4295 float_raise( float_flag_invalid ); 4296 if ( ! aSign 4297 || ( ( aExp == 0x7FFF ) 4298 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4299 ) 4300 ) { 4301 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4302 } 4303 return (sbits64) LIT64( 0x8000000000000000 ); 4304 } 4305 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4306 } 4307 else { 4308 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4309 } 4310 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4311 4312 } 4313 4314 /*---------------------------------------------------------------------------- 4315 | Returns the result of converting the quadruple-precision floating-point 4316 | value `a' to the 64-bit two's complement integer format. The conversion 4317 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4318 | Arithmetic, except that the conversion is always rounded toward zero. 4319 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4320 | the conversion overflows, the largest integer with the same sign as `a' is 4321 | returned. 4322 *----------------------------------------------------------------------------*/ 4323 4324 int64 float128_to_int64_round_to_zero( float128 a ) 4325 { 4326 flag aSign; 4327 int32 aExp, shiftCount; 4328 bits64 aSig0, aSig1; 4329 int64 z; 4330 4331 aSig1 = extractFloat128Frac1( a ); 4332 aSig0 = extractFloat128Frac0( a ); 4333 aExp = extractFloat128Exp( a ); 4334 aSign = extractFloat128Sign( a ); 4335 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4336 shiftCount = aExp - 0x402F; 4337 if ( 0 < shiftCount ) { 4338 if ( 0x403E <= aExp ) { 4339 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4340 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4341 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4342 if ( aSig1 ) float_set_inexact(); 4343 } 4344 else { 4345 float_raise( float_flag_invalid ); 4346 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4347 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4348 } 4349 } 4350 return (sbits64) LIT64( 0x8000000000000000 ); 4351 } 4352 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4353 if ( (bits64) ( aSig1<<shiftCount ) ) { 4354 float_set_inexact(); 4355 } 4356 } 4357 else { 4358 if ( aExp < 0x3FFF ) { 4359 if ( aExp | aSig0 | aSig1 ) { 4360 float_set_inexact(); 4361 } 4362 return 0; 4363 } 4364 z = aSig0>>( - shiftCount ); 4365 if ( aSig1 4366 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4367 float_set_inexact(); 4368 } 4369 } 4370 if ( aSign ) z = - z; 4371 return z; 4372 4373 } 4374 4375 /*---------------------------------------------------------------------------- 4376 | Returns the result of converting the quadruple-precision floating-point 4377 | value `a' to the single-precision floating-point format. The conversion 4378 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4379 | Arithmetic. 4380 *----------------------------------------------------------------------------*/ 4381 4382 float32 float128_to_float32( float128 a ) 4383 { 4384 flag aSign; 4385 int32 aExp; 4386 bits64 aSig0, aSig1; 4387 bits32 zSig; 4388 4389 aSig1 = extractFloat128Frac1( a ); 4390 aSig0 = extractFloat128Frac0( a ); 4391 aExp = extractFloat128Exp( a ); 4392 aSign = extractFloat128Sign( a ); 4393 if ( aExp == 0x7FFF ) { 4394 if ( aSig0 | aSig1 ) { 4395 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4396 } 4397 return packFloat32( aSign, 0xFF, 0 ); 4398 } 4399 aSig0 |= ( aSig1 != 0 ); 4400 shift64RightJamming( aSig0, 18, &aSig0 ); 4401 zSig = aSig0; 4402 if ( aExp || zSig ) { 4403 zSig |= 0x40000000; 4404 aExp -= 0x3F81; 4405 } 4406 return roundAndPackFloat32( aSign, aExp, zSig ); 4407 4408 } 4409 4410 /*---------------------------------------------------------------------------- 4411 | Returns the result of converting the quadruple-precision floating-point 4412 | value `a' to the double-precision floating-point format. The conversion 4413 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4414 | Arithmetic. 4415 *----------------------------------------------------------------------------*/ 4416 4417 float64 float128_to_float64( float128 a ) 4418 { 4419 flag aSign; 4420 int32 aExp; 4421 bits64 aSig0, aSig1; 4422 4423 aSig1 = extractFloat128Frac1( a ); 4424 aSig0 = extractFloat128Frac0( a ); 4425 aExp = extractFloat128Exp( a ); 4426 aSign = extractFloat128Sign( a ); 4427 if ( aExp == 0x7FFF ) { 4428 if ( aSig0 | aSig1 ) { 4429 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4430 } 4431 return packFloat64( aSign, 0x7FF, 0 ); 4432 } 4433 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4434 aSig0 |= ( aSig1 != 0 ); 4435 if ( aExp || aSig0 ) { 4436 aSig0 |= LIT64( 0x4000000000000000 ); 4437 aExp -= 0x3C01; 4438 } 4439 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4440 4441 } 4442 4443 #ifdef FLOATX80 4444 4445 /*---------------------------------------------------------------------------- 4446 | Returns the result of converting the quadruple-precision floating-point 4447 | value `a' to the extended double-precision floating-point format. The 4448 | conversion is performed according to the IEC/IEEE Standard for Binary 4449 | Floating-Point Arithmetic. 4450 *----------------------------------------------------------------------------*/ 4451 4452 floatx80 float128_to_floatx80( float128 a ) 4453 { 4454 flag aSign; 4455 int32 aExp; 4456 bits64 aSig0, aSig1; 4457 4458 aSig1 = extractFloat128Frac1( a ); 4459 aSig0 = extractFloat128Frac0( a ); 4460 aExp = extractFloat128Exp( a ); 4461 aSign = extractFloat128Sign( a ); 4462 if ( aExp == 0x7FFF ) { 4463 if ( aSig0 | aSig1 ) { 4464 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4465 } 4466 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4467 } 4468 if ( aExp == 0 ) { 4469 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4470 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4471 } 4472 else { 4473 aSig0 |= LIT64( 0x0001000000000000 ); 4474 } 4475 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4476 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4477 4478 } 4479 4480 #endif 4481 4482 /*---------------------------------------------------------------------------- 4483 | Rounds the quadruple-precision floating-point value `a' to an integer, and 4484 | returns the result as a quadruple-precision floating-point value. The 4485 | operation is performed according to the IEC/IEEE Standard for Binary 4486 | Floating-Point Arithmetic. 4487 *----------------------------------------------------------------------------*/ 4488 4489 float128 float128_round_to_int( float128 a ) 4490 { 4491 flag aSign; 4492 int32 aExp; 4493 bits64 lastBitMask, roundBitsMask; 4494 int8 roundingMode; 4495 float128 z; 4496 4497 aExp = extractFloat128Exp( a ); 4498 if ( 0x402F <= aExp ) { 4499 if ( 0x406F <= aExp ) { 4500 if ( ( aExp == 0x7FFF ) 4501 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4502 ) { 4503 return propagateFloat128NaN( a, a ); 4504 } 4505 return a; 4506 } 4507 lastBitMask = 1; 4508 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4509 roundBitsMask = lastBitMask - 1; 4510 z = a; 4511 roundingMode = float_rounding_mode(); 4512 if ( roundingMode == float_round_nearest_even ) { 4513 if ( lastBitMask ) { 4514 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4515 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4516 } 4517 else { 4518 if ( (sbits64) z.low < 0 ) { 4519 ++z.high; 4520 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4521 } 4522 } 4523 } 4524 else if ( roundingMode != float_round_to_zero ) { 4525 if ( extractFloat128Sign( z ) 4526 ^ ( roundingMode == float_round_up ) ) { 4527 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4528 } 4529 } 4530 z.low &= ~ roundBitsMask; 4531 } 4532 else { 4533 if ( aExp < 0x3FFF ) { 4534 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4535 float_set_inexact(); 4536 aSign = extractFloat128Sign( a ); 4537 switch ( float_rounding_mode() ) { 4538 case float_round_nearest_even: 4539 if ( ( aExp == 0x3FFE ) 4540 && ( extractFloat128Frac0( a ) 4541 | extractFloat128Frac1( a ) ) 4542 ) { 4543 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4544 } 4545 break; 4546 case float_round_down: 4547 return 4548 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4549 : packFloat128( 0, 0, 0, 0 ); 4550 case float_round_up: 4551 return 4552 aSign ? packFloat128( 1, 0, 0, 0 ) 4553 : packFloat128( 0, 0x3FFF, 0, 0 ); 4554 } 4555 return packFloat128( aSign, 0, 0, 0 ); 4556 } 4557 lastBitMask = 1; 4558 lastBitMask <<= 0x402F - aExp; 4559 roundBitsMask = lastBitMask - 1; 4560 z.low = 0; 4561 z.high = a.high; 4562 roundingMode = float_rounding_mode(); 4563 if ( roundingMode == float_round_nearest_even ) { 4564 z.high += lastBitMask>>1; 4565 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4566 z.high &= ~ lastBitMask; 4567 } 4568 } 4569 else if ( roundingMode != float_round_to_zero ) { 4570 if ( extractFloat128Sign( z ) 4571 ^ ( roundingMode == float_round_up ) ) { 4572 z.high |= ( a.low != 0 ); 4573 z.high += roundBitsMask; 4574 } 4575 } 4576 z.high &= ~ roundBitsMask; 4577 } 4578 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4579 float_set_inexact(); 4580 } 4581 return z; 4582 4583 } 4584 4585 /*---------------------------------------------------------------------------- 4586 | Returns the result of adding the absolute values of the quadruple-precision 4587 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4588 | before being returned. `zSign' is ignored if the result is a NaN. 4589 | The addition is performed according to the IEC/IEEE Standard for Binary 4590 | Floating-Point Arithmetic. 4591 *----------------------------------------------------------------------------*/ 4592 4593 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4594 { 4595 int32 aExp, bExp, zExp; 4596 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4597 int32 expDiff; 4598 4599 aSig1 = extractFloat128Frac1( a ); 4600 aSig0 = extractFloat128Frac0( a ); 4601 aExp = extractFloat128Exp( a ); 4602 bSig1 = extractFloat128Frac1( b ); 4603 bSig0 = extractFloat128Frac0( b ); 4604 bExp = extractFloat128Exp( b ); 4605 expDiff = aExp - bExp; 4606 if ( 0 < expDiff ) { 4607 if ( aExp == 0x7FFF ) { 4608 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4609 return a; 4610 } 4611 if ( bExp == 0 ) { 4612 --expDiff; 4613 } 4614 else { 4615 bSig0 |= LIT64( 0x0001000000000000 ); 4616 } 4617 shift128ExtraRightJamming( 4618 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4619 zExp = aExp; 4620 } 4621 else if ( expDiff < 0 ) { 4622 if ( bExp == 0x7FFF ) { 4623 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4624 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4625 } 4626 if ( aExp == 0 ) { 4627 ++expDiff; 4628 } 4629 else { 4630 aSig0 |= LIT64( 0x0001000000000000 ); 4631 } 4632 shift128ExtraRightJamming( 4633 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4634 zExp = bExp; 4635 } 4636 else { 4637 if ( aExp == 0x7FFF ) { 4638 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4639 return propagateFloat128NaN( a, b ); 4640 } 4641 return a; 4642 } 4643 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4644 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4645 zSig2 = 0; 4646 zSig0 |= LIT64( 0x0002000000000000 ); 4647 zExp = aExp; 4648 goto shiftRight1; 4649 } 4650 aSig0 |= LIT64( 0x0001000000000000 ); 4651 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4652 --zExp; 4653 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4654 ++zExp; 4655 shiftRight1: 4656 shift128ExtraRightJamming( 4657 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4658 roundAndPack: 4659 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4660 4661 } 4662 4663 /*---------------------------------------------------------------------------- 4664 | Returns the result of subtracting the absolute values of the quadruple- 4665 | precision floating-point values `a' and `b'. If `zSign' is 1, the 4666 | difference is negated before being returned. `zSign' is ignored if the 4667 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4668 | Standard for Binary Floating-Point Arithmetic. 4669 *----------------------------------------------------------------------------*/ 4670 4671 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4672 { 4673 int32 aExp, bExp, zExp; 4674 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4675 int32 expDiff; 4676 float128 z; 4677 4678 aSig1 = extractFloat128Frac1( a ); 4679 aSig0 = extractFloat128Frac0( a ); 4680 aExp = extractFloat128Exp( a ); 4681 bSig1 = extractFloat128Frac1( b ); 4682 bSig0 = extractFloat128Frac0( b ); 4683 bExp = extractFloat128Exp( b ); 4684 expDiff = aExp - bExp; 4685 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4686 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4687 if ( 0 < expDiff ) goto aExpBigger; 4688 if ( expDiff < 0 ) goto bExpBigger; 4689 if ( aExp == 0x7FFF ) { 4690 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4691 return propagateFloat128NaN( a, b ); 4692 } 4693 float_raise( float_flag_invalid ); 4694 z.low = float128_default_nan_low; 4695 z.high = float128_default_nan_high; 4696 return z; 4697 } 4698 if ( aExp == 0 ) { 4699 aExp = 1; 4700 bExp = 1; 4701 } 4702 if ( bSig0 < aSig0 ) goto aBigger; 4703 if ( aSig0 < bSig0 ) goto bBigger; 4704 if ( bSig1 < aSig1 ) goto aBigger; 4705 if ( aSig1 < bSig1 ) goto bBigger; 4706 return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 ); 4707 bExpBigger: 4708 if ( bExp == 0x7FFF ) { 4709 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4710 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4711 } 4712 if ( aExp == 0 ) { 4713 ++expDiff; 4714 } 4715 else { 4716 aSig0 |= LIT64( 0x4000000000000000 ); 4717 } 4718 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4719 bSig0 |= LIT64( 0x4000000000000000 ); 4720 bBigger: 4721 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4722 zExp = bExp; 4723 zSign ^= 1; 4724 goto normalizeRoundAndPack; 4725 aExpBigger: 4726 if ( aExp == 0x7FFF ) { 4727 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4728 return a; 4729 } 4730 if ( bExp == 0 ) { 4731 --expDiff; 4732 } 4733 else { 4734 bSig0 |= LIT64( 0x4000000000000000 ); 4735 } 4736 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4737 aSig0 |= LIT64( 0x4000000000000000 ); 4738 aBigger: 4739 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4740 zExp = aExp; 4741 normalizeRoundAndPack: 4742 --zExp; 4743 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4744 4745 } 4746 4747 /*---------------------------------------------------------------------------- 4748 | Returns the result of adding the quadruple-precision floating-point values 4749 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4750 | for Binary Floating-Point Arithmetic. 4751 *----------------------------------------------------------------------------*/ 4752 4753 float128 float128_add( float128 a, float128 b ) 4754 { 4755 flag aSign, bSign; 4756 4757 aSign = extractFloat128Sign( a ); 4758 bSign = extractFloat128Sign( b ); 4759 if ( aSign == bSign ) { 4760 return addFloat128Sigs( a, b, aSign ); 4761 } 4762 else { 4763 return subFloat128Sigs( a, b, aSign ); 4764 } 4765 4766 } 4767 4768 /*---------------------------------------------------------------------------- 4769 | Returns the result of subtracting the quadruple-precision floating-point 4770 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4771 | Standard for Binary Floating-Point Arithmetic. 4772 *----------------------------------------------------------------------------*/ 4773 4774 float128 float128_sub( float128 a, float128 b ) 4775 { 4776 flag aSign, bSign; 4777 4778 aSign = extractFloat128Sign( a ); 4779 bSign = extractFloat128Sign( b ); 4780 if ( aSign == bSign ) { 4781 return subFloat128Sigs( a, b, aSign ); 4782 } 4783 else { 4784 return addFloat128Sigs( a, b, aSign ); 4785 } 4786 4787 } 4788 4789 /*---------------------------------------------------------------------------- 4790 | Returns the result of multiplying the quadruple-precision floating-point 4791 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4792 | Standard for Binary Floating-Point Arithmetic. 4793 *----------------------------------------------------------------------------*/ 4794 4795 float128 float128_mul( float128 a, float128 b ) 4796 { 4797 flag aSign, bSign, zSign; 4798 int32 aExp, bExp, zExp; 4799 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4800 float128 z; 4801 4802 aSig1 = extractFloat128Frac1( a ); 4803 aSig0 = extractFloat128Frac0( a ); 4804 aExp = extractFloat128Exp( a ); 4805 aSign = extractFloat128Sign( a ); 4806 bSig1 = extractFloat128Frac1( b ); 4807 bSig0 = extractFloat128Frac0( b ); 4808 bExp = extractFloat128Exp( b ); 4809 bSign = extractFloat128Sign( b ); 4810 zSign = aSign ^ bSign; 4811 if ( aExp == 0x7FFF ) { 4812 if ( ( aSig0 | aSig1 ) 4813 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4814 return propagateFloat128NaN( a, b ); 4815 } 4816 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4817 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4818 } 4819 if ( bExp == 0x7FFF ) { 4820 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4821 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4822 invalid: 4823 float_raise( float_flag_invalid ); 4824 z.low = float128_default_nan_low; 4825 z.high = float128_default_nan_high; 4826 return z; 4827 } 4828 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4829 } 4830 if ( aExp == 0 ) { 4831 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4832 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4833 } 4834 if ( bExp == 0 ) { 4835 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4836 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4837 } 4838 zExp = aExp + bExp - 0x4000; 4839 aSig0 |= LIT64( 0x0001000000000000 ); 4840 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 4841 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 4842 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4843 zSig2 |= ( zSig3 != 0 ); 4844 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 4845 shift128ExtraRightJamming( 4846 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4847 ++zExp; 4848 } 4849 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4850 4851 } 4852 4853 /*---------------------------------------------------------------------------- 4854 | Returns the result of dividing the quadruple-precision floating-point value 4855 | `a' by the corresponding value `b'. The operation is performed according to 4856 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4857 *----------------------------------------------------------------------------*/ 4858 4859 float128 float128_div( float128 a, float128 b ) 4860 { 4861 flag aSign, bSign, zSign; 4862 int32 aExp, bExp, zExp; 4863 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4864 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4865 float128 z; 4866 4867 aSig1 = extractFloat128Frac1( a ); 4868 aSig0 = extractFloat128Frac0( a ); 4869 aExp = extractFloat128Exp( a ); 4870 aSign = extractFloat128Sign( a ); 4871 bSig1 = extractFloat128Frac1( b ); 4872 bSig0 = extractFloat128Frac0( b ); 4873 bExp = extractFloat128Exp( b ); 4874 bSign = extractFloat128Sign( b ); 4875 zSign = aSign ^ bSign; 4876 if ( aExp == 0x7FFF ) { 4877 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4878 if ( bExp == 0x7FFF ) { 4879 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4880 goto invalid; 4881 } 4882 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4883 } 4884 if ( bExp == 0x7FFF ) { 4885 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4886 return packFloat128( zSign, 0, 0, 0 ); 4887 } 4888 if ( bExp == 0 ) { 4889 if ( ( bSig0 | bSig1 ) == 0 ) { 4890 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4891 invalid: 4892 float_raise( float_flag_invalid ); 4893 z.low = float128_default_nan_low; 4894 z.high = float128_default_nan_high; 4895 return z; 4896 } 4897 float_raise( float_flag_divbyzero ); 4898 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4899 } 4900 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4901 } 4902 if ( aExp == 0 ) { 4903 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4904 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4905 } 4906 zExp = aExp - bExp + 0x3FFD; 4907 shortShift128Left( 4908 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 4909 shortShift128Left( 4910 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 4911 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 4912 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 4913 ++zExp; 4914 } 4915 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 4916 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 4917 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 4918 while ( (sbits64) rem0 < 0 ) { 4919 --zSig0; 4920 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 4921 } 4922 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 4923 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 4924 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 4925 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 4926 while ( (sbits64) rem1 < 0 ) { 4927 --zSig1; 4928 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 4929 } 4930 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4931 } 4932 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 4933 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4934 4935 } 4936 4937 /*---------------------------------------------------------------------------- 4938 | Returns the remainder of the quadruple-precision floating-point value `a' 4939 | with respect to the corresponding value `b'. The operation is performed 4940 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4941 *----------------------------------------------------------------------------*/ 4942 4943 float128 float128_rem( float128 a, float128 b ) 4944 { 4945 flag aSign, bSign, zSign; 4946 int32 aExp, bExp, expDiff; 4947 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 4948 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 4949 sbits64 sigMean0; 4950 float128 z; 4951 4952 aSig1 = extractFloat128Frac1( a ); 4953 aSig0 = extractFloat128Frac0( a ); 4954 aExp = extractFloat128Exp( a ); 4955 aSign = extractFloat128Sign( a ); 4956 bSig1 = extractFloat128Frac1( b ); 4957 bSig0 = extractFloat128Frac0( b ); 4958 bExp = extractFloat128Exp( b ); 4959 bSign = extractFloat128Sign( b ); 4960 if ( aExp == 0x7FFF ) { 4961 if ( ( aSig0 | aSig1 ) 4962 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4963 return propagateFloat128NaN( a, b ); 4964 } 4965 goto invalid; 4966 } 4967 if ( bExp == 0x7FFF ) { 4968 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4969 return a; 4970 } 4971 if ( bExp == 0 ) { 4972 if ( ( bSig0 | bSig1 ) == 0 ) { 4973 invalid: 4974 float_raise( float_flag_invalid ); 4975 z.low = float128_default_nan_low; 4976 z.high = float128_default_nan_high; 4977 return z; 4978 } 4979 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4980 } 4981 if ( aExp == 0 ) { 4982 if ( ( aSig0 | aSig1 ) == 0 ) return a; 4983 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4984 } 4985 expDiff = aExp - bExp; 4986 if ( expDiff < -1 ) return a; 4987 shortShift128Left( 4988 aSig0 | LIT64( 0x0001000000000000 ), 4989 aSig1, 4990 15 - ( expDiff < 0 ), 4991 &aSig0, 4992 &aSig1 4993 ); 4994 shortShift128Left( 4995 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 4996 q = le128( bSig0, bSig1, aSig0, aSig1 ); 4997 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 4998 expDiff -= 64; 4999 while ( 0 < expDiff ) { 5000 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5001 q = ( 4 < q ) ? q - 4 : 0; 5002 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5003 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5004 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5005 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5006 expDiff -= 61; 5007 } 5008 if ( -64 < expDiff ) { 5009 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5010 q = ( 4 < q ) ? q - 4 : 0; 5011 q >>= - expDiff; 5012 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5013 expDiff += 52; 5014 if ( expDiff < 0 ) { 5015 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5016 } 5017 else { 5018 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5019 } 5020 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5021 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5022 } 5023 else { 5024 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5025 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5026 } 5027 do { 5028 alternateASig0 = aSig0; 5029 alternateASig1 = aSig1; 5030 ++q; 5031 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5032 } while ( 0 <= (sbits64) aSig0 ); 5033 add128( 5034 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 ); 5035 if ( ( sigMean0 < 0 ) 5036 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5037 aSig0 = alternateASig0; 5038 aSig1 = alternateASig1; 5039 } 5040 zSign = ( (sbits64) aSig0 < 0 ); 5041 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5042 return 5043 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5044 5045 } 5046 5047 /*---------------------------------------------------------------------------- 5048 | Returns the square root of the quadruple-precision floating-point value `a'. 5049 | The operation is performed according to the IEC/IEEE Standard for Binary 5050 | Floating-Point Arithmetic. 5051 *----------------------------------------------------------------------------*/ 5052 5053 float128 float128_sqrt( float128 a ) 5054 { 5055 flag aSign; 5056 int32 aExp, zExp; 5057 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5058 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5059 float128 z; 5060 5061 aSig1 = extractFloat128Frac1( a ); 5062 aSig0 = extractFloat128Frac0( a ); 5063 aExp = extractFloat128Exp( a ); 5064 aSign = extractFloat128Sign( a ); 5065 if ( aExp == 0x7FFF ) { 5066 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5067 if ( ! aSign ) return a; 5068 goto invalid; 5069 } 5070 if ( aSign ) { 5071 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5072 invalid: 5073 float_raise( float_flag_invalid ); 5074 z.low = float128_default_nan_low; 5075 z.high = float128_default_nan_high; 5076 return z; 5077 } 5078 if ( aExp == 0 ) { 5079 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5080 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5081 } 5082 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5083 aSig0 |= LIT64( 0x0001000000000000 ); 5084 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5085 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5086 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5087 doubleZSig0 = zSig0<<1; 5088 mul64To128( zSig0, zSig0, &term0, &term1 ); 5089 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5090 while ( (sbits64) rem0 < 0 ) { 5091 --zSig0; 5092 doubleZSig0 -= 2; 5093 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5094 } 5095 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5096 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5097 if ( zSig1 == 0 ) zSig1 = 1; 5098 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5099 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5100 mul64To128( zSig1, zSig1, &term2, &term3 ); 5101 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5102 while ( (sbits64) rem1 < 0 ) { 5103 --zSig1; 5104 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5105 term3 |= 1; 5106 term2 |= doubleZSig0; 5107 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5108 } 5109 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5110 } 5111 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5112 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5113 5114 } 5115 5116 /*---------------------------------------------------------------------------- 5117 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 5118 | the corresponding value `b', and 0 otherwise. The comparison is performed 5119 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5120 *----------------------------------------------------------------------------*/ 5121 5122 flag float128_eq( float128 a, float128 b ) 5123 { 5124 5125 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5126 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5127 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5128 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5129 ) { 5130 if ( float128_is_signaling_nan( a ) 5131 || float128_is_signaling_nan( b ) ) { 5132 float_raise( float_flag_invalid ); 5133 } 5134 return 0; 5135 } 5136 return 5137 ( a.low == b.low ) 5138 && ( ( a.high == b.high ) 5139 || ( ( a.low == 0 ) 5140 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5141 ); 5142 5143 } 5144 5145 /*---------------------------------------------------------------------------- 5146 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5147 | or equal to the corresponding value `b', and 0 otherwise. The comparison 5148 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5149 | Arithmetic. 5150 *----------------------------------------------------------------------------*/ 5151 5152 flag float128_le( float128 a, float128 b ) 5153 { 5154 flag aSign, bSign; 5155 5156 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5157 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5158 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5159 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5160 ) { 5161 float_raise( float_flag_invalid ); 5162 return 0; 5163 } 5164 aSign = extractFloat128Sign( a ); 5165 bSign = extractFloat128Sign( b ); 5166 if ( aSign != bSign ) { 5167 return 5168 aSign 5169 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5170 == 0 ); 5171 } 5172 return 5173 aSign ? le128( b.high, b.low, a.high, a.low ) 5174 : le128( a.high, a.low, b.high, b.low ); 5175 5176 } 5177 5178 /*---------------------------------------------------------------------------- 5179 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5180 | the corresponding value `b', and 0 otherwise. The comparison is performed 5181 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5182 *----------------------------------------------------------------------------*/ 5183 5184 flag float128_lt( float128 a, float128 b ) 5185 { 5186 flag aSign, bSign; 5187 5188 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5189 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5190 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5191 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5192 ) { 5193 float_raise( float_flag_invalid ); 5194 return 0; 5195 } 5196 aSign = extractFloat128Sign( a ); 5197 bSign = extractFloat128Sign( b ); 5198 if ( aSign != bSign ) { 5199 return 5200 aSign 5201 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5202 != 0 ); 5203 } 5204 return 5205 aSign ? lt128( b.high, b.low, a.high, a.low ) 5206 : lt128( a.high, a.low, b.high, b.low ); 5207 5208 } 5209 5210 /*---------------------------------------------------------------------------- 5211 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 5212 | the corresponding value `b', and 0 otherwise. The invalid exception is 5213 | raised if either operand is a NaN. Otherwise, the comparison is performed 5214 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5215 *----------------------------------------------------------------------------*/ 5216 5217 flag float128_eq_signaling( float128 a, float128 b ) 5218 { 5219 5220 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5221 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5222 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5223 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5224 ) { 5225 float_raise( float_flag_invalid ); 5226 return 0; 5227 } 5228 return 5229 ( a.low == b.low ) 5230 && ( ( a.high == b.high ) 5231 || ( ( a.low == 0 ) 5232 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5233 ); 5234 5235 } 5236 5237 /*---------------------------------------------------------------------------- 5238 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5239 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5240 | cause an exception. Otherwise, the comparison is performed according to the 5241 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5242 *----------------------------------------------------------------------------*/ 5243 5244 flag float128_le_quiet( float128 a, float128 b ) 5245 { 5246 flag aSign, bSign; 5247 5248 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5249 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5250 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5251 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5252 ) { 5253 if ( float128_is_signaling_nan( a ) 5254 || float128_is_signaling_nan( b ) ) { 5255 float_raise( float_flag_invalid ); 5256 } 5257 return 0; 5258 } 5259 aSign = extractFloat128Sign( a ); 5260 bSign = extractFloat128Sign( b ); 5261 if ( aSign != bSign ) { 5262 return 5263 aSign 5264 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5265 == 0 ); 5266 } 5267 return 5268 aSign ? le128( b.high, b.low, a.high, a.low ) 5269 : le128( a.high, a.low, b.high, b.low ); 5270 5271 } 5272 5273 /*---------------------------------------------------------------------------- 5274 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5275 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5276 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 5277 | Standard for Binary Floating-Point Arithmetic. 5278 *----------------------------------------------------------------------------*/ 5279 5280 flag float128_lt_quiet( float128 a, float128 b ) 5281 { 5282 flag aSign, bSign; 5283 5284 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5285 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5286 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5287 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5288 ) { 5289 if ( float128_is_signaling_nan( a ) 5290 || float128_is_signaling_nan( b ) ) { 5291 float_raise( float_flag_invalid ); 5292 } 5293 return 0; 5294 } 5295 aSign = extractFloat128Sign( a ); 5296 bSign = extractFloat128Sign( b ); 5297 if ( aSign != bSign ) { 5298 return 5299 aSign 5300 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5301 != 0 ); 5302 } 5303 return 5304 aSign ? lt128( b.high, b.low, a.high, a.low ) 5305 : lt128( a.high, a.low, b.high, b.low ); 5306 5307 } 5308 5309 #endif 5310 5311 5312 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5313 5314 /* 5315 * These two routines are not part of the original softfloat distribution. 5316 * 5317 * They are based on the corresponding conversions to integer but return 5318 * unsigned numbers instead since these functions are required by GCC. 5319 * 5320 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97 5321 * 5322 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5323 */ 5324 5325 /*---------------------------------------------------------------------------- 5326 | Returns the result of converting the double-precision floating-point value 5327 | `a' to the 32-bit unsigned integer format. The conversion is 5328 | performed according to the IEC/IEEE Standard for Binary Floating-point 5329 | Arithmetic, except that the conversion is always rounded toward zero. If 5330 | `a' is a NaN, the largest positive integer is returned. If the conversion 5331 | overflows, the largest integer positive is returned. 5332 *----------------------------------------------------------------------------*/ 5333 5334 uint32 float64_to_uint32_round_to_zero( float64 a ) 5335 { 5336 flag aSign; 5337 int16 aExp, shiftCount; 5338 bits64 aSig, savedASig; 5339 uint32 z; 5340 5341 aSig = extractFloat64Frac( a ); 5342 aExp = extractFloat64Exp( a ); 5343 aSign = extractFloat64Sign( a ); 5344 5345 if (aSign) { 5346 float_raise( float_flag_invalid ); 5347 return(0); 5348 } 5349 5350 if ( 0x41E < aExp ) { 5351 float_raise( float_flag_invalid ); 5352 return 0xffffffff; 5353 } 5354 else if ( aExp < 0x3FF ) { 5355 if ( aExp || aSig ) float_set_inexact(); 5356 return 0; 5357 } 5358 aSig |= LIT64( 0x0010000000000000 ); 5359 shiftCount = 0x433 - aExp; 5360 savedASig = aSig; 5361 aSig >>= shiftCount; 5362 z = aSig; 5363 if ( ( aSig<<shiftCount ) != savedASig ) { 5364 float_set_inexact(); 5365 } 5366 return z; 5367 5368 } 5369 5370 /*---------------------------------------------------------------------------- 5371 | Returns the result of converting the single-precision floating-point value 5372 | `a' to the 32-bit unsigned integer format. The conversion is 5373 | performed according to the IEC/IEEE Standard for Binary Floating-point 5374 | Arithmetic, except that the conversion is always rounded toward zero. If 5375 | `a' is a NaN, the largest positive integer is returned. If the conversion 5376 | overflows, the largest positive integer is returned. 5377 *----------------------------------------------------------------------------*/ 5378 5379 uint32 float32_to_uint32_round_to_zero( float32 a ) 5380 { 5381 flag aSign; 5382 int16 aExp, shiftCount; 5383 bits32 aSig; 5384 uint32 z; 5385 5386 aSig = extractFloat32Frac( a ); 5387 aExp = extractFloat32Exp( a ); 5388 aSign = extractFloat32Sign( a ); 5389 shiftCount = aExp - 0x9E; 5390 5391 if (aSign) { 5392 float_raise( float_flag_invalid ); 5393 return(0); 5394 } 5395 if ( 0 < shiftCount ) { 5396 float_raise( float_flag_invalid ); 5397 return 0xFFFFFFFF; 5398 } 5399 else if ( aExp <= 0x7E ) { 5400 if ( aExp | aSig ) float_set_inexact(); 5401 return 0; 5402 } 5403 aSig = ( aSig | 0x800000 )<<8; 5404 z = aSig>>( - shiftCount ); 5405 if ( aSig<<( shiftCount & 31 ) ) { 5406 float_set_inexact(); 5407 } 5408 return z; 5409 5410 } 5411 5412 #endif 5413 5414 #endif /* _STANDALONE */ 5415