1 /* $NetBSD: softfloat.c,v 1.7 2020/09/02 03:45:54 thorpej Exp $ */ 2 3 /* 4 * This version hacked for use with gcc -msoft-float by bjh21. 5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 6 * itself). 7 */ 8 9 /* 10 * Things you may want to define: 11 * 12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 14 * properly renamed. 15 */ 16 17 /*============================================================================ 18 19 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic 20 Package, Release 2b. 21 22 Written by John R. Hauser. This work was made possible in part by the 23 International Computer Science Institute, located at Suite 600, 1947 Center 24 Street, Berkeley, California 94704. Funding was partially provided by the 25 National Science Foundation under grant MIP-9311980. The original version 26 of this code was written as part of a project to build a fixed-point vector 27 processor in collaboration with the University of California at Berkeley, 28 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 29 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ 30 arithmetic/SoftFloat.html'. 31 32 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has 33 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES 34 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS 35 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, 36 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE 37 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE 38 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR 39 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. 40 41 Derivative works are acceptable, even for commercial purposes, so long as 42 (1) the source code for the derivative work includes prominent notice that 43 the work is derivative, and (2) the source code includes prominent notice with 44 these four paragraphs for those parts of this code that are retained. 45 46 =============================================================================*/ 47 48 /* If you need this in a boot program, you have bigger problems... */ 49 #ifndef _STANDALONE 50 51 #include <sys/cdefs.h> 52 #if defined(LIBC_SCCS) && !defined(lint) 53 __RCSID("$NetBSD: softfloat.c,v 1.7 2020/09/02 03:45:54 thorpej Exp $"); 54 #endif /* LIBC_SCCS and not lint */ 55 56 #ifdef SOFTFLOAT_FOR_GCC 57 #include "softfloat-for-gcc.h" 58 #endif 59 60 #include "milieu.h" 61 #include "softfloat.h" 62 63 /* 64 * Conversions between floats as stored in memory and floats as 65 * SoftFloat uses them 66 */ 67 #ifndef FLOAT64_DEMANGLE 68 #define FLOAT64_DEMANGLE(a) (a) 69 #endif 70 #ifndef FLOAT64_MANGLE 71 #define FLOAT64_MANGLE(a) (a) 72 #endif 73 74 /*---------------------------------------------------------------------------- 75 | Floating-point rounding mode, extended double-precision rounding precision, 76 | and exception flags. 77 *----------------------------------------------------------------------------*/ 78 /* 79 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday. 80 * Right now, it does not. I've removed all other dynamic global 81 * variables. [ross] 82 */ 83 #ifdef FLOATX80 84 int8 floatx80_rounding_precision = 80; 85 #endif 86 87 /*---------------------------------------------------------------------------- 88 | Primitive arithmetic functions, including multi-word arithmetic, and 89 | division and square root approximations. (Can be specialized to target if 90 | desired.) 91 *----------------------------------------------------------------------------*/ 92 #include "softfloat-macros.h" 93 94 /*---------------------------------------------------------------------------- 95 | Functions and definitions to determine: (1) whether tininess for underflow 96 | is detected before or after rounding by default, (2) what (if anything) 97 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 98 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 99 | are propagated from function inputs to output. These details are target- 100 | specific. 101 *----------------------------------------------------------------------------*/ 102 #include "softfloat-specialize.h" 103 104 #ifndef SOFTFLOAT_FOR_GCC /* Not used */ 105 /*---------------------------------------------------------------------------- 106 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 107 | and 7, and returns the properly rounded 32-bit integer corresponding to the 108 | input. If `zSign' is 1, the input is negated before being converted to an 109 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 110 | is simply rounded to an integer, with the inexact exception raised if the 111 | input cannot be represented exactly as an integer. However, if the fixed- 112 | point input is too large, the invalid exception is raised and the largest 113 | positive or negative integer is returned. 114 *----------------------------------------------------------------------------*/ 115 116 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 117 { 118 int8 roundingMode; 119 flag roundNearestEven; 120 int8 roundIncrement, roundBits; 121 int32 z; 122 123 roundingMode = float_rounding_mode(); 124 roundNearestEven = ( roundingMode == float_round_nearest_even ); 125 roundIncrement = 0x40; 126 if ( ! roundNearestEven ) { 127 if ( roundingMode == float_round_to_zero ) { 128 roundIncrement = 0; 129 } 130 else { 131 roundIncrement = 0x7F; 132 if ( zSign ) { 133 if ( roundingMode == float_round_up ) roundIncrement = 0; 134 } 135 else { 136 if ( roundingMode == float_round_down ) roundIncrement = 0; 137 } 138 } 139 } 140 roundBits = absZ & 0x7F; 141 absZ = ( absZ + roundIncrement )>>7; 142 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 143 z = absZ; 144 if ( zSign ) z = - z; 145 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 146 float_raise( float_flag_invalid ); 147 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 148 } 149 if ( roundBits ) float_set_inexact(); 150 return z; 151 152 } 153 154 /*---------------------------------------------------------------------------- 155 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 156 | `absZ1', with binary point between bits 63 and 64 (between the input words), 157 | and returns the properly rounded 64-bit integer corresponding to the input. 158 | If `zSign' is 1, the input is negated before being converted to an integer. 159 | Ordinarily, the fixed-point input is simply rounded to an integer, with 160 | the inexact exception raised if the input cannot be represented exactly as 161 | an integer. However, if the fixed-point input is too large, the invalid 162 | exception is raised and the largest positive or negative integer is 163 | returned. 164 *----------------------------------------------------------------------------*/ 165 166 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 167 { 168 int8 roundingMode; 169 flag roundNearestEven, increment; 170 int64 z; 171 172 roundingMode = float_rounding_mode(); 173 roundNearestEven = ( roundingMode == float_round_nearest_even ); 174 increment = ( (sbits64) absZ1 < 0 ); 175 if ( ! roundNearestEven ) { 176 if ( roundingMode == float_round_to_zero ) { 177 increment = 0; 178 } 179 else { 180 if ( zSign ) { 181 increment = ( roundingMode == float_round_down ) && absZ1; 182 } 183 else { 184 increment = ( roundingMode == float_round_up ) && absZ1; 185 } 186 } 187 } 188 if ( increment ) { 189 ++absZ0; 190 if ( absZ0 == 0 ) goto overflow; 191 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 192 } 193 z = absZ0; 194 if ( zSign ) z = - z; 195 if ( z && ( ( z < 0 ) ^ zSign ) ) { 196 overflow: 197 float_raise( float_flag_invalid ); 198 return 199 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 200 : LIT64( 0x7FFFFFFFFFFFFFFF ); 201 } 202 if ( absZ1 ) float_set_inexact(); 203 return z; 204 205 } 206 207 /* same as above, but for unsigned values */ 208 static uint64 roundAndPackUInt64( bits64 absZ0, bits64 absZ1 ) 209 { 210 int8 roundingMode; 211 flag roundNearestEven, increment; 212 uint64 z; 213 214 roundingMode = float_rounding_mode(); 215 roundNearestEven = ( roundingMode == float_round_nearest_even ); 216 increment = ( (sbits64) absZ1 < 0 ); 217 if ( ! roundNearestEven ) { 218 if ( roundingMode == float_round_to_zero ) { 219 increment = 0; 220 } 221 else { 222 increment = ( roundingMode == float_round_up ) && absZ1; 223 } 224 } 225 if ( increment ) { 226 ++absZ0; 227 if ( absZ0 == 0 ) { 228 float_raise( float_flag_invalid ); 229 return LIT64( 0x7FFFFFFFFFFFFFFF ); 230 } 231 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 232 } 233 z = absZ0; 234 if ( absZ1 ) float_set_inexact(); 235 return z; 236 237 } 238 #endif /* SOFTFLOAT_FOR_GCC */ 239 240 /*---------------------------------------------------------------------------- 241 | Returns the fraction bits of the single-precision floating-point value `a'. 242 *----------------------------------------------------------------------------*/ 243 244 INLINE bits32 extractFloat32Frac( float32 a ) 245 { 246 247 return a & 0x007FFFFF; 248 249 } 250 251 /*---------------------------------------------------------------------------- 252 | Returns the exponent bits of the single-precision floating-point value `a'. 253 *----------------------------------------------------------------------------*/ 254 255 INLINE int16 extractFloat32Exp( float32 a ) 256 { 257 258 return ( a>>23 ) & 0xFF; 259 260 } 261 262 /*---------------------------------------------------------------------------- 263 | Returns the sign bit of the single-precision floating-point value `a'. 264 *----------------------------------------------------------------------------*/ 265 266 INLINE flag extractFloat32Sign( float32 a ) 267 { 268 269 return a>>31; 270 271 } 272 273 /*---------------------------------------------------------------------------- 274 | Normalizes the subnormal single-precision floating-point value represented 275 | by the denormalized significand `aSig'. The normalized exponent and 276 | significand are stored at the locations pointed to by `zExpPtr' and 277 | `zSigPtr', respectively. 278 *----------------------------------------------------------------------------*/ 279 280 static void 281 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 282 { 283 int8 shiftCount; 284 285 shiftCount = countLeadingZeros32( aSig ) - 8; 286 *zSigPtr = aSig<<shiftCount; 287 *zExpPtr = 1 - shiftCount; 288 289 } 290 291 /*---------------------------------------------------------------------------- 292 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 293 | single-precision floating-point value, returning the result. After being 294 | shifted into the proper positions, the three fields are simply added 295 | together to form the result. This means that any integer portion of `zSig' 296 | will be added into the exponent. Since a properly normalized significand 297 | will have an integer portion equal to 1, the `zExp' input should be 1 less 298 | than the desired result exponent whenever `zSig' is a complete, normalized 299 | significand. 300 *----------------------------------------------------------------------------*/ 301 302 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 303 { 304 305 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 306 307 } 308 309 /*---------------------------------------------------------------------------- 310 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 311 | and significand `zSig', and returns the proper single-precision floating- 312 | point value corresponding to the abstract input. Ordinarily, the abstract 313 | value is simply rounded and packed into the single-precision format, with 314 | the inexact exception raised if the abstract input cannot be represented 315 | exactly. However, if the abstract value is too large, the overflow and 316 | inexact exceptions are raised and an infinity or maximal finite value is 317 | returned. If the abstract value is too small, the input value is rounded to 318 | a subnormal number, and the underflow and inexact exceptions are raised if 319 | the abstract input cannot be represented exactly as a subnormal single- 320 | precision floating-point number. 321 | The input significand `zSig' has its binary point between bits 30 322 | and 29, which is 7 bits to the left of the usual location. This shifted 323 | significand must be normalized or smaller. If `zSig' is not normalized, 324 | `zExp' must be 0; in that case, the result returned is a subnormal number, 325 | and it must not require rounding. In the usual case that `zSig' is 326 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 327 | The handling of underflow and overflow follows the IEC/IEEE Standard for 328 | Binary Floating-Point Arithmetic. 329 *----------------------------------------------------------------------------*/ 330 331 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 332 { 333 int8 roundingMode; 334 flag roundNearestEven; 335 int8 roundIncrement, roundBits; 336 flag isTiny; 337 338 roundingMode = float_rounding_mode(); 339 roundNearestEven = ( roundingMode == float_round_nearest_even ); 340 roundIncrement = 0x40; 341 if ( ! roundNearestEven ) { 342 if ( roundingMode == float_round_to_zero ) { 343 roundIncrement = 0; 344 } 345 else { 346 roundIncrement = 0x7F; 347 if ( zSign ) { 348 if ( roundingMode == float_round_up ) roundIncrement = 0; 349 } 350 else { 351 if ( roundingMode == float_round_down ) roundIncrement = 0; 352 } 353 } 354 } 355 roundBits = zSig & 0x7F; 356 if ( 0xFD <= (bits16) zExp ) { 357 if ( ( 0xFD < zExp ) 358 || ( ( zExp == 0xFD ) 359 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 360 ) { 361 float_raise( float_flag_overflow | float_flag_inexact ); 362 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 363 } 364 if ( zExp < 0 ) { 365 isTiny = 366 ( float_detect_tininess == float_tininess_before_rounding ) 367 || ( zExp < -1 ) 368 || ( zSig + roundIncrement < 0x80000000 ); 369 shift32RightJamming( zSig, - zExp, &zSig ); 370 zExp = 0; 371 roundBits = zSig & 0x7F; 372 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 373 } 374 } 375 if ( roundBits ) float_set_inexact(); 376 zSig = ( zSig + roundIncrement )>>7; 377 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 378 if ( zSig == 0 ) zExp = 0; 379 return packFloat32( zSign, zExp, zSig ); 380 381 } 382 383 /*---------------------------------------------------------------------------- 384 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 385 | and significand `zSig', and returns the proper single-precision floating- 386 | point value corresponding to the abstract input. This routine is just like 387 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 388 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 389 | floating-point exponent. 390 *----------------------------------------------------------------------------*/ 391 392 static float32 393 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 394 { 395 int8 shiftCount; 396 397 shiftCount = countLeadingZeros32( zSig ) - 1; 398 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 399 400 } 401 402 /*---------------------------------------------------------------------------- 403 | Returns the fraction bits of the double-precision floating-point value `a'. 404 *----------------------------------------------------------------------------*/ 405 406 INLINE bits64 extractFloat64Frac( float64 a ) 407 { 408 409 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 410 411 } 412 413 /*---------------------------------------------------------------------------- 414 | Returns the exponent bits of the double-precision floating-point value `a'. 415 *----------------------------------------------------------------------------*/ 416 417 INLINE int16 extractFloat64Exp( float64 a ) 418 { 419 420 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 421 422 } 423 /*---------------------------------------------------------------------------- 424 | Returns the sign bit of the double-precision floating-point value `a'. 425 *----------------------------------------------------------------------------*/ 426 427 INLINE flag extractFloat64Sign( float64 a ) 428 { 429 430 return FLOAT64_DEMANGLE(a)>>63; 431 432 } 433 434 /*---------------------------------------------------------------------------- 435 | Normalizes the subnormal double-precision floating-point value represented 436 | by the denormalized significand `aSig'. The normalized exponent and 437 | significand are stored at the locations pointed to by `zExpPtr' and 438 | `zSigPtr', respectively. 439 *----------------------------------------------------------------------------*/ 440 441 static void 442 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 443 { 444 int8 shiftCount; 445 446 shiftCount = countLeadingZeros64( aSig ) - 11; 447 *zSigPtr = aSig<<shiftCount; 448 *zExpPtr = 1 - shiftCount; 449 450 } 451 452 /*---------------------------------------------------------------------------- 453 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 454 | double-precision floating-point value, returning the result. After being 455 | shifted into the proper positions, the three fields are simply added 456 | together to form the result. This means that any integer portion of `zSig' 457 | will be added into the exponent. Since a properly normalized significand 458 | will have an integer portion equal to 1, the `zExp' input should be 1 less 459 | than the desired result exponent whenever `zSig' is a complete, normalized 460 | significand. 461 *----------------------------------------------------------------------------*/ 462 463 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 464 { 465 466 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 467 ( ( (bits64) zExp )<<52 ) + zSig ); 468 469 } 470 471 /*---------------------------------------------------------------------------- 472 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 473 | and significand `zSig', and returns the proper double-precision floating- 474 | point value corresponding to the abstract input. Ordinarily, the abstract 475 | value is simply rounded and packed into the double-precision format, with 476 | the inexact exception raised if the abstract input cannot be represented 477 | exactly. However, if the abstract value is too large, the overflow and 478 | inexact exceptions are raised and an infinity or maximal finite value is 479 | returned. If the abstract value is too small, the input value is rounded 480 | to a subnormal number, and the underflow and inexact exceptions are raised 481 | if the abstract input cannot be represented exactly as a subnormal double- 482 | precision floating-point number. 483 | The input significand `zSig' has its binary point between bits 62 484 | and 61, which is 10 bits to the left of the usual location. This shifted 485 | significand must be normalized or smaller. If `zSig' is not normalized, 486 | `zExp' must be 0; in that case, the result returned is a subnormal number, 487 | and it must not require rounding. In the usual case that `zSig' is 488 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 489 | The handling of underflow and overflow follows the IEC/IEEE Standard for 490 | Binary Floating-Point Arithmetic. 491 *----------------------------------------------------------------------------*/ 492 493 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 494 { 495 int8 roundingMode; 496 flag roundNearestEven; 497 int16 roundIncrement, roundBits; 498 flag isTiny; 499 500 roundingMode = float_rounding_mode(); 501 roundNearestEven = ( roundingMode == float_round_nearest_even ); 502 roundIncrement = 0x200; 503 if ( ! roundNearestEven ) { 504 if ( roundingMode == float_round_to_zero ) { 505 roundIncrement = 0; 506 } 507 else { 508 roundIncrement = 0x3FF; 509 if ( zSign ) { 510 if ( roundingMode == float_round_up ) roundIncrement = 0; 511 } 512 else { 513 if ( roundingMode == float_round_down ) roundIncrement = 0; 514 } 515 } 516 } 517 roundBits = zSig & 0x3FF; 518 if ( 0x7FD <= (bits16) zExp ) { 519 if ( ( 0x7FD < zExp ) 520 || ( ( zExp == 0x7FD ) 521 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 522 ) { 523 float_raise( float_flag_overflow | float_flag_inexact ); 524 return FLOAT64_MANGLE( 525 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 526 ( roundIncrement == 0 )); 527 } 528 if ( zExp < 0 ) { 529 isTiny = 530 ( float_detect_tininess == float_tininess_before_rounding ) 531 || ( zExp < -1 ) 532 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 533 shift64RightJamming( zSig, - zExp, &zSig ); 534 zExp = 0; 535 roundBits = zSig & 0x3FF; 536 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 537 } 538 } 539 if ( roundBits ) float_set_inexact(); 540 zSig = ( zSig + roundIncrement )>>10; 541 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 542 if ( zSig == 0 ) zExp = 0; 543 return packFloat64( zSign, zExp, zSig ); 544 545 } 546 547 /*---------------------------------------------------------------------------- 548 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 549 | and significand `zSig', and returns the proper double-precision floating- 550 | point value corresponding to the abstract input. This routine is just like 551 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 552 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 553 | floating-point exponent. 554 *----------------------------------------------------------------------------*/ 555 556 static float64 557 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 558 { 559 int8 shiftCount; 560 561 shiftCount = countLeadingZeros64( zSig ) - 1; 562 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 563 564 } 565 566 #ifdef FLOATX80 567 568 /*---------------------------------------------------------------------------- 569 | Returns the fraction bits of the extended double-precision floating-point 570 | value `a'. 571 *----------------------------------------------------------------------------*/ 572 573 INLINE bits64 extractFloatx80Frac( floatx80 a ) 574 { 575 576 return a.low; 577 578 } 579 580 /*---------------------------------------------------------------------------- 581 | Returns the exponent bits of the extended double-precision floating-point 582 | value `a'. 583 *----------------------------------------------------------------------------*/ 584 585 INLINE int32 extractFloatx80Exp( floatx80 a ) 586 { 587 588 return a.high & 0x7FFF; 589 590 } 591 592 /*---------------------------------------------------------------------------- 593 | Returns the sign bit of the extended double-precision floating-point value 594 | `a'. 595 *----------------------------------------------------------------------------*/ 596 597 INLINE flag extractFloatx80Sign( floatx80 a ) 598 { 599 600 return a.high>>15; 601 602 } 603 604 /*---------------------------------------------------------------------------- 605 | Normalizes the subnormal extended double-precision floating-point value 606 | represented by the denormalized significand `aSig'. The normalized exponent 607 | and significand are stored at the locations pointed to by `zExpPtr' and 608 | `zSigPtr', respectively. 609 *----------------------------------------------------------------------------*/ 610 611 static void 612 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 613 { 614 int8 shiftCount; 615 616 shiftCount = countLeadingZeros64( aSig ); 617 *zSigPtr = aSig<<shiftCount; 618 *zExpPtr = 1 - shiftCount; 619 620 } 621 622 /*---------------------------------------------------------------------------- 623 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 624 | extended double-precision floating-point value, returning the result. 625 *----------------------------------------------------------------------------*/ 626 627 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 628 { 629 floatx80 z; 630 631 z.low = zSig; 632 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 633 return z; 634 635 } 636 637 /*---------------------------------------------------------------------------- 638 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 639 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 640 | and returns the proper extended double-precision floating-point value 641 | corresponding to the abstract input. Ordinarily, the abstract value is 642 | rounded and packed into the extended double-precision format, with the 643 | inexact exception raised if the abstract input cannot be represented 644 | exactly. However, if the abstract value is too large, the overflow and 645 | inexact exceptions are raised and an infinity or maximal finite value is 646 | returned. If the abstract value is too small, the input value is rounded to 647 | a subnormal number, and the underflow and inexact exceptions are raised if 648 | the abstract input cannot be represented exactly as a subnormal extended 649 | double-precision floating-point number. 650 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 651 | number of bits as single or double precision, respectively. Otherwise, the 652 | result is rounded to the full precision of the extended double-precision 653 | format. 654 | The input significand must be normalized or smaller. If the input 655 | significand is not normalized, `zExp' must be 0; in that case, the result 656 | returned is a subnormal number, and it must not require rounding. The 657 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 658 | Floating-Point Arithmetic. 659 *----------------------------------------------------------------------------*/ 660 661 static floatx80 662 roundAndPackFloatx80( 663 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 664 ) 665 { 666 int8 roundingMode; 667 flag roundNearestEven, increment, isTiny; 668 int64 roundIncrement, roundMask, roundBits; 669 670 roundingMode = float_rounding_mode(); 671 roundNearestEven = ( roundingMode == float_round_nearest_even ); 672 if ( roundingPrecision == 80 ) goto precision80; 673 if ( roundingPrecision == 64 ) { 674 roundIncrement = LIT64( 0x0000000000000400 ); 675 roundMask = LIT64( 0x00000000000007FF ); 676 } 677 else if ( roundingPrecision == 32 ) { 678 roundIncrement = LIT64( 0x0000008000000000 ); 679 roundMask = LIT64( 0x000000FFFFFFFFFF ); 680 } 681 else { 682 goto precision80; 683 } 684 zSig0 |= ( zSig1 != 0 ); 685 if ( ! roundNearestEven ) { 686 if ( roundingMode == float_round_to_zero ) { 687 roundIncrement = 0; 688 } 689 else { 690 roundIncrement = roundMask; 691 if ( zSign ) { 692 if ( roundingMode == float_round_up ) roundIncrement = 0; 693 } 694 else { 695 if ( roundingMode == float_round_down ) roundIncrement = 0; 696 } 697 } 698 } 699 roundBits = zSig0 & roundMask; 700 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 701 if ( ( 0x7FFE < zExp ) 702 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 703 ) { 704 goto overflow; 705 } 706 if ( zExp <= 0 ) { 707 isTiny = 708 ( float_detect_tininess == float_tininess_before_rounding ) 709 || ( zExp < 0 ) 710 || ( zSig0 <= zSig0 + roundIncrement ); 711 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 712 zExp = 0; 713 roundBits = zSig0 & roundMask; 714 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 715 if ( roundBits ) float_set_inexact(); 716 zSig0 += roundIncrement; 717 if ( (sbits64) zSig0 < 0 ) zExp = 1; 718 roundIncrement = roundMask + 1; 719 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 720 roundMask |= roundIncrement; 721 } 722 zSig0 &= ~ roundMask; 723 return packFloatx80( zSign, zExp, zSig0 ); 724 } 725 } 726 if ( roundBits ) float_set_inexact(); 727 zSig0 += roundIncrement; 728 if ( zSig0 < roundIncrement ) { 729 ++zExp; 730 zSig0 = LIT64( 0x8000000000000000 ); 731 } 732 roundIncrement = roundMask + 1; 733 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 734 roundMask |= roundIncrement; 735 } 736 zSig0 &= ~ roundMask; 737 if ( zSig0 == 0 ) zExp = 0; 738 return packFloatx80( zSign, zExp, zSig0 ); 739 precision80: 740 increment = ( (sbits64) zSig1 < 0 ); 741 if ( ! roundNearestEven ) { 742 if ( roundingMode == float_round_to_zero ) { 743 increment = 0; 744 } 745 else { 746 if ( zSign ) { 747 increment = ( roundingMode == float_round_down ) && zSig1; 748 } 749 else { 750 increment = ( roundingMode == float_round_up ) && zSig1; 751 } 752 } 753 } 754 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 755 if ( ( 0x7FFE < zExp ) 756 || ( ( zExp == 0x7FFE ) 757 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 758 && increment 759 ) 760 ) { 761 roundMask = 0; 762 overflow: 763 float_raise( float_flag_overflow | float_flag_inexact ); 764 if ( ( roundingMode == float_round_to_zero ) 765 || ( zSign && ( roundingMode == float_round_up ) ) 766 || ( ! zSign && ( roundingMode == float_round_down ) ) 767 ) { 768 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 769 } 770 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 771 } 772 if ( zExp <= 0 ) { 773 isTiny = 774 ( float_detect_tininess == float_tininess_before_rounding ) 775 || ( zExp < 0 ) 776 || ! increment 777 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 778 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 779 zExp = 0; 780 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 781 if ( zSig1 ) float_set_inexact(); 782 if ( roundNearestEven ) { 783 increment = ( (sbits64) zSig1 < 0 ); 784 } 785 else { 786 if ( zSign ) { 787 increment = ( roundingMode == float_round_down ) && zSig1; 788 } 789 else { 790 increment = ( roundingMode == float_round_up ) && zSig1; 791 } 792 } 793 if ( increment ) { 794 ++zSig0; 795 zSig0 &= 796 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 797 if ( (sbits64) zSig0 < 0 ) zExp = 1; 798 } 799 return packFloatx80( zSign, zExp, zSig0 ); 800 } 801 } 802 if ( zSig1 ) float_set_inexact(); 803 if ( increment ) { 804 ++zSig0; 805 if ( zSig0 == 0 ) { 806 ++zExp; 807 zSig0 = LIT64( 0x8000000000000000 ); 808 } 809 else { 810 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 811 } 812 } 813 else { 814 if ( zSig0 == 0 ) zExp = 0; 815 } 816 return packFloatx80( zSign, zExp, zSig0 ); 817 818 } 819 820 /*---------------------------------------------------------------------------- 821 | Takes an abstract floating-point value having sign `zSign', exponent 822 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 823 | and returns the proper extended double-precision floating-point value 824 | corresponding to the abstract input. This routine is just like 825 | `roundAndPackFloatx80' except that the input significand does not have to be 826 | normalized. 827 *----------------------------------------------------------------------------*/ 828 829 static floatx80 830 normalizeRoundAndPackFloatx80( 831 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 832 ) 833 { 834 int8 shiftCount; 835 836 if ( zSig0 == 0 ) { 837 zSig0 = zSig1; 838 zSig1 = 0; 839 zExp -= 64; 840 } 841 shiftCount = countLeadingZeros64( zSig0 ); 842 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 843 zExp -= shiftCount; 844 return 845 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 846 847 } 848 849 #endif 850 851 #ifdef FLOAT128 852 853 /*---------------------------------------------------------------------------- 854 | Returns the least-significant 64 fraction bits of the quadruple-precision 855 | floating-point value `a'. 856 *----------------------------------------------------------------------------*/ 857 858 INLINE bits64 extractFloat128Frac1( float128 a ) 859 { 860 861 return a.low; 862 863 } 864 865 /*---------------------------------------------------------------------------- 866 | Returns the most-significant 48 fraction bits of the quadruple-precision 867 | floating-point value `a'. 868 *----------------------------------------------------------------------------*/ 869 870 INLINE bits64 extractFloat128Frac0( float128 a ) 871 { 872 873 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 874 875 } 876 877 /*---------------------------------------------------------------------------- 878 | Returns the exponent bits of the quadruple-precision floating-point value 879 | `a'. 880 *----------------------------------------------------------------------------*/ 881 882 INLINE int32 extractFloat128Exp( float128 a ) 883 { 884 885 return ( a.high>>48 ) & 0x7FFF; 886 887 } 888 889 890 /*---------------------------------------------------------------------------- 891 | Returns the sign bit of the quadruple-precision floating-point value `a'. 892 *----------------------------------------------------------------------------*/ 893 894 INLINE flag extractFloat128Sign( float128 a ) 895 { 896 897 return a.high>>63; 898 899 } 900 901 /*---------------------------------------------------------------------------- 902 | Normalizes the subnormal quadruple-precision floating-point value 903 | represented by the denormalized significand formed by the concatenation of 904 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 905 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 906 | significand are stored at the location pointed to by `zSig0Ptr', and the 907 | least significant 64 bits of the normalized significand are stored at the 908 | location pointed to by `zSig1Ptr'. 909 *----------------------------------------------------------------------------*/ 910 911 static void 912 normalizeFloat128Subnormal( 913 bits64 aSig0, 914 bits64 aSig1, 915 int32 *zExpPtr, 916 bits64 *zSig0Ptr, 917 bits64 *zSig1Ptr 918 ) 919 { 920 int8 shiftCount; 921 922 if ( aSig0 == 0 ) { 923 shiftCount = countLeadingZeros64( aSig1 ) - 15; 924 if ( shiftCount < 0 ) { 925 *zSig0Ptr = aSig1>>( - shiftCount ); 926 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 927 } 928 else { 929 *zSig0Ptr = aSig1<<shiftCount; 930 *zSig1Ptr = 0; 931 } 932 *zExpPtr = - shiftCount - 63; 933 } 934 else { 935 shiftCount = countLeadingZeros64( aSig0 ) - 15; 936 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 937 *zExpPtr = 1 - shiftCount; 938 } 939 940 } 941 942 /*---------------------------------------------------------------------------- 943 | Packs the sign `zSign', the exponent `zExp', and the significand formed 944 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 945 | floating-point value, returning the result. After being shifted into the 946 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 947 | added together to form the most significant 32 bits of the result. This 948 | means that any integer portion of `zSig0' will be added into the exponent. 949 | Since a properly normalized significand will have an integer portion equal 950 | to 1, the `zExp' input should be 1 less than the desired result exponent 951 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 952 | significand. 953 *----------------------------------------------------------------------------*/ 954 955 INLINE float128 956 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 957 { 958 float128 z; 959 960 z.low = zSig1; 961 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 962 return z; 963 964 } 965 966 /*---------------------------------------------------------------------------- 967 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 968 | and extended significand formed by the concatenation of `zSig0', `zSig1', 969 | and `zSig2', and returns the proper quadruple-precision floating-point value 970 | corresponding to the abstract input. Ordinarily, the abstract value is 971 | simply rounded and packed into the quadruple-precision format, with the 972 | inexact exception raised if the abstract input cannot be represented 973 | exactly. However, if the abstract value is too large, the overflow and 974 | inexact exceptions are raised and an infinity or maximal finite value is 975 | returned. If the abstract value is too small, the input value is rounded to 976 | a subnormal number, and the underflow and inexact exceptions are raised if 977 | the abstract input cannot be represented exactly as a subnormal quadruple- 978 | precision floating-point number. 979 | The input significand must be normalized or smaller. If the input 980 | significand is not normalized, `zExp' must be 0; in that case, the result 981 | returned is a subnormal number, and it must not require rounding. In the 982 | usual case that the input significand is normalized, `zExp' must be 1 less 983 | than the ``true'' floating-point exponent. The handling of underflow and 984 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 985 *----------------------------------------------------------------------------*/ 986 987 static float128 988 roundAndPackFloat128( 989 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 990 { 991 int8 roundingMode; 992 flag roundNearestEven, increment, isTiny; 993 994 roundingMode = float_rounding_mode(); 995 roundNearestEven = ( roundingMode == float_round_nearest_even ); 996 increment = ( (sbits64) zSig2 < 0 ); 997 if ( ! roundNearestEven ) { 998 if ( roundingMode == float_round_to_zero ) { 999 increment = 0; 1000 } 1001 else { 1002 if ( zSign ) { 1003 increment = ( roundingMode == float_round_down ) && zSig2; 1004 } 1005 else { 1006 increment = ( roundingMode == float_round_up ) && zSig2; 1007 } 1008 } 1009 } 1010 if ( 0x7FFD <= (bits32) zExp ) { 1011 if ( ( 0x7FFD < zExp ) 1012 || ( ( zExp == 0x7FFD ) 1013 && eq128( 1014 LIT64( 0x0001FFFFFFFFFFFF ), 1015 LIT64( 0xFFFFFFFFFFFFFFFF ), 1016 zSig0, 1017 zSig1 1018 ) 1019 && increment 1020 ) 1021 ) { 1022 float_raise( float_flag_overflow | float_flag_inexact ); 1023 if ( ( roundingMode == float_round_to_zero ) 1024 || ( zSign && ( roundingMode == float_round_up ) ) 1025 || ( ! zSign && ( roundingMode == float_round_down ) ) 1026 ) { 1027 return 1028 packFloat128( 1029 zSign, 1030 0x7FFE, 1031 LIT64( 0x0000FFFFFFFFFFFF ), 1032 LIT64( 0xFFFFFFFFFFFFFFFF ) 1033 ); 1034 } 1035 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1036 } 1037 if ( zExp < 0 ) { 1038 isTiny = 1039 ( float_detect_tininess == float_tininess_before_rounding ) 1040 || ( zExp < -1 ) 1041 || ! increment 1042 || lt128( 1043 zSig0, 1044 zSig1, 1045 LIT64( 0x0001FFFFFFFFFFFF ), 1046 LIT64( 0xFFFFFFFFFFFFFFFF ) 1047 ); 1048 shift128ExtraRightJamming( 1049 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1050 zExp = 0; 1051 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1052 if ( roundNearestEven ) { 1053 increment = ( (sbits64) zSig2 < 0 ); 1054 } 1055 else { 1056 if ( zSign ) { 1057 increment = ( roundingMode == float_round_down ) && zSig2; 1058 } 1059 else { 1060 increment = ( roundingMode == float_round_up ) && zSig2; 1061 } 1062 } 1063 } 1064 } 1065 if ( zSig2 ) float_set_inexact(); 1066 if ( increment ) { 1067 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1068 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1069 } 1070 else { 1071 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1072 } 1073 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1074 1075 } 1076 1077 /*---------------------------------------------------------------------------- 1078 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1079 | and significand formed by the concatenation of `zSig0' and `zSig1', and 1080 | returns the proper quadruple-precision floating-point value corresponding 1081 | to the abstract input. This routine is just like `roundAndPackFloat128' 1082 | except that the input significand has fewer bits and does not have to be 1083 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1084 | point exponent. 1085 *----------------------------------------------------------------------------*/ 1086 1087 static float128 1088 normalizeRoundAndPackFloat128( 1089 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1090 { 1091 int8 shiftCount; 1092 bits64 zSig2; 1093 1094 if ( zSig0 == 0 ) { 1095 zSig0 = zSig1; 1096 zSig1 = 0; 1097 zExp -= 64; 1098 } 1099 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1100 if ( 0 <= shiftCount ) { 1101 zSig2 = 0; 1102 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1103 } 1104 else { 1105 shift128ExtraRightJamming( 1106 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1107 } 1108 zExp -= shiftCount; 1109 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1110 1111 } 1112 1113 #endif 1114 1115 /*---------------------------------------------------------------------------- 1116 | Returns the result of converting the 32-bit two's complement integer `a' 1117 | to the single-precision floating-point format. The conversion is performed 1118 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1119 *----------------------------------------------------------------------------*/ 1120 1121 float32 int32_to_float32( int32 a ) 1122 { 1123 flag zSign; 1124 1125 if ( a == 0 ) return 0; 1126 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1127 zSign = ( a < 0 ); 1128 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 1129 1130 } 1131 1132 /*---------------------------------------------------------------------------- 1133 | Returns the result of converting the 32-bit two's complement integer `a' 1134 | to the double-precision floating-point format. The conversion is performed 1135 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1136 *----------------------------------------------------------------------------*/ 1137 1138 float64 int32_to_float64( int32 a ) 1139 { 1140 flag zSign; 1141 uint32 absA; 1142 int8 shiftCount; 1143 bits64 zSig; 1144 1145 if ( a == 0 ) return 0; 1146 zSign = ( a < 0 ); 1147 absA = zSign ? - a : a; 1148 shiftCount = countLeadingZeros32( absA ) + 21; 1149 zSig = absA; 1150 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1151 1152 } 1153 1154 #ifdef FLOATX80 1155 1156 /*---------------------------------------------------------------------------- 1157 | Returns the result of converting the 32-bit two's complement integer `a' 1158 | to the extended double-precision floating-point format. The conversion 1159 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1160 | Arithmetic. 1161 *----------------------------------------------------------------------------*/ 1162 1163 floatx80 int32_to_floatx80( int32 a ) 1164 { 1165 flag zSign; 1166 uint32 absA; 1167 int8 shiftCount; 1168 bits64 zSig; 1169 1170 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1171 zSign = ( a < 0 ); 1172 absA = zSign ? - a : a; 1173 shiftCount = countLeadingZeros32( absA ) + 32; 1174 zSig = absA; 1175 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1176 1177 } 1178 1179 #endif 1180 1181 #ifdef FLOAT128 1182 1183 /*---------------------------------------------------------------------------- 1184 | Returns the result of converting the 32-bit two's complement integer `a' to 1185 | the quadruple-precision floating-point format. The conversion is performed 1186 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1187 *----------------------------------------------------------------------------*/ 1188 1189 float128 int32_to_float128( int32 a ) 1190 { 1191 flag zSign; 1192 uint32 absA; 1193 int8 shiftCount; 1194 bits64 zSig0; 1195 1196 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1197 zSign = ( a < 0 ); 1198 absA = zSign ? - a : a; 1199 shiftCount = countLeadingZeros32( absA ) + 17; 1200 zSig0 = absA; 1201 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1202 1203 } 1204 1205 #endif 1206 1207 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1208 /*---------------------------------------------------------------------------- 1209 | Returns the result of converting the 64-bit two's complement integer `a' 1210 | to the single-precision floating-point format. The conversion is performed 1211 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1212 *----------------------------------------------------------------------------*/ 1213 1214 float32 int64_to_float32( int64 a ) 1215 { 1216 flag zSign; 1217 uint64 absA; 1218 int8 shiftCount; 1219 1220 if ( a == 0 ) return 0; 1221 zSign = ( a < 0 ); 1222 absA = zSign ? - a : a; 1223 shiftCount = countLeadingZeros64( absA ) - 40; 1224 if ( 0 <= shiftCount ) { 1225 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1226 } 1227 else { 1228 shiftCount += 7; 1229 if ( shiftCount < 0 ) { 1230 shift64RightJamming( absA, - shiftCount, &absA ); 1231 } 1232 else { 1233 absA <<= shiftCount; 1234 } 1235 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1236 } 1237 1238 } 1239 1240 /*---------------------------------------------------------------------------- 1241 | Returns the result of converting the 64-bit two's complement integer `a' 1242 | to the double-precision floating-point format. The conversion is performed 1243 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1244 *----------------------------------------------------------------------------*/ 1245 1246 float64 int64_to_float64( int64 a ) 1247 { 1248 flag zSign; 1249 1250 if ( a == 0 ) return 0; 1251 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1252 return packFloat64( 1, 0x43E, 0 ); 1253 } 1254 zSign = ( a < 0 ); 1255 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1256 1257 } 1258 1259 #ifdef FLOATX80 1260 1261 /*---------------------------------------------------------------------------- 1262 | Returns the result of converting the 64-bit two's complement integer `a' 1263 | to the extended double-precision floating-point format. The conversion 1264 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1265 | Arithmetic. 1266 *----------------------------------------------------------------------------*/ 1267 1268 floatx80 int64_to_floatx80( int64 a ) 1269 { 1270 flag zSign; 1271 uint64 absA; 1272 int8 shiftCount; 1273 1274 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1275 zSign = ( a < 0 ); 1276 absA = zSign ? - a : a; 1277 shiftCount = countLeadingZeros64( absA ); 1278 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1279 1280 } 1281 1282 #endif 1283 1284 #ifdef FLOAT128 1285 1286 /*---------------------------------------------------------------------------- 1287 | Returns the result of converting the 64-bit two's complement integer `a' to 1288 | the quadruple-precision floating-point format. The conversion is performed 1289 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1290 *----------------------------------------------------------------------------*/ 1291 1292 float128 int64_to_float128( int64 a ) 1293 { 1294 flag zSign; 1295 uint64 absA; 1296 int8 shiftCount; 1297 int32 zExp; 1298 bits64 zSig0, zSig1; 1299 1300 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1301 zSign = ( a < 0 ); 1302 absA = zSign ? - a : a; 1303 shiftCount = countLeadingZeros64( absA ) + 49; 1304 zExp = 0x406E - shiftCount; 1305 if ( 64 <= shiftCount ) { 1306 zSig1 = 0; 1307 zSig0 = absA; 1308 shiftCount -= 64; 1309 } 1310 else { 1311 zSig1 = absA; 1312 zSig0 = 0; 1313 } 1314 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1315 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1316 1317 } 1318 1319 #endif 1320 #endif /* !SOFTFLOAT_FOR_GCC */ 1321 1322 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1323 /*---------------------------------------------------------------------------- 1324 | Returns the result of converting the single-precision floating-point value 1325 | `a' to the 32-bit two's complement integer format. The conversion is 1326 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1327 | Arithmetic---which means in particular that the conversion is rounded 1328 | according to the current rounding mode. If `a' is a NaN, the largest 1329 | positive integer is returned. Otherwise, if the conversion overflows, the 1330 | largest integer with the same sign as `a' is returned. 1331 *----------------------------------------------------------------------------*/ 1332 1333 int32 float32_to_int32( float32 a ) 1334 { 1335 flag aSign; 1336 int16 aExp, shiftCount; 1337 bits32 aSig; 1338 bits64 aSig64; 1339 1340 aSig = extractFloat32Frac( a ); 1341 aExp = extractFloat32Exp( a ); 1342 aSign = extractFloat32Sign( a ); 1343 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1344 if ( aExp ) aSig |= 0x00800000; 1345 shiftCount = 0xAF - aExp; 1346 aSig64 = aSig; 1347 aSig64 <<= 32; 1348 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1349 return roundAndPackInt32( aSign, aSig64 ); 1350 1351 } 1352 #endif /* !SOFTFLOAT_FOR_GCC */ 1353 1354 /*---------------------------------------------------------------------------- 1355 | Returns the result of converting the single-precision floating-point value 1356 | `a' to the 32-bit two's complement integer format. The conversion is 1357 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1358 | Arithmetic, except that the conversion is always rounded toward zero. 1359 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1360 | the conversion overflows, the largest integer with the same sign as `a' is 1361 | returned. 1362 *----------------------------------------------------------------------------*/ 1363 1364 int32 float32_to_int32_round_to_zero( float32 a ) 1365 { 1366 flag aSign; 1367 int16 aExp, shiftCount; 1368 bits32 aSig; 1369 int32 z; 1370 1371 aSig = extractFloat32Frac( a ); 1372 aExp = extractFloat32Exp( a ); 1373 aSign = extractFloat32Sign( a ); 1374 shiftCount = aExp - 0x9E; 1375 if ( 0 <= shiftCount ) { 1376 if ( a != 0xCF000000 ) { 1377 float_raise( float_flag_invalid ); 1378 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1379 } 1380 return (sbits32) 0x80000000; 1381 } 1382 else if ( aExp <= 0x7E ) { 1383 if ( aExp | aSig ) float_set_inexact(); 1384 return 0; 1385 } 1386 aSig = ( aSig | 0x00800000 )<<8; 1387 z = aSig>>( - shiftCount ); 1388 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1389 float_set_inexact(); 1390 } 1391 if ( aSign ) z = - z; 1392 return z; 1393 1394 } 1395 1396 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1397 /*---------------------------------------------------------------------------- 1398 | Returns the result of converting the single-precision floating-point value 1399 | `a' to the 64-bit two's complement integer format. The conversion is 1400 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1401 | Arithmetic---which means in particular that the conversion is rounded 1402 | according to the current rounding mode. If `a' is a NaN, the largest 1403 | positive integer is returned. Otherwise, if the conversion overflows, the 1404 | largest integer with the same sign as `a' is returned. 1405 *----------------------------------------------------------------------------*/ 1406 1407 int64 float32_to_int64( float32 a ) 1408 { 1409 flag aSign; 1410 int16 aExp, shiftCount; 1411 bits32 aSig; 1412 bits64 aSig64, aSigExtra; 1413 1414 aSig = extractFloat32Frac( a ); 1415 aExp = extractFloat32Exp( a ); 1416 aSign = extractFloat32Sign( a ); 1417 shiftCount = 0xBE - aExp; 1418 if ( shiftCount < 0 ) { 1419 float_raise( float_flag_invalid ); 1420 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1421 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1422 } 1423 return (sbits64) LIT64( 0x8000000000000000 ); 1424 } 1425 if ( aExp ) aSig |= 0x00800000; 1426 aSig64 = aSig; 1427 aSig64 <<= 40; 1428 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1429 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1430 1431 } 1432 1433 /*---------------------------------------------------------------------------- 1434 | Returns the result of converting the single-precision floating-point value 1435 | `a' to the 64-bit two's complement integer format. The conversion is 1436 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1437 | Arithmetic, except that the conversion is always rounded toward zero. If 1438 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1439 | conversion overflows, the largest integer with the same sign as `a' is 1440 | returned. 1441 *----------------------------------------------------------------------------*/ 1442 1443 int64 float32_to_int64_round_to_zero( float32 a ) 1444 { 1445 flag aSign; 1446 int16 aExp, shiftCount; 1447 bits32 aSig; 1448 bits64 aSig64; 1449 int64 z; 1450 1451 aSig = extractFloat32Frac( a ); 1452 aExp = extractFloat32Exp( a ); 1453 aSign = extractFloat32Sign( a ); 1454 shiftCount = aExp - 0xBE; 1455 if ( 0 <= shiftCount ) { 1456 if ( a != 0xDF000000 ) { 1457 float_raise( float_flag_invalid ); 1458 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1459 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1460 } 1461 } 1462 return (sbits64) LIT64( 0x8000000000000000 ); 1463 } 1464 else if ( aExp <= 0x7E ) { 1465 if ( aExp | aSig ) float_set_inexact(); 1466 return 0; 1467 } 1468 aSig64 = aSig | 0x00800000; 1469 aSig64 <<= 40; 1470 z = aSig64>>( - shiftCount ); 1471 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1472 float_set_inexact(); 1473 } 1474 if ( aSign ) z = - z; 1475 return z; 1476 1477 } 1478 #endif /* !SOFTFLOAT_FOR_GCC */ 1479 1480 /*---------------------------------------------------------------------------- 1481 | Returns the result of converting the single-precision floating-point value 1482 | `a' to the double-precision floating-point format. The conversion is 1483 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1484 | Arithmetic. 1485 *----------------------------------------------------------------------------*/ 1486 1487 float64 float32_to_float64( float32 a ) 1488 { 1489 flag aSign; 1490 int16 aExp; 1491 bits32 aSig; 1492 1493 aSig = extractFloat32Frac( a ); 1494 aExp = extractFloat32Exp( a ); 1495 aSign = extractFloat32Sign( a ); 1496 if ( aExp == 0xFF ) { 1497 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1498 return packFloat64( aSign, 0x7FF, 0 ); 1499 } 1500 if ( aExp == 0 ) { 1501 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1502 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1503 --aExp; 1504 } 1505 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1506 1507 } 1508 1509 #ifdef FLOATX80 1510 1511 /*---------------------------------------------------------------------------- 1512 | Returns the result of converting the single-precision floating-point value 1513 | `a' to the extended double-precision floating-point format. The conversion 1514 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1515 | Arithmetic. 1516 *----------------------------------------------------------------------------*/ 1517 1518 floatx80 float32_to_floatx80( float32 a ) 1519 { 1520 flag aSign; 1521 int16 aExp; 1522 bits32 aSig; 1523 1524 aSig = extractFloat32Frac( a ); 1525 aExp = extractFloat32Exp( a ); 1526 aSign = extractFloat32Sign( a ); 1527 if ( aExp == 0xFF ) { 1528 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1529 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1530 } 1531 if ( aExp == 0 ) { 1532 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1533 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1534 } 1535 aSig |= 0x00800000; 1536 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1537 1538 } 1539 1540 #endif 1541 1542 #ifdef FLOAT128 1543 1544 /*---------------------------------------------------------------------------- 1545 | Returns the result of converting the single-precision floating-point value 1546 | `a' to the double-precision floating-point format. The conversion is 1547 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1548 | Arithmetic. 1549 *----------------------------------------------------------------------------*/ 1550 1551 float128 float32_to_float128( float32 a ) 1552 { 1553 flag aSign; 1554 int16 aExp; 1555 bits32 aSig; 1556 1557 aSig = extractFloat32Frac( a ); 1558 aExp = extractFloat32Exp( a ); 1559 aSign = extractFloat32Sign( a ); 1560 if ( aExp == 0xFF ) { 1561 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1562 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1563 } 1564 if ( aExp == 0 ) { 1565 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1566 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1567 --aExp; 1568 } 1569 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1570 1571 } 1572 1573 #endif 1574 1575 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1576 /*---------------------------------------------------------------------------- 1577 | Rounds the single-precision floating-point value `a' to an integer, and 1578 | returns the result as a single-precision floating-point value. The 1579 | operation is performed according to the IEC/IEEE Standard for Binary 1580 | Floating-Point Arithmetic. 1581 *----------------------------------------------------------------------------*/ 1582 1583 float32 float32_round_to_int( float32 a ) 1584 { 1585 flag aSign; 1586 int16 aExp; 1587 bits32 lastBitMask, roundBitsMask; 1588 int8 roundingMode; 1589 float32 z; 1590 1591 aExp = extractFloat32Exp( a ); 1592 if ( 0x96 <= aExp ) { 1593 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1594 return propagateFloat32NaN( a, a ); 1595 } 1596 return a; 1597 } 1598 if ( aExp <= 0x7E ) { 1599 if ( (bits32) ( a<<1 ) == 0 ) return a; 1600 float_set_inexact(); 1601 aSign = extractFloat32Sign( a ); 1602 switch ( float_rounding_mode() ) { 1603 case float_round_nearest_even: 1604 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1605 return packFloat32( aSign, 0x7F, 0 ); 1606 } 1607 break; 1608 case float_round_down: 1609 return aSign ? 0xBF800000 : 0; 1610 case float_round_up: 1611 return aSign ? 0x80000000 : 0x3F800000; 1612 } 1613 return packFloat32( aSign, 0, 0 ); 1614 } 1615 lastBitMask = 1; 1616 lastBitMask <<= 0x96 - aExp; 1617 roundBitsMask = lastBitMask - 1; 1618 z = a; 1619 roundingMode = float_rounding_mode(); 1620 if ( roundingMode == float_round_nearest_even ) { 1621 z += lastBitMask>>1; 1622 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1623 } 1624 else if ( roundingMode != float_round_to_zero ) { 1625 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1626 z += roundBitsMask; 1627 } 1628 } 1629 z &= ~ roundBitsMask; 1630 if ( z != a ) float_set_inexact(); 1631 return z; 1632 1633 } 1634 #endif /* !SOFTFLOAT_FOR_GCC */ 1635 1636 /*---------------------------------------------------------------------------- 1637 | Returns the result of adding the absolute values of the single-precision 1638 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1639 | before being returned. `zSign' is ignored if the result is a NaN. 1640 | The addition is performed according to the IEC/IEEE Standard for Binary 1641 | Floating-Point Arithmetic. 1642 *----------------------------------------------------------------------------*/ 1643 1644 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1645 { 1646 int16 aExp, bExp, zExp; 1647 bits32 aSig, bSig, zSig; 1648 int16 expDiff; 1649 1650 aSig = extractFloat32Frac( a ); 1651 aExp = extractFloat32Exp( a ); 1652 bSig = extractFloat32Frac( b ); 1653 bExp = extractFloat32Exp( b ); 1654 expDiff = aExp - bExp; 1655 aSig <<= 6; 1656 bSig <<= 6; 1657 if ( 0 < expDiff ) { 1658 if ( aExp == 0xFF ) { 1659 if ( aSig ) return propagateFloat32NaN( a, b ); 1660 return a; 1661 } 1662 if ( bExp == 0 ) { 1663 --expDiff; 1664 } 1665 else { 1666 bSig |= 0x20000000; 1667 } 1668 shift32RightJamming( bSig, expDiff, &bSig ); 1669 zExp = aExp; 1670 } 1671 else if ( expDiff < 0 ) { 1672 if ( bExp == 0xFF ) { 1673 if ( bSig ) return propagateFloat32NaN( a, b ); 1674 return packFloat32( zSign, 0xFF, 0 ); 1675 } 1676 if ( aExp == 0 ) { 1677 ++expDiff; 1678 } 1679 else { 1680 aSig |= 0x20000000; 1681 } 1682 shift32RightJamming( aSig, - expDiff, &aSig ); 1683 zExp = bExp; 1684 } 1685 else { 1686 if ( aExp == 0xFF ) { 1687 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1688 return a; 1689 } 1690 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1691 zSig = 0x40000000 + aSig + bSig; 1692 zExp = aExp; 1693 goto roundAndPack; 1694 } 1695 aSig |= 0x20000000; 1696 zSig = ( aSig + bSig )<<1; 1697 --zExp; 1698 if ( (sbits32) zSig < 0 ) { 1699 zSig = aSig + bSig; 1700 ++zExp; 1701 } 1702 roundAndPack: 1703 return roundAndPackFloat32( zSign, zExp, zSig ); 1704 1705 } 1706 1707 /*---------------------------------------------------------------------------- 1708 | Returns the result of subtracting the absolute values of the single- 1709 | precision floating-point values `a' and `b'. If `zSign' is 1, the 1710 | difference is negated before being returned. `zSign' is ignored if the 1711 | result is a NaN. The subtraction is performed according to the IEC/IEEE 1712 | Standard for Binary Floating-Point Arithmetic. 1713 *----------------------------------------------------------------------------*/ 1714 1715 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1716 { 1717 int16 aExp, bExp, zExp; 1718 bits32 aSig, bSig, zSig; 1719 int16 expDiff; 1720 1721 aSig = extractFloat32Frac( a ); 1722 aExp = extractFloat32Exp( a ); 1723 bSig = extractFloat32Frac( b ); 1724 bExp = extractFloat32Exp( b ); 1725 expDiff = aExp - bExp; 1726 aSig <<= 7; 1727 bSig <<= 7; 1728 if ( 0 < expDiff ) goto aExpBigger; 1729 if ( expDiff < 0 ) goto bExpBigger; 1730 if ( aExp == 0xFF ) { 1731 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1732 float_raise( float_flag_invalid ); 1733 return float32_default_nan; 1734 } 1735 if ( aExp == 0 ) { 1736 aExp = 1; 1737 bExp = 1; 1738 } 1739 if ( bSig < aSig ) goto aBigger; 1740 if ( aSig < bSig ) goto bBigger; 1741 return packFloat32( float_rounding_mode() == float_round_down, 0, 0 ); 1742 bExpBigger: 1743 if ( bExp == 0xFF ) { 1744 if ( bSig ) return propagateFloat32NaN( a, b ); 1745 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1746 } 1747 if ( aExp == 0 ) { 1748 ++expDiff; 1749 } 1750 else { 1751 aSig |= 0x40000000; 1752 } 1753 shift32RightJamming( aSig, - expDiff, &aSig ); 1754 bSig |= 0x40000000; 1755 bBigger: 1756 zSig = bSig - aSig; 1757 zExp = bExp; 1758 zSign ^= 1; 1759 goto normalizeRoundAndPack; 1760 aExpBigger: 1761 if ( aExp == 0xFF ) { 1762 if ( aSig ) return propagateFloat32NaN( a, b ); 1763 return a; 1764 } 1765 if ( bExp == 0 ) { 1766 --expDiff; 1767 } 1768 else { 1769 bSig |= 0x40000000; 1770 } 1771 shift32RightJamming( bSig, expDiff, &bSig ); 1772 aSig |= 0x40000000; 1773 aBigger: 1774 zSig = aSig - bSig; 1775 zExp = aExp; 1776 normalizeRoundAndPack: 1777 --zExp; 1778 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1779 1780 } 1781 1782 /*---------------------------------------------------------------------------- 1783 | Returns the result of adding the single-precision floating-point values `a' 1784 | and `b'. The operation is performed according to the IEC/IEEE Standard for 1785 | Binary Floating-Point Arithmetic. 1786 *----------------------------------------------------------------------------*/ 1787 1788 float32 float32_add( float32 a, float32 b ) 1789 { 1790 flag aSign, bSign; 1791 1792 aSign = extractFloat32Sign( a ); 1793 bSign = extractFloat32Sign( b ); 1794 if ( aSign == bSign ) { 1795 return addFloat32Sigs( a, b, aSign ); 1796 } 1797 else { 1798 return subFloat32Sigs( a, b, aSign ); 1799 } 1800 1801 } 1802 1803 /*---------------------------------------------------------------------------- 1804 | Returns the result of subtracting the single-precision floating-point values 1805 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1806 | for Binary Floating-Point Arithmetic. 1807 *----------------------------------------------------------------------------*/ 1808 1809 float32 float32_sub( float32 a, float32 b ) 1810 { 1811 flag aSign, bSign; 1812 1813 aSign = extractFloat32Sign( a ); 1814 bSign = extractFloat32Sign( b ); 1815 if ( aSign == bSign ) { 1816 return subFloat32Sigs( a, b, aSign ); 1817 } 1818 else { 1819 return addFloat32Sigs( a, b, aSign ); 1820 } 1821 1822 } 1823 1824 /*---------------------------------------------------------------------------- 1825 | Returns the result of multiplying the single-precision floating-point values 1826 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1827 | for Binary Floating-Point Arithmetic. 1828 *----------------------------------------------------------------------------*/ 1829 1830 float32 float32_mul( float32 a, float32 b ) 1831 { 1832 flag aSign, bSign, zSign; 1833 int16 aExp, bExp, zExp; 1834 bits32 aSig, bSig; 1835 bits64 zSig64; 1836 bits32 zSig; 1837 1838 aSig = extractFloat32Frac( a ); 1839 aExp = extractFloat32Exp( a ); 1840 aSign = extractFloat32Sign( a ); 1841 bSig = extractFloat32Frac( b ); 1842 bExp = extractFloat32Exp( b ); 1843 bSign = extractFloat32Sign( b ); 1844 zSign = aSign ^ bSign; 1845 if ( aExp == 0xFF ) { 1846 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1847 return propagateFloat32NaN( a, b ); 1848 } 1849 if ( ( bExp | bSig ) == 0 ) { 1850 float_raise( float_flag_invalid ); 1851 return float32_default_nan; 1852 } 1853 return packFloat32( zSign, 0xFF, 0 ); 1854 } 1855 if ( bExp == 0xFF ) { 1856 if ( bSig ) return propagateFloat32NaN( a, b ); 1857 if ( ( aExp | aSig ) == 0 ) { 1858 float_raise( float_flag_invalid ); 1859 return float32_default_nan; 1860 } 1861 return packFloat32( zSign, 0xFF, 0 ); 1862 } 1863 if ( aExp == 0 ) { 1864 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1865 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1866 } 1867 if ( bExp == 0 ) { 1868 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1869 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1870 } 1871 zExp = aExp + bExp - 0x7F; 1872 aSig = ( aSig | 0x00800000 )<<7; 1873 bSig = ( bSig | 0x00800000 )<<8; 1874 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1875 zSig = zSig64; 1876 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1877 zSig <<= 1; 1878 --zExp; 1879 } 1880 return roundAndPackFloat32( zSign, zExp, zSig ); 1881 1882 } 1883 1884 /*---------------------------------------------------------------------------- 1885 | Returns the result of dividing the single-precision floating-point value `a' 1886 | by the corresponding value `b'. The operation is performed according to the 1887 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1888 *----------------------------------------------------------------------------*/ 1889 1890 float32 float32_div( float32 a, float32 b ) 1891 { 1892 flag aSign, bSign, zSign; 1893 int16 aExp, bExp, zExp; 1894 bits32 aSig, bSig, zSig; 1895 1896 aSig = extractFloat32Frac( a ); 1897 aExp = extractFloat32Exp( a ); 1898 aSign = extractFloat32Sign( a ); 1899 bSig = extractFloat32Frac( b ); 1900 bExp = extractFloat32Exp( b ); 1901 bSign = extractFloat32Sign( b ); 1902 zSign = aSign ^ bSign; 1903 if ( aExp == 0xFF ) { 1904 if ( aSig ) return propagateFloat32NaN( a, b ); 1905 if ( bExp == 0xFF ) { 1906 if ( bSig ) return propagateFloat32NaN( a, b ); 1907 float_raise( float_flag_invalid ); 1908 return float32_default_nan; 1909 } 1910 return packFloat32( zSign, 0xFF, 0 ); 1911 } 1912 if ( bExp == 0xFF ) { 1913 if ( bSig ) return propagateFloat32NaN( a, b ); 1914 return packFloat32( zSign, 0, 0 ); 1915 } 1916 if ( bExp == 0 ) { 1917 if ( bSig == 0 ) { 1918 if ( ( aExp | aSig ) == 0 ) { 1919 float_raise( float_flag_invalid ); 1920 return float32_default_nan; 1921 } 1922 float_raise( float_flag_divbyzero ); 1923 return packFloat32( zSign, 0xFF, 0 ); 1924 } 1925 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1926 } 1927 if ( aExp == 0 ) { 1928 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1929 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1930 } 1931 zExp = aExp - bExp + 0x7D; 1932 aSig = ( aSig | 0x00800000 )<<7; 1933 bSig = ( bSig | 0x00800000 )<<8; 1934 if ( bSig <= ( aSig + aSig ) ) { 1935 aSig >>= 1; 1936 ++zExp; 1937 } 1938 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 1939 if ( ( zSig & 0x3F ) == 0 ) { 1940 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 1941 } 1942 return roundAndPackFloat32( zSign, zExp, zSig ); 1943 1944 } 1945 1946 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1947 /*---------------------------------------------------------------------------- 1948 | Returns the remainder of the single-precision floating-point value `a' 1949 | with respect to the corresponding value `b'. The operation is performed 1950 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1951 *----------------------------------------------------------------------------*/ 1952 1953 float32 float32_rem( float32 a, float32 b ) 1954 { 1955 flag aSign, bSign __unused, zSign; 1956 int16 aExp, bExp, expDiff; 1957 bits32 aSig, bSig; 1958 bits32 q; 1959 bits64 aSig64, bSig64, q64; 1960 bits32 alternateASig; 1961 sbits32 sigMean; 1962 1963 aSig = extractFloat32Frac( a ); 1964 aExp = extractFloat32Exp( a ); 1965 aSign = extractFloat32Sign( a ); 1966 bSig = extractFloat32Frac( b ); 1967 bExp = extractFloat32Exp( b ); 1968 bSign = extractFloat32Sign( b ); 1969 if ( aExp == 0xFF ) { 1970 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1971 return propagateFloat32NaN( a, b ); 1972 } 1973 float_raise( float_flag_invalid ); 1974 return float32_default_nan; 1975 } 1976 if ( bExp == 0xFF ) { 1977 if ( bSig ) return propagateFloat32NaN( a, b ); 1978 return a; 1979 } 1980 if ( bExp == 0 ) { 1981 if ( bSig == 0 ) { 1982 float_raise( float_flag_invalid ); 1983 return float32_default_nan; 1984 } 1985 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1986 } 1987 if ( aExp == 0 ) { 1988 if ( aSig == 0 ) return a; 1989 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1990 } 1991 expDiff = aExp - bExp; 1992 aSig |= 0x00800000; 1993 bSig |= 0x00800000; 1994 if ( expDiff < 32 ) { 1995 aSig <<= 8; 1996 bSig <<= 8; 1997 if ( expDiff < 0 ) { 1998 if ( expDiff < -1 ) return a; 1999 aSig >>= 1; 2000 } 2001 q = ( bSig <= aSig ); 2002 if ( q ) aSig -= bSig; 2003 if ( 0 < expDiff ) { 2004 q = ( ( (bits64) aSig )<<32 ) / bSig; 2005 q >>= 32 - expDiff; 2006 bSig >>= 2; 2007 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2008 } 2009 else { 2010 aSig >>= 2; 2011 bSig >>= 2; 2012 } 2013 } 2014 else { 2015 if ( bSig <= aSig ) aSig -= bSig; 2016 aSig64 = ( (bits64) aSig )<<40; 2017 bSig64 = ( (bits64) bSig )<<40; 2018 expDiff -= 64; 2019 while ( 0 < expDiff ) { 2020 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2021 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2022 aSig64 = - ( ( bSig * q64 )<<38 ); 2023 expDiff -= 62; 2024 } 2025 expDiff += 64; 2026 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2027 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2028 q = q64>>( 64 - expDiff ); 2029 bSig <<= 6; 2030 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2031 } 2032 do { 2033 alternateASig = aSig; 2034 ++q; 2035 aSig -= bSig; 2036 } while ( 0 <= (sbits32) aSig ); 2037 sigMean = aSig + alternateASig; 2038 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2039 aSig = alternateASig; 2040 } 2041 zSign = ( (sbits32) aSig < 0 ); 2042 if ( zSign ) aSig = - aSig; 2043 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2044 2045 } 2046 #endif /* !SOFTFLOAT_FOR_GCC */ 2047 2048 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2049 2050 /*---------------------------------------------------------------------------- 2051 | Returns the square root of the single-precision floating-point value `a'. 2052 | The operation is performed according to the IEC/IEEE Standard for Binary 2053 | Floating-Point Arithmetic. 2054 *----------------------------------------------------------------------------*/ 2055 2056 float32 float32_sqrt( float32 a ) 2057 { 2058 flag aSign; 2059 int16 aExp, zExp; 2060 bits32 aSig, zSig; 2061 bits64 rem, term; 2062 2063 aSig = extractFloat32Frac( a ); 2064 aExp = extractFloat32Exp( a ); 2065 aSign = extractFloat32Sign( a ); 2066 if ( aExp == 0xFF ) { 2067 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2068 if ( ! aSign ) return a; 2069 float_raise( float_flag_invalid ); 2070 return float32_default_nan; 2071 } 2072 if ( aSign ) { 2073 if ( ( aExp | aSig ) == 0 ) return a; 2074 float_raise( float_flag_invalid ); 2075 return float32_default_nan; 2076 } 2077 if ( aExp == 0 ) { 2078 if ( aSig == 0 ) return 0; 2079 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2080 } 2081 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2082 aSig = ( aSig | 0x00800000 )<<8; 2083 zSig = estimateSqrt32( aExp, aSig ) + 2; 2084 if ( ( zSig & 0x7F ) <= 5 ) { 2085 if ( zSig < 2 ) { 2086 zSig = 0x7FFFFFFF; 2087 goto roundAndPack; 2088 } 2089 aSig >>= aExp & 1; 2090 term = ( (bits64) zSig ) * zSig; 2091 rem = ( ( (bits64) aSig )<<32 ) - term; 2092 while ( (sbits64) rem < 0 ) { 2093 --zSig; 2094 rem += ( ( (bits64) zSig )<<1 ) | 1; 2095 } 2096 zSig |= ( rem != 0 ); 2097 } 2098 shift32RightJamming( zSig, 1, &zSig ); 2099 roundAndPack: 2100 return roundAndPackFloat32( 0, zExp, zSig ); 2101 2102 } 2103 #endif /* !SOFTFLOAT_FOR_GCC */ 2104 2105 /*---------------------------------------------------------------------------- 2106 | Returns 1 if the single-precision floating-point value `a' is equal to 2107 | the corresponding value `b', and 0 otherwise. The comparison is performed 2108 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2109 *----------------------------------------------------------------------------*/ 2110 2111 flag float32_eq( float32 a, float32 b ) 2112 { 2113 2114 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2115 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2116 ) { 2117 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2118 float_raise( float_flag_invalid ); 2119 } 2120 return 0; 2121 } 2122 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2123 2124 } 2125 2126 /*---------------------------------------------------------------------------- 2127 | Returns 1 if the single-precision floating-point value `a' is less than 2128 | or equal to the corresponding value `b', and 0 otherwise. The comparison 2129 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2130 | Arithmetic. 2131 *----------------------------------------------------------------------------*/ 2132 2133 flag float32_le( float32 a, float32 b ) 2134 { 2135 flag aSign, bSign; 2136 2137 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2138 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2139 ) { 2140 float_raise( float_flag_invalid ); 2141 return 0; 2142 } 2143 aSign = extractFloat32Sign( a ); 2144 bSign = extractFloat32Sign( b ); 2145 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2146 return ( a == b ) || ( aSign ^ ( a < b ) ); 2147 2148 } 2149 2150 /*---------------------------------------------------------------------------- 2151 | Returns 1 if the single-precision floating-point value `a' is less than 2152 | the corresponding value `b', and 0 otherwise. The comparison is performed 2153 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2154 *----------------------------------------------------------------------------*/ 2155 2156 flag float32_lt( float32 a, float32 b ) 2157 { 2158 flag aSign, bSign; 2159 2160 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2161 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2162 ) { 2163 float_raise( float_flag_invalid ); 2164 return 0; 2165 } 2166 aSign = extractFloat32Sign( a ); 2167 bSign = extractFloat32Sign( b ); 2168 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2169 return ( a != b ) && ( aSign ^ ( a < b ) ); 2170 2171 } 2172 2173 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2174 /*---------------------------------------------------------------------------- 2175 | Returns 1 if the single-precision floating-point value `a' is equal to 2176 | the corresponding value `b', and 0 otherwise. The invalid exception is 2177 | raised if either operand is a NaN. Otherwise, the comparison is performed 2178 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2179 *----------------------------------------------------------------------------*/ 2180 2181 flag float32_eq_signaling( float32 a, float32 b ) 2182 { 2183 2184 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2185 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2186 ) { 2187 float_raise( float_flag_invalid ); 2188 return 0; 2189 } 2190 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2191 2192 } 2193 2194 /*---------------------------------------------------------------------------- 2195 | Returns 1 if the single-precision floating-point value `a' is less than or 2196 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2197 | cause an exception. Otherwise, the comparison is performed according to the 2198 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2199 *----------------------------------------------------------------------------*/ 2200 2201 flag float32_le_quiet( float32 a, float32 b ) 2202 { 2203 flag aSign, bSign; 2204 2205 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2206 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2207 ) { 2208 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2209 float_raise( float_flag_invalid ); 2210 } 2211 return 0; 2212 } 2213 aSign = extractFloat32Sign( a ); 2214 bSign = extractFloat32Sign( b ); 2215 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2216 return ( a == b ) || ( aSign ^ ( a < b ) ); 2217 2218 } 2219 2220 /*---------------------------------------------------------------------------- 2221 | Returns 1 if the single-precision floating-point value `a' is less than 2222 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2223 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 2224 | Standard for Binary Floating-Point Arithmetic. 2225 *----------------------------------------------------------------------------*/ 2226 2227 flag float32_lt_quiet( float32 a, float32 b ) 2228 { 2229 flag aSign, bSign; 2230 2231 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2232 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2233 ) { 2234 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2235 float_raise( float_flag_invalid ); 2236 } 2237 return 0; 2238 } 2239 aSign = extractFloat32Sign( a ); 2240 bSign = extractFloat32Sign( b ); 2241 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2242 return ( a != b ) && ( aSign ^ ( a < b ) ); 2243 2244 } 2245 #endif /* !SOFTFLOAT_FOR_GCC */ 2246 2247 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2248 /*---------------------------------------------------------------------------- 2249 | Returns the result of converting the double-precision floating-point value 2250 | `a' to the 32-bit two's complement integer format. The conversion is 2251 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2252 | Arithmetic---which means in particular that the conversion is rounded 2253 | according to the current rounding mode. If `a' is a NaN, the largest 2254 | positive integer is returned. Otherwise, if the conversion overflows, the 2255 | largest integer with the same sign as `a' is returned. 2256 *----------------------------------------------------------------------------*/ 2257 2258 int32 float64_to_int32( float64 a ) 2259 { 2260 flag aSign; 2261 int16 aExp, shiftCount; 2262 bits64 aSig; 2263 2264 aSig = extractFloat64Frac( a ); 2265 aExp = extractFloat64Exp( a ); 2266 aSign = extractFloat64Sign( a ); 2267 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2268 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2269 shiftCount = 0x42C - aExp; 2270 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2271 return roundAndPackInt32( aSign, aSig ); 2272 2273 } 2274 #endif /* !SOFTFLOAT_FOR_GCC */ 2275 2276 /*---------------------------------------------------------------------------- 2277 | Returns the result of converting the double-precision floating-point value 2278 | `a' to the 32-bit two's complement integer format. The conversion is 2279 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2280 | Arithmetic, except that the conversion is always rounded toward zero. 2281 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2282 | the conversion overflows, the largest integer with the same sign as `a' is 2283 | returned. 2284 *----------------------------------------------------------------------------*/ 2285 2286 int32 float64_to_int32_round_to_zero( float64 a ) 2287 { 2288 flag aSign; 2289 int16 aExp, shiftCount; 2290 bits64 aSig, savedASig; 2291 int32 z; 2292 2293 aSig = extractFloat64Frac( a ); 2294 aExp = extractFloat64Exp( a ); 2295 aSign = extractFloat64Sign( a ); 2296 if ( 0x41E < aExp ) { 2297 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2298 goto invalid; 2299 } 2300 else if ( aExp < 0x3FF ) { 2301 if ( aExp || aSig ) float_set_inexact(); 2302 return 0; 2303 } 2304 aSig |= LIT64( 0x0010000000000000 ); 2305 shiftCount = 0x433 - aExp; 2306 savedASig = aSig; 2307 aSig >>= shiftCount; 2308 z = aSig; 2309 if ( aSign ) z = - z; 2310 if ( ( z < 0 ) ^ aSign ) { 2311 invalid: 2312 float_raise( float_flag_invalid ); 2313 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2314 } 2315 if ( ( aSig<<shiftCount ) != savedASig ) { 2316 float_set_inexact(); 2317 } 2318 return z; 2319 2320 } 2321 2322 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2323 /*---------------------------------------------------------------------------- 2324 | Returns the result of converting the double-precision floating-point value 2325 | `a' to the 64-bit two's complement integer format. The conversion is 2326 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2327 | Arithmetic---which means in particular that the conversion is rounded 2328 | according to the current rounding mode. If `a' is a NaN, the largest 2329 | positive integer is returned. Otherwise, if the conversion overflows, the 2330 | largest integer with the same sign as `a' is returned. 2331 *----------------------------------------------------------------------------*/ 2332 2333 int64 float64_to_int64( float64 a ) 2334 { 2335 flag aSign; 2336 int16 aExp, shiftCount; 2337 bits64 aSig, aSigExtra; 2338 2339 aSig = extractFloat64Frac( a ); 2340 aExp = extractFloat64Exp( a ); 2341 aSign = extractFloat64Sign( a ); 2342 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2343 shiftCount = 0x433 - aExp; 2344 if ( shiftCount <= 0 ) { 2345 if ( 0x43E < aExp ) { 2346 float_raise( float_flag_invalid ); 2347 if ( ! aSign 2348 || ( ( aExp == 0x7FF ) 2349 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2350 ) { 2351 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2352 } 2353 return (sbits64) LIT64( 0x8000000000000000 ); 2354 } 2355 aSigExtra = 0; 2356 aSig <<= - shiftCount; 2357 } 2358 else { 2359 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2360 } 2361 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2362 2363 } 2364 2365 /* like above, but result is unsigned */ 2366 uint64 float64_to_uint64( float64 a ) 2367 { 2368 flag aSign; 2369 int16 aExp, shiftCount; 2370 bits64 aSig, aSigExtra; 2371 2372 aSig = extractFloat64Frac( a ); 2373 aExp = extractFloat64Exp( a ); 2374 aSign = extractFloat64Sign( a ); 2375 2376 if (aSign) { 2377 return float64_to_int64(a); 2378 } 2379 2380 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2381 shiftCount = 0x433 - aExp; 2382 if ( shiftCount <= 0 ) { 2383 if ( 0x43E < aExp ) { 2384 float_raise( float_flag_invalid ); 2385 if ( ! aSign 2386 || ( ( aExp == 0x7FF ) 2387 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2388 ) { 2389 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2390 } 2391 return (sbits64) LIT64( 0x8000000000000000 ); 2392 } 2393 aSigExtra = 0; 2394 aSig <<= - shiftCount; 2395 } 2396 else { 2397 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2398 } 2399 return roundAndPackUInt64( aSig, aSigExtra ); 2400 2401 } 2402 2403 /*---------------------------------------------------------------------------- 2404 | Returns the result of converting the double-precision floating-point value 2405 | `a' to the 64-bit two's complement integer format. The conversion is 2406 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2407 | Arithmetic, except that the conversion is always rounded toward zero. 2408 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2409 | the conversion overflows, the largest integer with the same sign as `a' is 2410 | returned. 2411 *----------------------------------------------------------------------------*/ 2412 2413 int64 float64_to_int64_round_to_zero( float64 a ) 2414 { 2415 flag aSign; 2416 int16 aExp, shiftCount; 2417 bits64 aSig; 2418 int64 z; 2419 2420 aSig = extractFloat64Frac( a ); 2421 aExp = extractFloat64Exp( a ); 2422 aSign = extractFloat64Sign( a ); 2423 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2424 shiftCount = aExp - 0x433; 2425 if ( 0 <= shiftCount ) { 2426 if ( 0x43E <= aExp ) { 2427 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2428 float_raise( float_flag_invalid ); 2429 if ( ! aSign 2430 || ( ( aExp == 0x7FF ) 2431 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2432 ) { 2433 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2434 } 2435 } 2436 return (sbits64) LIT64( 0x8000000000000000 ); 2437 } 2438 z = aSig<<shiftCount; 2439 } 2440 else { 2441 if ( aExp < 0x3FE ) { 2442 if ( aExp | aSig ) float_set_inexact(); 2443 return 0; 2444 } 2445 z = aSig>>( - shiftCount ); 2446 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2447 float_set_inexact(); 2448 } 2449 } 2450 if ( aSign ) z = - z; 2451 return z; 2452 2453 } 2454 #endif /* !SOFTFLOAT_FOR_GCC */ 2455 2456 /*---------------------------------------------------------------------------- 2457 | Returns the result of converting the double-precision floating-point value 2458 | `a' to the single-precision floating-point format. The conversion is 2459 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2460 | Arithmetic. 2461 *----------------------------------------------------------------------------*/ 2462 2463 float32 float64_to_float32( float64 a ) 2464 { 2465 flag aSign; 2466 int16 aExp; 2467 bits64 aSig; 2468 bits32 zSig; 2469 2470 aSig = extractFloat64Frac( a ); 2471 aExp = extractFloat64Exp( a ); 2472 aSign = extractFloat64Sign( a ); 2473 if ( aExp == 0x7FF ) { 2474 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2475 return packFloat32( aSign, 0xFF, 0 ); 2476 } 2477 shift64RightJamming( aSig, 22, &aSig ); 2478 zSig = aSig; 2479 if ( aExp || zSig ) { 2480 zSig |= 0x40000000; 2481 aExp -= 0x381; 2482 } 2483 return roundAndPackFloat32( aSign, aExp, zSig ); 2484 2485 } 2486 2487 #ifdef FLOATX80 2488 2489 /*---------------------------------------------------------------------------- 2490 | Returns the result of converting the double-precision floating-point value 2491 | `a' to the extended double-precision floating-point format. The conversion 2492 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2493 | Arithmetic. 2494 *----------------------------------------------------------------------------*/ 2495 2496 floatx80 float64_to_floatx80( float64 a ) 2497 { 2498 flag aSign; 2499 int16 aExp; 2500 bits64 aSig; 2501 2502 aSig = extractFloat64Frac( a ); 2503 aExp = extractFloat64Exp( a ); 2504 aSign = extractFloat64Sign( a ); 2505 if ( aExp == 0x7FF ) { 2506 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2507 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2508 } 2509 if ( aExp == 0 ) { 2510 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2511 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2512 } 2513 return 2514 packFloatx80( 2515 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2516 2517 } 2518 2519 #endif 2520 2521 #ifdef FLOAT128 2522 2523 /*---------------------------------------------------------------------------- 2524 | Returns the result of converting the double-precision floating-point value 2525 | `a' to the quadruple-precision floating-point format. The conversion is 2526 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2527 | Arithmetic. 2528 *----------------------------------------------------------------------------*/ 2529 2530 float128 float64_to_float128( float64 a ) 2531 { 2532 flag aSign; 2533 int16 aExp; 2534 bits64 aSig, zSig0, zSig1; 2535 2536 aSig = extractFloat64Frac( a ); 2537 aExp = extractFloat64Exp( a ); 2538 aSign = extractFloat64Sign( a ); 2539 if ( aExp == 0x7FF ) { 2540 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2541 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2542 } 2543 if ( aExp == 0 ) { 2544 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2545 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2546 --aExp; 2547 } 2548 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2549 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2550 2551 } 2552 2553 #endif 2554 2555 #ifndef SOFTFLOAT_FOR_GCC 2556 /*---------------------------------------------------------------------------- 2557 | Rounds the double-precision floating-point value `a' to an integer, and 2558 | returns the result as a double-precision floating-point value. The 2559 | operation is performed according to the IEC/IEEE Standard for Binary 2560 | Floating-Point Arithmetic. 2561 *----------------------------------------------------------------------------*/ 2562 2563 float64 float64_round_to_int( float64 a ) 2564 { 2565 flag aSign; 2566 int16 aExp; 2567 bits64 lastBitMask, roundBitsMask; 2568 int8 roundingMode; 2569 float64 z; 2570 2571 aExp = extractFloat64Exp( a ); 2572 if ( 0x433 <= aExp ) { 2573 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2574 return propagateFloat64NaN( a, a ); 2575 } 2576 return a; 2577 } 2578 if ( aExp < 0x3FF ) { 2579 if ( (bits64) ( a<<1 ) == 0 ) return a; 2580 float_set_inexact(); 2581 aSign = extractFloat64Sign( a ); 2582 switch ( float_rounding_mode() ) { 2583 case float_round_nearest_even: 2584 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2585 return packFloat64( aSign, 0x3FF, 0 ); 2586 } 2587 break; 2588 case float_round_down: 2589 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2590 case float_round_up: 2591 return 2592 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2593 } 2594 return packFloat64( aSign, 0, 0 ); 2595 } 2596 lastBitMask = 1; 2597 lastBitMask <<= 0x433 - aExp; 2598 roundBitsMask = lastBitMask - 1; 2599 z = a; 2600 roundingMode = float_rounding_mode(); 2601 if ( roundingMode == float_round_nearest_even ) { 2602 z += lastBitMask>>1; 2603 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2604 } 2605 else if ( roundingMode != float_round_to_zero ) { 2606 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2607 z += roundBitsMask; 2608 } 2609 } 2610 z &= ~ roundBitsMask; 2611 if ( z != a ) float_set_inexact(); 2612 return z; 2613 2614 } 2615 #endif 2616 2617 /*---------------------------------------------------------------------------- 2618 | Returns the result of adding the absolute values of the double-precision 2619 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2620 | before being returned. `zSign' is ignored if the result is a NaN. 2621 | The addition is performed according to the IEC/IEEE Standard for Binary 2622 | Floating-Point Arithmetic. 2623 *----------------------------------------------------------------------------*/ 2624 2625 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2626 { 2627 int16 aExp, bExp, zExp; 2628 bits64 aSig, bSig, zSig; 2629 int16 expDiff; 2630 2631 aSig = extractFloat64Frac( a ); 2632 aExp = extractFloat64Exp( a ); 2633 bSig = extractFloat64Frac( b ); 2634 bExp = extractFloat64Exp( b ); 2635 expDiff = aExp - bExp; 2636 aSig <<= 9; 2637 bSig <<= 9; 2638 if ( 0 < expDiff ) { 2639 if ( aExp == 0x7FF ) { 2640 if ( aSig ) return propagateFloat64NaN( a, b ); 2641 return a; 2642 } 2643 if ( bExp == 0 ) { 2644 --expDiff; 2645 } 2646 else { 2647 bSig |= LIT64( 0x2000000000000000 ); 2648 } 2649 shift64RightJamming( bSig, expDiff, &bSig ); 2650 zExp = aExp; 2651 } 2652 else if ( expDiff < 0 ) { 2653 if ( bExp == 0x7FF ) { 2654 if ( bSig ) return propagateFloat64NaN( a, b ); 2655 return packFloat64( zSign, 0x7FF, 0 ); 2656 } 2657 if ( aExp == 0 ) { 2658 ++expDiff; 2659 } 2660 else { 2661 aSig |= LIT64( 0x2000000000000000 ); 2662 } 2663 shift64RightJamming( aSig, - expDiff, &aSig ); 2664 zExp = bExp; 2665 } 2666 else { 2667 if ( aExp == 0x7FF ) { 2668 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2669 return a; 2670 } 2671 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2672 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2673 zExp = aExp; 2674 goto roundAndPack; 2675 } 2676 aSig |= LIT64( 0x2000000000000000 ); 2677 zSig = ( aSig + bSig )<<1; 2678 --zExp; 2679 if ( (sbits64) zSig < 0 ) { 2680 zSig = aSig + bSig; 2681 ++zExp; 2682 } 2683 roundAndPack: 2684 return roundAndPackFloat64( zSign, zExp, zSig ); 2685 2686 } 2687 2688 /*---------------------------------------------------------------------------- 2689 | Returns the result of subtracting the absolute values of the double- 2690 | precision floating-point values `a' and `b'. If `zSign' is 1, the 2691 | difference is negated before being returned. `zSign' is ignored if the 2692 | result is a NaN. The subtraction is performed according to the IEC/IEEE 2693 | Standard for Binary Floating-Point Arithmetic. 2694 *----------------------------------------------------------------------------*/ 2695 2696 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2697 { 2698 int16 aExp, bExp, zExp; 2699 bits64 aSig, bSig, zSig; 2700 int16 expDiff; 2701 2702 aSig = extractFloat64Frac( a ); 2703 aExp = extractFloat64Exp( a ); 2704 bSig = extractFloat64Frac( b ); 2705 bExp = extractFloat64Exp( b ); 2706 expDiff = aExp - bExp; 2707 aSig <<= 10; 2708 bSig <<= 10; 2709 if ( 0 < expDiff ) goto aExpBigger; 2710 if ( expDiff < 0 ) goto bExpBigger; 2711 if ( aExp == 0x7FF ) { 2712 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2713 float_raise( float_flag_invalid ); 2714 return float64_default_nan; 2715 } 2716 if ( aExp == 0 ) { 2717 aExp = 1; 2718 bExp = 1; 2719 } 2720 if ( bSig < aSig ) goto aBigger; 2721 if ( aSig < bSig ) goto bBigger; 2722 return packFloat64( float_rounding_mode() == float_round_down, 0, 0 ); 2723 bExpBigger: 2724 if ( bExp == 0x7FF ) { 2725 if ( bSig ) return propagateFloat64NaN( a, b ); 2726 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2727 } 2728 if ( aExp == 0 ) { 2729 ++expDiff; 2730 } 2731 else { 2732 aSig |= LIT64( 0x4000000000000000 ); 2733 } 2734 shift64RightJamming( aSig, - expDiff, &aSig ); 2735 bSig |= LIT64( 0x4000000000000000 ); 2736 bBigger: 2737 zSig = bSig - aSig; 2738 zExp = bExp; 2739 zSign ^= 1; 2740 goto normalizeRoundAndPack; 2741 aExpBigger: 2742 if ( aExp == 0x7FF ) { 2743 if ( aSig ) return propagateFloat64NaN( a, b ); 2744 return a; 2745 } 2746 if ( bExp == 0 ) { 2747 --expDiff; 2748 } 2749 else { 2750 bSig |= LIT64( 0x4000000000000000 ); 2751 } 2752 shift64RightJamming( bSig, expDiff, &bSig ); 2753 aSig |= LIT64( 0x4000000000000000 ); 2754 aBigger: 2755 zSig = aSig - bSig; 2756 zExp = aExp; 2757 normalizeRoundAndPack: 2758 --zExp; 2759 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2760 2761 } 2762 2763 /*---------------------------------------------------------------------------- 2764 | Returns the result of adding the double-precision floating-point values `a' 2765 | and `b'. The operation is performed according to the IEC/IEEE Standard for 2766 | Binary Floating-Point Arithmetic. 2767 *----------------------------------------------------------------------------*/ 2768 2769 float64 float64_add( float64 a, float64 b ) 2770 { 2771 flag aSign, bSign; 2772 2773 aSign = extractFloat64Sign( a ); 2774 bSign = extractFloat64Sign( b ); 2775 if ( aSign == bSign ) { 2776 return addFloat64Sigs( a, b, aSign ); 2777 } 2778 else { 2779 return subFloat64Sigs( a, b, aSign ); 2780 } 2781 2782 } 2783 2784 /*---------------------------------------------------------------------------- 2785 | Returns the result of subtracting the double-precision floating-point values 2786 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2787 | for Binary Floating-Point Arithmetic. 2788 *----------------------------------------------------------------------------*/ 2789 2790 float64 float64_sub( float64 a, float64 b ) 2791 { 2792 flag aSign, bSign; 2793 2794 aSign = extractFloat64Sign( a ); 2795 bSign = extractFloat64Sign( b ); 2796 if ( aSign == bSign ) { 2797 return subFloat64Sigs( a, b, aSign ); 2798 } 2799 else { 2800 return addFloat64Sigs( a, b, aSign ); 2801 } 2802 2803 } 2804 2805 /*---------------------------------------------------------------------------- 2806 | Returns the result of multiplying the double-precision floating-point values 2807 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2808 | for Binary Floating-Point Arithmetic. 2809 *----------------------------------------------------------------------------*/ 2810 2811 float64 float64_mul( float64 a, float64 b ) 2812 { 2813 flag aSign, bSign, zSign; 2814 int16 aExp, bExp, zExp; 2815 bits64 aSig, bSig, zSig0, zSig1; 2816 2817 aSig = extractFloat64Frac( a ); 2818 aExp = extractFloat64Exp( a ); 2819 aSign = extractFloat64Sign( a ); 2820 bSig = extractFloat64Frac( b ); 2821 bExp = extractFloat64Exp( b ); 2822 bSign = extractFloat64Sign( b ); 2823 zSign = aSign ^ bSign; 2824 if ( aExp == 0x7FF ) { 2825 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2826 return propagateFloat64NaN( a, b ); 2827 } 2828 if ( ( bExp | bSig ) == 0 ) { 2829 float_raise( float_flag_invalid ); 2830 return float64_default_nan; 2831 } 2832 return packFloat64( zSign, 0x7FF, 0 ); 2833 } 2834 if ( bExp == 0x7FF ) { 2835 if ( bSig ) return propagateFloat64NaN( a, b ); 2836 if ( ( aExp | aSig ) == 0 ) { 2837 float_raise( float_flag_invalid ); 2838 return float64_default_nan; 2839 } 2840 return packFloat64( zSign, 0x7FF, 0 ); 2841 } 2842 if ( aExp == 0 ) { 2843 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2844 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2845 } 2846 if ( bExp == 0 ) { 2847 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2848 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2849 } 2850 zExp = aExp + bExp - 0x3FF; 2851 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2852 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2853 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2854 zSig0 |= ( zSig1 != 0 ); 2855 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2856 zSig0 <<= 1; 2857 --zExp; 2858 } 2859 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2860 2861 } 2862 2863 /*---------------------------------------------------------------------------- 2864 | Returns the result of dividing the double-precision floating-point value `a' 2865 | by the corresponding value `b'. The operation is performed according to 2866 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2867 *----------------------------------------------------------------------------*/ 2868 2869 float64 float64_div( float64 a, float64 b ) 2870 { 2871 flag aSign, bSign, zSign; 2872 int16 aExp, bExp, zExp; 2873 bits64 aSig, bSig, zSig; 2874 bits64 rem0, rem1; 2875 bits64 term0, term1; 2876 2877 aSig = extractFloat64Frac( a ); 2878 aExp = extractFloat64Exp( a ); 2879 aSign = extractFloat64Sign( a ); 2880 bSig = extractFloat64Frac( b ); 2881 bExp = extractFloat64Exp( b ); 2882 bSign = extractFloat64Sign( b ); 2883 zSign = aSign ^ bSign; 2884 if ( aExp == 0x7FF ) { 2885 if ( aSig ) return propagateFloat64NaN( a, b ); 2886 if ( bExp == 0x7FF ) { 2887 if ( bSig ) return propagateFloat64NaN( a, b ); 2888 float_raise( float_flag_invalid ); 2889 return float64_default_nan; 2890 } 2891 return packFloat64( zSign, 0x7FF, 0 ); 2892 } 2893 if ( bExp == 0x7FF ) { 2894 if ( bSig ) return propagateFloat64NaN( a, b ); 2895 return packFloat64( zSign, 0, 0 ); 2896 } 2897 if ( bExp == 0 ) { 2898 if ( bSig == 0 ) { 2899 if ( ( aExp | aSig ) == 0 ) { 2900 float_raise( float_flag_invalid ); 2901 return float64_default_nan; 2902 } 2903 float_raise( float_flag_divbyzero ); 2904 return packFloat64( zSign, 0x7FF, 0 ); 2905 } 2906 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2907 } 2908 if ( aExp == 0 ) { 2909 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2910 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2911 } 2912 zExp = aExp - bExp + 0x3FD; 2913 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2914 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2915 if ( bSig <= ( aSig + aSig ) ) { 2916 aSig >>= 1; 2917 ++zExp; 2918 } 2919 zSig = estimateDiv128To64( aSig, 0, bSig ); 2920 if ( ( zSig & 0x1FF ) <= 2 ) { 2921 mul64To128( bSig, zSig, &term0, &term1 ); 2922 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2923 while ( (sbits64) rem0 < 0 ) { 2924 --zSig; 2925 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2926 } 2927 zSig |= ( rem1 != 0 ); 2928 } 2929 return roundAndPackFloat64( zSign, zExp, zSig ); 2930 2931 } 2932 2933 #ifndef SOFTFLOAT_FOR_GCC 2934 /*---------------------------------------------------------------------------- 2935 | Returns the remainder of the double-precision floating-point value `a' 2936 | with respect to the corresponding value `b'. The operation is performed 2937 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2938 *----------------------------------------------------------------------------*/ 2939 2940 float64 float64_rem( float64 a, float64 b ) 2941 { 2942 flag aSign, bSign __unused, zSign; 2943 int16 aExp, bExp, expDiff; 2944 bits64 aSig, bSig; 2945 bits64 q, alternateASig; 2946 sbits64 sigMean; 2947 2948 aSig = extractFloat64Frac( a ); 2949 aExp = extractFloat64Exp( a ); 2950 aSign = extractFloat64Sign( a ); 2951 bSig = extractFloat64Frac( b ); 2952 bExp = extractFloat64Exp( b ); 2953 bSign = extractFloat64Sign( b ); 2954 if ( aExp == 0x7FF ) { 2955 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2956 return propagateFloat64NaN( a, b ); 2957 } 2958 float_raise( float_flag_invalid ); 2959 return float64_default_nan; 2960 } 2961 if ( bExp == 0x7FF ) { 2962 if ( bSig ) return propagateFloat64NaN( a, b ); 2963 return a; 2964 } 2965 if ( bExp == 0 ) { 2966 if ( bSig == 0 ) { 2967 float_raise( float_flag_invalid ); 2968 return float64_default_nan; 2969 } 2970 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2971 } 2972 if ( aExp == 0 ) { 2973 if ( aSig == 0 ) return a; 2974 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2975 } 2976 expDiff = aExp - bExp; 2977 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 2978 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2979 if ( expDiff < 0 ) { 2980 if ( expDiff < -1 ) return a; 2981 aSig >>= 1; 2982 } 2983 q = ( bSig <= aSig ); 2984 if ( q ) aSig -= bSig; 2985 expDiff -= 64; 2986 while ( 0 < expDiff ) { 2987 q = estimateDiv128To64( aSig, 0, bSig ); 2988 q = ( 2 < q ) ? q - 2 : 0; 2989 aSig = - ( ( bSig>>2 ) * q ); 2990 expDiff -= 62; 2991 } 2992 expDiff += 64; 2993 if ( 0 < expDiff ) { 2994 q = estimateDiv128To64( aSig, 0, bSig ); 2995 q = ( 2 < q ) ? q - 2 : 0; 2996 q >>= 64 - expDiff; 2997 bSig >>= 2; 2998 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2999 } 3000 else { 3001 aSig >>= 2; 3002 bSig >>= 2; 3003 } 3004 do { 3005 alternateASig = aSig; 3006 ++q; 3007 aSig -= bSig; 3008 } while ( 0 <= (sbits64) aSig ); 3009 sigMean = aSig + alternateASig; 3010 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3011 aSig = alternateASig; 3012 } 3013 zSign = ( (sbits64) aSig < 0 ); 3014 if ( zSign ) aSig = - aSig; 3015 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3016 3017 } 3018 3019 /*---------------------------------------------------------------------------- 3020 | Returns the square root of the double-precision floating-point value `a'. 3021 | The operation is performed according to the IEC/IEEE Standard for Binary 3022 | Floating-Point Arithmetic. 3023 *----------------------------------------------------------------------------*/ 3024 3025 float64 float64_sqrt( float64 a ) 3026 { 3027 flag aSign; 3028 int16 aExp, zExp; 3029 bits64 aSig, zSig, doubleZSig; 3030 bits64 rem0, rem1, term0, term1; 3031 3032 aSig = extractFloat64Frac( a ); 3033 aExp = extractFloat64Exp( a ); 3034 aSign = extractFloat64Sign( a ); 3035 if ( aExp == 0x7FF ) { 3036 if ( aSig ) return propagateFloat64NaN( a, a ); 3037 if ( ! aSign ) return a; 3038 float_raise( float_flag_invalid ); 3039 return float64_default_nan; 3040 } 3041 if ( aSign ) { 3042 if ( ( aExp | aSig ) == 0 ) return a; 3043 float_raise( float_flag_invalid ); 3044 return float64_default_nan; 3045 } 3046 if ( aExp == 0 ) { 3047 if ( aSig == 0 ) return 0; 3048 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3049 } 3050 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3051 aSig |= LIT64( 0x0010000000000000 ); 3052 zSig = estimateSqrt32( aExp, aSig>>21 ); 3053 aSig <<= 9 - ( aExp & 1 ); 3054 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3055 if ( ( zSig & 0x1FF ) <= 5 ) { 3056 doubleZSig = zSig<<1; 3057 mul64To128( zSig, zSig, &term0, &term1 ); 3058 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3059 while ( (sbits64) rem0 < 0 ) { 3060 --zSig; 3061 doubleZSig -= 2; 3062 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3063 } 3064 zSig |= ( ( rem0 | rem1 ) != 0 ); 3065 } 3066 return roundAndPackFloat64( 0, zExp, zSig ); 3067 3068 } 3069 #endif 3070 3071 /*---------------------------------------------------------------------------- 3072 | Returns 1 if the double-precision floating-point value `a' is equal to the 3073 | corresponding value `b', and 0 otherwise. The comparison is performed 3074 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3075 *----------------------------------------------------------------------------*/ 3076 3077 flag float64_eq( float64 a, float64 b ) 3078 { 3079 3080 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3081 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3082 ) { 3083 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3084 float_raise( float_flag_invalid ); 3085 } 3086 return 0; 3087 } 3088 return ( a == b ) || 3089 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3090 3091 } 3092 3093 /*---------------------------------------------------------------------------- 3094 | Returns 1 if the double-precision floating-point value `a' is less than or 3095 | equal to the corresponding value `b', and 0 otherwise. The comparison is 3096 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3097 | Arithmetic. 3098 *----------------------------------------------------------------------------*/ 3099 3100 flag float64_le( float64 a, float64 b ) 3101 { 3102 flag aSign, bSign; 3103 3104 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3105 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3106 ) { 3107 float_raise( float_flag_invalid ); 3108 return 0; 3109 } 3110 aSign = extractFloat64Sign( a ); 3111 bSign = extractFloat64Sign( b ); 3112 if ( aSign != bSign ) 3113 return aSign || 3114 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3115 0 ); 3116 return ( a == b ) || 3117 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3118 3119 } 3120 3121 /*---------------------------------------------------------------------------- 3122 | Returns 1 if the double-precision floating-point value `a' is less than 3123 | the corresponding value `b', and 0 otherwise. The comparison is performed 3124 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3125 *----------------------------------------------------------------------------*/ 3126 3127 flag float64_lt( float64 a, float64 b ) 3128 { 3129 flag aSign, bSign; 3130 3131 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3132 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3133 ) { 3134 float_raise( float_flag_invalid ); 3135 return 0; 3136 } 3137 aSign = extractFloat64Sign( a ); 3138 bSign = extractFloat64Sign( b ); 3139 if ( aSign != bSign ) 3140 return aSign && 3141 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3142 0 ); 3143 return ( a != b ) && 3144 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3145 3146 } 3147 3148 #ifndef SOFTFLOAT_FOR_GCC 3149 /*---------------------------------------------------------------------------- 3150 | Returns 1 if the double-precision floating-point value `a' is equal to the 3151 | corresponding value `b', and 0 otherwise. The invalid exception is raised 3152 | if either operand is a NaN. Otherwise, the comparison is performed 3153 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3154 *----------------------------------------------------------------------------*/ 3155 3156 flag float64_eq_signaling( float64 a, float64 b ) 3157 { 3158 3159 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3160 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3161 ) { 3162 float_raise( float_flag_invalid ); 3163 return 0; 3164 } 3165 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3166 3167 } 3168 3169 /*---------------------------------------------------------------------------- 3170 | Returns 1 if the double-precision floating-point value `a' is less than or 3171 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3172 | cause an exception. Otherwise, the comparison is performed according to the 3173 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3174 *----------------------------------------------------------------------------*/ 3175 3176 flag float64_le_quiet( float64 a, float64 b ) 3177 { 3178 flag aSign, bSign; 3179 3180 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3181 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3182 ) { 3183 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3184 float_raise( float_flag_invalid ); 3185 } 3186 return 0; 3187 } 3188 aSign = extractFloat64Sign( a ); 3189 bSign = extractFloat64Sign( b ); 3190 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3191 return ( a == b ) || ( aSign ^ ( a < b ) ); 3192 3193 } 3194 3195 /*---------------------------------------------------------------------------- 3196 | Returns 1 if the double-precision floating-point value `a' is less than 3197 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3198 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3199 | Standard for Binary Floating-Point Arithmetic. 3200 *----------------------------------------------------------------------------*/ 3201 3202 flag float64_lt_quiet( float64 a, float64 b ) 3203 { 3204 flag aSign, bSign; 3205 3206 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3207 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3208 ) { 3209 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3210 float_raise( float_flag_invalid ); 3211 } 3212 return 0; 3213 } 3214 aSign = extractFloat64Sign( a ); 3215 bSign = extractFloat64Sign( b ); 3216 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3217 return ( a != b ) && ( aSign ^ ( a < b ) ); 3218 3219 } 3220 #endif 3221 3222 #ifdef FLOATX80 3223 3224 /*---------------------------------------------------------------------------- 3225 | Returns the result of converting the extended double-precision floating- 3226 | point value `a' to the 32-bit two's complement integer format. The 3227 | conversion is performed according to the IEC/IEEE Standard for Binary 3228 | Floating-Point Arithmetic---which means in particular that the conversion 3229 | is rounded according to the current rounding mode. If `a' is a NaN, the 3230 | largest positive integer is returned. Otherwise, if the conversion 3231 | overflows, the largest integer with the same sign as `a' is returned. 3232 *----------------------------------------------------------------------------*/ 3233 3234 int32 floatx80_to_int32( floatx80 a ) 3235 { 3236 flag aSign; 3237 int32 aExp, shiftCount; 3238 bits64 aSig; 3239 3240 aSig = extractFloatx80Frac( a ); 3241 aExp = extractFloatx80Exp( a ); 3242 aSign = extractFloatx80Sign( a ); 3243 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3244 shiftCount = 0x4037 - aExp; 3245 if ( shiftCount <= 0 ) shiftCount = 1; 3246 shift64RightJamming( aSig, shiftCount, &aSig ); 3247 return roundAndPackInt32( aSign, aSig ); 3248 3249 } 3250 3251 /*---------------------------------------------------------------------------- 3252 | Returns the result of converting the extended double-precision floating- 3253 | point value `a' to the 32-bit two's complement integer format. The 3254 | conversion is performed according to the IEC/IEEE Standard for Binary 3255 | Floating-Point Arithmetic, except that the conversion is always rounded 3256 | toward zero. If `a' is a NaN, the largest positive integer is returned. 3257 | Otherwise, if the conversion overflows, the largest integer with the same 3258 | sign as `a' is returned. 3259 *----------------------------------------------------------------------------*/ 3260 3261 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3262 { 3263 flag aSign; 3264 int32 aExp, shiftCount; 3265 bits64 aSig, savedASig; 3266 int32 z; 3267 3268 aSig = extractFloatx80Frac( a ); 3269 aExp = extractFloatx80Exp( a ); 3270 aSign = extractFloatx80Sign( a ); 3271 if ( 0x401E < aExp ) { 3272 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3273 goto invalid; 3274 } 3275 else if ( aExp < 0x3FFF ) { 3276 if ( aExp || aSig ) float_set_inexact(); 3277 return 0; 3278 } 3279 shiftCount = 0x403E - aExp; 3280 savedASig = aSig; 3281 aSig >>= shiftCount; 3282 z = aSig; 3283 if ( aSign ) z = - z; 3284 if ( ( z < 0 ) ^ aSign ) { 3285 invalid: 3286 float_raise( float_flag_invalid ); 3287 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3288 } 3289 if ( ( aSig<<shiftCount ) != savedASig ) { 3290 float_set_inexact(); 3291 } 3292 return z; 3293 3294 } 3295 3296 /*---------------------------------------------------------------------------- 3297 | Returns the result of converting the extended double-precision floating- 3298 | point value `a' to the 64-bit two's complement integer format. The 3299 | conversion is performed according to the IEC/IEEE Standard for Binary 3300 | Floating-Point Arithmetic---which means in particular that the conversion 3301 | is rounded according to the current rounding mode. If `a' is a NaN, 3302 | the largest positive integer is returned. Otherwise, if the conversion 3303 | overflows, the largest integer with the same sign as `a' is returned. 3304 *----------------------------------------------------------------------------*/ 3305 3306 int64 floatx80_to_int64( floatx80 a ) 3307 { 3308 flag aSign; 3309 int32 aExp, shiftCount; 3310 bits64 aSig, aSigExtra; 3311 3312 aSig = extractFloatx80Frac( a ); 3313 aExp = extractFloatx80Exp( a ); 3314 aSign = extractFloatx80Sign( a ); 3315 shiftCount = 0x403E - aExp; 3316 if ( shiftCount <= 0 ) { 3317 if ( shiftCount ) { 3318 float_raise( float_flag_invalid ); 3319 if ( ! aSign 3320 || ( ( aExp == 0x7FFF ) 3321 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3322 ) { 3323 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3324 } 3325 return (sbits64) LIT64( 0x8000000000000000 ); 3326 } 3327 aSigExtra = 0; 3328 } 3329 else { 3330 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3331 } 3332 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3333 3334 } 3335 3336 /*---------------------------------------------------------------------------- 3337 | Returns the result of converting the extended double-precision floating- 3338 | point value `a' to the 64-bit two's complement integer format. The 3339 | conversion is performed according to the IEC/IEEE Standard for Binary 3340 | Floating-Point Arithmetic, except that the conversion is always rounded 3341 | toward zero. If `a' is a NaN, the largest positive integer is returned. 3342 | Otherwise, if the conversion overflows, the largest integer with the same 3343 | sign as `a' is returned. 3344 *----------------------------------------------------------------------------*/ 3345 3346 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3347 { 3348 flag aSign; 3349 int32 aExp, shiftCount; 3350 bits64 aSig; 3351 int64 z; 3352 3353 aSig = extractFloatx80Frac( a ); 3354 aExp = extractFloatx80Exp( a ); 3355 aSign = extractFloatx80Sign( a ); 3356 shiftCount = aExp - 0x403E; 3357 if ( 0 <= shiftCount ) { 3358 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3359 if ( ( a.high != 0xC03E ) || aSig ) { 3360 float_raise( float_flag_invalid ); 3361 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3362 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3363 } 3364 } 3365 return (sbits64) LIT64( 0x8000000000000000 ); 3366 } 3367 else if ( aExp < 0x3FFF ) { 3368 if ( aExp | aSig ) float_set_inexact(); 3369 return 0; 3370 } 3371 z = aSig>>( - shiftCount ); 3372 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3373 float_set_inexact(); 3374 } 3375 if ( aSign ) z = - z; 3376 return z; 3377 3378 } 3379 3380 /*---------------------------------------------------------------------------- 3381 | Returns the result of converting the extended double-precision floating- 3382 | point value `a' to the single-precision floating-point format. The 3383 | conversion is performed according to the IEC/IEEE Standard for Binary 3384 | Floating-Point Arithmetic. 3385 *----------------------------------------------------------------------------*/ 3386 3387 float32 floatx80_to_float32( floatx80 a ) 3388 { 3389 flag aSign; 3390 int32 aExp; 3391 bits64 aSig; 3392 3393 aSig = extractFloatx80Frac( a ); 3394 aExp = extractFloatx80Exp( a ); 3395 aSign = extractFloatx80Sign( a ); 3396 if ( aExp == 0x7FFF ) { 3397 if ( (bits64) ( aSig<<1 ) ) { 3398 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3399 } 3400 return packFloat32( aSign, 0xFF, 0 ); 3401 } 3402 shift64RightJamming( aSig, 33, &aSig ); 3403 if ( aExp || aSig ) aExp -= 0x3F81; 3404 return roundAndPackFloat32( aSign, aExp, aSig ); 3405 3406 } 3407 3408 /*---------------------------------------------------------------------------- 3409 | Returns the result of converting the extended double-precision floating- 3410 | point value `a' to the double-precision floating-point format. The 3411 | conversion is performed according to the IEC/IEEE Standard for Binary 3412 | Floating-Point Arithmetic. 3413 *----------------------------------------------------------------------------*/ 3414 3415 float64 floatx80_to_float64( floatx80 a ) 3416 { 3417 flag aSign; 3418 int32 aExp; 3419 bits64 aSig, zSig; 3420 3421 aSig = extractFloatx80Frac( a ); 3422 aExp = extractFloatx80Exp( a ); 3423 aSign = extractFloatx80Sign( a ); 3424 if ( aExp == 0x7FFF ) { 3425 if ( (bits64) ( aSig<<1 ) ) { 3426 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3427 } 3428 return packFloat64( aSign, 0x7FF, 0 ); 3429 } 3430 shift64RightJamming( aSig, 1, &zSig ); 3431 if ( aExp || aSig ) aExp -= 0x3C01; 3432 return roundAndPackFloat64( aSign, aExp, zSig ); 3433 3434 } 3435 3436 #ifdef FLOAT128 3437 3438 /*---------------------------------------------------------------------------- 3439 | Returns the result of converting the extended double-precision floating- 3440 | point value `a' to the quadruple-precision floating-point format. The 3441 | conversion is performed according to the IEC/IEEE Standard for Binary 3442 | Floating-Point Arithmetic. 3443 *----------------------------------------------------------------------------*/ 3444 3445 float128 floatx80_to_float128( floatx80 a ) 3446 { 3447 flag aSign; 3448 int16 aExp; 3449 bits64 aSig, zSig0, zSig1; 3450 3451 aSig = extractFloatx80Frac( a ); 3452 aExp = extractFloatx80Exp( a ); 3453 aSign = extractFloatx80Sign( a ); 3454 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3455 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3456 } 3457 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3458 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3459 3460 } 3461 3462 #endif 3463 3464 /*---------------------------------------------------------------------------- 3465 | Rounds the extended double-precision floating-point value `a' to an integer, 3466 | and returns the result as an extended quadruple-precision floating-point 3467 | value. The operation is performed according to the IEC/IEEE Standard for 3468 | Binary Floating-Point Arithmetic. 3469 *----------------------------------------------------------------------------*/ 3470 3471 floatx80 floatx80_round_to_int( floatx80 a ) 3472 { 3473 flag aSign; 3474 int32 aExp; 3475 bits64 lastBitMask, roundBitsMask; 3476 int8 roundingMode; 3477 floatx80 z; 3478 3479 aExp = extractFloatx80Exp( a ); 3480 if ( 0x403E <= aExp ) { 3481 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3482 return propagateFloatx80NaN( a, a ); 3483 } 3484 return a; 3485 } 3486 if ( aExp < 0x3FFF ) { 3487 if ( ( aExp == 0 ) 3488 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3489 return a; 3490 } 3491 float_set_inexact(); 3492 aSign = extractFloatx80Sign( a ); 3493 switch ( float_rounding_mode() ) { 3494 case float_round_nearest_even: 3495 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3496 ) { 3497 return 3498 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3499 } 3500 break; 3501 case float_round_down: 3502 return 3503 aSign ? 3504 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3505 : packFloatx80( 0, 0, 0 ); 3506 case float_round_up: 3507 return 3508 aSign ? packFloatx80( 1, 0, 0 ) 3509 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3510 } 3511 return packFloatx80( aSign, 0, 0 ); 3512 } 3513 lastBitMask = 1; 3514 lastBitMask <<= 0x403E - aExp; 3515 roundBitsMask = lastBitMask - 1; 3516 z = a; 3517 roundingMode = float_rounding_mode(); 3518 if ( roundingMode == float_round_nearest_even ) { 3519 z.low += lastBitMask>>1; 3520 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3521 } 3522 else if ( roundingMode != float_round_to_zero ) { 3523 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3524 z.low += roundBitsMask; 3525 } 3526 } 3527 z.low &= ~ roundBitsMask; 3528 if ( z.low == 0 ) { 3529 ++z.high; 3530 z.low = LIT64( 0x8000000000000000 ); 3531 } 3532 if ( z.low != a.low ) float_set_inexact(); 3533 return z; 3534 3535 } 3536 3537 /*---------------------------------------------------------------------------- 3538 | Returns the result of adding the absolute values of the extended double- 3539 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3540 | negated before being returned. `zSign' is ignored if the result is a NaN. 3541 | The addition is performed according to the IEC/IEEE Standard for Binary 3542 | Floating-Point Arithmetic. 3543 *----------------------------------------------------------------------------*/ 3544 3545 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3546 { 3547 int32 aExp, bExp, zExp; 3548 bits64 aSig, bSig, zSig0, zSig1; 3549 int32 expDiff; 3550 3551 aSig = extractFloatx80Frac( a ); 3552 aExp = extractFloatx80Exp( a ); 3553 bSig = extractFloatx80Frac( b ); 3554 bExp = extractFloatx80Exp( b ); 3555 expDiff = aExp - bExp; 3556 if ( 0 < expDiff ) { 3557 if ( aExp == 0x7FFF ) { 3558 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3559 return a; 3560 } 3561 if ( bExp == 0 ) --expDiff; 3562 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3563 zExp = aExp; 3564 } 3565 else if ( expDiff < 0 ) { 3566 if ( bExp == 0x7FFF ) { 3567 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3568 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3569 } 3570 if ( aExp == 0 ) ++expDiff; 3571 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3572 zExp = bExp; 3573 } 3574 else { 3575 if ( aExp == 0x7FFF ) { 3576 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3577 return propagateFloatx80NaN( a, b ); 3578 } 3579 return a; 3580 } 3581 zSig1 = 0; 3582 zSig0 = aSig + bSig; 3583 if ( aExp == 0 ) { 3584 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3585 goto roundAndPack; 3586 } 3587 zExp = aExp; 3588 goto shiftRight1; 3589 } 3590 zSig0 = aSig + bSig; 3591 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3592 shiftRight1: 3593 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3594 zSig0 |= LIT64( 0x8000000000000000 ); 3595 ++zExp; 3596 roundAndPack: 3597 return 3598 roundAndPackFloatx80( 3599 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3600 3601 } 3602 3603 /*---------------------------------------------------------------------------- 3604 | Returns the result of subtracting the absolute values of the extended 3605 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3606 | difference is negated before being returned. `zSign' is ignored if the 3607 | result is a NaN. The subtraction is performed according to the IEC/IEEE 3608 | Standard for Binary Floating-Point Arithmetic. 3609 *----------------------------------------------------------------------------*/ 3610 3611 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3612 { 3613 int32 aExp, bExp, zExp; 3614 bits64 aSig, bSig, zSig0, zSig1; 3615 int32 expDiff; 3616 floatx80 z; 3617 3618 aSig = extractFloatx80Frac( a ); 3619 aExp = extractFloatx80Exp( a ); 3620 bSig = extractFloatx80Frac( b ); 3621 bExp = extractFloatx80Exp( b ); 3622 expDiff = aExp - bExp; 3623 if ( 0 < expDiff ) goto aExpBigger; 3624 if ( expDiff < 0 ) goto bExpBigger; 3625 if ( aExp == 0x7FFF ) { 3626 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3627 return propagateFloatx80NaN( a, b ); 3628 } 3629 float_raise( float_flag_invalid ); 3630 z.low = floatx80_default_nan_low; 3631 z.high = floatx80_default_nan_high; 3632 return z; 3633 } 3634 if ( aExp == 0 ) { 3635 aExp = 1; 3636 bExp = 1; 3637 } 3638 zSig1 = 0; 3639 if ( bSig < aSig ) goto aBigger; 3640 if ( aSig < bSig ) goto bBigger; 3641 return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 ); 3642 bExpBigger: 3643 if ( bExp == 0x7FFF ) { 3644 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3645 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3646 } 3647 if ( aExp == 0 ) ++expDiff; 3648 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3649 bBigger: 3650 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3651 zExp = bExp; 3652 zSign ^= 1; 3653 goto normalizeRoundAndPack; 3654 aExpBigger: 3655 if ( aExp == 0x7FFF ) { 3656 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3657 return a; 3658 } 3659 if ( bExp == 0 ) --expDiff; 3660 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3661 aBigger: 3662 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3663 zExp = aExp; 3664 normalizeRoundAndPack: 3665 return 3666 normalizeRoundAndPackFloatx80( 3667 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3668 3669 } 3670 3671 /*---------------------------------------------------------------------------- 3672 | Returns the result of adding the extended double-precision floating-point 3673 | values `a' and `b'. The operation is performed according to the IEC/IEEE 3674 | Standard for Binary Floating-Point Arithmetic. 3675 *----------------------------------------------------------------------------*/ 3676 3677 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3678 { 3679 flag aSign, bSign; 3680 3681 aSign = extractFloatx80Sign( a ); 3682 bSign = extractFloatx80Sign( b ); 3683 if ( aSign == bSign ) { 3684 return addFloatx80Sigs( a, b, aSign ); 3685 } 3686 else { 3687 return subFloatx80Sigs( a, b, aSign ); 3688 } 3689 3690 } 3691 3692 /*---------------------------------------------------------------------------- 3693 | Returns the result of subtracting the extended double-precision floating- 3694 | point values `a' and `b'. The operation is performed according to the 3695 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3696 *----------------------------------------------------------------------------*/ 3697 3698 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3699 { 3700 flag aSign, bSign; 3701 3702 aSign = extractFloatx80Sign( a ); 3703 bSign = extractFloatx80Sign( b ); 3704 if ( aSign == bSign ) { 3705 return subFloatx80Sigs( a, b, aSign ); 3706 } 3707 else { 3708 return addFloatx80Sigs( a, b, aSign ); 3709 } 3710 3711 } 3712 3713 /*---------------------------------------------------------------------------- 3714 | Returns the result of multiplying the extended double-precision floating- 3715 | point values `a' and `b'. The operation is performed according to the 3716 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3717 *----------------------------------------------------------------------------*/ 3718 3719 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3720 { 3721 flag aSign, bSign, zSign; 3722 int32 aExp, bExp, zExp; 3723 bits64 aSig, bSig, zSig0, zSig1; 3724 floatx80 z; 3725 3726 aSig = extractFloatx80Frac( a ); 3727 aExp = extractFloatx80Exp( a ); 3728 aSign = extractFloatx80Sign( a ); 3729 bSig = extractFloatx80Frac( b ); 3730 bExp = extractFloatx80Exp( b ); 3731 bSign = extractFloatx80Sign( b ); 3732 zSign = aSign ^ bSign; 3733 if ( aExp == 0x7FFF ) { 3734 if ( (bits64) ( aSig<<1 ) 3735 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3736 return propagateFloatx80NaN( a, b ); 3737 } 3738 if ( ( bExp | bSig ) == 0 ) goto invalid; 3739 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3740 } 3741 if ( bExp == 0x7FFF ) { 3742 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3743 if ( ( aExp | aSig ) == 0 ) { 3744 invalid: 3745 float_raise( float_flag_invalid ); 3746 z.low = floatx80_default_nan_low; 3747 z.high = floatx80_default_nan_high; 3748 return z; 3749 } 3750 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3751 } 3752 if ( aExp == 0 ) { 3753 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3754 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3755 } 3756 if ( bExp == 0 ) { 3757 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3758 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3759 } 3760 zExp = aExp + bExp - 0x3FFE; 3761 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3762 if ( 0 < (sbits64) zSig0 ) { 3763 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3764 --zExp; 3765 } 3766 return 3767 roundAndPackFloatx80( 3768 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3769 3770 } 3771 3772 /*---------------------------------------------------------------------------- 3773 | Returns the result of dividing the extended double-precision floating-point 3774 | value `a' by the corresponding value `b'. The operation is performed 3775 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3776 *----------------------------------------------------------------------------*/ 3777 3778 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3779 { 3780 flag aSign, bSign, zSign; 3781 int32 aExp, bExp, zExp; 3782 bits64 aSig, bSig, zSig0, zSig1; 3783 bits64 rem0, rem1, rem2, term0, term1, term2; 3784 floatx80 z; 3785 3786 aSig = extractFloatx80Frac( a ); 3787 aExp = extractFloatx80Exp( a ); 3788 aSign = extractFloatx80Sign( a ); 3789 bSig = extractFloatx80Frac( b ); 3790 bExp = extractFloatx80Exp( b ); 3791 bSign = extractFloatx80Sign( b ); 3792 zSign = aSign ^ bSign; 3793 if ( aExp == 0x7FFF ) { 3794 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3795 if ( bExp == 0x7FFF ) { 3796 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3797 goto invalid; 3798 } 3799 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3800 } 3801 if ( bExp == 0x7FFF ) { 3802 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3803 return packFloatx80( zSign, 0, 0 ); 3804 } 3805 if ( bExp == 0 ) { 3806 if ( bSig == 0 ) { 3807 if ( ( aExp | aSig ) == 0 ) { 3808 invalid: 3809 float_raise( float_flag_invalid ); 3810 z.low = floatx80_default_nan_low; 3811 z.high = floatx80_default_nan_high; 3812 return z; 3813 } 3814 float_raise( float_flag_divbyzero ); 3815 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3816 } 3817 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3818 } 3819 if ( aExp == 0 ) { 3820 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3821 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3822 } 3823 zExp = aExp - bExp + 0x3FFE; 3824 rem1 = 0; 3825 if ( bSig <= aSig ) { 3826 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3827 ++zExp; 3828 } 3829 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3830 mul64To128( bSig, zSig0, &term0, &term1 ); 3831 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3832 while ( (sbits64) rem0 < 0 ) { 3833 --zSig0; 3834 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3835 } 3836 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3837 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3838 mul64To128( bSig, zSig1, &term1, &term2 ); 3839 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3840 while ( (sbits64) rem1 < 0 ) { 3841 --zSig1; 3842 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3843 } 3844 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3845 } 3846 return 3847 roundAndPackFloatx80( 3848 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3849 3850 } 3851 3852 /*---------------------------------------------------------------------------- 3853 | Returns the remainder of the extended double-precision floating-point value 3854 | `a' with respect to the corresponding value `b'. The operation is performed 3855 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3856 *----------------------------------------------------------------------------*/ 3857 3858 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3859 { 3860 flag aSign, bSign, zSign; 3861 int32 aExp, bExp, expDiff; 3862 bits64 aSig0, aSig1, bSig; 3863 bits64 q, term0, term1, alternateASig0, alternateASig1; 3864 floatx80 z; 3865 3866 aSig0 = extractFloatx80Frac( a ); 3867 aExp = extractFloatx80Exp( a ); 3868 aSign = extractFloatx80Sign( a ); 3869 bSig = extractFloatx80Frac( b ); 3870 bExp = extractFloatx80Exp( b ); 3871 bSign = extractFloatx80Sign( b ); 3872 if ( aExp == 0x7FFF ) { 3873 if ( (bits64) ( aSig0<<1 ) 3874 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3875 return propagateFloatx80NaN( a, b ); 3876 } 3877 goto invalid; 3878 } 3879 if ( bExp == 0x7FFF ) { 3880 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3881 return a; 3882 } 3883 if ( bExp == 0 ) { 3884 if ( bSig == 0 ) { 3885 invalid: 3886 float_raise( float_flag_invalid ); 3887 z.low = floatx80_default_nan_low; 3888 z.high = floatx80_default_nan_high; 3889 return z; 3890 } 3891 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3892 } 3893 if ( aExp == 0 ) { 3894 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3895 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3896 } 3897 bSig |= LIT64( 0x8000000000000000 ); 3898 zSign = aSign; 3899 expDiff = aExp - bExp; 3900 aSig1 = 0; 3901 if ( expDiff < 0 ) { 3902 if ( expDiff < -1 ) return a; 3903 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3904 expDiff = 0; 3905 } 3906 q = ( bSig <= aSig0 ); 3907 if ( q ) aSig0 -= bSig; 3908 expDiff -= 64; 3909 while ( 0 < expDiff ) { 3910 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3911 q = ( 2 < q ) ? q - 2 : 0; 3912 mul64To128( bSig, q, &term0, &term1 ); 3913 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3914 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3915 expDiff -= 62; 3916 } 3917 expDiff += 64; 3918 if ( 0 < expDiff ) { 3919 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3920 q = ( 2 < q ) ? q - 2 : 0; 3921 q >>= 64 - expDiff; 3922 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 3923 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3924 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 3925 while ( le128( term0, term1, aSig0, aSig1 ) ) { 3926 ++q; 3927 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3928 } 3929 } 3930 else { 3931 term1 = 0; 3932 term0 = bSig; 3933 } 3934 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 3935 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3936 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3937 && ( q & 1 ) ) 3938 ) { 3939 aSig0 = alternateASig0; 3940 aSig1 = alternateASig1; 3941 zSign = ! zSign; 3942 } 3943 return 3944 normalizeRoundAndPackFloatx80( 3945 80, zSign, bExp + expDiff, aSig0, aSig1 ); 3946 3947 } 3948 3949 /*---------------------------------------------------------------------------- 3950 | Returns the square root of the extended double-precision floating-point 3951 | value `a'. The operation is performed according to the IEC/IEEE Standard 3952 | for Binary Floating-Point Arithmetic. 3953 *----------------------------------------------------------------------------*/ 3954 3955 floatx80 floatx80_sqrt( floatx80 a ) 3956 { 3957 flag aSign; 3958 int32 aExp, zExp; 3959 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 3960 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 3961 floatx80 z; 3962 3963 aSig0 = extractFloatx80Frac( a ); 3964 aExp = extractFloatx80Exp( a ); 3965 aSign = extractFloatx80Sign( a ); 3966 if ( aExp == 0x7FFF ) { 3967 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 3968 if ( ! aSign ) return a; 3969 goto invalid; 3970 } 3971 if ( aSign ) { 3972 if ( ( aExp | aSig0 ) == 0 ) return a; 3973 invalid: 3974 float_raise( float_flag_invalid ); 3975 z.low = floatx80_default_nan_low; 3976 z.high = floatx80_default_nan_high; 3977 return z; 3978 } 3979 if ( aExp == 0 ) { 3980 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 3981 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3982 } 3983 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 3984 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 3985 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 3986 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 3987 doubleZSig0 = zSig0<<1; 3988 mul64To128( zSig0, zSig0, &term0, &term1 ); 3989 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 3990 while ( (sbits64) rem0 < 0 ) { 3991 --zSig0; 3992 doubleZSig0 -= 2; 3993 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 3994 } 3995 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 3996 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 3997 if ( zSig1 == 0 ) zSig1 = 1; 3998 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 3999 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4000 mul64To128( zSig1, zSig1, &term2, &term3 ); 4001 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4002 while ( (sbits64) rem1 < 0 ) { 4003 --zSig1; 4004 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4005 term3 |= 1; 4006 term2 |= doubleZSig0; 4007 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4008 } 4009 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4010 } 4011 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4012 zSig0 |= doubleZSig0; 4013 return 4014 roundAndPackFloatx80( 4015 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4016 4017 } 4018 4019 /*---------------------------------------------------------------------------- 4020 | Returns 1 if the extended double-precision floating-point value `a' is 4021 | equal to the corresponding value `b', and 0 otherwise. The comparison is 4022 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4023 | Arithmetic. 4024 *----------------------------------------------------------------------------*/ 4025 4026 flag floatx80_eq( floatx80 a, floatx80 b ) 4027 { 4028 4029 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4030 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4031 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4032 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4033 ) { 4034 if ( floatx80_is_signaling_nan( a ) 4035 || floatx80_is_signaling_nan( b ) ) { 4036 float_raise( float_flag_invalid ); 4037 } 4038 return 0; 4039 } 4040 return 4041 ( a.low == b.low ) 4042 && ( ( a.high == b.high ) 4043 || ( ( a.low == 0 ) 4044 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4045 ); 4046 4047 } 4048 4049 /*---------------------------------------------------------------------------- 4050 | Returns 1 if the extended double-precision floating-point value `a' is 4051 | less than or equal to the corresponding value `b', and 0 otherwise. The 4052 | comparison is performed according to the IEC/IEEE Standard for Binary 4053 | Floating-Point Arithmetic. 4054 *----------------------------------------------------------------------------*/ 4055 4056 flag floatx80_le( floatx80 a, floatx80 b ) 4057 { 4058 flag aSign, bSign; 4059 4060 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4061 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4062 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4063 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4064 ) { 4065 float_raise( float_flag_invalid ); 4066 return 0; 4067 } 4068 aSign = extractFloatx80Sign( a ); 4069 bSign = extractFloatx80Sign( b ); 4070 if ( aSign != bSign ) { 4071 return 4072 aSign 4073 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4074 == 0 ); 4075 } 4076 return 4077 aSign ? le128( b.high, b.low, a.high, a.low ) 4078 : le128( a.high, a.low, b.high, b.low ); 4079 4080 } 4081 4082 /*---------------------------------------------------------------------------- 4083 | Returns 1 if the extended double-precision floating-point value `a' is 4084 | less than the corresponding value `b', and 0 otherwise. The comparison 4085 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4086 | Arithmetic. 4087 *----------------------------------------------------------------------------*/ 4088 4089 flag floatx80_lt( floatx80 a, floatx80 b ) 4090 { 4091 flag aSign, bSign; 4092 4093 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4094 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4095 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4096 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4097 ) { 4098 float_raise( float_flag_invalid ); 4099 return 0; 4100 } 4101 aSign = extractFloatx80Sign( a ); 4102 bSign = extractFloatx80Sign( b ); 4103 if ( aSign != bSign ) { 4104 return 4105 aSign 4106 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4107 != 0 ); 4108 } 4109 return 4110 aSign ? lt128( b.high, b.low, a.high, a.low ) 4111 : lt128( a.high, a.low, b.high, b.low ); 4112 4113 } 4114 4115 /*---------------------------------------------------------------------------- 4116 | Returns 1 if the extended double-precision floating-point value `a' is equal 4117 | to the corresponding value `b', and 0 otherwise. The invalid exception is 4118 | raised if either operand is a NaN. Otherwise, the comparison is performed 4119 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4120 *----------------------------------------------------------------------------*/ 4121 4122 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4123 { 4124 4125 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4126 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4127 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4128 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4129 ) { 4130 float_raise( float_flag_invalid ); 4131 return 0; 4132 } 4133 return 4134 ( a.low == b.low ) 4135 && ( ( a.high == b.high ) 4136 || ( ( a.low == 0 ) 4137 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4138 ); 4139 4140 } 4141 4142 /*---------------------------------------------------------------------------- 4143 | Returns 1 if the extended double-precision floating-point value `a' is less 4144 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4145 | do not cause an exception. Otherwise, the comparison is performed according 4146 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4147 *----------------------------------------------------------------------------*/ 4148 4149 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4150 { 4151 flag aSign, bSign; 4152 4153 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4154 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4155 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4156 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4157 ) { 4158 if ( floatx80_is_signaling_nan( a ) 4159 || floatx80_is_signaling_nan( b ) ) { 4160 float_raise( float_flag_invalid ); 4161 } 4162 return 0; 4163 } 4164 aSign = extractFloatx80Sign( a ); 4165 bSign = extractFloatx80Sign( b ); 4166 if ( aSign != bSign ) { 4167 return 4168 aSign 4169 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4170 == 0 ); 4171 } 4172 return 4173 aSign ? le128( b.high, b.low, a.high, a.low ) 4174 : le128( a.high, a.low, b.high, b.low ); 4175 4176 } 4177 4178 /*---------------------------------------------------------------------------- 4179 | Returns 1 if the extended double-precision floating-point value `a' is less 4180 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4181 | an exception. Otherwise, the comparison is performed according to the 4182 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4183 *----------------------------------------------------------------------------*/ 4184 4185 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4186 { 4187 flag aSign, bSign; 4188 4189 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4190 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4191 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4192 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4193 ) { 4194 if ( floatx80_is_signaling_nan( a ) 4195 || floatx80_is_signaling_nan( b ) ) { 4196 float_raise( float_flag_invalid ); 4197 } 4198 return 0; 4199 } 4200 aSign = extractFloatx80Sign( a ); 4201 bSign = extractFloatx80Sign( b ); 4202 if ( aSign != bSign ) { 4203 return 4204 aSign 4205 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4206 != 0 ); 4207 } 4208 return 4209 aSign ? lt128( b.high, b.low, a.high, a.low ) 4210 : lt128( a.high, a.low, b.high, b.low ); 4211 4212 } 4213 4214 #endif 4215 4216 #ifdef FLOAT128 4217 4218 /*---------------------------------------------------------------------------- 4219 | Returns the result of converting the quadruple-precision floating-point 4220 | value `a' to the 32-bit two's complement integer format. The conversion 4221 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4222 | Arithmetic---which means in particular that the conversion is rounded 4223 | according to the current rounding mode. If `a' is a NaN, the largest 4224 | positive integer is returned. Otherwise, if the conversion overflows, the 4225 | largest integer with the same sign as `a' is returned. 4226 *----------------------------------------------------------------------------*/ 4227 4228 int32 float128_to_int32( float128 a ) 4229 { 4230 flag aSign; 4231 int32 aExp, shiftCount; 4232 bits64 aSig0, aSig1; 4233 4234 aSig1 = extractFloat128Frac1( a ); 4235 aSig0 = extractFloat128Frac0( a ); 4236 aExp = extractFloat128Exp( a ); 4237 aSign = extractFloat128Sign( a ); 4238 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4239 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4240 aSig0 |= ( aSig1 != 0 ); 4241 shiftCount = 0x4028 - aExp; 4242 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4243 return roundAndPackInt32( aSign, aSig0 ); 4244 4245 } 4246 4247 /*---------------------------------------------------------------------------- 4248 | Returns the result of converting the quadruple-precision floating-point 4249 | value `a' to the 32-bit two's complement integer format. The conversion 4250 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4251 | Arithmetic, except that the conversion is always rounded toward zero. If 4252 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4253 | conversion overflows, the largest integer with the same sign as `a' is 4254 | returned. 4255 *----------------------------------------------------------------------------*/ 4256 4257 int32 float128_to_int32_round_to_zero( float128 a ) 4258 { 4259 flag aSign; 4260 int32 aExp, shiftCount; 4261 bits64 aSig0, aSig1, savedASig; 4262 int32 z; 4263 4264 aSig1 = extractFloat128Frac1( a ); 4265 aSig0 = extractFloat128Frac0( a ); 4266 aExp = extractFloat128Exp( a ); 4267 aSign = extractFloat128Sign( a ); 4268 aSig0 |= ( aSig1 != 0 ); 4269 if ( 0x401E < aExp ) { 4270 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4271 goto invalid; 4272 } 4273 else if ( aExp < 0x3FFF ) { 4274 if ( aExp || aSig0 ) float_set_inexact(); 4275 return 0; 4276 } 4277 aSig0 |= LIT64( 0x0001000000000000 ); 4278 shiftCount = 0x402F - aExp; 4279 savedASig = aSig0; 4280 aSig0 >>= shiftCount; 4281 z = aSig0; 4282 if ( aSign ) z = - z; 4283 if ( ( z < 0 ) ^ aSign ) { 4284 invalid: 4285 float_raise( float_flag_invalid ); 4286 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4287 } 4288 if ( ( aSig0<<shiftCount ) != savedASig ) { 4289 float_set_inexact(); 4290 } 4291 return z; 4292 4293 } 4294 4295 /*---------------------------------------------------------------------------- 4296 | Returns the result of converting the quadruple-precision floating-point 4297 | value `a' to the 64-bit two's complement integer format. The conversion 4298 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4299 | Arithmetic---which means in particular that the conversion is rounded 4300 | according to the current rounding mode. If `a' is a NaN, the largest 4301 | positive integer is returned. Otherwise, if the conversion overflows, the 4302 | largest integer with the same sign as `a' is returned. 4303 *----------------------------------------------------------------------------*/ 4304 4305 int64 float128_to_int64( float128 a ) 4306 { 4307 flag aSign; 4308 int32 aExp, shiftCount; 4309 bits64 aSig0, aSig1; 4310 4311 aSig1 = extractFloat128Frac1( a ); 4312 aSig0 = extractFloat128Frac0( a ); 4313 aExp = extractFloat128Exp( a ); 4314 aSign = extractFloat128Sign( a ); 4315 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4316 shiftCount = 0x402F - aExp; 4317 if ( shiftCount <= 0 ) { 4318 if ( 0x403E < aExp ) { 4319 float_raise( float_flag_invalid ); 4320 if ( ! aSign 4321 || ( ( aExp == 0x7FFF ) 4322 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4323 ) 4324 ) { 4325 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4326 } 4327 return (sbits64) LIT64( 0x8000000000000000 ); 4328 } 4329 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4330 } 4331 else { 4332 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4333 } 4334 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4335 4336 } 4337 4338 /*---------------------------------------------------------------------------- 4339 | Returns the result of converting the quadruple-precision floating-point 4340 | value `a' to the 64-bit two's complement integer format. The conversion 4341 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4342 | Arithmetic, except that the conversion is always rounded toward zero. 4343 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4344 | the conversion overflows, the largest integer with the same sign as `a' is 4345 | returned. 4346 *----------------------------------------------------------------------------*/ 4347 4348 int64 float128_to_int64_round_to_zero( float128 a ) 4349 { 4350 flag aSign; 4351 int32 aExp, shiftCount; 4352 bits64 aSig0, aSig1; 4353 int64 z; 4354 4355 aSig1 = extractFloat128Frac1( a ); 4356 aSig0 = extractFloat128Frac0( a ); 4357 aExp = extractFloat128Exp( a ); 4358 aSign = extractFloat128Sign( a ); 4359 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4360 shiftCount = aExp - 0x402F; 4361 if ( 0 < shiftCount ) { 4362 if ( 0x403E <= aExp ) { 4363 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4364 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4365 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4366 if ( aSig1 ) float_set_inexact(); 4367 } 4368 else { 4369 float_raise( float_flag_invalid ); 4370 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4371 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4372 } 4373 } 4374 return (sbits64) LIT64( 0x8000000000000000 ); 4375 } 4376 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4377 if ( (bits64) ( aSig1<<shiftCount ) ) { 4378 float_set_inexact(); 4379 } 4380 } 4381 else { 4382 if ( aExp < 0x3FFF ) { 4383 if ( aExp | aSig0 | aSig1 ) { 4384 float_set_inexact(); 4385 } 4386 return 0; 4387 } 4388 z = aSig0>>( - shiftCount ); 4389 if ( aSig1 4390 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4391 float_set_inexact(); 4392 } 4393 } 4394 if ( aSign ) z = - z; 4395 return z; 4396 4397 } 4398 4399 /*---------------------------------------------------------------------------- 4400 | Returns the result of converting the quadruple-precision floating-point 4401 | value `a' to the single-precision floating-point format. The conversion 4402 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4403 | Arithmetic. 4404 *----------------------------------------------------------------------------*/ 4405 4406 float32 float128_to_float32( float128 a ) 4407 { 4408 flag aSign; 4409 int32 aExp; 4410 bits64 aSig0, aSig1; 4411 bits32 zSig; 4412 4413 aSig1 = extractFloat128Frac1( a ); 4414 aSig0 = extractFloat128Frac0( a ); 4415 aExp = extractFloat128Exp( a ); 4416 aSign = extractFloat128Sign( a ); 4417 if ( aExp == 0x7FFF ) { 4418 if ( aSig0 | aSig1 ) { 4419 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4420 } 4421 return packFloat32( aSign, 0xFF, 0 ); 4422 } 4423 aSig0 |= ( aSig1 != 0 ); 4424 shift64RightJamming( aSig0, 18, &aSig0 ); 4425 zSig = aSig0; 4426 if ( aExp || zSig ) { 4427 zSig |= 0x40000000; 4428 aExp -= 0x3F81; 4429 } 4430 return roundAndPackFloat32( aSign, aExp, zSig ); 4431 4432 } 4433 4434 /*---------------------------------------------------------------------------- 4435 | Returns the result of converting the quadruple-precision floating-point 4436 | value `a' to the double-precision floating-point format. The conversion 4437 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4438 | Arithmetic. 4439 *----------------------------------------------------------------------------*/ 4440 4441 float64 float128_to_float64( float128 a ) 4442 { 4443 flag aSign; 4444 int32 aExp; 4445 bits64 aSig0, aSig1; 4446 4447 aSig1 = extractFloat128Frac1( a ); 4448 aSig0 = extractFloat128Frac0( a ); 4449 aExp = extractFloat128Exp( a ); 4450 aSign = extractFloat128Sign( a ); 4451 if ( aExp == 0x7FFF ) { 4452 if ( aSig0 | aSig1 ) { 4453 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4454 } 4455 return packFloat64( aSign, 0x7FF, 0 ); 4456 } 4457 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4458 aSig0 |= ( aSig1 != 0 ); 4459 if ( aExp || aSig0 ) { 4460 aSig0 |= LIT64( 0x4000000000000000 ); 4461 aExp -= 0x3C01; 4462 } 4463 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4464 4465 } 4466 4467 #ifdef FLOATX80 4468 4469 /*---------------------------------------------------------------------------- 4470 | Returns the result of converting the quadruple-precision floating-point 4471 | value `a' to the extended double-precision floating-point format. The 4472 | conversion is performed according to the IEC/IEEE Standard for Binary 4473 | Floating-Point Arithmetic. 4474 *----------------------------------------------------------------------------*/ 4475 4476 floatx80 float128_to_floatx80( float128 a ) 4477 { 4478 flag aSign; 4479 int32 aExp; 4480 bits64 aSig0, aSig1; 4481 4482 aSig1 = extractFloat128Frac1( a ); 4483 aSig0 = extractFloat128Frac0( a ); 4484 aExp = extractFloat128Exp( a ); 4485 aSign = extractFloat128Sign( a ); 4486 if ( aExp == 0x7FFF ) { 4487 if ( aSig0 | aSig1 ) { 4488 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4489 } 4490 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4491 } 4492 if ( aExp == 0 ) { 4493 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4494 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4495 } 4496 else { 4497 aSig0 |= LIT64( 0x0001000000000000 ); 4498 } 4499 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4500 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4501 4502 } 4503 4504 #endif 4505 4506 /*---------------------------------------------------------------------------- 4507 | Rounds the quadruple-precision floating-point value `a' to an integer, and 4508 | returns the result as a quadruple-precision floating-point value. The 4509 | operation is performed according to the IEC/IEEE Standard for Binary 4510 | Floating-Point Arithmetic. 4511 *----------------------------------------------------------------------------*/ 4512 4513 float128 float128_round_to_int( float128 a ) 4514 { 4515 flag aSign; 4516 int32 aExp; 4517 bits64 lastBitMask, roundBitsMask; 4518 int8 roundingMode; 4519 float128 z; 4520 4521 aExp = extractFloat128Exp( a ); 4522 if ( 0x402F <= aExp ) { 4523 if ( 0x406F <= aExp ) { 4524 if ( ( aExp == 0x7FFF ) 4525 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4526 ) { 4527 return propagateFloat128NaN( a, a ); 4528 } 4529 return a; 4530 } 4531 lastBitMask = 1; 4532 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4533 roundBitsMask = lastBitMask - 1; 4534 z = a; 4535 roundingMode = float_rounding_mode(); 4536 if ( roundingMode == float_round_nearest_even ) { 4537 if ( lastBitMask ) { 4538 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4539 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4540 } 4541 else { 4542 if ( (sbits64) z.low < 0 ) { 4543 ++z.high; 4544 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4545 } 4546 } 4547 } 4548 else if ( roundingMode != float_round_to_zero ) { 4549 if ( extractFloat128Sign( z ) 4550 ^ ( roundingMode == float_round_up ) ) { 4551 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4552 } 4553 } 4554 z.low &= ~ roundBitsMask; 4555 } 4556 else { 4557 if ( aExp < 0x3FFF ) { 4558 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4559 float_set_inexact(); 4560 aSign = extractFloat128Sign( a ); 4561 switch ( float_rounding_mode() ) { 4562 case float_round_nearest_even: 4563 if ( ( aExp == 0x3FFE ) 4564 && ( extractFloat128Frac0( a ) 4565 | extractFloat128Frac1( a ) ) 4566 ) { 4567 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4568 } 4569 break; 4570 case float_round_down: 4571 return 4572 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4573 : packFloat128( 0, 0, 0, 0 ); 4574 case float_round_up: 4575 return 4576 aSign ? packFloat128( 1, 0, 0, 0 ) 4577 : packFloat128( 0, 0x3FFF, 0, 0 ); 4578 } 4579 return packFloat128( aSign, 0, 0, 0 ); 4580 } 4581 lastBitMask = 1; 4582 lastBitMask <<= 0x402F - aExp; 4583 roundBitsMask = lastBitMask - 1; 4584 z.low = 0; 4585 z.high = a.high; 4586 roundingMode = float_rounding_mode(); 4587 if ( roundingMode == float_round_nearest_even ) { 4588 z.high += lastBitMask>>1; 4589 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4590 z.high &= ~ lastBitMask; 4591 } 4592 } 4593 else if ( roundingMode != float_round_to_zero ) { 4594 if ( extractFloat128Sign( z ) 4595 ^ ( roundingMode == float_round_up ) ) { 4596 z.high |= ( a.low != 0 ); 4597 z.high += roundBitsMask; 4598 } 4599 } 4600 z.high &= ~ roundBitsMask; 4601 } 4602 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4603 float_set_inexact(); 4604 } 4605 return z; 4606 4607 } 4608 4609 /*---------------------------------------------------------------------------- 4610 | Returns the result of adding the absolute values of the quadruple-precision 4611 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4612 | before being returned. `zSign' is ignored if the result is a NaN. 4613 | The addition is performed according to the IEC/IEEE Standard for Binary 4614 | Floating-Point Arithmetic. 4615 *----------------------------------------------------------------------------*/ 4616 4617 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4618 { 4619 int32 aExp, bExp, zExp; 4620 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4621 int32 expDiff; 4622 4623 aSig1 = extractFloat128Frac1( a ); 4624 aSig0 = extractFloat128Frac0( a ); 4625 aExp = extractFloat128Exp( a ); 4626 bSig1 = extractFloat128Frac1( b ); 4627 bSig0 = extractFloat128Frac0( b ); 4628 bExp = extractFloat128Exp( b ); 4629 expDiff = aExp - bExp; 4630 if ( 0 < expDiff ) { 4631 if ( aExp == 0x7FFF ) { 4632 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4633 return a; 4634 } 4635 if ( bExp == 0 ) { 4636 --expDiff; 4637 } 4638 else { 4639 bSig0 |= LIT64( 0x0001000000000000 ); 4640 } 4641 shift128ExtraRightJamming( 4642 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4643 zExp = aExp; 4644 } 4645 else if ( expDiff < 0 ) { 4646 if ( bExp == 0x7FFF ) { 4647 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4648 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4649 } 4650 if ( aExp == 0 ) { 4651 ++expDiff; 4652 } 4653 else { 4654 aSig0 |= LIT64( 0x0001000000000000 ); 4655 } 4656 shift128ExtraRightJamming( 4657 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4658 zExp = bExp; 4659 } 4660 else { 4661 if ( aExp == 0x7FFF ) { 4662 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4663 return propagateFloat128NaN( a, b ); 4664 } 4665 return a; 4666 } 4667 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4668 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4669 zSig2 = 0; 4670 zSig0 |= LIT64( 0x0002000000000000 ); 4671 zExp = aExp; 4672 goto shiftRight1; 4673 } 4674 aSig0 |= LIT64( 0x0001000000000000 ); 4675 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4676 --zExp; 4677 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4678 ++zExp; 4679 shiftRight1: 4680 shift128ExtraRightJamming( 4681 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4682 roundAndPack: 4683 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4684 4685 } 4686 4687 /*---------------------------------------------------------------------------- 4688 | Returns the result of subtracting the absolute values of the quadruple- 4689 | precision floating-point values `a' and `b'. If `zSign' is 1, the 4690 | difference is negated before being returned. `zSign' is ignored if the 4691 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4692 | Standard for Binary Floating-Point Arithmetic. 4693 *----------------------------------------------------------------------------*/ 4694 4695 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4696 { 4697 int32 aExp, bExp, zExp; 4698 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4699 int32 expDiff; 4700 float128 z; 4701 4702 aSig1 = extractFloat128Frac1( a ); 4703 aSig0 = extractFloat128Frac0( a ); 4704 aExp = extractFloat128Exp( a ); 4705 bSig1 = extractFloat128Frac1( b ); 4706 bSig0 = extractFloat128Frac0( b ); 4707 bExp = extractFloat128Exp( b ); 4708 expDiff = aExp - bExp; 4709 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4710 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4711 if ( 0 < expDiff ) goto aExpBigger; 4712 if ( expDiff < 0 ) goto bExpBigger; 4713 if ( aExp == 0x7FFF ) { 4714 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4715 return propagateFloat128NaN( a, b ); 4716 } 4717 float_raise( float_flag_invalid ); 4718 z.low = float128_default_nan_low; 4719 z.high = float128_default_nan_high; 4720 return z; 4721 } 4722 if ( aExp == 0 ) { 4723 aExp = 1; 4724 bExp = 1; 4725 } 4726 if ( bSig0 < aSig0 ) goto aBigger; 4727 if ( aSig0 < bSig0 ) goto bBigger; 4728 if ( bSig1 < aSig1 ) goto aBigger; 4729 if ( aSig1 < bSig1 ) goto bBigger; 4730 return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 ); 4731 bExpBigger: 4732 if ( bExp == 0x7FFF ) { 4733 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4734 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4735 } 4736 if ( aExp == 0 ) { 4737 ++expDiff; 4738 } 4739 else { 4740 aSig0 |= LIT64( 0x4000000000000000 ); 4741 } 4742 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4743 bSig0 |= LIT64( 0x4000000000000000 ); 4744 bBigger: 4745 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4746 zExp = bExp; 4747 zSign ^= 1; 4748 goto normalizeRoundAndPack; 4749 aExpBigger: 4750 if ( aExp == 0x7FFF ) { 4751 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4752 return a; 4753 } 4754 if ( bExp == 0 ) { 4755 --expDiff; 4756 } 4757 else { 4758 bSig0 |= LIT64( 0x4000000000000000 ); 4759 } 4760 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4761 aSig0 |= LIT64( 0x4000000000000000 ); 4762 aBigger: 4763 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4764 zExp = aExp; 4765 normalizeRoundAndPack: 4766 --zExp; 4767 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4768 4769 } 4770 4771 /*---------------------------------------------------------------------------- 4772 | Returns the result of adding the quadruple-precision floating-point values 4773 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4774 | for Binary Floating-Point Arithmetic. 4775 *----------------------------------------------------------------------------*/ 4776 4777 float128 float128_add( float128 a, float128 b ) 4778 { 4779 flag aSign, bSign; 4780 4781 aSign = extractFloat128Sign( a ); 4782 bSign = extractFloat128Sign( b ); 4783 if ( aSign == bSign ) { 4784 return addFloat128Sigs( a, b, aSign ); 4785 } 4786 else { 4787 return subFloat128Sigs( a, b, aSign ); 4788 } 4789 4790 } 4791 4792 /*---------------------------------------------------------------------------- 4793 | Returns the result of subtracting the quadruple-precision floating-point 4794 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4795 | Standard for Binary Floating-Point Arithmetic. 4796 *----------------------------------------------------------------------------*/ 4797 4798 float128 float128_sub( float128 a, float128 b ) 4799 { 4800 flag aSign, bSign; 4801 4802 aSign = extractFloat128Sign( a ); 4803 bSign = extractFloat128Sign( b ); 4804 if ( aSign == bSign ) { 4805 return subFloat128Sigs( a, b, aSign ); 4806 } 4807 else { 4808 return addFloat128Sigs( a, b, aSign ); 4809 } 4810 4811 } 4812 4813 /*---------------------------------------------------------------------------- 4814 | Returns the result of multiplying the quadruple-precision floating-point 4815 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4816 | Standard for Binary Floating-Point Arithmetic. 4817 *----------------------------------------------------------------------------*/ 4818 4819 float128 float128_mul( float128 a, float128 b ) 4820 { 4821 flag aSign, bSign, zSign; 4822 int32 aExp, bExp, zExp; 4823 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4824 float128 z; 4825 4826 aSig1 = extractFloat128Frac1( a ); 4827 aSig0 = extractFloat128Frac0( a ); 4828 aExp = extractFloat128Exp( a ); 4829 aSign = extractFloat128Sign( a ); 4830 bSig1 = extractFloat128Frac1( b ); 4831 bSig0 = extractFloat128Frac0( b ); 4832 bExp = extractFloat128Exp( b ); 4833 bSign = extractFloat128Sign( b ); 4834 zSign = aSign ^ bSign; 4835 if ( aExp == 0x7FFF ) { 4836 if ( ( aSig0 | aSig1 ) 4837 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4838 return propagateFloat128NaN( a, b ); 4839 } 4840 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4841 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4842 } 4843 if ( bExp == 0x7FFF ) { 4844 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4845 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4846 invalid: 4847 float_raise( float_flag_invalid ); 4848 z.low = float128_default_nan_low; 4849 z.high = float128_default_nan_high; 4850 return z; 4851 } 4852 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4853 } 4854 if ( aExp == 0 ) { 4855 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4856 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4857 } 4858 if ( bExp == 0 ) { 4859 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4860 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4861 } 4862 zExp = aExp + bExp - 0x4000; 4863 aSig0 |= LIT64( 0x0001000000000000 ); 4864 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 4865 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 4866 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4867 zSig2 |= ( zSig3 != 0 ); 4868 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 4869 shift128ExtraRightJamming( 4870 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4871 ++zExp; 4872 } 4873 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4874 4875 } 4876 4877 /*---------------------------------------------------------------------------- 4878 | Returns the result of dividing the quadruple-precision floating-point value 4879 | `a' by the corresponding value `b'. The operation is performed according to 4880 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4881 *----------------------------------------------------------------------------*/ 4882 4883 float128 float128_div( float128 a, float128 b ) 4884 { 4885 flag aSign, bSign, zSign; 4886 int32 aExp, bExp, zExp; 4887 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4888 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4889 float128 z; 4890 4891 aSig1 = extractFloat128Frac1( a ); 4892 aSig0 = extractFloat128Frac0( a ); 4893 aExp = extractFloat128Exp( a ); 4894 aSign = extractFloat128Sign( a ); 4895 bSig1 = extractFloat128Frac1( b ); 4896 bSig0 = extractFloat128Frac0( b ); 4897 bExp = extractFloat128Exp( b ); 4898 bSign = extractFloat128Sign( b ); 4899 zSign = aSign ^ bSign; 4900 if ( aExp == 0x7FFF ) { 4901 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4902 if ( bExp == 0x7FFF ) { 4903 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4904 goto invalid; 4905 } 4906 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4907 } 4908 if ( bExp == 0x7FFF ) { 4909 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4910 return packFloat128( zSign, 0, 0, 0 ); 4911 } 4912 if ( bExp == 0 ) { 4913 if ( ( bSig0 | bSig1 ) == 0 ) { 4914 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4915 invalid: 4916 float_raise( float_flag_invalid ); 4917 z.low = float128_default_nan_low; 4918 z.high = float128_default_nan_high; 4919 return z; 4920 } 4921 float_raise( float_flag_divbyzero ); 4922 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4923 } 4924 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4925 } 4926 if ( aExp == 0 ) { 4927 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4928 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4929 } 4930 zExp = aExp - bExp + 0x3FFD; 4931 shortShift128Left( 4932 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 4933 shortShift128Left( 4934 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 4935 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 4936 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 4937 ++zExp; 4938 } 4939 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 4940 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 4941 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 4942 while ( (sbits64) rem0 < 0 ) { 4943 --zSig0; 4944 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 4945 } 4946 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 4947 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 4948 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 4949 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 4950 while ( (sbits64) rem1 < 0 ) { 4951 --zSig1; 4952 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 4953 } 4954 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4955 } 4956 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 4957 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4958 4959 } 4960 4961 /*---------------------------------------------------------------------------- 4962 | Returns the remainder of the quadruple-precision floating-point value `a' 4963 | with respect to the corresponding value `b'. The operation is performed 4964 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4965 *----------------------------------------------------------------------------*/ 4966 4967 float128 float128_rem( float128 a, float128 b ) 4968 { 4969 flag aSign, bSign, zSign; 4970 int32 aExp, bExp, expDiff; 4971 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 4972 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 4973 sbits64 sigMean0; 4974 float128 z; 4975 4976 aSig1 = extractFloat128Frac1( a ); 4977 aSig0 = extractFloat128Frac0( a ); 4978 aExp = extractFloat128Exp( a ); 4979 aSign = extractFloat128Sign( a ); 4980 bSig1 = extractFloat128Frac1( b ); 4981 bSig0 = extractFloat128Frac0( b ); 4982 bExp = extractFloat128Exp( b ); 4983 bSign = extractFloat128Sign( b ); 4984 if ( aExp == 0x7FFF ) { 4985 if ( ( aSig0 | aSig1 ) 4986 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4987 return propagateFloat128NaN( a, b ); 4988 } 4989 goto invalid; 4990 } 4991 if ( bExp == 0x7FFF ) { 4992 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4993 return a; 4994 } 4995 if ( bExp == 0 ) { 4996 if ( ( bSig0 | bSig1 ) == 0 ) { 4997 invalid: 4998 float_raise( float_flag_invalid ); 4999 z.low = float128_default_nan_low; 5000 z.high = float128_default_nan_high; 5001 return z; 5002 } 5003 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5004 } 5005 if ( aExp == 0 ) { 5006 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5007 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5008 } 5009 expDiff = aExp - bExp; 5010 if ( expDiff < -1 ) return a; 5011 shortShift128Left( 5012 aSig0 | LIT64( 0x0001000000000000 ), 5013 aSig1, 5014 15 - ( expDiff < 0 ), 5015 &aSig0, 5016 &aSig1 5017 ); 5018 shortShift128Left( 5019 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5020 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5021 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5022 expDiff -= 64; 5023 while ( 0 < expDiff ) { 5024 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5025 q = ( 4 < q ) ? q - 4 : 0; 5026 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5027 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5028 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5029 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5030 expDiff -= 61; 5031 } 5032 if ( -64 < expDiff ) { 5033 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5034 q = ( 4 < q ) ? q - 4 : 0; 5035 q >>= - expDiff; 5036 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5037 expDiff += 52; 5038 if ( expDiff < 0 ) { 5039 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5040 } 5041 else { 5042 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5043 } 5044 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5045 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5046 } 5047 else { 5048 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5049 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5050 } 5051 do { 5052 alternateASig0 = aSig0; 5053 alternateASig1 = aSig1; 5054 ++q; 5055 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5056 } while ( 0 <= (sbits64) aSig0 ); 5057 add128( 5058 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 ); 5059 if ( ( sigMean0 < 0 ) 5060 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5061 aSig0 = alternateASig0; 5062 aSig1 = alternateASig1; 5063 } 5064 zSign = ( (sbits64) aSig0 < 0 ); 5065 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5066 return 5067 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5068 5069 } 5070 5071 /*---------------------------------------------------------------------------- 5072 | Returns the square root of the quadruple-precision floating-point value `a'. 5073 | The operation is performed according to the IEC/IEEE Standard for Binary 5074 | Floating-Point Arithmetic. 5075 *----------------------------------------------------------------------------*/ 5076 5077 float128 float128_sqrt( float128 a ) 5078 { 5079 flag aSign; 5080 int32 aExp, zExp; 5081 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5082 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5083 float128 z; 5084 5085 aSig1 = extractFloat128Frac1( a ); 5086 aSig0 = extractFloat128Frac0( a ); 5087 aExp = extractFloat128Exp( a ); 5088 aSign = extractFloat128Sign( a ); 5089 if ( aExp == 0x7FFF ) { 5090 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5091 if ( ! aSign ) return a; 5092 goto invalid; 5093 } 5094 if ( aSign ) { 5095 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5096 invalid: 5097 float_raise( float_flag_invalid ); 5098 z.low = float128_default_nan_low; 5099 z.high = float128_default_nan_high; 5100 return z; 5101 } 5102 if ( aExp == 0 ) { 5103 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5104 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5105 } 5106 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5107 aSig0 |= LIT64( 0x0001000000000000 ); 5108 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5109 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5110 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5111 doubleZSig0 = zSig0<<1; 5112 mul64To128( zSig0, zSig0, &term0, &term1 ); 5113 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5114 while ( (sbits64) rem0 < 0 ) { 5115 --zSig0; 5116 doubleZSig0 -= 2; 5117 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5118 } 5119 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5120 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5121 if ( zSig1 == 0 ) zSig1 = 1; 5122 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5123 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5124 mul64To128( zSig1, zSig1, &term2, &term3 ); 5125 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5126 while ( (sbits64) rem1 < 0 ) { 5127 --zSig1; 5128 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5129 term3 |= 1; 5130 term2 |= doubleZSig0; 5131 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5132 } 5133 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5134 } 5135 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5136 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5137 5138 } 5139 5140 /*---------------------------------------------------------------------------- 5141 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 5142 | the corresponding value `b', and 0 otherwise. The comparison is performed 5143 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5144 *----------------------------------------------------------------------------*/ 5145 5146 flag float128_eq( float128 a, float128 b ) 5147 { 5148 5149 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5150 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5151 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5152 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5153 ) { 5154 if ( float128_is_signaling_nan( a ) 5155 || float128_is_signaling_nan( b ) ) { 5156 float_raise( float_flag_invalid ); 5157 } 5158 return 0; 5159 } 5160 return 5161 ( a.low == b.low ) 5162 && ( ( a.high == b.high ) 5163 || ( ( a.low == 0 ) 5164 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5165 ); 5166 5167 } 5168 5169 /*---------------------------------------------------------------------------- 5170 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5171 | or equal to the corresponding value `b', and 0 otherwise. The comparison 5172 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5173 | Arithmetic. 5174 *----------------------------------------------------------------------------*/ 5175 5176 flag float128_le( float128 a, float128 b ) 5177 { 5178 flag aSign, bSign; 5179 5180 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5181 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5182 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5183 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5184 ) { 5185 float_raise( float_flag_invalid ); 5186 return 0; 5187 } 5188 aSign = extractFloat128Sign( a ); 5189 bSign = extractFloat128Sign( b ); 5190 if ( aSign != bSign ) { 5191 return 5192 aSign 5193 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5194 == 0 ); 5195 } 5196 return 5197 aSign ? le128( b.high, b.low, a.high, a.low ) 5198 : le128( a.high, a.low, b.high, b.low ); 5199 5200 } 5201 5202 /*---------------------------------------------------------------------------- 5203 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5204 | the corresponding value `b', and 0 otherwise. The comparison is performed 5205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5206 *----------------------------------------------------------------------------*/ 5207 5208 flag float128_lt( float128 a, float128 b ) 5209 { 5210 flag aSign, bSign; 5211 5212 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5213 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5214 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5215 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5216 ) { 5217 float_raise( float_flag_invalid ); 5218 return 0; 5219 } 5220 aSign = extractFloat128Sign( a ); 5221 bSign = extractFloat128Sign( b ); 5222 if ( aSign != bSign ) { 5223 return 5224 aSign 5225 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5226 != 0 ); 5227 } 5228 return 5229 aSign ? lt128( b.high, b.low, a.high, a.low ) 5230 : lt128( a.high, a.low, b.high, b.low ); 5231 5232 } 5233 5234 /*---------------------------------------------------------------------------- 5235 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 5236 | the corresponding value `b', and 0 otherwise. The invalid exception is 5237 | raised if either operand is a NaN. Otherwise, the comparison is performed 5238 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5239 *----------------------------------------------------------------------------*/ 5240 5241 flag float128_eq_signaling( float128 a, float128 b ) 5242 { 5243 5244 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5245 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5246 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5247 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5248 ) { 5249 float_raise( float_flag_invalid ); 5250 return 0; 5251 } 5252 return 5253 ( a.low == b.low ) 5254 && ( ( a.high == b.high ) 5255 || ( ( a.low == 0 ) 5256 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5257 ); 5258 5259 } 5260 5261 /*---------------------------------------------------------------------------- 5262 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5263 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5264 | cause an exception. Otherwise, the comparison is performed according to the 5265 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5266 *----------------------------------------------------------------------------*/ 5267 5268 flag float128_le_quiet( float128 a, float128 b ) 5269 { 5270 flag aSign, bSign; 5271 5272 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5273 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5274 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5275 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5276 ) { 5277 if ( float128_is_signaling_nan( a ) 5278 || float128_is_signaling_nan( b ) ) { 5279 float_raise( float_flag_invalid ); 5280 } 5281 return 0; 5282 } 5283 aSign = extractFloat128Sign( a ); 5284 bSign = extractFloat128Sign( b ); 5285 if ( aSign != bSign ) { 5286 return 5287 aSign 5288 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5289 == 0 ); 5290 } 5291 return 5292 aSign ? le128( b.high, b.low, a.high, a.low ) 5293 : le128( a.high, a.low, b.high, b.low ); 5294 5295 } 5296 5297 /*---------------------------------------------------------------------------- 5298 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5299 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5300 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 5301 | Standard for Binary Floating-Point Arithmetic. 5302 *----------------------------------------------------------------------------*/ 5303 5304 flag float128_lt_quiet( float128 a, float128 b ) 5305 { 5306 flag aSign, bSign; 5307 5308 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5309 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5310 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5311 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5312 ) { 5313 if ( float128_is_signaling_nan( a ) 5314 || float128_is_signaling_nan( b ) ) { 5315 float_raise( float_flag_invalid ); 5316 } 5317 return 0; 5318 } 5319 aSign = extractFloat128Sign( a ); 5320 bSign = extractFloat128Sign( b ); 5321 if ( aSign != bSign ) { 5322 return 5323 aSign 5324 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5325 != 0 ); 5326 } 5327 return 5328 aSign ? lt128( b.high, b.low, a.high, a.low ) 5329 : lt128( a.high, a.low, b.high, b.low ); 5330 5331 } 5332 5333 #endif 5334 5335 5336 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5337 5338 /* 5339 * These two routines are not part of the original softfloat distribution. 5340 * 5341 * They are based on the corresponding conversions to integer but return 5342 * unsigned numbers instead since these functions are required by GCC. 5343 * 5344 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97 5345 * 5346 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5347 */ 5348 5349 /*---------------------------------------------------------------------------- 5350 | Returns the result of converting the double-precision floating-point value 5351 | `a' to the 32-bit unsigned integer format. The conversion is 5352 | performed according to the IEC/IEEE Standard for Binary Floating-point 5353 | Arithmetic, except that the conversion is always rounded toward zero. If 5354 | `a' is a NaN, the largest positive integer is returned. If the conversion 5355 | overflows, the largest integer positive is returned. 5356 *----------------------------------------------------------------------------*/ 5357 5358 uint32 float64_to_uint32_round_to_zero( float64 a ) 5359 { 5360 flag aSign; 5361 int16 aExp, shiftCount; 5362 bits64 aSig, savedASig; 5363 uint32 z; 5364 5365 aSig = extractFloat64Frac( a ); 5366 aExp = extractFloat64Exp( a ); 5367 aSign = extractFloat64Sign( a ); 5368 5369 if (aSign) { 5370 float_raise( float_flag_invalid ); 5371 return(0); 5372 } 5373 5374 if ( 0x41E < aExp ) { 5375 float_raise( float_flag_invalid ); 5376 return 0xffffffff; 5377 } 5378 else if ( aExp < 0x3FF ) { 5379 if ( aExp || aSig ) float_set_inexact(); 5380 return 0; 5381 } 5382 aSig |= LIT64( 0x0010000000000000 ); 5383 shiftCount = 0x433 - aExp; 5384 savedASig = aSig; 5385 aSig >>= shiftCount; 5386 z = aSig; 5387 if ( ( aSig<<shiftCount ) != savedASig ) { 5388 float_set_inexact(); 5389 } 5390 return z; 5391 5392 } 5393 5394 /*---------------------------------------------------------------------------- 5395 | Returns the result of converting the single-precision floating-point value 5396 | `a' to the 32-bit unsigned integer format. The conversion is 5397 | performed according to the IEC/IEEE Standard for Binary Floating-point 5398 | Arithmetic, except that the conversion is always rounded toward zero. If 5399 | `a' is a NaN, the largest positive integer is returned. If the conversion 5400 | overflows, the largest positive integer is returned. 5401 *----------------------------------------------------------------------------*/ 5402 5403 uint32 float32_to_uint32_round_to_zero( float32 a ) 5404 { 5405 flag aSign; 5406 int16 aExp, shiftCount; 5407 bits32 aSig; 5408 uint32 z; 5409 5410 aSig = extractFloat32Frac( a ); 5411 aExp = extractFloat32Exp( a ); 5412 aSign = extractFloat32Sign( a ); 5413 shiftCount = aExp - 0x9E; 5414 5415 if (aSign) { 5416 float_raise( float_flag_invalid ); 5417 return(0); 5418 } 5419 if ( 0 < shiftCount ) { 5420 float_raise( float_flag_invalid ); 5421 return 0xFFFFFFFF; 5422 } 5423 else if ( aExp <= 0x7E ) { 5424 if ( aExp | aSig ) float_set_inexact(); 5425 return 0; 5426 } 5427 aSig = ( aSig | 0x800000 )<<8; 5428 z = aSig>>( - shiftCount ); 5429 if ( aSig<<( shiftCount & 31 ) ) { 5430 float_set_inexact(); 5431 } 5432 return z; 5433 5434 } 5435 5436 #endif 5437 5438 #endif /* _STANDALONE */ 5439