xref: /openbsd-src/sys/lib/libkern/softfloat.c (revision 2821fdb080fdc801ba67bdf2837f5e164f3a2926)
1 /*	$OpenBSD: softfloat.c,v 1.4 2009/10/27 23:59:35 deraadt Exp $	*/
2 /*	$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $	*/
3 
4 /*
5  * This version hacked for use with gcc -msoft-float by bjh21.
6  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
7  *  itself).
8  */
9 
10 /*
11  * Things you may want to define:
12  *
13  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
14  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
15  *   properly renamed.
16  */
17 
18 /*
19 ===============================================================================
20 
21 This C source file is part of the SoftFloat IEC/IEEE Floating-point
22 Arithmetic Package, Release 2a.
23 
24 Written by John R. Hauser.  This work was made possible in part by the
25 International Computer Science Institute, located at Suite 600, 1947 Center
26 Street, Berkeley, California 94704.  Funding was partially provided by the
27 National Science Foundation under grant MIP-9311980.  The original version
28 of this code was written as part of a project to build a fixed-point vector
29 processor in collaboration with the University of California at Berkeley,
30 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
31 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
32 arithmetic/SoftFloat.html'.
33 
34 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable
35 effort has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT
36 WILL AT TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS
37 RESTRICTED TO PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL
38 RESPONSIBILITY FOR ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM
39 THEIR OWN USE OF THE SOFTWARE, AND WHO ALSO EFFECTIVELY INDEMNIFY
40 (possibly via similar legal warning) JOHN HAUSER AND THE INTERNATIONAL
41 COMPUTER SCIENCE INSTITUTE AGAINST ALL LOSSES, COSTS, OR OTHER PROBLEMS
42 ARISING FROM THE USE OF THE SOFTWARE BY THEIR CUSTOMERS AND CLIENTS.
43 
44 Derivative works are acceptable, even for commercial purposes, so long as
45 (1) they include prominent notice that the work is derivative, and (2) they
46 include prominent notice akin to these four paragraphs for those parts of
47 this code that are retained.
48 
49 ===============================================================================
50 */
51 
52 #include <sys/cdefs.h>
53 
54 #ifdef SOFTFLOAT_FOR_GCC
55 #include "softfloat-for-gcc.h"
56 #endif
57 
58 #include "milieu.h"
59 #include "softfloat.h"
60 
61 /*
62  * Conversions between floats as stored in memory and floats as
63  * SoftFloat uses them
64  */
65 #ifndef FLOAT64_DEMANGLE
66 #define FLOAT64_DEMANGLE(a)	(a)
67 #endif
68 #ifndef FLOAT64_MANGLE
69 #define FLOAT64_MANGLE(a)	(a)
70 #endif
71 
72 /*
73 -------------------------------------------------------------------------------
74 Floating-point rounding mode, extended double-precision rounding precision,
75 and exception flags.
76 -------------------------------------------------------------------------------
77 */
78 
79 /*
80  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
81  * 	Right now, it does not.  I've removed all other dynamic global
82  * 	variables. [ross]
83  */
84 #ifdef FLOATX80
85 int8 floatx80_rounding_precision = 80;
86 #endif
87 
88 /*
89 -------------------------------------------------------------------------------
90 Primitive arithmetic functions, including multi-word arithmetic, and
91 division and square root approximations.  (Can be specialized to target if
92 desired.)
93 -------------------------------------------------------------------------------
94 */
95 #include "softfloat-macros.h"
96 
97 /*
98 -------------------------------------------------------------------------------
99 Functions and definitions to determine:  (1) whether tininess for underflow
100 is detected before or after rounding by default, (2) what (if anything)
101 happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 are propagated from function inputs to output.  These details are target-
104 specific.
105 -------------------------------------------------------------------------------
106 */
107 #include "softfloat-specialize.h"
108 
109 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
110 /*
111 -------------------------------------------------------------------------------
112 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
113 and 7, and returns the properly rounded 32-bit integer corresponding to the
114 input.  If `zSign' is 1, the input is negated before being converted to an
115 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
116 is simply rounded to an integer, with the inexact exception raised if the
117 input cannot be represented exactly as an integer.  However, if the fixed-
118 point input is too large, the invalid exception is raised and the largest
119 positive or negative integer is returned.
120 -------------------------------------------------------------------------------
121 */
122 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
123 {
124     int8 roundingMode;
125     flag roundNearestEven;
126     int8 roundIncrement, roundBits;
127     int32 z;
128 
129     roundingMode = float_rounding_mode();
130     roundNearestEven = ( roundingMode == float_round_nearest_even );
131     roundIncrement = 0x40;
132     if ( ! roundNearestEven ) {
133         if ( roundingMode == float_round_to_zero ) {
134             roundIncrement = 0;
135         }
136         else {
137             roundIncrement = 0x7F;
138             if ( zSign ) {
139                 if ( roundingMode == float_round_up ) roundIncrement = 0;
140             }
141             else {
142                 if ( roundingMode == float_round_down ) roundIncrement = 0;
143             }
144         }
145     }
146     roundBits = absZ & 0x7F;
147     absZ = ( absZ + roundIncrement )>>7;
148     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
149     z = absZ;
150     if ( zSign ) z = - z;
151     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
152         float_raise( float_flag_invalid );
153         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
154     }
155     if ( roundBits ) float_set_inexact();
156     return z;
157 
158 }
159 
160 /*
161 -------------------------------------------------------------------------------
162 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
163 `absZ1', with binary point between bits 63 and 64 (between the input words),
164 and returns the properly rounded 64-bit integer corresponding to the input.
165 If `zSign' is 1, the input is negated before being converted to an integer.
166 Ordinarily, the fixed-point input is simply rounded to an integer, with
167 the inexact exception raised if the input cannot be represented exactly as
168 an integer.  However, if the fixed-point input is too large, the invalid
169 exception is raised and the largest positive or negative integer is
170 returned.
171 -------------------------------------------------------------------------------
172 */
173 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
174 {
175     int8 roundingMode;
176     flag roundNearestEven, increment;
177     int64 z;
178 
179     roundingMode = float_rounding_mode();
180     roundNearestEven = ( roundingMode == float_round_nearest_even );
181     increment = ( (sbits64) absZ1 < 0 );
182     if ( ! roundNearestEven ) {
183         if ( roundingMode == float_round_to_zero ) {
184             increment = 0;
185         }
186         else {
187             if ( zSign ) {
188                 increment = ( roundingMode == float_round_down ) && absZ1;
189             }
190             else {
191                 increment = ( roundingMode == float_round_up ) && absZ1;
192             }
193         }
194     }
195     if ( increment ) {
196         ++absZ0;
197         if ( absZ0 == 0 ) goto overflow;
198         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
199     }
200     z = absZ0;
201     if ( zSign ) z = - z;
202     if ( z && ( ( z < 0 ) ^ zSign ) ) {
203  overflow:
204         float_raise( float_flag_invalid );
205         return
206               zSign ? (sbits64) LIT64( 0x8000000000000000 )
207             : LIT64( 0x7FFFFFFFFFFFFFFF );
208     }
209     if ( absZ1 ) float_set_inexact();
210     return z;
211 
212 }
213 
214 #ifdef __alpha__
215 /*
216 -------------------------------------------------------------------------------
217 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
218 `absZ1', with binary point between bits 63 and 64 (between the input words),
219 and returns the properly rounded 64-bit integer corresponding to the input.
220 If `zSign' is 1, the input is negated before being converted to an integer.
221 Ordinarily, the fixed-point input is simply rounded to an integer, with
222 the inexact exception raised if the input cannot be represented exactly as
223 an integer.
224 -------------------------------------------------------------------------------
225 */
226 static int64 roundAndPackInt64NoOverflow( flag zSign, bits64 absZ0,
227     bits64 absZ1 )
228 {
229     int8 roundingMode;
230     flag roundNearestEven, increment;
231     int64 z;
232 
233     roundingMode = float_rounding_mode();
234     roundNearestEven = ( roundingMode == float_round_nearest_even );
235     increment = ( (sbits64) absZ1 < 0 );
236     if ( ! roundNearestEven ) {
237         if ( roundingMode == float_round_to_zero ) {
238             increment = 0;
239         }
240         else {
241             if ( zSign ) {
242                 increment = ( roundingMode == float_round_down ) && absZ1;
243             }
244             else {
245                 increment = ( roundingMode == float_round_up ) && absZ1;
246             }
247         }
248     }
249     if ( increment ) {
250         ++absZ0;
251         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
252     }
253     z = absZ0;
254     if ( zSign ) z = - z;
255     if ( absZ1 ) float_set_inexact();
256     return z;
257 
258 }
259 #endif /* __alpha__ */
260 #endif
261 
262 /*
263 -------------------------------------------------------------------------------
264 Returns the fraction bits of the single-precision floating-point value `a'.
265 -------------------------------------------------------------------------------
266 */
267 INLINE bits32 extractFloat32Frac( float32 a )
268 {
269 
270     return a & 0x007FFFFF;
271 
272 }
273 
274 /*
275 -------------------------------------------------------------------------------
276 Returns the exponent bits of the single-precision floating-point value `a'.
277 -------------------------------------------------------------------------------
278 */
279 INLINE int16 extractFloat32Exp( float32 a )
280 {
281 
282     return ( a>>23 ) & 0xFF;
283 
284 }
285 
286 /*
287 -------------------------------------------------------------------------------
288 Returns the sign bit of the single-precision floating-point value `a'.
289 -------------------------------------------------------------------------------
290 */
291 INLINE flag extractFloat32Sign( float32 a )
292 {
293 
294     return a>>31;
295 
296 }
297 
298 /*
299 -------------------------------------------------------------------------------
300 Normalizes the subnormal single-precision floating-point value represented
301 by the denormalized significand `aSig'.  The normalized exponent and
302 significand are stored at the locations pointed to by `zExpPtr' and
303 `zSigPtr', respectively.
304 -------------------------------------------------------------------------------
305 */
306 static void
307  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
308 {
309     int8 shiftCount;
310 
311     shiftCount = countLeadingZeros32( aSig ) - 8;
312     *zSigPtr = aSig<<shiftCount;
313     *zExpPtr = 1 - shiftCount;
314 
315 }
316 
317 /*
318 -------------------------------------------------------------------------------
319 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
320 single-precision floating-point value, returning the result.  After being
321 shifted into the proper positions, the three fields are simply added
322 together to form the result.  This means that any integer portion of `zSig'
323 will be added into the exponent.  Since a properly normalized significand
324 will have an integer portion equal to 1, the `zExp' input should be 1 less
325 than the desired result exponent whenever `zSig' is a complete, normalized
326 significand.
327 -------------------------------------------------------------------------------
328 */
329 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
330 {
331 
332     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
333 
334 }
335 
336 /*
337 -------------------------------------------------------------------------------
338 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
339 and significand `zSig', and returns the proper single-precision floating-
340 point value corresponding to the abstract input.  Ordinarily, the abstract
341 value is simply rounded and packed into the single-precision format, with
342 the inexact exception raised if the abstract input cannot be represented
343 exactly.  However, if the abstract value is too large, the overflow and
344 inexact exceptions are raised and an infinity or maximal finite value is
345 returned.  If the abstract value is too small, the input value is rounded to
346 a subnormal number, and the underflow and inexact exceptions are raised if
347 the abstract input cannot be represented exactly as a subnormal single-
348 precision floating-point number.
349     The input significand `zSig' has its binary point between bits 30
350 and 29, which is 7 bits to the left of the usual location.  This shifted
351 significand must be normalized or smaller.  If `zSig' is not normalized,
352 `zExp' must be 0; in that case, the result returned is a subnormal number,
353 and it must not require rounding.  In the usual case that `zSig' is
354 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
355 The handling of underflow and overflow follows the IEC/IEEE Standard for
356 Binary Floating-Point Arithmetic.
357 -------------------------------------------------------------------------------
358 */
359 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
360 {
361     int8 roundingMode;
362     flag roundNearestEven;
363     int8 roundIncrement, roundBits;
364     flag isTiny;
365 
366     roundingMode = float_rounding_mode();
367     roundNearestEven = ( roundingMode == float_round_nearest_even );
368     roundIncrement = 0x40;
369     if ( ! roundNearestEven ) {
370         if ( roundingMode == float_round_to_zero ) {
371             roundIncrement = 0;
372         }
373         else {
374             roundIncrement = 0x7F;
375             if ( zSign ) {
376                 if ( roundingMode == float_round_up ) roundIncrement = 0;
377             }
378             else {
379                 if ( roundingMode == float_round_down ) roundIncrement = 0;
380             }
381         }
382     }
383     roundBits = zSig & 0x7F;
384     if ( 0xFD <= (bits16) zExp ) {
385         if (    ( 0xFD < zExp )
386              || (    ( zExp == 0xFD )
387                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
388            ) {
389             float_raise( float_flag_overflow | float_flag_inexact );
390             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
391         }
392         if ( zExp < 0 ) {
393             isTiny =
394                    ( float_detect_tininess == float_tininess_before_rounding )
395                 || ( zExp < -1 )
396                 || ( zSig + roundIncrement < 0x80000000 );
397             shift32RightJamming( zSig, - zExp, &zSig );
398             zExp = 0;
399             roundBits = zSig & 0x7F;
400             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
401         }
402     }
403     if ( roundBits ) float_set_inexact();
404     zSig = ( zSig + roundIncrement )>>7;
405     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
406     if ( zSig == 0 ) zExp = 0;
407     return packFloat32( zSign, zExp, zSig );
408 
409 }
410 
411 /*
412 -------------------------------------------------------------------------------
413 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
414 and significand `zSig', and returns the proper single-precision floating-
415 point value corresponding to the abstract input.  This routine is just like
416 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
417 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
418 floating-point exponent.
419 -------------------------------------------------------------------------------
420 */
421 static float32
422  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
423 {
424     int8 shiftCount;
425 
426     shiftCount = countLeadingZeros32( zSig ) - 1;
427     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
428 
429 }
430 
431 /*
432 -------------------------------------------------------------------------------
433 Returns the fraction bits of the double-precision floating-point value `a'.
434 -------------------------------------------------------------------------------
435 */
436 INLINE bits64 extractFloat64Frac( float64 a )
437 {
438 
439     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
440 
441 }
442 
443 /*
444 -------------------------------------------------------------------------------
445 Returns the exponent bits of the double-precision floating-point value `a'.
446 -------------------------------------------------------------------------------
447 */
448 INLINE int16 extractFloat64Exp( float64 a )
449 {
450 
451     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
452 
453 }
454 
455 /*
456 -------------------------------------------------------------------------------
457 Returns the sign bit of the double-precision floating-point value `a'.
458 -------------------------------------------------------------------------------
459 */
460 INLINE flag extractFloat64Sign( float64 a )
461 {
462 
463     return FLOAT64_DEMANGLE(a)>>63;
464 
465 }
466 
467 /*
468 -------------------------------------------------------------------------------
469 Normalizes the subnormal double-precision floating-point value represented
470 by the denormalized significand `aSig'.  The normalized exponent and
471 significand are stored at the locations pointed to by `zExpPtr' and
472 `zSigPtr', respectively.
473 -------------------------------------------------------------------------------
474 */
475 static void
476  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
477 {
478     int8 shiftCount;
479 
480     shiftCount = countLeadingZeros64( aSig ) - 11;
481     *zSigPtr = aSig<<shiftCount;
482     *zExpPtr = 1 - shiftCount;
483 
484 }
485 
486 /*
487 -------------------------------------------------------------------------------
488 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
489 double-precision floating-point value, returning the result.  After being
490 shifted into the proper positions, the three fields are simply added
491 together to form the result.  This means that any integer portion of `zSig'
492 will be added into the exponent.  Since a properly normalized significand
493 will have an integer portion equal to 1, the `zExp' input should be 1 less
494 than the desired result exponent whenever `zSig' is a complete, normalized
495 significand.
496 -------------------------------------------------------------------------------
497 */
498 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
499 {
500 
501     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
502 			   ( ( (bits64) zExp )<<52 ) + zSig );
503 
504 }
505 
506 /*
507 -------------------------------------------------------------------------------
508 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
509 and significand `zSig', and returns the proper double-precision floating-
510 point value corresponding to the abstract input.  Ordinarily, the abstract
511 value is simply rounded and packed into the double-precision format, with
512 the inexact exception raised if the abstract input cannot be represented
513 exactly.  However, if the abstract value is too large, the overflow and
514 inexact exceptions are raised and an infinity or maximal finite value is
515 returned.  If the abstract value is too small, the input value is rounded to
516 a subnormal number, and the underflow and inexact exceptions are raised if
517 the abstract input cannot be represented exactly as a subnormal double-
518 precision floating-point number.
519     The input significand `zSig' has its binary point between bits 62
520 and 61, which is 10 bits to the left of the usual location.  This shifted
521 significand must be normalized or smaller.  If `zSig' is not normalized,
522 `zExp' must be 0; in that case, the result returned is a subnormal number,
523 and it must not require rounding.  In the usual case that `zSig' is
524 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
525 The handling of underflow and overflow follows the IEC/IEEE Standard for
526 Binary Floating-Point Arithmetic.
527 -------------------------------------------------------------------------------
528 */
529 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
530 {
531     int8 roundingMode;
532     flag roundNearestEven;
533     int16 roundIncrement, roundBits;
534     flag isTiny;
535 
536     roundingMode = float_rounding_mode();
537     roundNearestEven = ( roundingMode == float_round_nearest_even );
538     roundIncrement = 0x200;
539     if ( ! roundNearestEven ) {
540         if ( roundingMode == float_round_to_zero ) {
541             roundIncrement = 0;
542         }
543         else {
544             roundIncrement = 0x3FF;
545             if ( zSign ) {
546                 if ( roundingMode == float_round_up ) roundIncrement = 0;
547             }
548             else {
549                 if ( roundingMode == float_round_down ) roundIncrement = 0;
550             }
551         }
552     }
553     roundBits = zSig & 0x3FF;
554     if ( 0x7FD <= (bits16) zExp ) {
555         if (    ( 0x7FD < zExp )
556              || (    ( zExp == 0x7FD )
557                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
558            ) {
559             float_raise( float_flag_overflow | float_flag_inexact );
560             return FLOAT64_MANGLE(
561 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
562 		( roundIncrement == 0 ));
563         }
564         if ( zExp < 0 ) {
565             isTiny =
566                    ( float_detect_tininess == float_tininess_before_rounding )
567                 || ( zExp < -1 )
568                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
569             shift64RightJamming( zSig, - zExp, &zSig );
570             zExp = 0;
571             roundBits = zSig & 0x3FF;
572             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
573         }
574     }
575     if ( roundBits ) float_set_inexact();
576     zSig = ( zSig + roundIncrement )>>10;
577     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
578     if ( zSig == 0 ) zExp = 0;
579     return packFloat64( zSign, zExp, zSig );
580 
581 }
582 
583 /*
584 -------------------------------------------------------------------------------
585 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
586 and significand `zSig', and returns the proper double-precision floating-
587 point value corresponding to the abstract input.  This routine is just like
588 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
589 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
590 floating-point exponent.
591 -------------------------------------------------------------------------------
592 */
593 static float64
594  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
595 {
596     int8 shiftCount;
597 
598     shiftCount = countLeadingZeros64( zSig ) - 1;
599     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
600 
601 }
602 
603 #ifdef FLOATX80
604 
605 /*
606 -------------------------------------------------------------------------------
607 Returns the fraction bits of the extended double-precision floating-point
608 value `a'.
609 -------------------------------------------------------------------------------
610 */
611 INLINE bits64 extractFloatx80Frac( floatx80 a )
612 {
613 
614     return a.low;
615 
616 }
617 
618 /*
619 -------------------------------------------------------------------------------
620 Returns the exponent bits of the extended double-precision floating-point
621 value `a'.
622 -------------------------------------------------------------------------------
623 */
624 INLINE int32 extractFloatx80Exp( floatx80 a )
625 {
626 
627     return a.high & 0x7FFF;
628 
629 }
630 
631 /*
632 -------------------------------------------------------------------------------
633 Returns the sign bit of the extended double-precision floating-point value
634 `a'.
635 -------------------------------------------------------------------------------
636 */
637 INLINE flag extractFloatx80Sign( floatx80 a )
638 {
639 
640     return a.high>>15;
641 
642 }
643 
644 /*
645 -------------------------------------------------------------------------------
646 Normalizes the subnormal extended double-precision floating-point value
647 represented by the denormalized significand `aSig'.  The normalized exponent
648 and significand are stored at the locations pointed to by `zExpPtr' and
649 `zSigPtr', respectively.
650 -------------------------------------------------------------------------------
651 */
652 static void
653  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
654 {
655     int8 shiftCount;
656 
657     shiftCount = countLeadingZeros64( aSig );
658     *zSigPtr = aSig<<shiftCount;
659     *zExpPtr = 1 - shiftCount;
660 
661 }
662 
663 /*
664 -------------------------------------------------------------------------------
665 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
666 extended double-precision floating-point value, returning the result.
667 -------------------------------------------------------------------------------
668 */
669 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
670 {
671     floatx80 z;
672 
673     z.low = zSig;
674     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
675     return z;
676 
677 }
678 
679 /*
680 -------------------------------------------------------------------------------
681 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
682 and extended significand formed by the concatenation of `zSig0' and `zSig1',
683 and returns the proper extended double-precision floating-point value
684 corresponding to the abstract input.  Ordinarily, the abstract value is
685 rounded and packed into the extended double-precision format, with the
686 inexact exception raised if the abstract input cannot be represented
687 exactly.  However, if the abstract value is too large, the overflow and
688 inexact exceptions are raised and an infinity or maximal finite value is
689 returned.  If the abstract value is too small, the input value is rounded to
690 a subnormal number, and the underflow and inexact exceptions are raised if
691 the abstract input cannot be represented exactly as a subnormal extended
692 double-precision floating-point number.
693     If `roundingPrecision' is 32 or 64, the result is rounded to the same
694 number of bits as single or double precision, respectively.  Otherwise, the
695 result is rounded to the full precision of the extended double-precision
696 format.
697     The input significand must be normalized or smaller.  If the input
698 significand is not normalized, `zExp' must be 0; in that case, the result
699 returned is a subnormal number, and it must not require rounding.  The
700 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
701 Floating-Point Arithmetic.
702 -------------------------------------------------------------------------------
703 */
704 static floatx80
705  roundAndPackFloatx80(
706      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
707  )
708 {
709     int8 roundingMode;
710     flag roundNearestEven, increment, isTiny;
711     int64 roundIncrement, roundMask, roundBits;
712 
713     roundingMode = float_rounding_mode();
714     roundNearestEven = ( roundingMode == float_round_nearest_even );
715     if ( roundingPrecision == 80 ) goto precision80;
716     if ( roundingPrecision == 64 ) {
717         roundIncrement = LIT64( 0x0000000000000400 );
718         roundMask = LIT64( 0x00000000000007FF );
719     }
720     else if ( roundingPrecision == 32 ) {
721         roundIncrement = LIT64( 0x0000008000000000 );
722         roundMask = LIT64( 0x000000FFFFFFFFFF );
723     }
724     else {
725         goto precision80;
726     }
727     zSig0 |= ( zSig1 != 0 );
728     if ( ! roundNearestEven ) {
729         if ( roundingMode == float_round_to_zero ) {
730             roundIncrement = 0;
731         }
732         else {
733             roundIncrement = roundMask;
734             if ( zSign ) {
735                 if ( roundingMode == float_round_up ) roundIncrement = 0;
736             }
737             else {
738                 if ( roundingMode == float_round_down ) roundIncrement = 0;
739             }
740         }
741     }
742     roundBits = zSig0 & roundMask;
743     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
744         if (    ( 0x7FFE < zExp )
745              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
746            ) {
747             goto overflow;
748         }
749         if ( zExp <= 0 ) {
750             isTiny =
751                    ( float_detect_tininess == float_tininess_before_rounding )
752                 || ( zExp < 0 )
753                 || ( zSig0 <= zSig0 + roundIncrement );
754             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
755             zExp = 0;
756             roundBits = zSig0 & roundMask;
757             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
758             if ( roundBits ) float_set_inexact();
759             zSig0 += roundIncrement;
760             if ( (sbits64) zSig0 < 0 ) zExp = 1;
761             roundIncrement = roundMask + 1;
762             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
763                 roundMask |= roundIncrement;
764             }
765             zSig0 &= ~ roundMask;
766             return packFloatx80( zSign, zExp, zSig0 );
767         }
768     }
769     if ( roundBits ) float_set_inexact();
770     zSig0 += roundIncrement;
771     if ( zSig0 < roundIncrement ) {
772         ++zExp;
773         zSig0 = LIT64( 0x8000000000000000 );
774     }
775     roundIncrement = roundMask + 1;
776     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
777         roundMask |= roundIncrement;
778     }
779     zSig0 &= ~ roundMask;
780     if ( zSig0 == 0 ) zExp = 0;
781     return packFloatx80( zSign, zExp, zSig0 );
782  precision80:
783     increment = ( (sbits64) zSig1 < 0 );
784     if ( ! roundNearestEven ) {
785         if ( roundingMode == float_round_to_zero ) {
786             increment = 0;
787         }
788         else {
789             if ( zSign ) {
790                 increment = ( roundingMode == float_round_down ) && zSig1;
791             }
792             else {
793                 increment = ( roundingMode == float_round_up ) && zSig1;
794             }
795         }
796     }
797     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
798         if (    ( 0x7FFE < zExp )
799              || (    ( zExp == 0x7FFE )
800                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
801                   && increment
802                 )
803            ) {
804             roundMask = 0;
805  overflow:
806             float_raise( float_flag_overflow | float_flag_inexact );
807             if (    ( roundingMode == float_round_to_zero )
808                  || ( zSign && ( roundingMode == float_round_up ) )
809                  || ( ! zSign && ( roundingMode == float_round_down ) )
810                ) {
811                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
812             }
813             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
814         }
815         if ( zExp <= 0 ) {
816             isTiny =
817                    ( float_detect_tininess == float_tininess_before_rounding )
818                 || ( zExp < 0 )
819                 || ! increment
820                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
821             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
822             zExp = 0;
823             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
824             if ( zSig1 ) float_set_inexact();
825             if ( roundNearestEven ) {
826                 increment = ( (sbits64) zSig1 < 0 );
827             }
828             else {
829                 if ( zSign ) {
830                     increment = ( roundingMode == float_round_down ) && zSig1;
831                 }
832                 else {
833                     increment = ( roundingMode == float_round_up ) && zSig1;
834                 }
835             }
836             if ( increment ) {
837                 ++zSig0;
838                 zSig0 &=
839                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
840                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
841             }
842             return packFloatx80( zSign, zExp, zSig0 );
843         }
844     }
845     if ( zSig1 ) float_set_inexact();
846     if ( increment ) {
847         ++zSig0;
848         if ( zSig0 == 0 ) {
849             ++zExp;
850             zSig0 = LIT64( 0x8000000000000000 );
851         }
852         else {
853             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
854         }
855     }
856     else {
857         if ( zSig0 == 0 ) zExp = 0;
858     }
859     return packFloatx80( zSign, zExp, zSig0 );
860 
861 }
862 
863 /*
864 -------------------------------------------------------------------------------
865 Takes an abstract floating-point value having sign `zSign', exponent
866 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
867 and returns the proper extended double-precision floating-point value
868 corresponding to the abstract input.  This routine is just like
869 `roundAndPackFloatx80' except that the input significand does not have to be
870 normalized.
871 -------------------------------------------------------------------------------
872 */
873 static floatx80
874  normalizeRoundAndPackFloatx80(
875      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
876  )
877 {
878     int8 shiftCount;
879 
880     if ( zSig0 == 0 ) {
881         zSig0 = zSig1;
882         zSig1 = 0;
883         zExp -= 64;
884     }
885     shiftCount = countLeadingZeros64( zSig0 );
886     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
887     zExp -= shiftCount;
888     return
889         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
890 
891 }
892 
893 #endif
894 
895 #ifdef FLOAT128
896 
897 /*
898 -------------------------------------------------------------------------------
899 Returns the least-significant 64 fraction bits of the quadruple-precision
900 floating-point value `a'.
901 -------------------------------------------------------------------------------
902 */
903 INLINE bits64 extractFloat128Frac1( float128 a )
904 {
905 
906     return a.low;
907 
908 }
909 
910 /*
911 -------------------------------------------------------------------------------
912 Returns the most-significant 48 fraction bits of the quadruple-precision
913 floating-point value `a'.
914 -------------------------------------------------------------------------------
915 */
916 INLINE bits64 extractFloat128Frac0( float128 a )
917 {
918 
919     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
920 
921 }
922 
923 /*
924 -------------------------------------------------------------------------------
925 Returns the exponent bits of the quadruple-precision floating-point value
926 `a'.
927 -------------------------------------------------------------------------------
928 */
929 INLINE int32 extractFloat128Exp( float128 a )
930 {
931 
932     return ( a.high>>48 ) & 0x7FFF;
933 
934 }
935 
936 /*
937 -------------------------------------------------------------------------------
938 Returns the sign bit of the quadruple-precision floating-point value `a'.
939 -------------------------------------------------------------------------------
940 */
941 INLINE flag extractFloat128Sign( float128 a )
942 {
943 
944     return a.high>>63;
945 
946 }
947 
948 /*
949 -------------------------------------------------------------------------------
950 Normalizes the subnormal quadruple-precision floating-point value
951 represented by the denormalized significand formed by the concatenation of
952 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
953 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
954 significand are stored at the location pointed to by `zSig0Ptr', and the
955 least significant 64 bits of the normalized significand are stored at the
956 location pointed to by `zSig1Ptr'.
957 -------------------------------------------------------------------------------
958 */
959 static void
960  normalizeFloat128Subnormal(
961      bits64 aSig0,
962      bits64 aSig1,
963      int32 *zExpPtr,
964      bits64 *zSig0Ptr,
965      bits64 *zSig1Ptr
966  )
967 {
968     int8 shiftCount;
969 
970     if ( aSig0 == 0 ) {
971         shiftCount = countLeadingZeros64( aSig1 ) - 15;
972         if ( shiftCount < 0 ) {
973             *zSig0Ptr = aSig1>>( - shiftCount );
974             *zSig1Ptr = aSig1<<( shiftCount & 63 );
975         }
976         else {
977             *zSig0Ptr = aSig1<<shiftCount;
978             *zSig1Ptr = 0;
979         }
980         *zExpPtr = - shiftCount - 63;
981     }
982     else {
983         shiftCount = countLeadingZeros64( aSig0 ) - 15;
984         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
985         *zExpPtr = 1 - shiftCount;
986     }
987 
988 }
989 
990 /*
991 -------------------------------------------------------------------------------
992 Packs the sign `zSign', the exponent `zExp', and the significand formed
993 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
994 floating-point value, returning the result.  After being shifted into the
995 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
996 added together to form the most significant 32 bits of the result.  This
997 means that any integer portion of `zSig0' will be added into the exponent.
998 Since a properly normalized significand will have an integer portion equal
999 to 1, the `zExp' input should be 1 less than the desired result exponent
1000 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1001 significand.
1002 -------------------------------------------------------------------------------
1003 */
1004 INLINE float128
1005  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1006 {
1007     float128 z;
1008 
1009     z.low = zSig1;
1010     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
1011     return z;
1012 
1013 }
1014 
1015 /*
1016 -------------------------------------------------------------------------------
1017 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1018 and extended significand formed by the concatenation of `zSig0', `zSig1',
1019 and `zSig2', and returns the proper quadruple-precision floating-point value
1020 corresponding to the abstract input.  Ordinarily, the abstract value is
1021 simply rounded and packed into the quadruple-precision format, with the
1022 inexact exception raised if the abstract input cannot be represented
1023 exactly.  However, if the abstract value is too large, the overflow and
1024 inexact exceptions are raised and an infinity or maximal finite value is
1025 returned.  If the abstract value is too small, the input value is rounded to
1026 a subnormal number, and the underflow and inexact exceptions are raised if
1027 the abstract input cannot be represented exactly as a subnormal quadruple-
1028 precision floating-point number.
1029     The input significand must be normalized or smaller.  If the input
1030 significand is not normalized, `zExp' must be 0; in that case, the result
1031 returned is a subnormal number, and it must not require rounding.  In the
1032 usual case that the input significand is normalized, `zExp' must be 1 less
1033 than the ``true'' floating-point exponent.  The handling of underflow and
1034 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1035 -------------------------------------------------------------------------------
1036 */
1037 static float128
1038  roundAndPackFloat128(
1039      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
1040 {
1041     int8 roundingMode;
1042     flag roundNearestEven, increment, isTiny;
1043 
1044     roundingMode = float_rounding_mode();
1045     roundNearestEven = ( roundingMode == float_round_nearest_even );
1046     increment = ( (sbits64) zSig2 < 0 );
1047     if ( ! roundNearestEven ) {
1048         if ( roundingMode == float_round_to_zero ) {
1049             increment = 0;
1050         }
1051         else {
1052             if ( zSign ) {
1053                 increment = ( roundingMode == float_round_down ) && zSig2;
1054             }
1055             else {
1056                 increment = ( roundingMode == float_round_up ) && zSig2;
1057             }
1058         }
1059     }
1060     if ( 0x7FFD <= (bits32) zExp ) {
1061         if (    ( 0x7FFD < zExp )
1062              || (    ( zExp == 0x7FFD )
1063                   && eq128(
1064                          LIT64( 0x0001FFFFFFFFFFFF ),
1065                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1066                          zSig0,
1067                          zSig1
1068                      )
1069                   && increment
1070                 )
1071            ) {
1072             float_raise( float_flag_overflow | float_flag_inexact );
1073             if (    ( roundingMode == float_round_to_zero )
1074                  || ( zSign && ( roundingMode == float_round_up ) )
1075                  || ( ! zSign && ( roundingMode == float_round_down ) )
1076                ) {
1077                 return
1078                     packFloat128(
1079                         zSign,
1080                         0x7FFE,
1081                         LIT64( 0x0000FFFFFFFFFFFF ),
1082                         LIT64( 0xFFFFFFFFFFFFFFFF )
1083                     );
1084             }
1085             return packFloat128( zSign, 0x7FFF, 0, 0 );
1086         }
1087         if ( zExp < 0 ) {
1088             isTiny =
1089                    ( float_detect_tininess == float_tininess_before_rounding )
1090                 || ( zExp < -1 )
1091                 || ! increment
1092                 || lt128(
1093                        zSig0,
1094                        zSig1,
1095                        LIT64( 0x0001FFFFFFFFFFFF ),
1096                        LIT64( 0xFFFFFFFFFFFFFFFF )
1097                    );
1098             shift128ExtraRightJamming(
1099                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1100             zExp = 0;
1101             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1102             if ( roundNearestEven ) {
1103                 increment = ( (sbits64) zSig2 < 0 );
1104             }
1105             else {
1106                 if ( zSign ) {
1107                     increment = ( roundingMode == float_round_down ) && zSig2;
1108                 }
1109                 else {
1110                     increment = ( roundingMode == float_round_up ) && zSig2;
1111                 }
1112             }
1113         }
1114     }
1115     if ( zSig2 ) float_set_inexact();
1116     if ( increment ) {
1117         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1118         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1119     }
1120     else {
1121         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1122     }
1123     return packFloat128( zSign, zExp, zSig0, zSig1 );
1124 
1125 }
1126 
1127 /*
1128 -------------------------------------------------------------------------------
1129 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1130 and significand formed by the concatenation of `zSig0' and `zSig1', and
1131 returns the proper quadruple-precision floating-point value corresponding
1132 to the abstract input.  This routine is just like `roundAndPackFloat128'
1133 except that the input significand has fewer bits and does not have to be
1134 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1135 point exponent.
1136 -------------------------------------------------------------------------------
1137 */
1138 static float128
1139  normalizeRoundAndPackFloat128(
1140      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1141 {
1142     int8 shiftCount;
1143     bits64 zSig2;
1144 
1145     if ( zSig0 == 0 ) {
1146         zSig0 = zSig1;
1147         zSig1 = 0;
1148         zExp -= 64;
1149     }
1150     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1151     if ( 0 <= shiftCount ) {
1152         zSig2 = 0;
1153         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1154     }
1155     else {
1156         shift128ExtraRightJamming(
1157             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1158     }
1159     zExp -= shiftCount;
1160     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1161 
1162 }
1163 
1164 #endif
1165 
1166 /*
1167 -------------------------------------------------------------------------------
1168 Returns the result of converting the 32-bit two's complement integer `a'
1169 to the single-precision floating-point format.  The conversion is performed
1170 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1171 -------------------------------------------------------------------------------
1172 */
1173 float32 int32_to_float32( int32 a )
1174 {
1175     flag zSign;
1176 
1177     if ( a == 0 ) return 0;
1178     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1179     zSign = ( a < 0 );
1180     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1181 
1182 }
1183 
1184 /*
1185 -------------------------------------------------------------------------------
1186 Returns the result of converting the 32-bit two's complement integer `a'
1187 to the double-precision floating-point format.  The conversion is performed
1188 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1189 -------------------------------------------------------------------------------
1190 */
1191 float64 int32_to_float64( int32 a )
1192 {
1193     flag zSign;
1194     uint32 absA;
1195     int8 shiftCount;
1196     bits64 zSig;
1197 
1198     if ( a == 0 ) return 0;
1199     zSign = ( a < 0 );
1200     absA = zSign ? - a : a;
1201     shiftCount = countLeadingZeros32( absA ) + 21;
1202     zSig = absA;
1203     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1204 
1205 }
1206 
1207 #ifdef FLOATX80
1208 
1209 /*
1210 -------------------------------------------------------------------------------
1211 Returns the result of converting the 32-bit two's complement integer `a'
1212 to the extended double-precision floating-point format.  The conversion
1213 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1214 Arithmetic.
1215 -------------------------------------------------------------------------------
1216 */
1217 floatx80 int32_to_floatx80( int32 a )
1218 {
1219     flag zSign;
1220     uint32 absA;
1221     int8 shiftCount;
1222     bits64 zSig;
1223 
1224     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1225     zSign = ( a < 0 );
1226     absA = zSign ? - a : a;
1227     shiftCount = countLeadingZeros32( absA ) + 32;
1228     zSig = absA;
1229     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1230 
1231 }
1232 
1233 #endif
1234 
1235 #ifdef FLOAT128
1236 
1237 /*
1238 -------------------------------------------------------------------------------
1239 Returns the result of converting the 32-bit two's complement integer `a' to
1240 the quadruple-precision floating-point format.  The conversion is performed
1241 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1242 -------------------------------------------------------------------------------
1243 */
1244 float128 int32_to_float128( int32 a )
1245 {
1246     flag zSign;
1247     uint32 absA;
1248     int8 shiftCount;
1249     bits64 zSig0;
1250 
1251     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1252     zSign = ( a < 0 );
1253     absA = zSign ? - a : a;
1254     shiftCount = countLeadingZeros32( absA ) + 17;
1255     zSig0 = absA;
1256     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1257 
1258 }
1259 
1260 #endif
1261 
1262 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1263 /*
1264 -------------------------------------------------------------------------------
1265 Returns the result of converting the 64-bit two's complement integer `a'
1266 to the single-precision floating-point format.  The conversion is performed
1267 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1268 -------------------------------------------------------------------------------
1269 */
1270 float32 int64_to_float32( int64 a )
1271 {
1272     flag zSign;
1273     uint64 absA;
1274     int8 shiftCount;
1275 
1276     if ( a == 0 ) return 0;
1277     zSign = ( a < 0 );
1278     absA = zSign ? - a : a;
1279     shiftCount = countLeadingZeros64( absA ) - 40;
1280     if ( 0 <= shiftCount ) {
1281         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1282     }
1283     else {
1284         shiftCount += 7;
1285         if ( shiftCount < 0 ) {
1286             shift64RightJamming( absA, - shiftCount, &absA );
1287         }
1288         else {
1289             absA <<= shiftCount;
1290         }
1291         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1292     }
1293 
1294 }
1295 
1296 /*
1297 -------------------------------------------------------------------------------
1298 Returns the result of converting the 64-bit two's complement integer `a'
1299 to the double-precision floating-point format.  The conversion is performed
1300 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1301 -------------------------------------------------------------------------------
1302 */
1303 float64 int64_to_float64( int64 a )
1304 {
1305     flag zSign;
1306 
1307     if ( a == 0 ) return 0;
1308     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1309         return packFloat64( 1, 0x43E, 0 );
1310     }
1311     zSign = ( a < 0 );
1312     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1313 
1314 }
1315 
1316 #ifdef FLOATX80
1317 
1318 /*
1319 -------------------------------------------------------------------------------
1320 Returns the result of converting the 64-bit two's complement integer `a'
1321 to the extended double-precision floating-point format.  The conversion
1322 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1323 Arithmetic.
1324 -------------------------------------------------------------------------------
1325 */
1326 floatx80 int64_to_floatx80( int64 a )
1327 {
1328     flag zSign;
1329     uint64 absA;
1330     int8 shiftCount;
1331 
1332     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1333     zSign = ( a < 0 );
1334     absA = zSign ? - a : a;
1335     shiftCount = countLeadingZeros64( absA );
1336     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1337 
1338 }
1339 
1340 #endif
1341 
1342 #ifdef FLOAT128
1343 
1344 /*
1345 -------------------------------------------------------------------------------
1346 Returns the result of converting the 64-bit two's complement integer `a' to
1347 the quadruple-precision floating-point format.  The conversion is performed
1348 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1349 -------------------------------------------------------------------------------
1350 */
1351 float128 int64_to_float128( int64 a )
1352 {
1353     flag zSign;
1354     uint64 absA;
1355     int8 shiftCount;
1356     int32 zExp;
1357     bits64 zSig0, zSig1;
1358 
1359     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1360     zSign = ( a < 0 );
1361     absA = zSign ? - a : a;
1362     shiftCount = countLeadingZeros64( absA ) + 49;
1363     zExp = 0x406E - shiftCount;
1364     if ( 64 <= shiftCount ) {
1365         zSig1 = 0;
1366         zSig0 = absA;
1367         shiftCount -= 64;
1368     }
1369     else {
1370         zSig1 = absA;
1371         zSig0 = 0;
1372     }
1373     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1374     return packFloat128( zSign, zExp, zSig0, zSig1 );
1375 
1376 }
1377 
1378 #endif
1379 #endif /* !SOFTFLOAT_FOR_GCC */
1380 
1381 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1382 /*
1383 -------------------------------------------------------------------------------
1384 Returns the result of converting the single-precision floating-point value
1385 `a' to the 32-bit two's complement integer format.  The conversion is
1386 performed according to the IEC/IEEE Standard for Binary Floating-Point
1387 Arithmetic---which means in particular that the conversion is rounded
1388 according to the current rounding mode.  If `a' is a NaN, the largest
1389 positive integer is returned.  Otherwise, if the conversion overflows, the
1390 largest integer with the same sign as `a' is returned.
1391 -------------------------------------------------------------------------------
1392 */
1393 int32 float32_to_int32( float32 a )
1394 {
1395     flag aSign;
1396     int16 aExp, shiftCount;
1397     bits32 aSig;
1398     bits64 aSig64;
1399 
1400     aSig = extractFloat32Frac( a );
1401     aExp = extractFloat32Exp( a );
1402     aSign = extractFloat32Sign( a );
1403     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1404     if ( aExp ) aSig |= 0x00800000;
1405     shiftCount = 0xAF - aExp;
1406     aSig64 = aSig;
1407     aSig64 <<= 32;
1408     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1409     return roundAndPackInt32( aSign, aSig64 );
1410 
1411 }
1412 #endif /* !SOFTFLOAT_FOR_GCC */
1413 
1414 /*
1415 -------------------------------------------------------------------------------
1416 Returns the result of converting the single-precision floating-point value
1417 `a' to the 32-bit two's complement integer format.  The conversion is
1418 performed according to the IEC/IEEE Standard for Binary Floating-Point
1419 Arithmetic, except that the conversion is always rounded toward zero.
1420 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1421 the conversion overflows, the largest integer with the same sign as `a' is
1422 returned.
1423 -------------------------------------------------------------------------------
1424 */
1425 int32 float32_to_int32_round_to_zero( float32 a )
1426 {
1427     flag aSign;
1428     int16 aExp, shiftCount;
1429     bits32 aSig;
1430     int32 z;
1431 
1432     aSig = extractFloat32Frac( a );
1433     aExp = extractFloat32Exp( a );
1434     aSign = extractFloat32Sign( a );
1435     shiftCount = aExp - 0x9E;
1436     if ( 0 <= shiftCount ) {
1437         if ( a != 0xCF000000 ) {
1438             float_raise( float_flag_invalid );
1439             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1440         }
1441         return (sbits32) 0x80000000;
1442     }
1443     else if ( aExp <= 0x7E ) {
1444         if ( aExp | aSig ) float_set_inexact();
1445         return 0;
1446     }
1447     aSig = ( aSig | 0x00800000 )<<8;
1448     z = aSig>>( - shiftCount );
1449     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1450         float_set_inexact();
1451     }
1452     if ( aSign ) z = - z;
1453     return z;
1454 
1455 }
1456 
1457 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1458 /*
1459 -------------------------------------------------------------------------------
1460 Returns the result of converting the single-precision floating-point value
1461 `a' to the 64-bit two's complement integer format.  The conversion is
1462 performed according to the IEC/IEEE Standard for Binary Floating-Point
1463 Arithmetic---which means in particular that the conversion is rounded
1464 according to the current rounding mode.  If `a' is a NaN, the largest
1465 positive integer is returned.  Otherwise, if the conversion overflows, the
1466 largest integer with the same sign as `a' is returned.
1467 -------------------------------------------------------------------------------
1468 */
1469 int64 float32_to_int64( float32 a )
1470 {
1471     flag aSign;
1472     int16 aExp, shiftCount;
1473     bits32 aSig;
1474     bits64 aSig64, aSigExtra;
1475 
1476     aSig = extractFloat32Frac( a );
1477     aExp = extractFloat32Exp( a );
1478     aSign = extractFloat32Sign( a );
1479     shiftCount = 0xBE - aExp;
1480     if ( shiftCount < 0 ) {
1481         float_raise( float_flag_invalid );
1482         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1483             return LIT64( 0x7FFFFFFFFFFFFFFF );
1484         }
1485         return (sbits64) LIT64( 0x8000000000000000 );
1486     }
1487     if ( aExp ) aSig |= 0x00800000;
1488     aSig64 = aSig;
1489     aSig64 <<= 40;
1490     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1491     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1492 
1493 }
1494 
1495 /*
1496 -------------------------------------------------------------------------------
1497 Returns the result of converting the single-precision floating-point value
1498 `a' to the 64-bit two's complement integer format.  The conversion is
1499 performed according to the IEC/IEEE Standard for Binary Floating-Point
1500 Arithmetic, except that the conversion is always rounded toward zero.  If
1501 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1502 conversion overflows, the largest integer with the same sign as `a' is
1503 returned.
1504 -------------------------------------------------------------------------------
1505 */
1506 int64 float32_to_int64_round_to_zero( float32 a )
1507 {
1508     flag aSign;
1509     int16 aExp, shiftCount;
1510     bits32 aSig;
1511     bits64 aSig64;
1512     int64 z;
1513 
1514     aSig = extractFloat32Frac( a );
1515     aExp = extractFloat32Exp( a );
1516     aSign = extractFloat32Sign( a );
1517     shiftCount = aExp - 0xBE;
1518     if ( 0 <= shiftCount ) {
1519         if ( a != 0xDF000000 ) {
1520             float_raise( float_flag_invalid );
1521             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1522                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1523             }
1524         }
1525         return (sbits64) LIT64( 0x8000000000000000 );
1526     }
1527     else if ( aExp <= 0x7E ) {
1528         if ( aExp | aSig ) float_set_inexact();
1529         return 0;
1530     }
1531     aSig64 = aSig | 0x00800000;
1532     aSig64 <<= 40;
1533     z = aSig64>>( - shiftCount );
1534     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1535         float_set_inexact();
1536     }
1537     if ( aSign ) z = - z;
1538     return z;
1539 
1540 }
1541 #endif /* !SOFTFLOAT_FOR_GCC */
1542 
1543 /*
1544 -------------------------------------------------------------------------------
1545 Returns the result of converting the single-precision floating-point value
1546 `a' to the double-precision floating-point format.  The conversion is
1547 performed according to the IEC/IEEE Standard for Binary Floating-Point
1548 Arithmetic.
1549 -------------------------------------------------------------------------------
1550 */
1551 float64 float32_to_float64( float32 a )
1552 {
1553     flag aSign;
1554     int16 aExp;
1555     bits32 aSig;
1556 
1557     aSig = extractFloat32Frac( a );
1558     aExp = extractFloat32Exp( a );
1559     aSign = extractFloat32Sign( a );
1560     if ( aExp == 0xFF ) {
1561         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1562         return packFloat64( aSign, 0x7FF, 0 );
1563     }
1564     if ( aExp == 0 ) {
1565         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1566         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1567         --aExp;
1568     }
1569     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1570 
1571 }
1572 
1573 #ifdef FLOATX80
1574 
1575 /*
1576 -------------------------------------------------------------------------------
1577 Returns the result of converting the single-precision floating-point value
1578 `a' to the extended double-precision floating-point format.  The conversion
1579 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1580 Arithmetic.
1581 -------------------------------------------------------------------------------
1582 */
1583 floatx80 float32_to_floatx80( float32 a )
1584 {
1585     flag aSign;
1586     int16 aExp;
1587     bits32 aSig;
1588 
1589     aSig = extractFloat32Frac( a );
1590     aExp = extractFloat32Exp( a );
1591     aSign = extractFloat32Sign( a );
1592     if ( aExp == 0xFF ) {
1593         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1594         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1595     }
1596     if ( aExp == 0 ) {
1597         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1598         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1599     }
1600     aSig |= 0x00800000;
1601     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1602 
1603 }
1604 
1605 #endif
1606 
1607 #ifdef FLOAT128
1608 
1609 /*
1610 -------------------------------------------------------------------------------
1611 Returns the result of converting the single-precision floating-point value
1612 `a' to the double-precision floating-point format.  The conversion is
1613 performed according to the IEC/IEEE Standard for Binary Floating-Point
1614 Arithmetic.
1615 -------------------------------------------------------------------------------
1616 */
1617 float128 float32_to_float128( float32 a )
1618 {
1619     flag aSign;
1620     int16 aExp;
1621     bits32 aSig;
1622 
1623     aSig = extractFloat32Frac( a );
1624     aExp = extractFloat32Exp( a );
1625     aSign = extractFloat32Sign( a );
1626     if ( aExp == 0xFF ) {
1627         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1628         return packFloat128( aSign, 0x7FFF, 0, 0 );
1629     }
1630     if ( aExp == 0 ) {
1631         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1632         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1633         --aExp;
1634     }
1635     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1636 
1637 }
1638 
1639 #endif
1640 
1641 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1642 /*
1643 -------------------------------------------------------------------------------
1644 Rounds the single-precision floating-point value `a' to an integer, and
1645 returns the result as a single-precision floating-point value.  The
1646 operation is performed according to the IEC/IEEE Standard for Binary
1647 Floating-Point Arithmetic.
1648 -------------------------------------------------------------------------------
1649 */
1650 float32 float32_round_to_int( float32 a )
1651 {
1652     flag aSign;
1653     int16 aExp;
1654     bits32 lastBitMask, roundBitsMask;
1655     int8 roundingMode;
1656     float32 z;
1657 
1658     aExp = extractFloat32Exp( a );
1659     if ( 0x96 <= aExp ) {
1660         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1661             return propagateFloat32NaN( a, a );
1662         }
1663         return a;
1664     }
1665     if ( aExp <= 0x7E ) {
1666         if ( (bits32) ( a<<1 ) == 0 ) return a;
1667         float_set_inexact();
1668         aSign = extractFloat32Sign( a );
1669         switch ( float_rounding_mode() ) {
1670          case float_round_nearest_even:
1671             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1672                 return packFloat32( aSign, 0x7F, 0 );
1673             }
1674             break;
1675          case float_round_down:
1676             return aSign ? 0xBF800000 : 0;
1677          case float_round_up:
1678             return aSign ? 0x80000000 : 0x3F800000;
1679         }
1680         return packFloat32( aSign, 0, 0 );
1681     }
1682     lastBitMask = 1;
1683     lastBitMask <<= 0x96 - aExp;
1684     roundBitsMask = lastBitMask - 1;
1685     z = a;
1686     roundingMode = float_rounding_mode();
1687     if ( roundingMode == float_round_nearest_even ) {
1688         z += lastBitMask>>1;
1689         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1690     }
1691     else if ( roundingMode != float_round_to_zero ) {
1692         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1693             z += roundBitsMask;
1694         }
1695     }
1696     z &= ~ roundBitsMask;
1697     if ( z != a ) float_set_inexact();
1698     return z;
1699 
1700 }
1701 #endif /* !SOFTFLOAT_FOR_GCC */
1702 
1703 /*
1704 -------------------------------------------------------------------------------
1705 Returns the result of adding the absolute values of the single-precision
1706 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1707 before being returned.  `zSign' is ignored if the result is a NaN.
1708 The addition is performed according to the IEC/IEEE Standard for Binary
1709 Floating-Point Arithmetic.
1710 -------------------------------------------------------------------------------
1711 */
1712 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1713 {
1714     int16 aExp, bExp, zExp;
1715     bits32 aSig, bSig, zSig;
1716     int16 expDiff;
1717 
1718     aSig = extractFloat32Frac( a );
1719     aExp = extractFloat32Exp( a );
1720     bSig = extractFloat32Frac( b );
1721     bExp = extractFloat32Exp( b );
1722     expDiff = aExp - bExp;
1723     aSig <<= 6;
1724     bSig <<= 6;
1725     if ( 0 < expDiff ) {
1726         if ( aExp == 0xFF ) {
1727             if ( aSig ) return propagateFloat32NaN( a, b );
1728             return a;
1729         }
1730         if ( bExp == 0 ) {
1731             --expDiff;
1732         }
1733         else {
1734             bSig |= 0x20000000;
1735         }
1736         shift32RightJamming( bSig, expDiff, &bSig );
1737         zExp = aExp;
1738     }
1739     else if ( expDiff < 0 ) {
1740         if ( bExp == 0xFF ) {
1741             if ( bSig ) return propagateFloat32NaN( a, b );
1742             return packFloat32( zSign, 0xFF, 0 );
1743         }
1744         if ( aExp == 0 ) {
1745             ++expDiff;
1746         }
1747         else {
1748             aSig |= 0x20000000;
1749         }
1750         shift32RightJamming( aSig, - expDiff, &aSig );
1751         zExp = bExp;
1752     }
1753     else {
1754         if ( aExp == 0xFF ) {
1755             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1756             return a;
1757         }
1758         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1759         zSig = 0x40000000 + aSig + bSig;
1760         zExp = aExp;
1761         goto roundAndPack;
1762     }
1763     aSig |= 0x20000000;
1764     zSig = ( aSig + bSig )<<1;
1765     --zExp;
1766     if ( (sbits32) zSig < 0 ) {
1767         zSig = aSig + bSig;
1768         ++zExp;
1769     }
1770  roundAndPack:
1771     return roundAndPackFloat32( zSign, zExp, zSig );
1772 
1773 }
1774 
1775 /*
1776 -------------------------------------------------------------------------------
1777 Returns the result of subtracting the absolute values of the single-
1778 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1779 difference is negated before being returned.  `zSign' is ignored if the
1780 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1781 Standard for Binary Floating-Point Arithmetic.
1782 -------------------------------------------------------------------------------
1783 */
1784 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1785 {
1786     int16 aExp, bExp, zExp;
1787     bits32 aSig, bSig, zSig;
1788     int16 expDiff;
1789 
1790     aSig = extractFloat32Frac( a );
1791     aExp = extractFloat32Exp( a );
1792     bSig = extractFloat32Frac( b );
1793     bExp = extractFloat32Exp( b );
1794     expDiff = aExp - bExp;
1795     aSig <<= 7;
1796     bSig <<= 7;
1797     if ( 0 < expDiff ) goto aExpBigger;
1798     if ( expDiff < 0 ) goto bExpBigger;
1799     if ( aExp == 0xFF ) {
1800         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1801         float_raise( float_flag_invalid );
1802         return float32_default_nan;
1803     }
1804     if ( aExp == 0 ) {
1805         aExp = 1;
1806         bExp = 1;
1807     }
1808     if ( bSig < aSig ) goto aBigger;
1809     if ( aSig < bSig ) goto bBigger;
1810     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1811  bExpBigger:
1812     if ( bExp == 0xFF ) {
1813         if ( bSig ) return propagateFloat32NaN( a, b );
1814         return packFloat32( zSign ^ 1, 0xFF, 0 );
1815     }
1816     if ( aExp == 0 ) {
1817         ++expDiff;
1818     }
1819     else {
1820         aSig |= 0x40000000;
1821     }
1822     shift32RightJamming( aSig, - expDiff, &aSig );
1823     bSig |= 0x40000000;
1824  bBigger:
1825     zSig = bSig - aSig;
1826     zExp = bExp;
1827     zSign ^= 1;
1828     goto normalizeRoundAndPack;
1829  aExpBigger:
1830     if ( aExp == 0xFF ) {
1831         if ( aSig ) return propagateFloat32NaN( a, b );
1832         return a;
1833     }
1834     if ( bExp == 0 ) {
1835         --expDiff;
1836     }
1837     else {
1838         bSig |= 0x40000000;
1839     }
1840     shift32RightJamming( bSig, expDiff, &bSig );
1841     aSig |= 0x40000000;
1842  aBigger:
1843     zSig = aSig - bSig;
1844     zExp = aExp;
1845  normalizeRoundAndPack:
1846     --zExp;
1847     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1848 
1849 }
1850 
1851 /*
1852 -------------------------------------------------------------------------------
1853 Returns the result of adding the single-precision floating-point values `a'
1854 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1855 Binary Floating-Point Arithmetic.
1856 -------------------------------------------------------------------------------
1857 */
1858 float32 float32_add( float32 a, float32 b )
1859 {
1860     flag aSign, bSign;
1861 
1862     aSign = extractFloat32Sign( a );
1863     bSign = extractFloat32Sign( b );
1864     if ( aSign == bSign ) {
1865         return addFloat32Sigs( a, b, aSign );
1866     }
1867     else {
1868         return subFloat32Sigs( a, b, aSign );
1869     }
1870 
1871 }
1872 
1873 /*
1874 -------------------------------------------------------------------------------
1875 Returns the result of subtracting the single-precision floating-point values
1876 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1877 for Binary Floating-Point Arithmetic.
1878 -------------------------------------------------------------------------------
1879 */
1880 float32 float32_sub( float32 a, float32 b )
1881 {
1882     flag aSign, bSign;
1883 
1884     aSign = extractFloat32Sign( a );
1885     bSign = extractFloat32Sign( b );
1886     if ( aSign == bSign ) {
1887         return subFloat32Sigs( a, b, aSign );
1888     }
1889     else {
1890         return addFloat32Sigs( a, b, aSign );
1891     }
1892 
1893 }
1894 
1895 /*
1896 -------------------------------------------------------------------------------
1897 Returns the result of multiplying the single-precision floating-point values
1898 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1899 for Binary Floating-Point Arithmetic.
1900 -------------------------------------------------------------------------------
1901 */
1902 float32 float32_mul( float32 a, float32 b )
1903 {
1904     flag aSign, bSign, zSign;
1905     int16 aExp, bExp, zExp;
1906     bits32 aSig, bSig;
1907     bits64 zSig64;
1908     bits32 zSig;
1909 
1910     aSig = extractFloat32Frac( a );
1911     aExp = extractFloat32Exp( a );
1912     aSign = extractFloat32Sign( a );
1913     bSig = extractFloat32Frac( b );
1914     bExp = extractFloat32Exp( b );
1915     bSign = extractFloat32Sign( b );
1916     zSign = aSign ^ bSign;
1917     if ( aExp == 0xFF ) {
1918         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1919             return propagateFloat32NaN( a, b );
1920         }
1921         if ( ( bExp | bSig ) == 0 ) {
1922             float_raise( float_flag_invalid );
1923             return float32_default_nan;
1924         }
1925         return packFloat32( zSign, 0xFF, 0 );
1926     }
1927     if ( bExp == 0xFF ) {
1928         if ( bSig ) return propagateFloat32NaN( a, b );
1929         if ( ( aExp | aSig ) == 0 ) {
1930             float_raise( float_flag_invalid );
1931             return float32_default_nan;
1932         }
1933         return packFloat32( zSign, 0xFF, 0 );
1934     }
1935     if ( aExp == 0 ) {
1936         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1937         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1938     }
1939     if ( bExp == 0 ) {
1940         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1941         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1942     }
1943     zExp = aExp + bExp - 0x7F;
1944     aSig = ( aSig | 0x00800000 )<<7;
1945     bSig = ( bSig | 0x00800000 )<<8;
1946     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1947     zSig = zSig64;
1948     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1949         zSig <<= 1;
1950         --zExp;
1951     }
1952     return roundAndPackFloat32( zSign, zExp, zSig );
1953 
1954 }
1955 
1956 /*
1957 -------------------------------------------------------------------------------
1958 Returns the result of dividing the single-precision floating-point value `a'
1959 by the corresponding value `b'.  The operation is performed according to the
1960 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1961 -------------------------------------------------------------------------------
1962 */
1963 float32 float32_div( float32 a, float32 b )
1964 {
1965     flag aSign, bSign, zSign;
1966     int16 aExp, bExp, zExp;
1967     bits32 aSig, bSig, zSig;
1968 
1969     aSig = extractFloat32Frac( a );
1970     aExp = extractFloat32Exp( a );
1971     aSign = extractFloat32Sign( a );
1972     bSig = extractFloat32Frac( b );
1973     bExp = extractFloat32Exp( b );
1974     bSign = extractFloat32Sign( b );
1975     zSign = aSign ^ bSign;
1976     if ( aExp == 0xFF ) {
1977         if ( aSig ) return propagateFloat32NaN( a, b );
1978         if ( bExp == 0xFF ) {
1979             if ( bSig ) return propagateFloat32NaN( a, b );
1980             float_raise( float_flag_invalid );
1981             return float32_default_nan;
1982         }
1983         return packFloat32( zSign, 0xFF, 0 );
1984     }
1985     if ( bExp == 0xFF ) {
1986         if ( bSig ) return propagateFloat32NaN( a, b );
1987         return packFloat32( zSign, 0, 0 );
1988     }
1989     if ( bExp == 0 ) {
1990         if ( bSig == 0 ) {
1991             if ( ( aExp | aSig ) == 0 ) {
1992                 float_raise( float_flag_invalid );
1993                 return float32_default_nan;
1994             }
1995             float_raise( float_flag_divbyzero );
1996             return packFloat32( zSign, 0xFF, 0 );
1997         }
1998         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1999     }
2000     if ( aExp == 0 ) {
2001         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2002         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2003     }
2004     zExp = aExp - bExp + 0x7D;
2005     aSig = ( aSig | 0x00800000 )<<7;
2006     bSig = ( bSig | 0x00800000 )<<8;
2007     if ( bSig <= ( aSig + aSig ) ) {
2008         aSig >>= 1;
2009         ++zExp;
2010     }
2011     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2012     if ( ( zSig & 0x3F ) == 0 ) {
2013         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2014     }
2015     return roundAndPackFloat32( zSign, zExp, zSig );
2016 
2017 }
2018 
2019 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2020 /*
2021 -------------------------------------------------------------------------------
2022 Returns the remainder of the single-precision floating-point value `a'
2023 with respect to the corresponding value `b'.  The operation is performed
2024 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2025 -------------------------------------------------------------------------------
2026 */
2027 float32 float32_rem( float32 a, float32 b )
2028 {
2029     flag aSign, bSign, zSign;
2030     int16 aExp, bExp, expDiff;
2031     bits32 aSig, bSig;
2032     bits32 q;
2033     bits64 aSig64, bSig64, q64;
2034     bits32 alternateASig;
2035     sbits32 sigMean;
2036 
2037     aSig = extractFloat32Frac( a );
2038     aExp = extractFloat32Exp( a );
2039     aSign = extractFloat32Sign( a );
2040     bSig = extractFloat32Frac( b );
2041     bExp = extractFloat32Exp( b );
2042     bSign = extractFloat32Sign( b );
2043     if ( aExp == 0xFF ) {
2044         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2045             return propagateFloat32NaN( a, b );
2046         }
2047         float_raise( float_flag_invalid );
2048         return float32_default_nan;
2049     }
2050     if ( bExp == 0xFF ) {
2051         if ( bSig ) return propagateFloat32NaN( a, b );
2052         return a;
2053     }
2054     if ( bExp == 0 ) {
2055         if ( bSig == 0 ) {
2056             float_raise( float_flag_invalid );
2057             return float32_default_nan;
2058         }
2059         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2060     }
2061     if ( aExp == 0 ) {
2062         if ( aSig == 0 ) return a;
2063         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2064     }
2065     expDiff = aExp - bExp;
2066     aSig |= 0x00800000;
2067     bSig |= 0x00800000;
2068     if ( expDiff < 32 ) {
2069         aSig <<= 8;
2070         bSig <<= 8;
2071         if ( expDiff < 0 ) {
2072             if ( expDiff < -1 ) return a;
2073             aSig >>= 1;
2074         }
2075         q = ( bSig <= aSig );
2076         if ( q ) aSig -= bSig;
2077         if ( 0 < expDiff ) {
2078             q = ( ( (bits64) aSig )<<32 ) / bSig;
2079             q >>= 32 - expDiff;
2080             bSig >>= 2;
2081             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2082         }
2083         else {
2084             aSig >>= 2;
2085             bSig >>= 2;
2086         }
2087     }
2088     else {
2089         if ( bSig <= aSig ) aSig -= bSig;
2090         aSig64 = ( (bits64) aSig )<<40;
2091         bSig64 = ( (bits64) bSig )<<40;
2092         expDiff -= 64;
2093         while ( 0 < expDiff ) {
2094             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2095             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2096             aSig64 = - ( ( bSig * q64 )<<38 );
2097             expDiff -= 62;
2098         }
2099         expDiff += 64;
2100         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2101         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2102         q = q64>>( 64 - expDiff );
2103         bSig <<= 6;
2104         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2105     }
2106     do {
2107         alternateASig = aSig;
2108         ++q;
2109         aSig -= bSig;
2110     } while ( 0 <= (sbits32) aSig );
2111     sigMean = aSig + alternateASig;
2112     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2113         aSig = alternateASig;
2114     }
2115     zSign = ( (sbits32) aSig < 0 );
2116     if ( zSign ) aSig = - aSig;
2117     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2118 
2119 }
2120 #endif /* !SOFTFLOAT_FOR_GCC */
2121 
2122 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2123 /*
2124 -------------------------------------------------------------------------------
2125 Returns the square root of the single-precision floating-point value `a'.
2126 The operation is performed according to the IEC/IEEE Standard for Binary
2127 Floating-Point Arithmetic.
2128 -------------------------------------------------------------------------------
2129 */
2130 float32 float32_sqrt( float32 a )
2131 {
2132     flag aSign;
2133     int16 aExp, zExp;
2134     bits32 aSig, zSig;
2135     bits64 rem, term;
2136 
2137     aSig = extractFloat32Frac( a );
2138     aExp = extractFloat32Exp( a );
2139     aSign = extractFloat32Sign( a );
2140     if ( aExp == 0xFF ) {
2141         if ( aSig ) return propagateFloat32NaN( a, 0 );
2142         if ( ! aSign ) return a;
2143         float_raise( float_flag_invalid );
2144         return float32_default_nan;
2145     }
2146     if ( aSign ) {
2147         if ( ( aExp | aSig ) == 0 ) return a;
2148         float_raise( float_flag_invalid );
2149         return float32_default_nan;
2150     }
2151     if ( aExp == 0 ) {
2152         if ( aSig == 0 ) return 0;
2153         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2154     }
2155     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2156     aSig = ( aSig | 0x00800000 )<<8;
2157     zSig = estimateSqrt32( aExp, aSig ) + 2;
2158     if ( ( zSig & 0x7F ) <= 5 ) {
2159         if ( zSig < 2 ) {
2160             zSig = 0x7FFFFFFF;
2161             goto roundAndPack;
2162         }
2163         aSig >>= aExp & 1;
2164         term = ( (bits64) zSig ) * zSig;
2165         rem = ( ( (bits64) aSig )<<32 ) - term;
2166         while ( (sbits64) rem < 0 ) {
2167             --zSig;
2168             rem += ( ( (bits64) zSig )<<1 ) | 1;
2169         }
2170         zSig |= ( rem != 0 );
2171     }
2172     shift32RightJamming( zSig, 1, &zSig );
2173  roundAndPack:
2174     return roundAndPackFloat32( 0, zExp, zSig );
2175 
2176 }
2177 #endif /* !SOFTFLOAT_FOR_GCC */
2178 
2179 /*
2180 -------------------------------------------------------------------------------
2181 Returns 1 if the single-precision floating-point value `a' is equal to
2182 the corresponding value `b', and 0 otherwise.  The comparison is performed
2183 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2184 -------------------------------------------------------------------------------
2185 */
2186 flag float32_eq( float32 a, float32 b )
2187 {
2188 
2189     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2190          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2191        ) {
2192         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2193             float_raise( float_flag_invalid );
2194         }
2195         return 0;
2196     }
2197     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2198 
2199 }
2200 
2201 /*
2202 -------------------------------------------------------------------------------
2203 Returns 1 if the single-precision floating-point value `a' is less than
2204 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2205 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2206 Arithmetic.
2207 -------------------------------------------------------------------------------
2208 */
2209 flag float32_le( float32 a, float32 b )
2210 {
2211     flag aSign, bSign;
2212 
2213     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2214          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2215        ) {
2216         float_raise( float_flag_invalid );
2217         return 0;
2218     }
2219     aSign = extractFloat32Sign( a );
2220     bSign = extractFloat32Sign( b );
2221     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2222     return ( a == b ) || ( aSign ^ ( a < b ) );
2223 
2224 }
2225 
2226 /*
2227 -------------------------------------------------------------------------------
2228 Returns 1 if the single-precision floating-point value `a' is less than
2229 the corresponding value `b', and 0 otherwise.  The comparison is performed
2230 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2231 -------------------------------------------------------------------------------
2232 */
2233 flag float32_lt( float32 a, float32 b )
2234 {
2235     flag aSign, bSign;
2236 
2237     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2238          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2239        ) {
2240         float_raise( float_flag_invalid );
2241         return 0;
2242     }
2243     aSign = extractFloat32Sign( a );
2244     bSign = extractFloat32Sign( b );
2245     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2246     return ( a != b ) && ( aSign ^ ( a < b ) );
2247 
2248 }
2249 
2250 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2251 /*
2252 -------------------------------------------------------------------------------
2253 Returns 1 if the single-precision floating-point value `a' is equal to
2254 the corresponding value `b', and 0 otherwise.  The invalid exception is
2255 raised if either operand is a NaN.  Otherwise, the comparison is performed
2256 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2257 -------------------------------------------------------------------------------
2258 */
2259 flag float32_eq_signaling( float32 a, float32 b )
2260 {
2261 
2262     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2263          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2264        ) {
2265         float_raise( float_flag_invalid );
2266         return 0;
2267     }
2268     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2269 
2270 }
2271 
2272 /*
2273 -------------------------------------------------------------------------------
2274 Returns 1 if the single-precision floating-point value `a' is less than or
2275 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2276 cause an exception.  Otherwise, the comparison is performed according to the
2277 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2278 -------------------------------------------------------------------------------
2279 */
2280 flag float32_le_quiet( float32 a, float32 b )
2281 {
2282     flag aSign, bSign;
2283 
2284     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2285          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2286        ) {
2287         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2288             float_raise( float_flag_invalid );
2289         }
2290         return 0;
2291     }
2292     aSign = extractFloat32Sign( a );
2293     bSign = extractFloat32Sign( b );
2294     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2295     return ( a == b ) || ( aSign ^ ( a < b ) );
2296 
2297 }
2298 
2299 /*
2300 -------------------------------------------------------------------------------
2301 Returns 1 if the single-precision floating-point value `a' is less than
2302 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2303 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2304 Standard for Binary Floating-Point Arithmetic.
2305 -------------------------------------------------------------------------------
2306 */
2307 flag float32_lt_quiet( float32 a, float32 b )
2308 {
2309     flag aSign, bSign;
2310 
2311     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2312          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2313        ) {
2314         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2315             float_raise( float_flag_invalid );
2316         }
2317         return 0;
2318     }
2319     aSign = extractFloat32Sign( a );
2320     bSign = extractFloat32Sign( b );
2321     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2322     return ( a != b ) && ( aSign ^ ( a < b ) );
2323 
2324 }
2325 #endif /* !SOFTFLOAT_FOR_GCC */
2326 
2327 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2328 /*
2329 -------------------------------------------------------------------------------
2330 Returns the result of converting the double-precision floating-point value
2331 `a' to the 32-bit two's complement integer format.  The conversion is
2332 performed according to the IEC/IEEE Standard for Binary Floating-Point
2333 Arithmetic---which means in particular that the conversion is rounded
2334 according to the current rounding mode.  If `a' is a NaN, the largest
2335 positive integer is returned.  Otherwise, if the conversion overflows, the
2336 largest integer with the same sign as `a' is returned.
2337 -------------------------------------------------------------------------------
2338 */
2339 int32 float64_to_int32( float64 a )
2340 {
2341     flag aSign;
2342     int16 aExp, shiftCount;
2343     bits64 aSig;
2344 
2345     aSig = extractFloat64Frac( a );
2346     aExp = extractFloat64Exp( a );
2347     aSign = extractFloat64Sign( a );
2348     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2349     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2350     shiftCount = 0x42C - aExp;
2351     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2352     return roundAndPackInt32( aSign, aSig );
2353 
2354 }
2355 #endif /* !SOFTFLOAT_FOR_GCC */
2356 
2357 /*
2358 -------------------------------------------------------------------------------
2359 Returns the result of converting the double-precision floating-point value
2360 `a' to the 32-bit two's complement integer format.  The conversion is
2361 performed according to the IEC/IEEE Standard for Binary Floating-Point
2362 Arithmetic, except that the conversion is always rounded toward zero.
2363 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2364 the conversion overflows, the largest integer with the same sign as `a' is
2365 returned.
2366 -------------------------------------------------------------------------------
2367 */
2368 int32 float64_to_int32_round_to_zero( float64 a )
2369 {
2370     flag aSign;
2371     int16 aExp, shiftCount;
2372     bits64 aSig, savedASig;
2373     int32 z;
2374 
2375     aSig = extractFloat64Frac( a );
2376     aExp = extractFloat64Exp( a );
2377     aSign = extractFloat64Sign( a );
2378     if ( 0x41E < aExp ) {
2379         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2380         goto invalid;
2381     }
2382     else if ( aExp < 0x3FF ) {
2383         if ( aExp || aSig ) float_set_inexact();
2384         return 0;
2385     }
2386     aSig |= LIT64( 0x0010000000000000 );
2387     shiftCount = 0x433 - aExp;
2388     savedASig = aSig;
2389     aSig >>= shiftCount;
2390     z = aSig;
2391     if ( aSign ) z = - z;
2392     if ( ( z < 0 ) ^ aSign ) {
2393  invalid:
2394         float_raise( float_flag_invalid );
2395         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2396     }
2397     if ( ( aSig<<shiftCount ) != savedASig ) {
2398         float_set_inexact();
2399     }
2400     return z;
2401 
2402 }
2403 
2404 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2405 /*
2406 -------------------------------------------------------------------------------
2407 Returns the result of converting the double-precision floating-point value
2408 `a' to the 64-bit two's complement integer format.  The conversion is
2409 performed according to the IEC/IEEE Standard for Binary Floating-Point
2410 Arithmetic---which means in particular that the conversion is rounded
2411 according to the current rounding mode.  If `a' is a NaN, the largest
2412 positive integer is returned.  Otherwise, if the conversion overflows, the
2413 largest integer with the same sign as `a' is returned.
2414 -------------------------------------------------------------------------------
2415 */
2416 int64 float64_to_int64( float64 a )
2417 {
2418     flag aSign;
2419     int16 aExp, shiftCount;
2420     bits64 aSig, aSigExtra;
2421 
2422     aSig = extractFloat64Frac( a );
2423     aExp = extractFloat64Exp( a );
2424     aSign = extractFloat64Sign( a );
2425     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2426     shiftCount = 0x433 - aExp;
2427     if ( shiftCount <= 0 ) {
2428         if ( 0x43E < aExp ) {
2429             float_raise( float_flag_invalid );
2430             if (    ! aSign
2431                  || (    ( aExp == 0x7FF )
2432                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2433                ) {
2434                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2435             }
2436             return (sbits64) LIT64( 0x8000000000000000 );
2437         }
2438         aSigExtra = 0;
2439         aSig <<= - shiftCount;
2440     }
2441     else {
2442         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2443     }
2444     return roundAndPackInt64( aSign, aSig, aSigExtra );
2445 
2446 }
2447 
2448 #ifdef __alpha__
2449 /*
2450 -------------------------------------------------------------------------------
2451 Returns the result of converting the double-precision floating-point value
2452 `a' to the 64-bit two's complement integer format.  The conversion is
2453 performed according to the IEC/IEEE Standard for Binary Floating-Point
2454 Arithmetic---which means in particular that the conversion is rounded
2455 according to the current rounding mode.  If `a' is a NaN, the invalid
2456 exception is raised and zero is returned.
2457 -------------------------------------------------------------------------------
2458 */
2459 int64 float64_to_int64_no_overflow( float64 a )
2460 {
2461     flag aSign;
2462     int16 aExp, shiftCount;
2463     bits64 aSig, aSigExtra;
2464 
2465     aSig = extractFloat64Frac( a );
2466     aExp = extractFloat64Exp( a );
2467     aSign = extractFloat64Sign( a );
2468     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2469     shiftCount = 0x433 - aExp;
2470     if ( shiftCount <= 0 ) {
2471         if ( 0x43E < aExp ) {
2472             float_raise( float_flag_invalid );
2473             return 0;
2474         }
2475         aSigExtra = 0;
2476         aSig <<= - shiftCount;
2477     }
2478     else {
2479         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2480     }
2481     return roundAndPackInt64NoOverflow( aSign, aSig, aSigExtra );
2482 
2483 }
2484 #endif /* __alpha__ */
2485 
2486 /*
2487 -------------------------------------------------------------------------------
2488 Returns the result of converting the double-precision floating-point value
2489 `a' to the 64-bit two's complement integer format.  The conversion is
2490 performed according to the IEC/IEEE Standard for Binary Floating-Point
2491 Arithmetic, except that the conversion is always rounded toward zero.
2492 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2493 the conversion overflows, the largest integer with the same sign as `a' is
2494 returned.
2495 -------------------------------------------------------------------------------
2496 */
2497 int64 float64_to_int64_round_to_zero( float64 a )
2498 {
2499     flag aSign;
2500     int16 aExp, shiftCount;
2501     bits64 aSig;
2502     int64 z;
2503 
2504     aSig = extractFloat64Frac( a );
2505     aExp = extractFloat64Exp( a );
2506     aSign = extractFloat64Sign( a );
2507     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2508     shiftCount = aExp - 0x433;
2509     if ( 0 <= shiftCount ) {
2510         if ( 0x43E <= aExp ) {
2511             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2512                 float_raise( float_flag_invalid );
2513                 if (    ! aSign
2514                      || (    ( aExp == 0x7FF )
2515                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2516                    ) {
2517                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2518                 }
2519             }
2520             return (sbits64) LIT64( 0x8000000000000000 );
2521         }
2522         z = aSig<<shiftCount;
2523     }
2524     else {
2525         if ( aExp < 0x3FE ) {
2526             if ( aExp | aSig ) float_set_inexact();
2527             return 0;
2528         }
2529         z = aSig>>( - shiftCount );
2530         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2531             float_set_inexact();
2532         }
2533     }
2534     if ( aSign ) z = - z;
2535     return z;
2536 
2537 }
2538 #endif /* !SOFTFLOAT_FOR_GCC */
2539 
2540 /*
2541 -------------------------------------------------------------------------------
2542 Returns the result of converting the double-precision floating-point value
2543 `a' to the single-precision floating-point format.  The conversion is
2544 performed according to the IEC/IEEE Standard for Binary Floating-Point
2545 Arithmetic.
2546 -------------------------------------------------------------------------------
2547 */
2548 float32 float64_to_float32( float64 a )
2549 {
2550     flag aSign;
2551     int16 aExp;
2552     bits64 aSig;
2553     bits32 zSig;
2554 
2555     aSig = extractFloat64Frac( a );
2556     aExp = extractFloat64Exp( a );
2557     aSign = extractFloat64Sign( a );
2558     if ( aExp == 0x7FF ) {
2559         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2560         return packFloat32( aSign, 0xFF, 0 );
2561     }
2562     shift64RightJamming( aSig, 22, &aSig );
2563     zSig = aSig;
2564     if ( aExp || zSig ) {
2565         zSig |= 0x40000000;
2566         aExp -= 0x381;
2567     }
2568     return roundAndPackFloat32( aSign, aExp, zSig );
2569 
2570 }
2571 
2572 #ifdef FLOATX80
2573 
2574 /*
2575 -------------------------------------------------------------------------------
2576 Returns the result of converting the double-precision floating-point value
2577 `a' to the extended double-precision floating-point format.  The conversion
2578 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2579 Arithmetic.
2580 -------------------------------------------------------------------------------
2581 */
2582 floatx80 float64_to_floatx80( float64 a )
2583 {
2584     flag aSign;
2585     int16 aExp;
2586     bits64 aSig;
2587 
2588     aSig = extractFloat64Frac( a );
2589     aExp = extractFloat64Exp( a );
2590     aSign = extractFloat64Sign( a );
2591     if ( aExp == 0x7FF ) {
2592         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2593         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2594     }
2595     if ( aExp == 0 ) {
2596         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2597         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2598     }
2599     return
2600         packFloatx80(
2601             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2602 
2603 }
2604 
2605 #endif
2606 
2607 #ifdef FLOAT128
2608 
2609 /*
2610 -------------------------------------------------------------------------------
2611 Returns the result of converting the double-precision floating-point value
2612 `a' to the quadruple-precision floating-point format.  The conversion is
2613 performed according to the IEC/IEEE Standard for Binary Floating-Point
2614 Arithmetic.
2615 -------------------------------------------------------------------------------
2616 */
2617 float128 float64_to_float128( float64 a )
2618 {
2619     flag aSign;
2620     int16 aExp;
2621     bits64 aSig, zSig0, zSig1;
2622 
2623     aSig = extractFloat64Frac( a );
2624     aExp = extractFloat64Exp( a );
2625     aSign = extractFloat64Sign( a );
2626     if ( aExp == 0x7FF ) {
2627         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2628         return packFloat128( aSign, 0x7FFF, 0, 0 );
2629     }
2630     if ( aExp == 0 ) {
2631         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2632         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2633         --aExp;
2634     }
2635     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2636     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2637 
2638 }
2639 
2640 #endif
2641 
2642 #ifndef SOFTFLOAT_FOR_GCC
2643 /*
2644 -------------------------------------------------------------------------------
2645 Rounds the double-precision floating-point value `a' to an integer, and
2646 returns the result as a double-precision floating-point value.  The
2647 operation is performed according to the IEC/IEEE Standard for Binary
2648 Floating-Point Arithmetic.
2649 -------------------------------------------------------------------------------
2650 */
2651 float64 float64_round_to_int( float64 a )
2652 {
2653     flag aSign;
2654     int16 aExp;
2655     bits64 lastBitMask, roundBitsMask;
2656     int8 roundingMode;
2657     float64 z;
2658 
2659     aExp = extractFloat64Exp( a );
2660     if ( 0x433 <= aExp ) {
2661         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2662             return propagateFloat64NaN( a, a );
2663         }
2664         return a;
2665     }
2666     if ( aExp < 0x3FF ) {
2667         if ( (bits64) ( a<<1 ) == 0 ) return a;
2668         float_set_inexact();
2669         aSign = extractFloat64Sign( a );
2670         switch ( float_rounding_mode() ) {
2671          case float_round_nearest_even:
2672             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2673                 return packFloat64( aSign, 0x3FF, 0 );
2674             }
2675             break;
2676          case float_round_down:
2677             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2678          case float_round_up:
2679             return
2680             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2681         }
2682         return packFloat64( aSign, 0, 0 );
2683     }
2684     lastBitMask = 1;
2685     lastBitMask <<= 0x433 - aExp;
2686     roundBitsMask = lastBitMask - 1;
2687     z = a;
2688     roundingMode = float_rounding_mode();
2689     if ( roundingMode == float_round_nearest_even ) {
2690         z += lastBitMask>>1;
2691         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2692     }
2693     else if ( roundingMode != float_round_to_zero ) {
2694         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2695             z += roundBitsMask;
2696         }
2697     }
2698     z &= ~ roundBitsMask;
2699     if ( z != a ) float_set_inexact();
2700     return z;
2701 
2702 }
2703 #endif
2704 
2705 /*
2706 -------------------------------------------------------------------------------
2707 Returns the result of adding the absolute values of the double-precision
2708 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2709 before being returned.  `zSign' is ignored if the result is a NaN.
2710 The addition is performed according to the IEC/IEEE Standard for Binary
2711 Floating-Point Arithmetic.
2712 -------------------------------------------------------------------------------
2713 */
2714 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2715 {
2716     int16 aExp, bExp, zExp;
2717     bits64 aSig, bSig, zSig;
2718     int16 expDiff;
2719 
2720     aSig = extractFloat64Frac( a );
2721     aExp = extractFloat64Exp( a );
2722     bSig = extractFloat64Frac( b );
2723     bExp = extractFloat64Exp( b );
2724     expDiff = aExp - bExp;
2725     aSig <<= 9;
2726     bSig <<= 9;
2727     if ( 0 < expDiff ) {
2728         if ( aExp == 0x7FF ) {
2729             if ( aSig ) return propagateFloat64NaN( a, b );
2730             return a;
2731         }
2732         if ( bExp == 0 ) {
2733             --expDiff;
2734         }
2735         else {
2736             bSig |= LIT64( 0x2000000000000000 );
2737         }
2738         shift64RightJamming( bSig, expDiff, &bSig );
2739         zExp = aExp;
2740     }
2741     else if ( expDiff < 0 ) {
2742         if ( bExp == 0x7FF ) {
2743             if ( bSig ) return propagateFloat64NaN( a, b );
2744             return packFloat64( zSign, 0x7FF, 0 );
2745         }
2746         if ( aExp == 0 ) {
2747             ++expDiff;
2748         }
2749         else {
2750             aSig |= LIT64( 0x2000000000000000 );
2751         }
2752         shift64RightJamming( aSig, - expDiff, &aSig );
2753         zExp = bExp;
2754     }
2755     else {
2756         if ( aExp == 0x7FF ) {
2757             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2758             return a;
2759         }
2760         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2761         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2762         zExp = aExp;
2763         goto roundAndPack;
2764     }
2765     aSig |= LIT64( 0x2000000000000000 );
2766     zSig = ( aSig + bSig )<<1;
2767     --zExp;
2768     if ( (sbits64) zSig < 0 ) {
2769         zSig = aSig + bSig;
2770         ++zExp;
2771     }
2772  roundAndPack:
2773     return roundAndPackFloat64( zSign, zExp, zSig );
2774 
2775 }
2776 
2777 /*
2778 -------------------------------------------------------------------------------
2779 Returns the result of subtracting the absolute values of the double-
2780 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2781 difference is negated before being returned.  `zSign' is ignored if the
2782 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2783 Standard for Binary Floating-Point Arithmetic.
2784 -------------------------------------------------------------------------------
2785 */
2786 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2787 {
2788     int16 aExp, bExp, zExp;
2789     bits64 aSig, bSig, zSig;
2790     int16 expDiff;
2791 
2792     aSig = extractFloat64Frac( a );
2793     aExp = extractFloat64Exp( a );
2794     bSig = extractFloat64Frac( b );
2795     bExp = extractFloat64Exp( b );
2796     expDiff = aExp - bExp;
2797     aSig <<= 10;
2798     bSig <<= 10;
2799     if ( 0 < expDiff ) goto aExpBigger;
2800     if ( expDiff < 0 ) goto bExpBigger;
2801     if ( aExp == 0x7FF ) {
2802         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2803         float_raise( float_flag_invalid );
2804         return float64_default_nan;
2805     }
2806     if ( aExp == 0 ) {
2807         aExp = 1;
2808         bExp = 1;
2809     }
2810     if ( bSig < aSig ) goto aBigger;
2811     if ( aSig < bSig ) goto bBigger;
2812     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2813  bExpBigger:
2814     if ( bExp == 0x7FF ) {
2815         if ( bSig ) return propagateFloat64NaN( a, b );
2816         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2817     }
2818     if ( aExp == 0 ) {
2819         ++expDiff;
2820     }
2821     else {
2822         aSig |= LIT64( 0x4000000000000000 );
2823     }
2824     shift64RightJamming( aSig, - expDiff, &aSig );
2825     bSig |= LIT64( 0x4000000000000000 );
2826  bBigger:
2827     zSig = bSig - aSig;
2828     zExp = bExp;
2829     zSign ^= 1;
2830     goto normalizeRoundAndPack;
2831  aExpBigger:
2832     if ( aExp == 0x7FF ) {
2833         if ( aSig ) return propagateFloat64NaN( a, b );
2834         return a;
2835     }
2836     if ( bExp == 0 ) {
2837         --expDiff;
2838     }
2839     else {
2840         bSig |= LIT64( 0x4000000000000000 );
2841     }
2842     shift64RightJamming( bSig, expDiff, &bSig );
2843     aSig |= LIT64( 0x4000000000000000 );
2844  aBigger:
2845     zSig = aSig - bSig;
2846     zExp = aExp;
2847  normalizeRoundAndPack:
2848     --zExp;
2849     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2850 
2851 }
2852 
2853 /*
2854 -------------------------------------------------------------------------------
2855 Returns the result of adding the double-precision floating-point values `a'
2856 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2857 Binary Floating-Point Arithmetic.
2858 -------------------------------------------------------------------------------
2859 */
2860 float64 float64_add( float64 a, float64 b )
2861 {
2862     flag aSign, bSign;
2863 
2864     aSign = extractFloat64Sign( a );
2865     bSign = extractFloat64Sign( b );
2866     if ( aSign == bSign ) {
2867         return addFloat64Sigs( a, b, aSign );
2868     }
2869     else {
2870         return subFloat64Sigs( a, b, aSign );
2871     }
2872 
2873 }
2874 
2875 /*
2876 -------------------------------------------------------------------------------
2877 Returns the result of subtracting the double-precision floating-point values
2878 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2879 for Binary Floating-Point Arithmetic.
2880 -------------------------------------------------------------------------------
2881 */
2882 float64 float64_sub( float64 a, float64 b )
2883 {
2884     flag aSign, bSign;
2885 
2886     aSign = extractFloat64Sign( a );
2887     bSign = extractFloat64Sign( b );
2888     if ( aSign == bSign ) {
2889         return subFloat64Sigs( a, b, aSign );
2890     }
2891     else {
2892         return addFloat64Sigs( a, b, aSign );
2893     }
2894 
2895 }
2896 
2897 /*
2898 -------------------------------------------------------------------------------
2899 Returns the result of multiplying the double-precision floating-point values
2900 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2901 for Binary Floating-Point Arithmetic.
2902 -------------------------------------------------------------------------------
2903 */
2904 float64 float64_mul( float64 a, float64 b )
2905 {
2906     flag aSign, bSign, zSign;
2907     int16 aExp, bExp, zExp;
2908     bits64 aSig, bSig, zSig0, zSig1;
2909 
2910     aSig = extractFloat64Frac( a );
2911     aExp = extractFloat64Exp( a );
2912     aSign = extractFloat64Sign( a );
2913     bSig = extractFloat64Frac( b );
2914     bExp = extractFloat64Exp( b );
2915     bSign = extractFloat64Sign( b );
2916     zSign = aSign ^ bSign;
2917     if ( aExp == 0x7FF ) {
2918         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2919             return propagateFloat64NaN( a, b );
2920         }
2921         if ( ( bExp | bSig ) == 0 ) {
2922             float_raise( float_flag_invalid );
2923             return float64_default_nan;
2924         }
2925         return packFloat64( zSign, 0x7FF, 0 );
2926     }
2927     if ( bExp == 0x7FF ) {
2928         if ( bSig ) return propagateFloat64NaN( a, b );
2929         if ( ( aExp | aSig ) == 0 ) {
2930             float_raise( float_flag_invalid );
2931             return float64_default_nan;
2932         }
2933         return packFloat64( zSign, 0x7FF, 0 );
2934     }
2935     if ( aExp == 0 ) {
2936         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2937         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2938     }
2939     if ( bExp == 0 ) {
2940         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2941         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2942     }
2943     zExp = aExp + bExp - 0x3FF;
2944     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2945     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2946     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2947     zSig0 |= ( zSig1 != 0 );
2948     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2949         zSig0 <<= 1;
2950         --zExp;
2951     }
2952     return roundAndPackFloat64( zSign, zExp, zSig0 );
2953 
2954 }
2955 
2956 /*
2957 -------------------------------------------------------------------------------
2958 Returns the result of dividing the double-precision floating-point value `a'
2959 by the corresponding value `b'.  The operation is performed according to
2960 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2961 -------------------------------------------------------------------------------
2962 */
2963 float64 float64_div( float64 a, float64 b )
2964 {
2965     flag aSign, bSign, zSign;
2966     int16 aExp, bExp, zExp;
2967     bits64 aSig, bSig, zSig;
2968     bits64 rem0, rem1;
2969     bits64 term0, term1;
2970 
2971     aSig = extractFloat64Frac( a );
2972     aExp = extractFloat64Exp( a );
2973     aSign = extractFloat64Sign( a );
2974     bSig = extractFloat64Frac( b );
2975     bExp = extractFloat64Exp( b );
2976     bSign = extractFloat64Sign( b );
2977     zSign = aSign ^ bSign;
2978     if ( aExp == 0x7FF ) {
2979         if ( aSig ) return propagateFloat64NaN( a, b );
2980         if ( bExp == 0x7FF ) {
2981             if ( bSig ) return propagateFloat64NaN( a, b );
2982             float_raise( float_flag_invalid );
2983             return float64_default_nan;
2984         }
2985         return packFloat64( zSign, 0x7FF, 0 );
2986     }
2987     if ( bExp == 0x7FF ) {
2988         if ( bSig ) return propagateFloat64NaN( a, b );
2989         return packFloat64( zSign, 0, 0 );
2990     }
2991     if ( bExp == 0 ) {
2992         if ( bSig == 0 ) {
2993             if ( ( aExp | aSig ) == 0 ) {
2994                 float_raise( float_flag_invalid );
2995                 return float64_default_nan;
2996             }
2997             float_raise( float_flag_divbyzero );
2998             return packFloat64( zSign, 0x7FF, 0 );
2999         }
3000         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3001     }
3002     if ( aExp == 0 ) {
3003         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3004         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3005     }
3006     zExp = aExp - bExp + 0x3FD;
3007     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3008     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3009     if ( bSig <= ( aSig + aSig ) ) {
3010         aSig >>= 1;
3011         ++zExp;
3012     }
3013     zSig = estimateDiv128To64( aSig, 0, bSig );
3014     if ( ( zSig & 0x1FF ) <= 2 ) {
3015         mul64To128( bSig, zSig, &term0, &term1 );
3016         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3017         while ( (sbits64) rem0 < 0 ) {
3018             --zSig;
3019             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3020         }
3021         zSig |= ( rem1 != 0 );
3022     }
3023     return roundAndPackFloat64( zSign, zExp, zSig );
3024 
3025 }
3026 
3027 #ifndef SOFTFLOAT_FOR_GCC
3028 /*
3029 -------------------------------------------------------------------------------
3030 Returns the remainder of the double-precision floating-point value `a'
3031 with respect to the corresponding value `b'.  The operation is performed
3032 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3033 -------------------------------------------------------------------------------
3034 */
3035 float64 float64_rem( float64 a, float64 b )
3036 {
3037     flag aSign, bSign, zSign;
3038     int16 aExp, bExp, expDiff;
3039     bits64 aSig, bSig;
3040     bits64 q, alternateASig;
3041     sbits64 sigMean;
3042 
3043     aSig = extractFloat64Frac( a );
3044     aExp = extractFloat64Exp( a );
3045     aSign = extractFloat64Sign( a );
3046     bSig = extractFloat64Frac( b );
3047     bExp = extractFloat64Exp( b );
3048     bSign = extractFloat64Sign( b );
3049     if ( aExp == 0x7FF ) {
3050         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3051             return propagateFloat64NaN( a, b );
3052         }
3053         float_raise( float_flag_invalid );
3054         return float64_default_nan;
3055     }
3056     if ( bExp == 0x7FF ) {
3057         if ( bSig ) return propagateFloat64NaN( a, b );
3058         return a;
3059     }
3060     if ( bExp == 0 ) {
3061         if ( bSig == 0 ) {
3062             float_raise( float_flag_invalid );
3063             return float64_default_nan;
3064         }
3065         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3066     }
3067     if ( aExp == 0 ) {
3068         if ( aSig == 0 ) return a;
3069         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3070     }
3071     expDiff = aExp - bExp;
3072     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3073     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3074     if ( expDiff < 0 ) {
3075         if ( expDiff < -1 ) return a;
3076         aSig >>= 1;
3077     }
3078     q = ( bSig <= aSig );
3079     if ( q ) aSig -= bSig;
3080     expDiff -= 64;
3081     while ( 0 < expDiff ) {
3082         q = estimateDiv128To64( aSig, 0, bSig );
3083         q = ( 2 < q ) ? q - 2 : 0;
3084         aSig = - ( ( bSig>>2 ) * q );
3085         expDiff -= 62;
3086     }
3087     expDiff += 64;
3088     if ( 0 < expDiff ) {
3089         q = estimateDiv128To64( aSig, 0, bSig );
3090         q = ( 2 < q ) ? q - 2 : 0;
3091         q >>= 64 - expDiff;
3092         bSig >>= 2;
3093         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3094     }
3095     else {
3096         aSig >>= 2;
3097         bSig >>= 2;
3098     }
3099     do {
3100         alternateASig = aSig;
3101         ++q;
3102         aSig -= bSig;
3103     } while ( 0 <= (sbits64) aSig );
3104     sigMean = aSig + alternateASig;
3105     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3106         aSig = alternateASig;
3107     }
3108     zSign = ( (sbits64) aSig < 0 );
3109     if ( zSign ) aSig = - aSig;
3110     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3111 
3112 }
3113 
3114 /*
3115 -------------------------------------------------------------------------------
3116 Returns the square root of the double-precision floating-point value `a'.
3117 The operation is performed according to the IEC/IEEE Standard for Binary
3118 Floating-Point Arithmetic.
3119 -------------------------------------------------------------------------------
3120 */
3121 float64 float64_sqrt( float64 a )
3122 {
3123     flag aSign;
3124     int16 aExp, zExp;
3125     bits64 aSig, zSig, doubleZSig;
3126     bits64 rem0, rem1, term0, term1;
3127 
3128     aSig = extractFloat64Frac( a );
3129     aExp = extractFloat64Exp( a );
3130     aSign = extractFloat64Sign( a );
3131     if ( aExp == 0x7FF ) {
3132         if ( aSig ) return propagateFloat64NaN( a, a );
3133         if ( ! aSign ) return a;
3134         float_raise( float_flag_invalid );
3135         return float64_default_nan;
3136     }
3137     if ( aSign ) {
3138         if ( ( aExp | aSig ) == 0 ) return a;
3139         float_raise( float_flag_invalid );
3140         return float64_default_nan;
3141     }
3142     if ( aExp == 0 ) {
3143         if ( aSig == 0 ) return 0;
3144         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3145     }
3146     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3147     aSig |= LIT64( 0x0010000000000000 );
3148     zSig = estimateSqrt32( aExp, aSig>>21 );
3149     aSig <<= 9 - ( aExp & 1 );
3150     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3151     if ( ( zSig & 0x1FF ) <= 5 ) {
3152         doubleZSig = zSig<<1;
3153         mul64To128( zSig, zSig, &term0, &term1 );
3154         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3155         while ( (sbits64) rem0 < 0 ) {
3156             --zSig;
3157             doubleZSig -= 2;
3158             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3159         }
3160         zSig |= ( ( rem0 | rem1 ) != 0 );
3161     }
3162     return roundAndPackFloat64( 0, zExp, zSig );
3163 
3164 }
3165 #endif
3166 
3167 /*
3168 -------------------------------------------------------------------------------
3169 Returns 1 if the double-precision floating-point value `a' is equal to the
3170 corresponding value `b', and 0 otherwise.  The comparison is performed
3171 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3172 -------------------------------------------------------------------------------
3173 */
3174 flag float64_eq( float64 a, float64 b )
3175 {
3176 
3177     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3178          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3179        ) {
3180         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3181             float_raise( float_flag_invalid );
3182         }
3183         return 0;
3184     }
3185     return ( a == b ) ||
3186 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3187 
3188 }
3189 
3190 /*
3191 -------------------------------------------------------------------------------
3192 Returns 1 if the double-precision floating-point value `a' is less than or
3193 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3194 performed according to the IEC/IEEE Standard for Binary Floating-Point
3195 Arithmetic.
3196 -------------------------------------------------------------------------------
3197 */
3198 flag float64_le( float64 a, float64 b )
3199 {
3200     flag aSign, bSign;
3201 
3202     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3203          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3204        ) {
3205         float_raise( float_flag_invalid );
3206         return 0;
3207     }
3208     aSign = extractFloat64Sign( a );
3209     bSign = extractFloat64Sign( b );
3210     if ( aSign != bSign )
3211 	return aSign ||
3212 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3213 	      0 );
3214     return ( a == b ) ||
3215 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3216 
3217 }
3218 
3219 /*
3220 -------------------------------------------------------------------------------
3221 Returns 1 if the double-precision floating-point value `a' is less than
3222 the corresponding value `b', and 0 otherwise.  The comparison is performed
3223 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3224 -------------------------------------------------------------------------------
3225 */
3226 flag float64_lt( float64 a, float64 b )
3227 {
3228     flag aSign, bSign;
3229 
3230     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3231          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3232        ) {
3233         float_raise( float_flag_invalid );
3234         return 0;
3235     }
3236     aSign = extractFloat64Sign( a );
3237     bSign = extractFloat64Sign( b );
3238     if ( aSign != bSign )
3239 	return aSign &&
3240 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3241 	      0 );
3242     return ( a != b ) &&
3243 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3244 
3245 }
3246 
3247 #ifndef SOFTFLOAT_FOR_GCC
3248 /*
3249 -------------------------------------------------------------------------------
3250 Returns 1 if the double-precision floating-point value `a' is equal to the
3251 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3252 if either operand is a NaN.  Otherwise, the comparison is performed
3253 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3254 -------------------------------------------------------------------------------
3255 */
3256 flag float64_eq_signaling( float64 a, float64 b )
3257 {
3258 
3259     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3260          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3261        ) {
3262         float_raise( float_flag_invalid );
3263         return 0;
3264     }
3265     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3266 
3267 }
3268 
3269 /*
3270 -------------------------------------------------------------------------------
3271 Returns 1 if the double-precision floating-point value `a' is less than or
3272 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3273 cause an exception.  Otherwise, the comparison is performed according to the
3274 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3275 -------------------------------------------------------------------------------
3276 */
3277 flag float64_le_quiet( float64 a, float64 b )
3278 {
3279     flag aSign, bSign;
3280 
3281     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3282          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3283        ) {
3284         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3285             float_raise( float_flag_invalid );
3286         }
3287         return 0;
3288     }
3289     aSign = extractFloat64Sign( a );
3290     bSign = extractFloat64Sign( b );
3291     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3292     return ( a == b ) || ( aSign ^ ( a < b ) );
3293 
3294 }
3295 
3296 /*
3297 -------------------------------------------------------------------------------
3298 Returns 1 if the double-precision floating-point value `a' is less than
3299 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3300 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3301 Standard for Binary Floating-Point Arithmetic.
3302 -------------------------------------------------------------------------------
3303 */
3304 flag float64_lt_quiet( float64 a, float64 b )
3305 {
3306     flag aSign, bSign;
3307 
3308     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3309          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3310        ) {
3311         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3312             float_raise( float_flag_invalid );
3313         }
3314         return 0;
3315     }
3316     aSign = extractFloat64Sign( a );
3317     bSign = extractFloat64Sign( b );
3318     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3319     return ( a != b ) && ( aSign ^ ( a < b ) );
3320 
3321 }
3322 #endif
3323 
3324 #ifdef FLOATX80
3325 
3326 /*
3327 -------------------------------------------------------------------------------
3328 Returns the result of converting the extended double-precision floating-
3329 point value `a' to the 32-bit two's complement integer format.  The
3330 conversion is performed according to the IEC/IEEE Standard for Binary
3331 Floating-Point Arithmetic---which means in particular that the conversion
3332 is rounded according to the current rounding mode.  If `a' is a NaN, the
3333 largest positive integer is returned.  Otherwise, if the conversion
3334 overflows, the largest integer with the same sign as `a' is returned.
3335 -------------------------------------------------------------------------------
3336 */
3337 int32 floatx80_to_int32( floatx80 a )
3338 {
3339     flag aSign;
3340     int32 aExp, shiftCount;
3341     bits64 aSig;
3342 
3343     aSig = extractFloatx80Frac( a );
3344     aExp = extractFloatx80Exp( a );
3345     aSign = extractFloatx80Sign( a );
3346     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3347     shiftCount = 0x4037 - aExp;
3348     if ( shiftCount <= 0 ) shiftCount = 1;
3349     shift64RightJamming( aSig, shiftCount, &aSig );
3350     return roundAndPackInt32( aSign, aSig );
3351 
3352 }
3353 
3354 /*
3355 -------------------------------------------------------------------------------
3356 Returns the result of converting the extended double-precision floating-
3357 point value `a' to the 32-bit two's complement integer format.  The
3358 conversion is performed according to the IEC/IEEE Standard for Binary
3359 Floating-Point Arithmetic, except that the conversion is always rounded
3360 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3361 Otherwise, if the conversion overflows, the largest integer with the same
3362 sign as `a' is returned.
3363 -------------------------------------------------------------------------------
3364 */
3365 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3366 {
3367     flag aSign;
3368     int32 aExp, shiftCount;
3369     bits64 aSig, savedASig;
3370     int32 z;
3371 
3372     aSig = extractFloatx80Frac( a );
3373     aExp = extractFloatx80Exp( a );
3374     aSign = extractFloatx80Sign( a );
3375     if ( 0x401E < aExp ) {
3376         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3377         goto invalid;
3378     }
3379     else if ( aExp < 0x3FFF ) {
3380         if ( aExp || aSig ) float_set_inexact();
3381         return 0;
3382     }
3383     shiftCount = 0x403E - aExp;
3384     savedASig = aSig;
3385     aSig >>= shiftCount;
3386     z = aSig;
3387     if ( aSign ) z = - z;
3388     if ( ( z < 0 ) ^ aSign ) {
3389  invalid:
3390         float_raise( float_flag_invalid );
3391         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3392     }
3393     if ( ( aSig<<shiftCount ) != savedASig ) {
3394         float_set_inexact();
3395     }
3396     return z;
3397 
3398 }
3399 
3400 /*
3401 -------------------------------------------------------------------------------
3402 Returns the result of converting the extended double-precision floating-
3403 point value `a' to the 64-bit two's complement integer format.  The
3404 conversion is performed according to the IEC/IEEE Standard for Binary
3405 Floating-Point Arithmetic---which means in particular that the conversion
3406 is rounded according to the current rounding mode.  If `a' is a NaN,
3407 the largest positive integer is returned.  Otherwise, if the conversion
3408 overflows, the largest integer with the same sign as `a' is returned.
3409 -------------------------------------------------------------------------------
3410 */
3411 int64 floatx80_to_int64( floatx80 a )
3412 {
3413     flag aSign;
3414     int32 aExp, shiftCount;
3415     bits64 aSig, aSigExtra;
3416 
3417     aSig = extractFloatx80Frac( a );
3418     aExp = extractFloatx80Exp( a );
3419     aSign = extractFloatx80Sign( a );
3420     shiftCount = 0x403E - aExp;
3421     if ( shiftCount <= 0 ) {
3422         if ( shiftCount ) {
3423             float_raise( float_flag_invalid );
3424             if (    ! aSign
3425                  || (    ( aExp == 0x7FFF )
3426                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3427                ) {
3428                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3429             }
3430             return (sbits64) LIT64( 0x8000000000000000 );
3431         }
3432         aSigExtra = 0;
3433     }
3434     else {
3435         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3436     }
3437     return roundAndPackInt64( aSign, aSig, aSigExtra );
3438 
3439 }
3440 
3441 /*
3442 -------------------------------------------------------------------------------
3443 Returns the result of converting the extended double-precision floating-
3444 point value `a' to the 64-bit two's complement integer format.  The
3445 conversion is performed according to the IEC/IEEE Standard for Binary
3446 Floating-Point Arithmetic, except that the conversion is always rounded
3447 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3448 Otherwise, if the conversion overflows, the largest integer with the same
3449 sign as `a' is returned.
3450 -------------------------------------------------------------------------------
3451 */
3452 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3453 {
3454     flag aSign;
3455     int32 aExp, shiftCount;
3456     bits64 aSig;
3457     int64 z;
3458 
3459     aSig = extractFloatx80Frac( a );
3460     aExp = extractFloatx80Exp( a );
3461     aSign = extractFloatx80Sign( a );
3462     shiftCount = aExp - 0x403E;
3463     if ( 0 <= shiftCount ) {
3464         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3465         if ( ( a.high != 0xC03E ) || aSig ) {
3466             float_raise( float_flag_invalid );
3467             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3468                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3469             }
3470         }
3471         return (sbits64) LIT64( 0x8000000000000000 );
3472     }
3473     else if ( aExp < 0x3FFF ) {
3474         if ( aExp | aSig ) float_set_inexact();
3475         return 0;
3476     }
3477     z = aSig>>( - shiftCount );
3478     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3479         float_set_inexact();
3480     }
3481     if ( aSign ) z = - z;
3482     return z;
3483 
3484 }
3485 
3486 /*
3487 -------------------------------------------------------------------------------
3488 Returns the result of converting the extended double-precision floating-
3489 point value `a' to the single-precision floating-point format.  The
3490 conversion is performed according to the IEC/IEEE Standard for Binary
3491 Floating-Point Arithmetic.
3492 -------------------------------------------------------------------------------
3493 */
3494 float32 floatx80_to_float32( floatx80 a )
3495 {
3496     flag aSign;
3497     int32 aExp;
3498     bits64 aSig;
3499 
3500     aSig = extractFloatx80Frac( a );
3501     aExp = extractFloatx80Exp( a );
3502     aSign = extractFloatx80Sign( a );
3503     if ( aExp == 0x7FFF ) {
3504         if ( (bits64) ( aSig<<1 ) ) {
3505             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3506         }
3507         return packFloat32( aSign, 0xFF, 0 );
3508     }
3509     shift64RightJamming( aSig, 33, &aSig );
3510     if ( aExp || aSig ) aExp -= 0x3F81;
3511     return roundAndPackFloat32( aSign, aExp, aSig );
3512 
3513 }
3514 
3515 /*
3516 -------------------------------------------------------------------------------
3517 Returns the result of converting the extended double-precision floating-
3518 point value `a' to the double-precision floating-point format.  The
3519 conversion is performed according to the IEC/IEEE Standard for Binary
3520 Floating-Point Arithmetic.
3521 -------------------------------------------------------------------------------
3522 */
3523 float64 floatx80_to_float64( floatx80 a )
3524 {
3525     flag aSign;
3526     int32 aExp;
3527     bits64 aSig, zSig;
3528 
3529     aSig = extractFloatx80Frac( a );
3530     aExp = extractFloatx80Exp( a );
3531     aSign = extractFloatx80Sign( a );
3532     if ( aExp == 0x7FFF ) {
3533         if ( (bits64) ( aSig<<1 ) ) {
3534             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3535         }
3536         return packFloat64( aSign, 0x7FF, 0 );
3537     }
3538     shift64RightJamming( aSig, 1, &zSig );
3539     if ( aExp || aSig ) aExp -= 0x3C01;
3540     return roundAndPackFloat64( aSign, aExp, zSig );
3541 
3542 }
3543 
3544 #ifdef FLOAT128
3545 
3546 /*
3547 -------------------------------------------------------------------------------
3548 Returns the result of converting the extended double-precision floating-
3549 point value `a' to the quadruple-precision floating-point format.  The
3550 conversion is performed according to the IEC/IEEE Standard for Binary
3551 Floating-Point Arithmetic.
3552 -------------------------------------------------------------------------------
3553 */
3554 float128 floatx80_to_float128( floatx80 a )
3555 {
3556     flag aSign;
3557     int16 aExp;
3558     bits64 aSig, zSig0, zSig1;
3559 
3560     aSig = extractFloatx80Frac( a );
3561     aExp = extractFloatx80Exp( a );
3562     aSign = extractFloatx80Sign( a );
3563     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3564         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3565     }
3566     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3567     return packFloat128( aSign, aExp, zSig0, zSig1 );
3568 
3569 }
3570 
3571 #endif
3572 
3573 /*
3574 -------------------------------------------------------------------------------
3575 Rounds the extended double-precision floating-point value `a' to an integer,
3576 and returns the result as an extended quadruple-precision floating-point
3577 value.  The operation is performed according to the IEC/IEEE Standard for
3578 Binary Floating-Point Arithmetic.
3579 -------------------------------------------------------------------------------
3580 */
3581 floatx80 floatx80_round_to_int( floatx80 a )
3582 {
3583     flag aSign;
3584     int32 aExp;
3585     bits64 lastBitMask, roundBitsMask;
3586     int8 roundingMode;
3587     floatx80 z;
3588 
3589     aExp = extractFloatx80Exp( a );
3590     if ( 0x403E <= aExp ) {
3591         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3592             return propagateFloatx80NaN( a, a );
3593         }
3594         return a;
3595     }
3596     if ( aExp < 0x3FFF ) {
3597         if (    ( aExp == 0 )
3598              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3599             return a;
3600         }
3601         float_set_inexact();
3602         aSign = extractFloatx80Sign( a );
3603         switch ( float_rounding_mode() ) {
3604          case float_round_nearest_even:
3605             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3606                ) {
3607                 return
3608                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3609             }
3610             break;
3611          case float_round_down:
3612             return
3613                   aSign ?
3614                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3615                 : packFloatx80( 0, 0, 0 );
3616          case float_round_up:
3617             return
3618                   aSign ? packFloatx80( 1, 0, 0 )
3619                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3620         }
3621         return packFloatx80( aSign, 0, 0 );
3622     }
3623     lastBitMask = 1;
3624     lastBitMask <<= 0x403E - aExp;
3625     roundBitsMask = lastBitMask - 1;
3626     z = a;
3627     roundingMode = float_rounding_mode();
3628     if ( roundingMode == float_round_nearest_even ) {
3629         z.low += lastBitMask>>1;
3630         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3631     }
3632     else if ( roundingMode != float_round_to_zero ) {
3633         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3634             z.low += roundBitsMask;
3635         }
3636     }
3637     z.low &= ~ roundBitsMask;
3638     if ( z.low == 0 ) {
3639         ++z.high;
3640         z.low = LIT64( 0x8000000000000000 );
3641     }
3642     if ( z.low != a.low ) float_set_inexact();
3643     return z;
3644 
3645 }
3646 
3647 /*
3648 -------------------------------------------------------------------------------
3649 Returns the result of adding the absolute values of the extended double-
3650 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3651 negated before being returned.  `zSign' is ignored if the result is a NaN.
3652 The addition is performed according to the IEC/IEEE Standard for Binary
3653 Floating-Point Arithmetic.
3654 -------------------------------------------------------------------------------
3655 */
3656 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3657 {
3658     int32 aExp, bExp, zExp;
3659     bits64 aSig, bSig, zSig0, zSig1;
3660     int32 expDiff;
3661 
3662     aSig = extractFloatx80Frac( a );
3663     aExp = extractFloatx80Exp( a );
3664     bSig = extractFloatx80Frac( b );
3665     bExp = extractFloatx80Exp( b );
3666     expDiff = aExp - bExp;
3667     if ( 0 < expDiff ) {
3668         if ( aExp == 0x7FFF ) {
3669             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3670             return a;
3671         }
3672         if ( bExp == 0 ) --expDiff;
3673         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3674         zExp = aExp;
3675     }
3676     else if ( expDiff < 0 ) {
3677         if ( bExp == 0x7FFF ) {
3678             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3679             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3680         }
3681         if ( aExp == 0 ) ++expDiff;
3682         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3683         zExp = bExp;
3684     }
3685     else {
3686         if ( aExp == 0x7FFF ) {
3687             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3688                 return propagateFloatx80NaN( a, b );
3689             }
3690             return a;
3691         }
3692         zSig1 = 0;
3693         zSig0 = aSig + bSig;
3694         if ( aExp == 0 ) {
3695             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3696             goto roundAndPack;
3697         }
3698         zExp = aExp;
3699         goto shiftRight1;
3700     }
3701     zSig0 = aSig + bSig;
3702     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3703  shiftRight1:
3704     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3705     zSig0 |= LIT64( 0x8000000000000000 );
3706     ++zExp;
3707  roundAndPack:
3708     return
3709         roundAndPackFloatx80(
3710             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3711 
3712 }
3713 
3714 /*
3715 -------------------------------------------------------------------------------
3716 Returns the result of subtracting the absolute values of the extended
3717 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3718 difference is negated before being returned.  `zSign' is ignored if the
3719 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3720 Standard for Binary Floating-Point Arithmetic.
3721 -------------------------------------------------------------------------------
3722 */
3723 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3724 {
3725     int32 aExp, bExp, zExp;
3726     bits64 aSig, bSig, zSig0, zSig1;
3727     int32 expDiff;
3728     floatx80 z;
3729 
3730     aSig = extractFloatx80Frac( a );
3731     aExp = extractFloatx80Exp( a );
3732     bSig = extractFloatx80Frac( b );
3733     bExp = extractFloatx80Exp( b );
3734     expDiff = aExp - bExp;
3735     if ( 0 < expDiff ) goto aExpBigger;
3736     if ( expDiff < 0 ) goto bExpBigger;
3737     if ( aExp == 0x7FFF ) {
3738         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3739             return propagateFloatx80NaN( a, b );
3740         }
3741         float_raise( float_flag_invalid );
3742         z.low = floatx80_default_nan_low;
3743         z.high = floatx80_default_nan_high;
3744         return z;
3745     }
3746     if ( aExp == 0 ) {
3747         aExp = 1;
3748         bExp = 1;
3749     }
3750     zSig1 = 0;
3751     if ( bSig < aSig ) goto aBigger;
3752     if ( aSig < bSig ) goto bBigger;
3753     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3754  bExpBigger:
3755     if ( bExp == 0x7FFF ) {
3756         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3757         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3758     }
3759     if ( aExp == 0 ) ++expDiff;
3760     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3761  bBigger:
3762     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3763     zExp = bExp;
3764     zSign ^= 1;
3765     goto normalizeRoundAndPack;
3766  aExpBigger:
3767     if ( aExp == 0x7FFF ) {
3768         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3769         return a;
3770     }
3771     if ( bExp == 0 ) --expDiff;
3772     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3773  aBigger:
3774     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3775     zExp = aExp;
3776  normalizeRoundAndPack:
3777     return
3778         normalizeRoundAndPackFloatx80(
3779             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3780 
3781 }
3782 
3783 /*
3784 -------------------------------------------------------------------------------
3785 Returns the result of adding the extended double-precision floating-point
3786 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3787 Standard for Binary Floating-Point Arithmetic.
3788 -------------------------------------------------------------------------------
3789 */
3790 floatx80 floatx80_add( floatx80 a, floatx80 b )
3791 {
3792     flag aSign, bSign;
3793 
3794     aSign = extractFloatx80Sign( a );
3795     bSign = extractFloatx80Sign( b );
3796     if ( aSign == bSign ) {
3797         return addFloatx80Sigs( a, b, aSign );
3798     }
3799     else {
3800         return subFloatx80Sigs( a, b, aSign );
3801     }
3802 
3803 }
3804 
3805 /*
3806 -------------------------------------------------------------------------------
3807 Returns the result of subtracting the extended double-precision floating-
3808 point values `a' and `b'.  The operation is performed according to the
3809 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3810 -------------------------------------------------------------------------------
3811 */
3812 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3813 {
3814     flag aSign, bSign;
3815 
3816     aSign = extractFloatx80Sign( a );
3817     bSign = extractFloatx80Sign( b );
3818     if ( aSign == bSign ) {
3819         return subFloatx80Sigs( a, b, aSign );
3820     }
3821     else {
3822         return addFloatx80Sigs( a, b, aSign );
3823     }
3824 
3825 }
3826 
3827 /*
3828 -------------------------------------------------------------------------------
3829 Returns the result of multiplying the extended double-precision floating-
3830 point values `a' and `b'.  The operation is performed according to the
3831 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3832 -------------------------------------------------------------------------------
3833 */
3834 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3835 {
3836     flag aSign, bSign, zSign;
3837     int32 aExp, bExp, zExp;
3838     bits64 aSig, bSig, zSig0, zSig1;
3839     floatx80 z;
3840 
3841     aSig = extractFloatx80Frac( a );
3842     aExp = extractFloatx80Exp( a );
3843     aSign = extractFloatx80Sign( a );
3844     bSig = extractFloatx80Frac( b );
3845     bExp = extractFloatx80Exp( b );
3846     bSign = extractFloatx80Sign( b );
3847     zSign = aSign ^ bSign;
3848     if ( aExp == 0x7FFF ) {
3849         if (    (bits64) ( aSig<<1 )
3850              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3851             return propagateFloatx80NaN( a, b );
3852         }
3853         if ( ( bExp | bSig ) == 0 ) goto invalid;
3854         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3855     }
3856     if ( bExp == 0x7FFF ) {
3857         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3858         if ( ( aExp | aSig ) == 0 ) {
3859  invalid:
3860             float_raise( float_flag_invalid );
3861             z.low = floatx80_default_nan_low;
3862             z.high = floatx80_default_nan_high;
3863             return z;
3864         }
3865         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3866     }
3867     if ( aExp == 0 ) {
3868         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3869         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3870     }
3871     if ( bExp == 0 ) {
3872         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3873         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3874     }
3875     zExp = aExp + bExp - 0x3FFE;
3876     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3877     if ( 0 < (sbits64) zSig0 ) {
3878         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3879         --zExp;
3880     }
3881     return
3882         roundAndPackFloatx80(
3883             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3884 
3885 }
3886 
3887 /*
3888 -------------------------------------------------------------------------------
3889 Returns the result of dividing the extended double-precision floating-point
3890 value `a' by the corresponding value `b'.  The operation is performed
3891 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3892 -------------------------------------------------------------------------------
3893 */
3894 floatx80 floatx80_div( floatx80 a, floatx80 b )
3895 {
3896     flag aSign, bSign, zSign;
3897     int32 aExp, bExp, zExp;
3898     bits64 aSig, bSig, zSig0, zSig1;
3899     bits64 rem0, rem1, rem2, term0, term1, term2;
3900     floatx80 z;
3901 
3902     aSig = extractFloatx80Frac( a );
3903     aExp = extractFloatx80Exp( a );
3904     aSign = extractFloatx80Sign( a );
3905     bSig = extractFloatx80Frac( b );
3906     bExp = extractFloatx80Exp( b );
3907     bSign = extractFloatx80Sign( b );
3908     zSign = aSign ^ bSign;
3909     if ( aExp == 0x7FFF ) {
3910         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3911         if ( bExp == 0x7FFF ) {
3912             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3913             goto invalid;
3914         }
3915         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3916     }
3917     if ( bExp == 0x7FFF ) {
3918         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3919         return packFloatx80( zSign, 0, 0 );
3920     }
3921     if ( bExp == 0 ) {
3922         if ( bSig == 0 ) {
3923             if ( ( aExp | aSig ) == 0 ) {
3924  invalid:
3925                 float_raise( float_flag_invalid );
3926                 z.low = floatx80_default_nan_low;
3927                 z.high = floatx80_default_nan_high;
3928                 return z;
3929             }
3930             float_raise( float_flag_divbyzero );
3931             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3932         }
3933         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3934     }
3935     if ( aExp == 0 ) {
3936         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3937         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3938     }
3939     zExp = aExp - bExp + 0x3FFE;
3940     rem1 = 0;
3941     if ( bSig <= aSig ) {
3942         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3943         ++zExp;
3944     }
3945     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3946     mul64To128( bSig, zSig0, &term0, &term1 );
3947     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3948     while ( (sbits64) rem0 < 0 ) {
3949         --zSig0;
3950         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3951     }
3952     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3953     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3954         mul64To128( bSig, zSig1, &term1, &term2 );
3955         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3956         while ( (sbits64) rem1 < 0 ) {
3957             --zSig1;
3958             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3959         }
3960         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3961     }
3962     return
3963         roundAndPackFloatx80(
3964             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3965 
3966 }
3967 
3968 /*
3969 -------------------------------------------------------------------------------
3970 Returns the remainder of the extended double-precision floating-point value
3971 `a' with respect to the corresponding value `b'.  The operation is performed
3972 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3973 -------------------------------------------------------------------------------
3974 */
3975 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3976 {
3977     flag aSign, bSign, zSign;
3978     int32 aExp, bExp, expDiff;
3979     bits64 aSig0, aSig1, bSig;
3980     bits64 q, term0, term1, alternateASig0, alternateASig1;
3981     floatx80 z;
3982 
3983     aSig0 = extractFloatx80Frac( a );
3984     aExp = extractFloatx80Exp( a );
3985     aSign = extractFloatx80Sign( a );
3986     bSig = extractFloatx80Frac( b );
3987     bExp = extractFloatx80Exp( b );
3988     bSign = extractFloatx80Sign( b );
3989     if ( aExp == 0x7FFF ) {
3990         if (    (bits64) ( aSig0<<1 )
3991              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3992             return propagateFloatx80NaN( a, b );
3993         }
3994         goto invalid;
3995     }
3996     if ( bExp == 0x7FFF ) {
3997         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3998         return a;
3999     }
4000     if ( bExp == 0 ) {
4001         if ( bSig == 0 ) {
4002  invalid:
4003             float_raise( float_flag_invalid );
4004             z.low = floatx80_default_nan_low;
4005             z.high = floatx80_default_nan_high;
4006             return z;
4007         }
4008         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4009     }
4010     if ( aExp == 0 ) {
4011         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4012         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4013     }
4014     bSig |= LIT64( 0x8000000000000000 );
4015     zSign = aSign;
4016     expDiff = aExp - bExp;
4017     aSig1 = 0;
4018     if ( expDiff < 0 ) {
4019         if ( expDiff < -1 ) return a;
4020         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4021         expDiff = 0;
4022     }
4023     q = ( bSig <= aSig0 );
4024     if ( q ) aSig0 -= bSig;
4025     expDiff -= 64;
4026     while ( 0 < expDiff ) {
4027         q = estimateDiv128To64( aSig0, aSig1, bSig );
4028         q = ( 2 < q ) ? q - 2 : 0;
4029         mul64To128( bSig, q, &term0, &term1 );
4030         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4031         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4032         expDiff -= 62;
4033     }
4034     expDiff += 64;
4035     if ( 0 < expDiff ) {
4036         q = estimateDiv128To64( aSig0, aSig1, bSig );
4037         q = ( 2 < q ) ? q - 2 : 0;
4038         q >>= 64 - expDiff;
4039         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4040         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4041         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4042         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4043             ++q;
4044             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4045         }
4046     }
4047     else {
4048         term1 = 0;
4049         term0 = bSig;
4050     }
4051     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4052     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4053          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4054               && ( q & 1 ) )
4055        ) {
4056         aSig0 = alternateASig0;
4057         aSig1 = alternateASig1;
4058         zSign = ! zSign;
4059     }
4060     return
4061         normalizeRoundAndPackFloatx80(
4062             80, zSign, bExp + expDiff, aSig0, aSig1 );
4063 
4064 }
4065 
4066 /*
4067 -------------------------------------------------------------------------------
4068 Returns the square root of the extended double-precision floating-point
4069 value `a'.  The operation is performed according to the IEC/IEEE Standard
4070 for Binary Floating-Point Arithmetic.
4071 -------------------------------------------------------------------------------
4072 */
4073 floatx80 floatx80_sqrt( floatx80 a )
4074 {
4075     flag aSign;
4076     int32 aExp, zExp;
4077     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4078     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4079     floatx80 z;
4080 
4081     aSig0 = extractFloatx80Frac( a );
4082     aExp = extractFloatx80Exp( a );
4083     aSign = extractFloatx80Sign( a );
4084     if ( aExp == 0x7FFF ) {
4085         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4086         if ( ! aSign ) return a;
4087         goto invalid;
4088     }
4089     if ( aSign ) {
4090         if ( ( aExp | aSig0 ) == 0 ) return a;
4091  invalid:
4092         float_raise( float_flag_invalid );
4093         z.low = floatx80_default_nan_low;
4094         z.high = floatx80_default_nan_high;
4095         return z;
4096     }
4097     if ( aExp == 0 ) {
4098         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4099         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4100     }
4101     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4102     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4103     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4104     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4105     doubleZSig0 = zSig0<<1;
4106     mul64To128( zSig0, zSig0, &term0, &term1 );
4107     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4108     while ( (sbits64) rem0 < 0 ) {
4109         --zSig0;
4110         doubleZSig0 -= 2;
4111         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4112     }
4113     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4114     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4115         if ( zSig1 == 0 ) zSig1 = 1;
4116         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4117         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4118         mul64To128( zSig1, zSig1, &term2, &term3 );
4119         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4120         while ( (sbits64) rem1 < 0 ) {
4121             --zSig1;
4122             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4123             term3 |= 1;
4124             term2 |= doubleZSig0;
4125             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4126         }
4127         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4128     }
4129     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4130     zSig0 |= doubleZSig0;
4131     return
4132         roundAndPackFloatx80(
4133             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4134 
4135 }
4136 
4137 /*
4138 -------------------------------------------------------------------------------
4139 Returns 1 if the extended double-precision floating-point value `a' is
4140 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4141 performed according to the IEC/IEEE Standard for Binary Floating-Point
4142 Arithmetic.
4143 -------------------------------------------------------------------------------
4144 */
4145 flag floatx80_eq( floatx80 a, floatx80 b )
4146 {
4147 
4148     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4149               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4150          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4151               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4152        ) {
4153         if (    floatx80_is_signaling_nan( a )
4154              || floatx80_is_signaling_nan( b ) ) {
4155             float_raise( float_flag_invalid );
4156         }
4157         return 0;
4158     }
4159     return
4160            ( a.low == b.low )
4161         && (    ( a.high == b.high )
4162              || (    ( a.low == 0 )
4163                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4164            );
4165 
4166 }
4167 
4168 /*
4169 -------------------------------------------------------------------------------
4170 Returns 1 if the extended double-precision floating-point value `a' is
4171 less than or equal to the corresponding value `b', and 0 otherwise.  The
4172 comparison is performed according to the IEC/IEEE Standard for Binary
4173 Floating-Point Arithmetic.
4174 -------------------------------------------------------------------------------
4175 */
4176 flag floatx80_le( floatx80 a, floatx80 b )
4177 {
4178     flag aSign, bSign;
4179 
4180     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4181               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4182          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4183               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4184        ) {
4185         float_raise( float_flag_invalid );
4186         return 0;
4187     }
4188     aSign = extractFloatx80Sign( a );
4189     bSign = extractFloatx80Sign( b );
4190     if ( aSign != bSign ) {
4191         return
4192                aSign
4193             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4194                  == 0 );
4195     }
4196     return
4197           aSign ? le128( b.high, b.low, a.high, a.low )
4198         : le128( a.high, a.low, b.high, b.low );
4199 
4200 }
4201 
4202 /*
4203 -------------------------------------------------------------------------------
4204 Returns 1 if the extended double-precision floating-point value `a' is
4205 less than the corresponding value `b', and 0 otherwise.  The comparison
4206 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4207 Arithmetic.
4208 -------------------------------------------------------------------------------
4209 */
4210 flag floatx80_lt( floatx80 a, floatx80 b )
4211 {
4212     flag aSign, bSign;
4213 
4214     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4215               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4216          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4217               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4218        ) {
4219         float_raise( float_flag_invalid );
4220         return 0;
4221     }
4222     aSign = extractFloatx80Sign( a );
4223     bSign = extractFloatx80Sign( b );
4224     if ( aSign != bSign ) {
4225         return
4226                aSign
4227             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4228                  != 0 );
4229     }
4230     return
4231           aSign ? lt128( b.high, b.low, a.high, a.low )
4232         : lt128( a.high, a.low, b.high, b.low );
4233 
4234 }
4235 
4236 /*
4237 -------------------------------------------------------------------------------
4238 Returns 1 if the extended double-precision floating-point value `a' is equal
4239 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4240 raised if either operand is a NaN.  Otherwise, the comparison is performed
4241 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4242 -------------------------------------------------------------------------------
4243 */
4244 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4245 {
4246 
4247     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4248               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4249          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4250               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4251        ) {
4252         float_raise( float_flag_invalid );
4253         return 0;
4254     }
4255     return
4256            ( a.low == b.low )
4257         && (    ( a.high == b.high )
4258              || (    ( a.low == 0 )
4259                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4260            );
4261 
4262 }
4263 
4264 /*
4265 -------------------------------------------------------------------------------
4266 Returns 1 if the extended double-precision floating-point value `a' is less
4267 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4268 do not cause an exception.  Otherwise, the comparison is performed according
4269 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4270 -------------------------------------------------------------------------------
4271 */
4272 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4273 {
4274     flag aSign, bSign;
4275 
4276     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4277               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4278          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4279               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4280        ) {
4281         if (    floatx80_is_signaling_nan( a )
4282              || floatx80_is_signaling_nan( b ) ) {
4283             float_raise( float_flag_invalid );
4284         }
4285         return 0;
4286     }
4287     aSign = extractFloatx80Sign( a );
4288     bSign = extractFloatx80Sign( b );
4289     if ( aSign != bSign ) {
4290         return
4291                aSign
4292             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4293                  == 0 );
4294     }
4295     return
4296           aSign ? le128( b.high, b.low, a.high, a.low )
4297         : le128( a.high, a.low, b.high, b.low );
4298 
4299 }
4300 
4301 /*
4302 -------------------------------------------------------------------------------
4303 Returns 1 if the extended double-precision floating-point value `a' is less
4304 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4305 an exception.  Otherwise, the comparison is performed according to the
4306 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4307 -------------------------------------------------------------------------------
4308 */
4309 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4310 {
4311     flag aSign, bSign;
4312 
4313     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4314               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4315          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4316               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4317        ) {
4318         if (    floatx80_is_signaling_nan( a )
4319              || floatx80_is_signaling_nan( b ) ) {
4320             float_raise( float_flag_invalid );
4321         }
4322         return 0;
4323     }
4324     aSign = extractFloatx80Sign( a );
4325     bSign = extractFloatx80Sign( b );
4326     if ( aSign != bSign ) {
4327         return
4328                aSign
4329             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4330                  != 0 );
4331     }
4332     return
4333           aSign ? lt128( b.high, b.low, a.high, a.low )
4334         : lt128( a.high, a.low, b.high, b.low );
4335 
4336 }
4337 
4338 #endif
4339 
4340 #ifdef FLOAT128
4341 
4342 /*
4343 -------------------------------------------------------------------------------
4344 Returns the result of converting the quadruple-precision floating-point
4345 value `a' to the 32-bit two's complement integer format.  The conversion
4346 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4347 Arithmetic---which means in particular that the conversion is rounded
4348 according to the current rounding mode.  If `a' is a NaN, the largest
4349 positive integer is returned.  Otherwise, if the conversion overflows, the
4350 largest integer with the same sign as `a' is returned.
4351 -------------------------------------------------------------------------------
4352 */
4353 int32 float128_to_int32( float128 a )
4354 {
4355     flag aSign;
4356     int32 aExp, shiftCount;
4357     bits64 aSig0, aSig1;
4358 
4359     aSig1 = extractFloat128Frac1( a );
4360     aSig0 = extractFloat128Frac0( a );
4361     aExp = extractFloat128Exp( a );
4362     aSign = extractFloat128Sign( a );
4363     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4364     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4365     aSig0 |= ( aSig1 != 0 );
4366     shiftCount = 0x4028 - aExp;
4367     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4368     return roundAndPackInt32( aSign, aSig0 );
4369 
4370 }
4371 
4372 /*
4373 -------------------------------------------------------------------------------
4374 Returns the result of converting the quadruple-precision floating-point
4375 value `a' to the 32-bit two's complement integer format.  The conversion
4376 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4377 Arithmetic, except that the conversion is always rounded toward zero.  If
4378 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4379 conversion overflows, the largest integer with the same sign as `a' is
4380 returned.
4381 -------------------------------------------------------------------------------
4382 */
4383 int32 float128_to_int32_round_to_zero( float128 a )
4384 {
4385     flag aSign;
4386     int32 aExp, shiftCount;
4387     bits64 aSig0, aSig1, savedASig;
4388     int32 z;
4389 
4390     aSig1 = extractFloat128Frac1( a );
4391     aSig0 = extractFloat128Frac0( a );
4392     aExp = extractFloat128Exp( a );
4393     aSign = extractFloat128Sign( a );
4394     aSig0 |= ( aSig1 != 0 );
4395     if ( 0x401E < aExp ) {
4396         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4397         goto invalid;
4398     }
4399     else if ( aExp < 0x3FFF ) {
4400         if ( aExp || aSig0 ) float_set_inexact();
4401         return 0;
4402     }
4403     aSig0 |= LIT64( 0x0001000000000000 );
4404     shiftCount = 0x402F - aExp;
4405     savedASig = aSig0;
4406     aSig0 >>= shiftCount;
4407     z = aSig0;
4408     if ( aSign ) z = - z;
4409     if ( ( z < 0 ) ^ aSign ) {
4410  invalid:
4411         float_raise( float_flag_invalid );
4412         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4413     }
4414     if ( ( aSig0<<shiftCount ) != savedASig ) {
4415         float_set_inexact();
4416     }
4417     return z;
4418 
4419 }
4420 
4421 /*
4422 -------------------------------------------------------------------------------
4423 Returns the result of converting the quadruple-precision floating-point
4424 value `a' to the 64-bit two's complement integer format.  The conversion
4425 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4426 Arithmetic---which means in particular that the conversion is rounded
4427 according to the current rounding mode.  If `a' is a NaN, the largest
4428 positive integer is returned.  Otherwise, if the conversion overflows, the
4429 largest integer with the same sign as `a' is returned.
4430 -------------------------------------------------------------------------------
4431 */
4432 int64 float128_to_int64( float128 a )
4433 {
4434     flag aSign;
4435     int32 aExp, shiftCount;
4436     bits64 aSig0, aSig1;
4437 
4438     aSig1 = extractFloat128Frac1( a );
4439     aSig0 = extractFloat128Frac0( a );
4440     aExp = extractFloat128Exp( a );
4441     aSign = extractFloat128Sign( a );
4442     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4443     shiftCount = 0x402F - aExp;
4444     if ( shiftCount <= 0 ) {
4445         if ( 0x403E < aExp ) {
4446             float_raise( float_flag_invalid );
4447             if (    ! aSign
4448                  || (    ( aExp == 0x7FFF )
4449                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4450                     )
4451                ) {
4452                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4453             }
4454             return (sbits64) LIT64( 0x8000000000000000 );
4455         }
4456         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4457     }
4458     else {
4459         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4460     }
4461     return roundAndPackInt64( aSign, aSig0, aSig1 );
4462 
4463 }
4464 
4465 /*
4466 -------------------------------------------------------------------------------
4467 Returns the result of converting the quadruple-precision floating-point
4468 value `a' to the 64-bit two's complement integer format.  The conversion
4469 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4470 Arithmetic, except that the conversion is always rounded toward zero.
4471 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4472 the conversion overflows, the largest integer with the same sign as `a' is
4473 returned.
4474 -------------------------------------------------------------------------------
4475 */
4476 int64 float128_to_int64_round_to_zero( float128 a )
4477 {
4478     flag aSign;
4479     int32 aExp, shiftCount;
4480     bits64 aSig0, aSig1;
4481     int64 z;
4482 
4483     aSig1 = extractFloat128Frac1( a );
4484     aSig0 = extractFloat128Frac0( a );
4485     aExp = extractFloat128Exp( a );
4486     aSign = extractFloat128Sign( a );
4487     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4488     shiftCount = aExp - 0x402F;
4489     if ( 0 < shiftCount ) {
4490         if ( 0x403E <= aExp ) {
4491             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4492             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4493                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4494                 if ( aSig1 ) float_set_inexact();
4495             }
4496             else {
4497                 float_raise( float_flag_invalid );
4498                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4499                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4500                 }
4501             }
4502             return (sbits64) LIT64( 0x8000000000000000 );
4503         }
4504         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4505         if ( (bits64) ( aSig1<<shiftCount ) ) {
4506             float_set_inexact();
4507         }
4508     }
4509     else {
4510         if ( aExp < 0x3FFF ) {
4511             if ( aExp | aSig0 | aSig1 ) {
4512                 float_set_inexact();
4513             }
4514             return 0;
4515         }
4516         z = aSig0>>( - shiftCount );
4517         if (    aSig1
4518              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4519             float_set_inexact();
4520         }
4521     }
4522     if ( aSign ) z = - z;
4523     return z;
4524 
4525 }
4526 
4527 /*
4528 -------------------------------------------------------------------------------
4529 Returns the result of converting the quadruple-precision floating-point
4530 value `a' to the single-precision floating-point format.  The conversion
4531 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4532 Arithmetic.
4533 -------------------------------------------------------------------------------
4534 */
4535 float32 float128_to_float32( float128 a )
4536 {
4537     flag aSign;
4538     int32 aExp;
4539     bits64 aSig0, aSig1;
4540     bits32 zSig;
4541 
4542     aSig1 = extractFloat128Frac1( a );
4543     aSig0 = extractFloat128Frac0( a );
4544     aExp = extractFloat128Exp( a );
4545     aSign = extractFloat128Sign( a );
4546     if ( aExp == 0x7FFF ) {
4547         if ( aSig0 | aSig1 ) {
4548             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4549         }
4550         return packFloat32( aSign, 0xFF, 0 );
4551     }
4552     aSig0 |= ( aSig1 != 0 );
4553     shift64RightJamming( aSig0, 18, &aSig0 );
4554     zSig = aSig0;
4555     if ( aExp || zSig ) {
4556         zSig |= 0x40000000;
4557         aExp -= 0x3F81;
4558     }
4559     return roundAndPackFloat32( aSign, aExp, zSig );
4560 
4561 }
4562 
4563 /*
4564 -------------------------------------------------------------------------------
4565 Returns the result of converting the quadruple-precision floating-point
4566 value `a' to the double-precision floating-point format.  The conversion
4567 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4568 Arithmetic.
4569 -------------------------------------------------------------------------------
4570 */
4571 float64 float128_to_float64( float128 a )
4572 {
4573     flag aSign;
4574     int32 aExp;
4575     bits64 aSig0, aSig1;
4576 
4577     aSig1 = extractFloat128Frac1( a );
4578     aSig0 = extractFloat128Frac0( a );
4579     aExp = extractFloat128Exp( a );
4580     aSign = extractFloat128Sign( a );
4581     if ( aExp == 0x7FFF ) {
4582         if ( aSig0 | aSig1 ) {
4583             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4584         }
4585         return packFloat64( aSign, 0x7FF, 0 );
4586     }
4587     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4588     aSig0 |= ( aSig1 != 0 );
4589     if ( aExp || aSig0 ) {
4590         aSig0 |= LIT64( 0x4000000000000000 );
4591         aExp -= 0x3C01;
4592     }
4593     return roundAndPackFloat64( aSign, aExp, aSig0 );
4594 
4595 }
4596 
4597 #ifdef FLOATX80
4598 
4599 /*
4600 -------------------------------------------------------------------------------
4601 Returns the result of converting the quadruple-precision floating-point
4602 value `a' to the extended double-precision floating-point format.  The
4603 conversion is performed according to the IEC/IEEE Standard for Binary
4604 Floating-Point Arithmetic.
4605 -------------------------------------------------------------------------------
4606 */
4607 floatx80 float128_to_floatx80( float128 a )
4608 {
4609     flag aSign;
4610     int32 aExp;
4611     bits64 aSig0, aSig1;
4612 
4613     aSig1 = extractFloat128Frac1( a );
4614     aSig0 = extractFloat128Frac0( a );
4615     aExp = extractFloat128Exp( a );
4616     aSign = extractFloat128Sign( a );
4617     if ( aExp == 0x7FFF ) {
4618         if ( aSig0 | aSig1 ) {
4619             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4620         }
4621         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4622     }
4623     if ( aExp == 0 ) {
4624         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4625         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4626     }
4627     else {
4628         aSig0 |= LIT64( 0x0001000000000000 );
4629     }
4630     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4631     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4632 
4633 }
4634 
4635 #endif
4636 
4637 /*
4638 -------------------------------------------------------------------------------
4639 Rounds the quadruple-precision floating-point value `a' to an integer, and
4640 returns the result as a quadruple-precision floating-point value.  The
4641 operation is performed according to the IEC/IEEE Standard for Binary
4642 Floating-Point Arithmetic.
4643 -------------------------------------------------------------------------------
4644 */
4645 float128 float128_round_to_int( float128 a )
4646 {
4647     flag aSign;
4648     int32 aExp;
4649     bits64 lastBitMask, roundBitsMask;
4650     int8 roundingMode;
4651     float128 z;
4652 
4653     aExp = extractFloat128Exp( a );
4654     if ( 0x402F <= aExp ) {
4655         if ( 0x406F <= aExp ) {
4656             if (    ( aExp == 0x7FFF )
4657                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4658                ) {
4659                 return propagateFloat128NaN( a, a );
4660             }
4661             return a;
4662         }
4663         lastBitMask = 1;
4664         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4665         roundBitsMask = lastBitMask - 1;
4666         z = a;
4667         roundingMode = float_rounding_mode();
4668         if ( roundingMode == float_round_nearest_even ) {
4669             if ( lastBitMask ) {
4670                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4671                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4672             }
4673             else {
4674                 if ( (sbits64) z.low < 0 ) {
4675                     ++z.high;
4676                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4677                 }
4678             }
4679         }
4680         else if ( roundingMode != float_round_to_zero ) {
4681             if (   extractFloat128Sign( z )
4682                  ^ ( roundingMode == float_round_up ) ) {
4683                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4684             }
4685         }
4686         z.low &= ~ roundBitsMask;
4687     }
4688     else {
4689         if ( aExp < 0x3FFF ) {
4690             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4691             float_set_inexact();
4692             aSign = extractFloat128Sign( a );
4693             switch ( float_rounding_mode() ) {
4694              case float_round_nearest_even:
4695                 if (    ( aExp == 0x3FFE )
4696                      && (   extractFloat128Frac0( a )
4697                           | extractFloat128Frac1( a ) )
4698                    ) {
4699                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4700                 }
4701                 break;
4702              case float_round_down:
4703                 return
4704                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4705                     : packFloat128( 0, 0, 0, 0 );
4706              case float_round_up:
4707                 return
4708                       aSign ? packFloat128( 1, 0, 0, 0 )
4709                     : packFloat128( 0, 0x3FFF, 0, 0 );
4710             }
4711             return packFloat128( aSign, 0, 0, 0 );
4712         }
4713         lastBitMask = 1;
4714         lastBitMask <<= 0x402F - aExp;
4715         roundBitsMask = lastBitMask - 1;
4716         z.low = 0;
4717         z.high = a.high;
4718         roundingMode = float_rounding_mode();
4719         if ( roundingMode == float_round_nearest_even ) {
4720             z.high += lastBitMask>>1;
4721             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4722                 z.high &= ~ lastBitMask;
4723             }
4724         }
4725         else if ( roundingMode != float_round_to_zero ) {
4726             if (   extractFloat128Sign( z )
4727                  ^ ( roundingMode == float_round_up ) ) {
4728                 z.high |= ( a.low != 0 );
4729                 z.high += roundBitsMask;
4730             }
4731         }
4732         z.high &= ~ roundBitsMask;
4733     }
4734     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4735         float_set_inexact();
4736     }
4737     return z;
4738 
4739 }
4740 
4741 /*
4742 -------------------------------------------------------------------------------
4743 Returns the result of adding the absolute values of the quadruple-precision
4744 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4745 before being returned.  `zSign' is ignored if the result is a NaN.
4746 The addition is performed according to the IEC/IEEE Standard for Binary
4747 Floating-Point Arithmetic.
4748 -------------------------------------------------------------------------------
4749 */
4750 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4751 {
4752     int32 aExp, bExp, zExp;
4753     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4754     int32 expDiff;
4755 
4756     aSig1 = extractFloat128Frac1( a );
4757     aSig0 = extractFloat128Frac0( a );
4758     aExp = extractFloat128Exp( a );
4759     bSig1 = extractFloat128Frac1( b );
4760     bSig0 = extractFloat128Frac0( b );
4761     bExp = extractFloat128Exp( b );
4762     expDiff = aExp - bExp;
4763     if ( 0 < expDiff ) {
4764         if ( aExp == 0x7FFF ) {
4765             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4766             return a;
4767         }
4768         if ( bExp == 0 ) {
4769             --expDiff;
4770         }
4771         else {
4772             bSig0 |= LIT64( 0x0001000000000000 );
4773         }
4774         shift128ExtraRightJamming(
4775             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4776         zExp = aExp;
4777     }
4778     else if ( expDiff < 0 ) {
4779         if ( bExp == 0x7FFF ) {
4780             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4781             return packFloat128( zSign, 0x7FFF, 0, 0 );
4782         }
4783         if ( aExp == 0 ) {
4784             ++expDiff;
4785         }
4786         else {
4787             aSig0 |= LIT64( 0x0001000000000000 );
4788         }
4789         shift128ExtraRightJamming(
4790             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4791         zExp = bExp;
4792     }
4793     else {
4794         if ( aExp == 0x7FFF ) {
4795             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4796                 return propagateFloat128NaN( a, b );
4797             }
4798             return a;
4799         }
4800         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4801         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4802         zSig2 = 0;
4803         zSig0 |= LIT64( 0x0002000000000000 );
4804         zExp = aExp;
4805         goto shiftRight1;
4806     }
4807     aSig0 |= LIT64( 0x0001000000000000 );
4808     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4809     --zExp;
4810     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4811     ++zExp;
4812  shiftRight1:
4813     shift128ExtraRightJamming(
4814         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4815  roundAndPack:
4816     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4817 
4818 }
4819 
4820 /*
4821 -------------------------------------------------------------------------------
4822 Returns the result of subtracting the absolute values of the quadruple-
4823 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4824 difference is negated before being returned.  `zSign' is ignored if the
4825 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4826 Standard for Binary Floating-Point Arithmetic.
4827 -------------------------------------------------------------------------------
4828 */
4829 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4830 {
4831     int32 aExp, bExp, zExp;
4832     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4833     int32 expDiff;
4834     float128 z;
4835 
4836     aSig1 = extractFloat128Frac1( a );
4837     aSig0 = extractFloat128Frac0( a );
4838     aExp = extractFloat128Exp( a );
4839     bSig1 = extractFloat128Frac1( b );
4840     bSig0 = extractFloat128Frac0( b );
4841     bExp = extractFloat128Exp( b );
4842     expDiff = aExp - bExp;
4843     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4844     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4845     if ( 0 < expDiff ) goto aExpBigger;
4846     if ( expDiff < 0 ) goto bExpBigger;
4847     if ( aExp == 0x7FFF ) {
4848         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4849             return propagateFloat128NaN( a, b );
4850         }
4851         float_raise( float_flag_invalid );
4852         z.low = float128_default_nan_low;
4853         z.high = float128_default_nan_high;
4854         return z;
4855     }
4856     if ( aExp == 0 ) {
4857         aExp = 1;
4858         bExp = 1;
4859     }
4860     if ( bSig0 < aSig0 ) goto aBigger;
4861     if ( aSig0 < bSig0 ) goto bBigger;
4862     if ( bSig1 < aSig1 ) goto aBigger;
4863     if ( aSig1 < bSig1 ) goto bBigger;
4864     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4865  bExpBigger:
4866     if ( bExp == 0x7FFF ) {
4867         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4868         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4869     }
4870     if ( aExp == 0 ) {
4871         ++expDiff;
4872     }
4873     else {
4874         aSig0 |= LIT64( 0x4000000000000000 );
4875     }
4876     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4877     bSig0 |= LIT64( 0x4000000000000000 );
4878  bBigger:
4879     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4880     zExp = bExp;
4881     zSign ^= 1;
4882     goto normalizeRoundAndPack;
4883  aExpBigger:
4884     if ( aExp == 0x7FFF ) {
4885         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4886         return a;
4887     }
4888     if ( bExp == 0 ) {
4889         --expDiff;
4890     }
4891     else {
4892         bSig0 |= LIT64( 0x4000000000000000 );
4893     }
4894     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4895     aSig0 |= LIT64( 0x4000000000000000 );
4896  aBigger:
4897     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4898     zExp = aExp;
4899  normalizeRoundAndPack:
4900     --zExp;
4901     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4902 
4903 }
4904 
4905 /*
4906 -------------------------------------------------------------------------------
4907 Returns the result of adding the quadruple-precision floating-point values
4908 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4909 for Binary Floating-Point Arithmetic.
4910 -------------------------------------------------------------------------------
4911 */
4912 float128 float128_add( float128 a, float128 b )
4913 {
4914     flag aSign, bSign;
4915 
4916     aSign = extractFloat128Sign( a );
4917     bSign = extractFloat128Sign( b );
4918     if ( aSign == bSign ) {
4919         return addFloat128Sigs( a, b, aSign );
4920     }
4921     else {
4922         return subFloat128Sigs( a, b, aSign );
4923     }
4924 
4925 }
4926 
4927 /*
4928 -------------------------------------------------------------------------------
4929 Returns the result of subtracting the quadruple-precision floating-point
4930 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4931 Standard for Binary Floating-Point Arithmetic.
4932 -------------------------------------------------------------------------------
4933 */
4934 float128 float128_sub( float128 a, float128 b )
4935 {
4936     flag aSign, bSign;
4937 
4938     aSign = extractFloat128Sign( a );
4939     bSign = extractFloat128Sign( b );
4940     if ( aSign == bSign ) {
4941         return subFloat128Sigs( a, b, aSign );
4942     }
4943     else {
4944         return addFloat128Sigs( a, b, aSign );
4945     }
4946 
4947 }
4948 
4949 /*
4950 -------------------------------------------------------------------------------
4951 Returns the result of multiplying the quadruple-precision floating-point
4952 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4953 Standard for Binary Floating-Point Arithmetic.
4954 -------------------------------------------------------------------------------
4955 */
4956 float128 float128_mul( float128 a, float128 b )
4957 {
4958     flag aSign, bSign, zSign;
4959     int32 aExp, bExp, zExp;
4960     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4961     float128 z;
4962 
4963     aSig1 = extractFloat128Frac1( a );
4964     aSig0 = extractFloat128Frac0( a );
4965     aExp = extractFloat128Exp( a );
4966     aSign = extractFloat128Sign( a );
4967     bSig1 = extractFloat128Frac1( b );
4968     bSig0 = extractFloat128Frac0( b );
4969     bExp = extractFloat128Exp( b );
4970     bSign = extractFloat128Sign( b );
4971     zSign = aSign ^ bSign;
4972     if ( aExp == 0x7FFF ) {
4973         if (    ( aSig0 | aSig1 )
4974              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4975             return propagateFloat128NaN( a, b );
4976         }
4977         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4978         return packFloat128( zSign, 0x7FFF, 0, 0 );
4979     }
4980     if ( bExp == 0x7FFF ) {
4981         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4982         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4983  invalid:
4984             float_raise( float_flag_invalid );
4985             z.low = float128_default_nan_low;
4986             z.high = float128_default_nan_high;
4987             return z;
4988         }
4989         return packFloat128( zSign, 0x7FFF, 0, 0 );
4990     }
4991     if ( aExp == 0 ) {
4992         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4993         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4994     }
4995     if ( bExp == 0 ) {
4996         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4997         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4998     }
4999     zExp = aExp + bExp - 0x4000;
5000     aSig0 |= LIT64( 0x0001000000000000 );
5001     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5002     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5003     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5004     zSig2 |= ( zSig3 != 0 );
5005     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5006         shift128ExtraRightJamming(
5007             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5008         ++zExp;
5009     }
5010     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5011 
5012 }
5013 
5014 /*
5015 -------------------------------------------------------------------------------
5016 Returns the result of dividing the quadruple-precision floating-point value
5017 `a' by the corresponding value `b'.  The operation is performed according to
5018 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5019 -------------------------------------------------------------------------------
5020 */
5021 float128 float128_div( float128 a, float128 b )
5022 {
5023     flag aSign, bSign, zSign;
5024     int32 aExp, bExp, zExp;
5025     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5026     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5027     float128 z;
5028 
5029     aSig1 = extractFloat128Frac1( a );
5030     aSig0 = extractFloat128Frac0( a );
5031     aExp = extractFloat128Exp( a );
5032     aSign = extractFloat128Sign( a );
5033     bSig1 = extractFloat128Frac1( b );
5034     bSig0 = extractFloat128Frac0( b );
5035     bExp = extractFloat128Exp( b );
5036     bSign = extractFloat128Sign( b );
5037     zSign = aSign ^ bSign;
5038     if ( aExp == 0x7FFF ) {
5039         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5040         if ( bExp == 0x7FFF ) {
5041             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5042             goto invalid;
5043         }
5044         return packFloat128( zSign, 0x7FFF, 0, 0 );
5045     }
5046     if ( bExp == 0x7FFF ) {
5047         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5048         return packFloat128( zSign, 0, 0, 0 );
5049     }
5050     if ( bExp == 0 ) {
5051         if ( ( bSig0 | bSig1 ) == 0 ) {
5052             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5053  invalid:
5054                 float_raise( float_flag_invalid );
5055                 z.low = float128_default_nan_low;
5056                 z.high = float128_default_nan_high;
5057                 return z;
5058             }
5059             float_raise( float_flag_divbyzero );
5060             return packFloat128( zSign, 0x7FFF, 0, 0 );
5061         }
5062         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5063     }
5064     if ( aExp == 0 ) {
5065         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5066         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5067     }
5068     zExp = aExp - bExp + 0x3FFD;
5069     shortShift128Left(
5070         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5071     shortShift128Left(
5072         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5073     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5074         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5075         ++zExp;
5076     }
5077     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5078     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5079     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5080     while ( (sbits64) rem0 < 0 ) {
5081         --zSig0;
5082         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5083     }
5084     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5085     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5086         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5087         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5088         while ( (sbits64) rem1 < 0 ) {
5089             --zSig1;
5090             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5091         }
5092         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5093     }
5094     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5095     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5096 
5097 }
5098 
5099 /*
5100 -------------------------------------------------------------------------------
5101 Returns the remainder of the quadruple-precision floating-point value `a'
5102 with respect to the corresponding value `b'.  The operation is performed
5103 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5104 -------------------------------------------------------------------------------
5105 */
5106 float128 float128_rem( float128 a, float128 b )
5107 {
5108     flag aSign, bSign, zSign;
5109     int32 aExp, bExp, expDiff;
5110     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5111     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5112     sbits64 sigMean0;
5113     float128 z;
5114 
5115     aSig1 = extractFloat128Frac1( a );
5116     aSig0 = extractFloat128Frac0( a );
5117     aExp = extractFloat128Exp( a );
5118     aSign = extractFloat128Sign( a );
5119     bSig1 = extractFloat128Frac1( b );
5120     bSig0 = extractFloat128Frac0( b );
5121     bExp = extractFloat128Exp( b );
5122     bSign = extractFloat128Sign( b );
5123     if ( aExp == 0x7FFF ) {
5124         if (    ( aSig0 | aSig1 )
5125              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5126             return propagateFloat128NaN( a, b );
5127         }
5128         goto invalid;
5129     }
5130     if ( bExp == 0x7FFF ) {
5131         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5132         return a;
5133     }
5134     if ( bExp == 0 ) {
5135         if ( ( bSig0 | bSig1 ) == 0 ) {
5136  invalid:
5137             float_raise( float_flag_invalid );
5138             z.low = float128_default_nan_low;
5139             z.high = float128_default_nan_high;
5140             return z;
5141         }
5142         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5143     }
5144     if ( aExp == 0 ) {
5145         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5146         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5147     }
5148     expDiff = aExp - bExp;
5149     if ( expDiff < -1 ) return a;
5150     shortShift128Left(
5151         aSig0 | LIT64( 0x0001000000000000 ),
5152         aSig1,
5153         15 - ( expDiff < 0 ),
5154         &aSig0,
5155         &aSig1
5156     );
5157     shortShift128Left(
5158         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5159     q = le128( bSig0, bSig1, aSig0, aSig1 );
5160     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5161     expDiff -= 64;
5162     while ( 0 < expDiff ) {
5163         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5164         q = ( 4 < q ) ? q - 4 : 0;
5165         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5166         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5167         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5168         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5169         expDiff -= 61;
5170     }
5171     if ( -64 < expDiff ) {
5172         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5173         q = ( 4 < q ) ? q - 4 : 0;
5174         q >>= - expDiff;
5175         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5176         expDiff += 52;
5177         if ( expDiff < 0 ) {
5178             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5179         }
5180         else {
5181             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5182         }
5183         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5184         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5185     }
5186     else {
5187         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5188         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5189     }
5190     do {
5191         alternateASig0 = aSig0;
5192         alternateASig1 = aSig1;
5193         ++q;
5194         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5195     } while ( 0 <= (sbits64) aSig0 );
5196     add128(
5197         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5198     if (    ( sigMean0 < 0 )
5199          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5200         aSig0 = alternateASig0;
5201         aSig1 = alternateASig1;
5202     }
5203     zSign = ( (sbits64) aSig0 < 0 );
5204     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5205     return
5206         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5207 
5208 }
5209 
5210 /*
5211 -------------------------------------------------------------------------------
5212 Returns the square root of the quadruple-precision floating-point value `a'.
5213 The operation is performed according to the IEC/IEEE Standard for Binary
5214 Floating-Point Arithmetic.
5215 -------------------------------------------------------------------------------
5216 */
5217 float128 float128_sqrt( float128 a )
5218 {
5219     flag aSign;
5220     int32 aExp, zExp;
5221     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5222     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5223     float128 z;
5224 
5225     aSig1 = extractFloat128Frac1( a );
5226     aSig0 = extractFloat128Frac0( a );
5227     aExp = extractFloat128Exp( a );
5228     aSign = extractFloat128Sign( a );
5229     if ( aExp == 0x7FFF ) {
5230         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5231         if ( ! aSign ) return a;
5232         goto invalid;
5233     }
5234     if ( aSign ) {
5235         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5236  invalid:
5237         float_raise( float_flag_invalid );
5238         z.low = float128_default_nan_low;
5239         z.high = float128_default_nan_high;
5240         return z;
5241     }
5242     if ( aExp == 0 ) {
5243         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5244         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5245     }
5246     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5247     aSig0 |= LIT64( 0x0001000000000000 );
5248     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5249     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5250     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5251     doubleZSig0 = zSig0<<1;
5252     mul64To128( zSig0, zSig0, &term0, &term1 );
5253     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5254     while ( (sbits64) rem0 < 0 ) {
5255         --zSig0;
5256         doubleZSig0 -= 2;
5257         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5258     }
5259     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5260     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5261         if ( zSig1 == 0 ) zSig1 = 1;
5262         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5263         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5264         mul64To128( zSig1, zSig1, &term2, &term3 );
5265         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5266         while ( (sbits64) rem1 < 0 ) {
5267             --zSig1;
5268             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5269             term3 |= 1;
5270             term2 |= doubleZSig0;
5271             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5272         }
5273         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5274     }
5275     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5276     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5277 
5278 }
5279 
5280 /*
5281 -------------------------------------------------------------------------------
5282 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5283 the corresponding value `b', and 0 otherwise.  The comparison is performed
5284 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5285 -------------------------------------------------------------------------------
5286 */
5287 flag float128_eq( float128 a, float128 b )
5288 {
5289 
5290     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5291               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5292          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5293               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5294        ) {
5295         if (    float128_is_signaling_nan( a )
5296              || float128_is_signaling_nan( b ) ) {
5297             float_raise( float_flag_invalid );
5298         }
5299         return 0;
5300     }
5301     return
5302            ( a.low == b.low )
5303         && (    ( a.high == b.high )
5304              || (    ( a.low == 0 )
5305                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5306            );
5307 
5308 }
5309 
5310 /*
5311 -------------------------------------------------------------------------------
5312 Returns 1 if the quadruple-precision floating-point value `a' is less than
5313 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5314 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5315 Arithmetic.
5316 -------------------------------------------------------------------------------
5317 */
5318 flag float128_le( float128 a, float128 b )
5319 {
5320     flag aSign, bSign;
5321 
5322     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5323               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5324          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5325               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5326        ) {
5327         float_raise( float_flag_invalid );
5328         return 0;
5329     }
5330     aSign = extractFloat128Sign( a );
5331     bSign = extractFloat128Sign( b );
5332     if ( aSign != bSign ) {
5333         return
5334                aSign
5335             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5336                  == 0 );
5337     }
5338     return
5339           aSign ? le128( b.high, b.low, a.high, a.low )
5340         : le128( a.high, a.low, b.high, b.low );
5341 
5342 }
5343 
5344 /*
5345 -------------------------------------------------------------------------------
5346 Returns 1 if the quadruple-precision floating-point value `a' is less than
5347 the corresponding value `b', and 0 otherwise.  The comparison is performed
5348 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5349 -------------------------------------------------------------------------------
5350 */
5351 flag float128_lt( float128 a, float128 b )
5352 {
5353     flag aSign, bSign;
5354 
5355     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5356               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5357          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5358               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5359        ) {
5360         float_raise( float_flag_invalid );
5361         return 0;
5362     }
5363     aSign = extractFloat128Sign( a );
5364     bSign = extractFloat128Sign( b );
5365     if ( aSign != bSign ) {
5366         return
5367                aSign
5368             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5369                  != 0 );
5370     }
5371     return
5372           aSign ? lt128( b.high, b.low, a.high, a.low )
5373         : lt128( a.high, a.low, b.high, b.low );
5374 
5375 }
5376 
5377 /*
5378 -------------------------------------------------------------------------------
5379 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5380 the corresponding value `b', and 0 otherwise.  The invalid exception is
5381 raised if either operand is a NaN.  Otherwise, the comparison is performed
5382 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5383 -------------------------------------------------------------------------------
5384 */
5385 flag float128_eq_signaling( float128 a, float128 b )
5386 {
5387 
5388     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5389               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5390          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5391               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5392        ) {
5393         float_raise( float_flag_invalid );
5394         return 0;
5395     }
5396     return
5397            ( a.low == b.low )
5398         && (    ( a.high == b.high )
5399              || (    ( a.low == 0 )
5400                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5401            );
5402 
5403 }
5404 
5405 /*
5406 -------------------------------------------------------------------------------
5407 Returns 1 if the quadruple-precision floating-point value `a' is less than
5408 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5409 cause an exception.  Otherwise, the comparison is performed according to the
5410 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5411 -------------------------------------------------------------------------------
5412 */
5413 flag float128_le_quiet( float128 a, float128 b )
5414 {
5415     flag aSign, bSign;
5416 
5417     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5418               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5419          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5420               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5421        ) {
5422         if (    float128_is_signaling_nan( a )
5423              || float128_is_signaling_nan( b ) ) {
5424             float_raise( float_flag_invalid );
5425         }
5426         return 0;
5427     }
5428     aSign = extractFloat128Sign( a );
5429     bSign = extractFloat128Sign( b );
5430     if ( aSign != bSign ) {
5431         return
5432                aSign
5433             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5434                  == 0 );
5435     }
5436     return
5437           aSign ? le128( b.high, b.low, a.high, a.low )
5438         : le128( a.high, a.low, b.high, b.low );
5439 
5440 }
5441 
5442 /*
5443 -------------------------------------------------------------------------------
5444 Returns 1 if the quadruple-precision floating-point value `a' is less than
5445 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5446 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5447 Standard for Binary Floating-Point Arithmetic.
5448 -------------------------------------------------------------------------------
5449 */
5450 flag float128_lt_quiet( float128 a, float128 b )
5451 {
5452     flag aSign, bSign;
5453 
5454     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5455               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5456          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5457               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5458        ) {
5459         if (    float128_is_signaling_nan( a )
5460              || float128_is_signaling_nan( b ) ) {
5461             float_raise( float_flag_invalid );
5462         }
5463         return 0;
5464     }
5465     aSign = extractFloat128Sign( a );
5466     bSign = extractFloat128Sign( b );
5467     if ( aSign != bSign ) {
5468         return
5469                aSign
5470             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5471                  != 0 );
5472     }
5473     return
5474           aSign ? lt128( b.high, b.low, a.high, a.low )
5475         : lt128( a.high, a.low, b.high, b.low );
5476 
5477 }
5478 
5479 #endif
5480 
5481 
5482 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5483 
5484 /*
5485  * These two routines are not part of the original softfloat distribution.
5486  *
5487  * They are based on the corresponding conversions to integer but return
5488  * unsigned numbers instead since these functions are required by GCC.
5489  *
5490  * Added by Mark Brinicombe <mark@netbsd.org>	27/09/97
5491  *
5492  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5493  */
5494 
5495 /*
5496 -------------------------------------------------------------------------------
5497 Returns the result of converting the double-precision floating-point value
5498 `a' to the 32-bit unsigned integer format.  The conversion is
5499 performed according to the IEC/IEEE Standard for Binary Floating-point
5500 Arithmetic, except that the conversion is always rounded toward zero.  If
5501 `a' is a NaN, the largest positive integer is returned.  If the conversion
5502 overflows, the largest integer positive is returned.
5503 -------------------------------------------------------------------------------
5504 */
5505 uint32 float64_to_uint32_round_to_zero( float64 a )
5506 {
5507     flag aSign;
5508     int16 aExp, shiftCount;
5509     bits64 aSig, savedASig;
5510     uint32 z;
5511 
5512     aSig = extractFloat64Frac( a );
5513     aExp = extractFloat64Exp( a );
5514     aSign = extractFloat64Sign( a );
5515 
5516     if (aSign) {
5517         float_raise( float_flag_invalid );
5518     	return(0);
5519     }
5520 
5521     if ( 0x41E < aExp ) {
5522         float_raise( float_flag_invalid );
5523         return 0xffffffff;
5524     }
5525     else if ( aExp < 0x3FF ) {
5526         if ( aExp || aSig ) float_set_inexact();
5527         return 0;
5528     }
5529     aSig |= LIT64( 0x0010000000000000 );
5530     shiftCount = 0x433 - aExp;
5531     savedASig = aSig;
5532     aSig >>= shiftCount;
5533     z = aSig;
5534     if ( ( aSig<<shiftCount ) != savedASig ) {
5535         float_set_inexact();
5536     }
5537     return z;
5538 
5539 }
5540 
5541 /*
5542 -------------------------------------------------------------------------------
5543 Returns the result of converting the single-precision floating-point value
5544 `a' to the 32-bit unsigned integer format.  The conversion is
5545 performed according to the IEC/IEEE Standard for Binary Floating-point
5546 Arithmetic, except that the conversion is always rounded toward zero.  If
5547 `a' is a NaN, the largest positive integer is returned.  If the conversion
5548 overflows, the largest positive integer is returned.
5549 -------------------------------------------------------------------------------
5550 */
5551 uint32 float32_to_uint32_round_to_zero( float32 a )
5552 {
5553     flag aSign;
5554     int16 aExp, shiftCount;
5555     bits32 aSig;
5556     uint32 z;
5557 
5558     aSig = extractFloat32Frac( a );
5559     aExp = extractFloat32Exp( a );
5560     aSign = extractFloat32Sign( a );
5561     shiftCount = aExp - 0x9E;
5562 
5563     if (aSign) {
5564         float_raise( float_flag_invalid );
5565     	return(0);
5566     }
5567     if ( 0 < shiftCount ) {
5568         float_raise( float_flag_invalid );
5569         return 0xFFFFFFFF;
5570     }
5571     else if ( aExp <= 0x7E ) {
5572         if ( aExp | aSig ) float_set_inexact();
5573         return 0;
5574     }
5575     aSig = ( aSig | 0x800000 )<<8;
5576     z = aSig>>( - shiftCount );
5577     if ( aSig<<( shiftCount & 31 ) ) {
5578         float_set_inexact();
5579     }
5580     return z;
5581 
5582 }
5583 
5584 #endif
5585