xref: /netbsd-src/sys/lib/libkern/softfloat.c (revision 16dce51364ebe8aeafbae46bc5aa167b8115bc45)
1 /* $NetBSD: softfloat.c,v 1.6 2017/12/31 11:43:42 martin Exp $ */
2 
3 /*
4  * This version hacked for use with gcc -msoft-float by bjh21.
5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6  *  itself).
7  */
8 
9 /*
10  * Things you may want to define:
11  *
12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
14  *   properly renamed.
15  */
16 
17 /*
18 ===============================================================================
19 
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* If you need this in a boot program, you have bigger problems... */
48 #ifndef _STANDALONE
49 
50 #include <sys/cdefs.h>
51 #if defined(LIBC_SCCS) && !defined(lint)
52 __RCSID("$NetBSD: softfloat.c,v 1.6 2017/12/31 11:43:42 martin Exp $");
53 #endif /* LIBC_SCCS and not lint */
54 
55 #ifdef SOFTFLOAT_FOR_GCC
56 #include "softfloat-for-gcc.h"
57 #endif
58 
59 #include "milieu.h"
60 #include "softfloat.h"
61 
62 /*
63  * Conversions between floats as stored in memory and floats as
64  * SoftFloat uses them
65  */
66 #ifndef FLOAT64_DEMANGLE
67 #define FLOAT64_DEMANGLE(a)	(a)
68 #endif
69 #ifndef FLOAT64_MANGLE
70 #define FLOAT64_MANGLE(a)	(a)
71 #endif
72 
73 /*
74 -------------------------------------------------------------------------------
75 Floating-point rounding mode, extended double-precision rounding precision,
76 and exception flags.
77 -------------------------------------------------------------------------------
78 */
79 
80 /*
81  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
82  * 	Right now, it does not.  I've removed all other dynamic global
83  * 	variables. [ross]
84  */
85 #ifdef FLOATX80
86 int8 floatx80_rounding_precision = 80;
87 #endif
88 
89 /*
90 -------------------------------------------------------------------------------
91 Primitive arithmetic functions, including multi-word arithmetic, and
92 division and square root approximations.  (Can be specialized to target if
93 desired.)
94 -------------------------------------------------------------------------------
95 */
96 #include "softfloat-macros.h"
97 
98 /*
99 -------------------------------------------------------------------------------
100 Functions and definitions to determine:  (1) whether tininess for underflow
101 is detected before or after rounding by default, (2) what (if anything)
102 happens when exceptions are raised, (3) how signaling NaNs are distinguished
103 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
104 are propagated from function inputs to output.  These details are target-
105 specific.
106 -------------------------------------------------------------------------------
107 */
108 #include "softfloat-specialize.h"
109 
110 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
111 /*
112 -------------------------------------------------------------------------------
113 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
114 and 7, and returns the properly rounded 32-bit integer corresponding to the
115 input.  If `zSign' is 1, the input is negated before being converted to an
116 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
117 is simply rounded to an integer, with the inexact exception raised if the
118 input cannot be represented exactly as an integer.  However, if the fixed-
119 point input is too large, the invalid exception is raised and the largest
120 positive or negative integer is returned.
121 -------------------------------------------------------------------------------
122 */
123 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
124 {
125     int8 roundingMode;
126     flag roundNearestEven;
127     int8 roundIncrement, roundBits;
128     int32 z;
129 
130     roundingMode = float_rounding_mode();
131     roundNearestEven = ( roundingMode == float_round_nearest_even );
132     roundIncrement = 0x40;
133     if ( ! roundNearestEven ) {
134         if ( roundingMode == float_round_to_zero ) {
135             roundIncrement = 0;
136         }
137         else {
138             roundIncrement = 0x7F;
139             if ( zSign ) {
140                 if ( roundingMode == float_round_up ) roundIncrement = 0;
141             }
142             else {
143                 if ( roundingMode == float_round_down ) roundIncrement = 0;
144             }
145         }
146     }
147     roundBits = absZ & 0x7F;
148     absZ = ( absZ + roundIncrement )>>7;
149     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
150     z = absZ;
151     if ( zSign ) z = - z;
152     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
153         float_raise( float_flag_invalid );
154         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
155     }
156     if ( roundBits ) float_set_inexact();
157     return z;
158 
159 }
160 
161 /*
162 -------------------------------------------------------------------------------
163 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
164 `absZ1', with binary point between bits 63 and 64 (between the input words),
165 and returns the properly rounded 64-bit integer corresponding to the input.
166 If `zSign' is 1, the input is negated before being converted to an integer.
167 Ordinarily, the fixed-point input is simply rounded to an integer, with
168 the inexact exception raised if the input cannot be represented exactly as
169 an integer.  However, if the fixed-point input is too large, the invalid
170 exception is raised and the largest positive or negative integer is
171 returned.
172 -------------------------------------------------------------------------------
173 */
174 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
175 {
176     int8 roundingMode;
177     flag roundNearestEven, increment;
178     int64 z;
179 
180     roundingMode = float_rounding_mode();
181     roundNearestEven = ( roundingMode == float_round_nearest_even );
182     increment = ( (sbits64) absZ1 < 0 );
183     if ( ! roundNearestEven ) {
184         if ( roundingMode == float_round_to_zero ) {
185             increment = 0;
186         }
187         else {
188             if ( zSign ) {
189                 increment = ( roundingMode == float_round_down ) && absZ1;
190             }
191             else {
192                 increment = ( roundingMode == float_round_up ) && absZ1;
193             }
194         }
195     }
196     if ( increment ) {
197         ++absZ0;
198         if ( absZ0 == 0 ) goto overflow;
199         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
200     }
201     z = absZ0;
202     if ( zSign ) z = - z;
203     if ( z && ( ( z < 0 ) ^ zSign ) ) {
204  overflow:
205         float_raise( float_flag_invalid );
206         return
207               zSign ? (sbits64) LIT64( 0x8000000000000000 )
208             : LIT64( 0x7FFFFFFFFFFFFFFF );
209     }
210     if ( absZ1 ) float_set_inexact();
211     return z;
212 
213 }
214 
215 /* same as above, but for unsigned values */
216 static uint64 roundAndPackUInt64( bits64 absZ0, bits64 absZ1 )
217 {
218     int8 roundingMode;
219     flag roundNearestEven, increment;
220     uint64 z;
221 
222     roundingMode = float_rounding_mode();
223     roundNearestEven = ( roundingMode == float_round_nearest_even );
224     increment = ( (sbits64) absZ1 < 0 );
225     if ( ! roundNearestEven ) {
226         if ( roundingMode == float_round_to_zero ) {
227             increment = 0;
228         }
229         else {
230             increment = ( roundingMode == float_round_up ) && absZ1;
231         }
232     }
233     if ( increment ) {
234         ++absZ0;
235         if ( absZ0 == 0 ) {
236             float_raise( float_flag_invalid );
237             return LIT64( 0x7FFFFFFFFFFFFFFF );
238 	}
239         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
240     }
241     z = absZ0;
242     if ( absZ1 ) float_set_inexact();
243     return z;
244 
245 }
246 #endif
247 
248 /*
249 -------------------------------------------------------------------------------
250 Returns the fraction bits of the single-precision floating-point value `a'.
251 -------------------------------------------------------------------------------
252 */
253 INLINE bits32 extractFloat32Frac( float32 a )
254 {
255 
256     return a & 0x007FFFFF;
257 
258 }
259 
260 /*
261 -------------------------------------------------------------------------------
262 Returns the exponent bits of the single-precision floating-point value `a'.
263 -------------------------------------------------------------------------------
264 */
265 INLINE int16 extractFloat32Exp( float32 a )
266 {
267 
268     return ( a>>23 ) & 0xFF;
269 
270 }
271 
272 /*
273 -------------------------------------------------------------------------------
274 Returns the sign bit of the single-precision floating-point value `a'.
275 -------------------------------------------------------------------------------
276 */
277 INLINE flag extractFloat32Sign( float32 a )
278 {
279 
280     return a>>31;
281 
282 }
283 
284 /*
285 -------------------------------------------------------------------------------
286 Normalizes the subnormal single-precision floating-point value represented
287 by the denormalized significand `aSig'.  The normalized exponent and
288 significand are stored at the locations pointed to by `zExpPtr' and
289 `zSigPtr', respectively.
290 -------------------------------------------------------------------------------
291 */
292 static void
293  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
294 {
295     int8 shiftCount;
296 
297     shiftCount = countLeadingZeros32( aSig ) - 8;
298     *zSigPtr = aSig<<shiftCount;
299     *zExpPtr = 1 - shiftCount;
300 
301 }
302 
303 /*
304 -------------------------------------------------------------------------------
305 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
306 single-precision floating-point value, returning the result.  After being
307 shifted into the proper positions, the three fields are simply added
308 together to form the result.  This means that any integer portion of `zSig'
309 will be added into the exponent.  Since a properly normalized significand
310 will have an integer portion equal to 1, the `zExp' input should be 1 less
311 than the desired result exponent whenever `zSig' is a complete, normalized
312 significand.
313 -------------------------------------------------------------------------------
314 */
315 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
316 {
317 
318     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
319 
320 }
321 
322 /*
323 -------------------------------------------------------------------------------
324 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
325 and significand `zSig', and returns the proper single-precision floating-
326 point value corresponding to the abstract input.  Ordinarily, the abstract
327 value is simply rounded and packed into the single-precision format, with
328 the inexact exception raised if the abstract input cannot be represented
329 exactly.  However, if the abstract value is too large, the overflow and
330 inexact exceptions are raised and an infinity or maximal finite value is
331 returned.  If the abstract value is too small, the input value is rounded to
332 a subnormal number, and the underflow and inexact exceptions are raised if
333 the abstract input cannot be represented exactly as a subnormal single-
334 precision floating-point number.
335     The input significand `zSig' has its binary point between bits 30
336 and 29, which is 7 bits to the left of the usual location.  This shifted
337 significand must be normalized or smaller.  If `zSig' is not normalized,
338 `zExp' must be 0; in that case, the result returned is a subnormal number,
339 and it must not require rounding.  In the usual case that `zSig' is
340 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
341 The handling of underflow and overflow follows the IEC/IEEE Standard for
342 Binary Floating-Point Arithmetic.
343 -------------------------------------------------------------------------------
344 */
345 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
346 {
347     int8 roundingMode;
348     flag roundNearestEven;
349     int8 roundIncrement, roundBits;
350     flag isTiny;
351 
352     roundingMode = float_rounding_mode();
353     roundNearestEven = ( roundingMode == float_round_nearest_even );
354     roundIncrement = 0x40;
355     if ( ! roundNearestEven ) {
356         if ( roundingMode == float_round_to_zero ) {
357             roundIncrement = 0;
358         }
359         else {
360             roundIncrement = 0x7F;
361             if ( zSign ) {
362                 if ( roundingMode == float_round_up ) roundIncrement = 0;
363             }
364             else {
365                 if ( roundingMode == float_round_down ) roundIncrement = 0;
366             }
367         }
368     }
369     roundBits = zSig & 0x7F;
370     if ( 0xFD <= (bits16) zExp ) {
371         if (    ( 0xFD < zExp )
372              || (    ( zExp == 0xFD )
373                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
374            ) {
375             float_raise( float_flag_overflow | float_flag_inexact );
376             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
377         }
378         if ( zExp < 0 ) {
379             isTiny =
380                    ( float_detect_tininess == float_tininess_before_rounding )
381                 || ( zExp < -1 )
382                 || ( zSig + roundIncrement < 0x80000000 );
383             shift32RightJamming( zSig, - zExp, &zSig );
384             zExp = 0;
385             roundBits = zSig & 0x7F;
386             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
387         }
388     }
389     if ( roundBits ) float_set_inexact();
390     zSig = ( zSig + roundIncrement )>>7;
391     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
392     if ( zSig == 0 ) zExp = 0;
393     return packFloat32( zSign, zExp, zSig );
394 
395 }
396 
397 /*
398 -------------------------------------------------------------------------------
399 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
400 and significand `zSig', and returns the proper single-precision floating-
401 point value corresponding to the abstract input.  This routine is just like
402 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
403 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
404 floating-point exponent.
405 -------------------------------------------------------------------------------
406 */
407 static float32
408  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
409 {
410     int8 shiftCount;
411 
412     shiftCount = countLeadingZeros32( zSig ) - 1;
413     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
414 
415 }
416 
417 /*
418 -------------------------------------------------------------------------------
419 Returns the fraction bits of the double-precision floating-point value `a'.
420 -------------------------------------------------------------------------------
421 */
422 INLINE bits64 extractFloat64Frac( float64 a )
423 {
424 
425     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
426 
427 }
428 
429 /*
430 -------------------------------------------------------------------------------
431 Returns the exponent bits of the double-precision floating-point value `a'.
432 -------------------------------------------------------------------------------
433 */
434 INLINE int16 extractFloat64Exp( float64 a )
435 {
436 
437     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
438 
439 }
440 
441 /*
442 -------------------------------------------------------------------------------
443 Returns the sign bit of the double-precision floating-point value `a'.
444 -------------------------------------------------------------------------------
445 */
446 INLINE flag extractFloat64Sign( float64 a )
447 {
448 
449     return FLOAT64_DEMANGLE(a)>>63;
450 
451 }
452 
453 /*
454 -------------------------------------------------------------------------------
455 Normalizes the subnormal double-precision floating-point value represented
456 by the denormalized significand `aSig'.  The normalized exponent and
457 significand are stored at the locations pointed to by `zExpPtr' and
458 `zSigPtr', respectively.
459 -------------------------------------------------------------------------------
460 */
461 static void
462  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
463 {
464     int8 shiftCount;
465 
466     shiftCount = countLeadingZeros64( aSig ) - 11;
467     *zSigPtr = aSig<<shiftCount;
468     *zExpPtr = 1 - shiftCount;
469 
470 }
471 
472 /*
473 -------------------------------------------------------------------------------
474 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
475 double-precision floating-point value, returning the result.  After being
476 shifted into the proper positions, the three fields are simply added
477 together to form the result.  This means that any integer portion of `zSig'
478 will be added into the exponent.  Since a properly normalized significand
479 will have an integer portion equal to 1, the `zExp' input should be 1 less
480 than the desired result exponent whenever `zSig' is a complete, normalized
481 significand.
482 -------------------------------------------------------------------------------
483 */
484 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
485 {
486 
487     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
488 			   ( ( (bits64) zExp )<<52 ) + zSig );
489 
490 }
491 
492 /*
493 -------------------------------------------------------------------------------
494 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
495 and significand `zSig', and returns the proper double-precision floating-
496 point value corresponding to the abstract input.  Ordinarily, the abstract
497 value is simply rounded and packed into the double-precision format, with
498 the inexact exception raised if the abstract input cannot be represented
499 exactly.  However, if the abstract value is too large, the overflow and
500 inexact exceptions are raised and an infinity or maximal finite value is
501 returned.  If the abstract value is too small, the input value is rounded to
502 a subnormal number, and the underflow and inexact exceptions are raised if
503 the abstract input cannot be represented exactly as a subnormal double-
504 precision floating-point number.
505     The input significand `zSig' has its binary point between bits 62
506 and 61, which is 10 bits to the left of the usual location.  This shifted
507 significand must be normalized or smaller.  If `zSig' is not normalized,
508 `zExp' must be 0; in that case, the result returned is a subnormal number,
509 and it must not require rounding.  In the usual case that `zSig' is
510 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
511 The handling of underflow and overflow follows the IEC/IEEE Standard for
512 Binary Floating-Point Arithmetic.
513 -------------------------------------------------------------------------------
514 */
515 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
516 {
517     int8 roundingMode;
518     flag roundNearestEven;
519     int16 roundIncrement, roundBits;
520     flag isTiny;
521 
522     roundingMode = float_rounding_mode();
523     roundNearestEven = ( roundingMode == float_round_nearest_even );
524     roundIncrement = 0x200;
525     if ( ! roundNearestEven ) {
526         if ( roundingMode == float_round_to_zero ) {
527             roundIncrement = 0;
528         }
529         else {
530             roundIncrement = 0x3FF;
531             if ( zSign ) {
532                 if ( roundingMode == float_round_up ) roundIncrement = 0;
533             }
534             else {
535                 if ( roundingMode == float_round_down ) roundIncrement = 0;
536             }
537         }
538     }
539     roundBits = zSig & 0x3FF;
540     if ( 0x7FD <= (bits16) zExp ) {
541         if (    ( 0x7FD < zExp )
542              || (    ( zExp == 0x7FD )
543                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
544            ) {
545             float_raise( float_flag_overflow | float_flag_inexact );
546             return FLOAT64_MANGLE(
547 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
548 		( roundIncrement == 0 ));
549         }
550         if ( zExp < 0 ) {
551             isTiny =
552                    ( float_detect_tininess == float_tininess_before_rounding )
553                 || ( zExp < -1 )
554                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
555             shift64RightJamming( zSig, - zExp, &zSig );
556             zExp = 0;
557             roundBits = zSig & 0x3FF;
558             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
559         }
560     }
561     if ( roundBits ) float_set_inexact();
562     zSig = ( zSig + roundIncrement )>>10;
563     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
564     if ( zSig == 0 ) zExp = 0;
565     return packFloat64( zSign, zExp, zSig );
566 
567 }
568 
569 /*
570 -------------------------------------------------------------------------------
571 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
572 and significand `zSig', and returns the proper double-precision floating-
573 point value corresponding to the abstract input.  This routine is just like
574 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
575 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
576 floating-point exponent.
577 -------------------------------------------------------------------------------
578 */
579 static float64
580  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
581 {
582     int8 shiftCount;
583 
584     shiftCount = countLeadingZeros64( zSig ) - 1;
585     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
586 
587 }
588 
589 #ifdef FLOATX80
590 
591 /*
592 -------------------------------------------------------------------------------
593 Returns the fraction bits of the extended double-precision floating-point
594 value `a'.
595 -------------------------------------------------------------------------------
596 */
597 INLINE bits64 extractFloatx80Frac( floatx80 a )
598 {
599 
600     return a.low;
601 
602 }
603 
604 /*
605 -------------------------------------------------------------------------------
606 Returns the exponent bits of the extended double-precision floating-point
607 value `a'.
608 -------------------------------------------------------------------------------
609 */
610 INLINE int32 extractFloatx80Exp( floatx80 a )
611 {
612 
613     return a.high & 0x7FFF;
614 
615 }
616 
617 /*
618 -------------------------------------------------------------------------------
619 Returns the sign bit of the extended double-precision floating-point value
620 `a'.
621 -------------------------------------------------------------------------------
622 */
623 INLINE flag extractFloatx80Sign( floatx80 a )
624 {
625 
626     return a.high>>15;
627 
628 }
629 
630 /*
631 -------------------------------------------------------------------------------
632 Normalizes the subnormal extended double-precision floating-point value
633 represented by the denormalized significand `aSig'.  The normalized exponent
634 and significand are stored at the locations pointed to by `zExpPtr' and
635 `zSigPtr', respectively.
636 -------------------------------------------------------------------------------
637 */
638 static void
639  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
640 {
641     int8 shiftCount;
642 
643     shiftCount = countLeadingZeros64( aSig );
644     *zSigPtr = aSig<<shiftCount;
645     *zExpPtr = 1 - shiftCount;
646 
647 }
648 
649 /*
650 -------------------------------------------------------------------------------
651 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
652 extended double-precision floating-point value, returning the result.
653 -------------------------------------------------------------------------------
654 */
655 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
656 {
657     floatx80 z;
658 
659     z.low = zSig;
660     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
661     return z;
662 
663 }
664 
665 /*
666 -------------------------------------------------------------------------------
667 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
668 and extended significand formed by the concatenation of `zSig0' and `zSig1',
669 and returns the proper extended double-precision floating-point value
670 corresponding to the abstract input.  Ordinarily, the abstract value is
671 rounded and packed into the extended double-precision format, with the
672 inexact exception raised if the abstract input cannot be represented
673 exactly.  However, if the abstract value is too large, the overflow and
674 inexact exceptions are raised and an infinity or maximal finite value is
675 returned.  If the abstract value is too small, the input value is rounded to
676 a subnormal number, and the underflow and inexact exceptions are raised if
677 the abstract input cannot be represented exactly as a subnormal extended
678 double-precision floating-point number.
679     If `roundingPrecision' is 32 or 64, the result is rounded to the same
680 number of bits as single or double precision, respectively.  Otherwise, the
681 result is rounded to the full precision of the extended double-precision
682 format.
683     The input significand must be normalized or smaller.  If the input
684 significand is not normalized, `zExp' must be 0; in that case, the result
685 returned is a subnormal number, and it must not require rounding.  The
686 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
687 Floating-Point Arithmetic.
688 -------------------------------------------------------------------------------
689 */
690 static floatx80
691  roundAndPackFloatx80(
692      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
693  )
694 {
695     int8 roundingMode;
696     flag roundNearestEven, increment, isTiny;
697     int64 roundIncrement, roundMask, roundBits;
698 
699     roundingMode = float_rounding_mode();
700     roundNearestEven = ( roundingMode == float_round_nearest_even );
701     if ( roundingPrecision == 80 ) goto precision80;
702     if ( roundingPrecision == 64 ) {
703         roundIncrement = LIT64( 0x0000000000000400 );
704         roundMask = LIT64( 0x00000000000007FF );
705     }
706     else if ( roundingPrecision == 32 ) {
707         roundIncrement = LIT64( 0x0000008000000000 );
708         roundMask = LIT64( 0x000000FFFFFFFFFF );
709     }
710     else {
711         goto precision80;
712     }
713     zSig0 |= ( zSig1 != 0 );
714     if ( ! roundNearestEven ) {
715         if ( roundingMode == float_round_to_zero ) {
716             roundIncrement = 0;
717         }
718         else {
719             roundIncrement = roundMask;
720             if ( zSign ) {
721                 if ( roundingMode == float_round_up ) roundIncrement = 0;
722             }
723             else {
724                 if ( roundingMode == float_round_down ) roundIncrement = 0;
725             }
726         }
727     }
728     roundBits = zSig0 & roundMask;
729     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
730         if (    ( 0x7FFE < zExp )
731              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
732            ) {
733             goto overflow;
734         }
735         if ( zExp <= 0 ) {
736             isTiny =
737                    ( float_detect_tininess == float_tininess_before_rounding )
738                 || ( zExp < 0 )
739                 || ( zSig0 <= zSig0 + roundIncrement );
740             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
741             zExp = 0;
742             roundBits = zSig0 & roundMask;
743             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
744             if ( roundBits ) float_set_inexact();
745             zSig0 += roundIncrement;
746             if ( (sbits64) zSig0 < 0 ) zExp = 1;
747             roundIncrement = roundMask + 1;
748             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
749                 roundMask |= roundIncrement;
750             }
751             zSig0 &= ~ roundMask;
752             return packFloatx80( zSign, zExp, zSig0 );
753         }
754     }
755     if ( roundBits ) float_set_inexact();
756     zSig0 += roundIncrement;
757     if ( zSig0 < roundIncrement ) {
758         ++zExp;
759         zSig0 = LIT64( 0x8000000000000000 );
760     }
761     roundIncrement = roundMask + 1;
762     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
763         roundMask |= roundIncrement;
764     }
765     zSig0 &= ~ roundMask;
766     if ( zSig0 == 0 ) zExp = 0;
767     return packFloatx80( zSign, zExp, zSig0 );
768  precision80:
769     increment = ( (sbits64) zSig1 < 0 );
770     if ( ! roundNearestEven ) {
771         if ( roundingMode == float_round_to_zero ) {
772             increment = 0;
773         }
774         else {
775             if ( zSign ) {
776                 increment = ( roundingMode == float_round_down ) && zSig1;
777             }
778             else {
779                 increment = ( roundingMode == float_round_up ) && zSig1;
780             }
781         }
782     }
783     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
784         if (    ( 0x7FFE < zExp )
785              || (    ( zExp == 0x7FFE )
786                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
787                   && increment
788                 )
789            ) {
790             roundMask = 0;
791  overflow:
792             float_raise( float_flag_overflow | float_flag_inexact );
793             if (    ( roundingMode == float_round_to_zero )
794                  || ( zSign && ( roundingMode == float_round_up ) )
795                  || ( ! zSign && ( roundingMode == float_round_down ) )
796                ) {
797                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
798             }
799             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
800         }
801         if ( zExp <= 0 ) {
802             isTiny =
803                    ( float_detect_tininess == float_tininess_before_rounding )
804                 || ( zExp < 0 )
805                 || ! increment
806                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
807             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
808             zExp = 0;
809             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
810             if ( zSig1 ) float_set_inexact();
811             if ( roundNearestEven ) {
812                 increment = ( (sbits64) zSig1 < 0 );
813             }
814             else {
815                 if ( zSign ) {
816                     increment = ( roundingMode == float_round_down ) && zSig1;
817                 }
818                 else {
819                     increment = ( roundingMode == float_round_up ) && zSig1;
820                 }
821             }
822             if ( increment ) {
823                 ++zSig0;
824                 zSig0 &=
825                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
826                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
827             }
828             return packFloatx80( zSign, zExp, zSig0 );
829         }
830     }
831     if ( zSig1 ) float_set_inexact();
832     if ( increment ) {
833         ++zSig0;
834         if ( zSig0 == 0 ) {
835             ++zExp;
836             zSig0 = LIT64( 0x8000000000000000 );
837         }
838         else {
839             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
840         }
841     }
842     else {
843         if ( zSig0 == 0 ) zExp = 0;
844     }
845     return packFloatx80( zSign, zExp, zSig0 );
846 
847 }
848 
849 /*
850 -------------------------------------------------------------------------------
851 Takes an abstract floating-point value having sign `zSign', exponent
852 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
853 and returns the proper extended double-precision floating-point value
854 corresponding to the abstract input.  This routine is just like
855 `roundAndPackFloatx80' except that the input significand does not have to be
856 normalized.
857 -------------------------------------------------------------------------------
858 */
859 static floatx80
860  normalizeRoundAndPackFloatx80(
861      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
862  )
863 {
864     int8 shiftCount;
865 
866     if ( zSig0 == 0 ) {
867         zSig0 = zSig1;
868         zSig1 = 0;
869         zExp -= 64;
870     }
871     shiftCount = countLeadingZeros64( zSig0 );
872     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
873     zExp -= shiftCount;
874     return
875         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
876 
877 }
878 
879 #endif
880 
881 #ifdef FLOAT128
882 
883 /*
884 -------------------------------------------------------------------------------
885 Returns the least-significant 64 fraction bits of the quadruple-precision
886 floating-point value `a'.
887 -------------------------------------------------------------------------------
888 */
889 INLINE bits64 extractFloat128Frac1( float128 a )
890 {
891 
892     return a.low;
893 
894 }
895 
896 /*
897 -------------------------------------------------------------------------------
898 Returns the most-significant 48 fraction bits of the quadruple-precision
899 floating-point value `a'.
900 -------------------------------------------------------------------------------
901 */
902 INLINE bits64 extractFloat128Frac0( float128 a )
903 {
904 
905     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
906 
907 }
908 
909 /*
910 -------------------------------------------------------------------------------
911 Returns the exponent bits of the quadruple-precision floating-point value
912 `a'.
913 -------------------------------------------------------------------------------
914 */
915 INLINE int32 extractFloat128Exp( float128 a )
916 {
917 
918     return ( a.high>>48 ) & 0x7FFF;
919 
920 }
921 
922 /*
923 -------------------------------------------------------------------------------
924 Returns the sign bit of the quadruple-precision floating-point value `a'.
925 -------------------------------------------------------------------------------
926 */
927 INLINE flag extractFloat128Sign( float128 a )
928 {
929 
930     return a.high>>63;
931 
932 }
933 
934 /*
935 -------------------------------------------------------------------------------
936 Normalizes the subnormal quadruple-precision floating-point value
937 represented by the denormalized significand formed by the concatenation of
938 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
939 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
940 significand are stored at the location pointed to by `zSig0Ptr', and the
941 least significant 64 bits of the normalized significand are stored at the
942 location pointed to by `zSig1Ptr'.
943 -------------------------------------------------------------------------------
944 */
945 static void
946  normalizeFloat128Subnormal(
947      bits64 aSig0,
948      bits64 aSig1,
949      int32 *zExpPtr,
950      bits64 *zSig0Ptr,
951      bits64 *zSig1Ptr
952  )
953 {
954     int8 shiftCount;
955 
956     if ( aSig0 == 0 ) {
957         shiftCount = countLeadingZeros64( aSig1 ) - 15;
958         if ( shiftCount < 0 ) {
959             *zSig0Ptr = aSig1>>( - shiftCount );
960             *zSig1Ptr = aSig1<<( shiftCount & 63 );
961         }
962         else {
963             *zSig0Ptr = aSig1<<shiftCount;
964             *zSig1Ptr = 0;
965         }
966         *zExpPtr = - shiftCount - 63;
967     }
968     else {
969         shiftCount = countLeadingZeros64( aSig0 ) - 15;
970         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
971         *zExpPtr = 1 - shiftCount;
972     }
973 
974 }
975 
976 /*
977 -------------------------------------------------------------------------------
978 Packs the sign `zSign', the exponent `zExp', and the significand formed
979 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
980 floating-point value, returning the result.  After being shifted into the
981 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
982 added together to form the most significant 32 bits of the result.  This
983 means that any integer portion of `zSig0' will be added into the exponent.
984 Since a properly normalized significand will have an integer portion equal
985 to 1, the `zExp' input should be 1 less than the desired result exponent
986 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
987 significand.
988 -------------------------------------------------------------------------------
989 */
990 INLINE float128
991  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
992 {
993     float128 z;
994 
995     z.low = zSig1;
996     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
997     return z;
998 
999 }
1000 
1001 /*
1002 -------------------------------------------------------------------------------
1003 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1004 and extended significand formed by the concatenation of `zSig0', `zSig1',
1005 and `zSig2', and returns the proper quadruple-precision floating-point value
1006 corresponding to the abstract input.  Ordinarily, the abstract value is
1007 simply rounded and packed into the quadruple-precision format, with the
1008 inexact exception raised if the abstract input cannot be represented
1009 exactly.  However, if the abstract value is too large, the overflow and
1010 inexact exceptions are raised and an infinity or maximal finite value is
1011 returned.  If the abstract value is too small, the input value is rounded to
1012 a subnormal number, and the underflow and inexact exceptions are raised if
1013 the abstract input cannot be represented exactly as a subnormal quadruple-
1014 precision floating-point number.
1015     The input significand must be normalized or smaller.  If the input
1016 significand is not normalized, `zExp' must be 0; in that case, the result
1017 returned is a subnormal number, and it must not require rounding.  In the
1018 usual case that the input significand is normalized, `zExp' must be 1 less
1019 than the ``true'' floating-point exponent.  The handling of underflow and
1020 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1021 -------------------------------------------------------------------------------
1022 */
1023 static float128
1024  roundAndPackFloat128(
1025      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
1026 {
1027     int8 roundingMode;
1028     flag roundNearestEven, increment, isTiny;
1029 
1030     roundingMode = float_rounding_mode();
1031     roundNearestEven = ( roundingMode == float_round_nearest_even );
1032     increment = ( (sbits64) zSig2 < 0 );
1033     if ( ! roundNearestEven ) {
1034         if ( roundingMode == float_round_to_zero ) {
1035             increment = 0;
1036         }
1037         else {
1038             if ( zSign ) {
1039                 increment = ( roundingMode == float_round_down ) && zSig2;
1040             }
1041             else {
1042                 increment = ( roundingMode == float_round_up ) && zSig2;
1043             }
1044         }
1045     }
1046     if ( 0x7FFD <= (bits32) zExp ) {
1047         if (    ( 0x7FFD < zExp )
1048              || (    ( zExp == 0x7FFD )
1049                   && eq128(
1050                          LIT64( 0x0001FFFFFFFFFFFF ),
1051                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1052                          zSig0,
1053                          zSig1
1054                      )
1055                   && increment
1056                 )
1057            ) {
1058             float_raise( float_flag_overflow | float_flag_inexact );
1059             if (    ( roundingMode == float_round_to_zero )
1060                  || ( zSign && ( roundingMode == float_round_up ) )
1061                  || ( ! zSign && ( roundingMode == float_round_down ) )
1062                ) {
1063                 return
1064                     packFloat128(
1065                         zSign,
1066                         0x7FFE,
1067                         LIT64( 0x0000FFFFFFFFFFFF ),
1068                         LIT64( 0xFFFFFFFFFFFFFFFF )
1069                     );
1070             }
1071             return packFloat128( zSign, 0x7FFF, 0, 0 );
1072         }
1073         if ( zExp < 0 ) {
1074             isTiny =
1075                    ( float_detect_tininess == float_tininess_before_rounding )
1076                 || ( zExp < -1 )
1077                 || ! increment
1078                 || lt128(
1079                        zSig0,
1080                        zSig1,
1081                        LIT64( 0x0001FFFFFFFFFFFF ),
1082                        LIT64( 0xFFFFFFFFFFFFFFFF )
1083                    );
1084             shift128ExtraRightJamming(
1085                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1086             zExp = 0;
1087             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1088             if ( roundNearestEven ) {
1089                 increment = ( (sbits64) zSig2 < 0 );
1090             }
1091             else {
1092                 if ( zSign ) {
1093                     increment = ( roundingMode == float_round_down ) && zSig2;
1094                 }
1095                 else {
1096                     increment = ( roundingMode == float_round_up ) && zSig2;
1097                 }
1098             }
1099         }
1100     }
1101     if ( zSig2 ) float_set_inexact();
1102     if ( increment ) {
1103         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1104         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1105     }
1106     else {
1107         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1108     }
1109     return packFloat128( zSign, zExp, zSig0, zSig1 );
1110 
1111 }
1112 
1113 /*
1114 -------------------------------------------------------------------------------
1115 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1116 and significand formed by the concatenation of `zSig0' and `zSig1', and
1117 returns the proper quadruple-precision floating-point value corresponding
1118 to the abstract input.  This routine is just like `roundAndPackFloat128'
1119 except that the input significand has fewer bits and does not have to be
1120 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1121 point exponent.
1122 -------------------------------------------------------------------------------
1123 */
1124 static float128
1125  normalizeRoundAndPackFloat128(
1126      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1127 {
1128     int8 shiftCount;
1129     bits64 zSig2;
1130 
1131     if ( zSig0 == 0 ) {
1132         zSig0 = zSig1;
1133         zSig1 = 0;
1134         zExp -= 64;
1135     }
1136     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1137     if ( 0 <= shiftCount ) {
1138         zSig2 = 0;
1139         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1140     }
1141     else {
1142         shift128ExtraRightJamming(
1143             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1144     }
1145     zExp -= shiftCount;
1146     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1147 
1148 }
1149 
1150 #endif
1151 
1152 /*
1153 -------------------------------------------------------------------------------
1154 Returns the result of converting the 32-bit two's complement integer `a'
1155 to the single-precision floating-point format.  The conversion is performed
1156 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1157 -------------------------------------------------------------------------------
1158 */
1159 float32 int32_to_float32( int32 a )
1160 {
1161     flag zSign;
1162 
1163     if ( a == 0 ) return 0;
1164     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1165     zSign = ( a < 0 );
1166     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1167 
1168 }
1169 
1170 /*
1171 -------------------------------------------------------------------------------
1172 Returns the result of converting the 32-bit two's complement integer `a'
1173 to the double-precision floating-point format.  The conversion is performed
1174 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1175 -------------------------------------------------------------------------------
1176 */
1177 float64 int32_to_float64( int32 a )
1178 {
1179     flag zSign;
1180     uint32 absA;
1181     int8 shiftCount;
1182     bits64 zSig;
1183 
1184     if ( a == 0 ) return 0;
1185     zSign = ( a < 0 );
1186     absA = zSign ? - a : a;
1187     shiftCount = countLeadingZeros32( absA ) + 21;
1188     zSig = absA;
1189     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1190 
1191 }
1192 
1193 #ifdef FLOATX80
1194 
1195 /*
1196 -------------------------------------------------------------------------------
1197 Returns the result of converting the 32-bit two's complement integer `a'
1198 to the extended double-precision floating-point format.  The conversion
1199 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1200 Arithmetic.
1201 -------------------------------------------------------------------------------
1202 */
1203 floatx80 int32_to_floatx80( int32 a )
1204 {
1205     flag zSign;
1206     uint32 absA;
1207     int8 shiftCount;
1208     bits64 zSig;
1209 
1210     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1211     zSign = ( a < 0 );
1212     absA = zSign ? - a : a;
1213     shiftCount = countLeadingZeros32( absA ) + 32;
1214     zSig = absA;
1215     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1216 
1217 }
1218 
1219 #endif
1220 
1221 #ifdef FLOAT128
1222 
1223 /*
1224 -------------------------------------------------------------------------------
1225 Returns the result of converting the 32-bit two's complement integer `a' to
1226 the quadruple-precision floating-point format.  The conversion is performed
1227 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1228 -------------------------------------------------------------------------------
1229 */
1230 float128 int32_to_float128( int32 a )
1231 {
1232     flag zSign;
1233     uint32 absA;
1234     int8 shiftCount;
1235     bits64 zSig0;
1236 
1237     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1238     zSign = ( a < 0 );
1239     absA = zSign ? - a : a;
1240     shiftCount = countLeadingZeros32( absA ) + 17;
1241     zSig0 = absA;
1242     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1243 
1244 }
1245 
1246 #endif
1247 
1248 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1249 /*
1250 -------------------------------------------------------------------------------
1251 Returns the result of converting the 64-bit two's complement integer `a'
1252 to the single-precision floating-point format.  The conversion is performed
1253 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1254 -------------------------------------------------------------------------------
1255 */
1256 float32 int64_to_float32( int64 a )
1257 {
1258     flag zSign;
1259     uint64 absA;
1260     int8 shiftCount;
1261 
1262     if ( a == 0 ) return 0;
1263     zSign = ( a < 0 );
1264     absA = zSign ? - a : a;
1265     shiftCount = countLeadingZeros64( absA ) - 40;
1266     if ( 0 <= shiftCount ) {
1267         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1268     }
1269     else {
1270         shiftCount += 7;
1271         if ( shiftCount < 0 ) {
1272             shift64RightJamming( absA, - shiftCount, &absA );
1273         }
1274         else {
1275             absA <<= shiftCount;
1276         }
1277         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1278     }
1279 
1280 }
1281 
1282 /*
1283 -------------------------------------------------------------------------------
1284 Returns the result of converting the 64-bit two's complement integer `a'
1285 to the double-precision floating-point format.  The conversion is performed
1286 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1287 -------------------------------------------------------------------------------
1288 */
1289 float64 int64_to_float64( int64 a )
1290 {
1291     flag zSign;
1292 
1293     if ( a == 0 ) return 0;
1294     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1295         return packFloat64( 1, 0x43E, 0 );
1296     }
1297     zSign = ( a < 0 );
1298     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1299 
1300 }
1301 
1302 #ifdef FLOATX80
1303 
1304 /*
1305 -------------------------------------------------------------------------------
1306 Returns the result of converting the 64-bit two's complement integer `a'
1307 to the extended double-precision floating-point format.  The conversion
1308 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1309 Arithmetic.
1310 -------------------------------------------------------------------------------
1311 */
1312 floatx80 int64_to_floatx80( int64 a )
1313 {
1314     flag zSign;
1315     uint64 absA;
1316     int8 shiftCount;
1317 
1318     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1319     zSign = ( a < 0 );
1320     absA = zSign ? - a : a;
1321     shiftCount = countLeadingZeros64( absA );
1322     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1323 
1324 }
1325 
1326 #endif
1327 
1328 #ifdef FLOAT128
1329 
1330 /*
1331 -------------------------------------------------------------------------------
1332 Returns the result of converting the 64-bit two's complement integer `a' to
1333 the quadruple-precision floating-point format.  The conversion is performed
1334 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1335 -------------------------------------------------------------------------------
1336 */
1337 float128 int64_to_float128( int64 a )
1338 {
1339     flag zSign;
1340     uint64 absA;
1341     int8 shiftCount;
1342     int32 zExp;
1343     bits64 zSig0, zSig1;
1344 
1345     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1346     zSign = ( a < 0 );
1347     absA = zSign ? - a : a;
1348     shiftCount = countLeadingZeros64( absA ) + 49;
1349     zExp = 0x406E - shiftCount;
1350     if ( 64 <= shiftCount ) {
1351         zSig1 = 0;
1352         zSig0 = absA;
1353         shiftCount -= 64;
1354     }
1355     else {
1356         zSig1 = absA;
1357         zSig0 = 0;
1358     }
1359     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1360     return packFloat128( zSign, zExp, zSig0, zSig1 );
1361 
1362 }
1363 
1364 #endif
1365 #endif /* !SOFTFLOAT_FOR_GCC */
1366 
1367 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1368 /*
1369 -------------------------------------------------------------------------------
1370 Returns the result of converting the single-precision floating-point value
1371 `a' to the 32-bit two's complement integer format.  The conversion is
1372 performed according to the IEC/IEEE Standard for Binary Floating-Point
1373 Arithmetic---which means in particular that the conversion is rounded
1374 according to the current rounding mode.  If `a' is a NaN, the largest
1375 positive integer is returned.  Otherwise, if the conversion overflows, the
1376 largest integer with the same sign as `a' is returned.
1377 -------------------------------------------------------------------------------
1378 */
1379 int32 float32_to_int32( float32 a )
1380 {
1381     flag aSign;
1382     int16 aExp, shiftCount;
1383     bits32 aSig;
1384     bits64 aSig64;
1385 
1386     aSig = extractFloat32Frac( a );
1387     aExp = extractFloat32Exp( a );
1388     aSign = extractFloat32Sign( a );
1389     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1390     if ( aExp ) aSig |= 0x00800000;
1391     shiftCount = 0xAF - aExp;
1392     aSig64 = aSig;
1393     aSig64 <<= 32;
1394     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1395     return roundAndPackInt32( aSign, aSig64 );
1396 
1397 }
1398 #endif /* !SOFTFLOAT_FOR_GCC */
1399 
1400 /*
1401 -------------------------------------------------------------------------------
1402 Returns the result of converting the single-precision floating-point value
1403 `a' to the 32-bit two's complement integer format.  The conversion is
1404 performed according to the IEC/IEEE Standard for Binary Floating-Point
1405 Arithmetic, except that the conversion is always rounded toward zero.
1406 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1407 the conversion overflows, the largest integer with the same sign as `a' is
1408 returned.
1409 -------------------------------------------------------------------------------
1410 */
1411 int32 float32_to_int32_round_to_zero( float32 a )
1412 {
1413     flag aSign;
1414     int16 aExp, shiftCount;
1415     bits32 aSig;
1416     int32 z;
1417 
1418     aSig = extractFloat32Frac( a );
1419     aExp = extractFloat32Exp( a );
1420     aSign = extractFloat32Sign( a );
1421     shiftCount = aExp - 0x9E;
1422     if ( 0 <= shiftCount ) {
1423         if ( a != 0xCF000000 ) {
1424             float_raise( float_flag_invalid );
1425             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1426         }
1427         return (sbits32) 0x80000000;
1428     }
1429     else if ( aExp <= 0x7E ) {
1430         if ( aExp | aSig ) float_set_inexact();
1431         return 0;
1432     }
1433     aSig = ( aSig | 0x00800000 )<<8;
1434     z = aSig>>( - shiftCount );
1435     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1436         float_set_inexact();
1437     }
1438     if ( aSign ) z = - z;
1439     return z;
1440 
1441 }
1442 
1443 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1444 /*
1445 -------------------------------------------------------------------------------
1446 Returns the result of converting the single-precision floating-point value
1447 `a' to the 64-bit two's complement integer format.  The conversion is
1448 performed according to the IEC/IEEE Standard for Binary Floating-Point
1449 Arithmetic---which means in particular that the conversion is rounded
1450 according to the current rounding mode.  If `a' is a NaN, the largest
1451 positive integer is returned.  Otherwise, if the conversion overflows, the
1452 largest integer with the same sign as `a' is returned.
1453 -------------------------------------------------------------------------------
1454 */
1455 int64 float32_to_int64( float32 a )
1456 {
1457     flag aSign;
1458     int16 aExp, shiftCount;
1459     bits32 aSig;
1460     bits64 aSig64, aSigExtra;
1461 
1462     aSig = extractFloat32Frac( a );
1463     aExp = extractFloat32Exp( a );
1464     aSign = extractFloat32Sign( a );
1465     shiftCount = 0xBE - aExp;
1466     if ( shiftCount < 0 ) {
1467         float_raise( float_flag_invalid );
1468         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1469             return LIT64( 0x7FFFFFFFFFFFFFFF );
1470         }
1471         return (sbits64) LIT64( 0x8000000000000000 );
1472     }
1473     if ( aExp ) aSig |= 0x00800000;
1474     aSig64 = aSig;
1475     aSig64 <<= 40;
1476     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1477     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1478 
1479 }
1480 
1481 /*
1482 -------------------------------------------------------------------------------
1483 Returns the result of converting the single-precision floating-point value
1484 `a' to the 64-bit two's complement integer format.  The conversion is
1485 performed according to the IEC/IEEE Standard for Binary Floating-Point
1486 Arithmetic, except that the conversion is always rounded toward zero.  If
1487 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1488 conversion overflows, the largest integer with the same sign as `a' is
1489 returned.
1490 -------------------------------------------------------------------------------
1491 */
1492 int64 float32_to_int64_round_to_zero( float32 a )
1493 {
1494     flag aSign;
1495     int16 aExp, shiftCount;
1496     bits32 aSig;
1497     bits64 aSig64;
1498     int64 z;
1499 
1500     aSig = extractFloat32Frac( a );
1501     aExp = extractFloat32Exp( a );
1502     aSign = extractFloat32Sign( a );
1503     shiftCount = aExp - 0xBE;
1504     if ( 0 <= shiftCount ) {
1505         if ( a != 0xDF000000 ) {
1506             float_raise( float_flag_invalid );
1507             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1508                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1509             }
1510         }
1511         return (sbits64) LIT64( 0x8000000000000000 );
1512     }
1513     else if ( aExp <= 0x7E ) {
1514         if ( aExp | aSig ) float_set_inexact();
1515         return 0;
1516     }
1517     aSig64 = aSig | 0x00800000;
1518     aSig64 <<= 40;
1519     z = aSig64>>( - shiftCount );
1520     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1521         float_set_inexact();
1522     }
1523     if ( aSign ) z = - z;
1524     return z;
1525 
1526 }
1527 #endif /* !SOFTFLOAT_FOR_GCC */
1528 
1529 /*
1530 -------------------------------------------------------------------------------
1531 Returns the result of converting the single-precision floating-point value
1532 `a' to the double-precision floating-point format.  The conversion is
1533 performed according to the IEC/IEEE Standard for Binary Floating-Point
1534 Arithmetic.
1535 -------------------------------------------------------------------------------
1536 */
1537 float64 float32_to_float64( float32 a )
1538 {
1539     flag aSign;
1540     int16 aExp;
1541     bits32 aSig;
1542 
1543     aSig = extractFloat32Frac( a );
1544     aExp = extractFloat32Exp( a );
1545     aSign = extractFloat32Sign( a );
1546     if ( aExp == 0xFF ) {
1547         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1548         return packFloat64( aSign, 0x7FF, 0 );
1549     }
1550     if ( aExp == 0 ) {
1551         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1552         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1553         --aExp;
1554     }
1555     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1556 
1557 }
1558 
1559 #ifdef FLOATX80
1560 
1561 /*
1562 -------------------------------------------------------------------------------
1563 Returns the result of converting the single-precision floating-point value
1564 `a' to the extended double-precision floating-point format.  The conversion
1565 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1566 Arithmetic.
1567 -------------------------------------------------------------------------------
1568 */
1569 floatx80 float32_to_floatx80( float32 a )
1570 {
1571     flag aSign;
1572     int16 aExp;
1573     bits32 aSig;
1574 
1575     aSig = extractFloat32Frac( a );
1576     aExp = extractFloat32Exp( a );
1577     aSign = extractFloat32Sign( a );
1578     if ( aExp == 0xFF ) {
1579         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1580         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1581     }
1582     if ( aExp == 0 ) {
1583         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1584         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1585     }
1586     aSig |= 0x00800000;
1587     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1588 
1589 }
1590 
1591 #endif
1592 
1593 #ifdef FLOAT128
1594 
1595 /*
1596 -------------------------------------------------------------------------------
1597 Returns the result of converting the single-precision floating-point value
1598 `a' to the double-precision floating-point format.  The conversion is
1599 performed according to the IEC/IEEE Standard for Binary Floating-Point
1600 Arithmetic.
1601 -------------------------------------------------------------------------------
1602 */
1603 float128 float32_to_float128( float32 a )
1604 {
1605     flag aSign;
1606     int16 aExp;
1607     bits32 aSig;
1608 
1609     aSig = extractFloat32Frac( a );
1610     aExp = extractFloat32Exp( a );
1611     aSign = extractFloat32Sign( a );
1612     if ( aExp == 0xFF ) {
1613         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1614         return packFloat128( aSign, 0x7FFF, 0, 0 );
1615     }
1616     if ( aExp == 0 ) {
1617         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1618         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1619         --aExp;
1620     }
1621     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1622 
1623 }
1624 
1625 #endif
1626 
1627 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1628 /*
1629 -------------------------------------------------------------------------------
1630 Rounds the single-precision floating-point value `a' to an integer, and
1631 returns the result as a single-precision floating-point value.  The
1632 operation is performed according to the IEC/IEEE Standard for Binary
1633 Floating-Point Arithmetic.
1634 -------------------------------------------------------------------------------
1635 */
1636 float32 float32_round_to_int( float32 a )
1637 {
1638     flag aSign;
1639     int16 aExp;
1640     bits32 lastBitMask, roundBitsMask;
1641     int8 roundingMode;
1642     float32 z;
1643 
1644     aExp = extractFloat32Exp( a );
1645     if ( 0x96 <= aExp ) {
1646         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1647             return propagateFloat32NaN( a, a );
1648         }
1649         return a;
1650     }
1651     if ( aExp <= 0x7E ) {
1652         if ( (bits32) ( a<<1 ) == 0 ) return a;
1653         float_set_inexact();
1654         aSign = extractFloat32Sign( a );
1655         switch ( float_rounding_mode() ) {
1656          case float_round_nearest_even:
1657             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1658                 return packFloat32( aSign, 0x7F, 0 );
1659             }
1660             break;
1661          case float_round_down:
1662             return aSign ? 0xBF800000 : 0;
1663          case float_round_up:
1664             return aSign ? 0x80000000 : 0x3F800000;
1665         }
1666         return packFloat32( aSign, 0, 0 );
1667     }
1668     lastBitMask = 1;
1669     lastBitMask <<= 0x96 - aExp;
1670     roundBitsMask = lastBitMask - 1;
1671     z = a;
1672     roundingMode = float_rounding_mode();
1673     if ( roundingMode == float_round_nearest_even ) {
1674         z += lastBitMask>>1;
1675         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1676     }
1677     else if ( roundingMode != float_round_to_zero ) {
1678         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1679             z += roundBitsMask;
1680         }
1681     }
1682     z &= ~ roundBitsMask;
1683     if ( z != a ) float_set_inexact();
1684     return z;
1685 
1686 }
1687 #endif /* !SOFTFLOAT_FOR_GCC */
1688 
1689 /*
1690 -------------------------------------------------------------------------------
1691 Returns the result of adding the absolute values of the single-precision
1692 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1693 before being returned.  `zSign' is ignored if the result is a NaN.
1694 The addition is performed according to the IEC/IEEE Standard for Binary
1695 Floating-Point Arithmetic.
1696 -------------------------------------------------------------------------------
1697 */
1698 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1699 {
1700     int16 aExp, bExp, zExp;
1701     bits32 aSig, bSig, zSig;
1702     int16 expDiff;
1703 
1704     aSig = extractFloat32Frac( a );
1705     aExp = extractFloat32Exp( a );
1706     bSig = extractFloat32Frac( b );
1707     bExp = extractFloat32Exp( b );
1708     expDiff = aExp - bExp;
1709     aSig <<= 6;
1710     bSig <<= 6;
1711     if ( 0 < expDiff ) {
1712         if ( aExp == 0xFF ) {
1713             if ( aSig ) return propagateFloat32NaN( a, b );
1714             return a;
1715         }
1716         if ( bExp == 0 ) {
1717             --expDiff;
1718         }
1719         else {
1720             bSig |= 0x20000000;
1721         }
1722         shift32RightJamming( bSig, expDiff, &bSig );
1723         zExp = aExp;
1724     }
1725     else if ( expDiff < 0 ) {
1726         if ( bExp == 0xFF ) {
1727             if ( bSig ) return propagateFloat32NaN( a, b );
1728             return packFloat32( zSign, 0xFF, 0 );
1729         }
1730         if ( aExp == 0 ) {
1731             ++expDiff;
1732         }
1733         else {
1734             aSig |= 0x20000000;
1735         }
1736         shift32RightJamming( aSig, - expDiff, &aSig );
1737         zExp = bExp;
1738     }
1739     else {
1740         if ( aExp == 0xFF ) {
1741             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1742             return a;
1743         }
1744         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1745         zSig = 0x40000000 + aSig + bSig;
1746         zExp = aExp;
1747         goto roundAndPack;
1748     }
1749     aSig |= 0x20000000;
1750     zSig = ( aSig + bSig )<<1;
1751     --zExp;
1752     if ( (sbits32) zSig < 0 ) {
1753         zSig = aSig + bSig;
1754         ++zExp;
1755     }
1756  roundAndPack:
1757     return roundAndPackFloat32( zSign, zExp, zSig );
1758 
1759 }
1760 
1761 /*
1762 -------------------------------------------------------------------------------
1763 Returns the result of subtracting the absolute values of the single-
1764 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1765 difference is negated before being returned.  `zSign' is ignored if the
1766 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1767 Standard for Binary Floating-Point Arithmetic.
1768 -------------------------------------------------------------------------------
1769 */
1770 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1771 {
1772     int16 aExp, bExp, zExp;
1773     bits32 aSig, bSig, zSig;
1774     int16 expDiff;
1775 
1776     aSig = extractFloat32Frac( a );
1777     aExp = extractFloat32Exp( a );
1778     bSig = extractFloat32Frac( b );
1779     bExp = extractFloat32Exp( b );
1780     expDiff = aExp - bExp;
1781     aSig <<= 7;
1782     bSig <<= 7;
1783     if ( 0 < expDiff ) goto aExpBigger;
1784     if ( expDiff < 0 ) goto bExpBigger;
1785     if ( aExp == 0xFF ) {
1786         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1787         float_raise( float_flag_invalid );
1788         return float32_default_nan;
1789     }
1790     if ( aExp == 0 ) {
1791         aExp = 1;
1792         bExp = 1;
1793     }
1794     if ( bSig < aSig ) goto aBigger;
1795     if ( aSig < bSig ) goto bBigger;
1796     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1797  bExpBigger:
1798     if ( bExp == 0xFF ) {
1799         if ( bSig ) return propagateFloat32NaN( a, b );
1800         return packFloat32( zSign ^ 1, 0xFF, 0 );
1801     }
1802     if ( aExp == 0 ) {
1803         ++expDiff;
1804     }
1805     else {
1806         aSig |= 0x40000000;
1807     }
1808     shift32RightJamming( aSig, - expDiff, &aSig );
1809     bSig |= 0x40000000;
1810  bBigger:
1811     zSig = bSig - aSig;
1812     zExp = bExp;
1813     zSign ^= 1;
1814     goto normalizeRoundAndPack;
1815  aExpBigger:
1816     if ( aExp == 0xFF ) {
1817         if ( aSig ) return propagateFloat32NaN( a, b );
1818         return a;
1819     }
1820     if ( bExp == 0 ) {
1821         --expDiff;
1822     }
1823     else {
1824         bSig |= 0x40000000;
1825     }
1826     shift32RightJamming( bSig, expDiff, &bSig );
1827     aSig |= 0x40000000;
1828  aBigger:
1829     zSig = aSig - bSig;
1830     zExp = aExp;
1831  normalizeRoundAndPack:
1832     --zExp;
1833     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1834 
1835 }
1836 
1837 /*
1838 -------------------------------------------------------------------------------
1839 Returns the result of adding the single-precision floating-point values `a'
1840 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1841 Binary Floating-Point Arithmetic.
1842 -------------------------------------------------------------------------------
1843 */
1844 float32 float32_add( float32 a, float32 b )
1845 {
1846     flag aSign, bSign;
1847 
1848     aSign = extractFloat32Sign( a );
1849     bSign = extractFloat32Sign( b );
1850     if ( aSign == bSign ) {
1851         return addFloat32Sigs( a, b, aSign );
1852     }
1853     else {
1854         return subFloat32Sigs( a, b, aSign );
1855     }
1856 
1857 }
1858 
1859 /*
1860 -------------------------------------------------------------------------------
1861 Returns the result of subtracting the single-precision floating-point values
1862 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1863 for Binary Floating-Point Arithmetic.
1864 -------------------------------------------------------------------------------
1865 */
1866 float32 float32_sub( float32 a, float32 b )
1867 {
1868     flag aSign, bSign;
1869 
1870     aSign = extractFloat32Sign( a );
1871     bSign = extractFloat32Sign( b );
1872     if ( aSign == bSign ) {
1873         return subFloat32Sigs( a, b, aSign );
1874     }
1875     else {
1876         return addFloat32Sigs( a, b, aSign );
1877     }
1878 
1879 }
1880 
1881 /*
1882 -------------------------------------------------------------------------------
1883 Returns the result of multiplying the single-precision floating-point values
1884 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1885 for Binary Floating-Point Arithmetic.
1886 -------------------------------------------------------------------------------
1887 */
1888 float32 float32_mul( float32 a, float32 b )
1889 {
1890     flag aSign, bSign, zSign;
1891     int16 aExp, bExp, zExp;
1892     bits32 aSig, bSig;
1893     bits64 zSig64;
1894     bits32 zSig;
1895 
1896     aSig = extractFloat32Frac( a );
1897     aExp = extractFloat32Exp( a );
1898     aSign = extractFloat32Sign( a );
1899     bSig = extractFloat32Frac( b );
1900     bExp = extractFloat32Exp( b );
1901     bSign = extractFloat32Sign( b );
1902     zSign = aSign ^ bSign;
1903     if ( aExp == 0xFF ) {
1904         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1905             return propagateFloat32NaN( a, b );
1906         }
1907         if ( ( bExp | bSig ) == 0 ) {
1908             float_raise( float_flag_invalid );
1909             return float32_default_nan;
1910         }
1911         return packFloat32( zSign, 0xFF, 0 );
1912     }
1913     if ( bExp == 0xFF ) {
1914         if ( bSig ) return propagateFloat32NaN( a, b );
1915         if ( ( aExp | aSig ) == 0 ) {
1916             float_raise( float_flag_invalid );
1917             return float32_default_nan;
1918         }
1919         return packFloat32( zSign, 0xFF, 0 );
1920     }
1921     if ( aExp == 0 ) {
1922         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1923         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1924     }
1925     if ( bExp == 0 ) {
1926         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1927         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1928     }
1929     zExp = aExp + bExp - 0x7F;
1930     aSig = ( aSig | 0x00800000 )<<7;
1931     bSig = ( bSig | 0x00800000 )<<8;
1932     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1933     zSig = zSig64;
1934     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1935         zSig <<= 1;
1936         --zExp;
1937     }
1938     return roundAndPackFloat32( zSign, zExp, zSig );
1939 
1940 }
1941 
1942 /*
1943 -------------------------------------------------------------------------------
1944 Returns the result of dividing the single-precision floating-point value `a'
1945 by the corresponding value `b'.  The operation is performed according to the
1946 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1947 -------------------------------------------------------------------------------
1948 */
1949 float32 float32_div( float32 a, float32 b )
1950 {
1951     flag aSign, bSign, zSign;
1952     int16 aExp, bExp, zExp;
1953     bits32 aSig, bSig, zSig;
1954 
1955     aSig = extractFloat32Frac( a );
1956     aExp = extractFloat32Exp( a );
1957     aSign = extractFloat32Sign( a );
1958     bSig = extractFloat32Frac( b );
1959     bExp = extractFloat32Exp( b );
1960     bSign = extractFloat32Sign( b );
1961     zSign = aSign ^ bSign;
1962     if ( aExp == 0xFF ) {
1963         if ( aSig ) return propagateFloat32NaN( a, b );
1964         if ( bExp == 0xFF ) {
1965             if ( bSig ) return propagateFloat32NaN( a, b );
1966             float_raise( float_flag_invalid );
1967             return float32_default_nan;
1968         }
1969         return packFloat32( zSign, 0xFF, 0 );
1970     }
1971     if ( bExp == 0xFF ) {
1972         if ( bSig ) return propagateFloat32NaN( a, b );
1973         return packFloat32( zSign, 0, 0 );
1974     }
1975     if ( bExp == 0 ) {
1976         if ( bSig == 0 ) {
1977             if ( ( aExp | aSig ) == 0 ) {
1978                 float_raise( float_flag_invalid );
1979                 return float32_default_nan;
1980             }
1981             float_raise( float_flag_divbyzero );
1982             return packFloat32( zSign, 0xFF, 0 );
1983         }
1984         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1985     }
1986     if ( aExp == 0 ) {
1987         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1988         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1989     }
1990     zExp = aExp - bExp + 0x7D;
1991     aSig = ( aSig | 0x00800000 )<<7;
1992     bSig = ( bSig | 0x00800000 )<<8;
1993     if ( bSig <= ( aSig + aSig ) ) {
1994         aSig >>= 1;
1995         ++zExp;
1996     }
1997     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1998     if ( ( zSig & 0x3F ) == 0 ) {
1999         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2000     }
2001     return roundAndPackFloat32( zSign, zExp, zSig );
2002 
2003 }
2004 
2005 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2006 /*
2007 -------------------------------------------------------------------------------
2008 Returns the remainder of the single-precision floating-point value `a'
2009 with respect to the corresponding value `b'.  The operation is performed
2010 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2011 -------------------------------------------------------------------------------
2012 */
2013 float32 float32_rem( float32 a, float32 b )
2014 {
2015     flag aSign, bSign __unused, zSign;
2016     int16 aExp, bExp, expDiff;
2017     bits32 aSig, bSig;
2018     bits32 q;
2019     bits64 aSig64, bSig64, q64;
2020     bits32 alternateASig;
2021     sbits32 sigMean;
2022 
2023     aSig = extractFloat32Frac( a );
2024     aExp = extractFloat32Exp( a );
2025     aSign = extractFloat32Sign( a );
2026     bSig = extractFloat32Frac( b );
2027     bExp = extractFloat32Exp( b );
2028     bSign = extractFloat32Sign( b );
2029     if ( aExp == 0xFF ) {
2030         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2031             return propagateFloat32NaN( a, b );
2032         }
2033         float_raise( float_flag_invalid );
2034         return float32_default_nan;
2035     }
2036     if ( bExp == 0xFF ) {
2037         if ( bSig ) return propagateFloat32NaN( a, b );
2038         return a;
2039     }
2040     if ( bExp == 0 ) {
2041         if ( bSig == 0 ) {
2042             float_raise( float_flag_invalid );
2043             return float32_default_nan;
2044         }
2045         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2046     }
2047     if ( aExp == 0 ) {
2048         if ( aSig == 0 ) return a;
2049         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2050     }
2051     expDiff = aExp - bExp;
2052     aSig |= 0x00800000;
2053     bSig |= 0x00800000;
2054     if ( expDiff < 32 ) {
2055         aSig <<= 8;
2056         bSig <<= 8;
2057         if ( expDiff < 0 ) {
2058             if ( expDiff < -1 ) return a;
2059             aSig >>= 1;
2060         }
2061         q = ( bSig <= aSig );
2062         if ( q ) aSig -= bSig;
2063         if ( 0 < expDiff ) {
2064             q = ( ( (bits64) aSig )<<32 ) / bSig;
2065             q >>= 32 - expDiff;
2066             bSig >>= 2;
2067             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2068         }
2069         else {
2070             aSig >>= 2;
2071             bSig >>= 2;
2072         }
2073     }
2074     else {
2075         if ( bSig <= aSig ) aSig -= bSig;
2076         aSig64 = ( (bits64) aSig )<<40;
2077         bSig64 = ( (bits64) bSig )<<40;
2078         expDiff -= 64;
2079         while ( 0 < expDiff ) {
2080             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2081             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2082             aSig64 = - ( ( bSig * q64 )<<38 );
2083             expDiff -= 62;
2084         }
2085         expDiff += 64;
2086         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2087         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2088         q = q64>>( 64 - expDiff );
2089         bSig <<= 6;
2090         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2091     }
2092     do {
2093         alternateASig = aSig;
2094         ++q;
2095         aSig -= bSig;
2096     } while ( 0 <= (sbits32) aSig );
2097     sigMean = aSig + alternateASig;
2098     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2099         aSig = alternateASig;
2100     }
2101     zSign = ( (sbits32) aSig < 0 );
2102     if ( zSign ) aSig = - aSig;
2103     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2104 
2105 }
2106 #endif /* !SOFTFLOAT_FOR_GCC */
2107 
2108 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2109 /*
2110 -------------------------------------------------------------------------------
2111 Returns the square root of the single-precision floating-point value `a'.
2112 The operation is performed according to the IEC/IEEE Standard for Binary
2113 Floating-Point Arithmetic.
2114 -------------------------------------------------------------------------------
2115 */
2116 float32 float32_sqrt( float32 a )
2117 {
2118     flag aSign;
2119     int16 aExp, zExp;
2120     bits32 aSig, zSig;
2121     bits64 rem, term;
2122 
2123     aSig = extractFloat32Frac( a );
2124     aExp = extractFloat32Exp( a );
2125     aSign = extractFloat32Sign( a );
2126     if ( aExp == 0xFF ) {
2127         if ( aSig ) return propagateFloat32NaN( a, 0 );
2128         if ( ! aSign ) return a;
2129         float_raise( float_flag_invalid );
2130         return float32_default_nan;
2131     }
2132     if ( aSign ) {
2133         if ( ( aExp | aSig ) == 0 ) return a;
2134         float_raise( float_flag_invalid );
2135         return float32_default_nan;
2136     }
2137     if ( aExp == 0 ) {
2138         if ( aSig == 0 ) return 0;
2139         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2140     }
2141     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2142     aSig = ( aSig | 0x00800000 )<<8;
2143     zSig = estimateSqrt32( aExp, aSig ) + 2;
2144     if ( ( zSig & 0x7F ) <= 5 ) {
2145         if ( zSig < 2 ) {
2146             zSig = 0x7FFFFFFF;
2147             goto roundAndPack;
2148         }
2149         aSig >>= aExp & 1;
2150         term = ( (bits64) zSig ) * zSig;
2151         rem = ( ( (bits64) aSig )<<32 ) - term;
2152         while ( (sbits64) rem < 0 ) {
2153             --zSig;
2154             rem += ( ( (bits64) zSig )<<1 ) | 1;
2155         }
2156         zSig |= ( rem != 0 );
2157     }
2158     shift32RightJamming( zSig, 1, &zSig );
2159  roundAndPack:
2160     return roundAndPackFloat32( 0, zExp, zSig );
2161 
2162 }
2163 #endif /* !SOFTFLOAT_FOR_GCC */
2164 
2165 /*
2166 -------------------------------------------------------------------------------
2167 Returns 1 if the single-precision floating-point value `a' is equal to
2168 the corresponding value `b', and 0 otherwise.  The comparison is performed
2169 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2170 -------------------------------------------------------------------------------
2171 */
2172 flag float32_eq( float32 a, float32 b )
2173 {
2174 
2175     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2176          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2177        ) {
2178         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2179             float_raise( float_flag_invalid );
2180         }
2181         return 0;
2182     }
2183     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2184 
2185 }
2186 
2187 /*
2188 -------------------------------------------------------------------------------
2189 Returns 1 if the single-precision floating-point value `a' is less than
2190 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2191 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2192 Arithmetic.
2193 -------------------------------------------------------------------------------
2194 */
2195 flag float32_le( float32 a, float32 b )
2196 {
2197     flag aSign, bSign;
2198 
2199     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2200          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2201        ) {
2202         float_raise( float_flag_invalid );
2203         return 0;
2204     }
2205     aSign = extractFloat32Sign( a );
2206     bSign = extractFloat32Sign( b );
2207     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2208     return ( a == b ) || ( aSign ^ ( a < b ) );
2209 
2210 }
2211 
2212 /*
2213 -------------------------------------------------------------------------------
2214 Returns 1 if the single-precision floating-point value `a' is less than
2215 the corresponding value `b', and 0 otherwise.  The comparison is performed
2216 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2217 -------------------------------------------------------------------------------
2218 */
2219 flag float32_lt( float32 a, float32 b )
2220 {
2221     flag aSign, bSign;
2222 
2223     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2224          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2225        ) {
2226         float_raise( float_flag_invalid );
2227         return 0;
2228     }
2229     aSign = extractFloat32Sign( a );
2230     bSign = extractFloat32Sign( b );
2231     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2232     return ( a != b ) && ( aSign ^ ( a < b ) );
2233 
2234 }
2235 
2236 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2237 /*
2238 -------------------------------------------------------------------------------
2239 Returns 1 if the single-precision floating-point value `a' is equal to
2240 the corresponding value `b', and 0 otherwise.  The invalid exception is
2241 raised if either operand is a NaN.  Otherwise, the comparison is performed
2242 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2243 -------------------------------------------------------------------------------
2244 */
2245 flag float32_eq_signaling( float32 a, float32 b )
2246 {
2247 
2248     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2249          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2250        ) {
2251         float_raise( float_flag_invalid );
2252         return 0;
2253     }
2254     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2255 
2256 }
2257 
2258 /*
2259 -------------------------------------------------------------------------------
2260 Returns 1 if the single-precision floating-point value `a' is less than or
2261 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2262 cause an exception.  Otherwise, the comparison is performed according to the
2263 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2264 -------------------------------------------------------------------------------
2265 */
2266 flag float32_le_quiet( float32 a, float32 b )
2267 {
2268     flag aSign, bSign;
2269 
2270     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2271          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2272        ) {
2273         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2274             float_raise( float_flag_invalid );
2275         }
2276         return 0;
2277     }
2278     aSign = extractFloat32Sign( a );
2279     bSign = extractFloat32Sign( b );
2280     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2281     return ( a == b ) || ( aSign ^ ( a < b ) );
2282 
2283 }
2284 
2285 /*
2286 -------------------------------------------------------------------------------
2287 Returns 1 if the single-precision floating-point value `a' is less than
2288 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2289 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2290 Standard for Binary Floating-Point Arithmetic.
2291 -------------------------------------------------------------------------------
2292 */
2293 flag float32_lt_quiet( float32 a, float32 b )
2294 {
2295     flag aSign, bSign;
2296 
2297     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2298          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2299        ) {
2300         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2301             float_raise( float_flag_invalid );
2302         }
2303         return 0;
2304     }
2305     aSign = extractFloat32Sign( a );
2306     bSign = extractFloat32Sign( b );
2307     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2308     return ( a != b ) && ( aSign ^ ( a < b ) );
2309 
2310 }
2311 #endif /* !SOFTFLOAT_FOR_GCC */
2312 
2313 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2314 /*
2315 -------------------------------------------------------------------------------
2316 Returns the result of converting the double-precision floating-point value
2317 `a' to the 32-bit two's complement integer format.  The conversion is
2318 performed according to the IEC/IEEE Standard for Binary Floating-Point
2319 Arithmetic---which means in particular that the conversion is rounded
2320 according to the current rounding mode.  If `a' is a NaN, the largest
2321 positive integer is returned.  Otherwise, if the conversion overflows, the
2322 largest integer with the same sign as `a' is returned.
2323 -------------------------------------------------------------------------------
2324 */
2325 int32 float64_to_int32( float64 a )
2326 {
2327     flag aSign;
2328     int16 aExp, shiftCount;
2329     bits64 aSig;
2330 
2331     aSig = extractFloat64Frac( a );
2332     aExp = extractFloat64Exp( a );
2333     aSign = extractFloat64Sign( a );
2334     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2335     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2336     shiftCount = 0x42C - aExp;
2337     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2338     return roundAndPackInt32( aSign, aSig );
2339 
2340 }
2341 #endif /* !SOFTFLOAT_FOR_GCC */
2342 
2343 /*
2344 -------------------------------------------------------------------------------
2345 Returns the result of converting the double-precision floating-point value
2346 `a' to the 32-bit two's complement integer format.  The conversion is
2347 performed according to the IEC/IEEE Standard for Binary Floating-Point
2348 Arithmetic, except that the conversion is always rounded toward zero.
2349 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2350 the conversion overflows, the largest integer with the same sign as `a' is
2351 returned.
2352 -------------------------------------------------------------------------------
2353 */
2354 int32 float64_to_int32_round_to_zero( float64 a )
2355 {
2356     flag aSign;
2357     int16 aExp, shiftCount;
2358     bits64 aSig, savedASig;
2359     int32 z;
2360 
2361     aSig = extractFloat64Frac( a );
2362     aExp = extractFloat64Exp( a );
2363     aSign = extractFloat64Sign( a );
2364     if ( 0x41E < aExp ) {
2365         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2366         goto invalid;
2367     }
2368     else if ( aExp < 0x3FF ) {
2369         if ( aExp || aSig ) float_set_inexact();
2370         return 0;
2371     }
2372     aSig |= LIT64( 0x0010000000000000 );
2373     shiftCount = 0x433 - aExp;
2374     savedASig = aSig;
2375     aSig >>= shiftCount;
2376     z = aSig;
2377     if ( aSign ) z = - z;
2378     if ( ( z < 0 ) ^ aSign ) {
2379  invalid:
2380         float_raise( float_flag_invalid );
2381         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2382     }
2383     if ( ( aSig<<shiftCount ) != savedASig ) {
2384         float_set_inexact();
2385     }
2386     return z;
2387 
2388 }
2389 
2390 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2391 /*
2392 -------------------------------------------------------------------------------
2393 Returns the result of converting the double-precision floating-point value
2394 `a' to the 64-bit two's complement integer format.  The conversion is
2395 performed according to the IEC/IEEE Standard for Binary Floating-Point
2396 Arithmetic---which means in particular that the conversion is rounded
2397 according to the current rounding mode.  If `a' is a NaN, the largest
2398 positive integer is returned.  Otherwise, if the conversion overflows, the
2399 largest integer with the same sign as `a' is returned.
2400 -------------------------------------------------------------------------------
2401 */
2402 int64 float64_to_int64( float64 a )
2403 {
2404     flag aSign;
2405     int16 aExp, shiftCount;
2406     bits64 aSig, aSigExtra;
2407 
2408     aSig = extractFloat64Frac( a );
2409     aExp = extractFloat64Exp( a );
2410     aSign = extractFloat64Sign( a );
2411     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2412     shiftCount = 0x433 - aExp;
2413     if ( shiftCount <= 0 ) {
2414         if ( 0x43E < aExp ) {
2415             float_raise( float_flag_invalid );
2416             if (    ! aSign
2417                  || (    ( aExp == 0x7FF )
2418                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2419                ) {
2420                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2421             }
2422             return (sbits64) LIT64( 0x8000000000000000 );
2423         }
2424         aSigExtra = 0;
2425         aSig <<= - shiftCount;
2426     }
2427     else {
2428         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2429     }
2430     return roundAndPackInt64( aSign, aSig, aSigExtra );
2431 
2432 }
2433 
2434 /* like above, but result is unsigned */
2435 uint64 float64_to_uint64( float64 a )
2436 {
2437     flag aSign;
2438     int16 aExp, shiftCount;
2439     bits64 aSig, aSigExtra;
2440 
2441     aSig = extractFloat64Frac( a );
2442     aExp = extractFloat64Exp( a );
2443     aSign = extractFloat64Sign( a );
2444 
2445     if (aSign) {
2446 	return float64_to_int64(a);
2447     }
2448 
2449     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2450     shiftCount = 0x433 - aExp;
2451     if ( shiftCount <= 0 ) {
2452         if ( 0x43E < aExp ) {
2453             float_raise( float_flag_invalid );
2454             if (    ! aSign
2455                  || (    ( aExp == 0x7FF )
2456                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2457                ) {
2458                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2459             }
2460             return (sbits64) LIT64( 0x8000000000000000 );
2461         }
2462         aSigExtra = 0;
2463         aSig <<= - shiftCount;
2464     }
2465     else {
2466         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2467     }
2468     return roundAndPackUInt64( aSig, aSigExtra );
2469 
2470 }
2471 
2472 /*
2473 -------------------------------------------------------------------------------
2474 Returns the result of converting the double-precision floating-point value
2475 `a' to the 64-bit two's complement integer format.  The conversion is
2476 performed according to the IEC/IEEE Standard for Binary Floating-Point
2477 Arithmetic, except that the conversion is always rounded toward zero.
2478 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2479 the conversion overflows, the largest integer with the same sign as `a' is
2480 returned.
2481 -------------------------------------------------------------------------------
2482 */
2483 int64 float64_to_int64_round_to_zero( float64 a )
2484 {
2485     flag aSign;
2486     int16 aExp, shiftCount;
2487     bits64 aSig;
2488     int64 z;
2489 
2490     aSig = extractFloat64Frac( a );
2491     aExp = extractFloat64Exp( a );
2492     aSign = extractFloat64Sign( a );
2493     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2494     shiftCount = aExp - 0x433;
2495     if ( 0 <= shiftCount ) {
2496         if ( 0x43E <= aExp ) {
2497             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2498                 float_raise( float_flag_invalid );
2499                 if (    ! aSign
2500                      || (    ( aExp == 0x7FF )
2501                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2502                    ) {
2503                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2504                 }
2505             }
2506             return (sbits64) LIT64( 0x8000000000000000 );
2507         }
2508         z = aSig<<shiftCount;
2509     }
2510     else {
2511         if ( aExp < 0x3FE ) {
2512             if ( aExp | aSig ) float_set_inexact();
2513             return 0;
2514         }
2515         z = aSig>>( - shiftCount );
2516         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2517             float_set_inexact();
2518         }
2519     }
2520     if ( aSign ) z = - z;
2521     return z;
2522 
2523 }
2524 #endif /* !SOFTFLOAT_FOR_GCC */
2525 
2526 /*
2527 -------------------------------------------------------------------------------
2528 Returns the result of converting the double-precision floating-point value
2529 `a' to the single-precision floating-point format.  The conversion is
2530 performed according to the IEC/IEEE Standard for Binary Floating-Point
2531 Arithmetic.
2532 -------------------------------------------------------------------------------
2533 */
2534 float32 float64_to_float32( float64 a )
2535 {
2536     flag aSign;
2537     int16 aExp;
2538     bits64 aSig;
2539     bits32 zSig;
2540 
2541     aSig = extractFloat64Frac( a );
2542     aExp = extractFloat64Exp( a );
2543     aSign = extractFloat64Sign( a );
2544     if ( aExp == 0x7FF ) {
2545         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2546         return packFloat32( aSign, 0xFF, 0 );
2547     }
2548     shift64RightJamming( aSig, 22, &aSig );
2549     zSig = aSig;
2550     if ( aExp || zSig ) {
2551         zSig |= 0x40000000;
2552         aExp -= 0x381;
2553     }
2554     return roundAndPackFloat32( aSign, aExp, zSig );
2555 
2556 }
2557 
2558 #ifdef FLOATX80
2559 
2560 /*
2561 -------------------------------------------------------------------------------
2562 Returns the result of converting the double-precision floating-point value
2563 `a' to the extended double-precision floating-point format.  The conversion
2564 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2565 Arithmetic.
2566 -------------------------------------------------------------------------------
2567 */
2568 floatx80 float64_to_floatx80( float64 a )
2569 {
2570     flag aSign;
2571     int16 aExp;
2572     bits64 aSig;
2573 
2574     aSig = extractFloat64Frac( a );
2575     aExp = extractFloat64Exp( a );
2576     aSign = extractFloat64Sign( a );
2577     if ( aExp == 0x7FF ) {
2578         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2579         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2580     }
2581     if ( aExp == 0 ) {
2582         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2583         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2584     }
2585     return
2586         packFloatx80(
2587             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2588 
2589 }
2590 
2591 #endif
2592 
2593 #ifdef FLOAT128
2594 
2595 /*
2596 -------------------------------------------------------------------------------
2597 Returns the result of converting the double-precision floating-point value
2598 `a' to the quadruple-precision floating-point format.  The conversion is
2599 performed according to the IEC/IEEE Standard for Binary Floating-Point
2600 Arithmetic.
2601 -------------------------------------------------------------------------------
2602 */
2603 float128 float64_to_float128( float64 a )
2604 {
2605     flag aSign;
2606     int16 aExp;
2607     bits64 aSig, zSig0, zSig1;
2608 
2609     aSig = extractFloat64Frac( a );
2610     aExp = extractFloat64Exp( a );
2611     aSign = extractFloat64Sign( a );
2612     if ( aExp == 0x7FF ) {
2613         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2614         return packFloat128( aSign, 0x7FFF, 0, 0 );
2615     }
2616     if ( aExp == 0 ) {
2617         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2618         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2619         --aExp;
2620     }
2621     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2622     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2623 
2624 }
2625 
2626 #endif
2627 
2628 #ifndef SOFTFLOAT_FOR_GCC
2629 /*
2630 -------------------------------------------------------------------------------
2631 Rounds the double-precision floating-point value `a' to an integer, and
2632 returns the result as a double-precision floating-point value.  The
2633 operation is performed according to the IEC/IEEE Standard for Binary
2634 Floating-Point Arithmetic.
2635 -------------------------------------------------------------------------------
2636 */
2637 float64 float64_round_to_int( float64 a )
2638 {
2639     flag aSign;
2640     int16 aExp;
2641     bits64 lastBitMask, roundBitsMask;
2642     int8 roundingMode;
2643     float64 z;
2644 
2645     aExp = extractFloat64Exp( a );
2646     if ( 0x433 <= aExp ) {
2647         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2648             return propagateFloat64NaN( a, a );
2649         }
2650         return a;
2651     }
2652     if ( aExp < 0x3FF ) {
2653         if ( (bits64) ( a<<1 ) == 0 ) return a;
2654         float_set_inexact();
2655         aSign = extractFloat64Sign( a );
2656         switch ( float_rounding_mode() ) {
2657          case float_round_nearest_even:
2658             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2659                 return packFloat64( aSign, 0x3FF, 0 );
2660             }
2661             break;
2662          case float_round_down:
2663             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2664          case float_round_up:
2665             return
2666             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2667         }
2668         return packFloat64( aSign, 0, 0 );
2669     }
2670     lastBitMask = 1;
2671     lastBitMask <<= 0x433 - aExp;
2672     roundBitsMask = lastBitMask - 1;
2673     z = a;
2674     roundingMode = float_rounding_mode();
2675     if ( roundingMode == float_round_nearest_even ) {
2676         z += lastBitMask>>1;
2677         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2678     }
2679     else if ( roundingMode != float_round_to_zero ) {
2680         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2681             z += roundBitsMask;
2682         }
2683     }
2684     z &= ~ roundBitsMask;
2685     if ( z != a ) float_set_inexact();
2686     return z;
2687 
2688 }
2689 #endif
2690 
2691 /*
2692 -------------------------------------------------------------------------------
2693 Returns the result of adding the absolute values of the double-precision
2694 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2695 before being returned.  `zSign' is ignored if the result is a NaN.
2696 The addition is performed according to the IEC/IEEE Standard for Binary
2697 Floating-Point Arithmetic.
2698 -------------------------------------------------------------------------------
2699 */
2700 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2701 {
2702     int16 aExp, bExp, zExp;
2703     bits64 aSig, bSig, zSig;
2704     int16 expDiff;
2705 
2706     aSig = extractFloat64Frac( a );
2707     aExp = extractFloat64Exp( a );
2708     bSig = extractFloat64Frac( b );
2709     bExp = extractFloat64Exp( b );
2710     expDiff = aExp - bExp;
2711     aSig <<= 9;
2712     bSig <<= 9;
2713     if ( 0 < expDiff ) {
2714         if ( aExp == 0x7FF ) {
2715             if ( aSig ) return propagateFloat64NaN( a, b );
2716             return a;
2717         }
2718         if ( bExp == 0 ) {
2719             --expDiff;
2720         }
2721         else {
2722             bSig |= LIT64( 0x2000000000000000 );
2723         }
2724         shift64RightJamming( bSig, expDiff, &bSig );
2725         zExp = aExp;
2726     }
2727     else if ( expDiff < 0 ) {
2728         if ( bExp == 0x7FF ) {
2729             if ( bSig ) return propagateFloat64NaN( a, b );
2730             return packFloat64( zSign, 0x7FF, 0 );
2731         }
2732         if ( aExp == 0 ) {
2733             ++expDiff;
2734         }
2735         else {
2736             aSig |= LIT64( 0x2000000000000000 );
2737         }
2738         shift64RightJamming( aSig, - expDiff, &aSig );
2739         zExp = bExp;
2740     }
2741     else {
2742         if ( aExp == 0x7FF ) {
2743             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2744             return a;
2745         }
2746         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2747         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2748         zExp = aExp;
2749         goto roundAndPack;
2750     }
2751     aSig |= LIT64( 0x2000000000000000 );
2752     zSig = ( aSig + bSig )<<1;
2753     --zExp;
2754     if ( (sbits64) zSig < 0 ) {
2755         zSig = aSig + bSig;
2756         ++zExp;
2757     }
2758  roundAndPack:
2759     return roundAndPackFloat64( zSign, zExp, zSig );
2760 
2761 }
2762 
2763 /*
2764 -------------------------------------------------------------------------------
2765 Returns the result of subtracting the absolute values of the double-
2766 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2767 difference is negated before being returned.  `zSign' is ignored if the
2768 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2769 Standard for Binary Floating-Point Arithmetic.
2770 -------------------------------------------------------------------------------
2771 */
2772 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2773 {
2774     int16 aExp, bExp, zExp;
2775     bits64 aSig, bSig, zSig;
2776     int16 expDiff;
2777 
2778     aSig = extractFloat64Frac( a );
2779     aExp = extractFloat64Exp( a );
2780     bSig = extractFloat64Frac( b );
2781     bExp = extractFloat64Exp( b );
2782     expDiff = aExp - bExp;
2783     aSig <<= 10;
2784     bSig <<= 10;
2785     if ( 0 < expDiff ) goto aExpBigger;
2786     if ( expDiff < 0 ) goto bExpBigger;
2787     if ( aExp == 0x7FF ) {
2788         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2789         float_raise( float_flag_invalid );
2790         return float64_default_nan;
2791     }
2792     if ( aExp == 0 ) {
2793         aExp = 1;
2794         bExp = 1;
2795     }
2796     if ( bSig < aSig ) goto aBigger;
2797     if ( aSig < bSig ) goto bBigger;
2798     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2799  bExpBigger:
2800     if ( bExp == 0x7FF ) {
2801         if ( bSig ) return propagateFloat64NaN( a, b );
2802         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2803     }
2804     if ( aExp == 0 ) {
2805         ++expDiff;
2806     }
2807     else {
2808         aSig |= LIT64( 0x4000000000000000 );
2809     }
2810     shift64RightJamming( aSig, - expDiff, &aSig );
2811     bSig |= LIT64( 0x4000000000000000 );
2812  bBigger:
2813     zSig = bSig - aSig;
2814     zExp = bExp;
2815     zSign ^= 1;
2816     goto normalizeRoundAndPack;
2817  aExpBigger:
2818     if ( aExp == 0x7FF ) {
2819         if ( aSig ) return propagateFloat64NaN( a, b );
2820         return a;
2821     }
2822     if ( bExp == 0 ) {
2823         --expDiff;
2824     }
2825     else {
2826         bSig |= LIT64( 0x4000000000000000 );
2827     }
2828     shift64RightJamming( bSig, expDiff, &bSig );
2829     aSig |= LIT64( 0x4000000000000000 );
2830  aBigger:
2831     zSig = aSig - bSig;
2832     zExp = aExp;
2833  normalizeRoundAndPack:
2834     --zExp;
2835     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2836 
2837 }
2838 
2839 /*
2840 -------------------------------------------------------------------------------
2841 Returns the result of adding the double-precision floating-point values `a'
2842 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2843 Binary Floating-Point Arithmetic.
2844 -------------------------------------------------------------------------------
2845 */
2846 float64 float64_add( float64 a, float64 b )
2847 {
2848     flag aSign, bSign;
2849 
2850     aSign = extractFloat64Sign( a );
2851     bSign = extractFloat64Sign( b );
2852     if ( aSign == bSign ) {
2853         return addFloat64Sigs( a, b, aSign );
2854     }
2855     else {
2856         return subFloat64Sigs( a, b, aSign );
2857     }
2858 
2859 }
2860 
2861 /*
2862 -------------------------------------------------------------------------------
2863 Returns the result of subtracting the double-precision floating-point values
2864 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2865 for Binary Floating-Point Arithmetic.
2866 -------------------------------------------------------------------------------
2867 */
2868 float64 float64_sub( float64 a, float64 b )
2869 {
2870     flag aSign, bSign;
2871 
2872     aSign = extractFloat64Sign( a );
2873     bSign = extractFloat64Sign( b );
2874     if ( aSign == bSign ) {
2875         return subFloat64Sigs( a, b, aSign );
2876     }
2877     else {
2878         return addFloat64Sigs( a, b, aSign );
2879     }
2880 
2881 }
2882 
2883 /*
2884 -------------------------------------------------------------------------------
2885 Returns the result of multiplying the double-precision floating-point values
2886 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2887 for Binary Floating-Point Arithmetic.
2888 -------------------------------------------------------------------------------
2889 */
2890 float64 float64_mul( float64 a, float64 b )
2891 {
2892     flag aSign, bSign, zSign;
2893     int16 aExp, bExp, zExp;
2894     bits64 aSig, bSig, zSig0, zSig1;
2895 
2896     aSig = extractFloat64Frac( a );
2897     aExp = extractFloat64Exp( a );
2898     aSign = extractFloat64Sign( a );
2899     bSig = extractFloat64Frac( b );
2900     bExp = extractFloat64Exp( b );
2901     bSign = extractFloat64Sign( b );
2902     zSign = aSign ^ bSign;
2903     if ( aExp == 0x7FF ) {
2904         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2905             return propagateFloat64NaN( a, b );
2906         }
2907         if ( ( bExp | bSig ) == 0 ) {
2908             float_raise( float_flag_invalid );
2909             return float64_default_nan;
2910         }
2911         return packFloat64( zSign, 0x7FF, 0 );
2912     }
2913     if ( bExp == 0x7FF ) {
2914         if ( bSig ) return propagateFloat64NaN( a, b );
2915         if ( ( aExp | aSig ) == 0 ) {
2916             float_raise( float_flag_invalid );
2917             return float64_default_nan;
2918         }
2919         return packFloat64( zSign, 0x7FF, 0 );
2920     }
2921     if ( aExp == 0 ) {
2922         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2923         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2924     }
2925     if ( bExp == 0 ) {
2926         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2927         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2928     }
2929     zExp = aExp + bExp - 0x3FF;
2930     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2931     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2932     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2933     zSig0 |= ( zSig1 != 0 );
2934     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2935         zSig0 <<= 1;
2936         --zExp;
2937     }
2938     return roundAndPackFloat64( zSign, zExp, zSig0 );
2939 
2940 }
2941 
2942 /*
2943 -------------------------------------------------------------------------------
2944 Returns the result of dividing the double-precision floating-point value `a'
2945 by the corresponding value `b'.  The operation is performed according to
2946 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2947 -------------------------------------------------------------------------------
2948 */
2949 float64 float64_div( float64 a, float64 b )
2950 {
2951     flag aSign, bSign, zSign;
2952     int16 aExp, bExp, zExp;
2953     bits64 aSig, bSig, zSig;
2954     bits64 rem0, rem1;
2955     bits64 term0, term1;
2956 
2957     aSig = extractFloat64Frac( a );
2958     aExp = extractFloat64Exp( a );
2959     aSign = extractFloat64Sign( a );
2960     bSig = extractFloat64Frac( b );
2961     bExp = extractFloat64Exp( b );
2962     bSign = extractFloat64Sign( b );
2963     zSign = aSign ^ bSign;
2964     if ( aExp == 0x7FF ) {
2965         if ( aSig ) return propagateFloat64NaN( a, b );
2966         if ( bExp == 0x7FF ) {
2967             if ( bSig ) return propagateFloat64NaN( a, b );
2968             float_raise( float_flag_invalid );
2969             return float64_default_nan;
2970         }
2971         return packFloat64( zSign, 0x7FF, 0 );
2972     }
2973     if ( bExp == 0x7FF ) {
2974         if ( bSig ) return propagateFloat64NaN( a, b );
2975         return packFloat64( zSign, 0, 0 );
2976     }
2977     if ( bExp == 0 ) {
2978         if ( bSig == 0 ) {
2979             if ( ( aExp | aSig ) == 0 ) {
2980                 float_raise( float_flag_invalid );
2981                 return float64_default_nan;
2982             }
2983             float_raise( float_flag_divbyzero );
2984             return packFloat64( zSign, 0x7FF, 0 );
2985         }
2986         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2987     }
2988     if ( aExp == 0 ) {
2989         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2990         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2991     }
2992     zExp = aExp - bExp + 0x3FD;
2993     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2994     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2995     if ( bSig <= ( aSig + aSig ) ) {
2996         aSig >>= 1;
2997         ++zExp;
2998     }
2999     zSig = estimateDiv128To64( aSig, 0, bSig );
3000     if ( ( zSig & 0x1FF ) <= 2 ) {
3001         mul64To128( bSig, zSig, &term0, &term1 );
3002         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3003         while ( (sbits64) rem0 < 0 ) {
3004             --zSig;
3005             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3006         }
3007         zSig |= ( rem1 != 0 );
3008     }
3009     return roundAndPackFloat64( zSign, zExp, zSig );
3010 
3011 }
3012 
3013 #ifndef SOFTFLOAT_FOR_GCC
3014 /*
3015 -------------------------------------------------------------------------------
3016 Returns the remainder of the double-precision floating-point value `a'
3017 with respect to the corresponding value `b'.  The operation is performed
3018 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3019 -------------------------------------------------------------------------------
3020 */
3021 float64 float64_rem( float64 a, float64 b )
3022 {
3023     flag aSign, bSign __unused, zSign;
3024     int16 aExp, bExp, expDiff;
3025     bits64 aSig, bSig;
3026     bits64 q, alternateASig;
3027     sbits64 sigMean;
3028 
3029     aSig = extractFloat64Frac( a );
3030     aExp = extractFloat64Exp( a );
3031     aSign = extractFloat64Sign( a );
3032     bSig = extractFloat64Frac( b );
3033     bExp = extractFloat64Exp( b );
3034     bSign = extractFloat64Sign( b );
3035     if ( aExp == 0x7FF ) {
3036         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3037             return propagateFloat64NaN( a, b );
3038         }
3039         float_raise( float_flag_invalid );
3040         return float64_default_nan;
3041     }
3042     if ( bExp == 0x7FF ) {
3043         if ( bSig ) return propagateFloat64NaN( a, b );
3044         return a;
3045     }
3046     if ( bExp == 0 ) {
3047         if ( bSig == 0 ) {
3048             float_raise( float_flag_invalid );
3049             return float64_default_nan;
3050         }
3051         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3052     }
3053     if ( aExp == 0 ) {
3054         if ( aSig == 0 ) return a;
3055         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3056     }
3057     expDiff = aExp - bExp;
3058     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3059     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3060     if ( expDiff < 0 ) {
3061         if ( expDiff < -1 ) return a;
3062         aSig >>= 1;
3063     }
3064     q = ( bSig <= aSig );
3065     if ( q ) aSig -= bSig;
3066     expDiff -= 64;
3067     while ( 0 < expDiff ) {
3068         q = estimateDiv128To64( aSig, 0, bSig );
3069         q = ( 2 < q ) ? q - 2 : 0;
3070         aSig = - ( ( bSig>>2 ) * q );
3071         expDiff -= 62;
3072     }
3073     expDiff += 64;
3074     if ( 0 < expDiff ) {
3075         q = estimateDiv128To64( aSig, 0, bSig );
3076         q = ( 2 < q ) ? q - 2 : 0;
3077         q >>= 64 - expDiff;
3078         bSig >>= 2;
3079         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3080     }
3081     else {
3082         aSig >>= 2;
3083         bSig >>= 2;
3084     }
3085     do {
3086         alternateASig = aSig;
3087         ++q;
3088         aSig -= bSig;
3089     } while ( 0 <= (sbits64) aSig );
3090     sigMean = aSig + alternateASig;
3091     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3092         aSig = alternateASig;
3093     }
3094     zSign = ( (sbits64) aSig < 0 );
3095     if ( zSign ) aSig = - aSig;
3096     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3097 
3098 }
3099 
3100 /*
3101 -------------------------------------------------------------------------------
3102 Returns the square root of the double-precision floating-point value `a'.
3103 The operation is performed according to the IEC/IEEE Standard for Binary
3104 Floating-Point Arithmetic.
3105 -------------------------------------------------------------------------------
3106 */
3107 float64 float64_sqrt( float64 a )
3108 {
3109     flag aSign;
3110     int16 aExp, zExp;
3111     bits64 aSig, zSig, doubleZSig;
3112     bits64 rem0, rem1, term0, term1;
3113 
3114     aSig = extractFloat64Frac( a );
3115     aExp = extractFloat64Exp( a );
3116     aSign = extractFloat64Sign( a );
3117     if ( aExp == 0x7FF ) {
3118         if ( aSig ) return propagateFloat64NaN( a, a );
3119         if ( ! aSign ) return a;
3120         float_raise( float_flag_invalid );
3121         return float64_default_nan;
3122     }
3123     if ( aSign ) {
3124         if ( ( aExp | aSig ) == 0 ) return a;
3125         float_raise( float_flag_invalid );
3126         return float64_default_nan;
3127     }
3128     if ( aExp == 0 ) {
3129         if ( aSig == 0 ) return 0;
3130         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3131     }
3132     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3133     aSig |= LIT64( 0x0010000000000000 );
3134     zSig = estimateSqrt32( aExp, aSig>>21 );
3135     aSig <<= 9 - ( aExp & 1 );
3136     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3137     if ( ( zSig & 0x1FF ) <= 5 ) {
3138         doubleZSig = zSig<<1;
3139         mul64To128( zSig, zSig, &term0, &term1 );
3140         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3141         while ( (sbits64) rem0 < 0 ) {
3142             --zSig;
3143             doubleZSig -= 2;
3144             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3145         }
3146         zSig |= ( ( rem0 | rem1 ) != 0 );
3147     }
3148     return roundAndPackFloat64( 0, zExp, zSig );
3149 
3150 }
3151 #endif
3152 
3153 /*
3154 -------------------------------------------------------------------------------
3155 Returns 1 if the double-precision floating-point value `a' is equal to the
3156 corresponding value `b', and 0 otherwise.  The comparison is performed
3157 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3158 -------------------------------------------------------------------------------
3159 */
3160 flag float64_eq( float64 a, float64 b )
3161 {
3162 
3163     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3164          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3165        ) {
3166         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3167             float_raise( float_flag_invalid );
3168         }
3169         return 0;
3170     }
3171     return ( a == b ) ||
3172 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3173 
3174 }
3175 
3176 /*
3177 -------------------------------------------------------------------------------
3178 Returns 1 if the double-precision floating-point value `a' is less than or
3179 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3180 performed according to the IEC/IEEE Standard for Binary Floating-Point
3181 Arithmetic.
3182 -------------------------------------------------------------------------------
3183 */
3184 flag float64_le( float64 a, float64 b )
3185 {
3186     flag aSign, bSign;
3187 
3188     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3189          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3190        ) {
3191         float_raise( float_flag_invalid );
3192         return 0;
3193     }
3194     aSign = extractFloat64Sign( a );
3195     bSign = extractFloat64Sign( b );
3196     if ( aSign != bSign )
3197 	return aSign ||
3198 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3199 	      0 );
3200     return ( a == b ) ||
3201 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3202 
3203 }
3204 
3205 /*
3206 -------------------------------------------------------------------------------
3207 Returns 1 if the double-precision floating-point value `a' is less than
3208 the corresponding value `b', and 0 otherwise.  The comparison is performed
3209 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3210 -------------------------------------------------------------------------------
3211 */
3212 flag float64_lt( float64 a, float64 b )
3213 {
3214     flag aSign, bSign;
3215 
3216     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3217          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3218        ) {
3219         float_raise( float_flag_invalid );
3220         return 0;
3221     }
3222     aSign = extractFloat64Sign( a );
3223     bSign = extractFloat64Sign( b );
3224     if ( aSign != bSign )
3225 	return aSign &&
3226 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3227 	      0 );
3228     return ( a != b ) &&
3229 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3230 
3231 }
3232 
3233 #ifndef SOFTFLOAT_FOR_GCC
3234 /*
3235 -------------------------------------------------------------------------------
3236 Returns 1 if the double-precision floating-point value `a' is equal to the
3237 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3238 if either operand is a NaN.  Otherwise, the comparison is performed
3239 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3240 -------------------------------------------------------------------------------
3241 */
3242 flag float64_eq_signaling( float64 a, float64 b )
3243 {
3244 
3245     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3246          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3247        ) {
3248         float_raise( float_flag_invalid );
3249         return 0;
3250     }
3251     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3252 
3253 }
3254 
3255 /*
3256 -------------------------------------------------------------------------------
3257 Returns 1 if the double-precision floating-point value `a' is less than or
3258 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3259 cause an exception.  Otherwise, the comparison is performed according to the
3260 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3261 -------------------------------------------------------------------------------
3262 */
3263 flag float64_le_quiet( float64 a, float64 b )
3264 {
3265     flag aSign, bSign;
3266 
3267     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3268          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3269        ) {
3270         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3271             float_raise( float_flag_invalid );
3272         }
3273         return 0;
3274     }
3275     aSign = extractFloat64Sign( a );
3276     bSign = extractFloat64Sign( b );
3277     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3278     return ( a == b ) || ( aSign ^ ( a < b ) );
3279 
3280 }
3281 
3282 /*
3283 -------------------------------------------------------------------------------
3284 Returns 1 if the double-precision floating-point value `a' is less than
3285 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3286 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3287 Standard for Binary Floating-Point Arithmetic.
3288 -------------------------------------------------------------------------------
3289 */
3290 flag float64_lt_quiet( float64 a, float64 b )
3291 {
3292     flag aSign, bSign;
3293 
3294     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3295          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3296        ) {
3297         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3298             float_raise( float_flag_invalid );
3299         }
3300         return 0;
3301     }
3302     aSign = extractFloat64Sign( a );
3303     bSign = extractFloat64Sign( b );
3304     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3305     return ( a != b ) && ( aSign ^ ( a < b ) );
3306 
3307 }
3308 #endif
3309 
3310 #ifdef FLOATX80
3311 
3312 /*
3313 -------------------------------------------------------------------------------
3314 Returns the result of converting the extended double-precision floating-
3315 point value `a' to the 32-bit two's complement integer format.  The
3316 conversion is performed according to the IEC/IEEE Standard for Binary
3317 Floating-Point Arithmetic---which means in particular that the conversion
3318 is rounded according to the current rounding mode.  If `a' is a NaN, the
3319 largest positive integer is returned.  Otherwise, if the conversion
3320 overflows, the largest integer with the same sign as `a' is returned.
3321 -------------------------------------------------------------------------------
3322 */
3323 int32 floatx80_to_int32( floatx80 a )
3324 {
3325     flag aSign;
3326     int32 aExp, shiftCount;
3327     bits64 aSig;
3328 
3329     aSig = extractFloatx80Frac( a );
3330     aExp = extractFloatx80Exp( a );
3331     aSign = extractFloatx80Sign( a );
3332     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3333     shiftCount = 0x4037 - aExp;
3334     if ( shiftCount <= 0 ) shiftCount = 1;
3335     shift64RightJamming( aSig, shiftCount, &aSig );
3336     return roundAndPackInt32( aSign, aSig );
3337 
3338 }
3339 
3340 /*
3341 -------------------------------------------------------------------------------
3342 Returns the result of converting the extended double-precision floating-
3343 point value `a' to the 32-bit two's complement integer format.  The
3344 conversion is performed according to the IEC/IEEE Standard for Binary
3345 Floating-Point Arithmetic, except that the conversion is always rounded
3346 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3347 Otherwise, if the conversion overflows, the largest integer with the same
3348 sign as `a' is returned.
3349 -------------------------------------------------------------------------------
3350 */
3351 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3352 {
3353     flag aSign;
3354     int32 aExp, shiftCount;
3355     bits64 aSig, savedASig;
3356     int32 z;
3357 
3358     aSig = extractFloatx80Frac( a );
3359     aExp = extractFloatx80Exp( a );
3360     aSign = extractFloatx80Sign( a );
3361     if ( 0x401E < aExp ) {
3362         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3363         goto invalid;
3364     }
3365     else if ( aExp < 0x3FFF ) {
3366         if ( aExp || aSig ) float_set_inexact();
3367         return 0;
3368     }
3369     shiftCount = 0x403E - aExp;
3370     savedASig = aSig;
3371     aSig >>= shiftCount;
3372     z = aSig;
3373     if ( aSign ) z = - z;
3374     if ( ( z < 0 ) ^ aSign ) {
3375  invalid:
3376         float_raise( float_flag_invalid );
3377         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3378     }
3379     if ( ( aSig<<shiftCount ) != savedASig ) {
3380         float_set_inexact();
3381     }
3382     return z;
3383 
3384 }
3385 
3386 /*
3387 -------------------------------------------------------------------------------
3388 Returns the result of converting the extended double-precision floating-
3389 point value `a' to the 64-bit two's complement integer format.  The
3390 conversion is performed according to the IEC/IEEE Standard for Binary
3391 Floating-Point Arithmetic---which means in particular that the conversion
3392 is rounded according to the current rounding mode.  If `a' is a NaN,
3393 the largest positive integer is returned.  Otherwise, if the conversion
3394 overflows, the largest integer with the same sign as `a' is returned.
3395 -------------------------------------------------------------------------------
3396 */
3397 int64 floatx80_to_int64( floatx80 a )
3398 {
3399     flag aSign;
3400     int32 aExp, shiftCount;
3401     bits64 aSig, aSigExtra;
3402 
3403     aSig = extractFloatx80Frac( a );
3404     aExp = extractFloatx80Exp( a );
3405     aSign = extractFloatx80Sign( a );
3406     shiftCount = 0x403E - aExp;
3407     if ( shiftCount <= 0 ) {
3408         if ( shiftCount ) {
3409             float_raise( float_flag_invalid );
3410             if (    ! aSign
3411                  || (    ( aExp == 0x7FFF )
3412                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3413                ) {
3414                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3415             }
3416             return (sbits64) LIT64( 0x8000000000000000 );
3417         }
3418         aSigExtra = 0;
3419     }
3420     else {
3421         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3422     }
3423     return roundAndPackInt64( aSign, aSig, aSigExtra );
3424 
3425 }
3426 
3427 /*
3428 -------------------------------------------------------------------------------
3429 Returns the result of converting the extended double-precision floating-
3430 point value `a' to the 64-bit two's complement integer format.  The
3431 conversion is performed according to the IEC/IEEE Standard for Binary
3432 Floating-Point Arithmetic, except that the conversion is always rounded
3433 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3434 Otherwise, if the conversion overflows, the largest integer with the same
3435 sign as `a' is returned.
3436 -------------------------------------------------------------------------------
3437 */
3438 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3439 {
3440     flag aSign;
3441     int32 aExp, shiftCount;
3442     bits64 aSig;
3443     int64 z;
3444 
3445     aSig = extractFloatx80Frac( a );
3446     aExp = extractFloatx80Exp( a );
3447     aSign = extractFloatx80Sign( a );
3448     shiftCount = aExp - 0x403E;
3449     if ( 0 <= shiftCount ) {
3450         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3451         if ( ( a.high != 0xC03E ) || aSig ) {
3452             float_raise( float_flag_invalid );
3453             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3454                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3455             }
3456         }
3457         return (sbits64) LIT64( 0x8000000000000000 );
3458     }
3459     else if ( aExp < 0x3FFF ) {
3460         if ( aExp | aSig ) float_set_inexact();
3461         return 0;
3462     }
3463     z = aSig>>( - shiftCount );
3464     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3465         float_set_inexact();
3466     }
3467     if ( aSign ) z = - z;
3468     return z;
3469 
3470 }
3471 
3472 /*
3473 -------------------------------------------------------------------------------
3474 Returns the result of converting the extended double-precision floating-
3475 point value `a' to the single-precision floating-point format.  The
3476 conversion is performed according to the IEC/IEEE Standard for Binary
3477 Floating-Point Arithmetic.
3478 -------------------------------------------------------------------------------
3479 */
3480 float32 floatx80_to_float32( floatx80 a )
3481 {
3482     flag aSign;
3483     int32 aExp;
3484     bits64 aSig;
3485 
3486     aSig = extractFloatx80Frac( a );
3487     aExp = extractFloatx80Exp( a );
3488     aSign = extractFloatx80Sign( a );
3489     if ( aExp == 0x7FFF ) {
3490         if ( (bits64) ( aSig<<1 ) ) {
3491             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3492         }
3493         return packFloat32( aSign, 0xFF, 0 );
3494     }
3495     shift64RightJamming( aSig, 33, &aSig );
3496     if ( aExp || aSig ) aExp -= 0x3F81;
3497     return roundAndPackFloat32( aSign, aExp, aSig );
3498 
3499 }
3500 
3501 /*
3502 -------------------------------------------------------------------------------
3503 Returns the result of converting the extended double-precision floating-
3504 point value `a' to the double-precision floating-point format.  The
3505 conversion is performed according to the IEC/IEEE Standard for Binary
3506 Floating-Point Arithmetic.
3507 -------------------------------------------------------------------------------
3508 */
3509 float64 floatx80_to_float64( floatx80 a )
3510 {
3511     flag aSign;
3512     int32 aExp;
3513     bits64 aSig, zSig;
3514 
3515     aSig = extractFloatx80Frac( a );
3516     aExp = extractFloatx80Exp( a );
3517     aSign = extractFloatx80Sign( a );
3518     if ( aExp == 0x7FFF ) {
3519         if ( (bits64) ( aSig<<1 ) ) {
3520             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3521         }
3522         return packFloat64( aSign, 0x7FF, 0 );
3523     }
3524     shift64RightJamming( aSig, 1, &zSig );
3525     if ( aExp || aSig ) aExp -= 0x3C01;
3526     return roundAndPackFloat64( aSign, aExp, zSig );
3527 
3528 }
3529 
3530 #ifdef FLOAT128
3531 
3532 /*
3533 -------------------------------------------------------------------------------
3534 Returns the result of converting the extended double-precision floating-
3535 point value `a' to the quadruple-precision floating-point format.  The
3536 conversion is performed according to the IEC/IEEE Standard for Binary
3537 Floating-Point Arithmetic.
3538 -------------------------------------------------------------------------------
3539 */
3540 float128 floatx80_to_float128( floatx80 a )
3541 {
3542     flag aSign;
3543     int16 aExp;
3544     bits64 aSig, zSig0, zSig1;
3545 
3546     aSig = extractFloatx80Frac( a );
3547     aExp = extractFloatx80Exp( a );
3548     aSign = extractFloatx80Sign( a );
3549     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3550         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3551     }
3552     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3553     return packFloat128( aSign, aExp, zSig0, zSig1 );
3554 
3555 }
3556 
3557 #endif
3558 
3559 /*
3560 -------------------------------------------------------------------------------
3561 Rounds the extended double-precision floating-point value `a' to an integer,
3562 and returns the result as an extended quadruple-precision floating-point
3563 value.  The operation is performed according to the IEC/IEEE Standard for
3564 Binary Floating-Point Arithmetic.
3565 -------------------------------------------------------------------------------
3566 */
3567 floatx80 floatx80_round_to_int( floatx80 a )
3568 {
3569     flag aSign;
3570     int32 aExp;
3571     bits64 lastBitMask, roundBitsMask;
3572     int8 roundingMode;
3573     floatx80 z;
3574 
3575     aExp = extractFloatx80Exp( a );
3576     if ( 0x403E <= aExp ) {
3577         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3578             return propagateFloatx80NaN( a, a );
3579         }
3580         return a;
3581     }
3582     if ( aExp < 0x3FFF ) {
3583         if (    ( aExp == 0 )
3584              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3585             return a;
3586         }
3587         float_set_inexact();
3588         aSign = extractFloatx80Sign( a );
3589         switch ( float_rounding_mode() ) {
3590          case float_round_nearest_even:
3591             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3592                ) {
3593                 return
3594                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3595             }
3596             break;
3597          case float_round_down:
3598             return
3599                   aSign ?
3600                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3601                 : packFloatx80( 0, 0, 0 );
3602          case float_round_up:
3603             return
3604                   aSign ? packFloatx80( 1, 0, 0 )
3605                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3606         }
3607         return packFloatx80( aSign, 0, 0 );
3608     }
3609     lastBitMask = 1;
3610     lastBitMask <<= 0x403E - aExp;
3611     roundBitsMask = lastBitMask - 1;
3612     z = a;
3613     roundingMode = float_rounding_mode();
3614     if ( roundingMode == float_round_nearest_even ) {
3615         z.low += lastBitMask>>1;
3616         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3617     }
3618     else if ( roundingMode != float_round_to_zero ) {
3619         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3620             z.low += roundBitsMask;
3621         }
3622     }
3623     z.low &= ~ roundBitsMask;
3624     if ( z.low == 0 ) {
3625         ++z.high;
3626         z.low = LIT64( 0x8000000000000000 );
3627     }
3628     if ( z.low != a.low ) float_set_inexact();
3629     return z;
3630 
3631 }
3632 
3633 /*
3634 -------------------------------------------------------------------------------
3635 Returns the result of adding the absolute values of the extended double-
3636 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3637 negated before being returned.  `zSign' is ignored if the result is a NaN.
3638 The addition is performed according to the IEC/IEEE Standard for Binary
3639 Floating-Point Arithmetic.
3640 -------------------------------------------------------------------------------
3641 */
3642 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3643 {
3644     int32 aExp, bExp, zExp;
3645     bits64 aSig, bSig, zSig0, zSig1;
3646     int32 expDiff;
3647 
3648     aSig = extractFloatx80Frac( a );
3649     aExp = extractFloatx80Exp( a );
3650     bSig = extractFloatx80Frac( b );
3651     bExp = extractFloatx80Exp( b );
3652     expDiff = aExp - bExp;
3653     if ( 0 < expDiff ) {
3654         if ( aExp == 0x7FFF ) {
3655             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3656             return a;
3657         }
3658         if ( bExp == 0 ) --expDiff;
3659         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3660         zExp = aExp;
3661     }
3662     else if ( expDiff < 0 ) {
3663         if ( bExp == 0x7FFF ) {
3664             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3665             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3666         }
3667         if ( aExp == 0 ) ++expDiff;
3668         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3669         zExp = bExp;
3670     }
3671     else {
3672         if ( aExp == 0x7FFF ) {
3673             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3674                 return propagateFloatx80NaN( a, b );
3675             }
3676             return a;
3677         }
3678         zSig1 = 0;
3679         zSig0 = aSig + bSig;
3680         if ( aExp == 0 ) {
3681             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3682             goto roundAndPack;
3683         }
3684         zExp = aExp;
3685         goto shiftRight1;
3686     }
3687     zSig0 = aSig + bSig;
3688     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3689  shiftRight1:
3690     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3691     zSig0 |= LIT64( 0x8000000000000000 );
3692     ++zExp;
3693  roundAndPack:
3694     return
3695         roundAndPackFloatx80(
3696             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3697 
3698 }
3699 
3700 /*
3701 -------------------------------------------------------------------------------
3702 Returns the result of subtracting the absolute values of the extended
3703 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3704 difference is negated before being returned.  `zSign' is ignored if the
3705 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3706 Standard for Binary Floating-Point Arithmetic.
3707 -------------------------------------------------------------------------------
3708 */
3709 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3710 {
3711     int32 aExp, bExp, zExp;
3712     bits64 aSig, bSig, zSig0, zSig1;
3713     int32 expDiff;
3714     floatx80 z;
3715 
3716     aSig = extractFloatx80Frac( a );
3717     aExp = extractFloatx80Exp( a );
3718     bSig = extractFloatx80Frac( b );
3719     bExp = extractFloatx80Exp( b );
3720     expDiff = aExp - bExp;
3721     if ( 0 < expDiff ) goto aExpBigger;
3722     if ( expDiff < 0 ) goto bExpBigger;
3723     if ( aExp == 0x7FFF ) {
3724         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3725             return propagateFloatx80NaN( a, b );
3726         }
3727         float_raise( float_flag_invalid );
3728         z.low = floatx80_default_nan_low;
3729         z.high = floatx80_default_nan_high;
3730         return z;
3731     }
3732     if ( aExp == 0 ) {
3733         aExp = 1;
3734         bExp = 1;
3735     }
3736     zSig1 = 0;
3737     if ( bSig < aSig ) goto aBigger;
3738     if ( aSig < bSig ) goto bBigger;
3739     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3740  bExpBigger:
3741     if ( bExp == 0x7FFF ) {
3742         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3743         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3744     }
3745     if ( aExp == 0 ) ++expDiff;
3746     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3747  bBigger:
3748     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3749     zExp = bExp;
3750     zSign ^= 1;
3751     goto normalizeRoundAndPack;
3752  aExpBigger:
3753     if ( aExp == 0x7FFF ) {
3754         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3755         return a;
3756     }
3757     if ( bExp == 0 ) --expDiff;
3758     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3759  aBigger:
3760     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3761     zExp = aExp;
3762  normalizeRoundAndPack:
3763     return
3764         normalizeRoundAndPackFloatx80(
3765             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3766 
3767 }
3768 
3769 /*
3770 -------------------------------------------------------------------------------
3771 Returns the result of adding the extended double-precision floating-point
3772 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3773 Standard for Binary Floating-Point Arithmetic.
3774 -------------------------------------------------------------------------------
3775 */
3776 floatx80 floatx80_add( floatx80 a, floatx80 b )
3777 {
3778     flag aSign, bSign;
3779 
3780     aSign = extractFloatx80Sign( a );
3781     bSign = extractFloatx80Sign( b );
3782     if ( aSign == bSign ) {
3783         return addFloatx80Sigs( a, b, aSign );
3784     }
3785     else {
3786         return subFloatx80Sigs( a, b, aSign );
3787     }
3788 
3789 }
3790 
3791 /*
3792 -------------------------------------------------------------------------------
3793 Returns the result of subtracting the extended double-precision floating-
3794 point values `a' and `b'.  The operation is performed according to the
3795 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3796 -------------------------------------------------------------------------------
3797 */
3798 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3799 {
3800     flag aSign, bSign;
3801 
3802     aSign = extractFloatx80Sign( a );
3803     bSign = extractFloatx80Sign( b );
3804     if ( aSign == bSign ) {
3805         return subFloatx80Sigs( a, b, aSign );
3806     }
3807     else {
3808         return addFloatx80Sigs( a, b, aSign );
3809     }
3810 
3811 }
3812 
3813 /*
3814 -------------------------------------------------------------------------------
3815 Returns the result of multiplying the extended double-precision floating-
3816 point values `a' and `b'.  The operation is performed according to the
3817 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3818 -------------------------------------------------------------------------------
3819 */
3820 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3821 {
3822     flag aSign, bSign, zSign;
3823     int32 aExp, bExp, zExp;
3824     bits64 aSig, bSig, zSig0, zSig1;
3825     floatx80 z;
3826 
3827     aSig = extractFloatx80Frac( a );
3828     aExp = extractFloatx80Exp( a );
3829     aSign = extractFloatx80Sign( a );
3830     bSig = extractFloatx80Frac( b );
3831     bExp = extractFloatx80Exp( b );
3832     bSign = extractFloatx80Sign( b );
3833     zSign = aSign ^ bSign;
3834     if ( aExp == 0x7FFF ) {
3835         if (    (bits64) ( aSig<<1 )
3836              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3837             return propagateFloatx80NaN( a, b );
3838         }
3839         if ( ( bExp | bSig ) == 0 ) goto invalid;
3840         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3841     }
3842     if ( bExp == 0x7FFF ) {
3843         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3844         if ( ( aExp | aSig ) == 0 ) {
3845  invalid:
3846             float_raise( float_flag_invalid );
3847             z.low = floatx80_default_nan_low;
3848             z.high = floatx80_default_nan_high;
3849             return z;
3850         }
3851         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3852     }
3853     if ( aExp == 0 ) {
3854         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3855         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3856     }
3857     if ( bExp == 0 ) {
3858         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3859         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3860     }
3861     zExp = aExp + bExp - 0x3FFE;
3862     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3863     if ( 0 < (sbits64) zSig0 ) {
3864         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3865         --zExp;
3866     }
3867     return
3868         roundAndPackFloatx80(
3869             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3870 
3871 }
3872 
3873 /*
3874 -------------------------------------------------------------------------------
3875 Returns the result of dividing the extended double-precision floating-point
3876 value `a' by the corresponding value `b'.  The operation is performed
3877 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3878 -------------------------------------------------------------------------------
3879 */
3880 floatx80 floatx80_div( floatx80 a, floatx80 b )
3881 {
3882     flag aSign, bSign, zSign;
3883     int32 aExp, bExp, zExp;
3884     bits64 aSig, bSig, zSig0, zSig1;
3885     bits64 rem0, rem1, rem2, term0, term1, term2;
3886     floatx80 z;
3887 
3888     aSig = extractFloatx80Frac( a );
3889     aExp = extractFloatx80Exp( a );
3890     aSign = extractFloatx80Sign( a );
3891     bSig = extractFloatx80Frac( b );
3892     bExp = extractFloatx80Exp( b );
3893     bSign = extractFloatx80Sign( b );
3894     zSign = aSign ^ bSign;
3895     if ( aExp == 0x7FFF ) {
3896         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3897         if ( bExp == 0x7FFF ) {
3898             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3899             goto invalid;
3900         }
3901         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3902     }
3903     if ( bExp == 0x7FFF ) {
3904         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3905         return packFloatx80( zSign, 0, 0 );
3906     }
3907     if ( bExp == 0 ) {
3908         if ( bSig == 0 ) {
3909             if ( ( aExp | aSig ) == 0 ) {
3910  invalid:
3911                 float_raise( float_flag_invalid );
3912                 z.low = floatx80_default_nan_low;
3913                 z.high = floatx80_default_nan_high;
3914                 return z;
3915             }
3916             float_raise( float_flag_divbyzero );
3917             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3918         }
3919         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3920     }
3921     if ( aExp == 0 ) {
3922         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3923         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3924     }
3925     zExp = aExp - bExp + 0x3FFE;
3926     rem1 = 0;
3927     if ( bSig <= aSig ) {
3928         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3929         ++zExp;
3930     }
3931     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3932     mul64To128( bSig, zSig0, &term0, &term1 );
3933     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3934     while ( (sbits64) rem0 < 0 ) {
3935         --zSig0;
3936         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3937     }
3938     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3939     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3940         mul64To128( bSig, zSig1, &term1, &term2 );
3941         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3942         while ( (sbits64) rem1 < 0 ) {
3943             --zSig1;
3944             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3945         }
3946         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3947     }
3948     return
3949         roundAndPackFloatx80(
3950             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3951 
3952 }
3953 
3954 /*
3955 -------------------------------------------------------------------------------
3956 Returns the remainder of the extended double-precision floating-point value
3957 `a' with respect to the corresponding value `b'.  The operation is performed
3958 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3959 -------------------------------------------------------------------------------
3960 */
3961 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3962 {
3963     flag aSign, bSign, zSign;
3964     int32 aExp, bExp, expDiff;
3965     bits64 aSig0, aSig1, bSig;
3966     bits64 q, term0, term1, alternateASig0, alternateASig1;
3967     floatx80 z;
3968 
3969     aSig0 = extractFloatx80Frac( a );
3970     aExp = extractFloatx80Exp( a );
3971     aSign = extractFloatx80Sign( a );
3972     bSig = extractFloatx80Frac( b );
3973     bExp = extractFloatx80Exp( b );
3974     bSign = extractFloatx80Sign( b );
3975     if ( aExp == 0x7FFF ) {
3976         if (    (bits64) ( aSig0<<1 )
3977              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3978             return propagateFloatx80NaN( a, b );
3979         }
3980         goto invalid;
3981     }
3982     if ( bExp == 0x7FFF ) {
3983         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3984         return a;
3985     }
3986     if ( bExp == 0 ) {
3987         if ( bSig == 0 ) {
3988  invalid:
3989             float_raise( float_flag_invalid );
3990             z.low = floatx80_default_nan_low;
3991             z.high = floatx80_default_nan_high;
3992             return z;
3993         }
3994         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3995     }
3996     if ( aExp == 0 ) {
3997         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3998         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3999     }
4000     bSig |= LIT64( 0x8000000000000000 );
4001     zSign = aSign;
4002     expDiff = aExp - bExp;
4003     aSig1 = 0;
4004     if ( expDiff < 0 ) {
4005         if ( expDiff < -1 ) return a;
4006         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4007         expDiff = 0;
4008     }
4009     q = ( bSig <= aSig0 );
4010     if ( q ) aSig0 -= bSig;
4011     expDiff -= 64;
4012     while ( 0 < expDiff ) {
4013         q = estimateDiv128To64( aSig0, aSig1, bSig );
4014         q = ( 2 < q ) ? q - 2 : 0;
4015         mul64To128( bSig, q, &term0, &term1 );
4016         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4017         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4018         expDiff -= 62;
4019     }
4020     expDiff += 64;
4021     if ( 0 < expDiff ) {
4022         q = estimateDiv128To64( aSig0, aSig1, bSig );
4023         q = ( 2 < q ) ? q - 2 : 0;
4024         q >>= 64 - expDiff;
4025         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4026         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4027         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4028         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4029             ++q;
4030             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4031         }
4032     }
4033     else {
4034         term1 = 0;
4035         term0 = bSig;
4036     }
4037     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4038     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4039          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4040               && ( q & 1 ) )
4041        ) {
4042         aSig0 = alternateASig0;
4043         aSig1 = alternateASig1;
4044         zSign = ! zSign;
4045     }
4046     return
4047         normalizeRoundAndPackFloatx80(
4048             80, zSign, bExp + expDiff, aSig0, aSig1 );
4049 
4050 }
4051 
4052 /*
4053 -------------------------------------------------------------------------------
4054 Returns the square root of the extended double-precision floating-point
4055 value `a'.  The operation is performed according to the IEC/IEEE Standard
4056 for Binary Floating-Point Arithmetic.
4057 -------------------------------------------------------------------------------
4058 */
4059 floatx80 floatx80_sqrt( floatx80 a )
4060 {
4061     flag aSign;
4062     int32 aExp, zExp;
4063     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4064     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4065     floatx80 z;
4066 
4067     aSig0 = extractFloatx80Frac( a );
4068     aExp = extractFloatx80Exp( a );
4069     aSign = extractFloatx80Sign( a );
4070     if ( aExp == 0x7FFF ) {
4071         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4072         if ( ! aSign ) return a;
4073         goto invalid;
4074     }
4075     if ( aSign ) {
4076         if ( ( aExp | aSig0 ) == 0 ) return a;
4077  invalid:
4078         float_raise( float_flag_invalid );
4079         z.low = floatx80_default_nan_low;
4080         z.high = floatx80_default_nan_high;
4081         return z;
4082     }
4083     if ( aExp == 0 ) {
4084         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4085         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4086     }
4087     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4088     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4089     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4090     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4091     doubleZSig0 = zSig0<<1;
4092     mul64To128( zSig0, zSig0, &term0, &term1 );
4093     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4094     while ( (sbits64) rem0 < 0 ) {
4095         --zSig0;
4096         doubleZSig0 -= 2;
4097         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4098     }
4099     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4100     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4101         if ( zSig1 == 0 ) zSig1 = 1;
4102         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4103         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4104         mul64To128( zSig1, zSig1, &term2, &term3 );
4105         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4106         while ( (sbits64) rem1 < 0 ) {
4107             --zSig1;
4108             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4109             term3 |= 1;
4110             term2 |= doubleZSig0;
4111             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4112         }
4113         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4114     }
4115     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4116     zSig0 |= doubleZSig0;
4117     return
4118         roundAndPackFloatx80(
4119             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4120 
4121 }
4122 
4123 /*
4124 -------------------------------------------------------------------------------
4125 Returns 1 if the extended double-precision floating-point value `a' is
4126 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4127 performed according to the IEC/IEEE Standard for Binary Floating-Point
4128 Arithmetic.
4129 -------------------------------------------------------------------------------
4130 */
4131 flag floatx80_eq( floatx80 a, floatx80 b )
4132 {
4133 
4134     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4135               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4136          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4137               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4138        ) {
4139         if (    floatx80_is_signaling_nan( a )
4140              || floatx80_is_signaling_nan( b ) ) {
4141             float_raise( float_flag_invalid );
4142         }
4143         return 0;
4144     }
4145     return
4146            ( a.low == b.low )
4147         && (    ( a.high == b.high )
4148              || (    ( a.low == 0 )
4149                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4150            );
4151 
4152 }
4153 
4154 /*
4155 -------------------------------------------------------------------------------
4156 Returns 1 if the extended double-precision floating-point value `a' is
4157 less than or equal to the corresponding value `b', and 0 otherwise.  The
4158 comparison is performed according to the IEC/IEEE Standard for Binary
4159 Floating-Point Arithmetic.
4160 -------------------------------------------------------------------------------
4161 */
4162 flag floatx80_le( floatx80 a, floatx80 b )
4163 {
4164     flag aSign, bSign;
4165 
4166     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4167               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4168          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4169               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4170        ) {
4171         float_raise( float_flag_invalid );
4172         return 0;
4173     }
4174     aSign = extractFloatx80Sign( a );
4175     bSign = extractFloatx80Sign( b );
4176     if ( aSign != bSign ) {
4177         return
4178                aSign
4179             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4180                  == 0 );
4181     }
4182     return
4183           aSign ? le128( b.high, b.low, a.high, a.low )
4184         : le128( a.high, a.low, b.high, b.low );
4185 
4186 }
4187 
4188 /*
4189 -------------------------------------------------------------------------------
4190 Returns 1 if the extended double-precision floating-point value `a' is
4191 less than the corresponding value `b', and 0 otherwise.  The comparison
4192 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4193 Arithmetic.
4194 -------------------------------------------------------------------------------
4195 */
4196 flag floatx80_lt( floatx80 a, floatx80 b )
4197 {
4198     flag aSign, bSign;
4199 
4200     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4201               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4202          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4203               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4204        ) {
4205         float_raise( float_flag_invalid );
4206         return 0;
4207     }
4208     aSign = extractFloatx80Sign( a );
4209     bSign = extractFloatx80Sign( b );
4210     if ( aSign != bSign ) {
4211         return
4212                aSign
4213             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4214                  != 0 );
4215     }
4216     return
4217           aSign ? lt128( b.high, b.low, a.high, a.low )
4218         : lt128( a.high, a.low, b.high, b.low );
4219 
4220 }
4221 
4222 /*
4223 -------------------------------------------------------------------------------
4224 Returns 1 if the extended double-precision floating-point value `a' is equal
4225 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4226 raised if either operand is a NaN.  Otherwise, the comparison is performed
4227 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4228 -------------------------------------------------------------------------------
4229 */
4230 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4231 {
4232 
4233     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4234               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4235          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4236               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4237        ) {
4238         float_raise( float_flag_invalid );
4239         return 0;
4240     }
4241     return
4242            ( a.low == b.low )
4243         && (    ( a.high == b.high )
4244              || (    ( a.low == 0 )
4245                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4246            );
4247 
4248 }
4249 
4250 /*
4251 -------------------------------------------------------------------------------
4252 Returns 1 if the extended double-precision floating-point value `a' is less
4253 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4254 do not cause an exception.  Otherwise, the comparison is performed according
4255 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4256 -------------------------------------------------------------------------------
4257 */
4258 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4259 {
4260     flag aSign, bSign;
4261 
4262     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4263               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4264          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4265               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4266        ) {
4267         if (    floatx80_is_signaling_nan( a )
4268              || floatx80_is_signaling_nan( b ) ) {
4269             float_raise( float_flag_invalid );
4270         }
4271         return 0;
4272     }
4273     aSign = extractFloatx80Sign( a );
4274     bSign = extractFloatx80Sign( b );
4275     if ( aSign != bSign ) {
4276         return
4277                aSign
4278             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4279                  == 0 );
4280     }
4281     return
4282           aSign ? le128( b.high, b.low, a.high, a.low )
4283         : le128( a.high, a.low, b.high, b.low );
4284 
4285 }
4286 
4287 /*
4288 -------------------------------------------------------------------------------
4289 Returns 1 if the extended double-precision floating-point value `a' is less
4290 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4291 an exception.  Otherwise, the comparison is performed according to the
4292 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4293 -------------------------------------------------------------------------------
4294 */
4295 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4296 {
4297     flag aSign, bSign;
4298 
4299     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4300               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4301          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4302               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4303        ) {
4304         if (    floatx80_is_signaling_nan( a )
4305              || floatx80_is_signaling_nan( b ) ) {
4306             float_raise( float_flag_invalid );
4307         }
4308         return 0;
4309     }
4310     aSign = extractFloatx80Sign( a );
4311     bSign = extractFloatx80Sign( b );
4312     if ( aSign != bSign ) {
4313         return
4314                aSign
4315             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4316                  != 0 );
4317     }
4318     return
4319           aSign ? lt128( b.high, b.low, a.high, a.low )
4320         : lt128( a.high, a.low, b.high, b.low );
4321 
4322 }
4323 
4324 #endif
4325 
4326 #ifdef FLOAT128
4327 
4328 /*
4329 -------------------------------------------------------------------------------
4330 Returns the result of converting the quadruple-precision floating-point
4331 value `a' to the 32-bit two's complement integer format.  The conversion
4332 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4333 Arithmetic---which means in particular that the conversion is rounded
4334 according to the current rounding mode.  If `a' is a NaN, the largest
4335 positive integer is returned.  Otherwise, if the conversion overflows, the
4336 largest integer with the same sign as `a' is returned.
4337 -------------------------------------------------------------------------------
4338 */
4339 int32 float128_to_int32( float128 a )
4340 {
4341     flag aSign;
4342     int32 aExp, shiftCount;
4343     bits64 aSig0, aSig1;
4344 
4345     aSig1 = extractFloat128Frac1( a );
4346     aSig0 = extractFloat128Frac0( a );
4347     aExp = extractFloat128Exp( a );
4348     aSign = extractFloat128Sign( a );
4349     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4350     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4351     aSig0 |= ( aSig1 != 0 );
4352     shiftCount = 0x4028 - aExp;
4353     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4354     return roundAndPackInt32( aSign, aSig0 );
4355 
4356 }
4357 
4358 /*
4359 -------------------------------------------------------------------------------
4360 Returns the result of converting the quadruple-precision floating-point
4361 value `a' to the 32-bit two's complement integer format.  The conversion
4362 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4363 Arithmetic, except that the conversion is always rounded toward zero.  If
4364 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4365 conversion overflows, the largest integer with the same sign as `a' is
4366 returned.
4367 -------------------------------------------------------------------------------
4368 */
4369 int32 float128_to_int32_round_to_zero( float128 a )
4370 {
4371     flag aSign;
4372     int32 aExp, shiftCount;
4373     bits64 aSig0, aSig1, savedASig;
4374     int32 z;
4375 
4376     aSig1 = extractFloat128Frac1( a );
4377     aSig0 = extractFloat128Frac0( a );
4378     aExp = extractFloat128Exp( a );
4379     aSign = extractFloat128Sign( a );
4380     aSig0 |= ( aSig1 != 0 );
4381     if ( 0x401E < aExp ) {
4382         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4383         goto invalid;
4384     }
4385     else if ( aExp < 0x3FFF ) {
4386         if ( aExp || aSig0 ) float_set_inexact();
4387         return 0;
4388     }
4389     aSig0 |= LIT64( 0x0001000000000000 );
4390     shiftCount = 0x402F - aExp;
4391     savedASig = aSig0;
4392     aSig0 >>= shiftCount;
4393     z = aSig0;
4394     if ( aSign ) z = - z;
4395     if ( ( z < 0 ) ^ aSign ) {
4396  invalid:
4397         float_raise( float_flag_invalid );
4398         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4399     }
4400     if ( ( aSig0<<shiftCount ) != savedASig ) {
4401         float_set_inexact();
4402     }
4403     return z;
4404 
4405 }
4406 
4407 /*
4408 -------------------------------------------------------------------------------
4409 Returns the result of converting the quadruple-precision floating-point
4410 value `a' to the 64-bit two's complement integer format.  The conversion
4411 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4412 Arithmetic---which means in particular that the conversion is rounded
4413 according to the current rounding mode.  If `a' is a NaN, the largest
4414 positive integer is returned.  Otherwise, if the conversion overflows, the
4415 largest integer with the same sign as `a' is returned.
4416 -------------------------------------------------------------------------------
4417 */
4418 int64 float128_to_int64( float128 a )
4419 {
4420     flag aSign;
4421     int32 aExp, shiftCount;
4422     bits64 aSig0, aSig1;
4423 
4424     aSig1 = extractFloat128Frac1( a );
4425     aSig0 = extractFloat128Frac0( a );
4426     aExp = extractFloat128Exp( a );
4427     aSign = extractFloat128Sign( a );
4428     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4429     shiftCount = 0x402F - aExp;
4430     if ( shiftCount <= 0 ) {
4431         if ( 0x403E < aExp ) {
4432             float_raise( float_flag_invalid );
4433             if (    ! aSign
4434                  || (    ( aExp == 0x7FFF )
4435                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4436                     )
4437                ) {
4438                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4439             }
4440             return (sbits64) LIT64( 0x8000000000000000 );
4441         }
4442         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4443     }
4444     else {
4445         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4446     }
4447     return roundAndPackInt64( aSign, aSig0, aSig1 );
4448 
4449 }
4450 
4451 /*
4452 -------------------------------------------------------------------------------
4453 Returns the result of converting the quadruple-precision floating-point
4454 value `a' to the 64-bit two's complement integer format.  The conversion
4455 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4456 Arithmetic, except that the conversion is always rounded toward zero.
4457 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4458 the conversion overflows, the largest integer with the same sign as `a' is
4459 returned.
4460 -------------------------------------------------------------------------------
4461 */
4462 int64 float128_to_int64_round_to_zero( float128 a )
4463 {
4464     flag aSign;
4465     int32 aExp, shiftCount;
4466     bits64 aSig0, aSig1;
4467     int64 z;
4468 
4469     aSig1 = extractFloat128Frac1( a );
4470     aSig0 = extractFloat128Frac0( a );
4471     aExp = extractFloat128Exp( a );
4472     aSign = extractFloat128Sign( a );
4473     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4474     shiftCount = aExp - 0x402F;
4475     if ( 0 < shiftCount ) {
4476         if ( 0x403E <= aExp ) {
4477             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4478             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4479                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4480                 if ( aSig1 ) float_set_inexact();
4481             }
4482             else {
4483                 float_raise( float_flag_invalid );
4484                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4485                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4486                 }
4487             }
4488             return (sbits64) LIT64( 0x8000000000000000 );
4489         }
4490         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4491         if ( (bits64) ( aSig1<<shiftCount ) ) {
4492             float_set_inexact();
4493         }
4494     }
4495     else {
4496         if ( aExp < 0x3FFF ) {
4497             if ( aExp | aSig0 | aSig1 ) {
4498                 float_set_inexact();
4499             }
4500             return 0;
4501         }
4502         z = aSig0>>( - shiftCount );
4503         if (    aSig1
4504              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4505             float_set_inexact();
4506         }
4507     }
4508     if ( aSign ) z = - z;
4509     return z;
4510 
4511 }
4512 
4513 /*
4514 -------------------------------------------------------------------------------
4515 Returns the result of converting the quadruple-precision floating-point
4516 value `a' to the single-precision floating-point format.  The conversion
4517 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4518 Arithmetic.
4519 -------------------------------------------------------------------------------
4520 */
4521 float32 float128_to_float32( float128 a )
4522 {
4523     flag aSign;
4524     int32 aExp;
4525     bits64 aSig0, aSig1;
4526     bits32 zSig;
4527 
4528     aSig1 = extractFloat128Frac1( a );
4529     aSig0 = extractFloat128Frac0( a );
4530     aExp = extractFloat128Exp( a );
4531     aSign = extractFloat128Sign( a );
4532     if ( aExp == 0x7FFF ) {
4533         if ( aSig0 | aSig1 ) {
4534             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4535         }
4536         return packFloat32( aSign, 0xFF, 0 );
4537     }
4538     aSig0 |= ( aSig1 != 0 );
4539     shift64RightJamming( aSig0, 18, &aSig0 );
4540     zSig = aSig0;
4541     if ( aExp || zSig ) {
4542         zSig |= 0x40000000;
4543         aExp -= 0x3F81;
4544     }
4545     return roundAndPackFloat32( aSign, aExp, zSig );
4546 
4547 }
4548 
4549 /*
4550 -------------------------------------------------------------------------------
4551 Returns the result of converting the quadruple-precision floating-point
4552 value `a' to the double-precision floating-point format.  The conversion
4553 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4554 Arithmetic.
4555 -------------------------------------------------------------------------------
4556 */
4557 float64 float128_to_float64( float128 a )
4558 {
4559     flag aSign;
4560     int32 aExp;
4561     bits64 aSig0, aSig1;
4562 
4563     aSig1 = extractFloat128Frac1( a );
4564     aSig0 = extractFloat128Frac0( a );
4565     aExp = extractFloat128Exp( a );
4566     aSign = extractFloat128Sign( a );
4567     if ( aExp == 0x7FFF ) {
4568         if ( aSig0 | aSig1 ) {
4569             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4570         }
4571         return packFloat64( aSign, 0x7FF, 0 );
4572     }
4573     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4574     aSig0 |= ( aSig1 != 0 );
4575     if ( aExp || aSig0 ) {
4576         aSig0 |= LIT64( 0x4000000000000000 );
4577         aExp -= 0x3C01;
4578     }
4579     return roundAndPackFloat64( aSign, aExp, aSig0 );
4580 
4581 }
4582 
4583 #ifdef FLOATX80
4584 
4585 /*
4586 -------------------------------------------------------------------------------
4587 Returns the result of converting the quadruple-precision floating-point
4588 value `a' to the extended double-precision floating-point format.  The
4589 conversion is performed according to the IEC/IEEE Standard for Binary
4590 Floating-Point Arithmetic.
4591 -------------------------------------------------------------------------------
4592 */
4593 floatx80 float128_to_floatx80( float128 a )
4594 {
4595     flag aSign;
4596     int32 aExp;
4597     bits64 aSig0, aSig1;
4598 
4599     aSig1 = extractFloat128Frac1( a );
4600     aSig0 = extractFloat128Frac0( a );
4601     aExp = extractFloat128Exp( a );
4602     aSign = extractFloat128Sign( a );
4603     if ( aExp == 0x7FFF ) {
4604         if ( aSig0 | aSig1 ) {
4605             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4606         }
4607         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4608     }
4609     if ( aExp == 0 ) {
4610         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4611         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4612     }
4613     else {
4614         aSig0 |= LIT64( 0x0001000000000000 );
4615     }
4616     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4617     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4618 
4619 }
4620 
4621 #endif
4622 
4623 /*
4624 -------------------------------------------------------------------------------
4625 Rounds the quadruple-precision floating-point value `a' to an integer, and
4626 returns the result as a quadruple-precision floating-point value.  The
4627 operation is performed according to the IEC/IEEE Standard for Binary
4628 Floating-Point Arithmetic.
4629 -------------------------------------------------------------------------------
4630 */
4631 float128 float128_round_to_int( float128 a )
4632 {
4633     flag aSign;
4634     int32 aExp;
4635     bits64 lastBitMask, roundBitsMask;
4636     int8 roundingMode;
4637     float128 z;
4638 
4639     aExp = extractFloat128Exp( a );
4640     if ( 0x402F <= aExp ) {
4641         if ( 0x406F <= aExp ) {
4642             if (    ( aExp == 0x7FFF )
4643                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4644                ) {
4645                 return propagateFloat128NaN( a, a );
4646             }
4647             return a;
4648         }
4649         lastBitMask = 1;
4650         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4651         roundBitsMask = lastBitMask - 1;
4652         z = a;
4653         roundingMode = float_rounding_mode();
4654         if ( roundingMode == float_round_nearest_even ) {
4655             if ( lastBitMask ) {
4656                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4657                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4658             }
4659             else {
4660                 if ( (sbits64) z.low < 0 ) {
4661                     ++z.high;
4662                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4663                 }
4664             }
4665         }
4666         else if ( roundingMode != float_round_to_zero ) {
4667             if (   extractFloat128Sign( z )
4668                  ^ ( roundingMode == float_round_up ) ) {
4669                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4670             }
4671         }
4672         z.low &= ~ roundBitsMask;
4673     }
4674     else {
4675         if ( aExp < 0x3FFF ) {
4676             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4677             float_set_inexact();
4678             aSign = extractFloat128Sign( a );
4679             switch ( float_rounding_mode() ) {
4680              case float_round_nearest_even:
4681                 if (    ( aExp == 0x3FFE )
4682                      && (   extractFloat128Frac0( a )
4683                           | extractFloat128Frac1( a ) )
4684                    ) {
4685                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4686                 }
4687                 break;
4688              case float_round_down:
4689                 return
4690                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4691                     : packFloat128( 0, 0, 0, 0 );
4692              case float_round_up:
4693                 return
4694                       aSign ? packFloat128( 1, 0, 0, 0 )
4695                     : packFloat128( 0, 0x3FFF, 0, 0 );
4696             }
4697             return packFloat128( aSign, 0, 0, 0 );
4698         }
4699         lastBitMask = 1;
4700         lastBitMask <<= 0x402F - aExp;
4701         roundBitsMask = lastBitMask - 1;
4702         z.low = 0;
4703         z.high = a.high;
4704         roundingMode = float_rounding_mode();
4705         if ( roundingMode == float_round_nearest_even ) {
4706             z.high += lastBitMask>>1;
4707             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4708                 z.high &= ~ lastBitMask;
4709             }
4710         }
4711         else if ( roundingMode != float_round_to_zero ) {
4712             if (   extractFloat128Sign( z )
4713                  ^ ( roundingMode == float_round_up ) ) {
4714                 z.high |= ( a.low != 0 );
4715                 z.high += roundBitsMask;
4716             }
4717         }
4718         z.high &= ~ roundBitsMask;
4719     }
4720     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4721         float_set_inexact();
4722     }
4723     return z;
4724 
4725 }
4726 
4727 /*
4728 -------------------------------------------------------------------------------
4729 Returns the result of adding the absolute values of the quadruple-precision
4730 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4731 before being returned.  `zSign' is ignored if the result is a NaN.
4732 The addition is performed according to the IEC/IEEE Standard for Binary
4733 Floating-Point Arithmetic.
4734 -------------------------------------------------------------------------------
4735 */
4736 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4737 {
4738     int32 aExp, bExp, zExp;
4739     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4740     int32 expDiff;
4741 
4742     aSig1 = extractFloat128Frac1( a );
4743     aSig0 = extractFloat128Frac0( a );
4744     aExp = extractFloat128Exp( a );
4745     bSig1 = extractFloat128Frac1( b );
4746     bSig0 = extractFloat128Frac0( b );
4747     bExp = extractFloat128Exp( b );
4748     expDiff = aExp - bExp;
4749     if ( 0 < expDiff ) {
4750         if ( aExp == 0x7FFF ) {
4751             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4752             return a;
4753         }
4754         if ( bExp == 0 ) {
4755             --expDiff;
4756         }
4757         else {
4758             bSig0 |= LIT64( 0x0001000000000000 );
4759         }
4760         shift128ExtraRightJamming(
4761             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4762         zExp = aExp;
4763     }
4764     else if ( expDiff < 0 ) {
4765         if ( bExp == 0x7FFF ) {
4766             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4767             return packFloat128( zSign, 0x7FFF, 0, 0 );
4768         }
4769         if ( aExp == 0 ) {
4770             ++expDiff;
4771         }
4772         else {
4773             aSig0 |= LIT64( 0x0001000000000000 );
4774         }
4775         shift128ExtraRightJamming(
4776             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4777         zExp = bExp;
4778     }
4779     else {
4780         if ( aExp == 0x7FFF ) {
4781             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4782                 return propagateFloat128NaN( a, b );
4783             }
4784             return a;
4785         }
4786         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4787         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4788         zSig2 = 0;
4789         zSig0 |= LIT64( 0x0002000000000000 );
4790         zExp = aExp;
4791         goto shiftRight1;
4792     }
4793     aSig0 |= LIT64( 0x0001000000000000 );
4794     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4795     --zExp;
4796     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4797     ++zExp;
4798  shiftRight1:
4799     shift128ExtraRightJamming(
4800         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4801  roundAndPack:
4802     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4803 
4804 }
4805 
4806 /*
4807 -------------------------------------------------------------------------------
4808 Returns the result of subtracting the absolute values of the quadruple-
4809 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4810 difference is negated before being returned.  `zSign' is ignored if the
4811 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4812 Standard for Binary Floating-Point Arithmetic.
4813 -------------------------------------------------------------------------------
4814 */
4815 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4816 {
4817     int32 aExp, bExp, zExp;
4818     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4819     int32 expDiff;
4820     float128 z;
4821 
4822     aSig1 = extractFloat128Frac1( a );
4823     aSig0 = extractFloat128Frac0( a );
4824     aExp = extractFloat128Exp( a );
4825     bSig1 = extractFloat128Frac1( b );
4826     bSig0 = extractFloat128Frac0( b );
4827     bExp = extractFloat128Exp( b );
4828     expDiff = aExp - bExp;
4829     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4830     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4831     if ( 0 < expDiff ) goto aExpBigger;
4832     if ( expDiff < 0 ) goto bExpBigger;
4833     if ( aExp == 0x7FFF ) {
4834         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4835             return propagateFloat128NaN( a, b );
4836         }
4837         float_raise( float_flag_invalid );
4838         z.low = float128_default_nan_low;
4839         z.high = float128_default_nan_high;
4840         return z;
4841     }
4842     if ( aExp == 0 ) {
4843         aExp = 1;
4844         bExp = 1;
4845     }
4846     if ( bSig0 < aSig0 ) goto aBigger;
4847     if ( aSig0 < bSig0 ) goto bBigger;
4848     if ( bSig1 < aSig1 ) goto aBigger;
4849     if ( aSig1 < bSig1 ) goto bBigger;
4850     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4851  bExpBigger:
4852     if ( bExp == 0x7FFF ) {
4853         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4854         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4855     }
4856     if ( aExp == 0 ) {
4857         ++expDiff;
4858     }
4859     else {
4860         aSig0 |= LIT64( 0x4000000000000000 );
4861     }
4862     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4863     bSig0 |= LIT64( 0x4000000000000000 );
4864  bBigger:
4865     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4866     zExp = bExp;
4867     zSign ^= 1;
4868     goto normalizeRoundAndPack;
4869  aExpBigger:
4870     if ( aExp == 0x7FFF ) {
4871         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4872         return a;
4873     }
4874     if ( bExp == 0 ) {
4875         --expDiff;
4876     }
4877     else {
4878         bSig0 |= LIT64( 0x4000000000000000 );
4879     }
4880     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4881     aSig0 |= LIT64( 0x4000000000000000 );
4882  aBigger:
4883     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4884     zExp = aExp;
4885  normalizeRoundAndPack:
4886     --zExp;
4887     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4888 
4889 }
4890 
4891 /*
4892 -------------------------------------------------------------------------------
4893 Returns the result of adding the quadruple-precision floating-point values
4894 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4895 for Binary Floating-Point Arithmetic.
4896 -------------------------------------------------------------------------------
4897 */
4898 float128 float128_add( float128 a, float128 b )
4899 {
4900     flag aSign, bSign;
4901 
4902     aSign = extractFloat128Sign( a );
4903     bSign = extractFloat128Sign( b );
4904     if ( aSign == bSign ) {
4905         return addFloat128Sigs( a, b, aSign );
4906     }
4907     else {
4908         return subFloat128Sigs( a, b, aSign );
4909     }
4910 
4911 }
4912 
4913 /*
4914 -------------------------------------------------------------------------------
4915 Returns the result of subtracting the quadruple-precision floating-point
4916 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4917 Standard for Binary Floating-Point Arithmetic.
4918 -------------------------------------------------------------------------------
4919 */
4920 float128 float128_sub( float128 a, float128 b )
4921 {
4922     flag aSign, bSign;
4923 
4924     aSign = extractFloat128Sign( a );
4925     bSign = extractFloat128Sign( b );
4926     if ( aSign == bSign ) {
4927         return subFloat128Sigs( a, b, aSign );
4928     }
4929     else {
4930         return addFloat128Sigs( a, b, aSign );
4931     }
4932 
4933 }
4934 
4935 /*
4936 -------------------------------------------------------------------------------
4937 Returns the result of multiplying the quadruple-precision floating-point
4938 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4939 Standard for Binary Floating-Point Arithmetic.
4940 -------------------------------------------------------------------------------
4941 */
4942 float128 float128_mul( float128 a, float128 b )
4943 {
4944     flag aSign, bSign, zSign;
4945     int32 aExp, bExp, zExp;
4946     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4947     float128 z;
4948 
4949     aSig1 = extractFloat128Frac1( a );
4950     aSig0 = extractFloat128Frac0( a );
4951     aExp = extractFloat128Exp( a );
4952     aSign = extractFloat128Sign( a );
4953     bSig1 = extractFloat128Frac1( b );
4954     bSig0 = extractFloat128Frac0( b );
4955     bExp = extractFloat128Exp( b );
4956     bSign = extractFloat128Sign( b );
4957     zSign = aSign ^ bSign;
4958     if ( aExp == 0x7FFF ) {
4959         if (    ( aSig0 | aSig1 )
4960              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4961             return propagateFloat128NaN( a, b );
4962         }
4963         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4964         return packFloat128( zSign, 0x7FFF, 0, 0 );
4965     }
4966     if ( bExp == 0x7FFF ) {
4967         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4968         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4969  invalid:
4970             float_raise( float_flag_invalid );
4971             z.low = float128_default_nan_low;
4972             z.high = float128_default_nan_high;
4973             return z;
4974         }
4975         return packFloat128( zSign, 0x7FFF, 0, 0 );
4976     }
4977     if ( aExp == 0 ) {
4978         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4979         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4980     }
4981     if ( bExp == 0 ) {
4982         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4983         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4984     }
4985     zExp = aExp + bExp - 0x4000;
4986     aSig0 |= LIT64( 0x0001000000000000 );
4987     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4988     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4989     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4990     zSig2 |= ( zSig3 != 0 );
4991     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4992         shift128ExtraRightJamming(
4993             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4994         ++zExp;
4995     }
4996     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4997 
4998 }
4999 
5000 /*
5001 -------------------------------------------------------------------------------
5002 Returns the result of dividing the quadruple-precision floating-point value
5003 `a' by the corresponding value `b'.  The operation is performed according to
5004 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5005 -------------------------------------------------------------------------------
5006 */
5007 float128 float128_div( float128 a, float128 b )
5008 {
5009     flag aSign, bSign, zSign;
5010     int32 aExp, bExp, zExp;
5011     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5012     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5013     float128 z;
5014 
5015     aSig1 = extractFloat128Frac1( a );
5016     aSig0 = extractFloat128Frac0( a );
5017     aExp = extractFloat128Exp( a );
5018     aSign = extractFloat128Sign( a );
5019     bSig1 = extractFloat128Frac1( b );
5020     bSig0 = extractFloat128Frac0( b );
5021     bExp = extractFloat128Exp( b );
5022     bSign = extractFloat128Sign( b );
5023     zSign = aSign ^ bSign;
5024     if ( aExp == 0x7FFF ) {
5025         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5026         if ( bExp == 0x7FFF ) {
5027             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5028             goto invalid;
5029         }
5030         return packFloat128( zSign, 0x7FFF, 0, 0 );
5031     }
5032     if ( bExp == 0x7FFF ) {
5033         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5034         return packFloat128( zSign, 0, 0, 0 );
5035     }
5036     if ( bExp == 0 ) {
5037         if ( ( bSig0 | bSig1 ) == 0 ) {
5038             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5039  invalid:
5040                 float_raise( float_flag_invalid );
5041                 z.low = float128_default_nan_low;
5042                 z.high = float128_default_nan_high;
5043                 return z;
5044             }
5045             float_raise( float_flag_divbyzero );
5046             return packFloat128( zSign, 0x7FFF, 0, 0 );
5047         }
5048         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5049     }
5050     if ( aExp == 0 ) {
5051         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5052         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5053     }
5054     zExp = aExp - bExp + 0x3FFD;
5055     shortShift128Left(
5056         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5057     shortShift128Left(
5058         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5059     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5060         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5061         ++zExp;
5062     }
5063     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5064     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5065     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5066     while ( (sbits64) rem0 < 0 ) {
5067         --zSig0;
5068         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5069     }
5070     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5071     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5072         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5073         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5074         while ( (sbits64) rem1 < 0 ) {
5075             --zSig1;
5076             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5077         }
5078         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5079     }
5080     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5081     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5082 
5083 }
5084 
5085 /*
5086 -------------------------------------------------------------------------------
5087 Returns the remainder of the quadruple-precision floating-point value `a'
5088 with respect to the corresponding value `b'.  The operation is performed
5089 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5090 -------------------------------------------------------------------------------
5091 */
5092 float128 float128_rem( float128 a, float128 b )
5093 {
5094     flag aSign, bSign, zSign;
5095     int32 aExp, bExp, expDiff;
5096     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5097     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5098     sbits64 sigMean0;
5099     float128 z;
5100 
5101     aSig1 = extractFloat128Frac1( a );
5102     aSig0 = extractFloat128Frac0( a );
5103     aExp = extractFloat128Exp( a );
5104     aSign = extractFloat128Sign( a );
5105     bSig1 = extractFloat128Frac1( b );
5106     bSig0 = extractFloat128Frac0( b );
5107     bExp = extractFloat128Exp( b );
5108     bSign = extractFloat128Sign( b );
5109     if ( aExp == 0x7FFF ) {
5110         if (    ( aSig0 | aSig1 )
5111              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5112             return propagateFloat128NaN( a, b );
5113         }
5114         goto invalid;
5115     }
5116     if ( bExp == 0x7FFF ) {
5117         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5118         return a;
5119     }
5120     if ( bExp == 0 ) {
5121         if ( ( bSig0 | bSig1 ) == 0 ) {
5122  invalid:
5123             float_raise( float_flag_invalid );
5124             z.low = float128_default_nan_low;
5125             z.high = float128_default_nan_high;
5126             return z;
5127         }
5128         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5129     }
5130     if ( aExp == 0 ) {
5131         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5132         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5133     }
5134     expDiff = aExp - bExp;
5135     if ( expDiff < -1 ) return a;
5136     shortShift128Left(
5137         aSig0 | LIT64( 0x0001000000000000 ),
5138         aSig1,
5139         15 - ( expDiff < 0 ),
5140         &aSig0,
5141         &aSig1
5142     );
5143     shortShift128Left(
5144         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5145     q = le128( bSig0, bSig1, aSig0, aSig1 );
5146     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5147     expDiff -= 64;
5148     while ( 0 < expDiff ) {
5149         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5150         q = ( 4 < q ) ? q - 4 : 0;
5151         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5152         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5153         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5154         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5155         expDiff -= 61;
5156     }
5157     if ( -64 < expDiff ) {
5158         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5159         q = ( 4 < q ) ? q - 4 : 0;
5160         q >>= - expDiff;
5161         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5162         expDiff += 52;
5163         if ( expDiff < 0 ) {
5164             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5165         }
5166         else {
5167             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5168         }
5169         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5170         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5171     }
5172     else {
5173         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5174         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5175     }
5176     do {
5177         alternateASig0 = aSig0;
5178         alternateASig1 = aSig1;
5179         ++q;
5180         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5181     } while ( 0 <= (sbits64) aSig0 );
5182     add128(
5183         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5184     if (    ( sigMean0 < 0 )
5185          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5186         aSig0 = alternateASig0;
5187         aSig1 = alternateASig1;
5188     }
5189     zSign = ( (sbits64) aSig0 < 0 );
5190     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5191     return
5192         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5193 
5194 }
5195 
5196 /*
5197 -------------------------------------------------------------------------------
5198 Returns the square root of the quadruple-precision floating-point value `a'.
5199 The operation is performed according to the IEC/IEEE Standard for Binary
5200 Floating-Point Arithmetic.
5201 -------------------------------------------------------------------------------
5202 */
5203 float128 float128_sqrt( float128 a )
5204 {
5205     flag aSign;
5206     int32 aExp, zExp;
5207     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5208     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5209     float128 z;
5210 
5211     aSig1 = extractFloat128Frac1( a );
5212     aSig0 = extractFloat128Frac0( a );
5213     aExp = extractFloat128Exp( a );
5214     aSign = extractFloat128Sign( a );
5215     if ( aExp == 0x7FFF ) {
5216         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5217         if ( ! aSign ) return a;
5218         goto invalid;
5219     }
5220     if ( aSign ) {
5221         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5222  invalid:
5223         float_raise( float_flag_invalid );
5224         z.low = float128_default_nan_low;
5225         z.high = float128_default_nan_high;
5226         return z;
5227     }
5228     if ( aExp == 0 ) {
5229         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5230         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5231     }
5232     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5233     aSig0 |= LIT64( 0x0001000000000000 );
5234     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5235     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5236     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5237     doubleZSig0 = zSig0<<1;
5238     mul64To128( zSig0, zSig0, &term0, &term1 );
5239     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5240     while ( (sbits64) rem0 < 0 ) {
5241         --zSig0;
5242         doubleZSig0 -= 2;
5243         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5244     }
5245     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5246     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5247         if ( zSig1 == 0 ) zSig1 = 1;
5248         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5249         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5250         mul64To128( zSig1, zSig1, &term2, &term3 );
5251         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5252         while ( (sbits64) rem1 < 0 ) {
5253             --zSig1;
5254             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5255             term3 |= 1;
5256             term2 |= doubleZSig0;
5257             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5258         }
5259         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5260     }
5261     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5262     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5263 
5264 }
5265 
5266 /*
5267 -------------------------------------------------------------------------------
5268 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5269 the corresponding value `b', and 0 otherwise.  The comparison is performed
5270 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5271 -------------------------------------------------------------------------------
5272 */
5273 flag float128_eq( float128 a, float128 b )
5274 {
5275 
5276     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5277               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5278          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5279               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5280        ) {
5281         if (    float128_is_signaling_nan( a )
5282              || float128_is_signaling_nan( b ) ) {
5283             float_raise( float_flag_invalid );
5284         }
5285         return 0;
5286     }
5287     return
5288            ( a.low == b.low )
5289         && (    ( a.high == b.high )
5290              || (    ( a.low == 0 )
5291                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5292            );
5293 
5294 }
5295 
5296 /*
5297 -------------------------------------------------------------------------------
5298 Returns 1 if the quadruple-precision floating-point value `a' is less than
5299 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5300 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5301 Arithmetic.
5302 -------------------------------------------------------------------------------
5303 */
5304 flag float128_le( float128 a, float128 b )
5305 {
5306     flag aSign, bSign;
5307 
5308     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5309               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5310          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5311               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5312        ) {
5313         float_raise( float_flag_invalid );
5314         return 0;
5315     }
5316     aSign = extractFloat128Sign( a );
5317     bSign = extractFloat128Sign( b );
5318     if ( aSign != bSign ) {
5319         return
5320                aSign
5321             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5322                  == 0 );
5323     }
5324     return
5325           aSign ? le128( b.high, b.low, a.high, a.low )
5326         : le128( a.high, a.low, b.high, b.low );
5327 
5328 }
5329 
5330 /*
5331 -------------------------------------------------------------------------------
5332 Returns 1 if the quadruple-precision floating-point value `a' is less than
5333 the corresponding value `b', and 0 otherwise.  The comparison is performed
5334 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5335 -------------------------------------------------------------------------------
5336 */
5337 flag float128_lt( float128 a, float128 b )
5338 {
5339     flag aSign, bSign;
5340 
5341     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5342               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5343          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5344               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5345        ) {
5346         float_raise( float_flag_invalid );
5347         return 0;
5348     }
5349     aSign = extractFloat128Sign( a );
5350     bSign = extractFloat128Sign( b );
5351     if ( aSign != bSign ) {
5352         return
5353                aSign
5354             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5355                  != 0 );
5356     }
5357     return
5358           aSign ? lt128( b.high, b.low, a.high, a.low )
5359         : lt128( a.high, a.low, b.high, b.low );
5360 
5361 }
5362 
5363 /*
5364 -------------------------------------------------------------------------------
5365 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5366 the corresponding value `b', and 0 otherwise.  The invalid exception is
5367 raised if either operand is a NaN.  Otherwise, the comparison is performed
5368 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5369 -------------------------------------------------------------------------------
5370 */
5371 flag float128_eq_signaling( float128 a, float128 b )
5372 {
5373 
5374     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5375               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5376          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5377               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5378        ) {
5379         float_raise( float_flag_invalid );
5380         return 0;
5381     }
5382     return
5383            ( a.low == b.low )
5384         && (    ( a.high == b.high )
5385              || (    ( a.low == 0 )
5386                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5387            );
5388 
5389 }
5390 
5391 /*
5392 -------------------------------------------------------------------------------
5393 Returns 1 if the quadruple-precision floating-point value `a' is less than
5394 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5395 cause an exception.  Otherwise, the comparison is performed according to the
5396 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5397 -------------------------------------------------------------------------------
5398 */
5399 flag float128_le_quiet( float128 a, float128 b )
5400 {
5401     flag aSign, bSign;
5402 
5403     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5404               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5405          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5406               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5407        ) {
5408         if (    float128_is_signaling_nan( a )
5409              || float128_is_signaling_nan( b ) ) {
5410             float_raise( float_flag_invalid );
5411         }
5412         return 0;
5413     }
5414     aSign = extractFloat128Sign( a );
5415     bSign = extractFloat128Sign( b );
5416     if ( aSign != bSign ) {
5417         return
5418                aSign
5419             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5420                  == 0 );
5421     }
5422     return
5423           aSign ? le128( b.high, b.low, a.high, a.low )
5424         : le128( a.high, a.low, b.high, b.low );
5425 
5426 }
5427 
5428 /*
5429 -------------------------------------------------------------------------------
5430 Returns 1 if the quadruple-precision floating-point value `a' is less than
5431 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5432 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5433 Standard for Binary Floating-Point Arithmetic.
5434 -------------------------------------------------------------------------------
5435 */
5436 flag float128_lt_quiet( float128 a, float128 b )
5437 {
5438     flag aSign, bSign;
5439 
5440     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5441               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5442          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5443               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5444        ) {
5445         if (    float128_is_signaling_nan( a )
5446              || float128_is_signaling_nan( b ) ) {
5447             float_raise( float_flag_invalid );
5448         }
5449         return 0;
5450     }
5451     aSign = extractFloat128Sign( a );
5452     bSign = extractFloat128Sign( b );
5453     if ( aSign != bSign ) {
5454         return
5455                aSign
5456             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5457                  != 0 );
5458     }
5459     return
5460           aSign ? lt128( b.high, b.low, a.high, a.low )
5461         : lt128( a.high, a.low, b.high, b.low );
5462 
5463 }
5464 
5465 #endif
5466 
5467 
5468 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5469 
5470 /*
5471  * These two routines are not part of the original softfloat distribution.
5472  *
5473  * They are based on the corresponding conversions to integer but return
5474  * unsigned numbers instead since these functions are required by GCC.
5475  *
5476  * Added by Mark Brinicombe <mark@NetBSD.org>	27/09/97
5477  *
5478  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5479  */
5480 
5481 /*
5482 -------------------------------------------------------------------------------
5483 Returns the result of converting the double-precision floating-point value
5484 `a' to the 32-bit unsigned integer format.  The conversion is
5485 performed according to the IEC/IEEE Standard for Binary Floating-point
5486 Arithmetic, except that the conversion is always rounded toward zero.  If
5487 `a' is a NaN, the largest positive integer is returned.  If the conversion
5488 overflows, the largest integer positive is returned.
5489 -------------------------------------------------------------------------------
5490 */
5491 uint32 float64_to_uint32_round_to_zero( float64 a )
5492 {
5493     flag aSign;
5494     int16 aExp, shiftCount;
5495     bits64 aSig, savedASig;
5496     uint32 z;
5497 
5498     aSig = extractFloat64Frac( a );
5499     aExp = extractFloat64Exp( a );
5500     aSign = extractFloat64Sign( a );
5501 
5502     if (aSign) {
5503         float_raise( float_flag_invalid );
5504     	return(0);
5505     }
5506 
5507     if ( 0x41E < aExp ) {
5508         float_raise( float_flag_invalid );
5509         return 0xffffffff;
5510     }
5511     else if ( aExp < 0x3FF ) {
5512         if ( aExp || aSig ) float_set_inexact();
5513         return 0;
5514     }
5515     aSig |= LIT64( 0x0010000000000000 );
5516     shiftCount = 0x433 - aExp;
5517     savedASig = aSig;
5518     aSig >>= shiftCount;
5519     z = aSig;
5520     if ( ( aSig<<shiftCount ) != savedASig ) {
5521         float_set_inexact();
5522     }
5523     return z;
5524 
5525 }
5526 
5527 /*
5528 -------------------------------------------------------------------------------
5529 Returns the result of converting the single-precision floating-point value
5530 `a' to the 32-bit unsigned integer format.  The conversion is
5531 performed according to the IEC/IEEE Standard for Binary Floating-point
5532 Arithmetic, except that the conversion is always rounded toward zero.  If
5533 `a' is a NaN, the largest positive integer is returned.  If the conversion
5534 overflows, the largest positive integer is returned.
5535 -------------------------------------------------------------------------------
5536 */
5537 uint32 float32_to_uint32_round_to_zero( float32 a )
5538 {
5539     flag aSign;
5540     int16 aExp, shiftCount;
5541     bits32 aSig;
5542     uint32 z;
5543 
5544     aSig = extractFloat32Frac( a );
5545     aExp = extractFloat32Exp( a );
5546     aSign = extractFloat32Sign( a );
5547     shiftCount = aExp - 0x9E;
5548 
5549     if (aSign) {
5550         float_raise( float_flag_invalid );
5551     	return(0);
5552     }
5553     if ( 0 < shiftCount ) {
5554         float_raise( float_flag_invalid );
5555         return 0xFFFFFFFF;
5556     }
5557     else if ( aExp <= 0x7E ) {
5558         if ( aExp | aSig ) float_set_inexact();
5559         return 0;
5560     }
5561     aSig = ( aSig | 0x800000 )<<8;
5562     z = aSig>>( - shiftCount );
5563     if ( aSig<<( shiftCount & 31 ) ) {
5564         float_set_inexact();
5565     }
5566     return z;
5567 
5568 }
5569 
5570 #endif
5571 
5572 #endif /* _STANDALONE */
5573