xref: /minix3/lib/libc/softfloat/bits64/softfloat.c (revision f14fb602092e015ff630df58e17c2a9cd57d29b3)
1 /* $NetBSD: softfloat.c,v 1.11 2012/03/24 00:06:20 matt Exp $ */
2 
3 /*
4  * This version hacked for use with gcc -msoft-float by bjh21.
5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6  *  itself).
7  */
8 
9 /*
10  * Things you may want to define:
11  *
12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
14  *   properly renamed.
15  */
16 
17 /*
18 ===============================================================================
19 
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 #include <sys/cdefs.h>
48 #if defined(LIBC_SCCS) && !defined(lint)
49 __RCSID("$NetBSD: softfloat.c,v 1.11 2012/03/24 00:06:20 matt Exp $");
50 #endif /* LIBC_SCCS and not lint */
51 
52 #ifdef SOFTFLOAT_FOR_GCC
53 #include "softfloat-for-gcc.h"
54 #endif
55 
56 #include "milieu.h"
57 #include "softfloat.h"
58 
59 /*
60  * Conversions between floats as stored in memory and floats as
61  * SoftFloat uses them
62  */
63 #ifndef FLOAT64_DEMANGLE
64 #define FLOAT64_DEMANGLE(a)	(a)
65 #endif
66 #ifndef FLOAT64_MANGLE
67 #define FLOAT64_MANGLE(a)	(a)
68 #endif
69 
70 /*
71 -------------------------------------------------------------------------------
72 Floating-point rounding mode, extended double-precision rounding precision,
73 and exception flags.
74 -------------------------------------------------------------------------------
75 */
76 fp_rnd float_rounding_mode = float_round_nearest_even;
77 fp_except float_exception_flags = 0;
78 #ifdef FLOATX80
79 int8 floatx80_rounding_precision = 80;
80 #endif
81 
82 /*
83 -------------------------------------------------------------------------------
84 Primitive arithmetic functions, including multi-word arithmetic, and
85 division and square root approximations.  (Can be specialized to target if
86 desired.)
87 -------------------------------------------------------------------------------
88 */
89 #include "softfloat-macros"
90 
91 /*
92 -------------------------------------------------------------------------------
93 Functions and definitions to determine:  (1) whether tininess for underflow
94 is detected before or after rounding by default, (2) what (if anything)
95 happens when exceptions are raised, (3) how signaling NaNs are distinguished
96 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
97 are propagated from function inputs to output.  These details are target-
98 specific.
99 -------------------------------------------------------------------------------
100 */
101 #include "softfloat-specialize"
102 
103 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
104 /*
105 -------------------------------------------------------------------------------
106 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
107 and 7, and returns the properly rounded 32-bit integer corresponding to the
108 input.  If `zSign' is 1, the input is negated before being converted to an
109 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
110 is simply rounded to an integer, with the inexact exception raised if the
111 input cannot be represented exactly as an integer.  However, if the fixed-
112 point input is too large, the invalid exception is raised and the largest
113 positive or negative integer is returned.
114 -------------------------------------------------------------------------------
115 */
116 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
117 {
118     int8 roundingMode;
119     flag roundNearestEven;
120     int8 roundIncrement, roundBits;
121     int32 z;
122 
123     roundingMode = float_rounding_mode;
124     roundNearestEven = ( roundingMode == float_round_nearest_even );
125     roundIncrement = 0x40;
126     if ( ! roundNearestEven ) {
127         if ( roundingMode == float_round_to_zero ) {
128             roundIncrement = 0;
129         }
130         else {
131             roundIncrement = 0x7F;
132             if ( zSign ) {
133                 if ( roundingMode == float_round_up ) roundIncrement = 0;
134             }
135             else {
136                 if ( roundingMode == float_round_down ) roundIncrement = 0;
137             }
138         }
139     }
140     roundBits = (int8)(absZ & 0x7F);
141     absZ = ( absZ + roundIncrement )>>7;
142     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
143     z = (int32)absZ;
144     if ( zSign ) z = - z;
145     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
146         float_raise( float_flag_invalid );
147         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
148     }
149     if ( roundBits ) float_exception_flags |= float_flag_inexact;
150     return z;
151 
152 }
153 
154 /*
155 -------------------------------------------------------------------------------
156 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
157 `absZ1', with binary point between bits 63 and 64 (between the input words),
158 and returns the properly rounded 64-bit integer corresponding to the input.
159 If `zSign' is 1, the input is negated before being converted to an integer.
160 Ordinarily, the fixed-point input is simply rounded to an integer, with
161 the inexact exception raised if the input cannot be represented exactly as
162 an integer.  However, if the fixed-point input is too large, the invalid
163 exception is raised and the largest positive or negative integer is
164 returned.
165 -------------------------------------------------------------------------------
166 */
167 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
168 {
169     int8 roundingMode;
170     flag roundNearestEven, increment;
171     int64 z;
172 
173     roundingMode = float_rounding_mode;
174     roundNearestEven = ( roundingMode == float_round_nearest_even );
175     increment = ( (sbits64) absZ1 < 0 );
176     if ( ! roundNearestEven ) {
177         if ( roundingMode == float_round_to_zero ) {
178             increment = 0;
179         }
180         else {
181             if ( zSign ) {
182                 increment = ( roundingMode == float_round_down ) && absZ1;
183             }
184             else {
185                 increment = ( roundingMode == float_round_up ) && absZ1;
186             }
187         }
188     }
189     if ( increment ) {
190         ++absZ0;
191         if ( absZ0 == 0 ) goto overflow;
192         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
193     }
194     z = absZ0;
195     if ( zSign ) z = - z;
196     if ( z && ( ( z < 0 ) ^ zSign ) ) {
197  overflow:
198         float_raise( float_flag_invalid );
199         return
200               zSign ? (sbits64) LIT64( 0x8000000000000000 )
201             : LIT64( 0x7FFFFFFFFFFFFFFF );
202     }
203     if ( absZ1 ) float_exception_flags |= float_flag_inexact;
204     return z;
205 
206 }
207 #endif
208 
209 /*
210 -------------------------------------------------------------------------------
211 Returns the fraction bits of the single-precision floating-point value `a'.
212 -------------------------------------------------------------------------------
213 */
214 INLINE bits32 extractFloat32Frac( float32 a )
215 {
216 
217     return a & 0x007FFFFF;
218 
219 }
220 
221 /*
222 -------------------------------------------------------------------------------
223 Returns the exponent bits of the single-precision floating-point value `a'.
224 -------------------------------------------------------------------------------
225 */
226 INLINE int16 extractFloat32Exp( float32 a )
227 {
228 
229     return ( a>>23 ) & 0xFF;
230 
231 }
232 
233 /*
234 -------------------------------------------------------------------------------
235 Returns the sign bit of the single-precision floating-point value `a'.
236 -------------------------------------------------------------------------------
237 */
238 INLINE flag extractFloat32Sign( float32 a )
239 {
240 
241     return a>>31;
242 
243 }
244 
245 /*
246 -------------------------------------------------------------------------------
247 Normalizes the subnormal single-precision floating-point value represented
248 by the denormalized significand `aSig'.  The normalized exponent and
249 significand are stored at the locations pointed to by `zExpPtr' and
250 `zSigPtr', respectively.
251 -------------------------------------------------------------------------------
252 */
253 static void
254  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
255 {
256     int8 shiftCount;
257 
258     shiftCount = countLeadingZeros32( aSig ) - 8;
259     *zSigPtr = aSig<<shiftCount;
260     *zExpPtr = 1 - shiftCount;
261 
262 }
263 
264 /*
265 -------------------------------------------------------------------------------
266 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
267 single-precision floating-point value, returning the result.  After being
268 shifted into the proper positions, the three fields are simply added
269 together to form the result.  This means that any integer portion of `zSig'
270 will be added into the exponent.  Since a properly normalized significand
271 will have an integer portion equal to 1, the `zExp' input should be 1 less
272 than the desired result exponent whenever `zSig' is a complete, normalized
273 significand.
274 -------------------------------------------------------------------------------
275 */
276 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
277 {
278 
279     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
280 
281 }
282 
283 /*
284 -------------------------------------------------------------------------------
285 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
286 and significand `zSig', and returns the proper single-precision floating-
287 point value corresponding to the abstract input.  Ordinarily, the abstract
288 value is simply rounded and packed into the single-precision format, with
289 the inexact exception raised if the abstract input cannot be represented
290 exactly.  However, if the abstract value is too large, the overflow and
291 inexact exceptions are raised and an infinity or maximal finite value is
292 returned.  If the abstract value is too small, the input value is rounded to
293 a subnormal number, and the underflow and inexact exceptions are raised if
294 the abstract input cannot be represented exactly as a subnormal single-
295 precision floating-point number.
296     The input significand `zSig' has its binary point between bits 30
297 and 29, which is 7 bits to the left of the usual location.  This shifted
298 significand must be normalized or smaller.  If `zSig' is not normalized,
299 `zExp' must be 0; in that case, the result returned is a subnormal number,
300 and it must not require rounding.  In the usual case that `zSig' is
301 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
302 The handling of underflow and overflow follows the IEC/IEEE Standard for
303 Binary Floating-Point Arithmetic.
304 -------------------------------------------------------------------------------
305 */
306 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
307 {
308     int8 roundingMode;
309     flag roundNearestEven;
310     int8 roundIncrement, roundBits;
311     flag isTiny;
312 
313     roundingMode = float_rounding_mode;
314     roundNearestEven = ( roundingMode == float_round_nearest_even );
315     roundIncrement = 0x40;
316     if ( ! roundNearestEven ) {
317         if ( roundingMode == float_round_to_zero ) {
318             roundIncrement = 0;
319         }
320         else {
321             roundIncrement = 0x7F;
322             if ( zSign ) {
323                 if ( roundingMode == float_round_up ) roundIncrement = 0;
324             }
325             else {
326                 if ( roundingMode == float_round_down ) roundIncrement = 0;
327             }
328         }
329     }
330     roundBits = zSig & 0x7F;
331     if ( 0xFD <= (bits16) zExp ) {
332         if (    ( 0xFD < zExp )
333              || (    ( zExp == 0xFD )
334                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
335            ) {
336             float_raise( float_flag_overflow | float_flag_inexact );
337             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
338         }
339         if ( zExp < 0 ) {
340             isTiny =
341                    ( float_detect_tininess == float_tininess_before_rounding )
342                 || ( zExp < -1 )
343                 || ( zSig + roundIncrement < 0x80000000U );
344             shift32RightJamming( zSig, - zExp, &zSig );
345             zExp = 0;
346             roundBits = zSig & 0x7F;
347             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
348         }
349     }
350     if ( roundBits ) float_exception_flags |= float_flag_inexact;
351     zSig = ( zSig + roundIncrement )>>7;
352     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
353     if ( zSig == 0 ) zExp = 0;
354     return packFloat32( zSign, zExp, zSig );
355 
356 }
357 
358 /*
359 -------------------------------------------------------------------------------
360 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
361 and significand `zSig', and returns the proper single-precision floating-
362 point value corresponding to the abstract input.  This routine is just like
363 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
364 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
365 floating-point exponent.
366 -------------------------------------------------------------------------------
367 */
368 static float32
369  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
370 {
371     int8 shiftCount;
372 
373     shiftCount = countLeadingZeros32( zSig ) - 1;
374     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
375 
376 }
377 
378 /*
379 -------------------------------------------------------------------------------
380 Returns the fraction bits of the double-precision floating-point value `a'.
381 -------------------------------------------------------------------------------
382 */
383 INLINE bits64 extractFloat64Frac( float64 a )
384 {
385 
386     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
387 
388 }
389 
390 /*
391 -------------------------------------------------------------------------------
392 Returns the exponent bits of the double-precision floating-point value `a'.
393 -------------------------------------------------------------------------------
394 */
395 INLINE int16 extractFloat64Exp( float64 a )
396 {
397 
398     return (int16)((FLOAT64_DEMANGLE(a) >> 52) & 0x7FF);
399 
400 }
401 
402 /*
403 -------------------------------------------------------------------------------
404 Returns the sign bit of the double-precision floating-point value `a'.
405 -------------------------------------------------------------------------------
406 */
407 INLINE flag extractFloat64Sign( float64 a )
408 {
409 
410     return (flag)(FLOAT64_DEMANGLE(a) >> 63);
411 
412 }
413 
414 /*
415 -------------------------------------------------------------------------------
416 Normalizes the subnormal double-precision floating-point value represented
417 by the denormalized significand `aSig'.  The normalized exponent and
418 significand are stored at the locations pointed to by `zExpPtr' and
419 `zSigPtr', respectively.
420 -------------------------------------------------------------------------------
421 */
422 static void
423  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
424 {
425     int8 shiftCount;
426 
427     shiftCount = countLeadingZeros64( aSig ) - 11;
428     *zSigPtr = aSig<<shiftCount;
429     *zExpPtr = 1 - shiftCount;
430 
431 }
432 
433 /*
434 -------------------------------------------------------------------------------
435 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
436 double-precision floating-point value, returning the result.  After being
437 shifted into the proper positions, the three fields are simply added
438 together to form the result.  This means that any integer portion of `zSig'
439 will be added into the exponent.  Since a properly normalized significand
440 will have an integer portion equal to 1, the `zExp' input should be 1 less
441 than the desired result exponent whenever `zSig' is a complete, normalized
442 significand.
443 -------------------------------------------------------------------------------
444 */
445 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
446 {
447 
448     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
449 			   ( ( (bits64) zExp )<<52 ) + zSig );
450 
451 }
452 
453 /*
454 -------------------------------------------------------------------------------
455 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
456 and significand `zSig', and returns the proper double-precision floating-
457 point value corresponding to the abstract input.  Ordinarily, the abstract
458 value is simply rounded and packed into the double-precision format, with
459 the inexact exception raised if the abstract input cannot be represented
460 exactly.  However, if the abstract value is too large, the overflow and
461 inexact exceptions are raised and an infinity or maximal finite value is
462 returned.  If the abstract value is too small, the input value is rounded to
463 a subnormal number, and the underflow and inexact exceptions are raised if
464 the abstract input cannot be represented exactly as a subnormal double-
465 precision floating-point number.
466     The input significand `zSig' has its binary point between bits 62
467 and 61, which is 10 bits to the left of the usual location.  This shifted
468 significand must be normalized or smaller.  If `zSig' is not normalized,
469 `zExp' must be 0; in that case, the result returned is a subnormal number,
470 and it must not require rounding.  In the usual case that `zSig' is
471 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
472 The handling of underflow and overflow follows the IEC/IEEE Standard for
473 Binary Floating-Point Arithmetic.
474 -------------------------------------------------------------------------------
475 */
476 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
477 {
478     int8 roundingMode;
479     flag roundNearestEven;
480     int16 roundIncrement, roundBits;
481     flag isTiny;
482 
483     roundingMode = float_rounding_mode;
484     roundNearestEven = ( roundingMode == float_round_nearest_even );
485     roundIncrement = 0x200;
486     if ( ! roundNearestEven ) {
487         if ( roundingMode == float_round_to_zero ) {
488             roundIncrement = 0;
489         }
490         else {
491             roundIncrement = 0x3FF;
492             if ( zSign ) {
493                 if ( roundingMode == float_round_up ) roundIncrement = 0;
494             }
495             else {
496                 if ( roundingMode == float_round_down ) roundIncrement = 0;
497             }
498         }
499     }
500     roundBits = (int16)(zSig & 0x3FF);
501     if ( 0x7FD <= (bits16) zExp ) {
502         if (    ( 0x7FD < zExp )
503              || (    ( zExp == 0x7FD )
504                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
505            ) {
506             float_raise( float_flag_overflow | float_flag_inexact );
507             return FLOAT64_MANGLE(
508 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
509 		( roundIncrement == 0 ));
510         }
511         if ( zExp < 0 ) {
512             isTiny =
513                    ( float_detect_tininess == float_tininess_before_rounding )
514                 || ( zExp < -1 )
515                 || ( zSig + roundIncrement < (bits64)LIT64( 0x8000000000000000 ) );
516             shift64RightJamming( zSig, - zExp, &zSig );
517             zExp = 0;
518             roundBits = (int16)(zSig & 0x3FF);
519             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
520         }
521     }
522     if ( roundBits ) float_exception_flags |= float_flag_inexact;
523     zSig = ( zSig + roundIncrement )>>10;
524     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
525     if ( zSig == 0 ) zExp = 0;
526     return packFloat64( zSign, zExp, zSig );
527 
528 }
529 
530 /*
531 -------------------------------------------------------------------------------
532 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
533 and significand `zSig', and returns the proper double-precision floating-
534 point value corresponding to the abstract input.  This routine is just like
535 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
536 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
537 floating-point exponent.
538 -------------------------------------------------------------------------------
539 */
540 static float64
541  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
542 {
543     int8 shiftCount;
544 
545     shiftCount = countLeadingZeros64( zSig ) - 1;
546     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
547 
548 }
549 
550 #ifdef FLOATX80
551 
552 /*
553 -------------------------------------------------------------------------------
554 Returns the fraction bits of the extended double-precision floating-point
555 value `a'.
556 -------------------------------------------------------------------------------
557 */
558 INLINE bits64 extractFloatx80Frac( floatx80 a )
559 {
560 
561     return a.low;
562 
563 }
564 
565 /*
566 -------------------------------------------------------------------------------
567 Returns the exponent bits of the extended double-precision floating-point
568 value `a'.
569 -------------------------------------------------------------------------------
570 */
571 INLINE int32 extractFloatx80Exp( floatx80 a )
572 {
573 
574     return a.high & 0x7FFF;
575 
576 }
577 
578 /*
579 -------------------------------------------------------------------------------
580 Returns the sign bit of the extended double-precision floating-point value
581 `a'.
582 -------------------------------------------------------------------------------
583 */
584 INLINE flag extractFloatx80Sign( floatx80 a )
585 {
586 
587     return a.high>>15;
588 
589 }
590 
591 /*
592 -------------------------------------------------------------------------------
593 Normalizes the subnormal extended double-precision floating-point value
594 represented by the denormalized significand `aSig'.  The normalized exponent
595 and significand are stored at the locations pointed to by `zExpPtr' and
596 `zSigPtr', respectively.
597 -------------------------------------------------------------------------------
598 */
599 static void
600  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
601 {
602     int8 shiftCount;
603 
604     shiftCount = countLeadingZeros64( aSig );
605     *zSigPtr = aSig<<shiftCount;
606     *zExpPtr = 1 - shiftCount;
607 
608 }
609 
610 /*
611 -------------------------------------------------------------------------------
612 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
613 extended double-precision floating-point value, returning the result.
614 -------------------------------------------------------------------------------
615 */
616 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
617 {
618     floatx80 z;
619 
620     z.low = zSig;
621     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
622     return z;
623 
624 }
625 
626 /*
627 -------------------------------------------------------------------------------
628 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
629 and extended significand formed by the concatenation of `zSig0' and `zSig1',
630 and returns the proper extended double-precision floating-point value
631 corresponding to the abstract input.  Ordinarily, the abstract value is
632 rounded and packed into the extended double-precision format, with the
633 inexact exception raised if the abstract input cannot be represented
634 exactly.  However, if the abstract value is too large, the overflow and
635 inexact exceptions are raised and an infinity or maximal finite value is
636 returned.  If the abstract value is too small, the input value is rounded to
637 a subnormal number, and the underflow and inexact exceptions are raised if
638 the abstract input cannot be represented exactly as a subnormal extended
639 double-precision floating-point number.
640     If `roundingPrecision' is 32 or 64, the result is rounded to the same
641 number of bits as single or double precision, respectively.  Otherwise, the
642 result is rounded to the full precision of the extended double-precision
643 format.
644     The input significand must be normalized or smaller.  If the input
645 significand is not normalized, `zExp' must be 0; in that case, the result
646 returned is a subnormal number, and it must not require rounding.  The
647 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
648 Floating-Point Arithmetic.
649 -------------------------------------------------------------------------------
650 */
651 static floatx80
652  roundAndPackFloatx80(
653      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
654  )
655 {
656     int8 roundingMode;
657     flag roundNearestEven, increment, isTiny;
658     int64 roundIncrement, roundMask, roundBits;
659 
660     roundingMode = float_rounding_mode;
661     roundNearestEven = ( roundingMode == float_round_nearest_even );
662     if ( roundingPrecision == 80 ) goto precision80;
663     if ( roundingPrecision == 64 ) {
664         roundIncrement = LIT64( 0x0000000000000400 );
665         roundMask = LIT64( 0x00000000000007FF );
666     }
667     else if ( roundingPrecision == 32 ) {
668         roundIncrement = LIT64( 0x0000008000000000 );
669         roundMask = LIT64( 0x000000FFFFFFFFFF );
670     }
671     else {
672         goto precision80;
673     }
674     zSig0 |= ( zSig1 != 0 );
675     if ( ! roundNearestEven ) {
676         if ( roundingMode == float_round_to_zero ) {
677             roundIncrement = 0;
678         }
679         else {
680             roundIncrement = roundMask;
681             if ( zSign ) {
682                 if ( roundingMode == float_round_up ) roundIncrement = 0;
683             }
684             else {
685                 if ( roundingMode == float_round_down ) roundIncrement = 0;
686             }
687         }
688     }
689     roundBits = zSig0 & roundMask;
690     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691         if (    ( 0x7FFE < zExp )
692              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
693            ) {
694             goto overflow;
695         }
696         if ( zExp <= 0 ) {
697             isTiny =
698                    ( float_detect_tininess == float_tininess_before_rounding )
699                 || ( zExp < 0 )
700                 || ( zSig0 <= zSig0 + roundIncrement );
701             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
702             zExp = 0;
703             roundBits = zSig0 & roundMask;
704             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
705             if ( roundBits ) float_exception_flags |= float_flag_inexact;
706             zSig0 += roundIncrement;
707             if ( (sbits64) zSig0 < 0 ) zExp = 1;
708             roundIncrement = roundMask + 1;
709             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
710                 roundMask |= roundIncrement;
711             }
712             zSig0 &= ~ roundMask;
713             return packFloatx80( zSign, zExp, zSig0 );
714         }
715     }
716     if ( roundBits ) float_exception_flags |= float_flag_inexact;
717     zSig0 += roundIncrement;
718     if ( zSig0 < roundIncrement ) {
719         ++zExp;
720         zSig0 = LIT64( 0x8000000000000000 );
721     }
722     roundIncrement = roundMask + 1;
723     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
724         roundMask |= roundIncrement;
725     }
726     zSig0 &= ~ roundMask;
727     if ( zSig0 == 0 ) zExp = 0;
728     return packFloatx80( zSign, zExp, zSig0 );
729  precision80:
730     increment = ( (sbits64) zSig1 < 0 );
731     if ( ! roundNearestEven ) {
732         if ( roundingMode == float_round_to_zero ) {
733             increment = 0;
734         }
735         else {
736             if ( zSign ) {
737                 increment = ( roundingMode == float_round_down ) && zSig1;
738             }
739             else {
740                 increment = ( roundingMode == float_round_up ) && zSig1;
741             }
742         }
743     }
744     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
745         if (    ( 0x7FFE < zExp )
746              || (    ( zExp == 0x7FFE )
747                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
748                   && increment
749                 )
750            ) {
751             roundMask = 0;
752  overflow:
753             float_raise( float_flag_overflow | float_flag_inexact );
754             if (    ( roundingMode == float_round_to_zero )
755                  || ( zSign && ( roundingMode == float_round_up ) )
756                  || ( ! zSign && ( roundingMode == float_round_down ) )
757                ) {
758                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
759             }
760             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
761         }
762         if ( zExp <= 0 ) {
763             isTiny =
764                    ( float_detect_tininess == float_tininess_before_rounding )
765                 || ( zExp < 0 )
766                 || ! increment
767                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
768             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
769             zExp = 0;
770             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
771             if ( zSig1 ) float_exception_flags |= float_flag_inexact;
772             if ( roundNearestEven ) {
773                 increment = ( (sbits64) zSig1 < 0 );
774             }
775             else {
776                 if ( zSign ) {
777                     increment = ( roundingMode == float_round_down ) && zSig1;
778                 }
779                 else {
780                     increment = ( roundingMode == float_round_up ) && zSig1;
781                 }
782             }
783             if ( increment ) {
784                 ++zSig0;
785                 zSig0 &=
786                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
787                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
788             }
789             return packFloatx80( zSign, zExp, zSig0 );
790         }
791     }
792     if ( zSig1 ) float_exception_flags |= float_flag_inexact;
793     if ( increment ) {
794         ++zSig0;
795         if ( zSig0 == 0 ) {
796             ++zExp;
797             zSig0 = LIT64( 0x8000000000000000 );
798         }
799         else {
800             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
801         }
802     }
803     else {
804         if ( zSig0 == 0 ) zExp = 0;
805     }
806     return packFloatx80( zSign, zExp, zSig0 );
807 
808 }
809 
810 /*
811 -------------------------------------------------------------------------------
812 Takes an abstract floating-point value having sign `zSign', exponent
813 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
814 and returns the proper extended double-precision floating-point value
815 corresponding to the abstract input.  This routine is just like
816 `roundAndPackFloatx80' except that the input significand does not have to be
817 normalized.
818 -------------------------------------------------------------------------------
819 */
820 static floatx80
821  normalizeRoundAndPackFloatx80(
822      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
823  )
824 {
825     int8 shiftCount;
826 
827     if ( zSig0 == 0 ) {
828         zSig0 = zSig1;
829         zSig1 = 0;
830         zExp -= 64;
831     }
832     shiftCount = countLeadingZeros64( zSig0 );
833     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
834     zExp -= shiftCount;
835     return
836         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
837 
838 }
839 
840 #endif
841 
842 #ifdef FLOAT128
843 
844 /*
845 -------------------------------------------------------------------------------
846 Returns the least-significant 64 fraction bits of the quadruple-precision
847 floating-point value `a'.
848 -------------------------------------------------------------------------------
849 */
850 INLINE bits64 extractFloat128Frac1( float128 a )
851 {
852 
853     return a.low;
854 
855 }
856 
857 /*
858 -------------------------------------------------------------------------------
859 Returns the most-significant 48 fraction bits of the quadruple-precision
860 floating-point value `a'.
861 -------------------------------------------------------------------------------
862 */
863 INLINE bits64 extractFloat128Frac0( float128 a )
864 {
865 
866     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
867 
868 }
869 
870 /*
871 -------------------------------------------------------------------------------
872 Returns the exponent bits of the quadruple-precision floating-point value
873 `a'.
874 -------------------------------------------------------------------------------
875 */
876 INLINE int32 extractFloat128Exp( float128 a )
877 {
878 
879     return (int32)((a.high >> 48) & 0x7FFF);
880 
881 }
882 
883 /*
884 -------------------------------------------------------------------------------
885 Returns the sign bit of the quadruple-precision floating-point value `a'.
886 -------------------------------------------------------------------------------
887 */
888 INLINE flag extractFloat128Sign( float128 a )
889 {
890 
891     return (flag)(a.high >> 63);
892 
893 }
894 
895 /*
896 -------------------------------------------------------------------------------
897 Normalizes the subnormal quadruple-precision floating-point value
898 represented by the denormalized significand formed by the concatenation of
899 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
900 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
901 significand are stored at the location pointed to by `zSig0Ptr', and the
902 least significant 64 bits of the normalized significand are stored at the
903 location pointed to by `zSig1Ptr'.
904 -------------------------------------------------------------------------------
905 */
906 static void
907  normalizeFloat128Subnormal(
908      bits64 aSig0,
909      bits64 aSig1,
910      int32 *zExpPtr,
911      bits64 *zSig0Ptr,
912      bits64 *zSig1Ptr
913  )
914 {
915     int8 shiftCount;
916 
917     if ( aSig0 == 0 ) {
918         shiftCount = countLeadingZeros64( aSig1 ) - 15;
919         if ( shiftCount < 0 ) {
920             *zSig0Ptr = aSig1>>( - shiftCount );
921             *zSig1Ptr = aSig1<<( shiftCount & 63 );
922         }
923         else {
924             *zSig0Ptr = aSig1<<shiftCount;
925             *zSig1Ptr = 0;
926         }
927         *zExpPtr = - shiftCount - 63;
928     }
929     else {
930         shiftCount = countLeadingZeros64( aSig0 ) - 15;
931         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
932         *zExpPtr = 1 - shiftCount;
933     }
934 
935 }
936 
937 /*
938 -------------------------------------------------------------------------------
939 Packs the sign `zSign', the exponent `zExp', and the significand formed
940 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
941 floating-point value, returning the result.  After being shifted into the
942 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
943 added together to form the most significant 32 bits of the result.  This
944 means that any integer portion of `zSig0' will be added into the exponent.
945 Since a properly normalized significand will have an integer portion equal
946 to 1, the `zExp' input should be 1 less than the desired result exponent
947 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
948 significand.
949 -------------------------------------------------------------------------------
950 */
951 INLINE float128
952  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
953 {
954     float128 z;
955 
956     z.low = zSig1;
957     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
958     return z;
959 
960 }
961 
962 /*
963 -------------------------------------------------------------------------------
964 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
965 and extended significand formed by the concatenation of `zSig0', `zSig1',
966 and `zSig2', and returns the proper quadruple-precision floating-point value
967 corresponding to the abstract input.  Ordinarily, the abstract value is
968 simply rounded and packed into the quadruple-precision format, with the
969 inexact exception raised if the abstract input cannot be represented
970 exactly.  However, if the abstract value is too large, the overflow and
971 inexact exceptions are raised and an infinity or maximal finite value is
972 returned.  If the abstract value is too small, the input value is rounded to
973 a subnormal number, and the underflow and inexact exceptions are raised if
974 the abstract input cannot be represented exactly as a subnormal quadruple-
975 precision floating-point number.
976     The input significand must be normalized or smaller.  If the input
977 significand is not normalized, `zExp' must be 0; in that case, the result
978 returned is a subnormal number, and it must not require rounding.  In the
979 usual case that the input significand is normalized, `zExp' must be 1 less
980 than the ``true'' floating-point exponent.  The handling of underflow and
981 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
982 -------------------------------------------------------------------------------
983 */
984 static float128
985  roundAndPackFloat128(
986      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
987 {
988     int8 roundingMode;
989     flag roundNearestEven, increment, isTiny;
990 
991     roundingMode = float_rounding_mode;
992     roundNearestEven = ( roundingMode == float_round_nearest_even );
993     increment = ( (sbits64) zSig2 < 0 );
994     if ( ! roundNearestEven ) {
995         if ( roundingMode == float_round_to_zero ) {
996             increment = 0;
997         }
998         else {
999             if ( zSign ) {
1000                 increment = ( roundingMode == float_round_down ) && zSig2;
1001             }
1002             else {
1003                 increment = ( roundingMode == float_round_up ) && zSig2;
1004             }
1005         }
1006     }
1007     if ( 0x7FFD <= (bits32) zExp ) {
1008         if (    ( 0x7FFD < zExp )
1009              || (    ( zExp == 0x7FFD )
1010                   && eq128(
1011                          LIT64( 0x0001FFFFFFFFFFFF ),
1012                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1013                          zSig0,
1014                          zSig1
1015                      )
1016                   && increment
1017                 )
1018            ) {
1019             float_raise( float_flag_overflow | float_flag_inexact );
1020             if (    ( roundingMode == float_round_to_zero )
1021                  || ( zSign && ( roundingMode == float_round_up ) )
1022                  || ( ! zSign && ( roundingMode == float_round_down ) )
1023                ) {
1024                 return
1025                     packFloat128(
1026                         zSign,
1027                         0x7FFE,
1028                         LIT64( 0x0000FFFFFFFFFFFF ),
1029                         LIT64( 0xFFFFFFFFFFFFFFFF )
1030                     );
1031             }
1032             return packFloat128( zSign, 0x7FFF, 0, 0 );
1033         }
1034         if ( zExp < 0 ) {
1035             isTiny =
1036                    ( float_detect_tininess == float_tininess_before_rounding )
1037                 || ( zExp < -1 )
1038                 || ! increment
1039                 || lt128(
1040                        zSig0,
1041                        zSig1,
1042                        LIT64( 0x0001FFFFFFFFFFFF ),
1043                        LIT64( 0xFFFFFFFFFFFFFFFF )
1044                    );
1045             shift128ExtraRightJamming(
1046                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1047             zExp = 0;
1048             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1049             if ( roundNearestEven ) {
1050                 increment = ( (sbits64) zSig2 < 0 );
1051             }
1052             else {
1053                 if ( zSign ) {
1054                     increment = ( roundingMode == float_round_down ) && zSig2;
1055                 }
1056                 else {
1057                     increment = ( roundingMode == float_round_up ) && zSig2;
1058                 }
1059             }
1060         }
1061     }
1062     if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1063     if ( increment ) {
1064         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1065         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1066     }
1067     else {
1068         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1069     }
1070     return packFloat128( zSign, zExp, zSig0, zSig1 );
1071 
1072 }
1073 
1074 /*
1075 -------------------------------------------------------------------------------
1076 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1077 and significand formed by the concatenation of `zSig0' and `zSig1', and
1078 returns the proper quadruple-precision floating-point value corresponding
1079 to the abstract input.  This routine is just like `roundAndPackFloat128'
1080 except that the input significand has fewer bits and does not have to be
1081 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1082 point exponent.
1083 -------------------------------------------------------------------------------
1084 */
1085 static float128
1086  normalizeRoundAndPackFloat128(
1087      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1088 {
1089     int8 shiftCount;
1090     bits64 zSig2;
1091 
1092     if ( zSig0 == 0 ) {
1093         zSig0 = zSig1;
1094         zSig1 = 0;
1095         zExp -= 64;
1096     }
1097     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1098     if ( 0 <= shiftCount ) {
1099         zSig2 = 0;
1100         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1101     }
1102     else {
1103         shift128ExtraRightJamming(
1104             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1105     }
1106     zExp -= shiftCount;
1107     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1108 
1109 }
1110 
1111 #endif
1112 
1113 /*
1114 -------------------------------------------------------------------------------
1115 Returns the result of converting the 32-bit two's complement integer `a'
1116 to the single-precision floating-point format.  The conversion is performed
1117 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1118 -------------------------------------------------------------------------------
1119 */
1120 float32 int32_to_float32( int32 a )
1121 {
1122     flag zSign;
1123 
1124     if ( a == 0 ) return 0;
1125     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1126     zSign = ( a < 0 );
1127     return normalizeRoundAndPackFloat32(zSign, 0x9C, (uint32)(zSign ? - a : a));
1128 
1129 }
1130 
1131 float32 uint32_to_float32( uint32 a )
1132 {
1133     if ( a == 0 ) return 0;
1134     if ( a & (bits32) 0x80000000 )
1135 	return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1136     return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1137 }
1138 
1139 
1140 /*
1141 -------------------------------------------------------------------------------
1142 Returns the result of converting the 32-bit two's complement integer `a'
1143 to the double-precision floating-point format.  The conversion is performed
1144 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1145 -------------------------------------------------------------------------------
1146 */
1147 float64 int32_to_float64( int32 a )
1148 {
1149     flag zSign;
1150     uint32 absA;
1151     int8 shiftCount;
1152     bits64 zSig;
1153 
1154     if ( a == 0 ) return 0;
1155     zSign = ( a < 0 );
1156     absA = zSign ? - a : a;
1157     shiftCount = countLeadingZeros32( absA ) + 21;
1158     zSig = absA;
1159     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1160 
1161 }
1162 
1163 float64 uint32_to_float64( uint32 a )
1164 {
1165     int8 shiftCount;
1166     bits64 zSig = a;
1167 
1168     if ( a == 0 ) return 0;
1169     shiftCount = countLeadingZeros32( a ) + 21;
1170     return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1171 
1172 }
1173 
1174 #ifdef FLOATX80
1175 
1176 /*
1177 -------------------------------------------------------------------------------
1178 Returns the result of converting the 32-bit two's complement integer `a'
1179 to the extended double-precision floating-point format.  The conversion
1180 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1181 Arithmetic.
1182 -------------------------------------------------------------------------------
1183 */
1184 floatx80 int32_to_floatx80( int32 a )
1185 {
1186     flag zSign;
1187     uint32 absA;
1188     int8 shiftCount;
1189     bits64 zSig;
1190 
1191     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1192     zSign = ( a < 0 );
1193     absA = zSign ? - a : a;
1194     shiftCount = countLeadingZeros32( absA ) + 32;
1195     zSig = absA;
1196     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1197 
1198 }
1199 
1200 floatx80 uint32_to_floatx80( uint32 a )
1201 {
1202     int8 shiftCount;
1203     bits64 zSig = a;
1204 
1205     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1206     shiftCount = countLeadingZeros32( a ) + 32;
1207     return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1208 
1209 }
1210 
1211 #endif
1212 
1213 #ifdef FLOAT128
1214 
1215 /*
1216 -------------------------------------------------------------------------------
1217 Returns the result of converting the 32-bit two's complement integer `a' to
1218 the quadruple-precision floating-point format.  The conversion is performed
1219 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1220 -------------------------------------------------------------------------------
1221 */
1222 float128 int32_to_float128( int32 a )
1223 {
1224     flag zSign;
1225     uint32 absA;
1226     int8 shiftCount;
1227     bits64 zSig0;
1228 
1229     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1230     zSign = ( a < 0 );
1231     absA = zSign ? - a : a;
1232     shiftCount = countLeadingZeros32( absA ) + 17;
1233     zSig0 = absA;
1234     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1235 
1236 }
1237 
1238 float128 uint32_to_float128( uint32 a )
1239 {
1240     int8 shiftCount;
1241     bits64 zSig0 = a;
1242 
1243     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1244     shiftCount = countLeadingZeros32( a ) + 17;
1245     return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1246 
1247 }
1248 
1249 #endif
1250 
1251 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1252 /*
1253 -------------------------------------------------------------------------------
1254 Returns the result of converting the 64-bit two's complement integer `a'
1255 to the single-precision floating-point format.  The conversion is performed
1256 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1257 -------------------------------------------------------------------------------
1258 */
1259 float32 int64_to_float32( int64 a )
1260 {
1261     flag zSign;
1262     uint64 absA;
1263     int8 shiftCount;
1264 
1265     if ( a == 0 ) return 0;
1266     zSign = ( a < 0 );
1267     absA = zSign ? - a : a;
1268     shiftCount = countLeadingZeros64( absA ) - 40;
1269     if ( 0 <= shiftCount ) {
1270         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1271     }
1272     else {
1273         shiftCount += 7;
1274         if ( shiftCount < 0 ) {
1275             shift64RightJamming( absA, - shiftCount, &absA );
1276         }
1277         else {
1278             absA <<= shiftCount;
1279         }
1280         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1281     }
1282 
1283 }
1284 
1285 /*
1286 -------------------------------------------------------------------------------
1287 Returns the result of converting the 64-bit two's complement integer `a'
1288 to the double-precision floating-point format.  The conversion is performed
1289 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1290 -------------------------------------------------------------------------------
1291 */
1292 float64 int64_to_float64( int64 a )
1293 {
1294     flag zSign;
1295 
1296     if ( a == 0 ) return 0;
1297     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1298         return packFloat64( 1, 0x43E, 0 );
1299     }
1300     zSign = ( a < 0 );
1301     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1302 
1303 }
1304 
1305 #ifdef FLOATX80
1306 
1307 /*
1308 -------------------------------------------------------------------------------
1309 Returns the result of converting the 64-bit two's complement integer `a'
1310 to the extended double-precision floating-point format.  The conversion
1311 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1312 Arithmetic.
1313 -------------------------------------------------------------------------------
1314 */
1315 floatx80 int64_to_floatx80( int64 a )
1316 {
1317     flag zSign;
1318     uint64 absA;
1319     int8 shiftCount;
1320 
1321     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1322     zSign = ( a < 0 );
1323     absA = zSign ? - a : a;
1324     shiftCount = countLeadingZeros64( absA );
1325     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1326 
1327 }
1328 
1329 #endif
1330 
1331 #endif /* !SOFTFLOAT_FOR_GCC */
1332 
1333 #ifdef FLOAT128
1334 
1335 /*
1336 -------------------------------------------------------------------------------
1337 Returns the result of converting the 64-bit two's complement integer `a' to
1338 the quadruple-precision floating-point format.  The conversion is performed
1339 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1340 -------------------------------------------------------------------------------
1341 */
1342 float128 int64_to_float128( int64 a )
1343 {
1344     flag zSign;
1345     uint64 absA;
1346     int8 shiftCount;
1347     int32 zExp;
1348     bits64 zSig0, zSig1;
1349 
1350     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1351     zSign = ( a < 0 );
1352     absA = zSign ? - a : a;
1353     shiftCount = countLeadingZeros64( absA ) + 49;
1354     zExp = 0x406E - shiftCount;
1355     if ( 64 <= shiftCount ) {
1356         zSig1 = 0;
1357         zSig0 = absA;
1358         shiftCount -= 64;
1359     }
1360     else {
1361         zSig1 = absA;
1362         zSig0 = 0;
1363     }
1364     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1365     return packFloat128( zSign, zExp, zSig0, zSig1 );
1366 
1367 }
1368 
1369 #endif
1370 
1371 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1372 /*
1373 -------------------------------------------------------------------------------
1374 Returns the result of converting the single-precision floating-point value
1375 `a' to the 32-bit two's complement integer format.  The conversion is
1376 performed according to the IEC/IEEE Standard for Binary Floating-Point
1377 Arithmetic---which means in particular that the conversion is rounded
1378 according to the current rounding mode.  If `a' is a NaN, the largest
1379 positive integer is returned.  Otherwise, if the conversion overflows, the
1380 largest integer with the same sign as `a' is returned.
1381 -------------------------------------------------------------------------------
1382 */
1383 int32 float32_to_int32( float32 a )
1384 {
1385     flag aSign;
1386     int16 aExp, shiftCount;
1387     bits32 aSig;
1388     bits64 aSig64;
1389 
1390     aSig = extractFloat32Frac( a );
1391     aExp = extractFloat32Exp( a );
1392     aSign = extractFloat32Sign( a );
1393     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1394     if ( aExp ) aSig |= 0x00800000;
1395     shiftCount = 0xAF - aExp;
1396     aSig64 = aSig;
1397     aSig64 <<= 32;
1398     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1399     return roundAndPackInt32( aSign, aSig64 );
1400 
1401 }
1402 #endif /* !SOFTFLOAT_FOR_GCC */
1403 
1404 /*
1405 -------------------------------------------------------------------------------
1406 Returns the result of converting the single-precision floating-point value
1407 `a' to the 32-bit two's complement integer format.  The conversion is
1408 performed according to the IEC/IEEE Standard for Binary Floating-Point
1409 Arithmetic, except that the conversion is always rounded toward zero.
1410 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1411 the conversion overflows, the largest integer with the same sign as `a' is
1412 returned.
1413 -------------------------------------------------------------------------------
1414 */
1415 int32 float32_to_int32_round_to_zero( float32 a )
1416 {
1417     flag aSign;
1418     int16 aExp, shiftCount;
1419     bits32 aSig;
1420     int32 z;
1421 
1422     aSig = extractFloat32Frac( a );
1423     aExp = extractFloat32Exp( a );
1424     aSign = extractFloat32Sign( a );
1425     shiftCount = aExp - 0x9E;
1426     if ( 0 <= shiftCount ) {
1427         if ( a != 0xCF000000 ) {
1428             float_raise( float_flag_invalid );
1429             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1430         }
1431         return (sbits32) 0x80000000;
1432     }
1433     else if ( aExp <= 0x7E ) {
1434         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1435         return 0;
1436     }
1437     aSig = ( aSig | 0x00800000 )<<8;
1438     z = aSig>>( - shiftCount );
1439     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1440         float_exception_flags |= float_flag_inexact;
1441     }
1442     if ( aSign ) z = - z;
1443     return z;
1444 
1445 }
1446 
1447 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1448 /*
1449 -------------------------------------------------------------------------------
1450 Returns the result of converting the single-precision floating-point value
1451 `a' to the 64-bit two's complement integer format.  The conversion is
1452 performed according to the IEC/IEEE Standard for Binary Floating-Point
1453 Arithmetic---which means in particular that the conversion is rounded
1454 according to the current rounding mode.  If `a' is a NaN, the largest
1455 positive integer is returned.  Otherwise, if the conversion overflows, the
1456 largest integer with the same sign as `a' is returned.
1457 -------------------------------------------------------------------------------
1458 */
1459 int64 float32_to_int64( float32 a )
1460 {
1461     flag aSign;
1462     int16 aExp, shiftCount;
1463     bits32 aSig;
1464     bits64 aSig64, aSigExtra;
1465 
1466     aSig = extractFloat32Frac( a );
1467     aExp = extractFloat32Exp( a );
1468     aSign = extractFloat32Sign( a );
1469     shiftCount = 0xBE - aExp;
1470     if ( shiftCount < 0 ) {
1471         float_raise( float_flag_invalid );
1472         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1473             return LIT64( 0x7FFFFFFFFFFFFFFF );
1474         }
1475         return (sbits64) LIT64( 0x8000000000000000 );
1476     }
1477     if ( aExp ) aSig |= 0x00800000;
1478     aSig64 = aSig;
1479     aSig64 <<= 40;
1480     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1481     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1482 
1483 }
1484 
1485 /*
1486 -------------------------------------------------------------------------------
1487 Returns the result of converting the single-precision floating-point value
1488 `a' to the 64-bit two's complement integer format.  The conversion is
1489 performed according to the IEC/IEEE Standard for Binary Floating-Point
1490 Arithmetic, except that the conversion is always rounded toward zero.  If
1491 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1492 conversion overflows, the largest integer with the same sign as `a' is
1493 returned.
1494 -------------------------------------------------------------------------------
1495 */
1496 int64 float32_to_int64_round_to_zero( float32 a )
1497 {
1498     flag aSign;
1499     int16 aExp, shiftCount;
1500     bits32 aSig;
1501     bits64 aSig64;
1502     int64 z;
1503 
1504     aSig = extractFloat32Frac( a );
1505     aExp = extractFloat32Exp( a );
1506     aSign = extractFloat32Sign( a );
1507     shiftCount = aExp - 0xBE;
1508     if ( 0 <= shiftCount ) {
1509         if ( a != 0xDF000000 ) {
1510             float_raise( float_flag_invalid );
1511             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1512                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1513             }
1514         }
1515         return (sbits64) LIT64( 0x8000000000000000 );
1516     }
1517     else if ( aExp <= 0x7E ) {
1518         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1519         return 0;
1520     }
1521     aSig64 = aSig | 0x00800000;
1522     aSig64 <<= 40;
1523     z = aSig64>>( - shiftCount );
1524     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1525         float_exception_flags |= float_flag_inexact;
1526     }
1527     if ( aSign ) z = - z;
1528     return z;
1529 
1530 }
1531 #endif /* !SOFTFLOAT_FOR_GCC */
1532 
1533 /*
1534 -------------------------------------------------------------------------------
1535 Returns the result of converting the single-precision floating-point value
1536 `a' to the double-precision floating-point format.  The conversion is
1537 performed according to the IEC/IEEE Standard for Binary Floating-Point
1538 Arithmetic.
1539 -------------------------------------------------------------------------------
1540 */
1541 float64 float32_to_float64( float32 a )
1542 {
1543     flag aSign;
1544     int16 aExp;
1545     bits32 aSig;
1546 
1547     aSig = extractFloat32Frac( a );
1548     aExp = extractFloat32Exp( a );
1549     aSign = extractFloat32Sign( a );
1550     if ( aExp == 0xFF ) {
1551         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1552         return packFloat64( aSign, 0x7FF, 0 );
1553     }
1554     if ( aExp == 0 ) {
1555         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1556         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1557         --aExp;
1558     }
1559     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1560 
1561 }
1562 
1563 #ifdef FLOATX80
1564 
1565 /*
1566 -------------------------------------------------------------------------------
1567 Returns the result of converting the single-precision floating-point value
1568 `a' to the extended double-precision floating-point format.  The conversion
1569 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1570 Arithmetic.
1571 -------------------------------------------------------------------------------
1572 */
1573 floatx80 float32_to_floatx80( float32 a )
1574 {
1575     flag aSign;
1576     int16 aExp;
1577     bits32 aSig;
1578 
1579     aSig = extractFloat32Frac( a );
1580     aExp = extractFloat32Exp( a );
1581     aSign = extractFloat32Sign( a );
1582     if ( aExp == 0xFF ) {
1583         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1584         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1585     }
1586     if ( aExp == 0 ) {
1587         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1588         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1589     }
1590     aSig |= 0x00800000;
1591     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1592 
1593 }
1594 
1595 #endif
1596 
1597 #ifdef FLOAT128
1598 
1599 /*
1600 -------------------------------------------------------------------------------
1601 Returns the result of converting the single-precision floating-point value
1602 `a' to the double-precision floating-point format.  The conversion is
1603 performed according to the IEC/IEEE Standard for Binary Floating-Point
1604 Arithmetic.
1605 -------------------------------------------------------------------------------
1606 */
1607 float128 float32_to_float128( float32 a )
1608 {
1609     flag aSign;
1610     int16 aExp;
1611     bits32 aSig;
1612 
1613     aSig = extractFloat32Frac( a );
1614     aExp = extractFloat32Exp( a );
1615     aSign = extractFloat32Sign( a );
1616     if ( aExp == 0xFF ) {
1617         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1618         return packFloat128( aSign, 0x7FFF, 0, 0 );
1619     }
1620     if ( aExp == 0 ) {
1621         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1622         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1623         --aExp;
1624     }
1625     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1626 
1627 }
1628 
1629 #endif
1630 
1631 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1632 /*
1633 -------------------------------------------------------------------------------
1634 Rounds the single-precision floating-point value `a' to an integer, and
1635 returns the result as a single-precision floating-point value.  The
1636 operation is performed according to the IEC/IEEE Standard for Binary
1637 Floating-Point Arithmetic.
1638 -------------------------------------------------------------------------------
1639 */
1640 float32 float32_round_to_int( float32 a )
1641 {
1642     flag aSign;
1643     int16 aExp;
1644     bits32 lastBitMask, roundBitsMask;
1645     int8 roundingMode;
1646     float32 z;
1647 
1648     aExp = extractFloat32Exp( a );
1649     if ( 0x96 <= aExp ) {
1650         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1651             return propagateFloat32NaN( a, a );
1652         }
1653         return a;
1654     }
1655     if ( aExp <= 0x7E ) {
1656         if ( (bits32) ( a<<1 ) == 0 ) return a;
1657         float_exception_flags |= float_flag_inexact;
1658         aSign = extractFloat32Sign( a );
1659         switch ( float_rounding_mode ) {
1660          case float_round_nearest_even:
1661             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1662                 return packFloat32( aSign, 0x7F, 0 );
1663             }
1664             break;
1665 	 case float_round_to_zero:
1666 	    break;
1667          case float_round_down:
1668             return aSign ? 0xBF800000 : 0;
1669          case float_round_up:
1670             return aSign ? 0x80000000 : 0x3F800000;
1671         }
1672         return packFloat32( aSign, 0, 0 );
1673     }
1674     lastBitMask = 1;
1675     lastBitMask <<= 0x96 - aExp;
1676     roundBitsMask = lastBitMask - 1;
1677     z = a;
1678     roundingMode = float_rounding_mode;
1679     if ( roundingMode == float_round_nearest_even ) {
1680         z += lastBitMask>>1;
1681         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1682     }
1683     else if ( roundingMode != float_round_to_zero ) {
1684         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1685             z += roundBitsMask;
1686         }
1687     }
1688     z &= ~ roundBitsMask;
1689     if ( z != a ) float_exception_flags |= float_flag_inexact;
1690     return z;
1691 
1692 }
1693 #endif /* !SOFTFLOAT_FOR_GCC */
1694 
1695 /*
1696 -------------------------------------------------------------------------------
1697 Returns the result of adding the absolute values of the single-precision
1698 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1699 before being returned.  `zSign' is ignored if the result is a NaN.
1700 The addition is performed according to the IEC/IEEE Standard for Binary
1701 Floating-Point Arithmetic.
1702 -------------------------------------------------------------------------------
1703 */
1704 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1705 {
1706     int16 aExp, bExp, zExp;
1707     bits32 aSig, bSig, zSig;
1708     int16 expDiff;
1709 
1710     aSig = extractFloat32Frac( a );
1711     aExp = extractFloat32Exp( a );
1712     bSig = extractFloat32Frac( b );
1713     bExp = extractFloat32Exp( b );
1714     expDiff = aExp - bExp;
1715     aSig <<= 6;
1716     bSig <<= 6;
1717     if ( 0 < expDiff ) {
1718         if ( aExp == 0xFF ) {
1719             if ( aSig ) return propagateFloat32NaN( a, b );
1720             return a;
1721         }
1722         if ( bExp == 0 ) {
1723             --expDiff;
1724         }
1725         else {
1726             bSig |= 0x20000000;
1727         }
1728         shift32RightJamming( bSig, expDiff, &bSig );
1729         zExp = aExp;
1730     }
1731     else if ( expDiff < 0 ) {
1732         if ( bExp == 0xFF ) {
1733             if ( bSig ) return propagateFloat32NaN( a, b );
1734             return packFloat32( zSign, 0xFF, 0 );
1735         }
1736         if ( aExp == 0 ) {
1737             ++expDiff;
1738         }
1739         else {
1740             aSig |= 0x20000000;
1741         }
1742         shift32RightJamming( aSig, - expDiff, &aSig );
1743         zExp = bExp;
1744     }
1745     else {
1746         if ( aExp == 0xFF ) {
1747             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1748             return a;
1749         }
1750         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1751         zSig = 0x40000000 + aSig + bSig;
1752         zExp = aExp;
1753         goto roundAndPack;
1754     }
1755     aSig |= 0x20000000;
1756     zSig = ( aSig + bSig )<<1;
1757     --zExp;
1758     if ( (sbits32) zSig < 0 ) {
1759         zSig = aSig + bSig;
1760         ++zExp;
1761     }
1762  roundAndPack:
1763     return roundAndPackFloat32( zSign, zExp, zSig );
1764 
1765 }
1766 
1767 /*
1768 -------------------------------------------------------------------------------
1769 Returns the result of subtracting the absolute values of the single-
1770 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1771 difference is negated before being returned.  `zSign' is ignored if the
1772 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1773 Standard for Binary Floating-Point Arithmetic.
1774 -------------------------------------------------------------------------------
1775 */
1776 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1777 {
1778     int16 aExp, bExp, zExp;
1779     bits32 aSig, bSig, zSig;
1780     int16 expDiff;
1781 
1782     aSig = extractFloat32Frac( a );
1783     aExp = extractFloat32Exp( a );
1784     bSig = extractFloat32Frac( b );
1785     bExp = extractFloat32Exp( b );
1786     expDiff = aExp - bExp;
1787     aSig <<= 7;
1788     bSig <<= 7;
1789     if ( 0 < expDiff ) goto aExpBigger;
1790     if ( expDiff < 0 ) goto bExpBigger;
1791     if ( aExp == 0xFF ) {
1792         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1793         float_raise( float_flag_invalid );
1794         return float32_default_nan;
1795     }
1796     if ( aExp == 0 ) {
1797         aExp = 1;
1798         bExp = 1;
1799     }
1800     if ( bSig < aSig ) goto aBigger;
1801     if ( aSig < bSig ) goto bBigger;
1802     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1803  bExpBigger:
1804     if ( bExp == 0xFF ) {
1805         if ( bSig ) return propagateFloat32NaN( a, b );
1806         return packFloat32( zSign ^ 1, 0xFF, 0 );
1807     }
1808     if ( aExp == 0 ) {
1809         ++expDiff;
1810     }
1811     else {
1812         aSig |= 0x40000000;
1813     }
1814     shift32RightJamming( aSig, - expDiff, &aSig );
1815     bSig |= 0x40000000;
1816  bBigger:
1817     zSig = bSig - aSig;
1818     zExp = bExp;
1819     zSign ^= 1;
1820     goto normalizeRoundAndPack;
1821  aExpBigger:
1822     if ( aExp == 0xFF ) {
1823         if ( aSig ) return propagateFloat32NaN( a, b );
1824         return a;
1825     }
1826     if ( bExp == 0 ) {
1827         --expDiff;
1828     }
1829     else {
1830         bSig |= 0x40000000;
1831     }
1832     shift32RightJamming( bSig, expDiff, &bSig );
1833     aSig |= 0x40000000;
1834  aBigger:
1835     zSig = aSig - bSig;
1836     zExp = aExp;
1837  normalizeRoundAndPack:
1838     --zExp;
1839     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1840 
1841 }
1842 
1843 /*
1844 -------------------------------------------------------------------------------
1845 Returns the result of adding the single-precision floating-point values `a'
1846 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1847 Binary Floating-Point Arithmetic.
1848 -------------------------------------------------------------------------------
1849 */
1850 float32 float32_add( float32 a, float32 b )
1851 {
1852     flag aSign, bSign;
1853 
1854     aSign = extractFloat32Sign( a );
1855     bSign = extractFloat32Sign( b );
1856     if ( aSign == bSign ) {
1857         return addFloat32Sigs( a, b, aSign );
1858     }
1859     else {
1860         return subFloat32Sigs( a, b, aSign );
1861     }
1862 
1863 }
1864 
1865 /*
1866 -------------------------------------------------------------------------------
1867 Returns the result of subtracting the single-precision floating-point values
1868 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1869 for Binary Floating-Point Arithmetic.
1870 -------------------------------------------------------------------------------
1871 */
1872 float32 float32_sub( float32 a, float32 b )
1873 {
1874     flag aSign, bSign;
1875 
1876     aSign = extractFloat32Sign( a );
1877     bSign = extractFloat32Sign( b );
1878     if ( aSign == bSign ) {
1879         return subFloat32Sigs( a, b, aSign );
1880     }
1881     else {
1882         return addFloat32Sigs( a, b, aSign );
1883     }
1884 
1885 }
1886 
1887 /*
1888 -------------------------------------------------------------------------------
1889 Returns the result of multiplying the single-precision floating-point values
1890 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1891 for Binary Floating-Point Arithmetic.
1892 -------------------------------------------------------------------------------
1893 */
1894 float32 float32_mul( float32 a, float32 b )
1895 {
1896     flag aSign, bSign, zSign;
1897     int16 aExp, bExp, zExp;
1898     bits32 aSig, bSig;
1899     bits64 zSig64;
1900     bits32 zSig;
1901 
1902     aSig = extractFloat32Frac( a );
1903     aExp = extractFloat32Exp( a );
1904     aSign = extractFloat32Sign( a );
1905     bSig = extractFloat32Frac( b );
1906     bExp = extractFloat32Exp( b );
1907     bSign = extractFloat32Sign( b );
1908     zSign = aSign ^ bSign;
1909     if ( aExp == 0xFF ) {
1910         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1911             return propagateFloat32NaN( a, b );
1912         }
1913         if ( ( bExp | bSig ) == 0 ) {
1914             float_raise( float_flag_invalid );
1915             return float32_default_nan;
1916         }
1917         return packFloat32( zSign, 0xFF, 0 );
1918     }
1919     if ( bExp == 0xFF ) {
1920         if ( bSig ) return propagateFloat32NaN( a, b );
1921         if ( ( aExp | aSig ) == 0 ) {
1922             float_raise( float_flag_invalid );
1923             return float32_default_nan;
1924         }
1925         return packFloat32( zSign, 0xFF, 0 );
1926     }
1927     if ( aExp == 0 ) {
1928         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1929         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1930     }
1931     if ( bExp == 0 ) {
1932         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1933         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1934     }
1935     zExp = aExp + bExp - 0x7F;
1936     aSig = ( aSig | 0x00800000 )<<7;
1937     bSig = ( bSig | 0x00800000 )<<8;
1938     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1939     zSig = (bits32)zSig64;
1940     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1941         zSig <<= 1;
1942         --zExp;
1943     }
1944     return roundAndPackFloat32( zSign, zExp, zSig );
1945 
1946 }
1947 
1948 /*
1949 -------------------------------------------------------------------------------
1950 Returns the result of dividing the single-precision floating-point value `a'
1951 by the corresponding value `b'.  The operation is performed according to the
1952 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1953 -------------------------------------------------------------------------------
1954 */
1955 float32 float32_div( float32 a, float32 b )
1956 {
1957     flag aSign, bSign, zSign;
1958     int16 aExp, bExp, zExp;
1959     bits32 aSig, bSig, zSig;
1960 
1961     aSig = extractFloat32Frac( a );
1962     aExp = extractFloat32Exp( a );
1963     aSign = extractFloat32Sign( a );
1964     bSig = extractFloat32Frac( b );
1965     bExp = extractFloat32Exp( b );
1966     bSign = extractFloat32Sign( b );
1967     zSign = aSign ^ bSign;
1968     if ( aExp == 0xFF ) {
1969         if ( aSig ) return propagateFloat32NaN( a, b );
1970         if ( bExp == 0xFF ) {
1971             if ( bSig ) return propagateFloat32NaN( a, b );
1972             float_raise( float_flag_invalid );
1973             return float32_default_nan;
1974         }
1975         return packFloat32( zSign, 0xFF, 0 );
1976     }
1977     if ( bExp == 0xFF ) {
1978         if ( bSig ) return propagateFloat32NaN( a, b );
1979         return packFloat32( zSign, 0, 0 );
1980     }
1981     if ( bExp == 0 ) {
1982         if ( bSig == 0 ) {
1983             if ( ( aExp | aSig ) == 0 ) {
1984                 float_raise( float_flag_invalid );
1985                 return float32_default_nan;
1986             }
1987             float_raise( float_flag_divbyzero );
1988             return packFloat32( zSign, 0xFF, 0 );
1989         }
1990         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1991     }
1992     if ( aExp == 0 ) {
1993         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1994         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1995     }
1996     zExp = aExp - bExp + 0x7D;
1997     aSig = ( aSig | 0x00800000 )<<7;
1998     bSig = ( bSig | 0x00800000 )<<8;
1999     if ( bSig <= ( aSig + aSig ) ) {
2000         aSig >>= 1;
2001         ++zExp;
2002     }
2003     zSig = (bits32)((((bits64) aSig) << 32) / bSig);
2004     if ( ( zSig & 0x3F ) == 0 ) {
2005         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2006     }
2007     return roundAndPackFloat32( zSign, zExp, zSig );
2008 
2009 }
2010 
2011 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2012 /*
2013 -------------------------------------------------------------------------------
2014 Returns the remainder of the single-precision floating-point value `a'
2015 with respect to the corresponding value `b'.  The operation is performed
2016 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2017 -------------------------------------------------------------------------------
2018 */
2019 float32 float32_rem( float32 a, float32 b )
2020 {
2021     flag aSign, bSign, zSign;
2022     int16 aExp, bExp, expDiff;
2023     bits32 aSig, bSig;
2024     bits32 q;
2025     bits64 aSig64, bSig64, q64;
2026     bits32 alternateASig;
2027     sbits32 sigMean;
2028 
2029     aSig = extractFloat32Frac( a );
2030     aExp = extractFloat32Exp( a );
2031     aSign = extractFloat32Sign( a );
2032     bSig = extractFloat32Frac( b );
2033     bExp = extractFloat32Exp( b );
2034     bSign = extractFloat32Sign( b );
2035     if ( aExp == 0xFF ) {
2036         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2037             return propagateFloat32NaN( a, b );
2038         }
2039         float_raise( float_flag_invalid );
2040         return float32_default_nan;
2041     }
2042     if ( bExp == 0xFF ) {
2043         if ( bSig ) return propagateFloat32NaN( a, b );
2044         return a;
2045     }
2046     if ( bExp == 0 ) {
2047         if ( bSig == 0 ) {
2048             float_raise( float_flag_invalid );
2049             return float32_default_nan;
2050         }
2051         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2052     }
2053     if ( aExp == 0 ) {
2054         if ( aSig == 0 ) return a;
2055         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2056     }
2057     expDiff = aExp - bExp;
2058     aSig |= 0x00800000;
2059     bSig |= 0x00800000;
2060     if ( expDiff < 32 ) {
2061         aSig <<= 8;
2062         bSig <<= 8;
2063         if ( expDiff < 0 ) {
2064             if ( expDiff < -1 ) return a;
2065             aSig >>= 1;
2066         }
2067         q = ( bSig <= aSig );
2068         if ( q ) aSig -= bSig;
2069         if ( 0 < expDiff ) {
2070             q = ( ( (bits64) aSig )<<32 ) / bSig;
2071             q >>= 32 - expDiff;
2072             bSig >>= 2;
2073             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2074         }
2075         else {
2076             aSig >>= 2;
2077             bSig >>= 2;
2078         }
2079     }
2080     else {
2081         if ( bSig <= aSig ) aSig -= bSig;
2082         aSig64 = ( (bits64) aSig )<<40;
2083         bSig64 = ( (bits64) bSig )<<40;
2084         expDiff -= 64;
2085         while ( 0 < expDiff ) {
2086             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2087             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2088             aSig64 = - ( ( bSig * q64 )<<38 );
2089             expDiff -= 62;
2090         }
2091         expDiff += 64;
2092         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2093         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2094         q = q64>>( 64 - expDiff );
2095         bSig <<= 6;
2096         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2097     }
2098     do {
2099         alternateASig = aSig;
2100         ++q;
2101         aSig -= bSig;
2102     } while ( 0 <= (sbits32) aSig );
2103     sigMean = aSig + alternateASig;
2104     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2105         aSig = alternateASig;
2106     }
2107     zSign = ( (sbits32) aSig < 0 );
2108     if ( zSign ) aSig = - aSig;
2109     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2110 
2111 }
2112 #endif /* !SOFTFLOAT_FOR_GCC */
2113 
2114 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2115 /*
2116 -------------------------------------------------------------------------------
2117 Returns the square root of the single-precision floating-point value `a'.
2118 The operation is performed according to the IEC/IEEE Standard for Binary
2119 Floating-Point Arithmetic.
2120 -------------------------------------------------------------------------------
2121 */
2122 float32 float32_sqrt( float32 a )
2123 {
2124     flag aSign;
2125     int16 aExp, zExp;
2126     bits32 aSig, zSig;
2127     bits64 rem, term;
2128 
2129     aSig = extractFloat32Frac( a );
2130     aExp = extractFloat32Exp( a );
2131     aSign = extractFloat32Sign( a );
2132     if ( aExp == 0xFF ) {
2133         if ( aSig ) return propagateFloat32NaN( a, 0 );
2134         if ( ! aSign ) return a;
2135         float_raise( float_flag_invalid );
2136         return float32_default_nan;
2137     }
2138     if ( aSign ) {
2139         if ( ( aExp | aSig ) == 0 ) return a;
2140         float_raise( float_flag_invalid );
2141         return float32_default_nan;
2142     }
2143     if ( aExp == 0 ) {
2144         if ( aSig == 0 ) return 0;
2145         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2146     }
2147     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2148     aSig = ( aSig | 0x00800000 )<<8;
2149     zSig = estimateSqrt32( aExp, aSig ) + 2;
2150     if ( ( zSig & 0x7F ) <= 5 ) {
2151         if ( zSig < 2 ) {
2152             zSig = 0x7FFFFFFF;
2153             goto roundAndPack;
2154         }
2155         aSig >>= aExp & 1;
2156         term = ( (bits64) zSig ) * zSig;
2157         rem = ( ( (bits64) aSig )<<32 ) - term;
2158         while ( (sbits64) rem < 0 ) {
2159             --zSig;
2160             rem += ( ( (bits64) zSig )<<1 ) | 1;
2161         }
2162         zSig |= ( rem != 0 );
2163     }
2164     shift32RightJamming( zSig, 1, &zSig );
2165  roundAndPack:
2166     return roundAndPackFloat32( 0, zExp, zSig );
2167 
2168 }
2169 #endif /* !SOFTFLOAT_FOR_GCC */
2170 
2171 /*
2172 -------------------------------------------------------------------------------
2173 Returns 1 if the single-precision floating-point value `a' is equal to
2174 the corresponding value `b', and 0 otherwise.  The comparison is performed
2175 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2176 -------------------------------------------------------------------------------
2177 */
2178 flag float32_eq( float32 a, float32 b )
2179 {
2180 
2181     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2182          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2183        ) {
2184         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2185             float_raise( float_flag_invalid );
2186         }
2187         return 0;
2188     }
2189     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2190 
2191 }
2192 
2193 /*
2194 -------------------------------------------------------------------------------
2195 Returns 1 if the single-precision floating-point value `a' is less than
2196 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2197 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2198 Arithmetic.
2199 -------------------------------------------------------------------------------
2200 */
2201 flag float32_le( float32 a, float32 b )
2202 {
2203     flag aSign, bSign;
2204 
2205     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2206          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2207        ) {
2208         float_raise( float_flag_invalid );
2209         return 0;
2210     }
2211     aSign = extractFloat32Sign( a );
2212     bSign = extractFloat32Sign( b );
2213     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2214     return ( a == b ) || ( aSign ^ ( a < b ) );
2215 
2216 }
2217 
2218 /*
2219 -------------------------------------------------------------------------------
2220 Returns 1 if the single-precision floating-point value `a' is less than
2221 the corresponding value `b', and 0 otherwise.  The comparison is performed
2222 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2223 -------------------------------------------------------------------------------
2224 */
2225 flag float32_lt( float32 a, float32 b )
2226 {
2227     flag aSign, bSign;
2228 
2229     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2230          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2231        ) {
2232         float_raise( float_flag_invalid );
2233         return 0;
2234     }
2235     aSign = extractFloat32Sign( a );
2236     bSign = extractFloat32Sign( b );
2237     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2238     return ( a != b ) && ( aSign ^ ( a < b ) );
2239 
2240 }
2241 
2242 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2243 /*
2244 -------------------------------------------------------------------------------
2245 Returns 1 if the single-precision floating-point value `a' is equal to
2246 the corresponding value `b', and 0 otherwise.  The invalid exception is
2247 raised if either operand is a NaN.  Otherwise, the comparison is performed
2248 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2249 -------------------------------------------------------------------------------
2250 */
2251 flag float32_eq_signaling( float32 a, float32 b )
2252 {
2253 
2254     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2255          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2256        ) {
2257         float_raise( float_flag_invalid );
2258         return 0;
2259     }
2260     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2261 
2262 }
2263 
2264 /*
2265 -------------------------------------------------------------------------------
2266 Returns 1 if the single-precision floating-point value `a' is less than or
2267 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2268 cause an exception.  Otherwise, the comparison is performed according to the
2269 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2270 -------------------------------------------------------------------------------
2271 */
2272 flag float32_le_quiet( float32 a, float32 b )
2273 {
2274     flag aSign, bSign;
2275 
2276     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2277          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2278        ) {
2279         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2280             float_raise( float_flag_invalid );
2281         }
2282         return 0;
2283     }
2284     aSign = extractFloat32Sign( a );
2285     bSign = extractFloat32Sign( b );
2286     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2287     return ( a == b ) || ( aSign ^ ( a < b ) );
2288 
2289 }
2290 
2291 /*
2292 -------------------------------------------------------------------------------
2293 Returns 1 if the single-precision floating-point value `a' is less than
2294 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2295 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2296 Standard for Binary Floating-Point Arithmetic.
2297 -------------------------------------------------------------------------------
2298 */
2299 flag float32_lt_quiet( float32 a, float32 b )
2300 {
2301     flag aSign, bSign;
2302 
2303     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2304          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2305        ) {
2306         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2307             float_raise( float_flag_invalid );
2308         }
2309         return 0;
2310     }
2311     aSign = extractFloat32Sign( a );
2312     bSign = extractFloat32Sign( b );
2313     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2314     return ( a != b ) && ( aSign ^ ( a < b ) );
2315 
2316 }
2317 #endif /* !SOFTFLOAT_FOR_GCC */
2318 
2319 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2320 /*
2321 -------------------------------------------------------------------------------
2322 Returns the result of converting the double-precision floating-point value
2323 `a' to the 32-bit two's complement integer format.  The conversion is
2324 performed according to the IEC/IEEE Standard for Binary Floating-Point
2325 Arithmetic---which means in particular that the conversion is rounded
2326 according to the current rounding mode.  If `a' is a NaN, the largest
2327 positive integer is returned.  Otherwise, if the conversion overflows, the
2328 largest integer with the same sign as `a' is returned.
2329 -------------------------------------------------------------------------------
2330 */
2331 int32 float64_to_int32( float64 a )
2332 {
2333     flag aSign;
2334     int16 aExp, shiftCount;
2335     bits64 aSig;
2336 
2337     aSig = extractFloat64Frac( a );
2338     aExp = extractFloat64Exp( a );
2339     aSign = extractFloat64Sign( a );
2340     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2341     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2342     shiftCount = 0x42C - aExp;
2343     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2344     return roundAndPackInt32( aSign, aSig );
2345 
2346 }
2347 #endif /* !SOFTFLOAT_FOR_GCC */
2348 
2349 /*
2350 -------------------------------------------------------------------------------
2351 Returns the result of converting the double-precision floating-point value
2352 `a' to the 32-bit two's complement integer format.  The conversion is
2353 performed according to the IEC/IEEE Standard for Binary Floating-Point
2354 Arithmetic, except that the conversion is always rounded toward zero.
2355 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2356 the conversion overflows, the largest integer with the same sign as `a' is
2357 returned.
2358 -------------------------------------------------------------------------------
2359 */
2360 int32 float64_to_int32_round_to_zero( float64 a )
2361 {
2362     flag aSign;
2363     int16 aExp, shiftCount;
2364     bits64 aSig, savedASig;
2365     int32 z;
2366 
2367     aSig = extractFloat64Frac( a );
2368     aExp = extractFloat64Exp( a );
2369     aSign = extractFloat64Sign( a );
2370     if ( 0x41E < aExp ) {
2371         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2372         goto invalid;
2373     }
2374     else if ( aExp < 0x3FF ) {
2375         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2376         return 0;
2377     }
2378     aSig |= LIT64( 0x0010000000000000 );
2379     shiftCount = 0x433 - aExp;
2380     savedASig = aSig;
2381     aSig >>= shiftCount;
2382     z = (int32)aSig;
2383     if ( aSign ) z = - z;
2384     if ( ( z < 0 ) ^ aSign ) {
2385  invalid:
2386         float_raise( float_flag_invalid );
2387         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2388     }
2389     if ( ( aSig<<shiftCount ) != savedASig ) {
2390         float_exception_flags |= float_flag_inexact;
2391     }
2392     return z;
2393 
2394 }
2395 
2396 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2397 /*
2398 -------------------------------------------------------------------------------
2399 Returns the result of converting the double-precision floating-point value
2400 `a' to the 64-bit two's complement integer format.  The conversion is
2401 performed according to the IEC/IEEE Standard for Binary Floating-Point
2402 Arithmetic---which means in particular that the conversion is rounded
2403 according to the current rounding mode.  If `a' is a NaN, the largest
2404 positive integer is returned.  Otherwise, if the conversion overflows, the
2405 largest integer with the same sign as `a' is returned.
2406 -------------------------------------------------------------------------------
2407 */
2408 int64 float64_to_int64( float64 a )
2409 {
2410     flag aSign;
2411     int16 aExp, shiftCount;
2412     bits64 aSig, aSigExtra;
2413 
2414     aSig = extractFloat64Frac( a );
2415     aExp = extractFloat64Exp( a );
2416     aSign = extractFloat64Sign( a );
2417     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2418     shiftCount = 0x433 - aExp;
2419     if ( shiftCount <= 0 ) {
2420         if ( 0x43E < aExp ) {
2421             float_raise( float_flag_invalid );
2422             if (    ! aSign
2423                  || (    ( aExp == 0x7FF )
2424                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2425                ) {
2426                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2427             }
2428             return (sbits64) LIT64( 0x8000000000000000 );
2429         }
2430         aSigExtra = 0;
2431         aSig <<= - shiftCount;
2432     }
2433     else {
2434         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2435     }
2436     return roundAndPackInt64( aSign, aSig, aSigExtra );
2437 
2438 }
2439 
2440 /*
2441 -------------------------------------------------------------------------------
2442 Returns the result of converting the double-precision floating-point value
2443 `a' to the 64-bit two's complement integer format.  The conversion is
2444 performed according to the IEC/IEEE Standard for Binary Floating-Point
2445 Arithmetic, except that the conversion is always rounded toward zero.
2446 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2447 the conversion overflows, the largest integer with the same sign as `a' is
2448 returned.
2449 -------------------------------------------------------------------------------
2450 */
2451 int64 float64_to_int64_round_to_zero( float64 a )
2452 {
2453     flag aSign;
2454     int16 aExp, shiftCount;
2455     bits64 aSig;
2456     int64 z;
2457 
2458     aSig = extractFloat64Frac( a );
2459     aExp = extractFloat64Exp( a );
2460     aSign = extractFloat64Sign( a );
2461     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2462     shiftCount = aExp - 0x433;
2463     if ( 0 <= shiftCount ) {
2464         if ( 0x43E <= aExp ) {
2465             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2466                 float_raise( float_flag_invalid );
2467                 if (    ! aSign
2468                      || (    ( aExp == 0x7FF )
2469                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2470                    ) {
2471                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2472                 }
2473             }
2474             return (sbits64) LIT64( 0x8000000000000000 );
2475         }
2476         z = aSig<<shiftCount;
2477     }
2478     else {
2479         if ( aExp < 0x3FE ) {
2480             if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2481             return 0;
2482         }
2483         z = aSig>>( - shiftCount );
2484         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2485             float_exception_flags |= float_flag_inexact;
2486         }
2487     }
2488     if ( aSign ) z = - z;
2489     return z;
2490 
2491 }
2492 #endif /* !SOFTFLOAT_FOR_GCC */
2493 
2494 /*
2495 -------------------------------------------------------------------------------
2496 Returns the result of converting the double-precision floating-point value
2497 `a' to the single-precision floating-point format.  The conversion is
2498 performed according to the IEC/IEEE Standard for Binary Floating-Point
2499 Arithmetic.
2500 -------------------------------------------------------------------------------
2501 */
2502 float32 float64_to_float32( float64 a )
2503 {
2504     flag aSign;
2505     int16 aExp;
2506     bits64 aSig;
2507     bits32 zSig;
2508 
2509     aSig = extractFloat64Frac( a );
2510     aExp = extractFloat64Exp( a );
2511     aSign = extractFloat64Sign( a );
2512     if ( aExp == 0x7FF ) {
2513         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2514         return packFloat32( aSign, 0xFF, 0 );
2515     }
2516     shift64RightJamming( aSig, 22, &aSig );
2517     zSig = (bits32)aSig;
2518     if ( aExp || zSig ) {
2519         zSig |= 0x40000000;
2520         aExp -= 0x381;
2521     }
2522     return roundAndPackFloat32( aSign, aExp, zSig );
2523 
2524 }
2525 
2526 #ifdef FLOATX80
2527 
2528 /*
2529 -------------------------------------------------------------------------------
2530 Returns the result of converting the double-precision floating-point value
2531 `a' to the extended double-precision floating-point format.  The conversion
2532 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2533 Arithmetic.
2534 -------------------------------------------------------------------------------
2535 */
2536 floatx80 float64_to_floatx80( float64 a )
2537 {
2538     flag aSign;
2539     int16 aExp;
2540     bits64 aSig;
2541 
2542     aSig = extractFloat64Frac( a );
2543     aExp = extractFloat64Exp( a );
2544     aSign = extractFloat64Sign( a );
2545     if ( aExp == 0x7FF ) {
2546         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2547         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2548     }
2549     if ( aExp == 0 ) {
2550         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2551         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2552     }
2553     return
2554         packFloatx80(
2555             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2556 
2557 }
2558 
2559 #endif
2560 
2561 #ifdef FLOAT128
2562 
2563 /*
2564 -------------------------------------------------------------------------------
2565 Returns the result of converting the double-precision floating-point value
2566 `a' to the quadruple-precision floating-point format.  The conversion is
2567 performed according to the IEC/IEEE Standard for Binary Floating-Point
2568 Arithmetic.
2569 -------------------------------------------------------------------------------
2570 */
2571 float128 float64_to_float128( float64 a )
2572 {
2573     flag aSign;
2574     int16 aExp;
2575     bits64 aSig, zSig0, zSig1;
2576 
2577     aSig = extractFloat64Frac( a );
2578     aExp = extractFloat64Exp( a );
2579     aSign = extractFloat64Sign( a );
2580     if ( aExp == 0x7FF ) {
2581         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2582         return packFloat128( aSign, 0x7FFF, 0, 0 );
2583     }
2584     if ( aExp == 0 ) {
2585         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2586         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2587         --aExp;
2588     }
2589     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2590     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2591 
2592 }
2593 
2594 #endif
2595 
2596 #ifndef SOFTFLOAT_FOR_GCC
2597 /*
2598 -------------------------------------------------------------------------------
2599 Rounds the double-precision floating-point value `a' to an integer, and
2600 returns the result as a double-precision floating-point value.  The
2601 operation is performed according to the IEC/IEEE Standard for Binary
2602 Floating-Point Arithmetic.
2603 -------------------------------------------------------------------------------
2604 */
2605 float64 float64_round_to_int( float64 a )
2606 {
2607     flag aSign;
2608     int16 aExp;
2609     bits64 lastBitMask, roundBitsMask;
2610     int8 roundingMode;
2611     float64 z;
2612 
2613     aExp = extractFloat64Exp( a );
2614     if ( 0x433 <= aExp ) {
2615         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2616             return propagateFloat64NaN( a, a );
2617         }
2618         return a;
2619     }
2620     if ( aExp < 0x3FF ) {
2621         if ( (bits64) ( a<<1 ) == 0 ) return a;
2622         float_exception_flags |= float_flag_inexact;
2623         aSign = extractFloat64Sign( a );
2624         switch ( float_rounding_mode ) {
2625          case float_round_nearest_even:
2626             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2627                 return packFloat64( aSign, 0x3FF, 0 );
2628             }
2629             break;
2630 	 case float_round_to_zero:
2631 	    break;
2632          case float_round_down:
2633             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2634          case float_round_up:
2635             return
2636             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2637         }
2638         return packFloat64( aSign, 0, 0 );
2639     }
2640     lastBitMask = 1;
2641     lastBitMask <<= 0x433 - aExp;
2642     roundBitsMask = lastBitMask - 1;
2643     z = a;
2644     roundingMode = float_rounding_mode;
2645     if ( roundingMode == float_round_nearest_even ) {
2646         z += lastBitMask>>1;
2647         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2648     }
2649     else if ( roundingMode != float_round_to_zero ) {
2650         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2651             z += roundBitsMask;
2652         }
2653     }
2654     z &= ~ roundBitsMask;
2655     if ( z != a ) float_exception_flags |= float_flag_inexact;
2656     return z;
2657 
2658 }
2659 #endif
2660 
2661 /*
2662 -------------------------------------------------------------------------------
2663 Returns the result of adding the absolute values of the double-precision
2664 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2665 before being returned.  `zSign' is ignored if the result is a NaN.
2666 The addition is performed according to the IEC/IEEE Standard for Binary
2667 Floating-Point Arithmetic.
2668 -------------------------------------------------------------------------------
2669 */
2670 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2671 {
2672     int16 aExp, bExp, zExp;
2673     bits64 aSig, bSig, zSig;
2674     int16 expDiff;
2675 
2676     aSig = extractFloat64Frac( a );
2677     aExp = extractFloat64Exp( a );
2678     bSig = extractFloat64Frac( b );
2679     bExp = extractFloat64Exp( b );
2680     expDiff = aExp - bExp;
2681     aSig <<= 9;
2682     bSig <<= 9;
2683     if ( 0 < expDiff ) {
2684         if ( aExp == 0x7FF ) {
2685             if ( aSig ) return propagateFloat64NaN( a, b );
2686             return a;
2687         }
2688         if ( bExp == 0 ) {
2689             --expDiff;
2690         }
2691         else {
2692             bSig |= LIT64( 0x2000000000000000 );
2693         }
2694         shift64RightJamming( bSig, expDiff, &bSig );
2695         zExp = aExp;
2696     }
2697     else if ( expDiff < 0 ) {
2698         if ( bExp == 0x7FF ) {
2699             if ( bSig ) return propagateFloat64NaN( a, b );
2700             return packFloat64( zSign, 0x7FF, 0 );
2701         }
2702         if ( aExp == 0 ) {
2703             ++expDiff;
2704         }
2705         else {
2706             aSig |= LIT64( 0x2000000000000000 );
2707         }
2708         shift64RightJamming( aSig, - expDiff, &aSig );
2709         zExp = bExp;
2710     }
2711     else {
2712         if ( aExp == 0x7FF ) {
2713             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2714             return a;
2715         }
2716         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2717         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2718         zExp = aExp;
2719         goto roundAndPack;
2720     }
2721     aSig |= LIT64( 0x2000000000000000 );
2722     zSig = ( aSig + bSig )<<1;
2723     --zExp;
2724     if ( (sbits64) zSig < 0 ) {
2725         zSig = aSig + bSig;
2726         ++zExp;
2727     }
2728  roundAndPack:
2729     return roundAndPackFloat64( zSign, zExp, zSig );
2730 
2731 }
2732 
2733 /*
2734 -------------------------------------------------------------------------------
2735 Returns the result of subtracting the absolute values of the double-
2736 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2737 difference is negated before being returned.  `zSign' is ignored if the
2738 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2739 Standard for Binary Floating-Point Arithmetic.
2740 -------------------------------------------------------------------------------
2741 */
2742 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2743 {
2744     int16 aExp, bExp, zExp;
2745     bits64 aSig, bSig, zSig;
2746     int16 expDiff;
2747 
2748     aSig = extractFloat64Frac( a );
2749     aExp = extractFloat64Exp( a );
2750     bSig = extractFloat64Frac( b );
2751     bExp = extractFloat64Exp( b );
2752     expDiff = aExp - bExp;
2753     aSig <<= 10;
2754     bSig <<= 10;
2755     if ( 0 < expDiff ) goto aExpBigger;
2756     if ( expDiff < 0 ) goto bExpBigger;
2757     if ( aExp == 0x7FF ) {
2758         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2759         float_raise( float_flag_invalid );
2760         return float64_default_nan;
2761     }
2762     if ( aExp == 0 ) {
2763         aExp = 1;
2764         bExp = 1;
2765     }
2766     if ( bSig < aSig ) goto aBigger;
2767     if ( aSig < bSig ) goto bBigger;
2768     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2769  bExpBigger:
2770     if ( bExp == 0x7FF ) {
2771         if ( bSig ) return propagateFloat64NaN( a, b );
2772         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2773     }
2774     if ( aExp == 0 ) {
2775         ++expDiff;
2776     }
2777     else {
2778         aSig |= LIT64( 0x4000000000000000 );
2779     }
2780     shift64RightJamming( aSig, - expDiff, &aSig );
2781     bSig |= LIT64( 0x4000000000000000 );
2782  bBigger:
2783     zSig = bSig - aSig;
2784     zExp = bExp;
2785     zSign ^= 1;
2786     goto normalizeRoundAndPack;
2787  aExpBigger:
2788     if ( aExp == 0x7FF ) {
2789         if ( aSig ) return propagateFloat64NaN( a, b );
2790         return a;
2791     }
2792     if ( bExp == 0 ) {
2793         --expDiff;
2794     }
2795     else {
2796         bSig |= LIT64( 0x4000000000000000 );
2797     }
2798     shift64RightJamming( bSig, expDiff, &bSig );
2799     aSig |= LIT64( 0x4000000000000000 );
2800  aBigger:
2801     zSig = aSig - bSig;
2802     zExp = aExp;
2803  normalizeRoundAndPack:
2804     --zExp;
2805     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2806 
2807 }
2808 
2809 /*
2810 -------------------------------------------------------------------------------
2811 Returns the result of adding the double-precision floating-point values `a'
2812 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2813 Binary Floating-Point Arithmetic.
2814 -------------------------------------------------------------------------------
2815 */
2816 float64 float64_add( float64 a, float64 b )
2817 {
2818     flag aSign, bSign;
2819 
2820     aSign = extractFloat64Sign( a );
2821     bSign = extractFloat64Sign( b );
2822     if ( aSign == bSign ) {
2823         return addFloat64Sigs( a, b, aSign );
2824     }
2825     else {
2826         return subFloat64Sigs( a, b, aSign );
2827     }
2828 
2829 }
2830 
2831 /*
2832 -------------------------------------------------------------------------------
2833 Returns the result of subtracting the double-precision floating-point values
2834 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2835 for Binary Floating-Point Arithmetic.
2836 -------------------------------------------------------------------------------
2837 */
2838 float64 float64_sub( float64 a, float64 b )
2839 {
2840     flag aSign, bSign;
2841 
2842     aSign = extractFloat64Sign( a );
2843     bSign = extractFloat64Sign( b );
2844     if ( aSign == bSign ) {
2845         return subFloat64Sigs( a, b, aSign );
2846     }
2847     else {
2848         return addFloat64Sigs( a, b, aSign );
2849     }
2850 
2851 }
2852 
2853 /*
2854 -------------------------------------------------------------------------------
2855 Returns the result of multiplying the double-precision floating-point values
2856 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2857 for Binary Floating-Point Arithmetic.
2858 -------------------------------------------------------------------------------
2859 */
2860 float64 float64_mul( float64 a, float64 b )
2861 {
2862     flag aSign, bSign, zSign;
2863     int16 aExp, bExp, zExp;
2864     bits64 aSig, bSig, zSig0, zSig1;
2865 
2866     aSig = extractFloat64Frac( a );
2867     aExp = extractFloat64Exp( a );
2868     aSign = extractFloat64Sign( a );
2869     bSig = extractFloat64Frac( b );
2870     bExp = extractFloat64Exp( b );
2871     bSign = extractFloat64Sign( b );
2872     zSign = aSign ^ bSign;
2873     if ( aExp == 0x7FF ) {
2874         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2875             return propagateFloat64NaN( a, b );
2876         }
2877         if ( ( bExp | bSig ) == 0 ) {
2878             float_raise( float_flag_invalid );
2879             return float64_default_nan;
2880         }
2881         return packFloat64( zSign, 0x7FF, 0 );
2882     }
2883     if ( bExp == 0x7FF ) {
2884         if ( bSig ) return propagateFloat64NaN( a, b );
2885         if ( ( aExp | aSig ) == 0 ) {
2886             float_raise( float_flag_invalid );
2887             return float64_default_nan;
2888         }
2889         return packFloat64( zSign, 0x7FF, 0 );
2890     }
2891     if ( aExp == 0 ) {
2892         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2893         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2894     }
2895     if ( bExp == 0 ) {
2896         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2897         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2898     }
2899     zExp = aExp + bExp - 0x3FF;
2900     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2901     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2902     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2903     zSig0 |= ( zSig1 != 0 );
2904     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2905         zSig0 <<= 1;
2906         --zExp;
2907     }
2908     return roundAndPackFloat64( zSign, zExp, zSig0 );
2909 
2910 }
2911 
2912 /*
2913 -------------------------------------------------------------------------------
2914 Returns the result of dividing the double-precision floating-point value `a'
2915 by the corresponding value `b'.  The operation is performed according to
2916 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2917 -------------------------------------------------------------------------------
2918 */
2919 float64 float64_div( float64 a, float64 b )
2920 {
2921     flag aSign, bSign, zSign;
2922     int16 aExp, bExp, zExp;
2923     bits64 aSig, bSig, zSig;
2924     bits64 rem0, rem1;
2925     bits64 term0, term1;
2926 
2927     aSig = extractFloat64Frac( a );
2928     aExp = extractFloat64Exp( a );
2929     aSign = extractFloat64Sign( a );
2930     bSig = extractFloat64Frac( b );
2931     bExp = extractFloat64Exp( b );
2932     bSign = extractFloat64Sign( b );
2933     zSign = aSign ^ bSign;
2934     if ( aExp == 0x7FF ) {
2935         if ( aSig ) return propagateFloat64NaN( a, b );
2936         if ( bExp == 0x7FF ) {
2937             if ( bSig ) return propagateFloat64NaN( a, b );
2938             float_raise( float_flag_invalid );
2939             return float64_default_nan;
2940         }
2941         return packFloat64( zSign, 0x7FF, 0 );
2942     }
2943     if ( bExp == 0x7FF ) {
2944         if ( bSig ) return propagateFloat64NaN( a, b );
2945         return packFloat64( zSign, 0, 0 );
2946     }
2947     if ( bExp == 0 ) {
2948         if ( bSig == 0 ) {
2949             if ( ( aExp | aSig ) == 0 ) {
2950                 float_raise( float_flag_invalid );
2951                 return float64_default_nan;
2952             }
2953             float_raise( float_flag_divbyzero );
2954             return packFloat64( zSign, 0x7FF, 0 );
2955         }
2956         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2957     }
2958     if ( aExp == 0 ) {
2959         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2960         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2961     }
2962     zExp = aExp - bExp + 0x3FD;
2963     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2964     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2965     if ( bSig <= ( aSig + aSig ) ) {
2966         aSig >>= 1;
2967         ++zExp;
2968     }
2969     zSig = estimateDiv128To64( aSig, 0, bSig );
2970     if ( ( zSig & 0x1FF ) <= 2 ) {
2971         mul64To128( bSig, zSig, &term0, &term1 );
2972         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2973         while ( (sbits64) rem0 < 0 ) {
2974             --zSig;
2975             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2976         }
2977         zSig |= ( rem1 != 0 );
2978     }
2979     return roundAndPackFloat64( zSign, zExp, zSig );
2980 
2981 }
2982 
2983 #ifndef SOFTFLOAT_FOR_GCC
2984 /*
2985 -------------------------------------------------------------------------------
2986 Returns the remainder of the double-precision floating-point value `a'
2987 with respect to the corresponding value `b'.  The operation is performed
2988 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2989 -------------------------------------------------------------------------------
2990 */
2991 float64 float64_rem( float64 a, float64 b )
2992 {
2993     flag aSign, bSign, zSign;
2994     int16 aExp, bExp, expDiff;
2995     bits64 aSig, bSig;
2996     bits64 q, alternateASig;
2997     sbits64 sigMean;
2998 
2999     aSig = extractFloat64Frac( a );
3000     aExp = extractFloat64Exp( a );
3001     aSign = extractFloat64Sign( a );
3002     bSig = extractFloat64Frac( b );
3003     bExp = extractFloat64Exp( b );
3004     bSign = extractFloat64Sign( b );
3005     if ( aExp == 0x7FF ) {
3006         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3007             return propagateFloat64NaN( a, b );
3008         }
3009         float_raise( float_flag_invalid );
3010         return float64_default_nan;
3011     }
3012     if ( bExp == 0x7FF ) {
3013         if ( bSig ) return propagateFloat64NaN( a, b );
3014         return a;
3015     }
3016     if ( bExp == 0 ) {
3017         if ( bSig == 0 ) {
3018             float_raise( float_flag_invalid );
3019             return float64_default_nan;
3020         }
3021         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3022     }
3023     if ( aExp == 0 ) {
3024         if ( aSig == 0 ) return a;
3025         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3026     }
3027     expDiff = aExp - bExp;
3028     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3029     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3030     if ( expDiff < 0 ) {
3031         if ( expDiff < -1 ) return a;
3032         aSig >>= 1;
3033     }
3034     q = ( bSig <= aSig );
3035     if ( q ) aSig -= bSig;
3036     expDiff -= 64;
3037     while ( 0 < expDiff ) {
3038         q = estimateDiv128To64( aSig, 0, bSig );
3039         q = ( 2 < q ) ? q - 2 : 0;
3040         aSig = - ( ( bSig>>2 ) * q );
3041         expDiff -= 62;
3042     }
3043     expDiff += 64;
3044     if ( 0 < expDiff ) {
3045         q = estimateDiv128To64( aSig, 0, bSig );
3046         q = ( 2 < q ) ? q - 2 : 0;
3047         q >>= 64 - expDiff;
3048         bSig >>= 2;
3049         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3050     }
3051     else {
3052         aSig >>= 2;
3053         bSig >>= 2;
3054     }
3055     do {
3056         alternateASig = aSig;
3057         ++q;
3058         aSig -= bSig;
3059     } while ( 0 <= (sbits64) aSig );
3060     sigMean = aSig + alternateASig;
3061     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3062         aSig = alternateASig;
3063     }
3064     zSign = ( (sbits64) aSig < 0 );
3065     if ( zSign ) aSig = - aSig;
3066     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3067 
3068 }
3069 
3070 /*
3071 -------------------------------------------------------------------------------
3072 Returns the square root of the double-precision floating-point value `a'.
3073 The operation is performed according to the IEC/IEEE Standard for Binary
3074 Floating-Point Arithmetic.
3075 -------------------------------------------------------------------------------
3076 */
3077 float64 float64_sqrt( float64 a )
3078 {
3079     flag aSign;
3080     int16 aExp, zExp;
3081     bits64 aSig, zSig, doubleZSig;
3082     bits64 rem0, rem1, term0, term1;
3083 
3084     aSig = extractFloat64Frac( a );
3085     aExp = extractFloat64Exp( a );
3086     aSign = extractFloat64Sign( a );
3087     if ( aExp == 0x7FF ) {
3088         if ( aSig ) return propagateFloat64NaN( a, a );
3089         if ( ! aSign ) return a;
3090         float_raise( float_flag_invalid );
3091         return float64_default_nan;
3092     }
3093     if ( aSign ) {
3094         if ( ( aExp | aSig ) == 0 ) return a;
3095         float_raise( float_flag_invalid );
3096         return float64_default_nan;
3097     }
3098     if ( aExp == 0 ) {
3099         if ( aSig == 0 ) return 0;
3100         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3101     }
3102     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3103     aSig |= LIT64( 0x0010000000000000 );
3104     zSig = estimateSqrt32( aExp, aSig>>21 );
3105     aSig <<= 9 - ( aExp & 1 );
3106     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3107     if ( ( zSig & 0x1FF ) <= 5 ) {
3108         doubleZSig = zSig<<1;
3109         mul64To128( zSig, zSig, &term0, &term1 );
3110         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3111         while ( (sbits64) rem0 < 0 ) {
3112             --zSig;
3113             doubleZSig -= 2;
3114             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3115         }
3116         zSig |= ( ( rem0 | rem1 ) != 0 );
3117     }
3118     return roundAndPackFloat64( 0, zExp, zSig );
3119 
3120 }
3121 #endif
3122 
3123 /*
3124 -------------------------------------------------------------------------------
3125 Returns 1 if the double-precision floating-point value `a' is equal to the
3126 corresponding value `b', and 0 otherwise.  The comparison is performed
3127 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3128 -------------------------------------------------------------------------------
3129 */
3130 flag float64_eq( float64 a, float64 b )
3131 {
3132 
3133     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3134          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3135        ) {
3136         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3137             float_raise( float_flag_invalid );
3138         }
3139         return 0;
3140     }
3141     return ( a == b ) ||
3142 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3143 
3144 }
3145 
3146 /*
3147 -------------------------------------------------------------------------------
3148 Returns 1 if the double-precision floating-point value `a' is less than or
3149 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3150 performed according to the IEC/IEEE Standard for Binary Floating-Point
3151 Arithmetic.
3152 -------------------------------------------------------------------------------
3153 */
3154 flag float64_le( float64 a, float64 b )
3155 {
3156     flag aSign, bSign;
3157 
3158     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3159          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3160        ) {
3161         float_raise( float_flag_invalid );
3162         return 0;
3163     }
3164     aSign = extractFloat64Sign( a );
3165     bSign = extractFloat64Sign( b );
3166     if ( aSign != bSign )
3167 	return aSign ||
3168 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3169 	      0 );
3170     return ( a == b ) ||
3171 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3172 
3173 }
3174 
3175 /*
3176 -------------------------------------------------------------------------------
3177 Returns 1 if the double-precision floating-point value `a' is less than
3178 the corresponding value `b', and 0 otherwise.  The comparison is performed
3179 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3180 -------------------------------------------------------------------------------
3181 */
3182 flag float64_lt( float64 a, float64 b )
3183 {
3184     flag aSign, bSign;
3185 
3186     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3187          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3188        ) {
3189         float_raise( float_flag_invalid );
3190         return 0;
3191     }
3192     aSign = extractFloat64Sign( a );
3193     bSign = extractFloat64Sign( b );
3194     if ( aSign != bSign )
3195 	return aSign &&
3196 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3197 	      0 );
3198     return ( a != b ) &&
3199 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3200 
3201 }
3202 
3203 #ifndef SOFTFLOAT_FOR_GCC
3204 /*
3205 -------------------------------------------------------------------------------
3206 Returns 1 if the double-precision floating-point value `a' is equal to the
3207 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3208 if either operand is a NaN.  Otherwise, the comparison is performed
3209 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3210 -------------------------------------------------------------------------------
3211 */
3212 flag float64_eq_signaling( float64 a, float64 b )
3213 {
3214 
3215     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3216          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3217        ) {
3218         float_raise( float_flag_invalid );
3219         return 0;
3220     }
3221     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3222 
3223 }
3224 
3225 /*
3226 -------------------------------------------------------------------------------
3227 Returns 1 if the double-precision floating-point value `a' is less than or
3228 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3229 cause an exception.  Otherwise, the comparison is performed according to the
3230 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3231 -------------------------------------------------------------------------------
3232 */
3233 flag float64_le_quiet( float64 a, float64 b )
3234 {
3235     flag aSign, bSign;
3236 
3237     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3238          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3239        ) {
3240         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3241             float_raise( float_flag_invalid );
3242         }
3243         return 0;
3244     }
3245     aSign = extractFloat64Sign( a );
3246     bSign = extractFloat64Sign( b );
3247     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3248     return ( a == b ) || ( aSign ^ ( a < b ) );
3249 
3250 }
3251 
3252 /*
3253 -------------------------------------------------------------------------------
3254 Returns 1 if the double-precision floating-point value `a' is less than
3255 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3256 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3257 Standard for Binary Floating-Point Arithmetic.
3258 -------------------------------------------------------------------------------
3259 */
3260 flag float64_lt_quiet( float64 a, float64 b )
3261 {
3262     flag aSign, bSign;
3263 
3264     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3265          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3266        ) {
3267         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3268             float_raise( float_flag_invalid );
3269         }
3270         return 0;
3271     }
3272     aSign = extractFloat64Sign( a );
3273     bSign = extractFloat64Sign( b );
3274     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3275     return ( a != b ) && ( aSign ^ ( a < b ) );
3276 
3277 }
3278 #endif
3279 
3280 #ifdef FLOATX80
3281 
3282 /*
3283 -------------------------------------------------------------------------------
3284 Returns the result of converting the extended double-precision floating-
3285 point value `a' to the 32-bit two's complement integer format.  The
3286 conversion is performed according to the IEC/IEEE Standard for Binary
3287 Floating-Point Arithmetic---which means in particular that the conversion
3288 is rounded according to the current rounding mode.  If `a' is a NaN, the
3289 largest positive integer is returned.  Otherwise, if the conversion
3290 overflows, the largest integer with the same sign as `a' is returned.
3291 -------------------------------------------------------------------------------
3292 */
3293 int32 floatx80_to_int32( floatx80 a )
3294 {
3295     flag aSign;
3296     int32 aExp, shiftCount;
3297     bits64 aSig;
3298 
3299     aSig = extractFloatx80Frac( a );
3300     aExp = extractFloatx80Exp( a );
3301     aSign = extractFloatx80Sign( a );
3302     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3303     shiftCount = 0x4037 - aExp;
3304     if ( shiftCount <= 0 ) shiftCount = 1;
3305     shift64RightJamming( aSig, shiftCount, &aSig );
3306     return roundAndPackInt32( aSign, aSig );
3307 
3308 }
3309 
3310 /*
3311 -------------------------------------------------------------------------------
3312 Returns the result of converting the extended double-precision floating-
3313 point value `a' to the 32-bit two's complement integer format.  The
3314 conversion is performed according to the IEC/IEEE Standard for Binary
3315 Floating-Point Arithmetic, except that the conversion is always rounded
3316 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3317 Otherwise, if the conversion overflows, the largest integer with the same
3318 sign as `a' is returned.
3319 -------------------------------------------------------------------------------
3320 */
3321 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3322 {
3323     flag aSign;
3324     int32 aExp, shiftCount;
3325     bits64 aSig, savedASig;
3326     int32 z;
3327 
3328     aSig = extractFloatx80Frac( a );
3329     aExp = extractFloatx80Exp( a );
3330     aSign = extractFloatx80Sign( a );
3331     if ( 0x401E < aExp ) {
3332         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3333         goto invalid;
3334     }
3335     else if ( aExp < 0x3FFF ) {
3336         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3337         return 0;
3338     }
3339     shiftCount = 0x403E - aExp;
3340     savedASig = aSig;
3341     aSig >>= shiftCount;
3342     z = aSig;
3343     if ( aSign ) z = - z;
3344     if ( ( z < 0 ) ^ aSign ) {
3345  invalid:
3346         float_raise( float_flag_invalid );
3347         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3348     }
3349     if ( ( aSig<<shiftCount ) != savedASig ) {
3350         float_exception_flags |= float_flag_inexact;
3351     }
3352     return z;
3353 
3354 }
3355 
3356 /*
3357 -------------------------------------------------------------------------------
3358 Returns the result of converting the extended double-precision floating-
3359 point value `a' to the 64-bit two's complement integer format.  The
3360 conversion is performed according to the IEC/IEEE Standard for Binary
3361 Floating-Point Arithmetic---which means in particular that the conversion
3362 is rounded according to the current rounding mode.  If `a' is a NaN,
3363 the largest positive integer is returned.  Otherwise, if the conversion
3364 overflows, the largest integer with the same sign as `a' is returned.
3365 -------------------------------------------------------------------------------
3366 */
3367 int64 floatx80_to_int64( floatx80 a )
3368 {
3369     flag aSign;
3370     int32 aExp, shiftCount;
3371     bits64 aSig, aSigExtra;
3372 
3373     aSig = extractFloatx80Frac( a );
3374     aExp = extractFloatx80Exp( a );
3375     aSign = extractFloatx80Sign( a );
3376     shiftCount = 0x403E - aExp;
3377     if ( shiftCount <= 0 ) {
3378         if ( shiftCount ) {
3379             float_raise( float_flag_invalid );
3380             if (    ! aSign
3381                  || (    ( aExp == 0x7FFF )
3382                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3383                ) {
3384                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3385             }
3386             return (sbits64) LIT64( 0x8000000000000000 );
3387         }
3388         aSigExtra = 0;
3389     }
3390     else {
3391         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3392     }
3393     return roundAndPackInt64( aSign, aSig, aSigExtra );
3394 
3395 }
3396 
3397 /*
3398 -------------------------------------------------------------------------------
3399 Returns the result of converting the extended double-precision floating-
3400 point value `a' to the 64-bit two's complement integer format.  The
3401 conversion is performed according to the IEC/IEEE Standard for Binary
3402 Floating-Point Arithmetic, except that the conversion is always rounded
3403 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3404 Otherwise, if the conversion overflows, the largest integer with the same
3405 sign as `a' is returned.
3406 -------------------------------------------------------------------------------
3407 */
3408 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3409 {
3410     flag aSign;
3411     int32 aExp, shiftCount;
3412     bits64 aSig;
3413     int64 z;
3414 
3415     aSig = extractFloatx80Frac( a );
3416     aExp = extractFloatx80Exp( a );
3417     aSign = extractFloatx80Sign( a );
3418     shiftCount = aExp - 0x403E;
3419     if ( 0 <= shiftCount ) {
3420         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3421         if ( ( a.high != 0xC03E ) || aSig ) {
3422             float_raise( float_flag_invalid );
3423             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3424                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3425             }
3426         }
3427         return (sbits64) LIT64( 0x8000000000000000 );
3428     }
3429     else if ( aExp < 0x3FFF ) {
3430         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3431         return 0;
3432     }
3433     z = aSig>>( - shiftCount );
3434     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3435         float_exception_flags |= float_flag_inexact;
3436     }
3437     if ( aSign ) z = - z;
3438     return z;
3439 
3440 }
3441 
3442 /*
3443 -------------------------------------------------------------------------------
3444 Returns the result of converting the extended double-precision floating-
3445 point value `a' to the single-precision floating-point format.  The
3446 conversion is performed according to the IEC/IEEE Standard for Binary
3447 Floating-Point Arithmetic.
3448 -------------------------------------------------------------------------------
3449 */
3450 float32 floatx80_to_float32( floatx80 a )
3451 {
3452     flag aSign;
3453     int32 aExp;
3454     bits64 aSig;
3455 
3456     aSig = extractFloatx80Frac( a );
3457     aExp = extractFloatx80Exp( a );
3458     aSign = extractFloatx80Sign( a );
3459     if ( aExp == 0x7FFF ) {
3460         if ( (bits64) ( aSig<<1 ) ) {
3461             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3462         }
3463         return packFloat32( aSign, 0xFF, 0 );
3464     }
3465     shift64RightJamming( aSig, 33, &aSig );
3466     if ( aExp || aSig ) aExp -= 0x3F81;
3467     return roundAndPackFloat32( aSign, aExp, aSig );
3468 
3469 }
3470 
3471 /*
3472 -------------------------------------------------------------------------------
3473 Returns the result of converting the extended double-precision floating-
3474 point value `a' to the double-precision floating-point format.  The
3475 conversion is performed according to the IEC/IEEE Standard for Binary
3476 Floating-Point Arithmetic.
3477 -------------------------------------------------------------------------------
3478 */
3479 float64 floatx80_to_float64( floatx80 a )
3480 {
3481     flag aSign;
3482     int32 aExp;
3483     bits64 aSig, zSig;
3484 
3485     aSig = extractFloatx80Frac( a );
3486     aExp = extractFloatx80Exp( a );
3487     aSign = extractFloatx80Sign( a );
3488     if ( aExp == 0x7FFF ) {
3489         if ( (bits64) ( aSig<<1 ) ) {
3490             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3491         }
3492         return packFloat64( aSign, 0x7FF, 0 );
3493     }
3494     shift64RightJamming( aSig, 1, &zSig );
3495     if ( aExp || aSig ) aExp -= 0x3C01;
3496     return roundAndPackFloat64( aSign, aExp, zSig );
3497 
3498 }
3499 
3500 #ifdef FLOAT128
3501 
3502 /*
3503 -------------------------------------------------------------------------------
3504 Returns the result of converting the extended double-precision floating-
3505 point value `a' to the quadruple-precision floating-point format.  The
3506 conversion is performed according to the IEC/IEEE Standard for Binary
3507 Floating-Point Arithmetic.
3508 -------------------------------------------------------------------------------
3509 */
3510 float128 floatx80_to_float128( floatx80 a )
3511 {
3512     flag aSign;
3513     int16 aExp;
3514     bits64 aSig, zSig0, zSig1;
3515 
3516     aSig = extractFloatx80Frac( a );
3517     aExp = extractFloatx80Exp( a );
3518     aSign = extractFloatx80Sign( a );
3519     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3520         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3521     }
3522     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3523     return packFloat128( aSign, aExp, zSig0, zSig1 );
3524 
3525 }
3526 
3527 #endif
3528 
3529 /*
3530 -------------------------------------------------------------------------------
3531 Rounds the extended double-precision floating-point value `a' to an integer,
3532 and returns the result as an extended quadruple-precision floating-point
3533 value.  The operation is performed according to the IEC/IEEE Standard for
3534 Binary Floating-Point Arithmetic.
3535 -------------------------------------------------------------------------------
3536 */
3537 floatx80 floatx80_round_to_int( floatx80 a )
3538 {
3539     flag aSign;
3540     int32 aExp;
3541     bits64 lastBitMask, roundBitsMask;
3542     int8 roundingMode;
3543     floatx80 z;
3544 
3545     aExp = extractFloatx80Exp( a );
3546     if ( 0x403E <= aExp ) {
3547         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3548             return propagateFloatx80NaN( a, a );
3549         }
3550         return a;
3551     }
3552     if ( aExp < 0x3FFF ) {
3553         if (    ( aExp == 0 )
3554              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3555             return a;
3556         }
3557         float_exception_flags |= float_flag_inexact;
3558         aSign = extractFloatx80Sign( a );
3559         switch ( float_rounding_mode ) {
3560          case float_round_nearest_even:
3561             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3562                ) {
3563                 return
3564                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3565             }
3566             break;
3567 	 case float_round_to_zero:
3568 	    break;
3569          case float_round_down:
3570             return
3571                   aSign ?
3572                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3573                 : packFloatx80( 0, 0, 0 );
3574          case float_round_up:
3575             return
3576                   aSign ? packFloatx80( 1, 0, 0 )
3577                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3578         }
3579         return packFloatx80( aSign, 0, 0 );
3580     }
3581     lastBitMask = 1;
3582     lastBitMask <<= 0x403E - aExp;
3583     roundBitsMask = lastBitMask - 1;
3584     z = a;
3585     roundingMode = float_rounding_mode;
3586     if ( roundingMode == float_round_nearest_even ) {
3587         z.low += lastBitMask>>1;
3588         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3589     }
3590     else if ( roundingMode != float_round_to_zero ) {
3591         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3592             z.low += roundBitsMask;
3593         }
3594     }
3595     z.low &= ~ roundBitsMask;
3596     if ( z.low == 0 ) {
3597         ++z.high;
3598         z.low = LIT64( 0x8000000000000000 );
3599     }
3600     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3601     return z;
3602 
3603 }
3604 
3605 /*
3606 -------------------------------------------------------------------------------
3607 Returns the result of adding the absolute values of the extended double-
3608 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3609 negated before being returned.  `zSign' is ignored if the result is a NaN.
3610 The addition is performed according to the IEC/IEEE Standard for Binary
3611 Floating-Point Arithmetic.
3612 -------------------------------------------------------------------------------
3613 */
3614 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3615 {
3616     int32 aExp, bExp, zExp;
3617     bits64 aSig, bSig, zSig0, zSig1;
3618     int32 expDiff;
3619 
3620     aSig = extractFloatx80Frac( a );
3621     aExp = extractFloatx80Exp( a );
3622     bSig = extractFloatx80Frac( b );
3623     bExp = extractFloatx80Exp( b );
3624     expDiff = aExp - bExp;
3625     if ( 0 < expDiff ) {
3626         if ( aExp == 0x7FFF ) {
3627             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3628             return a;
3629         }
3630         if ( bExp == 0 ) --expDiff;
3631         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3632         zExp = aExp;
3633     }
3634     else if ( expDiff < 0 ) {
3635         if ( bExp == 0x7FFF ) {
3636             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3637             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3638         }
3639         if ( aExp == 0 ) ++expDiff;
3640         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3641         zExp = bExp;
3642     }
3643     else {
3644         if ( aExp == 0x7FFF ) {
3645             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3646                 return propagateFloatx80NaN( a, b );
3647             }
3648             return a;
3649         }
3650         zSig1 = 0;
3651         zSig0 = aSig + bSig;
3652         if ( aExp == 0 ) {
3653             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3654             goto roundAndPack;
3655         }
3656         zExp = aExp;
3657         goto shiftRight1;
3658     }
3659     zSig0 = aSig + bSig;
3660     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3661  shiftRight1:
3662     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3663     zSig0 |= LIT64( 0x8000000000000000 );
3664     ++zExp;
3665  roundAndPack:
3666     return
3667         roundAndPackFloatx80(
3668             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3669 
3670 }
3671 
3672 /*
3673 -------------------------------------------------------------------------------
3674 Returns the result of subtracting the absolute values of the extended
3675 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3676 difference is negated before being returned.  `zSign' is ignored if the
3677 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3678 Standard for Binary Floating-Point Arithmetic.
3679 -------------------------------------------------------------------------------
3680 */
3681 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3682 {
3683     int32 aExp, bExp, zExp;
3684     bits64 aSig, bSig, zSig0, zSig1;
3685     int32 expDiff;
3686     floatx80 z;
3687 
3688     aSig = extractFloatx80Frac( a );
3689     aExp = extractFloatx80Exp( a );
3690     bSig = extractFloatx80Frac( b );
3691     bExp = extractFloatx80Exp( b );
3692     expDiff = aExp - bExp;
3693     if ( 0 < expDiff ) goto aExpBigger;
3694     if ( expDiff < 0 ) goto bExpBigger;
3695     if ( aExp == 0x7FFF ) {
3696         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3697             return propagateFloatx80NaN( a, b );
3698         }
3699         float_raise( float_flag_invalid );
3700         z.low = floatx80_default_nan_low;
3701         z.high = floatx80_default_nan_high;
3702         return z;
3703     }
3704     if ( aExp == 0 ) {
3705         aExp = 1;
3706         bExp = 1;
3707     }
3708     zSig1 = 0;
3709     if ( bSig < aSig ) goto aBigger;
3710     if ( aSig < bSig ) goto bBigger;
3711     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3712  bExpBigger:
3713     if ( bExp == 0x7FFF ) {
3714         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3715         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3716     }
3717     if ( aExp == 0 ) ++expDiff;
3718     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3719  bBigger:
3720     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3721     zExp = bExp;
3722     zSign ^= 1;
3723     goto normalizeRoundAndPack;
3724  aExpBigger:
3725     if ( aExp == 0x7FFF ) {
3726         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3727         return a;
3728     }
3729     if ( bExp == 0 ) --expDiff;
3730     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3731  aBigger:
3732     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3733     zExp = aExp;
3734  normalizeRoundAndPack:
3735     return
3736         normalizeRoundAndPackFloatx80(
3737             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3738 
3739 }
3740 
3741 /*
3742 -------------------------------------------------------------------------------
3743 Returns the result of adding the extended double-precision floating-point
3744 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3745 Standard for Binary Floating-Point Arithmetic.
3746 -------------------------------------------------------------------------------
3747 */
3748 floatx80 floatx80_add( floatx80 a, floatx80 b )
3749 {
3750     flag aSign, bSign;
3751 
3752     aSign = extractFloatx80Sign( a );
3753     bSign = extractFloatx80Sign( b );
3754     if ( aSign == bSign ) {
3755         return addFloatx80Sigs( a, b, aSign );
3756     }
3757     else {
3758         return subFloatx80Sigs( a, b, aSign );
3759     }
3760 
3761 }
3762 
3763 /*
3764 -------------------------------------------------------------------------------
3765 Returns the result of subtracting the extended double-precision floating-
3766 point values `a' and `b'.  The operation is performed according to the
3767 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3768 -------------------------------------------------------------------------------
3769 */
3770 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3771 {
3772     flag aSign, bSign;
3773 
3774     aSign = extractFloatx80Sign( a );
3775     bSign = extractFloatx80Sign( b );
3776     if ( aSign == bSign ) {
3777         return subFloatx80Sigs( a, b, aSign );
3778     }
3779     else {
3780         return addFloatx80Sigs( a, b, aSign );
3781     }
3782 
3783 }
3784 
3785 /*
3786 -------------------------------------------------------------------------------
3787 Returns the result of multiplying the extended double-precision floating-
3788 point values `a' and `b'.  The operation is performed according to the
3789 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3790 -------------------------------------------------------------------------------
3791 */
3792 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3793 {
3794     flag aSign, bSign, zSign;
3795     int32 aExp, bExp, zExp;
3796     bits64 aSig, bSig, zSig0, zSig1;
3797     floatx80 z;
3798 
3799     aSig = extractFloatx80Frac( a );
3800     aExp = extractFloatx80Exp( a );
3801     aSign = extractFloatx80Sign( a );
3802     bSig = extractFloatx80Frac( b );
3803     bExp = extractFloatx80Exp( b );
3804     bSign = extractFloatx80Sign( b );
3805     zSign = aSign ^ bSign;
3806     if ( aExp == 0x7FFF ) {
3807         if (    (bits64) ( aSig<<1 )
3808              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3809             return propagateFloatx80NaN( a, b );
3810         }
3811         if ( ( bExp | bSig ) == 0 ) goto invalid;
3812         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3813     }
3814     if ( bExp == 0x7FFF ) {
3815         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3816         if ( ( aExp | aSig ) == 0 ) {
3817  invalid:
3818             float_raise( float_flag_invalid );
3819             z.low = floatx80_default_nan_low;
3820             z.high = floatx80_default_nan_high;
3821             return z;
3822         }
3823         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3824     }
3825     if ( aExp == 0 ) {
3826         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3827         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3828     }
3829     if ( bExp == 0 ) {
3830         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3831         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3832     }
3833     zExp = aExp + bExp - 0x3FFE;
3834     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3835     if ( 0 < (sbits64) zSig0 ) {
3836         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3837         --zExp;
3838     }
3839     return
3840         roundAndPackFloatx80(
3841             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3842 
3843 }
3844 
3845 /*
3846 -------------------------------------------------------------------------------
3847 Returns the result of dividing the extended double-precision floating-point
3848 value `a' by the corresponding value `b'.  The operation is performed
3849 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3850 -------------------------------------------------------------------------------
3851 */
3852 floatx80 floatx80_div( floatx80 a, floatx80 b )
3853 {
3854     flag aSign, bSign, zSign;
3855     int32 aExp, bExp, zExp;
3856     bits64 aSig, bSig, zSig0, zSig1;
3857     bits64 rem0, rem1, rem2, term0, term1, term2;
3858     floatx80 z;
3859 
3860     aSig = extractFloatx80Frac( a );
3861     aExp = extractFloatx80Exp( a );
3862     aSign = extractFloatx80Sign( a );
3863     bSig = extractFloatx80Frac( b );
3864     bExp = extractFloatx80Exp( b );
3865     bSign = extractFloatx80Sign( b );
3866     zSign = aSign ^ bSign;
3867     if ( aExp == 0x7FFF ) {
3868         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3869         if ( bExp == 0x7FFF ) {
3870             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3871             goto invalid;
3872         }
3873         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3874     }
3875     if ( bExp == 0x7FFF ) {
3876         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3877         return packFloatx80( zSign, 0, 0 );
3878     }
3879     if ( bExp == 0 ) {
3880         if ( bSig == 0 ) {
3881             if ( ( aExp | aSig ) == 0 ) {
3882  invalid:
3883                 float_raise( float_flag_invalid );
3884                 z.low = floatx80_default_nan_low;
3885                 z.high = floatx80_default_nan_high;
3886                 return z;
3887             }
3888             float_raise( float_flag_divbyzero );
3889             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3890         }
3891         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3892     }
3893     if ( aExp == 0 ) {
3894         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3895         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3896     }
3897     zExp = aExp - bExp + 0x3FFE;
3898     rem1 = 0;
3899     if ( bSig <= aSig ) {
3900         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3901         ++zExp;
3902     }
3903     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3904     mul64To128( bSig, zSig0, &term0, &term1 );
3905     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3906     while ( (sbits64) rem0 < 0 ) {
3907         --zSig0;
3908         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3909     }
3910     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3911     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3912         mul64To128( bSig, zSig1, &term1, &term2 );
3913         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3914         while ( (sbits64) rem1 < 0 ) {
3915             --zSig1;
3916             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3917         }
3918         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3919     }
3920     return
3921         roundAndPackFloatx80(
3922             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3923 
3924 }
3925 
3926 /*
3927 -------------------------------------------------------------------------------
3928 Returns the remainder of the extended double-precision floating-point value
3929 `a' with respect to the corresponding value `b'.  The operation is performed
3930 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3931 -------------------------------------------------------------------------------
3932 */
3933 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3934 {
3935     flag aSign, bSign, zSign;
3936     int32 aExp, bExp, expDiff;
3937     bits64 aSig0, aSig1, bSig;
3938     bits64 q, term0, term1, alternateASig0, alternateASig1;
3939     floatx80 z;
3940 
3941     aSig0 = extractFloatx80Frac( a );
3942     aExp = extractFloatx80Exp( a );
3943     aSign = extractFloatx80Sign( a );
3944     bSig = extractFloatx80Frac( b );
3945     bExp = extractFloatx80Exp( b );
3946     bSign = extractFloatx80Sign( b );
3947     if ( aExp == 0x7FFF ) {
3948         if (    (bits64) ( aSig0<<1 )
3949              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3950             return propagateFloatx80NaN( a, b );
3951         }
3952         goto invalid;
3953     }
3954     if ( bExp == 0x7FFF ) {
3955         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3956         return a;
3957     }
3958     if ( bExp == 0 ) {
3959         if ( bSig == 0 ) {
3960  invalid:
3961             float_raise( float_flag_invalid );
3962             z.low = floatx80_default_nan_low;
3963             z.high = floatx80_default_nan_high;
3964             return z;
3965         }
3966         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3967     }
3968     if ( aExp == 0 ) {
3969         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3970         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3971     }
3972     bSig |= LIT64( 0x8000000000000000 );
3973     zSign = aSign;
3974     expDiff = aExp - bExp;
3975     aSig1 = 0;
3976     if ( expDiff < 0 ) {
3977         if ( expDiff < -1 ) return a;
3978         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3979         expDiff = 0;
3980     }
3981     q = ( bSig <= aSig0 );
3982     if ( q ) aSig0 -= bSig;
3983     expDiff -= 64;
3984     while ( 0 < expDiff ) {
3985         q = estimateDiv128To64( aSig0, aSig1, bSig );
3986         q = ( 2 < q ) ? q - 2 : 0;
3987         mul64To128( bSig, q, &term0, &term1 );
3988         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3989         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3990         expDiff -= 62;
3991     }
3992     expDiff += 64;
3993     if ( 0 < expDiff ) {
3994         q = estimateDiv128To64( aSig0, aSig1, bSig );
3995         q = ( 2 < q ) ? q - 2 : 0;
3996         q >>= 64 - expDiff;
3997         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3998         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3999         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4000         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4001             ++q;
4002             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4003         }
4004     }
4005     else {
4006         term1 = 0;
4007         term0 = bSig;
4008     }
4009     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4010     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4011          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4012               && ( q & 1 ) )
4013        ) {
4014         aSig0 = alternateASig0;
4015         aSig1 = alternateASig1;
4016         zSign = ! zSign;
4017     }
4018     return
4019         normalizeRoundAndPackFloatx80(
4020             80, zSign, bExp + expDiff, aSig0, aSig1 );
4021 
4022 }
4023 
4024 /*
4025 -------------------------------------------------------------------------------
4026 Returns the square root of the extended double-precision floating-point
4027 value `a'.  The operation is performed according to the IEC/IEEE Standard
4028 for Binary Floating-Point Arithmetic.
4029 -------------------------------------------------------------------------------
4030 */
4031 floatx80 floatx80_sqrt( floatx80 a )
4032 {
4033     flag aSign;
4034     int32 aExp, zExp;
4035     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4036     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4037     floatx80 z;
4038 
4039     aSig0 = extractFloatx80Frac( a );
4040     aExp = extractFloatx80Exp( a );
4041     aSign = extractFloatx80Sign( a );
4042     if ( aExp == 0x7FFF ) {
4043         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4044         if ( ! aSign ) return a;
4045         goto invalid;
4046     }
4047     if ( aSign ) {
4048         if ( ( aExp | aSig0 ) == 0 ) return a;
4049  invalid:
4050         float_raise( float_flag_invalid );
4051         z.low = floatx80_default_nan_low;
4052         z.high = floatx80_default_nan_high;
4053         return z;
4054     }
4055     if ( aExp == 0 ) {
4056         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4057         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4058     }
4059     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4060     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4061     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4062     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4063     doubleZSig0 = zSig0<<1;
4064     mul64To128( zSig0, zSig0, &term0, &term1 );
4065     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4066     while ( (sbits64) rem0 < 0 ) {
4067         --zSig0;
4068         doubleZSig0 -= 2;
4069         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4070     }
4071     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4072     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4073         if ( zSig1 == 0 ) zSig1 = 1;
4074         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4075         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4076         mul64To128( zSig1, zSig1, &term2, &term3 );
4077         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4078         while ( (sbits64) rem1 < 0 ) {
4079             --zSig1;
4080             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4081             term3 |= 1;
4082             term2 |= doubleZSig0;
4083             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4084         }
4085         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4086     }
4087     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4088     zSig0 |= doubleZSig0;
4089     return
4090         roundAndPackFloatx80(
4091             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4092 
4093 }
4094 
4095 /*
4096 -------------------------------------------------------------------------------
4097 Returns 1 if the extended double-precision floating-point value `a' is
4098 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4099 performed according to the IEC/IEEE Standard for Binary Floating-Point
4100 Arithmetic.
4101 -------------------------------------------------------------------------------
4102 */
4103 flag floatx80_eq( floatx80 a, floatx80 b )
4104 {
4105 
4106     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4107               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4108          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4109               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4110        ) {
4111         if (    floatx80_is_signaling_nan( a )
4112              || floatx80_is_signaling_nan( b ) ) {
4113             float_raise( float_flag_invalid );
4114         }
4115         return 0;
4116     }
4117     return
4118            ( a.low == b.low )
4119         && (    ( a.high == b.high )
4120              || (    ( a.low == 0 )
4121                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4122            );
4123 
4124 }
4125 
4126 /*
4127 -------------------------------------------------------------------------------
4128 Returns 1 if the extended double-precision floating-point value `a' is
4129 less than or equal to the corresponding value `b', and 0 otherwise.  The
4130 comparison is performed according to the IEC/IEEE Standard for Binary
4131 Floating-Point Arithmetic.
4132 -------------------------------------------------------------------------------
4133 */
4134 flag floatx80_le( floatx80 a, floatx80 b )
4135 {
4136     flag aSign, bSign;
4137 
4138     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4139               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4140          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4141               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4142        ) {
4143         float_raise( float_flag_invalid );
4144         return 0;
4145     }
4146     aSign = extractFloatx80Sign( a );
4147     bSign = extractFloatx80Sign( b );
4148     if ( aSign != bSign ) {
4149         return
4150                aSign
4151             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4152                  == 0 );
4153     }
4154     return
4155           aSign ? le128( b.high, b.low, a.high, a.low )
4156         : le128( a.high, a.low, b.high, b.low );
4157 
4158 }
4159 
4160 /*
4161 -------------------------------------------------------------------------------
4162 Returns 1 if the extended double-precision floating-point value `a' is
4163 less than the corresponding value `b', and 0 otherwise.  The comparison
4164 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4165 Arithmetic.
4166 -------------------------------------------------------------------------------
4167 */
4168 flag floatx80_lt( floatx80 a, floatx80 b )
4169 {
4170     flag aSign, bSign;
4171 
4172     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4173               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4174          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4175               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4176        ) {
4177         float_raise( float_flag_invalid );
4178         return 0;
4179     }
4180     aSign = extractFloatx80Sign( a );
4181     bSign = extractFloatx80Sign( b );
4182     if ( aSign != bSign ) {
4183         return
4184                aSign
4185             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4186                  != 0 );
4187     }
4188     return
4189           aSign ? lt128( b.high, b.low, a.high, a.low )
4190         : lt128( a.high, a.low, b.high, b.low );
4191 
4192 }
4193 
4194 /*
4195 -------------------------------------------------------------------------------
4196 Returns 1 if the extended double-precision floating-point value `a' is equal
4197 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4198 raised if either operand is a NaN.  Otherwise, the comparison is performed
4199 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4200 -------------------------------------------------------------------------------
4201 */
4202 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4203 {
4204 
4205     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4206               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4207          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4208               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4209        ) {
4210         float_raise( float_flag_invalid );
4211         return 0;
4212     }
4213     return
4214            ( a.low == b.low )
4215         && (    ( a.high == b.high )
4216              || (    ( a.low == 0 )
4217                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4218            );
4219 
4220 }
4221 
4222 /*
4223 -------------------------------------------------------------------------------
4224 Returns 1 if the extended double-precision floating-point value `a' is less
4225 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4226 do not cause an exception.  Otherwise, the comparison is performed according
4227 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4228 -------------------------------------------------------------------------------
4229 */
4230 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4231 {
4232     flag aSign, bSign;
4233 
4234     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4235               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4236          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4237               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4238        ) {
4239         if (    floatx80_is_signaling_nan( a )
4240              || floatx80_is_signaling_nan( b ) ) {
4241             float_raise( float_flag_invalid );
4242         }
4243         return 0;
4244     }
4245     aSign = extractFloatx80Sign( a );
4246     bSign = extractFloatx80Sign( b );
4247     if ( aSign != bSign ) {
4248         return
4249                aSign
4250             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4251                  == 0 );
4252     }
4253     return
4254           aSign ? le128( b.high, b.low, a.high, a.low )
4255         : le128( a.high, a.low, b.high, b.low );
4256 
4257 }
4258 
4259 /*
4260 -------------------------------------------------------------------------------
4261 Returns 1 if the extended double-precision floating-point value `a' is less
4262 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4263 an exception.  Otherwise, the comparison is performed according to the
4264 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4265 -------------------------------------------------------------------------------
4266 */
4267 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4268 {
4269     flag aSign, bSign;
4270 
4271     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4272               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4273          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4274               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4275        ) {
4276         if (    floatx80_is_signaling_nan( a )
4277              || floatx80_is_signaling_nan( b ) ) {
4278             float_raise( float_flag_invalid );
4279         }
4280         return 0;
4281     }
4282     aSign = extractFloatx80Sign( a );
4283     bSign = extractFloatx80Sign( b );
4284     if ( aSign != bSign ) {
4285         return
4286                aSign
4287             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4288                  != 0 );
4289     }
4290     return
4291           aSign ? lt128( b.high, b.low, a.high, a.low )
4292         : lt128( a.high, a.low, b.high, b.low );
4293 
4294 }
4295 
4296 #endif
4297 
4298 #ifdef FLOAT128
4299 
4300 /*
4301 -------------------------------------------------------------------------------
4302 Returns the result of converting the quadruple-precision floating-point
4303 value `a' to the 32-bit two's complement integer format.  The conversion
4304 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4305 Arithmetic---which means in particular that the conversion is rounded
4306 according to the current rounding mode.  If `a' is a NaN, the largest
4307 positive integer is returned.  Otherwise, if the conversion overflows, the
4308 largest integer with the same sign as `a' is returned.
4309 -------------------------------------------------------------------------------
4310 */
4311 int32 float128_to_int32( float128 a )
4312 {
4313     flag aSign;
4314     int32 aExp, shiftCount;
4315     bits64 aSig0, aSig1;
4316 
4317     aSig1 = extractFloat128Frac1( a );
4318     aSig0 = extractFloat128Frac0( a );
4319     aExp = extractFloat128Exp( a );
4320     aSign = extractFloat128Sign( a );
4321     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4322     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4323     aSig0 |= ( aSig1 != 0 );
4324     shiftCount = 0x4028 - aExp;
4325     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4326     return roundAndPackInt32( aSign, aSig0 );
4327 
4328 }
4329 
4330 /*
4331 -------------------------------------------------------------------------------
4332 Returns the result of converting the quadruple-precision floating-point
4333 value `a' to the 32-bit two's complement integer format.  The conversion
4334 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4335 Arithmetic, except that the conversion is always rounded toward zero.  If
4336 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4337 conversion overflows, the largest integer with the same sign as `a' is
4338 returned.
4339 -------------------------------------------------------------------------------
4340 */
4341 int32 float128_to_int32_round_to_zero( float128 a )
4342 {
4343     flag aSign;
4344     int32 aExp, shiftCount;
4345     bits64 aSig0, aSig1, savedASig;
4346     int32 z;
4347 
4348     aSig1 = extractFloat128Frac1( a );
4349     aSig0 = extractFloat128Frac0( a );
4350     aExp = extractFloat128Exp( a );
4351     aSign = extractFloat128Sign( a );
4352     aSig0 |= ( aSig1 != 0 );
4353     if ( 0x401E < aExp ) {
4354         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4355         goto invalid;
4356     }
4357     else if ( aExp < 0x3FFF ) {
4358         if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4359         return 0;
4360     }
4361     aSig0 |= LIT64( 0x0001000000000000 );
4362     shiftCount = 0x402F - aExp;
4363     savedASig = aSig0;
4364     aSig0 >>= shiftCount;
4365     z = (int32)aSig0;
4366     if ( aSign ) z = - z;
4367     if ( ( z < 0 ) ^ aSign ) {
4368  invalid:
4369         float_raise( float_flag_invalid );
4370         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4371     }
4372     if ( ( aSig0<<shiftCount ) != savedASig ) {
4373         float_exception_flags |= float_flag_inexact;
4374     }
4375     return z;
4376 
4377 }
4378 
4379 /*
4380 -------------------------------------------------------------------------------
4381 Returns the result of converting the quadruple-precision floating-point
4382 value `a' to the 64-bit two's complement integer format.  The conversion
4383 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4384 Arithmetic---which means in particular that the conversion is rounded
4385 according to the current rounding mode.  If `a' is a NaN, the largest
4386 positive integer is returned.  Otherwise, if the conversion overflows, the
4387 largest integer with the same sign as `a' is returned.
4388 -------------------------------------------------------------------------------
4389 */
4390 int64 float128_to_int64( float128 a )
4391 {
4392     flag aSign;
4393     int32 aExp, shiftCount;
4394     bits64 aSig0, aSig1;
4395 
4396     aSig1 = extractFloat128Frac1( a );
4397     aSig0 = extractFloat128Frac0( a );
4398     aExp = extractFloat128Exp( a );
4399     aSign = extractFloat128Sign( a );
4400     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4401     shiftCount = 0x402F - aExp;
4402     if ( shiftCount <= 0 ) {
4403         if ( 0x403E < aExp ) {
4404             float_raise( float_flag_invalid );
4405             if (    ! aSign
4406                  || (    ( aExp == 0x7FFF )
4407                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4408                     )
4409                ) {
4410                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4411             }
4412             return (sbits64) LIT64( 0x8000000000000000 );
4413         }
4414         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4415     }
4416     else {
4417         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4418     }
4419     return roundAndPackInt64( aSign, aSig0, aSig1 );
4420 
4421 }
4422 
4423 /*
4424 -------------------------------------------------------------------------------
4425 Returns the result of converting the quadruple-precision floating-point
4426 value `a' to the 64-bit two's complement integer format.  The conversion
4427 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4428 Arithmetic, except that the conversion is always rounded toward zero.
4429 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4430 the conversion overflows, the largest integer with the same sign as `a' is
4431 returned.
4432 -------------------------------------------------------------------------------
4433 */
4434 int64 float128_to_int64_round_to_zero( float128 a )
4435 {
4436     flag aSign;
4437     int32 aExp, shiftCount;
4438     bits64 aSig0, aSig1;
4439     int64 z;
4440 
4441     aSig1 = extractFloat128Frac1( a );
4442     aSig0 = extractFloat128Frac0( a );
4443     aExp = extractFloat128Exp( a );
4444     aSign = extractFloat128Sign( a );
4445     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4446     shiftCount = aExp - 0x402F;
4447     if ( 0 < shiftCount ) {
4448         if ( 0x403E <= aExp ) {
4449             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4450             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4451                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4452                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4453             }
4454             else {
4455                 float_raise( float_flag_invalid );
4456                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4457                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4458                 }
4459             }
4460             return (sbits64) LIT64( 0x8000000000000000 );
4461         }
4462         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4463         if ( (bits64) ( aSig1<<shiftCount ) ) {
4464             float_exception_flags |= float_flag_inexact;
4465         }
4466     }
4467     else {
4468         if ( aExp < 0x3FFF ) {
4469             if ( aExp | aSig0 | aSig1 ) {
4470                 float_exception_flags |= float_flag_inexact;
4471             }
4472             return 0;
4473         }
4474         z = aSig0>>( - shiftCount );
4475         if (    aSig1
4476              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4477             float_exception_flags |= float_flag_inexact;
4478         }
4479     }
4480     if ( aSign ) z = - z;
4481     return z;
4482 
4483 }
4484 
4485 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4486     && defined(SOFTFLOAT_NEED_FIXUNS)
4487 /*
4488  * just like above - but do not care for overflow of signed results
4489  */
4490 uint64 float128_to_uint64_round_to_zero( float128 a )
4491 {
4492     flag aSign;
4493     int32 aExp, shiftCount;
4494     bits64 aSig0, aSig1;
4495     uint64 z;
4496 
4497     aSig1 = extractFloat128Frac1( a );
4498     aSig0 = extractFloat128Frac0( a );
4499     aExp = extractFloat128Exp( a );
4500     aSign = extractFloat128Sign( a );
4501     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4502     shiftCount = aExp - 0x402F;
4503     if ( 0 < shiftCount ) {
4504         if ( 0x403F <= aExp ) {
4505             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4506             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4507                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4508                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4509             }
4510             else {
4511                 float_raise( float_flag_invalid );
4512             }
4513             return LIT64( 0xFFFFFFFFFFFFFFFF );
4514         }
4515         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4516         if ( (bits64) ( aSig1<<shiftCount ) ) {
4517             float_exception_flags |= float_flag_inexact;
4518         }
4519     }
4520     else {
4521         if ( aExp < 0x3FFF ) {
4522             if ( aExp | aSig0 | aSig1 ) {
4523                 float_exception_flags |= float_flag_inexact;
4524             }
4525             return 0;
4526         }
4527         z = aSig0>>( - shiftCount );
4528         if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4529             float_exception_flags |= float_flag_inexact;
4530         }
4531     }
4532     if ( aSign ) z = - z;
4533     return z;
4534 
4535 }
4536 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4537 
4538 /*
4539 -------------------------------------------------------------------------------
4540 Returns the result of converting the quadruple-precision floating-point
4541 value `a' to the single-precision floating-point format.  The conversion
4542 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4543 Arithmetic.
4544 -------------------------------------------------------------------------------
4545 */
4546 float32 float128_to_float32( float128 a )
4547 {
4548     flag aSign;
4549     int32 aExp;
4550     bits64 aSig0, aSig1;
4551     bits32 zSig;
4552 
4553     aSig1 = extractFloat128Frac1( a );
4554     aSig0 = extractFloat128Frac0( a );
4555     aExp = extractFloat128Exp( a );
4556     aSign = extractFloat128Sign( a );
4557     if ( aExp == 0x7FFF ) {
4558         if ( aSig0 | aSig1 ) {
4559             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4560         }
4561         return packFloat32( aSign, 0xFF, 0 );
4562     }
4563     aSig0 |= ( aSig1 != 0 );
4564     shift64RightJamming( aSig0, 18, &aSig0 );
4565     zSig = (bits32)aSig0;
4566     if ( aExp || zSig ) {
4567         zSig |= 0x40000000;
4568         aExp -= 0x3F81;
4569     }
4570     return roundAndPackFloat32( aSign, aExp, zSig );
4571 
4572 }
4573 
4574 /*
4575 -------------------------------------------------------------------------------
4576 Returns the result of converting the quadruple-precision floating-point
4577 value `a' to the double-precision floating-point format.  The conversion
4578 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4579 Arithmetic.
4580 -------------------------------------------------------------------------------
4581 */
4582 float64 float128_to_float64( float128 a )
4583 {
4584     flag aSign;
4585     int32 aExp;
4586     bits64 aSig0, aSig1;
4587 
4588     aSig1 = extractFloat128Frac1( a );
4589     aSig0 = extractFloat128Frac0( a );
4590     aExp = extractFloat128Exp( a );
4591     aSign = extractFloat128Sign( a );
4592     if ( aExp == 0x7FFF ) {
4593         if ( aSig0 | aSig1 ) {
4594             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4595         }
4596         return packFloat64( aSign, 0x7FF, 0 );
4597     }
4598     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4599     aSig0 |= ( aSig1 != 0 );
4600     if ( aExp || aSig0 ) {
4601         aSig0 |= LIT64( 0x4000000000000000 );
4602         aExp -= 0x3C01;
4603     }
4604     return roundAndPackFloat64( aSign, aExp, aSig0 );
4605 
4606 }
4607 
4608 #ifdef FLOATX80
4609 
4610 /*
4611 -------------------------------------------------------------------------------
4612 Returns the result of converting the quadruple-precision floating-point
4613 value `a' to the extended double-precision floating-point format.  The
4614 conversion is performed according to the IEC/IEEE Standard for Binary
4615 Floating-Point Arithmetic.
4616 -------------------------------------------------------------------------------
4617 */
4618 floatx80 float128_to_floatx80( float128 a )
4619 {
4620     flag aSign;
4621     int32 aExp;
4622     bits64 aSig0, aSig1;
4623 
4624     aSig1 = extractFloat128Frac1( a );
4625     aSig0 = extractFloat128Frac0( a );
4626     aExp = extractFloat128Exp( a );
4627     aSign = extractFloat128Sign( a );
4628     if ( aExp == 0x7FFF ) {
4629         if ( aSig0 | aSig1 ) {
4630             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4631         }
4632         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4633     }
4634     if ( aExp == 0 ) {
4635         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4636         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4637     }
4638     else {
4639         aSig0 |= LIT64( 0x0001000000000000 );
4640     }
4641     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4642     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4643 
4644 }
4645 
4646 #endif
4647 
4648 /*
4649 -------------------------------------------------------------------------------
4650 Rounds the quadruple-precision floating-point value `a' to an integer, and
4651 returns the result as a quadruple-precision floating-point value.  The
4652 operation is performed according to the IEC/IEEE Standard for Binary
4653 Floating-Point Arithmetic.
4654 -------------------------------------------------------------------------------
4655 */
4656 float128 float128_round_to_int( float128 a )
4657 {
4658     flag aSign;
4659     int32 aExp;
4660     bits64 lastBitMask, roundBitsMask;
4661     int8 roundingMode;
4662     float128 z;
4663 
4664     aExp = extractFloat128Exp( a );
4665     if ( 0x402F <= aExp ) {
4666         if ( 0x406F <= aExp ) {
4667             if (    ( aExp == 0x7FFF )
4668                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4669                ) {
4670                 return propagateFloat128NaN( a, a );
4671             }
4672             return a;
4673         }
4674         lastBitMask = 1;
4675         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4676         roundBitsMask = lastBitMask - 1;
4677         z = a;
4678         roundingMode = float_rounding_mode;
4679         if ( roundingMode == float_round_nearest_even ) {
4680             if ( lastBitMask ) {
4681                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4682                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4683             }
4684             else {
4685                 if ( (sbits64) z.low < 0 ) {
4686                     ++z.high;
4687                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4688                 }
4689             }
4690         }
4691         else if ( roundingMode != float_round_to_zero ) {
4692             if (   extractFloat128Sign( z )
4693                  ^ ( roundingMode == float_round_up ) ) {
4694                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4695             }
4696         }
4697         z.low &= ~ roundBitsMask;
4698     }
4699     else {
4700         if ( aExp < 0x3FFF ) {
4701             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4702             float_exception_flags |= float_flag_inexact;
4703             aSign = extractFloat128Sign( a );
4704             switch ( float_rounding_mode ) {
4705              case float_round_nearest_even:
4706                 if (    ( aExp == 0x3FFE )
4707                      && (   extractFloat128Frac0( a )
4708                           | extractFloat128Frac1( a ) )
4709                    ) {
4710                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4711                 }
4712                 break;
4713 	     case float_round_to_zero:
4714 		break;
4715              case float_round_down:
4716                 return
4717                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4718                     : packFloat128( 0, 0, 0, 0 );
4719              case float_round_up:
4720                 return
4721                       aSign ? packFloat128( 1, 0, 0, 0 )
4722                     : packFloat128( 0, 0x3FFF, 0, 0 );
4723             }
4724             return packFloat128( aSign, 0, 0, 0 );
4725         }
4726         lastBitMask = 1;
4727         lastBitMask <<= 0x402F - aExp;
4728         roundBitsMask = lastBitMask - 1;
4729         z.low = 0;
4730         z.high = a.high;
4731         roundingMode = float_rounding_mode;
4732         if ( roundingMode == float_round_nearest_even ) {
4733             z.high += lastBitMask>>1;
4734             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4735                 z.high &= ~ lastBitMask;
4736             }
4737         }
4738         else if ( roundingMode != float_round_to_zero ) {
4739             if (   extractFloat128Sign( z )
4740                  ^ ( roundingMode == float_round_up ) ) {
4741                 z.high |= ( a.low != 0 );
4742                 z.high += roundBitsMask;
4743             }
4744         }
4745         z.high &= ~ roundBitsMask;
4746     }
4747     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4748         float_exception_flags |= float_flag_inexact;
4749     }
4750     return z;
4751 
4752 }
4753 
4754 /*
4755 -------------------------------------------------------------------------------
4756 Returns the result of adding the absolute values of the quadruple-precision
4757 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4758 before being returned.  `zSign' is ignored if the result is a NaN.
4759 The addition is performed according to the IEC/IEEE Standard for Binary
4760 Floating-Point Arithmetic.
4761 -------------------------------------------------------------------------------
4762 */
4763 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4764 {
4765     int32 aExp, bExp, zExp;
4766     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4767     int32 expDiff;
4768 
4769     aSig1 = extractFloat128Frac1( a );
4770     aSig0 = extractFloat128Frac0( a );
4771     aExp = extractFloat128Exp( a );
4772     bSig1 = extractFloat128Frac1( b );
4773     bSig0 = extractFloat128Frac0( b );
4774     bExp = extractFloat128Exp( b );
4775     expDiff = aExp - bExp;
4776     if ( 0 < expDiff ) {
4777         if ( aExp == 0x7FFF ) {
4778             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4779             return a;
4780         }
4781         if ( bExp == 0 ) {
4782             --expDiff;
4783         }
4784         else {
4785             bSig0 |= LIT64( 0x0001000000000000 );
4786         }
4787         shift128ExtraRightJamming(
4788             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4789         zExp = aExp;
4790     }
4791     else if ( expDiff < 0 ) {
4792         if ( bExp == 0x7FFF ) {
4793             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4794             return packFloat128( zSign, 0x7FFF, 0, 0 );
4795         }
4796         if ( aExp == 0 ) {
4797             ++expDiff;
4798         }
4799         else {
4800             aSig0 |= LIT64( 0x0001000000000000 );
4801         }
4802         shift128ExtraRightJamming(
4803             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4804         zExp = bExp;
4805     }
4806     else {
4807         if ( aExp == 0x7FFF ) {
4808             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4809                 return propagateFloat128NaN( a, b );
4810             }
4811             return a;
4812         }
4813         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4814         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4815         zSig2 = 0;
4816         zSig0 |= LIT64( 0x0002000000000000 );
4817         zExp = aExp;
4818         goto shiftRight1;
4819     }
4820     aSig0 |= LIT64( 0x0001000000000000 );
4821     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4822     --zExp;
4823     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4824     ++zExp;
4825  shiftRight1:
4826     shift128ExtraRightJamming(
4827         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4828  roundAndPack:
4829     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4830 
4831 }
4832 
4833 /*
4834 -------------------------------------------------------------------------------
4835 Returns the result of subtracting the absolute values of the quadruple-
4836 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4837 difference is negated before being returned.  `zSign' is ignored if the
4838 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4839 Standard for Binary Floating-Point Arithmetic.
4840 -------------------------------------------------------------------------------
4841 */
4842 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4843 {
4844     int32 aExp, bExp, zExp;
4845     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4846     int32 expDiff;
4847     float128 z;
4848 
4849     aSig1 = extractFloat128Frac1( a );
4850     aSig0 = extractFloat128Frac0( a );
4851     aExp = extractFloat128Exp( a );
4852     bSig1 = extractFloat128Frac1( b );
4853     bSig0 = extractFloat128Frac0( b );
4854     bExp = extractFloat128Exp( b );
4855     expDiff = aExp - bExp;
4856     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4857     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4858     if ( 0 < expDiff ) goto aExpBigger;
4859     if ( expDiff < 0 ) goto bExpBigger;
4860     if ( aExp == 0x7FFF ) {
4861         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4862             return propagateFloat128NaN( a, b );
4863         }
4864         float_raise( float_flag_invalid );
4865         z.low = float128_default_nan_low;
4866         z.high = float128_default_nan_high;
4867         return z;
4868     }
4869     if ( aExp == 0 ) {
4870         aExp = 1;
4871         bExp = 1;
4872     }
4873     if ( bSig0 < aSig0 ) goto aBigger;
4874     if ( aSig0 < bSig0 ) goto bBigger;
4875     if ( bSig1 < aSig1 ) goto aBigger;
4876     if ( aSig1 < bSig1 ) goto bBigger;
4877     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4878  bExpBigger:
4879     if ( bExp == 0x7FFF ) {
4880         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4881         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4882     }
4883     if ( aExp == 0 ) {
4884         ++expDiff;
4885     }
4886     else {
4887         aSig0 |= LIT64( 0x4000000000000000 );
4888     }
4889     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4890     bSig0 |= LIT64( 0x4000000000000000 );
4891  bBigger:
4892     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4893     zExp = bExp;
4894     zSign ^= 1;
4895     goto normalizeRoundAndPack;
4896  aExpBigger:
4897     if ( aExp == 0x7FFF ) {
4898         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4899         return a;
4900     }
4901     if ( bExp == 0 ) {
4902         --expDiff;
4903     }
4904     else {
4905         bSig0 |= LIT64( 0x4000000000000000 );
4906     }
4907     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4908     aSig0 |= LIT64( 0x4000000000000000 );
4909  aBigger:
4910     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4911     zExp = aExp;
4912  normalizeRoundAndPack:
4913     --zExp;
4914     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4915 
4916 }
4917 
4918 /*
4919 -------------------------------------------------------------------------------
4920 Returns the result of adding the quadruple-precision floating-point values
4921 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4922 for Binary Floating-Point Arithmetic.
4923 -------------------------------------------------------------------------------
4924 */
4925 float128 float128_add( float128 a, float128 b )
4926 {
4927     flag aSign, bSign;
4928 
4929     aSign = extractFloat128Sign( a );
4930     bSign = extractFloat128Sign( b );
4931     if ( aSign == bSign ) {
4932         return addFloat128Sigs( a, b, aSign );
4933     }
4934     else {
4935         return subFloat128Sigs( a, b, aSign );
4936     }
4937 
4938 }
4939 
4940 /*
4941 -------------------------------------------------------------------------------
4942 Returns the result of subtracting the quadruple-precision floating-point
4943 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4944 Standard for Binary Floating-Point Arithmetic.
4945 -------------------------------------------------------------------------------
4946 */
4947 float128 float128_sub( float128 a, float128 b )
4948 {
4949     flag aSign, bSign;
4950 
4951     aSign = extractFloat128Sign( a );
4952     bSign = extractFloat128Sign( b );
4953     if ( aSign == bSign ) {
4954         return subFloat128Sigs( a, b, aSign );
4955     }
4956     else {
4957         return addFloat128Sigs( a, b, aSign );
4958     }
4959 
4960 }
4961 
4962 /*
4963 -------------------------------------------------------------------------------
4964 Returns the result of multiplying the quadruple-precision floating-point
4965 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4966 Standard for Binary Floating-Point Arithmetic.
4967 -------------------------------------------------------------------------------
4968 */
4969 float128 float128_mul( float128 a, float128 b )
4970 {
4971     flag aSign, bSign, zSign;
4972     int32 aExp, bExp, zExp;
4973     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4974     float128 z;
4975 
4976     aSig1 = extractFloat128Frac1( a );
4977     aSig0 = extractFloat128Frac0( a );
4978     aExp = extractFloat128Exp( a );
4979     aSign = extractFloat128Sign( a );
4980     bSig1 = extractFloat128Frac1( b );
4981     bSig0 = extractFloat128Frac0( b );
4982     bExp = extractFloat128Exp( b );
4983     bSign = extractFloat128Sign( b );
4984     zSign = aSign ^ bSign;
4985     if ( aExp == 0x7FFF ) {
4986         if (    ( aSig0 | aSig1 )
4987              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4988             return propagateFloat128NaN( a, b );
4989         }
4990         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4991         return packFloat128( zSign, 0x7FFF, 0, 0 );
4992     }
4993     if ( bExp == 0x7FFF ) {
4994         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4995         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4996  invalid:
4997             float_raise( float_flag_invalid );
4998             z.low = float128_default_nan_low;
4999             z.high = float128_default_nan_high;
5000             return z;
5001         }
5002         return packFloat128( zSign, 0x7FFF, 0, 0 );
5003     }
5004     if ( aExp == 0 ) {
5005         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5006         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5007     }
5008     if ( bExp == 0 ) {
5009         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5010         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5011     }
5012     zExp = aExp + bExp - 0x4000;
5013     aSig0 |= LIT64( 0x0001000000000000 );
5014     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5015     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5016     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5017     zSig2 |= ( zSig3 != 0 );
5018     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5019         shift128ExtraRightJamming(
5020             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5021         ++zExp;
5022     }
5023     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5024 
5025 }
5026 
5027 /*
5028 -------------------------------------------------------------------------------
5029 Returns the result of dividing the quadruple-precision floating-point value
5030 `a' by the corresponding value `b'.  The operation is performed according to
5031 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5032 -------------------------------------------------------------------------------
5033 */
5034 float128 float128_div( float128 a, float128 b )
5035 {
5036     flag aSign, bSign, zSign;
5037     int32 aExp, bExp, zExp;
5038     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5039     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5040     float128 z;
5041 
5042     aSig1 = extractFloat128Frac1( a );
5043     aSig0 = extractFloat128Frac0( a );
5044     aExp = extractFloat128Exp( a );
5045     aSign = extractFloat128Sign( a );
5046     bSig1 = extractFloat128Frac1( b );
5047     bSig0 = extractFloat128Frac0( b );
5048     bExp = extractFloat128Exp( b );
5049     bSign = extractFloat128Sign( b );
5050     zSign = aSign ^ bSign;
5051     if ( aExp == 0x7FFF ) {
5052         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5053         if ( bExp == 0x7FFF ) {
5054             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5055             goto invalid;
5056         }
5057         return packFloat128( zSign, 0x7FFF, 0, 0 );
5058     }
5059     if ( bExp == 0x7FFF ) {
5060         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5061         return packFloat128( zSign, 0, 0, 0 );
5062     }
5063     if ( bExp == 0 ) {
5064         if ( ( bSig0 | bSig1 ) == 0 ) {
5065             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5066  invalid:
5067                 float_raise( float_flag_invalid );
5068                 z.low = float128_default_nan_low;
5069                 z.high = float128_default_nan_high;
5070                 return z;
5071             }
5072             float_raise( float_flag_divbyzero );
5073             return packFloat128( zSign, 0x7FFF, 0, 0 );
5074         }
5075         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5076     }
5077     if ( aExp == 0 ) {
5078         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5079         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5080     }
5081     zExp = aExp - bExp + 0x3FFD;
5082     shortShift128Left(
5083         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5084     shortShift128Left(
5085         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5086     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5087         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5088         ++zExp;
5089     }
5090     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5091     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5092     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5093     while ( (sbits64) rem0 < 0 ) {
5094         --zSig0;
5095         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5096     }
5097     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5098     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5099         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5100         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5101         while ( (sbits64) rem1 < 0 ) {
5102             --zSig1;
5103             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5104         }
5105         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5106     }
5107     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5108     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5109 
5110 }
5111 
5112 /*
5113 -------------------------------------------------------------------------------
5114 Returns the remainder of the quadruple-precision floating-point value `a'
5115 with respect to the corresponding value `b'.  The operation is performed
5116 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5117 -------------------------------------------------------------------------------
5118 */
5119 float128 float128_rem( float128 a, float128 b )
5120 {
5121     flag aSign, zSign;
5122     int32 aExp, bExp, expDiff;
5123     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5124     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5125     sbits64 sigMean0;
5126     float128 z;
5127 
5128     aSig1 = extractFloat128Frac1( a );
5129     aSig0 = extractFloat128Frac0( a );
5130     aExp = extractFloat128Exp( a );
5131     aSign = extractFloat128Sign( a );
5132     bSig1 = extractFloat128Frac1( b );
5133     bSig0 = extractFloat128Frac0( b );
5134     bExp = extractFloat128Exp( b );
5135     if ( aExp == 0x7FFF ) {
5136         if (    ( aSig0 | aSig1 )
5137              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5138             return propagateFloat128NaN( a, b );
5139         }
5140         goto invalid;
5141     }
5142     if ( bExp == 0x7FFF ) {
5143         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5144         return a;
5145     }
5146     if ( bExp == 0 ) {
5147         if ( ( bSig0 | bSig1 ) == 0 ) {
5148  invalid:
5149             float_raise( float_flag_invalid );
5150             z.low = float128_default_nan_low;
5151             z.high = float128_default_nan_high;
5152             return z;
5153         }
5154         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5155     }
5156     if ( aExp == 0 ) {
5157         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5158         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5159     }
5160     expDiff = aExp - bExp;
5161     if ( expDiff < -1 ) return a;
5162     shortShift128Left(
5163         aSig0 | LIT64( 0x0001000000000000 ),
5164         aSig1,
5165         15 - ( expDiff < 0 ),
5166         &aSig0,
5167         &aSig1
5168     );
5169     shortShift128Left(
5170         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5171     q = le128( bSig0, bSig1, aSig0, aSig1 );
5172     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5173     expDiff -= 64;
5174     while ( 0 < expDiff ) {
5175         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5176         q = ( 4 < q ) ? q - 4 : 0;
5177         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5178         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5179         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5180         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5181         expDiff -= 61;
5182     }
5183     if ( -64 < expDiff ) {
5184         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5185         q = ( 4 < q ) ? q - 4 : 0;
5186         q >>= - expDiff;
5187         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5188         expDiff += 52;
5189         if ( expDiff < 0 ) {
5190             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5191         }
5192         else {
5193             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5194         }
5195         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5196         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5197     }
5198     else {
5199         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5200         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5201     }
5202     do {
5203         alternateASig0 = aSig0;
5204         alternateASig1 = aSig1;
5205         ++q;
5206         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5207     } while ( 0 <= (sbits64) aSig0 );
5208     add128(
5209         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5210     if (    ( sigMean0 < 0 )
5211          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5212         aSig0 = alternateASig0;
5213         aSig1 = alternateASig1;
5214     }
5215     zSign = ( (sbits64) aSig0 < 0 );
5216     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5217     return
5218         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5219 
5220 }
5221 
5222 /*
5223 -------------------------------------------------------------------------------
5224 Returns the square root of the quadruple-precision floating-point value `a'.
5225 The operation is performed according to the IEC/IEEE Standard for Binary
5226 Floating-Point Arithmetic.
5227 -------------------------------------------------------------------------------
5228 */
5229 float128 float128_sqrt( float128 a )
5230 {
5231     flag aSign;
5232     int32 aExp, zExp;
5233     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5234     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5235     float128 z;
5236 
5237     aSig1 = extractFloat128Frac1( a );
5238     aSig0 = extractFloat128Frac0( a );
5239     aExp = extractFloat128Exp( a );
5240     aSign = extractFloat128Sign( a );
5241     if ( aExp == 0x7FFF ) {
5242         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5243         if ( ! aSign ) return a;
5244         goto invalid;
5245     }
5246     if ( aSign ) {
5247         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5248  invalid:
5249         float_raise( float_flag_invalid );
5250         z.low = float128_default_nan_low;
5251         z.high = float128_default_nan_high;
5252         return z;
5253     }
5254     if ( aExp == 0 ) {
5255         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5256         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5257     }
5258     zExp = ( (unsigned int)(aExp - 0x3FFF) >> 1) + 0x3FFE;
5259     aSig0 |= LIT64( 0x0001000000000000 );
5260     zSig0 = estimateSqrt32((int16)aExp, (bits32)(aSig0>>17));
5261     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5262     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5263     doubleZSig0 = zSig0<<1;
5264     mul64To128( zSig0, zSig0, &term0, &term1 );
5265     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5266     while ( (sbits64) rem0 < 0 ) {
5267         --zSig0;
5268         doubleZSig0 -= 2;
5269         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5270     }
5271     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5272     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5273         if ( zSig1 == 0 ) zSig1 = 1;
5274         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5275         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5276         mul64To128( zSig1, zSig1, &term2, &term3 );
5277         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5278         while ( (sbits64) rem1 < 0 ) {
5279             --zSig1;
5280             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5281             term3 |= 1;
5282             term2 |= doubleZSig0;
5283             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5284         }
5285         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5286     }
5287     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5288     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5289 
5290 }
5291 
5292 /*
5293 -------------------------------------------------------------------------------
5294 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5295 the corresponding value `b', and 0 otherwise.  The comparison is performed
5296 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5297 -------------------------------------------------------------------------------
5298 */
5299 flag float128_eq( float128 a, float128 b )
5300 {
5301 
5302     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5303               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5304          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5305               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5306        ) {
5307         if (    float128_is_signaling_nan( a )
5308              || float128_is_signaling_nan( b ) ) {
5309             float_raise( float_flag_invalid );
5310         }
5311         return 0;
5312     }
5313     return
5314            ( a.low == b.low )
5315         && (    ( a.high == b.high )
5316              || (    ( a.low == 0 )
5317                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5318            );
5319 
5320 }
5321 
5322 /*
5323 -------------------------------------------------------------------------------
5324 Returns 1 if the quadruple-precision floating-point value `a' is less than
5325 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5326 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5327 Arithmetic.
5328 -------------------------------------------------------------------------------
5329 */
5330 flag float128_le( float128 a, float128 b )
5331 {
5332     flag aSign, bSign;
5333 
5334     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5335               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5336          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5337               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5338        ) {
5339         float_raise( float_flag_invalid );
5340         return 0;
5341     }
5342     aSign = extractFloat128Sign( a );
5343     bSign = extractFloat128Sign( b );
5344     if ( aSign != bSign ) {
5345         return
5346                aSign
5347             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5348                  == 0 );
5349     }
5350     return
5351           aSign ? le128( b.high, b.low, a.high, a.low )
5352         : le128( a.high, a.low, b.high, b.low );
5353 
5354 }
5355 
5356 /*
5357 -------------------------------------------------------------------------------
5358 Returns 1 if the quadruple-precision floating-point value `a' is less than
5359 the corresponding value `b', and 0 otherwise.  The comparison is performed
5360 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361 -------------------------------------------------------------------------------
5362 */
5363 flag float128_lt( float128 a, float128 b )
5364 {
5365     flag aSign, bSign;
5366 
5367     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5368               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5369          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5370               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5371        ) {
5372         float_raise( float_flag_invalid );
5373         return 0;
5374     }
5375     aSign = extractFloat128Sign( a );
5376     bSign = extractFloat128Sign( b );
5377     if ( aSign != bSign ) {
5378         return
5379                aSign
5380             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5381                  != 0 );
5382     }
5383     return
5384           aSign ? lt128( b.high, b.low, a.high, a.low )
5385         : lt128( a.high, a.low, b.high, b.low );
5386 
5387 }
5388 
5389 /*
5390 -------------------------------------------------------------------------------
5391 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5392 the corresponding value `b', and 0 otherwise.  The invalid exception is
5393 raised if either operand is a NaN.  Otherwise, the comparison is performed
5394 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5395 -------------------------------------------------------------------------------
5396 */
5397 flag float128_eq_signaling( float128 a, float128 b )
5398 {
5399 
5400     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5401               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5402          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5403               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5404        ) {
5405         float_raise( float_flag_invalid );
5406         return 0;
5407     }
5408     return
5409            ( a.low == b.low )
5410         && (    ( a.high == b.high )
5411              || (    ( a.low == 0 )
5412                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5413            );
5414 
5415 }
5416 
5417 /*
5418 -------------------------------------------------------------------------------
5419 Returns 1 if the quadruple-precision floating-point value `a' is less than
5420 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5421 cause an exception.  Otherwise, the comparison is performed according to the
5422 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5423 -------------------------------------------------------------------------------
5424 */
5425 flag float128_le_quiet( float128 a, float128 b )
5426 {
5427     flag aSign, bSign;
5428 
5429     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5430               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5431          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5432               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5433        ) {
5434         if (    float128_is_signaling_nan( a )
5435              || float128_is_signaling_nan( b ) ) {
5436             float_raise( float_flag_invalid );
5437         }
5438         return 0;
5439     }
5440     aSign = extractFloat128Sign( a );
5441     bSign = extractFloat128Sign( b );
5442     if ( aSign != bSign ) {
5443         return
5444                aSign
5445             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5446                  == 0 );
5447     }
5448     return
5449           aSign ? le128( b.high, b.low, a.high, a.low )
5450         : le128( a.high, a.low, b.high, b.low );
5451 
5452 }
5453 
5454 /*
5455 -------------------------------------------------------------------------------
5456 Returns 1 if the quadruple-precision floating-point value `a' is less than
5457 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5458 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5459 Standard for Binary Floating-Point Arithmetic.
5460 -------------------------------------------------------------------------------
5461 */
5462 flag float128_lt_quiet( float128 a, float128 b )
5463 {
5464     flag aSign, bSign;
5465 
5466     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5467               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5468          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5469               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5470        ) {
5471         if (    float128_is_signaling_nan( a )
5472              || float128_is_signaling_nan( b ) ) {
5473             float_raise( float_flag_invalid );
5474         }
5475         return 0;
5476     }
5477     aSign = extractFloat128Sign( a );
5478     bSign = extractFloat128Sign( b );
5479     if ( aSign != bSign ) {
5480         return
5481                aSign
5482             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5483                  != 0 );
5484     }
5485     return
5486           aSign ? lt128( b.high, b.low, a.high, a.low )
5487         : lt128( a.high, a.low, b.high, b.low );
5488 
5489 }
5490 
5491 #endif
5492 
5493 
5494 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5495 
5496 /*
5497  * These two routines are not part of the original softfloat distribution.
5498  *
5499  * They are based on the corresponding conversions to integer but return
5500  * unsigned numbers instead since these functions are required by GCC.
5501  *
5502  * Added by Mark Brinicombe <mark@NetBSD.org>	27/09/97
5503  *
5504  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5505  */
5506 
5507 /*
5508 -------------------------------------------------------------------------------
5509 Returns the result of converting the double-precision floating-point value
5510 `a' to the 32-bit unsigned integer format.  The conversion is
5511 performed according to the IEC/IEEE Standard for Binary Floating-point
5512 Arithmetic, except that the conversion is always rounded toward zero.  If
5513 `a' is a NaN, the largest positive integer is returned.  If the conversion
5514 overflows, the largest integer positive is returned.
5515 -------------------------------------------------------------------------------
5516 */
5517 uint32 float64_to_uint32_round_to_zero( float64 a )
5518 {
5519     flag aSign;
5520     int16 aExp, shiftCount;
5521     bits64 aSig, savedASig;
5522     uint32 z;
5523 
5524     aSig = extractFloat64Frac( a );
5525     aExp = extractFloat64Exp( a );
5526     aSign = extractFloat64Sign( a );
5527 
5528     if (aSign) {
5529         float_raise( float_flag_invalid );
5530     	return(0);
5531     }
5532 
5533     if ( 0x41E < aExp ) {
5534         float_raise( float_flag_invalid );
5535         return 0xffffffff;
5536     }
5537     else if ( aExp < 0x3FF ) {
5538         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5539         return 0;
5540     }
5541     aSig |= LIT64( 0x0010000000000000 );
5542     shiftCount = 0x433 - aExp;
5543     savedASig = aSig;
5544     aSig >>= shiftCount;
5545     z = (uint32)aSig;
5546     if ( ( aSig<<shiftCount ) != savedASig ) {
5547         float_exception_flags |= float_flag_inexact;
5548     }
5549     return z;
5550 
5551 }
5552 
5553 /*
5554 -------------------------------------------------------------------------------
5555 Returns the result of converting the single-precision floating-point value
5556 `a' to the 32-bit unsigned integer format.  The conversion is
5557 performed according to the IEC/IEEE Standard for Binary Floating-point
5558 Arithmetic, except that the conversion is always rounded toward zero.  If
5559 `a' is a NaN, the largest positive integer is returned.  If the conversion
5560 overflows, the largest positive integer is returned.
5561 -------------------------------------------------------------------------------
5562 */
5563 uint32 float32_to_uint32_round_to_zero( float32 a )
5564 {
5565     flag aSign;
5566     int16 aExp, shiftCount;
5567     bits32 aSig;
5568     uint32 z;
5569 
5570     aSig = extractFloat32Frac( a );
5571     aExp = extractFloat32Exp( a );
5572     aSign = extractFloat32Sign( a );
5573     shiftCount = aExp - 0x9E;
5574 
5575     if (aSign) {
5576         float_raise( float_flag_invalid );
5577     	return(0);
5578     }
5579     if ( 0 < shiftCount ) {
5580         float_raise( float_flag_invalid );
5581         return 0xFFFFFFFF;
5582     }
5583     else if ( aExp <= 0x7E ) {
5584         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5585         return 0;
5586     }
5587     aSig = ( aSig | 0x800000 )<<8;
5588     z = aSig>>( - shiftCount );
5589     if ( aSig<<( shiftCount & 31 ) ) {
5590         float_exception_flags |= float_flag_inexact;
5591     }
5592     return z;
5593 
5594 }
5595 
5596 #endif
5597