xref: /openbsd-src/sys/lib/libkern/softfloat.c (revision 48950c12d106c85f315112191a0228d7b83b9510)
1 /*	$OpenBSD: softfloat.c,v 1.5 2012/12/05 23:20:22 deraadt Exp $	*/
2 /*	$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $	*/
3 
4 /*
5  * This version hacked for use with gcc -msoft-float by bjh21.
6  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
7  *  itself).
8  */
9 
10 /*
11  * Things you may want to define:
12  *
13  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
14  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
15  *   properly renamed.
16  */
17 
18 /*
19 ===============================================================================
20 
21 This C source file is part of the SoftFloat IEC/IEEE Floating-point
22 Arithmetic Package, Release 2a.
23 
24 Written by John R. Hauser.  This work was made possible in part by the
25 International Computer Science Institute, located at Suite 600, 1947 Center
26 Street, Berkeley, California 94704.  Funding was partially provided by the
27 National Science Foundation under grant MIP-9311980.  The original version
28 of this code was written as part of a project to build a fixed-point vector
29 processor in collaboration with the University of California at Berkeley,
30 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
31 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
32 arithmetic/SoftFloat.html'.
33 
34 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable
35 effort has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT
36 WILL AT TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS
37 RESTRICTED TO PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL
38 RESPONSIBILITY FOR ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM
39 THEIR OWN USE OF THE SOFTWARE, AND WHO ALSO EFFECTIVELY INDEMNIFY
40 (possibly via similar legal warning) JOHN HAUSER AND THE INTERNATIONAL
41 COMPUTER SCIENCE INSTITUTE AGAINST ALL LOSSES, COSTS, OR OTHER PROBLEMS
42 ARISING FROM THE USE OF THE SOFTWARE BY THEIR CUSTOMERS AND CLIENTS.
43 
44 Derivative works are acceptable, even for commercial purposes, so long as
45 (1) they include prominent notice that the work is derivative, and (2) they
46 include prominent notice akin to these four paragraphs for those parts of
47 this code that are retained.
48 
49 ===============================================================================
50 */
51 
52 #ifdef SOFTFLOAT_FOR_GCC
53 #include "softfloat-for-gcc.h"
54 #endif
55 
56 #include "milieu.h"
57 #include "softfloat.h"
58 
59 /*
60  * Conversions between floats as stored in memory and floats as
61  * SoftFloat uses them
62  */
63 #ifndef FLOAT64_DEMANGLE
64 #define FLOAT64_DEMANGLE(a)	(a)
65 #endif
66 #ifndef FLOAT64_MANGLE
67 #define FLOAT64_MANGLE(a)	(a)
68 #endif
69 
70 /*
71 -------------------------------------------------------------------------------
72 Floating-point rounding mode, extended double-precision rounding precision,
73 and exception flags.
74 -------------------------------------------------------------------------------
75 */
76 
77 /*
78  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
79  * 	Right now, it does not.  I've removed all other dynamic global
80  * 	variables. [ross]
81  */
82 #ifdef FLOATX80
83 int8 floatx80_rounding_precision = 80;
84 #endif
85 
86 /*
87 -------------------------------------------------------------------------------
88 Primitive arithmetic functions, including multi-word arithmetic, and
89 division and square root approximations.  (Can be specialized to target if
90 desired.)
91 -------------------------------------------------------------------------------
92 */
93 #include "softfloat-macros.h"
94 
95 /*
96 -------------------------------------------------------------------------------
97 Functions and definitions to determine:  (1) whether tininess for underflow
98 is detected before or after rounding by default, (2) what (if anything)
99 happens when exceptions are raised, (3) how signaling NaNs are distinguished
100 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
101 are propagated from function inputs to output.  These details are target-
102 specific.
103 -------------------------------------------------------------------------------
104 */
105 #include "softfloat-specialize.h"
106 
107 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
108 /*
109 -------------------------------------------------------------------------------
110 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
111 and 7, and returns the properly rounded 32-bit integer corresponding to the
112 input.  If `zSign' is 1, the input is negated before being converted to an
113 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
114 is simply rounded to an integer, with the inexact exception raised if the
115 input cannot be represented exactly as an integer.  However, if the fixed-
116 point input is too large, the invalid exception is raised and the largest
117 positive or negative integer is returned.
118 -------------------------------------------------------------------------------
119 */
120 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
121 {
122     int8 roundingMode;
123     flag roundNearestEven;
124     int8 roundIncrement, roundBits;
125     int32 z;
126 
127     roundingMode = float_rounding_mode();
128     roundNearestEven = ( roundingMode == float_round_nearest_even );
129     roundIncrement = 0x40;
130     if ( ! roundNearestEven ) {
131         if ( roundingMode == float_round_to_zero ) {
132             roundIncrement = 0;
133         }
134         else {
135             roundIncrement = 0x7F;
136             if ( zSign ) {
137                 if ( roundingMode == float_round_up ) roundIncrement = 0;
138             }
139             else {
140                 if ( roundingMode == float_round_down ) roundIncrement = 0;
141             }
142         }
143     }
144     roundBits = absZ & 0x7F;
145     absZ = ( absZ + roundIncrement )>>7;
146     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
147     z = absZ;
148     if ( zSign ) z = - z;
149     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
150         float_raise( float_flag_invalid );
151         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
152     }
153     if ( roundBits ) float_set_inexact();
154     return z;
155 
156 }
157 
158 /*
159 -------------------------------------------------------------------------------
160 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
161 `absZ1', with binary point between bits 63 and 64 (between the input words),
162 and returns the properly rounded 64-bit integer corresponding to the input.
163 If `zSign' is 1, the input is negated before being converted to an integer.
164 Ordinarily, the fixed-point input is simply rounded to an integer, with
165 the inexact exception raised if the input cannot be represented exactly as
166 an integer.  However, if the fixed-point input is too large, the invalid
167 exception is raised and the largest positive or negative integer is
168 returned.
169 -------------------------------------------------------------------------------
170 */
171 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
172 {
173     int8 roundingMode;
174     flag roundNearestEven, increment;
175     int64 z;
176 
177     roundingMode = float_rounding_mode();
178     roundNearestEven = ( roundingMode == float_round_nearest_even );
179     increment = ( (sbits64) absZ1 < 0 );
180     if ( ! roundNearestEven ) {
181         if ( roundingMode == float_round_to_zero ) {
182             increment = 0;
183         }
184         else {
185             if ( zSign ) {
186                 increment = ( roundingMode == float_round_down ) && absZ1;
187             }
188             else {
189                 increment = ( roundingMode == float_round_up ) && absZ1;
190             }
191         }
192     }
193     if ( increment ) {
194         ++absZ0;
195         if ( absZ0 == 0 ) goto overflow;
196         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
197     }
198     z = absZ0;
199     if ( zSign ) z = - z;
200     if ( z && ( ( z < 0 ) ^ zSign ) ) {
201  overflow:
202         float_raise( float_flag_invalid );
203         return
204               zSign ? (sbits64) LIT64( 0x8000000000000000 )
205             : LIT64( 0x7FFFFFFFFFFFFFFF );
206     }
207     if ( absZ1 ) float_set_inexact();
208     return z;
209 
210 }
211 
212 #ifdef __alpha__
213 /*
214 -------------------------------------------------------------------------------
215 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
216 `absZ1', with binary point between bits 63 and 64 (between the input words),
217 and returns the properly rounded 64-bit integer corresponding to the input.
218 If `zSign' is 1, the input is negated before being converted to an integer.
219 Ordinarily, the fixed-point input is simply rounded to an integer, with
220 the inexact exception raised if the input cannot be represented exactly as
221 an integer.
222 -------------------------------------------------------------------------------
223 */
224 static int64 roundAndPackInt64NoOverflow( flag zSign, bits64 absZ0,
225     bits64 absZ1 )
226 {
227     int8 roundingMode;
228     flag roundNearestEven, increment;
229     int64 z;
230 
231     roundingMode = float_rounding_mode();
232     roundNearestEven = ( roundingMode == float_round_nearest_even );
233     increment = ( (sbits64) absZ1 < 0 );
234     if ( ! roundNearestEven ) {
235         if ( roundingMode == float_round_to_zero ) {
236             increment = 0;
237         }
238         else {
239             if ( zSign ) {
240                 increment = ( roundingMode == float_round_down ) && absZ1;
241             }
242             else {
243                 increment = ( roundingMode == float_round_up ) && absZ1;
244             }
245         }
246     }
247     if ( increment ) {
248         ++absZ0;
249         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
250     }
251     z = absZ0;
252     if ( zSign ) z = - z;
253     if ( absZ1 ) float_set_inexact();
254     return z;
255 
256 }
257 #endif /* __alpha__ */
258 #endif
259 
260 /*
261 -------------------------------------------------------------------------------
262 Returns the fraction bits of the single-precision floating-point value `a'.
263 -------------------------------------------------------------------------------
264 */
265 INLINE bits32 extractFloat32Frac( float32 a )
266 {
267 
268     return a & 0x007FFFFF;
269 
270 }
271 
272 /*
273 -------------------------------------------------------------------------------
274 Returns the exponent bits of the single-precision floating-point value `a'.
275 -------------------------------------------------------------------------------
276 */
277 INLINE int16 extractFloat32Exp( float32 a )
278 {
279 
280     return ( a>>23 ) & 0xFF;
281 
282 }
283 
284 /*
285 -------------------------------------------------------------------------------
286 Returns the sign bit of the single-precision floating-point value `a'.
287 -------------------------------------------------------------------------------
288 */
289 INLINE flag extractFloat32Sign( float32 a )
290 {
291 
292     return a>>31;
293 
294 }
295 
296 /*
297 -------------------------------------------------------------------------------
298 Normalizes the subnormal single-precision floating-point value represented
299 by the denormalized significand `aSig'.  The normalized exponent and
300 significand are stored at the locations pointed to by `zExpPtr' and
301 `zSigPtr', respectively.
302 -------------------------------------------------------------------------------
303 */
304 static void
305  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
306 {
307     int8 shiftCount;
308 
309     shiftCount = countLeadingZeros32( aSig ) - 8;
310     *zSigPtr = aSig<<shiftCount;
311     *zExpPtr = 1 - shiftCount;
312 
313 }
314 
315 /*
316 -------------------------------------------------------------------------------
317 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
318 single-precision floating-point value, returning the result.  After being
319 shifted into the proper positions, the three fields are simply added
320 together to form the result.  This means that any integer portion of `zSig'
321 will be added into the exponent.  Since a properly normalized significand
322 will have an integer portion equal to 1, the `zExp' input should be 1 less
323 than the desired result exponent whenever `zSig' is a complete, normalized
324 significand.
325 -------------------------------------------------------------------------------
326 */
327 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
328 {
329 
330     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
331 
332 }
333 
334 /*
335 -------------------------------------------------------------------------------
336 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
337 and significand `zSig', and returns the proper single-precision floating-
338 point value corresponding to the abstract input.  Ordinarily, the abstract
339 value is simply rounded and packed into the single-precision format, with
340 the inexact exception raised if the abstract input cannot be represented
341 exactly.  However, if the abstract value is too large, the overflow and
342 inexact exceptions are raised and an infinity or maximal finite value is
343 returned.  If the abstract value is too small, the input value is rounded to
344 a subnormal number, and the underflow and inexact exceptions are raised if
345 the abstract input cannot be represented exactly as a subnormal single-
346 precision floating-point number.
347     The input significand `zSig' has its binary point between bits 30
348 and 29, which is 7 bits to the left of the usual location.  This shifted
349 significand must be normalized or smaller.  If `zSig' is not normalized,
350 `zExp' must be 0; in that case, the result returned is a subnormal number,
351 and it must not require rounding.  In the usual case that `zSig' is
352 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
353 The handling of underflow and overflow follows the IEC/IEEE Standard for
354 Binary Floating-Point Arithmetic.
355 -------------------------------------------------------------------------------
356 */
357 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
358 {
359     int8 roundingMode;
360     flag roundNearestEven;
361     int8 roundIncrement, roundBits;
362     flag isTiny;
363 
364     roundingMode = float_rounding_mode();
365     roundNearestEven = ( roundingMode == float_round_nearest_even );
366     roundIncrement = 0x40;
367     if ( ! roundNearestEven ) {
368         if ( roundingMode == float_round_to_zero ) {
369             roundIncrement = 0;
370         }
371         else {
372             roundIncrement = 0x7F;
373             if ( zSign ) {
374                 if ( roundingMode == float_round_up ) roundIncrement = 0;
375             }
376             else {
377                 if ( roundingMode == float_round_down ) roundIncrement = 0;
378             }
379         }
380     }
381     roundBits = zSig & 0x7F;
382     if ( 0xFD <= (bits16) zExp ) {
383         if (    ( 0xFD < zExp )
384              || (    ( zExp == 0xFD )
385                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
386            ) {
387             float_raise( float_flag_overflow | float_flag_inexact );
388             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
389         }
390         if ( zExp < 0 ) {
391             isTiny =
392                    ( float_detect_tininess == float_tininess_before_rounding )
393                 || ( zExp < -1 )
394                 || ( zSig + roundIncrement < 0x80000000 );
395             shift32RightJamming( zSig, - zExp, &zSig );
396             zExp = 0;
397             roundBits = zSig & 0x7F;
398             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
399         }
400     }
401     if ( roundBits ) float_set_inexact();
402     zSig = ( zSig + roundIncrement )>>7;
403     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
404     if ( zSig == 0 ) zExp = 0;
405     return packFloat32( zSign, zExp, zSig );
406 
407 }
408 
409 /*
410 -------------------------------------------------------------------------------
411 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
412 and significand `zSig', and returns the proper single-precision floating-
413 point value corresponding to the abstract input.  This routine is just like
414 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
415 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
416 floating-point exponent.
417 -------------------------------------------------------------------------------
418 */
419 static float32
420  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
421 {
422     int8 shiftCount;
423 
424     shiftCount = countLeadingZeros32( zSig ) - 1;
425     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
426 
427 }
428 
429 /*
430 -------------------------------------------------------------------------------
431 Returns the fraction bits of the double-precision floating-point value `a'.
432 -------------------------------------------------------------------------------
433 */
434 INLINE bits64 extractFloat64Frac( float64 a )
435 {
436 
437     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
438 
439 }
440 
441 /*
442 -------------------------------------------------------------------------------
443 Returns the exponent bits of the double-precision floating-point value `a'.
444 -------------------------------------------------------------------------------
445 */
446 INLINE int16 extractFloat64Exp( float64 a )
447 {
448 
449     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
450 
451 }
452 
453 /*
454 -------------------------------------------------------------------------------
455 Returns the sign bit of the double-precision floating-point value `a'.
456 -------------------------------------------------------------------------------
457 */
458 INLINE flag extractFloat64Sign( float64 a )
459 {
460 
461     return FLOAT64_DEMANGLE(a)>>63;
462 
463 }
464 
465 /*
466 -------------------------------------------------------------------------------
467 Normalizes the subnormal double-precision floating-point value represented
468 by the denormalized significand `aSig'.  The normalized exponent and
469 significand are stored at the locations pointed to by `zExpPtr' and
470 `zSigPtr', respectively.
471 -------------------------------------------------------------------------------
472 */
473 static void
474  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
475 {
476     int8 shiftCount;
477 
478     shiftCount = countLeadingZeros64( aSig ) - 11;
479     *zSigPtr = aSig<<shiftCount;
480     *zExpPtr = 1 - shiftCount;
481 
482 }
483 
484 /*
485 -------------------------------------------------------------------------------
486 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
487 double-precision floating-point value, returning the result.  After being
488 shifted into the proper positions, the three fields are simply added
489 together to form the result.  This means that any integer portion of `zSig'
490 will be added into the exponent.  Since a properly normalized significand
491 will have an integer portion equal to 1, the `zExp' input should be 1 less
492 than the desired result exponent whenever `zSig' is a complete, normalized
493 significand.
494 -------------------------------------------------------------------------------
495 */
496 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
497 {
498 
499     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
500 			   ( ( (bits64) zExp )<<52 ) + zSig );
501 
502 }
503 
504 /*
505 -------------------------------------------------------------------------------
506 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
507 and significand `zSig', and returns the proper double-precision floating-
508 point value corresponding to the abstract input.  Ordinarily, the abstract
509 value is simply rounded and packed into the double-precision format, with
510 the inexact exception raised if the abstract input cannot be represented
511 exactly.  However, if the abstract value is too large, the overflow and
512 inexact exceptions are raised and an infinity or maximal finite value is
513 returned.  If the abstract value is too small, the input value is rounded to
514 a subnormal number, and the underflow and inexact exceptions are raised if
515 the abstract input cannot be represented exactly as a subnormal double-
516 precision floating-point number.
517     The input significand `zSig' has its binary point between bits 62
518 and 61, which is 10 bits to the left of the usual location.  This shifted
519 significand must be normalized or smaller.  If `zSig' is not normalized,
520 `zExp' must be 0; in that case, the result returned is a subnormal number,
521 and it must not require rounding.  In the usual case that `zSig' is
522 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
523 The handling of underflow and overflow follows the IEC/IEEE Standard for
524 Binary Floating-Point Arithmetic.
525 -------------------------------------------------------------------------------
526 */
527 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
528 {
529     int8 roundingMode;
530     flag roundNearestEven;
531     int16 roundIncrement, roundBits;
532     flag isTiny;
533 
534     roundingMode = float_rounding_mode();
535     roundNearestEven = ( roundingMode == float_round_nearest_even );
536     roundIncrement = 0x200;
537     if ( ! roundNearestEven ) {
538         if ( roundingMode == float_round_to_zero ) {
539             roundIncrement = 0;
540         }
541         else {
542             roundIncrement = 0x3FF;
543             if ( zSign ) {
544                 if ( roundingMode == float_round_up ) roundIncrement = 0;
545             }
546             else {
547                 if ( roundingMode == float_round_down ) roundIncrement = 0;
548             }
549         }
550     }
551     roundBits = zSig & 0x3FF;
552     if ( 0x7FD <= (bits16) zExp ) {
553         if (    ( 0x7FD < zExp )
554              || (    ( zExp == 0x7FD )
555                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
556            ) {
557             float_raise( float_flag_overflow | float_flag_inexact );
558             return FLOAT64_MANGLE(
559 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
560 		( roundIncrement == 0 ));
561         }
562         if ( zExp < 0 ) {
563             isTiny =
564                    ( float_detect_tininess == float_tininess_before_rounding )
565                 || ( zExp < -1 )
566                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
567             shift64RightJamming( zSig, - zExp, &zSig );
568             zExp = 0;
569             roundBits = zSig & 0x3FF;
570             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
571         }
572     }
573     if ( roundBits ) float_set_inexact();
574     zSig = ( zSig + roundIncrement )>>10;
575     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
576     if ( zSig == 0 ) zExp = 0;
577     return packFloat64( zSign, zExp, zSig );
578 
579 }
580 
581 /*
582 -------------------------------------------------------------------------------
583 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
584 and significand `zSig', and returns the proper double-precision floating-
585 point value corresponding to the abstract input.  This routine is just like
586 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
587 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
588 floating-point exponent.
589 -------------------------------------------------------------------------------
590 */
591 static float64
592  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
593 {
594     int8 shiftCount;
595 
596     shiftCount = countLeadingZeros64( zSig ) - 1;
597     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
598 
599 }
600 
601 #ifdef FLOATX80
602 
603 /*
604 -------------------------------------------------------------------------------
605 Returns the fraction bits of the extended double-precision floating-point
606 value `a'.
607 -------------------------------------------------------------------------------
608 */
609 INLINE bits64 extractFloatx80Frac( floatx80 a )
610 {
611 
612     return a.low;
613 
614 }
615 
616 /*
617 -------------------------------------------------------------------------------
618 Returns the exponent bits of the extended double-precision floating-point
619 value `a'.
620 -------------------------------------------------------------------------------
621 */
622 INLINE int32 extractFloatx80Exp( floatx80 a )
623 {
624 
625     return a.high & 0x7FFF;
626 
627 }
628 
629 /*
630 -------------------------------------------------------------------------------
631 Returns the sign bit of the extended double-precision floating-point value
632 `a'.
633 -------------------------------------------------------------------------------
634 */
635 INLINE flag extractFloatx80Sign( floatx80 a )
636 {
637 
638     return a.high>>15;
639 
640 }
641 
642 /*
643 -------------------------------------------------------------------------------
644 Normalizes the subnormal extended double-precision floating-point value
645 represented by the denormalized significand `aSig'.  The normalized exponent
646 and significand are stored at the locations pointed to by `zExpPtr' and
647 `zSigPtr', respectively.
648 -------------------------------------------------------------------------------
649 */
650 static void
651  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
652 {
653     int8 shiftCount;
654 
655     shiftCount = countLeadingZeros64( aSig );
656     *zSigPtr = aSig<<shiftCount;
657     *zExpPtr = 1 - shiftCount;
658 
659 }
660 
661 /*
662 -------------------------------------------------------------------------------
663 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
664 extended double-precision floating-point value, returning the result.
665 -------------------------------------------------------------------------------
666 */
667 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
668 {
669     floatx80 z;
670 
671     z.low = zSig;
672     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
673     return z;
674 
675 }
676 
677 /*
678 -------------------------------------------------------------------------------
679 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
680 and extended significand formed by the concatenation of `zSig0' and `zSig1',
681 and returns the proper extended double-precision floating-point value
682 corresponding to the abstract input.  Ordinarily, the abstract value is
683 rounded and packed into the extended double-precision format, with the
684 inexact exception raised if the abstract input cannot be represented
685 exactly.  However, if the abstract value is too large, the overflow and
686 inexact exceptions are raised and an infinity or maximal finite value is
687 returned.  If the abstract value is too small, the input value is rounded to
688 a subnormal number, and the underflow and inexact exceptions are raised if
689 the abstract input cannot be represented exactly as a subnormal extended
690 double-precision floating-point number.
691     If `roundingPrecision' is 32 or 64, the result is rounded to the same
692 number of bits as single or double precision, respectively.  Otherwise, the
693 result is rounded to the full precision of the extended double-precision
694 format.
695     The input significand must be normalized or smaller.  If the input
696 significand is not normalized, `zExp' must be 0; in that case, the result
697 returned is a subnormal number, and it must not require rounding.  The
698 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
699 Floating-Point Arithmetic.
700 -------------------------------------------------------------------------------
701 */
702 static floatx80
703  roundAndPackFloatx80(
704      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
705  )
706 {
707     int8 roundingMode;
708     flag roundNearestEven, increment, isTiny;
709     int64 roundIncrement, roundMask, roundBits;
710 
711     roundingMode = float_rounding_mode();
712     roundNearestEven = ( roundingMode == float_round_nearest_even );
713     if ( roundingPrecision == 80 ) goto precision80;
714     if ( roundingPrecision == 64 ) {
715         roundIncrement = LIT64( 0x0000000000000400 );
716         roundMask = LIT64( 0x00000000000007FF );
717     }
718     else if ( roundingPrecision == 32 ) {
719         roundIncrement = LIT64( 0x0000008000000000 );
720         roundMask = LIT64( 0x000000FFFFFFFFFF );
721     }
722     else {
723         goto precision80;
724     }
725     zSig0 |= ( zSig1 != 0 );
726     if ( ! roundNearestEven ) {
727         if ( roundingMode == float_round_to_zero ) {
728             roundIncrement = 0;
729         }
730         else {
731             roundIncrement = roundMask;
732             if ( zSign ) {
733                 if ( roundingMode == float_round_up ) roundIncrement = 0;
734             }
735             else {
736                 if ( roundingMode == float_round_down ) roundIncrement = 0;
737             }
738         }
739     }
740     roundBits = zSig0 & roundMask;
741     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
742         if (    ( 0x7FFE < zExp )
743              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
744            ) {
745             goto overflow;
746         }
747         if ( zExp <= 0 ) {
748             isTiny =
749                    ( float_detect_tininess == float_tininess_before_rounding )
750                 || ( zExp < 0 )
751                 || ( zSig0 <= zSig0 + roundIncrement );
752             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
753             zExp = 0;
754             roundBits = zSig0 & roundMask;
755             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
756             if ( roundBits ) float_set_inexact();
757             zSig0 += roundIncrement;
758             if ( (sbits64) zSig0 < 0 ) zExp = 1;
759             roundIncrement = roundMask + 1;
760             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
761                 roundMask |= roundIncrement;
762             }
763             zSig0 &= ~ roundMask;
764             return packFloatx80( zSign, zExp, zSig0 );
765         }
766     }
767     if ( roundBits ) float_set_inexact();
768     zSig0 += roundIncrement;
769     if ( zSig0 < roundIncrement ) {
770         ++zExp;
771         zSig0 = LIT64( 0x8000000000000000 );
772     }
773     roundIncrement = roundMask + 1;
774     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
775         roundMask |= roundIncrement;
776     }
777     zSig0 &= ~ roundMask;
778     if ( zSig0 == 0 ) zExp = 0;
779     return packFloatx80( zSign, zExp, zSig0 );
780  precision80:
781     increment = ( (sbits64) zSig1 < 0 );
782     if ( ! roundNearestEven ) {
783         if ( roundingMode == float_round_to_zero ) {
784             increment = 0;
785         }
786         else {
787             if ( zSign ) {
788                 increment = ( roundingMode == float_round_down ) && zSig1;
789             }
790             else {
791                 increment = ( roundingMode == float_round_up ) && zSig1;
792             }
793         }
794     }
795     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
796         if (    ( 0x7FFE < zExp )
797              || (    ( zExp == 0x7FFE )
798                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
799                   && increment
800                 )
801            ) {
802             roundMask = 0;
803  overflow:
804             float_raise( float_flag_overflow | float_flag_inexact );
805             if (    ( roundingMode == float_round_to_zero )
806                  || ( zSign && ( roundingMode == float_round_up ) )
807                  || ( ! zSign && ( roundingMode == float_round_down ) )
808                ) {
809                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
810             }
811             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
812         }
813         if ( zExp <= 0 ) {
814             isTiny =
815                    ( float_detect_tininess == float_tininess_before_rounding )
816                 || ( zExp < 0 )
817                 || ! increment
818                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
819             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
820             zExp = 0;
821             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
822             if ( zSig1 ) float_set_inexact();
823             if ( roundNearestEven ) {
824                 increment = ( (sbits64) zSig1 < 0 );
825             }
826             else {
827                 if ( zSign ) {
828                     increment = ( roundingMode == float_round_down ) && zSig1;
829                 }
830                 else {
831                     increment = ( roundingMode == float_round_up ) && zSig1;
832                 }
833             }
834             if ( increment ) {
835                 ++zSig0;
836                 zSig0 &=
837                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
838                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
839             }
840             return packFloatx80( zSign, zExp, zSig0 );
841         }
842     }
843     if ( zSig1 ) float_set_inexact();
844     if ( increment ) {
845         ++zSig0;
846         if ( zSig0 == 0 ) {
847             ++zExp;
848             zSig0 = LIT64( 0x8000000000000000 );
849         }
850         else {
851             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
852         }
853     }
854     else {
855         if ( zSig0 == 0 ) zExp = 0;
856     }
857     return packFloatx80( zSign, zExp, zSig0 );
858 
859 }
860 
861 /*
862 -------------------------------------------------------------------------------
863 Takes an abstract floating-point value having sign `zSign', exponent
864 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
865 and returns the proper extended double-precision floating-point value
866 corresponding to the abstract input.  This routine is just like
867 `roundAndPackFloatx80' except that the input significand does not have to be
868 normalized.
869 -------------------------------------------------------------------------------
870 */
871 static floatx80
872  normalizeRoundAndPackFloatx80(
873      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
874  )
875 {
876     int8 shiftCount;
877 
878     if ( zSig0 == 0 ) {
879         zSig0 = zSig1;
880         zSig1 = 0;
881         zExp -= 64;
882     }
883     shiftCount = countLeadingZeros64( zSig0 );
884     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
885     zExp -= shiftCount;
886     return
887         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
888 
889 }
890 
891 #endif
892 
893 #ifdef FLOAT128
894 
895 /*
896 -------------------------------------------------------------------------------
897 Returns the least-significant 64 fraction bits of the quadruple-precision
898 floating-point value `a'.
899 -------------------------------------------------------------------------------
900 */
901 INLINE bits64 extractFloat128Frac1( float128 a )
902 {
903 
904     return a.low;
905 
906 }
907 
908 /*
909 -------------------------------------------------------------------------------
910 Returns the most-significant 48 fraction bits of the quadruple-precision
911 floating-point value `a'.
912 -------------------------------------------------------------------------------
913 */
914 INLINE bits64 extractFloat128Frac0( float128 a )
915 {
916 
917     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
918 
919 }
920 
921 /*
922 -------------------------------------------------------------------------------
923 Returns the exponent bits of the quadruple-precision floating-point value
924 `a'.
925 -------------------------------------------------------------------------------
926 */
927 INLINE int32 extractFloat128Exp( float128 a )
928 {
929 
930     return ( a.high>>48 ) & 0x7FFF;
931 
932 }
933 
934 /*
935 -------------------------------------------------------------------------------
936 Returns the sign bit of the quadruple-precision floating-point value `a'.
937 -------------------------------------------------------------------------------
938 */
939 INLINE flag extractFloat128Sign( float128 a )
940 {
941 
942     return a.high>>63;
943 
944 }
945 
946 /*
947 -------------------------------------------------------------------------------
948 Normalizes the subnormal quadruple-precision floating-point value
949 represented by the denormalized significand formed by the concatenation of
950 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
951 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
952 significand are stored at the location pointed to by `zSig0Ptr', and the
953 least significant 64 bits of the normalized significand are stored at the
954 location pointed to by `zSig1Ptr'.
955 -------------------------------------------------------------------------------
956 */
957 static void
958  normalizeFloat128Subnormal(
959      bits64 aSig0,
960      bits64 aSig1,
961      int32 *zExpPtr,
962      bits64 *zSig0Ptr,
963      bits64 *zSig1Ptr
964  )
965 {
966     int8 shiftCount;
967 
968     if ( aSig0 == 0 ) {
969         shiftCount = countLeadingZeros64( aSig1 ) - 15;
970         if ( shiftCount < 0 ) {
971             *zSig0Ptr = aSig1>>( - shiftCount );
972             *zSig1Ptr = aSig1<<( shiftCount & 63 );
973         }
974         else {
975             *zSig0Ptr = aSig1<<shiftCount;
976             *zSig1Ptr = 0;
977         }
978         *zExpPtr = - shiftCount - 63;
979     }
980     else {
981         shiftCount = countLeadingZeros64( aSig0 ) - 15;
982         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
983         *zExpPtr = 1 - shiftCount;
984     }
985 
986 }
987 
988 /*
989 -------------------------------------------------------------------------------
990 Packs the sign `zSign', the exponent `zExp', and the significand formed
991 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
992 floating-point value, returning the result.  After being shifted into the
993 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
994 added together to form the most significant 32 bits of the result.  This
995 means that any integer portion of `zSig0' will be added into the exponent.
996 Since a properly normalized significand will have an integer portion equal
997 to 1, the `zExp' input should be 1 less than the desired result exponent
998 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
999 significand.
1000 -------------------------------------------------------------------------------
1001 */
1002 INLINE float128
1003  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1004 {
1005     float128 z;
1006 
1007     z.low = zSig1;
1008     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
1009     return z;
1010 
1011 }
1012 
1013 /*
1014 -------------------------------------------------------------------------------
1015 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1016 and extended significand formed by the concatenation of `zSig0', `zSig1',
1017 and `zSig2', and returns the proper quadruple-precision floating-point value
1018 corresponding to the abstract input.  Ordinarily, the abstract value is
1019 simply rounded and packed into the quadruple-precision format, with the
1020 inexact exception raised if the abstract input cannot be represented
1021 exactly.  However, if the abstract value is too large, the overflow and
1022 inexact exceptions are raised and an infinity or maximal finite value is
1023 returned.  If the abstract value is too small, the input value is rounded to
1024 a subnormal number, and the underflow and inexact exceptions are raised if
1025 the abstract input cannot be represented exactly as a subnormal quadruple-
1026 precision floating-point number.
1027     The input significand must be normalized or smaller.  If the input
1028 significand is not normalized, `zExp' must be 0; in that case, the result
1029 returned is a subnormal number, and it must not require rounding.  In the
1030 usual case that the input significand is normalized, `zExp' must be 1 less
1031 than the ``true'' floating-point exponent.  The handling of underflow and
1032 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1033 -------------------------------------------------------------------------------
1034 */
1035 static float128
1036  roundAndPackFloat128(
1037      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
1038 {
1039     int8 roundingMode;
1040     flag roundNearestEven, increment, isTiny;
1041 
1042     roundingMode = float_rounding_mode();
1043     roundNearestEven = ( roundingMode == float_round_nearest_even );
1044     increment = ( (sbits64) zSig2 < 0 );
1045     if ( ! roundNearestEven ) {
1046         if ( roundingMode == float_round_to_zero ) {
1047             increment = 0;
1048         }
1049         else {
1050             if ( zSign ) {
1051                 increment = ( roundingMode == float_round_down ) && zSig2;
1052             }
1053             else {
1054                 increment = ( roundingMode == float_round_up ) && zSig2;
1055             }
1056         }
1057     }
1058     if ( 0x7FFD <= (bits32) zExp ) {
1059         if (    ( 0x7FFD < zExp )
1060              || (    ( zExp == 0x7FFD )
1061                   && eq128(
1062                          LIT64( 0x0001FFFFFFFFFFFF ),
1063                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1064                          zSig0,
1065                          zSig1
1066                      )
1067                   && increment
1068                 )
1069            ) {
1070             float_raise( float_flag_overflow | float_flag_inexact );
1071             if (    ( roundingMode == float_round_to_zero )
1072                  || ( zSign && ( roundingMode == float_round_up ) )
1073                  || ( ! zSign && ( roundingMode == float_round_down ) )
1074                ) {
1075                 return
1076                     packFloat128(
1077                         zSign,
1078                         0x7FFE,
1079                         LIT64( 0x0000FFFFFFFFFFFF ),
1080                         LIT64( 0xFFFFFFFFFFFFFFFF )
1081                     );
1082             }
1083             return packFloat128( zSign, 0x7FFF, 0, 0 );
1084         }
1085         if ( zExp < 0 ) {
1086             isTiny =
1087                    ( float_detect_tininess == float_tininess_before_rounding )
1088                 || ( zExp < -1 )
1089                 || ! increment
1090                 || lt128(
1091                        zSig0,
1092                        zSig1,
1093                        LIT64( 0x0001FFFFFFFFFFFF ),
1094                        LIT64( 0xFFFFFFFFFFFFFFFF )
1095                    );
1096             shift128ExtraRightJamming(
1097                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1098             zExp = 0;
1099             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1100             if ( roundNearestEven ) {
1101                 increment = ( (sbits64) zSig2 < 0 );
1102             }
1103             else {
1104                 if ( zSign ) {
1105                     increment = ( roundingMode == float_round_down ) && zSig2;
1106                 }
1107                 else {
1108                     increment = ( roundingMode == float_round_up ) && zSig2;
1109                 }
1110             }
1111         }
1112     }
1113     if ( zSig2 ) float_set_inexact();
1114     if ( increment ) {
1115         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1116         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1117     }
1118     else {
1119         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1120     }
1121     return packFloat128( zSign, zExp, zSig0, zSig1 );
1122 
1123 }
1124 
1125 /*
1126 -------------------------------------------------------------------------------
1127 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1128 and significand formed by the concatenation of `zSig0' and `zSig1', and
1129 returns the proper quadruple-precision floating-point value corresponding
1130 to the abstract input.  This routine is just like `roundAndPackFloat128'
1131 except that the input significand has fewer bits and does not have to be
1132 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1133 point exponent.
1134 -------------------------------------------------------------------------------
1135 */
1136 static float128
1137  normalizeRoundAndPackFloat128(
1138      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1139 {
1140     int8 shiftCount;
1141     bits64 zSig2;
1142 
1143     if ( zSig0 == 0 ) {
1144         zSig0 = zSig1;
1145         zSig1 = 0;
1146         zExp -= 64;
1147     }
1148     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1149     if ( 0 <= shiftCount ) {
1150         zSig2 = 0;
1151         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1152     }
1153     else {
1154         shift128ExtraRightJamming(
1155             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1156     }
1157     zExp -= shiftCount;
1158     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1159 
1160 }
1161 
1162 #endif
1163 
1164 /*
1165 -------------------------------------------------------------------------------
1166 Returns the result of converting the 32-bit two's complement integer `a'
1167 to the single-precision floating-point format.  The conversion is performed
1168 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1169 -------------------------------------------------------------------------------
1170 */
1171 float32 int32_to_float32( int32 a )
1172 {
1173     flag zSign;
1174 
1175     if ( a == 0 ) return 0;
1176     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1177     zSign = ( a < 0 );
1178     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1179 
1180 }
1181 
1182 /*
1183 -------------------------------------------------------------------------------
1184 Returns the result of converting the 32-bit two's complement integer `a'
1185 to the double-precision floating-point format.  The conversion is performed
1186 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1187 -------------------------------------------------------------------------------
1188 */
1189 float64 int32_to_float64( int32 a )
1190 {
1191     flag zSign;
1192     uint32 absA;
1193     int8 shiftCount;
1194     bits64 zSig;
1195 
1196     if ( a == 0 ) return 0;
1197     zSign = ( a < 0 );
1198     absA = zSign ? - a : a;
1199     shiftCount = countLeadingZeros32( absA ) + 21;
1200     zSig = absA;
1201     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1202 
1203 }
1204 
1205 #ifdef FLOATX80
1206 
1207 /*
1208 -------------------------------------------------------------------------------
1209 Returns the result of converting the 32-bit two's complement integer `a'
1210 to the extended double-precision floating-point format.  The conversion
1211 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1212 Arithmetic.
1213 -------------------------------------------------------------------------------
1214 */
1215 floatx80 int32_to_floatx80( int32 a )
1216 {
1217     flag zSign;
1218     uint32 absA;
1219     int8 shiftCount;
1220     bits64 zSig;
1221 
1222     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1223     zSign = ( a < 0 );
1224     absA = zSign ? - a : a;
1225     shiftCount = countLeadingZeros32( absA ) + 32;
1226     zSig = absA;
1227     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1228 
1229 }
1230 
1231 #endif
1232 
1233 #ifdef FLOAT128
1234 
1235 /*
1236 -------------------------------------------------------------------------------
1237 Returns the result of converting the 32-bit two's complement integer `a' to
1238 the quadruple-precision floating-point format.  The conversion is performed
1239 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1240 -------------------------------------------------------------------------------
1241 */
1242 float128 int32_to_float128( int32 a )
1243 {
1244     flag zSign;
1245     uint32 absA;
1246     int8 shiftCount;
1247     bits64 zSig0;
1248 
1249     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1250     zSign = ( a < 0 );
1251     absA = zSign ? - a : a;
1252     shiftCount = countLeadingZeros32( absA ) + 17;
1253     zSig0 = absA;
1254     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1255 
1256 }
1257 
1258 #endif
1259 
1260 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1261 /*
1262 -------------------------------------------------------------------------------
1263 Returns the result of converting the 64-bit two's complement integer `a'
1264 to the single-precision floating-point format.  The conversion is performed
1265 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1266 -------------------------------------------------------------------------------
1267 */
1268 float32 int64_to_float32( int64 a )
1269 {
1270     flag zSign;
1271     uint64 absA;
1272     int8 shiftCount;
1273 
1274     if ( a == 0 ) return 0;
1275     zSign = ( a < 0 );
1276     absA = zSign ? - a : a;
1277     shiftCount = countLeadingZeros64( absA ) - 40;
1278     if ( 0 <= shiftCount ) {
1279         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1280     }
1281     else {
1282         shiftCount += 7;
1283         if ( shiftCount < 0 ) {
1284             shift64RightJamming( absA, - shiftCount, &absA );
1285         }
1286         else {
1287             absA <<= shiftCount;
1288         }
1289         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1290     }
1291 
1292 }
1293 
1294 /*
1295 -------------------------------------------------------------------------------
1296 Returns the result of converting the 64-bit two's complement integer `a'
1297 to the double-precision floating-point format.  The conversion is performed
1298 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1299 -------------------------------------------------------------------------------
1300 */
1301 float64 int64_to_float64( int64 a )
1302 {
1303     flag zSign;
1304 
1305     if ( a == 0 ) return 0;
1306     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1307         return packFloat64( 1, 0x43E, 0 );
1308     }
1309     zSign = ( a < 0 );
1310     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1311 
1312 }
1313 
1314 #ifdef FLOATX80
1315 
1316 /*
1317 -------------------------------------------------------------------------------
1318 Returns the result of converting the 64-bit two's complement integer `a'
1319 to the extended double-precision floating-point format.  The conversion
1320 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1321 Arithmetic.
1322 -------------------------------------------------------------------------------
1323 */
1324 floatx80 int64_to_floatx80( int64 a )
1325 {
1326     flag zSign;
1327     uint64 absA;
1328     int8 shiftCount;
1329 
1330     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1331     zSign = ( a < 0 );
1332     absA = zSign ? - a : a;
1333     shiftCount = countLeadingZeros64( absA );
1334     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1335 
1336 }
1337 
1338 #endif
1339 
1340 #ifdef FLOAT128
1341 
1342 /*
1343 -------------------------------------------------------------------------------
1344 Returns the result of converting the 64-bit two's complement integer `a' to
1345 the quadruple-precision floating-point format.  The conversion is performed
1346 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1347 -------------------------------------------------------------------------------
1348 */
1349 float128 int64_to_float128( int64 a )
1350 {
1351     flag zSign;
1352     uint64 absA;
1353     int8 shiftCount;
1354     int32 zExp;
1355     bits64 zSig0, zSig1;
1356 
1357     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1358     zSign = ( a < 0 );
1359     absA = zSign ? - a : a;
1360     shiftCount = countLeadingZeros64( absA ) + 49;
1361     zExp = 0x406E - shiftCount;
1362     if ( 64 <= shiftCount ) {
1363         zSig1 = 0;
1364         zSig0 = absA;
1365         shiftCount -= 64;
1366     }
1367     else {
1368         zSig1 = absA;
1369         zSig0 = 0;
1370     }
1371     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1372     return packFloat128( zSign, zExp, zSig0, zSig1 );
1373 
1374 }
1375 
1376 #endif
1377 #endif /* !SOFTFLOAT_FOR_GCC */
1378 
1379 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1380 /*
1381 -------------------------------------------------------------------------------
1382 Returns the result of converting the single-precision floating-point value
1383 `a' to the 32-bit two's complement integer format.  The conversion is
1384 performed according to the IEC/IEEE Standard for Binary Floating-Point
1385 Arithmetic---which means in particular that the conversion is rounded
1386 according to the current rounding mode.  If `a' is a NaN, the largest
1387 positive integer is returned.  Otherwise, if the conversion overflows, the
1388 largest integer with the same sign as `a' is returned.
1389 -------------------------------------------------------------------------------
1390 */
1391 int32 float32_to_int32( float32 a )
1392 {
1393     flag aSign;
1394     int16 aExp, shiftCount;
1395     bits32 aSig;
1396     bits64 aSig64;
1397 
1398     aSig = extractFloat32Frac( a );
1399     aExp = extractFloat32Exp( a );
1400     aSign = extractFloat32Sign( a );
1401     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1402     if ( aExp ) aSig |= 0x00800000;
1403     shiftCount = 0xAF - aExp;
1404     aSig64 = aSig;
1405     aSig64 <<= 32;
1406     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1407     return roundAndPackInt32( aSign, aSig64 );
1408 
1409 }
1410 #endif /* !SOFTFLOAT_FOR_GCC */
1411 
1412 /*
1413 -------------------------------------------------------------------------------
1414 Returns the result of converting the single-precision floating-point value
1415 `a' to the 32-bit two's complement integer format.  The conversion is
1416 performed according to the IEC/IEEE Standard for Binary Floating-Point
1417 Arithmetic, except that the conversion is always rounded toward zero.
1418 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1419 the conversion overflows, the largest integer with the same sign as `a' is
1420 returned.
1421 -------------------------------------------------------------------------------
1422 */
1423 int32 float32_to_int32_round_to_zero( float32 a )
1424 {
1425     flag aSign;
1426     int16 aExp, shiftCount;
1427     bits32 aSig;
1428     int32 z;
1429 
1430     aSig = extractFloat32Frac( a );
1431     aExp = extractFloat32Exp( a );
1432     aSign = extractFloat32Sign( a );
1433     shiftCount = aExp - 0x9E;
1434     if ( 0 <= shiftCount ) {
1435         if ( a != 0xCF000000 ) {
1436             float_raise( float_flag_invalid );
1437             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1438         }
1439         return (sbits32) 0x80000000;
1440     }
1441     else if ( aExp <= 0x7E ) {
1442         if ( aExp | aSig ) float_set_inexact();
1443         return 0;
1444     }
1445     aSig = ( aSig | 0x00800000 )<<8;
1446     z = aSig>>( - shiftCount );
1447     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1448         float_set_inexact();
1449     }
1450     if ( aSign ) z = - z;
1451     return z;
1452 
1453 }
1454 
1455 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1456 /*
1457 -------------------------------------------------------------------------------
1458 Returns the result of converting the single-precision floating-point value
1459 `a' to the 64-bit two's complement integer format.  The conversion is
1460 performed according to the IEC/IEEE Standard for Binary Floating-Point
1461 Arithmetic---which means in particular that the conversion is rounded
1462 according to the current rounding mode.  If `a' is a NaN, the largest
1463 positive integer is returned.  Otherwise, if the conversion overflows, the
1464 largest integer with the same sign as `a' is returned.
1465 -------------------------------------------------------------------------------
1466 */
1467 int64 float32_to_int64( float32 a )
1468 {
1469     flag aSign;
1470     int16 aExp, shiftCount;
1471     bits32 aSig;
1472     bits64 aSig64, aSigExtra;
1473 
1474     aSig = extractFloat32Frac( a );
1475     aExp = extractFloat32Exp( a );
1476     aSign = extractFloat32Sign( a );
1477     shiftCount = 0xBE - aExp;
1478     if ( shiftCount < 0 ) {
1479         float_raise( float_flag_invalid );
1480         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1481             return LIT64( 0x7FFFFFFFFFFFFFFF );
1482         }
1483         return (sbits64) LIT64( 0x8000000000000000 );
1484     }
1485     if ( aExp ) aSig |= 0x00800000;
1486     aSig64 = aSig;
1487     aSig64 <<= 40;
1488     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1489     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1490 
1491 }
1492 
1493 /*
1494 -------------------------------------------------------------------------------
1495 Returns the result of converting the single-precision floating-point value
1496 `a' to the 64-bit two's complement integer format.  The conversion is
1497 performed according to the IEC/IEEE Standard for Binary Floating-Point
1498 Arithmetic, except that the conversion is always rounded toward zero.  If
1499 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1500 conversion overflows, the largest integer with the same sign as `a' is
1501 returned.
1502 -------------------------------------------------------------------------------
1503 */
1504 int64 float32_to_int64_round_to_zero( float32 a )
1505 {
1506     flag aSign;
1507     int16 aExp, shiftCount;
1508     bits32 aSig;
1509     bits64 aSig64;
1510     int64 z;
1511 
1512     aSig = extractFloat32Frac( a );
1513     aExp = extractFloat32Exp( a );
1514     aSign = extractFloat32Sign( a );
1515     shiftCount = aExp - 0xBE;
1516     if ( 0 <= shiftCount ) {
1517         if ( a != 0xDF000000 ) {
1518             float_raise( float_flag_invalid );
1519             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1520                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1521             }
1522         }
1523         return (sbits64) LIT64( 0x8000000000000000 );
1524     }
1525     else if ( aExp <= 0x7E ) {
1526         if ( aExp | aSig ) float_set_inexact();
1527         return 0;
1528     }
1529     aSig64 = aSig | 0x00800000;
1530     aSig64 <<= 40;
1531     z = aSig64>>( - shiftCount );
1532     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1533         float_set_inexact();
1534     }
1535     if ( aSign ) z = - z;
1536     return z;
1537 
1538 }
1539 #endif /* !SOFTFLOAT_FOR_GCC */
1540 
1541 /*
1542 -------------------------------------------------------------------------------
1543 Returns the result of converting the single-precision floating-point value
1544 `a' to the double-precision floating-point format.  The conversion is
1545 performed according to the IEC/IEEE Standard for Binary Floating-Point
1546 Arithmetic.
1547 -------------------------------------------------------------------------------
1548 */
1549 float64 float32_to_float64( float32 a )
1550 {
1551     flag aSign;
1552     int16 aExp;
1553     bits32 aSig;
1554 
1555     aSig = extractFloat32Frac( a );
1556     aExp = extractFloat32Exp( a );
1557     aSign = extractFloat32Sign( a );
1558     if ( aExp == 0xFF ) {
1559         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1560         return packFloat64( aSign, 0x7FF, 0 );
1561     }
1562     if ( aExp == 0 ) {
1563         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1564         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1565         --aExp;
1566     }
1567     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1568 
1569 }
1570 
1571 #ifdef FLOATX80
1572 
1573 /*
1574 -------------------------------------------------------------------------------
1575 Returns the result of converting the single-precision floating-point value
1576 `a' to the extended double-precision floating-point format.  The conversion
1577 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1578 Arithmetic.
1579 -------------------------------------------------------------------------------
1580 */
1581 floatx80 float32_to_floatx80( float32 a )
1582 {
1583     flag aSign;
1584     int16 aExp;
1585     bits32 aSig;
1586 
1587     aSig = extractFloat32Frac( a );
1588     aExp = extractFloat32Exp( a );
1589     aSign = extractFloat32Sign( a );
1590     if ( aExp == 0xFF ) {
1591         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1592         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1593     }
1594     if ( aExp == 0 ) {
1595         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1596         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1597     }
1598     aSig |= 0x00800000;
1599     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1600 
1601 }
1602 
1603 #endif
1604 
1605 #ifdef FLOAT128
1606 
1607 /*
1608 -------------------------------------------------------------------------------
1609 Returns the result of converting the single-precision floating-point value
1610 `a' to the double-precision floating-point format.  The conversion is
1611 performed according to the IEC/IEEE Standard for Binary Floating-Point
1612 Arithmetic.
1613 -------------------------------------------------------------------------------
1614 */
1615 float128 float32_to_float128( float32 a )
1616 {
1617     flag aSign;
1618     int16 aExp;
1619     bits32 aSig;
1620 
1621     aSig = extractFloat32Frac( a );
1622     aExp = extractFloat32Exp( a );
1623     aSign = extractFloat32Sign( a );
1624     if ( aExp == 0xFF ) {
1625         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1626         return packFloat128( aSign, 0x7FFF, 0, 0 );
1627     }
1628     if ( aExp == 0 ) {
1629         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1630         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1631         --aExp;
1632     }
1633     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1634 
1635 }
1636 
1637 #endif
1638 
1639 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1640 /*
1641 -------------------------------------------------------------------------------
1642 Rounds the single-precision floating-point value `a' to an integer, and
1643 returns the result as a single-precision floating-point value.  The
1644 operation is performed according to the IEC/IEEE Standard for Binary
1645 Floating-Point Arithmetic.
1646 -------------------------------------------------------------------------------
1647 */
1648 float32 float32_round_to_int( float32 a )
1649 {
1650     flag aSign;
1651     int16 aExp;
1652     bits32 lastBitMask, roundBitsMask;
1653     int8 roundingMode;
1654     float32 z;
1655 
1656     aExp = extractFloat32Exp( a );
1657     if ( 0x96 <= aExp ) {
1658         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1659             return propagateFloat32NaN( a, a );
1660         }
1661         return a;
1662     }
1663     if ( aExp <= 0x7E ) {
1664         if ( (bits32) ( a<<1 ) == 0 ) return a;
1665         float_set_inexact();
1666         aSign = extractFloat32Sign( a );
1667         switch ( float_rounding_mode() ) {
1668          case float_round_nearest_even:
1669             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1670                 return packFloat32( aSign, 0x7F, 0 );
1671             }
1672             break;
1673          case float_round_down:
1674             return aSign ? 0xBF800000 : 0;
1675          case float_round_up:
1676             return aSign ? 0x80000000 : 0x3F800000;
1677         }
1678         return packFloat32( aSign, 0, 0 );
1679     }
1680     lastBitMask = 1;
1681     lastBitMask <<= 0x96 - aExp;
1682     roundBitsMask = lastBitMask - 1;
1683     z = a;
1684     roundingMode = float_rounding_mode();
1685     if ( roundingMode == float_round_nearest_even ) {
1686         z += lastBitMask>>1;
1687         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1688     }
1689     else if ( roundingMode != float_round_to_zero ) {
1690         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1691             z += roundBitsMask;
1692         }
1693     }
1694     z &= ~ roundBitsMask;
1695     if ( z != a ) float_set_inexact();
1696     return z;
1697 
1698 }
1699 #endif /* !SOFTFLOAT_FOR_GCC */
1700 
1701 /*
1702 -------------------------------------------------------------------------------
1703 Returns the result of adding the absolute values of the single-precision
1704 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1705 before being returned.  `zSign' is ignored if the result is a NaN.
1706 The addition is performed according to the IEC/IEEE Standard for Binary
1707 Floating-Point Arithmetic.
1708 -------------------------------------------------------------------------------
1709 */
1710 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1711 {
1712     int16 aExp, bExp, zExp;
1713     bits32 aSig, bSig, zSig;
1714     int16 expDiff;
1715 
1716     aSig = extractFloat32Frac( a );
1717     aExp = extractFloat32Exp( a );
1718     bSig = extractFloat32Frac( b );
1719     bExp = extractFloat32Exp( b );
1720     expDiff = aExp - bExp;
1721     aSig <<= 6;
1722     bSig <<= 6;
1723     if ( 0 < expDiff ) {
1724         if ( aExp == 0xFF ) {
1725             if ( aSig ) return propagateFloat32NaN( a, b );
1726             return a;
1727         }
1728         if ( bExp == 0 ) {
1729             --expDiff;
1730         }
1731         else {
1732             bSig |= 0x20000000;
1733         }
1734         shift32RightJamming( bSig, expDiff, &bSig );
1735         zExp = aExp;
1736     }
1737     else if ( expDiff < 0 ) {
1738         if ( bExp == 0xFF ) {
1739             if ( bSig ) return propagateFloat32NaN( a, b );
1740             return packFloat32( zSign, 0xFF, 0 );
1741         }
1742         if ( aExp == 0 ) {
1743             ++expDiff;
1744         }
1745         else {
1746             aSig |= 0x20000000;
1747         }
1748         shift32RightJamming( aSig, - expDiff, &aSig );
1749         zExp = bExp;
1750     }
1751     else {
1752         if ( aExp == 0xFF ) {
1753             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1754             return a;
1755         }
1756         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1757         zSig = 0x40000000 + aSig + bSig;
1758         zExp = aExp;
1759         goto roundAndPack;
1760     }
1761     aSig |= 0x20000000;
1762     zSig = ( aSig + bSig )<<1;
1763     --zExp;
1764     if ( (sbits32) zSig < 0 ) {
1765         zSig = aSig + bSig;
1766         ++zExp;
1767     }
1768  roundAndPack:
1769     return roundAndPackFloat32( zSign, zExp, zSig );
1770 
1771 }
1772 
1773 /*
1774 -------------------------------------------------------------------------------
1775 Returns the result of subtracting the absolute values of the single-
1776 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1777 difference is negated before being returned.  `zSign' is ignored if the
1778 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1779 Standard for Binary Floating-Point Arithmetic.
1780 -------------------------------------------------------------------------------
1781 */
1782 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1783 {
1784     int16 aExp, bExp, zExp;
1785     bits32 aSig, bSig, zSig;
1786     int16 expDiff;
1787 
1788     aSig = extractFloat32Frac( a );
1789     aExp = extractFloat32Exp( a );
1790     bSig = extractFloat32Frac( b );
1791     bExp = extractFloat32Exp( b );
1792     expDiff = aExp - bExp;
1793     aSig <<= 7;
1794     bSig <<= 7;
1795     if ( 0 < expDiff ) goto aExpBigger;
1796     if ( expDiff < 0 ) goto bExpBigger;
1797     if ( aExp == 0xFF ) {
1798         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1799         float_raise( float_flag_invalid );
1800         return float32_default_nan;
1801     }
1802     if ( aExp == 0 ) {
1803         aExp = 1;
1804         bExp = 1;
1805     }
1806     if ( bSig < aSig ) goto aBigger;
1807     if ( aSig < bSig ) goto bBigger;
1808     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1809  bExpBigger:
1810     if ( bExp == 0xFF ) {
1811         if ( bSig ) return propagateFloat32NaN( a, b );
1812         return packFloat32( zSign ^ 1, 0xFF, 0 );
1813     }
1814     if ( aExp == 0 ) {
1815         ++expDiff;
1816     }
1817     else {
1818         aSig |= 0x40000000;
1819     }
1820     shift32RightJamming( aSig, - expDiff, &aSig );
1821     bSig |= 0x40000000;
1822  bBigger:
1823     zSig = bSig - aSig;
1824     zExp = bExp;
1825     zSign ^= 1;
1826     goto normalizeRoundAndPack;
1827  aExpBigger:
1828     if ( aExp == 0xFF ) {
1829         if ( aSig ) return propagateFloat32NaN( a, b );
1830         return a;
1831     }
1832     if ( bExp == 0 ) {
1833         --expDiff;
1834     }
1835     else {
1836         bSig |= 0x40000000;
1837     }
1838     shift32RightJamming( bSig, expDiff, &bSig );
1839     aSig |= 0x40000000;
1840  aBigger:
1841     zSig = aSig - bSig;
1842     zExp = aExp;
1843  normalizeRoundAndPack:
1844     --zExp;
1845     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1846 
1847 }
1848 
1849 /*
1850 -------------------------------------------------------------------------------
1851 Returns the result of adding the single-precision floating-point values `a'
1852 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1853 Binary Floating-Point Arithmetic.
1854 -------------------------------------------------------------------------------
1855 */
1856 float32 float32_add( float32 a, float32 b )
1857 {
1858     flag aSign, bSign;
1859 
1860     aSign = extractFloat32Sign( a );
1861     bSign = extractFloat32Sign( b );
1862     if ( aSign == bSign ) {
1863         return addFloat32Sigs( a, b, aSign );
1864     }
1865     else {
1866         return subFloat32Sigs( a, b, aSign );
1867     }
1868 
1869 }
1870 
1871 /*
1872 -------------------------------------------------------------------------------
1873 Returns the result of subtracting the single-precision floating-point values
1874 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1875 for Binary Floating-Point Arithmetic.
1876 -------------------------------------------------------------------------------
1877 */
1878 float32 float32_sub( float32 a, float32 b )
1879 {
1880     flag aSign, bSign;
1881 
1882     aSign = extractFloat32Sign( a );
1883     bSign = extractFloat32Sign( b );
1884     if ( aSign == bSign ) {
1885         return subFloat32Sigs( a, b, aSign );
1886     }
1887     else {
1888         return addFloat32Sigs( a, b, aSign );
1889     }
1890 
1891 }
1892 
1893 /*
1894 -------------------------------------------------------------------------------
1895 Returns the result of multiplying the single-precision floating-point values
1896 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1897 for Binary Floating-Point Arithmetic.
1898 -------------------------------------------------------------------------------
1899 */
1900 float32 float32_mul( float32 a, float32 b )
1901 {
1902     flag aSign, bSign, zSign;
1903     int16 aExp, bExp, zExp;
1904     bits32 aSig, bSig;
1905     bits64 zSig64;
1906     bits32 zSig;
1907 
1908     aSig = extractFloat32Frac( a );
1909     aExp = extractFloat32Exp( a );
1910     aSign = extractFloat32Sign( a );
1911     bSig = extractFloat32Frac( b );
1912     bExp = extractFloat32Exp( b );
1913     bSign = extractFloat32Sign( b );
1914     zSign = aSign ^ bSign;
1915     if ( aExp == 0xFF ) {
1916         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1917             return propagateFloat32NaN( a, b );
1918         }
1919         if ( ( bExp | bSig ) == 0 ) {
1920             float_raise( float_flag_invalid );
1921             return float32_default_nan;
1922         }
1923         return packFloat32( zSign, 0xFF, 0 );
1924     }
1925     if ( bExp == 0xFF ) {
1926         if ( bSig ) return propagateFloat32NaN( a, b );
1927         if ( ( aExp | aSig ) == 0 ) {
1928             float_raise( float_flag_invalid );
1929             return float32_default_nan;
1930         }
1931         return packFloat32( zSign, 0xFF, 0 );
1932     }
1933     if ( aExp == 0 ) {
1934         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1935         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1936     }
1937     if ( bExp == 0 ) {
1938         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1939         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1940     }
1941     zExp = aExp + bExp - 0x7F;
1942     aSig = ( aSig | 0x00800000 )<<7;
1943     bSig = ( bSig | 0x00800000 )<<8;
1944     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1945     zSig = zSig64;
1946     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1947         zSig <<= 1;
1948         --zExp;
1949     }
1950     return roundAndPackFloat32( zSign, zExp, zSig );
1951 
1952 }
1953 
1954 /*
1955 -------------------------------------------------------------------------------
1956 Returns the result of dividing the single-precision floating-point value `a'
1957 by the corresponding value `b'.  The operation is performed according to the
1958 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1959 -------------------------------------------------------------------------------
1960 */
1961 float32 float32_div( float32 a, float32 b )
1962 {
1963     flag aSign, bSign, zSign;
1964     int16 aExp, bExp, zExp;
1965     bits32 aSig, bSig, zSig;
1966 
1967     aSig = extractFloat32Frac( a );
1968     aExp = extractFloat32Exp( a );
1969     aSign = extractFloat32Sign( a );
1970     bSig = extractFloat32Frac( b );
1971     bExp = extractFloat32Exp( b );
1972     bSign = extractFloat32Sign( b );
1973     zSign = aSign ^ bSign;
1974     if ( aExp == 0xFF ) {
1975         if ( aSig ) return propagateFloat32NaN( a, b );
1976         if ( bExp == 0xFF ) {
1977             if ( bSig ) return propagateFloat32NaN( a, b );
1978             float_raise( float_flag_invalid );
1979             return float32_default_nan;
1980         }
1981         return packFloat32( zSign, 0xFF, 0 );
1982     }
1983     if ( bExp == 0xFF ) {
1984         if ( bSig ) return propagateFloat32NaN( a, b );
1985         return packFloat32( zSign, 0, 0 );
1986     }
1987     if ( bExp == 0 ) {
1988         if ( bSig == 0 ) {
1989             if ( ( aExp | aSig ) == 0 ) {
1990                 float_raise( float_flag_invalid );
1991                 return float32_default_nan;
1992             }
1993             float_raise( float_flag_divbyzero );
1994             return packFloat32( zSign, 0xFF, 0 );
1995         }
1996         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1997     }
1998     if ( aExp == 0 ) {
1999         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2000         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2001     }
2002     zExp = aExp - bExp + 0x7D;
2003     aSig = ( aSig | 0x00800000 )<<7;
2004     bSig = ( bSig | 0x00800000 )<<8;
2005     if ( bSig <= ( aSig + aSig ) ) {
2006         aSig >>= 1;
2007         ++zExp;
2008     }
2009     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2010     if ( ( zSig & 0x3F ) == 0 ) {
2011         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2012     }
2013     return roundAndPackFloat32( zSign, zExp, zSig );
2014 
2015 }
2016 
2017 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2018 /*
2019 -------------------------------------------------------------------------------
2020 Returns the remainder of the single-precision floating-point value `a'
2021 with respect to the corresponding value `b'.  The operation is performed
2022 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2023 -------------------------------------------------------------------------------
2024 */
2025 float32 float32_rem( float32 a, float32 b )
2026 {
2027     flag aSign, bSign, zSign;
2028     int16 aExp, bExp, expDiff;
2029     bits32 aSig, bSig;
2030     bits32 q;
2031     bits64 aSig64, bSig64, q64;
2032     bits32 alternateASig;
2033     sbits32 sigMean;
2034 
2035     aSig = extractFloat32Frac( a );
2036     aExp = extractFloat32Exp( a );
2037     aSign = extractFloat32Sign( a );
2038     bSig = extractFloat32Frac( b );
2039     bExp = extractFloat32Exp( b );
2040     bSign = extractFloat32Sign( b );
2041     if ( aExp == 0xFF ) {
2042         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2043             return propagateFloat32NaN( a, b );
2044         }
2045         float_raise( float_flag_invalid );
2046         return float32_default_nan;
2047     }
2048     if ( bExp == 0xFF ) {
2049         if ( bSig ) return propagateFloat32NaN( a, b );
2050         return a;
2051     }
2052     if ( bExp == 0 ) {
2053         if ( bSig == 0 ) {
2054             float_raise( float_flag_invalid );
2055             return float32_default_nan;
2056         }
2057         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2058     }
2059     if ( aExp == 0 ) {
2060         if ( aSig == 0 ) return a;
2061         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2062     }
2063     expDiff = aExp - bExp;
2064     aSig |= 0x00800000;
2065     bSig |= 0x00800000;
2066     if ( expDiff < 32 ) {
2067         aSig <<= 8;
2068         bSig <<= 8;
2069         if ( expDiff < 0 ) {
2070             if ( expDiff < -1 ) return a;
2071             aSig >>= 1;
2072         }
2073         q = ( bSig <= aSig );
2074         if ( q ) aSig -= bSig;
2075         if ( 0 < expDiff ) {
2076             q = ( ( (bits64) aSig )<<32 ) / bSig;
2077             q >>= 32 - expDiff;
2078             bSig >>= 2;
2079             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2080         }
2081         else {
2082             aSig >>= 2;
2083             bSig >>= 2;
2084         }
2085     }
2086     else {
2087         if ( bSig <= aSig ) aSig -= bSig;
2088         aSig64 = ( (bits64) aSig )<<40;
2089         bSig64 = ( (bits64) bSig )<<40;
2090         expDiff -= 64;
2091         while ( 0 < expDiff ) {
2092             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2093             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2094             aSig64 = - ( ( bSig * q64 )<<38 );
2095             expDiff -= 62;
2096         }
2097         expDiff += 64;
2098         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2099         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2100         q = q64>>( 64 - expDiff );
2101         bSig <<= 6;
2102         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2103     }
2104     do {
2105         alternateASig = aSig;
2106         ++q;
2107         aSig -= bSig;
2108     } while ( 0 <= (sbits32) aSig );
2109     sigMean = aSig + alternateASig;
2110     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2111         aSig = alternateASig;
2112     }
2113     zSign = ( (sbits32) aSig < 0 );
2114     if ( zSign ) aSig = - aSig;
2115     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2116 
2117 }
2118 #endif /* !SOFTFLOAT_FOR_GCC */
2119 
2120 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2121 /*
2122 -------------------------------------------------------------------------------
2123 Returns the square root of the single-precision floating-point value `a'.
2124 The operation is performed according to the IEC/IEEE Standard for Binary
2125 Floating-Point Arithmetic.
2126 -------------------------------------------------------------------------------
2127 */
2128 float32 float32_sqrt( float32 a )
2129 {
2130     flag aSign;
2131     int16 aExp, zExp;
2132     bits32 aSig, zSig;
2133     bits64 rem, term;
2134 
2135     aSig = extractFloat32Frac( a );
2136     aExp = extractFloat32Exp( a );
2137     aSign = extractFloat32Sign( a );
2138     if ( aExp == 0xFF ) {
2139         if ( aSig ) return propagateFloat32NaN( a, 0 );
2140         if ( ! aSign ) return a;
2141         float_raise( float_flag_invalid );
2142         return float32_default_nan;
2143     }
2144     if ( aSign ) {
2145         if ( ( aExp | aSig ) == 0 ) return a;
2146         float_raise( float_flag_invalid );
2147         return float32_default_nan;
2148     }
2149     if ( aExp == 0 ) {
2150         if ( aSig == 0 ) return 0;
2151         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2152     }
2153     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2154     aSig = ( aSig | 0x00800000 )<<8;
2155     zSig = estimateSqrt32( aExp, aSig ) + 2;
2156     if ( ( zSig & 0x7F ) <= 5 ) {
2157         if ( zSig < 2 ) {
2158             zSig = 0x7FFFFFFF;
2159             goto roundAndPack;
2160         }
2161         aSig >>= aExp & 1;
2162         term = ( (bits64) zSig ) * zSig;
2163         rem = ( ( (bits64) aSig )<<32 ) - term;
2164         while ( (sbits64) rem < 0 ) {
2165             --zSig;
2166             rem += ( ( (bits64) zSig )<<1 ) | 1;
2167         }
2168         zSig |= ( rem != 0 );
2169     }
2170     shift32RightJamming( zSig, 1, &zSig );
2171  roundAndPack:
2172     return roundAndPackFloat32( 0, zExp, zSig );
2173 
2174 }
2175 #endif /* !SOFTFLOAT_FOR_GCC */
2176 
2177 /*
2178 -------------------------------------------------------------------------------
2179 Returns 1 if the single-precision floating-point value `a' is equal to
2180 the corresponding value `b', and 0 otherwise.  The comparison is performed
2181 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2182 -------------------------------------------------------------------------------
2183 */
2184 flag float32_eq( float32 a, float32 b )
2185 {
2186 
2187     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2188          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2189        ) {
2190         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2191             float_raise( float_flag_invalid );
2192         }
2193         return 0;
2194     }
2195     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2196 
2197 }
2198 
2199 /*
2200 -------------------------------------------------------------------------------
2201 Returns 1 if the single-precision floating-point value `a' is less than
2202 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2203 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2204 Arithmetic.
2205 -------------------------------------------------------------------------------
2206 */
2207 flag float32_le( float32 a, float32 b )
2208 {
2209     flag aSign, bSign;
2210 
2211     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2212          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2213        ) {
2214         float_raise( float_flag_invalid );
2215         return 0;
2216     }
2217     aSign = extractFloat32Sign( a );
2218     bSign = extractFloat32Sign( b );
2219     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2220     return ( a == b ) || ( aSign ^ ( a < b ) );
2221 
2222 }
2223 
2224 /*
2225 -------------------------------------------------------------------------------
2226 Returns 1 if the single-precision floating-point value `a' is less than
2227 the corresponding value `b', and 0 otherwise.  The comparison is performed
2228 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2229 -------------------------------------------------------------------------------
2230 */
2231 flag float32_lt( float32 a, float32 b )
2232 {
2233     flag aSign, bSign;
2234 
2235     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2236          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2237        ) {
2238         float_raise( float_flag_invalid );
2239         return 0;
2240     }
2241     aSign = extractFloat32Sign( a );
2242     bSign = extractFloat32Sign( b );
2243     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2244     return ( a != b ) && ( aSign ^ ( a < b ) );
2245 
2246 }
2247 
2248 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2249 /*
2250 -------------------------------------------------------------------------------
2251 Returns 1 if the single-precision floating-point value `a' is equal to
2252 the corresponding value `b', and 0 otherwise.  The invalid exception is
2253 raised if either operand is a NaN.  Otherwise, the comparison is performed
2254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2255 -------------------------------------------------------------------------------
2256 */
2257 flag float32_eq_signaling( float32 a, float32 b )
2258 {
2259 
2260     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2261          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2262        ) {
2263         float_raise( float_flag_invalid );
2264         return 0;
2265     }
2266     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2267 
2268 }
2269 
2270 /*
2271 -------------------------------------------------------------------------------
2272 Returns 1 if the single-precision floating-point value `a' is less than or
2273 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2274 cause an exception.  Otherwise, the comparison is performed according to the
2275 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2276 -------------------------------------------------------------------------------
2277 */
2278 flag float32_le_quiet( float32 a, float32 b )
2279 {
2280     flag aSign, bSign;
2281 
2282     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2283          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2284        ) {
2285         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2286             float_raise( float_flag_invalid );
2287         }
2288         return 0;
2289     }
2290     aSign = extractFloat32Sign( a );
2291     bSign = extractFloat32Sign( b );
2292     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2293     return ( a == b ) || ( aSign ^ ( a < b ) );
2294 
2295 }
2296 
2297 /*
2298 -------------------------------------------------------------------------------
2299 Returns 1 if the single-precision floating-point value `a' is less than
2300 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2301 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2302 Standard for Binary Floating-Point Arithmetic.
2303 -------------------------------------------------------------------------------
2304 */
2305 flag float32_lt_quiet( float32 a, float32 b )
2306 {
2307     flag aSign, bSign;
2308 
2309     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2310          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2311        ) {
2312         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2313             float_raise( float_flag_invalid );
2314         }
2315         return 0;
2316     }
2317     aSign = extractFloat32Sign( a );
2318     bSign = extractFloat32Sign( b );
2319     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2320     return ( a != b ) && ( aSign ^ ( a < b ) );
2321 
2322 }
2323 #endif /* !SOFTFLOAT_FOR_GCC */
2324 
2325 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2326 /*
2327 -------------------------------------------------------------------------------
2328 Returns the result of converting the double-precision floating-point value
2329 `a' to the 32-bit two's complement integer format.  The conversion is
2330 performed according to the IEC/IEEE Standard for Binary Floating-Point
2331 Arithmetic---which means in particular that the conversion is rounded
2332 according to the current rounding mode.  If `a' is a NaN, the largest
2333 positive integer is returned.  Otherwise, if the conversion overflows, the
2334 largest integer with the same sign as `a' is returned.
2335 -------------------------------------------------------------------------------
2336 */
2337 int32 float64_to_int32( float64 a )
2338 {
2339     flag aSign;
2340     int16 aExp, shiftCount;
2341     bits64 aSig;
2342 
2343     aSig = extractFloat64Frac( a );
2344     aExp = extractFloat64Exp( a );
2345     aSign = extractFloat64Sign( a );
2346     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2347     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2348     shiftCount = 0x42C - aExp;
2349     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2350     return roundAndPackInt32( aSign, aSig );
2351 
2352 }
2353 #endif /* !SOFTFLOAT_FOR_GCC */
2354 
2355 /*
2356 -------------------------------------------------------------------------------
2357 Returns the result of converting the double-precision floating-point value
2358 `a' to the 32-bit two's complement integer format.  The conversion is
2359 performed according to the IEC/IEEE Standard for Binary Floating-Point
2360 Arithmetic, except that the conversion is always rounded toward zero.
2361 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2362 the conversion overflows, the largest integer with the same sign as `a' is
2363 returned.
2364 -------------------------------------------------------------------------------
2365 */
2366 int32 float64_to_int32_round_to_zero( float64 a )
2367 {
2368     flag aSign;
2369     int16 aExp, shiftCount;
2370     bits64 aSig, savedASig;
2371     int32 z;
2372 
2373     aSig = extractFloat64Frac( a );
2374     aExp = extractFloat64Exp( a );
2375     aSign = extractFloat64Sign( a );
2376     if ( 0x41E < aExp ) {
2377         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2378         goto invalid;
2379     }
2380     else if ( aExp < 0x3FF ) {
2381         if ( aExp || aSig ) float_set_inexact();
2382         return 0;
2383     }
2384     aSig |= LIT64( 0x0010000000000000 );
2385     shiftCount = 0x433 - aExp;
2386     savedASig = aSig;
2387     aSig >>= shiftCount;
2388     z = aSig;
2389     if ( aSign ) z = - z;
2390     if ( ( z < 0 ) ^ aSign ) {
2391  invalid:
2392         float_raise( float_flag_invalid );
2393         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2394     }
2395     if ( ( aSig<<shiftCount ) != savedASig ) {
2396         float_set_inexact();
2397     }
2398     return z;
2399 
2400 }
2401 
2402 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2403 /*
2404 -------------------------------------------------------------------------------
2405 Returns the result of converting the double-precision floating-point value
2406 `a' to the 64-bit two's complement integer format.  The conversion is
2407 performed according to the IEC/IEEE Standard for Binary Floating-Point
2408 Arithmetic---which means in particular that the conversion is rounded
2409 according to the current rounding mode.  If `a' is a NaN, the largest
2410 positive integer is returned.  Otherwise, if the conversion overflows, the
2411 largest integer with the same sign as `a' is returned.
2412 -------------------------------------------------------------------------------
2413 */
2414 int64 float64_to_int64( float64 a )
2415 {
2416     flag aSign;
2417     int16 aExp, shiftCount;
2418     bits64 aSig, aSigExtra;
2419 
2420     aSig = extractFloat64Frac( a );
2421     aExp = extractFloat64Exp( a );
2422     aSign = extractFloat64Sign( a );
2423     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2424     shiftCount = 0x433 - aExp;
2425     if ( shiftCount <= 0 ) {
2426         if ( 0x43E < aExp ) {
2427             float_raise( float_flag_invalid );
2428             if (    ! aSign
2429                  || (    ( aExp == 0x7FF )
2430                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2431                ) {
2432                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2433             }
2434             return (sbits64) LIT64( 0x8000000000000000 );
2435         }
2436         aSigExtra = 0;
2437         aSig <<= - shiftCount;
2438     }
2439     else {
2440         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2441     }
2442     return roundAndPackInt64( aSign, aSig, aSigExtra );
2443 
2444 }
2445 
2446 #ifdef __alpha__
2447 /*
2448 -------------------------------------------------------------------------------
2449 Returns the result of converting the double-precision floating-point value
2450 `a' to the 64-bit two's complement integer format.  The conversion is
2451 performed according to the IEC/IEEE Standard for Binary Floating-Point
2452 Arithmetic---which means in particular that the conversion is rounded
2453 according to the current rounding mode.  If `a' is a NaN, the invalid
2454 exception is raised and zero is returned.
2455 -------------------------------------------------------------------------------
2456 */
2457 int64 float64_to_int64_no_overflow( float64 a )
2458 {
2459     flag aSign;
2460     int16 aExp, shiftCount;
2461     bits64 aSig, aSigExtra;
2462 
2463     aSig = extractFloat64Frac( a );
2464     aExp = extractFloat64Exp( a );
2465     aSign = extractFloat64Sign( a );
2466     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2467     shiftCount = 0x433 - aExp;
2468     if ( shiftCount <= 0 ) {
2469         if ( 0x43E < aExp ) {
2470             float_raise( float_flag_invalid );
2471             return 0;
2472         }
2473         aSigExtra = 0;
2474         aSig <<= - shiftCount;
2475     }
2476     else {
2477         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2478     }
2479     return roundAndPackInt64NoOverflow( aSign, aSig, aSigExtra );
2480 
2481 }
2482 #endif /* __alpha__ */
2483 
2484 /*
2485 -------------------------------------------------------------------------------
2486 Returns the result of converting the double-precision floating-point value
2487 `a' to the 64-bit two's complement integer format.  The conversion is
2488 performed according to the IEC/IEEE Standard for Binary Floating-Point
2489 Arithmetic, except that the conversion is always rounded toward zero.
2490 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2491 the conversion overflows, the largest integer with the same sign as `a' is
2492 returned.
2493 -------------------------------------------------------------------------------
2494 */
2495 int64 float64_to_int64_round_to_zero( float64 a )
2496 {
2497     flag aSign;
2498     int16 aExp, shiftCount;
2499     bits64 aSig;
2500     int64 z;
2501 
2502     aSig = extractFloat64Frac( a );
2503     aExp = extractFloat64Exp( a );
2504     aSign = extractFloat64Sign( a );
2505     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2506     shiftCount = aExp - 0x433;
2507     if ( 0 <= shiftCount ) {
2508         if ( 0x43E <= aExp ) {
2509             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2510                 float_raise( float_flag_invalid );
2511                 if (    ! aSign
2512                      || (    ( aExp == 0x7FF )
2513                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2514                    ) {
2515                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2516                 }
2517             }
2518             return (sbits64) LIT64( 0x8000000000000000 );
2519         }
2520         z = aSig<<shiftCount;
2521     }
2522     else {
2523         if ( aExp < 0x3FE ) {
2524             if ( aExp | aSig ) float_set_inexact();
2525             return 0;
2526         }
2527         z = aSig>>( - shiftCount );
2528         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2529             float_set_inexact();
2530         }
2531     }
2532     if ( aSign ) z = - z;
2533     return z;
2534 
2535 }
2536 #endif /* !SOFTFLOAT_FOR_GCC */
2537 
2538 /*
2539 -------------------------------------------------------------------------------
2540 Returns the result of converting the double-precision floating-point value
2541 `a' to the single-precision floating-point format.  The conversion is
2542 performed according to the IEC/IEEE Standard for Binary Floating-Point
2543 Arithmetic.
2544 -------------------------------------------------------------------------------
2545 */
2546 float32 float64_to_float32( float64 a )
2547 {
2548     flag aSign;
2549     int16 aExp;
2550     bits64 aSig;
2551     bits32 zSig;
2552 
2553     aSig = extractFloat64Frac( a );
2554     aExp = extractFloat64Exp( a );
2555     aSign = extractFloat64Sign( a );
2556     if ( aExp == 0x7FF ) {
2557         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2558         return packFloat32( aSign, 0xFF, 0 );
2559     }
2560     shift64RightJamming( aSig, 22, &aSig );
2561     zSig = aSig;
2562     if ( aExp || zSig ) {
2563         zSig |= 0x40000000;
2564         aExp -= 0x381;
2565     }
2566     return roundAndPackFloat32( aSign, aExp, zSig );
2567 
2568 }
2569 
2570 #ifdef FLOATX80
2571 
2572 /*
2573 -------------------------------------------------------------------------------
2574 Returns the result of converting the double-precision floating-point value
2575 `a' to the extended double-precision floating-point format.  The conversion
2576 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2577 Arithmetic.
2578 -------------------------------------------------------------------------------
2579 */
2580 floatx80 float64_to_floatx80( float64 a )
2581 {
2582     flag aSign;
2583     int16 aExp;
2584     bits64 aSig;
2585 
2586     aSig = extractFloat64Frac( a );
2587     aExp = extractFloat64Exp( a );
2588     aSign = extractFloat64Sign( a );
2589     if ( aExp == 0x7FF ) {
2590         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2591         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2592     }
2593     if ( aExp == 0 ) {
2594         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2595         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2596     }
2597     return
2598         packFloatx80(
2599             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2600 
2601 }
2602 
2603 #endif
2604 
2605 #ifdef FLOAT128
2606 
2607 /*
2608 -------------------------------------------------------------------------------
2609 Returns the result of converting the double-precision floating-point value
2610 `a' to the quadruple-precision floating-point format.  The conversion is
2611 performed according to the IEC/IEEE Standard for Binary Floating-Point
2612 Arithmetic.
2613 -------------------------------------------------------------------------------
2614 */
2615 float128 float64_to_float128( float64 a )
2616 {
2617     flag aSign;
2618     int16 aExp;
2619     bits64 aSig, zSig0, zSig1;
2620 
2621     aSig = extractFloat64Frac( a );
2622     aExp = extractFloat64Exp( a );
2623     aSign = extractFloat64Sign( a );
2624     if ( aExp == 0x7FF ) {
2625         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2626         return packFloat128( aSign, 0x7FFF, 0, 0 );
2627     }
2628     if ( aExp == 0 ) {
2629         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2630         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2631         --aExp;
2632     }
2633     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2634     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2635 
2636 }
2637 
2638 #endif
2639 
2640 #ifndef SOFTFLOAT_FOR_GCC
2641 /*
2642 -------------------------------------------------------------------------------
2643 Rounds the double-precision floating-point value `a' to an integer, and
2644 returns the result as a double-precision floating-point value.  The
2645 operation is performed according to the IEC/IEEE Standard for Binary
2646 Floating-Point Arithmetic.
2647 -------------------------------------------------------------------------------
2648 */
2649 float64 float64_round_to_int( float64 a )
2650 {
2651     flag aSign;
2652     int16 aExp;
2653     bits64 lastBitMask, roundBitsMask;
2654     int8 roundingMode;
2655     float64 z;
2656 
2657     aExp = extractFloat64Exp( a );
2658     if ( 0x433 <= aExp ) {
2659         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2660             return propagateFloat64NaN( a, a );
2661         }
2662         return a;
2663     }
2664     if ( aExp < 0x3FF ) {
2665         if ( (bits64) ( a<<1 ) == 0 ) return a;
2666         float_set_inexact();
2667         aSign = extractFloat64Sign( a );
2668         switch ( float_rounding_mode() ) {
2669          case float_round_nearest_even:
2670             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2671                 return packFloat64( aSign, 0x3FF, 0 );
2672             }
2673             break;
2674          case float_round_down:
2675             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2676          case float_round_up:
2677             return
2678             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2679         }
2680         return packFloat64( aSign, 0, 0 );
2681     }
2682     lastBitMask = 1;
2683     lastBitMask <<= 0x433 - aExp;
2684     roundBitsMask = lastBitMask - 1;
2685     z = a;
2686     roundingMode = float_rounding_mode();
2687     if ( roundingMode == float_round_nearest_even ) {
2688         z += lastBitMask>>1;
2689         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2690     }
2691     else if ( roundingMode != float_round_to_zero ) {
2692         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2693             z += roundBitsMask;
2694         }
2695     }
2696     z &= ~ roundBitsMask;
2697     if ( z != a ) float_set_inexact();
2698     return z;
2699 
2700 }
2701 #endif
2702 
2703 /*
2704 -------------------------------------------------------------------------------
2705 Returns the result of adding the absolute values of the double-precision
2706 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2707 before being returned.  `zSign' is ignored if the result is a NaN.
2708 The addition is performed according to the IEC/IEEE Standard for Binary
2709 Floating-Point Arithmetic.
2710 -------------------------------------------------------------------------------
2711 */
2712 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2713 {
2714     int16 aExp, bExp, zExp;
2715     bits64 aSig, bSig, zSig;
2716     int16 expDiff;
2717 
2718     aSig = extractFloat64Frac( a );
2719     aExp = extractFloat64Exp( a );
2720     bSig = extractFloat64Frac( b );
2721     bExp = extractFloat64Exp( b );
2722     expDiff = aExp - bExp;
2723     aSig <<= 9;
2724     bSig <<= 9;
2725     if ( 0 < expDiff ) {
2726         if ( aExp == 0x7FF ) {
2727             if ( aSig ) return propagateFloat64NaN( a, b );
2728             return a;
2729         }
2730         if ( bExp == 0 ) {
2731             --expDiff;
2732         }
2733         else {
2734             bSig |= LIT64( 0x2000000000000000 );
2735         }
2736         shift64RightJamming( bSig, expDiff, &bSig );
2737         zExp = aExp;
2738     }
2739     else if ( expDiff < 0 ) {
2740         if ( bExp == 0x7FF ) {
2741             if ( bSig ) return propagateFloat64NaN( a, b );
2742             return packFloat64( zSign, 0x7FF, 0 );
2743         }
2744         if ( aExp == 0 ) {
2745             ++expDiff;
2746         }
2747         else {
2748             aSig |= LIT64( 0x2000000000000000 );
2749         }
2750         shift64RightJamming( aSig, - expDiff, &aSig );
2751         zExp = bExp;
2752     }
2753     else {
2754         if ( aExp == 0x7FF ) {
2755             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2756             return a;
2757         }
2758         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2759         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2760         zExp = aExp;
2761         goto roundAndPack;
2762     }
2763     aSig |= LIT64( 0x2000000000000000 );
2764     zSig = ( aSig + bSig )<<1;
2765     --zExp;
2766     if ( (sbits64) zSig < 0 ) {
2767         zSig = aSig + bSig;
2768         ++zExp;
2769     }
2770  roundAndPack:
2771     return roundAndPackFloat64( zSign, zExp, zSig );
2772 
2773 }
2774 
2775 /*
2776 -------------------------------------------------------------------------------
2777 Returns the result of subtracting the absolute values of the double-
2778 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2779 difference is negated before being returned.  `zSign' is ignored if the
2780 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2781 Standard for Binary Floating-Point Arithmetic.
2782 -------------------------------------------------------------------------------
2783 */
2784 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2785 {
2786     int16 aExp, bExp, zExp;
2787     bits64 aSig, bSig, zSig;
2788     int16 expDiff;
2789 
2790     aSig = extractFloat64Frac( a );
2791     aExp = extractFloat64Exp( a );
2792     bSig = extractFloat64Frac( b );
2793     bExp = extractFloat64Exp( b );
2794     expDiff = aExp - bExp;
2795     aSig <<= 10;
2796     bSig <<= 10;
2797     if ( 0 < expDiff ) goto aExpBigger;
2798     if ( expDiff < 0 ) goto bExpBigger;
2799     if ( aExp == 0x7FF ) {
2800         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2801         float_raise( float_flag_invalid );
2802         return float64_default_nan;
2803     }
2804     if ( aExp == 0 ) {
2805         aExp = 1;
2806         bExp = 1;
2807     }
2808     if ( bSig < aSig ) goto aBigger;
2809     if ( aSig < bSig ) goto bBigger;
2810     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2811  bExpBigger:
2812     if ( bExp == 0x7FF ) {
2813         if ( bSig ) return propagateFloat64NaN( a, b );
2814         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2815     }
2816     if ( aExp == 0 ) {
2817         ++expDiff;
2818     }
2819     else {
2820         aSig |= LIT64( 0x4000000000000000 );
2821     }
2822     shift64RightJamming( aSig, - expDiff, &aSig );
2823     bSig |= LIT64( 0x4000000000000000 );
2824  bBigger:
2825     zSig = bSig - aSig;
2826     zExp = bExp;
2827     zSign ^= 1;
2828     goto normalizeRoundAndPack;
2829  aExpBigger:
2830     if ( aExp == 0x7FF ) {
2831         if ( aSig ) return propagateFloat64NaN( a, b );
2832         return a;
2833     }
2834     if ( bExp == 0 ) {
2835         --expDiff;
2836     }
2837     else {
2838         bSig |= LIT64( 0x4000000000000000 );
2839     }
2840     shift64RightJamming( bSig, expDiff, &bSig );
2841     aSig |= LIT64( 0x4000000000000000 );
2842  aBigger:
2843     zSig = aSig - bSig;
2844     zExp = aExp;
2845  normalizeRoundAndPack:
2846     --zExp;
2847     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2848 
2849 }
2850 
2851 /*
2852 -------------------------------------------------------------------------------
2853 Returns the result of adding the double-precision floating-point values `a'
2854 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2855 Binary Floating-Point Arithmetic.
2856 -------------------------------------------------------------------------------
2857 */
2858 float64 float64_add( float64 a, float64 b )
2859 {
2860     flag aSign, bSign;
2861 
2862     aSign = extractFloat64Sign( a );
2863     bSign = extractFloat64Sign( b );
2864     if ( aSign == bSign ) {
2865         return addFloat64Sigs( a, b, aSign );
2866     }
2867     else {
2868         return subFloat64Sigs( a, b, aSign );
2869     }
2870 
2871 }
2872 
2873 /*
2874 -------------------------------------------------------------------------------
2875 Returns the result of subtracting the double-precision floating-point values
2876 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2877 for Binary Floating-Point Arithmetic.
2878 -------------------------------------------------------------------------------
2879 */
2880 float64 float64_sub( float64 a, float64 b )
2881 {
2882     flag aSign, bSign;
2883 
2884     aSign = extractFloat64Sign( a );
2885     bSign = extractFloat64Sign( b );
2886     if ( aSign == bSign ) {
2887         return subFloat64Sigs( a, b, aSign );
2888     }
2889     else {
2890         return addFloat64Sigs( a, b, aSign );
2891     }
2892 
2893 }
2894 
2895 /*
2896 -------------------------------------------------------------------------------
2897 Returns the result of multiplying the double-precision floating-point values
2898 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2899 for Binary Floating-Point Arithmetic.
2900 -------------------------------------------------------------------------------
2901 */
2902 float64 float64_mul( float64 a, float64 b )
2903 {
2904     flag aSign, bSign, zSign;
2905     int16 aExp, bExp, zExp;
2906     bits64 aSig, bSig, zSig0, zSig1;
2907 
2908     aSig = extractFloat64Frac( a );
2909     aExp = extractFloat64Exp( a );
2910     aSign = extractFloat64Sign( a );
2911     bSig = extractFloat64Frac( b );
2912     bExp = extractFloat64Exp( b );
2913     bSign = extractFloat64Sign( b );
2914     zSign = aSign ^ bSign;
2915     if ( aExp == 0x7FF ) {
2916         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2917             return propagateFloat64NaN( a, b );
2918         }
2919         if ( ( bExp | bSig ) == 0 ) {
2920             float_raise( float_flag_invalid );
2921             return float64_default_nan;
2922         }
2923         return packFloat64( zSign, 0x7FF, 0 );
2924     }
2925     if ( bExp == 0x7FF ) {
2926         if ( bSig ) return propagateFloat64NaN( a, b );
2927         if ( ( aExp | aSig ) == 0 ) {
2928             float_raise( float_flag_invalid );
2929             return float64_default_nan;
2930         }
2931         return packFloat64( zSign, 0x7FF, 0 );
2932     }
2933     if ( aExp == 0 ) {
2934         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2935         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2936     }
2937     if ( bExp == 0 ) {
2938         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2939         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2940     }
2941     zExp = aExp + bExp - 0x3FF;
2942     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2943     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2944     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2945     zSig0 |= ( zSig1 != 0 );
2946     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2947         zSig0 <<= 1;
2948         --zExp;
2949     }
2950     return roundAndPackFloat64( zSign, zExp, zSig0 );
2951 
2952 }
2953 
2954 /*
2955 -------------------------------------------------------------------------------
2956 Returns the result of dividing the double-precision floating-point value `a'
2957 by the corresponding value `b'.  The operation is performed according to
2958 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2959 -------------------------------------------------------------------------------
2960 */
2961 float64 float64_div( float64 a, float64 b )
2962 {
2963     flag aSign, bSign, zSign;
2964     int16 aExp, bExp, zExp;
2965     bits64 aSig, bSig, zSig;
2966     bits64 rem0, rem1;
2967     bits64 term0, term1;
2968 
2969     aSig = extractFloat64Frac( a );
2970     aExp = extractFloat64Exp( a );
2971     aSign = extractFloat64Sign( a );
2972     bSig = extractFloat64Frac( b );
2973     bExp = extractFloat64Exp( b );
2974     bSign = extractFloat64Sign( b );
2975     zSign = aSign ^ bSign;
2976     if ( aExp == 0x7FF ) {
2977         if ( aSig ) return propagateFloat64NaN( a, b );
2978         if ( bExp == 0x7FF ) {
2979             if ( bSig ) return propagateFloat64NaN( a, b );
2980             float_raise( float_flag_invalid );
2981             return float64_default_nan;
2982         }
2983         return packFloat64( zSign, 0x7FF, 0 );
2984     }
2985     if ( bExp == 0x7FF ) {
2986         if ( bSig ) return propagateFloat64NaN( a, b );
2987         return packFloat64( zSign, 0, 0 );
2988     }
2989     if ( bExp == 0 ) {
2990         if ( bSig == 0 ) {
2991             if ( ( aExp | aSig ) == 0 ) {
2992                 float_raise( float_flag_invalid );
2993                 return float64_default_nan;
2994             }
2995             float_raise( float_flag_divbyzero );
2996             return packFloat64( zSign, 0x7FF, 0 );
2997         }
2998         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2999     }
3000     if ( aExp == 0 ) {
3001         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3002         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3003     }
3004     zExp = aExp - bExp + 0x3FD;
3005     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3006     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3007     if ( bSig <= ( aSig + aSig ) ) {
3008         aSig >>= 1;
3009         ++zExp;
3010     }
3011     zSig = estimateDiv128To64( aSig, 0, bSig );
3012     if ( ( zSig & 0x1FF ) <= 2 ) {
3013         mul64To128( bSig, zSig, &term0, &term1 );
3014         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3015         while ( (sbits64) rem0 < 0 ) {
3016             --zSig;
3017             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3018         }
3019         zSig |= ( rem1 != 0 );
3020     }
3021     return roundAndPackFloat64( zSign, zExp, zSig );
3022 
3023 }
3024 
3025 #ifndef SOFTFLOAT_FOR_GCC
3026 /*
3027 -------------------------------------------------------------------------------
3028 Returns the remainder of the double-precision floating-point value `a'
3029 with respect to the corresponding value `b'.  The operation is performed
3030 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3031 -------------------------------------------------------------------------------
3032 */
3033 float64 float64_rem( float64 a, float64 b )
3034 {
3035     flag aSign, bSign, zSign;
3036     int16 aExp, bExp, expDiff;
3037     bits64 aSig, bSig;
3038     bits64 q, alternateASig;
3039     sbits64 sigMean;
3040 
3041     aSig = extractFloat64Frac( a );
3042     aExp = extractFloat64Exp( a );
3043     aSign = extractFloat64Sign( a );
3044     bSig = extractFloat64Frac( b );
3045     bExp = extractFloat64Exp( b );
3046     bSign = extractFloat64Sign( b );
3047     if ( aExp == 0x7FF ) {
3048         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3049             return propagateFloat64NaN( a, b );
3050         }
3051         float_raise( float_flag_invalid );
3052         return float64_default_nan;
3053     }
3054     if ( bExp == 0x7FF ) {
3055         if ( bSig ) return propagateFloat64NaN( a, b );
3056         return a;
3057     }
3058     if ( bExp == 0 ) {
3059         if ( bSig == 0 ) {
3060             float_raise( float_flag_invalid );
3061             return float64_default_nan;
3062         }
3063         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3064     }
3065     if ( aExp == 0 ) {
3066         if ( aSig == 0 ) return a;
3067         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3068     }
3069     expDiff = aExp - bExp;
3070     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3071     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3072     if ( expDiff < 0 ) {
3073         if ( expDiff < -1 ) return a;
3074         aSig >>= 1;
3075     }
3076     q = ( bSig <= aSig );
3077     if ( q ) aSig -= bSig;
3078     expDiff -= 64;
3079     while ( 0 < expDiff ) {
3080         q = estimateDiv128To64( aSig, 0, bSig );
3081         q = ( 2 < q ) ? q - 2 : 0;
3082         aSig = - ( ( bSig>>2 ) * q );
3083         expDiff -= 62;
3084     }
3085     expDiff += 64;
3086     if ( 0 < expDiff ) {
3087         q = estimateDiv128To64( aSig, 0, bSig );
3088         q = ( 2 < q ) ? q - 2 : 0;
3089         q >>= 64 - expDiff;
3090         bSig >>= 2;
3091         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3092     }
3093     else {
3094         aSig >>= 2;
3095         bSig >>= 2;
3096     }
3097     do {
3098         alternateASig = aSig;
3099         ++q;
3100         aSig -= bSig;
3101     } while ( 0 <= (sbits64) aSig );
3102     sigMean = aSig + alternateASig;
3103     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3104         aSig = alternateASig;
3105     }
3106     zSign = ( (sbits64) aSig < 0 );
3107     if ( zSign ) aSig = - aSig;
3108     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3109 
3110 }
3111 
3112 /*
3113 -------------------------------------------------------------------------------
3114 Returns the square root of the double-precision floating-point value `a'.
3115 The operation is performed according to the IEC/IEEE Standard for Binary
3116 Floating-Point Arithmetic.
3117 -------------------------------------------------------------------------------
3118 */
3119 float64 float64_sqrt( float64 a )
3120 {
3121     flag aSign;
3122     int16 aExp, zExp;
3123     bits64 aSig, zSig, doubleZSig;
3124     bits64 rem0, rem1, term0, term1;
3125 
3126     aSig = extractFloat64Frac( a );
3127     aExp = extractFloat64Exp( a );
3128     aSign = extractFloat64Sign( a );
3129     if ( aExp == 0x7FF ) {
3130         if ( aSig ) return propagateFloat64NaN( a, a );
3131         if ( ! aSign ) return a;
3132         float_raise( float_flag_invalid );
3133         return float64_default_nan;
3134     }
3135     if ( aSign ) {
3136         if ( ( aExp | aSig ) == 0 ) return a;
3137         float_raise( float_flag_invalid );
3138         return float64_default_nan;
3139     }
3140     if ( aExp == 0 ) {
3141         if ( aSig == 0 ) return 0;
3142         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3143     }
3144     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3145     aSig |= LIT64( 0x0010000000000000 );
3146     zSig = estimateSqrt32( aExp, aSig>>21 );
3147     aSig <<= 9 - ( aExp & 1 );
3148     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3149     if ( ( zSig & 0x1FF ) <= 5 ) {
3150         doubleZSig = zSig<<1;
3151         mul64To128( zSig, zSig, &term0, &term1 );
3152         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3153         while ( (sbits64) rem0 < 0 ) {
3154             --zSig;
3155             doubleZSig -= 2;
3156             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3157         }
3158         zSig |= ( ( rem0 | rem1 ) != 0 );
3159     }
3160     return roundAndPackFloat64( 0, zExp, zSig );
3161 
3162 }
3163 #endif
3164 
3165 /*
3166 -------------------------------------------------------------------------------
3167 Returns 1 if the double-precision floating-point value `a' is equal to the
3168 corresponding value `b', and 0 otherwise.  The comparison is performed
3169 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3170 -------------------------------------------------------------------------------
3171 */
3172 flag float64_eq( float64 a, float64 b )
3173 {
3174 
3175     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3176          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3177        ) {
3178         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3179             float_raise( float_flag_invalid );
3180         }
3181         return 0;
3182     }
3183     return ( a == b ) ||
3184 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3185 
3186 }
3187 
3188 /*
3189 -------------------------------------------------------------------------------
3190 Returns 1 if the double-precision floating-point value `a' is less than or
3191 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3192 performed according to the IEC/IEEE Standard for Binary Floating-Point
3193 Arithmetic.
3194 -------------------------------------------------------------------------------
3195 */
3196 flag float64_le( float64 a, float64 b )
3197 {
3198     flag aSign, bSign;
3199 
3200     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3201          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3202        ) {
3203         float_raise( float_flag_invalid );
3204         return 0;
3205     }
3206     aSign = extractFloat64Sign( a );
3207     bSign = extractFloat64Sign( b );
3208     if ( aSign != bSign )
3209 	return aSign ||
3210 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3211 	      0 );
3212     return ( a == b ) ||
3213 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3214 
3215 }
3216 
3217 /*
3218 -------------------------------------------------------------------------------
3219 Returns 1 if the double-precision floating-point value `a' is less than
3220 the corresponding value `b', and 0 otherwise.  The comparison is performed
3221 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3222 -------------------------------------------------------------------------------
3223 */
3224 flag float64_lt( float64 a, float64 b )
3225 {
3226     flag aSign, bSign;
3227 
3228     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3229          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3230        ) {
3231         float_raise( float_flag_invalid );
3232         return 0;
3233     }
3234     aSign = extractFloat64Sign( a );
3235     bSign = extractFloat64Sign( b );
3236     if ( aSign != bSign )
3237 	return aSign &&
3238 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3239 	      0 );
3240     return ( a != b ) &&
3241 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3242 
3243 }
3244 
3245 #ifndef SOFTFLOAT_FOR_GCC
3246 /*
3247 -------------------------------------------------------------------------------
3248 Returns 1 if the double-precision floating-point value `a' is equal to the
3249 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3250 if either operand is a NaN.  Otherwise, the comparison is performed
3251 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3252 -------------------------------------------------------------------------------
3253 */
3254 flag float64_eq_signaling( float64 a, float64 b )
3255 {
3256 
3257     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3258          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3259        ) {
3260         float_raise( float_flag_invalid );
3261         return 0;
3262     }
3263     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3264 
3265 }
3266 
3267 /*
3268 -------------------------------------------------------------------------------
3269 Returns 1 if the double-precision floating-point value `a' is less than or
3270 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3271 cause an exception.  Otherwise, the comparison is performed according to the
3272 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3273 -------------------------------------------------------------------------------
3274 */
3275 flag float64_le_quiet( float64 a, float64 b )
3276 {
3277     flag aSign, bSign;
3278 
3279     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3280          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3281        ) {
3282         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3283             float_raise( float_flag_invalid );
3284         }
3285         return 0;
3286     }
3287     aSign = extractFloat64Sign( a );
3288     bSign = extractFloat64Sign( b );
3289     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3290     return ( a == b ) || ( aSign ^ ( a < b ) );
3291 
3292 }
3293 
3294 /*
3295 -------------------------------------------------------------------------------
3296 Returns 1 if the double-precision floating-point value `a' is less than
3297 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3298 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3299 Standard for Binary Floating-Point Arithmetic.
3300 -------------------------------------------------------------------------------
3301 */
3302 flag float64_lt_quiet( float64 a, float64 b )
3303 {
3304     flag aSign, bSign;
3305 
3306     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3307          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3308        ) {
3309         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3310             float_raise( float_flag_invalid );
3311         }
3312         return 0;
3313     }
3314     aSign = extractFloat64Sign( a );
3315     bSign = extractFloat64Sign( b );
3316     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3317     return ( a != b ) && ( aSign ^ ( a < b ) );
3318 
3319 }
3320 #endif
3321 
3322 #ifdef FLOATX80
3323 
3324 /*
3325 -------------------------------------------------------------------------------
3326 Returns the result of converting the extended double-precision floating-
3327 point value `a' to the 32-bit two's complement integer format.  The
3328 conversion is performed according to the IEC/IEEE Standard for Binary
3329 Floating-Point Arithmetic---which means in particular that the conversion
3330 is rounded according to the current rounding mode.  If `a' is a NaN, the
3331 largest positive integer is returned.  Otherwise, if the conversion
3332 overflows, the largest integer with the same sign as `a' is returned.
3333 -------------------------------------------------------------------------------
3334 */
3335 int32 floatx80_to_int32( floatx80 a )
3336 {
3337     flag aSign;
3338     int32 aExp, shiftCount;
3339     bits64 aSig;
3340 
3341     aSig = extractFloatx80Frac( a );
3342     aExp = extractFloatx80Exp( a );
3343     aSign = extractFloatx80Sign( a );
3344     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3345     shiftCount = 0x4037 - aExp;
3346     if ( shiftCount <= 0 ) shiftCount = 1;
3347     shift64RightJamming( aSig, shiftCount, &aSig );
3348     return roundAndPackInt32( aSign, aSig );
3349 
3350 }
3351 
3352 /*
3353 -------------------------------------------------------------------------------
3354 Returns the result of converting the extended double-precision floating-
3355 point value `a' to the 32-bit two's complement integer format.  The
3356 conversion is performed according to the IEC/IEEE Standard for Binary
3357 Floating-Point Arithmetic, except that the conversion is always rounded
3358 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3359 Otherwise, if the conversion overflows, the largest integer with the same
3360 sign as `a' is returned.
3361 -------------------------------------------------------------------------------
3362 */
3363 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3364 {
3365     flag aSign;
3366     int32 aExp, shiftCount;
3367     bits64 aSig, savedASig;
3368     int32 z;
3369 
3370     aSig = extractFloatx80Frac( a );
3371     aExp = extractFloatx80Exp( a );
3372     aSign = extractFloatx80Sign( a );
3373     if ( 0x401E < aExp ) {
3374         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3375         goto invalid;
3376     }
3377     else if ( aExp < 0x3FFF ) {
3378         if ( aExp || aSig ) float_set_inexact();
3379         return 0;
3380     }
3381     shiftCount = 0x403E - aExp;
3382     savedASig = aSig;
3383     aSig >>= shiftCount;
3384     z = aSig;
3385     if ( aSign ) z = - z;
3386     if ( ( z < 0 ) ^ aSign ) {
3387  invalid:
3388         float_raise( float_flag_invalid );
3389         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3390     }
3391     if ( ( aSig<<shiftCount ) != savedASig ) {
3392         float_set_inexact();
3393     }
3394     return z;
3395 
3396 }
3397 
3398 /*
3399 -------------------------------------------------------------------------------
3400 Returns the result of converting the extended double-precision floating-
3401 point value `a' to the 64-bit two's complement integer format.  The
3402 conversion is performed according to the IEC/IEEE Standard for Binary
3403 Floating-Point Arithmetic---which means in particular that the conversion
3404 is rounded according to the current rounding mode.  If `a' is a NaN,
3405 the largest positive integer is returned.  Otherwise, if the conversion
3406 overflows, the largest integer with the same sign as `a' is returned.
3407 -------------------------------------------------------------------------------
3408 */
3409 int64 floatx80_to_int64( floatx80 a )
3410 {
3411     flag aSign;
3412     int32 aExp, shiftCount;
3413     bits64 aSig, aSigExtra;
3414 
3415     aSig = extractFloatx80Frac( a );
3416     aExp = extractFloatx80Exp( a );
3417     aSign = extractFloatx80Sign( a );
3418     shiftCount = 0x403E - aExp;
3419     if ( shiftCount <= 0 ) {
3420         if ( shiftCount ) {
3421             float_raise( float_flag_invalid );
3422             if (    ! aSign
3423                  || (    ( aExp == 0x7FFF )
3424                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3425                ) {
3426                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3427             }
3428             return (sbits64) LIT64( 0x8000000000000000 );
3429         }
3430         aSigExtra = 0;
3431     }
3432     else {
3433         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3434     }
3435     return roundAndPackInt64( aSign, aSig, aSigExtra );
3436 
3437 }
3438 
3439 /*
3440 -------------------------------------------------------------------------------
3441 Returns the result of converting the extended double-precision floating-
3442 point value `a' to the 64-bit two's complement integer format.  The
3443 conversion is performed according to the IEC/IEEE Standard for Binary
3444 Floating-Point Arithmetic, except that the conversion is always rounded
3445 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3446 Otherwise, if the conversion overflows, the largest integer with the same
3447 sign as `a' is returned.
3448 -------------------------------------------------------------------------------
3449 */
3450 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3451 {
3452     flag aSign;
3453     int32 aExp, shiftCount;
3454     bits64 aSig;
3455     int64 z;
3456 
3457     aSig = extractFloatx80Frac( a );
3458     aExp = extractFloatx80Exp( a );
3459     aSign = extractFloatx80Sign( a );
3460     shiftCount = aExp - 0x403E;
3461     if ( 0 <= shiftCount ) {
3462         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3463         if ( ( a.high != 0xC03E ) || aSig ) {
3464             float_raise( float_flag_invalid );
3465             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3466                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3467             }
3468         }
3469         return (sbits64) LIT64( 0x8000000000000000 );
3470     }
3471     else if ( aExp < 0x3FFF ) {
3472         if ( aExp | aSig ) float_set_inexact();
3473         return 0;
3474     }
3475     z = aSig>>( - shiftCount );
3476     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3477         float_set_inexact();
3478     }
3479     if ( aSign ) z = - z;
3480     return z;
3481 
3482 }
3483 
3484 /*
3485 -------------------------------------------------------------------------------
3486 Returns the result of converting the extended double-precision floating-
3487 point value `a' to the single-precision floating-point format.  The
3488 conversion is performed according to the IEC/IEEE Standard for Binary
3489 Floating-Point Arithmetic.
3490 -------------------------------------------------------------------------------
3491 */
3492 float32 floatx80_to_float32( floatx80 a )
3493 {
3494     flag aSign;
3495     int32 aExp;
3496     bits64 aSig;
3497 
3498     aSig = extractFloatx80Frac( a );
3499     aExp = extractFloatx80Exp( a );
3500     aSign = extractFloatx80Sign( a );
3501     if ( aExp == 0x7FFF ) {
3502         if ( (bits64) ( aSig<<1 ) ) {
3503             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3504         }
3505         return packFloat32( aSign, 0xFF, 0 );
3506     }
3507     shift64RightJamming( aSig, 33, &aSig );
3508     if ( aExp || aSig ) aExp -= 0x3F81;
3509     return roundAndPackFloat32( aSign, aExp, aSig );
3510 
3511 }
3512 
3513 /*
3514 -------------------------------------------------------------------------------
3515 Returns the result of converting the extended double-precision floating-
3516 point value `a' to the double-precision floating-point format.  The
3517 conversion is performed according to the IEC/IEEE Standard for Binary
3518 Floating-Point Arithmetic.
3519 -------------------------------------------------------------------------------
3520 */
3521 float64 floatx80_to_float64( floatx80 a )
3522 {
3523     flag aSign;
3524     int32 aExp;
3525     bits64 aSig, zSig;
3526 
3527     aSig = extractFloatx80Frac( a );
3528     aExp = extractFloatx80Exp( a );
3529     aSign = extractFloatx80Sign( a );
3530     if ( aExp == 0x7FFF ) {
3531         if ( (bits64) ( aSig<<1 ) ) {
3532             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3533         }
3534         return packFloat64( aSign, 0x7FF, 0 );
3535     }
3536     shift64RightJamming( aSig, 1, &zSig );
3537     if ( aExp || aSig ) aExp -= 0x3C01;
3538     return roundAndPackFloat64( aSign, aExp, zSig );
3539 
3540 }
3541 
3542 #ifdef FLOAT128
3543 
3544 /*
3545 -------------------------------------------------------------------------------
3546 Returns the result of converting the extended double-precision floating-
3547 point value `a' to the quadruple-precision floating-point format.  The
3548 conversion is performed according to the IEC/IEEE Standard for Binary
3549 Floating-Point Arithmetic.
3550 -------------------------------------------------------------------------------
3551 */
3552 float128 floatx80_to_float128( floatx80 a )
3553 {
3554     flag aSign;
3555     int16 aExp;
3556     bits64 aSig, zSig0, zSig1;
3557 
3558     aSig = extractFloatx80Frac( a );
3559     aExp = extractFloatx80Exp( a );
3560     aSign = extractFloatx80Sign( a );
3561     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3562         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3563     }
3564     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3565     return packFloat128( aSign, aExp, zSig0, zSig1 );
3566 
3567 }
3568 
3569 #endif
3570 
3571 /*
3572 -------------------------------------------------------------------------------
3573 Rounds the extended double-precision floating-point value `a' to an integer,
3574 and returns the result as an extended quadruple-precision floating-point
3575 value.  The operation is performed according to the IEC/IEEE Standard for
3576 Binary Floating-Point Arithmetic.
3577 -------------------------------------------------------------------------------
3578 */
3579 floatx80 floatx80_round_to_int( floatx80 a )
3580 {
3581     flag aSign;
3582     int32 aExp;
3583     bits64 lastBitMask, roundBitsMask;
3584     int8 roundingMode;
3585     floatx80 z;
3586 
3587     aExp = extractFloatx80Exp( a );
3588     if ( 0x403E <= aExp ) {
3589         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3590             return propagateFloatx80NaN( a, a );
3591         }
3592         return a;
3593     }
3594     if ( aExp < 0x3FFF ) {
3595         if (    ( aExp == 0 )
3596              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3597             return a;
3598         }
3599         float_set_inexact();
3600         aSign = extractFloatx80Sign( a );
3601         switch ( float_rounding_mode() ) {
3602          case float_round_nearest_even:
3603             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3604                ) {
3605                 return
3606                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3607             }
3608             break;
3609          case float_round_down:
3610             return
3611                   aSign ?
3612                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3613                 : packFloatx80( 0, 0, 0 );
3614          case float_round_up:
3615             return
3616                   aSign ? packFloatx80( 1, 0, 0 )
3617                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3618         }
3619         return packFloatx80( aSign, 0, 0 );
3620     }
3621     lastBitMask = 1;
3622     lastBitMask <<= 0x403E - aExp;
3623     roundBitsMask = lastBitMask - 1;
3624     z = a;
3625     roundingMode = float_rounding_mode();
3626     if ( roundingMode == float_round_nearest_even ) {
3627         z.low += lastBitMask>>1;
3628         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3629     }
3630     else if ( roundingMode != float_round_to_zero ) {
3631         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3632             z.low += roundBitsMask;
3633         }
3634     }
3635     z.low &= ~ roundBitsMask;
3636     if ( z.low == 0 ) {
3637         ++z.high;
3638         z.low = LIT64( 0x8000000000000000 );
3639     }
3640     if ( z.low != a.low ) float_set_inexact();
3641     return z;
3642 
3643 }
3644 
3645 /*
3646 -------------------------------------------------------------------------------
3647 Returns the result of adding the absolute values of the extended double-
3648 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3649 negated before being returned.  `zSign' is ignored if the result is a NaN.
3650 The addition is performed according to the IEC/IEEE Standard for Binary
3651 Floating-Point Arithmetic.
3652 -------------------------------------------------------------------------------
3653 */
3654 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3655 {
3656     int32 aExp, bExp, zExp;
3657     bits64 aSig, bSig, zSig0, zSig1;
3658     int32 expDiff;
3659 
3660     aSig = extractFloatx80Frac( a );
3661     aExp = extractFloatx80Exp( a );
3662     bSig = extractFloatx80Frac( b );
3663     bExp = extractFloatx80Exp( b );
3664     expDiff = aExp - bExp;
3665     if ( 0 < expDiff ) {
3666         if ( aExp == 0x7FFF ) {
3667             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3668             return a;
3669         }
3670         if ( bExp == 0 ) --expDiff;
3671         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3672         zExp = aExp;
3673     }
3674     else if ( expDiff < 0 ) {
3675         if ( bExp == 0x7FFF ) {
3676             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3677             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3678         }
3679         if ( aExp == 0 ) ++expDiff;
3680         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3681         zExp = bExp;
3682     }
3683     else {
3684         if ( aExp == 0x7FFF ) {
3685             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3686                 return propagateFloatx80NaN( a, b );
3687             }
3688             return a;
3689         }
3690         zSig1 = 0;
3691         zSig0 = aSig + bSig;
3692         if ( aExp == 0 ) {
3693             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3694             goto roundAndPack;
3695         }
3696         zExp = aExp;
3697         goto shiftRight1;
3698     }
3699     zSig0 = aSig + bSig;
3700     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3701  shiftRight1:
3702     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3703     zSig0 |= LIT64( 0x8000000000000000 );
3704     ++zExp;
3705  roundAndPack:
3706     return
3707         roundAndPackFloatx80(
3708             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3709 
3710 }
3711 
3712 /*
3713 -------------------------------------------------------------------------------
3714 Returns the result of subtracting the absolute values of the extended
3715 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3716 difference is negated before being returned.  `zSign' is ignored if the
3717 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3718 Standard for Binary Floating-Point Arithmetic.
3719 -------------------------------------------------------------------------------
3720 */
3721 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3722 {
3723     int32 aExp, bExp, zExp;
3724     bits64 aSig, bSig, zSig0, zSig1;
3725     int32 expDiff;
3726     floatx80 z;
3727 
3728     aSig = extractFloatx80Frac( a );
3729     aExp = extractFloatx80Exp( a );
3730     bSig = extractFloatx80Frac( b );
3731     bExp = extractFloatx80Exp( b );
3732     expDiff = aExp - bExp;
3733     if ( 0 < expDiff ) goto aExpBigger;
3734     if ( expDiff < 0 ) goto bExpBigger;
3735     if ( aExp == 0x7FFF ) {
3736         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3737             return propagateFloatx80NaN( a, b );
3738         }
3739         float_raise( float_flag_invalid );
3740         z.low = floatx80_default_nan_low;
3741         z.high = floatx80_default_nan_high;
3742         return z;
3743     }
3744     if ( aExp == 0 ) {
3745         aExp = 1;
3746         bExp = 1;
3747     }
3748     zSig1 = 0;
3749     if ( bSig < aSig ) goto aBigger;
3750     if ( aSig < bSig ) goto bBigger;
3751     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3752  bExpBigger:
3753     if ( bExp == 0x7FFF ) {
3754         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3755         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3756     }
3757     if ( aExp == 0 ) ++expDiff;
3758     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3759  bBigger:
3760     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3761     zExp = bExp;
3762     zSign ^= 1;
3763     goto normalizeRoundAndPack;
3764  aExpBigger:
3765     if ( aExp == 0x7FFF ) {
3766         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3767         return a;
3768     }
3769     if ( bExp == 0 ) --expDiff;
3770     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3771  aBigger:
3772     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3773     zExp = aExp;
3774  normalizeRoundAndPack:
3775     return
3776         normalizeRoundAndPackFloatx80(
3777             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3778 
3779 }
3780 
3781 /*
3782 -------------------------------------------------------------------------------
3783 Returns the result of adding the extended double-precision floating-point
3784 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3785 Standard for Binary Floating-Point Arithmetic.
3786 -------------------------------------------------------------------------------
3787 */
3788 floatx80 floatx80_add( floatx80 a, floatx80 b )
3789 {
3790     flag aSign, bSign;
3791 
3792     aSign = extractFloatx80Sign( a );
3793     bSign = extractFloatx80Sign( b );
3794     if ( aSign == bSign ) {
3795         return addFloatx80Sigs( a, b, aSign );
3796     }
3797     else {
3798         return subFloatx80Sigs( a, b, aSign );
3799     }
3800 
3801 }
3802 
3803 /*
3804 -------------------------------------------------------------------------------
3805 Returns the result of subtracting the extended double-precision floating-
3806 point values `a' and `b'.  The operation is performed according to the
3807 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3808 -------------------------------------------------------------------------------
3809 */
3810 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3811 {
3812     flag aSign, bSign;
3813 
3814     aSign = extractFloatx80Sign( a );
3815     bSign = extractFloatx80Sign( b );
3816     if ( aSign == bSign ) {
3817         return subFloatx80Sigs( a, b, aSign );
3818     }
3819     else {
3820         return addFloatx80Sigs( a, b, aSign );
3821     }
3822 
3823 }
3824 
3825 /*
3826 -------------------------------------------------------------------------------
3827 Returns the result of multiplying the extended double-precision floating-
3828 point values `a' and `b'.  The operation is performed according to the
3829 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3830 -------------------------------------------------------------------------------
3831 */
3832 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3833 {
3834     flag aSign, bSign, zSign;
3835     int32 aExp, bExp, zExp;
3836     bits64 aSig, bSig, zSig0, zSig1;
3837     floatx80 z;
3838 
3839     aSig = extractFloatx80Frac( a );
3840     aExp = extractFloatx80Exp( a );
3841     aSign = extractFloatx80Sign( a );
3842     bSig = extractFloatx80Frac( b );
3843     bExp = extractFloatx80Exp( b );
3844     bSign = extractFloatx80Sign( b );
3845     zSign = aSign ^ bSign;
3846     if ( aExp == 0x7FFF ) {
3847         if (    (bits64) ( aSig<<1 )
3848              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3849             return propagateFloatx80NaN( a, b );
3850         }
3851         if ( ( bExp | bSig ) == 0 ) goto invalid;
3852         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3853     }
3854     if ( bExp == 0x7FFF ) {
3855         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3856         if ( ( aExp | aSig ) == 0 ) {
3857  invalid:
3858             float_raise( float_flag_invalid );
3859             z.low = floatx80_default_nan_low;
3860             z.high = floatx80_default_nan_high;
3861             return z;
3862         }
3863         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3864     }
3865     if ( aExp == 0 ) {
3866         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3867         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3868     }
3869     if ( bExp == 0 ) {
3870         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3871         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3872     }
3873     zExp = aExp + bExp - 0x3FFE;
3874     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3875     if ( 0 < (sbits64) zSig0 ) {
3876         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3877         --zExp;
3878     }
3879     return
3880         roundAndPackFloatx80(
3881             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3882 
3883 }
3884 
3885 /*
3886 -------------------------------------------------------------------------------
3887 Returns the result of dividing the extended double-precision floating-point
3888 value `a' by the corresponding value `b'.  The operation is performed
3889 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3890 -------------------------------------------------------------------------------
3891 */
3892 floatx80 floatx80_div( floatx80 a, floatx80 b )
3893 {
3894     flag aSign, bSign, zSign;
3895     int32 aExp, bExp, zExp;
3896     bits64 aSig, bSig, zSig0, zSig1;
3897     bits64 rem0, rem1, rem2, term0, term1, term2;
3898     floatx80 z;
3899 
3900     aSig = extractFloatx80Frac( a );
3901     aExp = extractFloatx80Exp( a );
3902     aSign = extractFloatx80Sign( a );
3903     bSig = extractFloatx80Frac( b );
3904     bExp = extractFloatx80Exp( b );
3905     bSign = extractFloatx80Sign( b );
3906     zSign = aSign ^ bSign;
3907     if ( aExp == 0x7FFF ) {
3908         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3909         if ( bExp == 0x7FFF ) {
3910             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3911             goto invalid;
3912         }
3913         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3914     }
3915     if ( bExp == 0x7FFF ) {
3916         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3917         return packFloatx80( zSign, 0, 0 );
3918     }
3919     if ( bExp == 0 ) {
3920         if ( bSig == 0 ) {
3921             if ( ( aExp | aSig ) == 0 ) {
3922  invalid:
3923                 float_raise( float_flag_invalid );
3924                 z.low = floatx80_default_nan_low;
3925                 z.high = floatx80_default_nan_high;
3926                 return z;
3927             }
3928             float_raise( float_flag_divbyzero );
3929             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3930         }
3931         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3932     }
3933     if ( aExp == 0 ) {
3934         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3935         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3936     }
3937     zExp = aExp - bExp + 0x3FFE;
3938     rem1 = 0;
3939     if ( bSig <= aSig ) {
3940         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3941         ++zExp;
3942     }
3943     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3944     mul64To128( bSig, zSig0, &term0, &term1 );
3945     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3946     while ( (sbits64) rem0 < 0 ) {
3947         --zSig0;
3948         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3949     }
3950     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3951     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3952         mul64To128( bSig, zSig1, &term1, &term2 );
3953         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3954         while ( (sbits64) rem1 < 0 ) {
3955             --zSig1;
3956             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3957         }
3958         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3959     }
3960     return
3961         roundAndPackFloatx80(
3962             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3963 
3964 }
3965 
3966 /*
3967 -------------------------------------------------------------------------------
3968 Returns the remainder of the extended double-precision floating-point value
3969 `a' with respect to the corresponding value `b'.  The operation is performed
3970 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3971 -------------------------------------------------------------------------------
3972 */
3973 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3974 {
3975     flag aSign, bSign, zSign;
3976     int32 aExp, bExp, expDiff;
3977     bits64 aSig0, aSig1, bSig;
3978     bits64 q, term0, term1, alternateASig0, alternateASig1;
3979     floatx80 z;
3980 
3981     aSig0 = extractFloatx80Frac( a );
3982     aExp = extractFloatx80Exp( a );
3983     aSign = extractFloatx80Sign( a );
3984     bSig = extractFloatx80Frac( b );
3985     bExp = extractFloatx80Exp( b );
3986     bSign = extractFloatx80Sign( b );
3987     if ( aExp == 0x7FFF ) {
3988         if (    (bits64) ( aSig0<<1 )
3989              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3990             return propagateFloatx80NaN( a, b );
3991         }
3992         goto invalid;
3993     }
3994     if ( bExp == 0x7FFF ) {
3995         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3996         return a;
3997     }
3998     if ( bExp == 0 ) {
3999         if ( bSig == 0 ) {
4000  invalid:
4001             float_raise( float_flag_invalid );
4002             z.low = floatx80_default_nan_low;
4003             z.high = floatx80_default_nan_high;
4004             return z;
4005         }
4006         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4007     }
4008     if ( aExp == 0 ) {
4009         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4010         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4011     }
4012     bSig |= LIT64( 0x8000000000000000 );
4013     zSign = aSign;
4014     expDiff = aExp - bExp;
4015     aSig1 = 0;
4016     if ( expDiff < 0 ) {
4017         if ( expDiff < -1 ) return a;
4018         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4019         expDiff = 0;
4020     }
4021     q = ( bSig <= aSig0 );
4022     if ( q ) aSig0 -= bSig;
4023     expDiff -= 64;
4024     while ( 0 < expDiff ) {
4025         q = estimateDiv128To64( aSig0, aSig1, bSig );
4026         q = ( 2 < q ) ? q - 2 : 0;
4027         mul64To128( bSig, q, &term0, &term1 );
4028         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4029         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4030         expDiff -= 62;
4031     }
4032     expDiff += 64;
4033     if ( 0 < expDiff ) {
4034         q = estimateDiv128To64( aSig0, aSig1, bSig );
4035         q = ( 2 < q ) ? q - 2 : 0;
4036         q >>= 64 - expDiff;
4037         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4038         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4039         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4040         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4041             ++q;
4042             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4043         }
4044     }
4045     else {
4046         term1 = 0;
4047         term0 = bSig;
4048     }
4049     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4050     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4051          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4052               && ( q & 1 ) )
4053        ) {
4054         aSig0 = alternateASig0;
4055         aSig1 = alternateASig1;
4056         zSign = ! zSign;
4057     }
4058     return
4059         normalizeRoundAndPackFloatx80(
4060             80, zSign, bExp + expDiff, aSig0, aSig1 );
4061 
4062 }
4063 
4064 /*
4065 -------------------------------------------------------------------------------
4066 Returns the square root of the extended double-precision floating-point
4067 value `a'.  The operation is performed according to the IEC/IEEE Standard
4068 for Binary Floating-Point Arithmetic.
4069 -------------------------------------------------------------------------------
4070 */
4071 floatx80 floatx80_sqrt( floatx80 a )
4072 {
4073     flag aSign;
4074     int32 aExp, zExp;
4075     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4076     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4077     floatx80 z;
4078 
4079     aSig0 = extractFloatx80Frac( a );
4080     aExp = extractFloatx80Exp( a );
4081     aSign = extractFloatx80Sign( a );
4082     if ( aExp == 0x7FFF ) {
4083         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4084         if ( ! aSign ) return a;
4085         goto invalid;
4086     }
4087     if ( aSign ) {
4088         if ( ( aExp | aSig0 ) == 0 ) return a;
4089  invalid:
4090         float_raise( float_flag_invalid );
4091         z.low = floatx80_default_nan_low;
4092         z.high = floatx80_default_nan_high;
4093         return z;
4094     }
4095     if ( aExp == 0 ) {
4096         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4097         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4098     }
4099     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4100     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4101     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4102     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4103     doubleZSig0 = zSig0<<1;
4104     mul64To128( zSig0, zSig0, &term0, &term1 );
4105     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4106     while ( (sbits64) rem0 < 0 ) {
4107         --zSig0;
4108         doubleZSig0 -= 2;
4109         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4110     }
4111     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4112     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4113         if ( zSig1 == 0 ) zSig1 = 1;
4114         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4115         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4116         mul64To128( zSig1, zSig1, &term2, &term3 );
4117         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4118         while ( (sbits64) rem1 < 0 ) {
4119             --zSig1;
4120             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4121             term3 |= 1;
4122             term2 |= doubleZSig0;
4123             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4124         }
4125         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4126     }
4127     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4128     zSig0 |= doubleZSig0;
4129     return
4130         roundAndPackFloatx80(
4131             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4132 
4133 }
4134 
4135 /*
4136 -------------------------------------------------------------------------------
4137 Returns 1 if the extended double-precision floating-point value `a' is
4138 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4139 performed according to the IEC/IEEE Standard for Binary Floating-Point
4140 Arithmetic.
4141 -------------------------------------------------------------------------------
4142 */
4143 flag floatx80_eq( floatx80 a, floatx80 b )
4144 {
4145 
4146     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4147               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4148          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4149               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4150        ) {
4151         if (    floatx80_is_signaling_nan( a )
4152              || floatx80_is_signaling_nan( b ) ) {
4153             float_raise( float_flag_invalid );
4154         }
4155         return 0;
4156     }
4157     return
4158            ( a.low == b.low )
4159         && (    ( a.high == b.high )
4160              || (    ( a.low == 0 )
4161                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4162            );
4163 
4164 }
4165 
4166 /*
4167 -------------------------------------------------------------------------------
4168 Returns 1 if the extended double-precision floating-point value `a' is
4169 less than or equal to the corresponding value `b', and 0 otherwise.  The
4170 comparison is performed according to the IEC/IEEE Standard for Binary
4171 Floating-Point Arithmetic.
4172 -------------------------------------------------------------------------------
4173 */
4174 flag floatx80_le( floatx80 a, floatx80 b )
4175 {
4176     flag aSign, bSign;
4177 
4178     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4179               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4180          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4181               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4182        ) {
4183         float_raise( float_flag_invalid );
4184         return 0;
4185     }
4186     aSign = extractFloatx80Sign( a );
4187     bSign = extractFloatx80Sign( b );
4188     if ( aSign != bSign ) {
4189         return
4190                aSign
4191             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4192                  == 0 );
4193     }
4194     return
4195           aSign ? le128( b.high, b.low, a.high, a.low )
4196         : le128( a.high, a.low, b.high, b.low );
4197 
4198 }
4199 
4200 /*
4201 -------------------------------------------------------------------------------
4202 Returns 1 if the extended double-precision floating-point value `a' is
4203 less than the corresponding value `b', and 0 otherwise.  The comparison
4204 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4205 Arithmetic.
4206 -------------------------------------------------------------------------------
4207 */
4208 flag floatx80_lt( floatx80 a, floatx80 b )
4209 {
4210     flag aSign, bSign;
4211 
4212     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4213               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4214          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4215               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4216        ) {
4217         float_raise( float_flag_invalid );
4218         return 0;
4219     }
4220     aSign = extractFloatx80Sign( a );
4221     bSign = extractFloatx80Sign( b );
4222     if ( aSign != bSign ) {
4223         return
4224                aSign
4225             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4226                  != 0 );
4227     }
4228     return
4229           aSign ? lt128( b.high, b.low, a.high, a.low )
4230         : lt128( a.high, a.low, b.high, b.low );
4231 
4232 }
4233 
4234 /*
4235 -------------------------------------------------------------------------------
4236 Returns 1 if the extended double-precision floating-point value `a' is equal
4237 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4238 raised if either operand is a NaN.  Otherwise, the comparison is performed
4239 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4240 -------------------------------------------------------------------------------
4241 */
4242 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4243 {
4244 
4245     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4246               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4247          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4248               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4249        ) {
4250         float_raise( float_flag_invalid );
4251         return 0;
4252     }
4253     return
4254            ( a.low == b.low )
4255         && (    ( a.high == b.high )
4256              || (    ( a.low == 0 )
4257                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4258            );
4259 
4260 }
4261 
4262 /*
4263 -------------------------------------------------------------------------------
4264 Returns 1 if the extended double-precision floating-point value `a' is less
4265 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4266 do not cause an exception.  Otherwise, the comparison is performed according
4267 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4268 -------------------------------------------------------------------------------
4269 */
4270 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4271 {
4272     flag aSign, bSign;
4273 
4274     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4275               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4276          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4277               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4278        ) {
4279         if (    floatx80_is_signaling_nan( a )
4280              || floatx80_is_signaling_nan( b ) ) {
4281             float_raise( float_flag_invalid );
4282         }
4283         return 0;
4284     }
4285     aSign = extractFloatx80Sign( a );
4286     bSign = extractFloatx80Sign( b );
4287     if ( aSign != bSign ) {
4288         return
4289                aSign
4290             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4291                  == 0 );
4292     }
4293     return
4294           aSign ? le128( b.high, b.low, a.high, a.low )
4295         : le128( a.high, a.low, b.high, b.low );
4296 
4297 }
4298 
4299 /*
4300 -------------------------------------------------------------------------------
4301 Returns 1 if the extended double-precision floating-point value `a' is less
4302 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4303 an exception.  Otherwise, the comparison is performed according to the
4304 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4305 -------------------------------------------------------------------------------
4306 */
4307 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4308 {
4309     flag aSign, bSign;
4310 
4311     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4312               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4313          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4314               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4315        ) {
4316         if (    floatx80_is_signaling_nan( a )
4317              || floatx80_is_signaling_nan( b ) ) {
4318             float_raise( float_flag_invalid );
4319         }
4320         return 0;
4321     }
4322     aSign = extractFloatx80Sign( a );
4323     bSign = extractFloatx80Sign( b );
4324     if ( aSign != bSign ) {
4325         return
4326                aSign
4327             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4328                  != 0 );
4329     }
4330     return
4331           aSign ? lt128( b.high, b.low, a.high, a.low )
4332         : lt128( a.high, a.low, b.high, b.low );
4333 
4334 }
4335 
4336 #endif
4337 
4338 #ifdef FLOAT128
4339 
4340 /*
4341 -------------------------------------------------------------------------------
4342 Returns the result of converting the quadruple-precision floating-point
4343 value `a' to the 32-bit two's complement integer format.  The conversion
4344 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4345 Arithmetic---which means in particular that the conversion is rounded
4346 according to the current rounding mode.  If `a' is a NaN, the largest
4347 positive integer is returned.  Otherwise, if the conversion overflows, the
4348 largest integer with the same sign as `a' is returned.
4349 -------------------------------------------------------------------------------
4350 */
4351 int32 float128_to_int32( float128 a )
4352 {
4353     flag aSign;
4354     int32 aExp, shiftCount;
4355     bits64 aSig0, aSig1;
4356 
4357     aSig1 = extractFloat128Frac1( a );
4358     aSig0 = extractFloat128Frac0( a );
4359     aExp = extractFloat128Exp( a );
4360     aSign = extractFloat128Sign( a );
4361     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4362     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4363     aSig0 |= ( aSig1 != 0 );
4364     shiftCount = 0x4028 - aExp;
4365     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4366     return roundAndPackInt32( aSign, aSig0 );
4367 
4368 }
4369 
4370 /*
4371 -------------------------------------------------------------------------------
4372 Returns the result of converting the quadruple-precision floating-point
4373 value `a' to the 32-bit two's complement integer format.  The conversion
4374 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4375 Arithmetic, except that the conversion is always rounded toward zero.  If
4376 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4377 conversion overflows, the largest integer with the same sign as `a' is
4378 returned.
4379 -------------------------------------------------------------------------------
4380 */
4381 int32 float128_to_int32_round_to_zero( float128 a )
4382 {
4383     flag aSign;
4384     int32 aExp, shiftCount;
4385     bits64 aSig0, aSig1, savedASig;
4386     int32 z;
4387 
4388     aSig1 = extractFloat128Frac1( a );
4389     aSig0 = extractFloat128Frac0( a );
4390     aExp = extractFloat128Exp( a );
4391     aSign = extractFloat128Sign( a );
4392     aSig0 |= ( aSig1 != 0 );
4393     if ( 0x401E < aExp ) {
4394         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4395         goto invalid;
4396     }
4397     else if ( aExp < 0x3FFF ) {
4398         if ( aExp || aSig0 ) float_set_inexact();
4399         return 0;
4400     }
4401     aSig0 |= LIT64( 0x0001000000000000 );
4402     shiftCount = 0x402F - aExp;
4403     savedASig = aSig0;
4404     aSig0 >>= shiftCount;
4405     z = aSig0;
4406     if ( aSign ) z = - z;
4407     if ( ( z < 0 ) ^ aSign ) {
4408  invalid:
4409         float_raise( float_flag_invalid );
4410         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4411     }
4412     if ( ( aSig0<<shiftCount ) != savedASig ) {
4413         float_set_inexact();
4414     }
4415     return z;
4416 
4417 }
4418 
4419 /*
4420 -------------------------------------------------------------------------------
4421 Returns the result of converting the quadruple-precision floating-point
4422 value `a' to the 64-bit two's complement integer format.  The conversion
4423 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4424 Arithmetic---which means in particular that the conversion is rounded
4425 according to the current rounding mode.  If `a' is a NaN, the largest
4426 positive integer is returned.  Otherwise, if the conversion overflows, the
4427 largest integer with the same sign as `a' is returned.
4428 -------------------------------------------------------------------------------
4429 */
4430 int64 float128_to_int64( float128 a )
4431 {
4432     flag aSign;
4433     int32 aExp, shiftCount;
4434     bits64 aSig0, aSig1;
4435 
4436     aSig1 = extractFloat128Frac1( a );
4437     aSig0 = extractFloat128Frac0( a );
4438     aExp = extractFloat128Exp( a );
4439     aSign = extractFloat128Sign( a );
4440     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4441     shiftCount = 0x402F - aExp;
4442     if ( shiftCount <= 0 ) {
4443         if ( 0x403E < aExp ) {
4444             float_raise( float_flag_invalid );
4445             if (    ! aSign
4446                  || (    ( aExp == 0x7FFF )
4447                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4448                     )
4449                ) {
4450                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4451             }
4452             return (sbits64) LIT64( 0x8000000000000000 );
4453         }
4454         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4455     }
4456     else {
4457         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4458     }
4459     return roundAndPackInt64( aSign, aSig0, aSig1 );
4460 
4461 }
4462 
4463 /*
4464 -------------------------------------------------------------------------------
4465 Returns the result of converting the quadruple-precision floating-point
4466 value `a' to the 64-bit two's complement integer format.  The conversion
4467 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4468 Arithmetic, except that the conversion is always rounded toward zero.
4469 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4470 the conversion overflows, the largest integer with the same sign as `a' is
4471 returned.
4472 -------------------------------------------------------------------------------
4473 */
4474 int64 float128_to_int64_round_to_zero( float128 a )
4475 {
4476     flag aSign;
4477     int32 aExp, shiftCount;
4478     bits64 aSig0, aSig1;
4479     int64 z;
4480 
4481     aSig1 = extractFloat128Frac1( a );
4482     aSig0 = extractFloat128Frac0( a );
4483     aExp = extractFloat128Exp( a );
4484     aSign = extractFloat128Sign( a );
4485     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4486     shiftCount = aExp - 0x402F;
4487     if ( 0 < shiftCount ) {
4488         if ( 0x403E <= aExp ) {
4489             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4490             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4491                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4492                 if ( aSig1 ) float_set_inexact();
4493             }
4494             else {
4495                 float_raise( float_flag_invalid );
4496                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4497                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4498                 }
4499             }
4500             return (sbits64) LIT64( 0x8000000000000000 );
4501         }
4502         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4503         if ( (bits64) ( aSig1<<shiftCount ) ) {
4504             float_set_inexact();
4505         }
4506     }
4507     else {
4508         if ( aExp < 0x3FFF ) {
4509             if ( aExp | aSig0 | aSig1 ) {
4510                 float_set_inexact();
4511             }
4512             return 0;
4513         }
4514         z = aSig0>>( - shiftCount );
4515         if (    aSig1
4516              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4517             float_set_inexact();
4518         }
4519     }
4520     if ( aSign ) z = - z;
4521     return z;
4522 
4523 }
4524 
4525 /*
4526 -------------------------------------------------------------------------------
4527 Returns the result of converting the quadruple-precision floating-point
4528 value `a' to the single-precision floating-point format.  The conversion
4529 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4530 Arithmetic.
4531 -------------------------------------------------------------------------------
4532 */
4533 float32 float128_to_float32( float128 a )
4534 {
4535     flag aSign;
4536     int32 aExp;
4537     bits64 aSig0, aSig1;
4538     bits32 zSig;
4539 
4540     aSig1 = extractFloat128Frac1( a );
4541     aSig0 = extractFloat128Frac0( a );
4542     aExp = extractFloat128Exp( a );
4543     aSign = extractFloat128Sign( a );
4544     if ( aExp == 0x7FFF ) {
4545         if ( aSig0 | aSig1 ) {
4546             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4547         }
4548         return packFloat32( aSign, 0xFF, 0 );
4549     }
4550     aSig0 |= ( aSig1 != 0 );
4551     shift64RightJamming( aSig0, 18, &aSig0 );
4552     zSig = aSig0;
4553     if ( aExp || zSig ) {
4554         zSig |= 0x40000000;
4555         aExp -= 0x3F81;
4556     }
4557     return roundAndPackFloat32( aSign, aExp, zSig );
4558 
4559 }
4560 
4561 /*
4562 -------------------------------------------------------------------------------
4563 Returns the result of converting the quadruple-precision floating-point
4564 value `a' to the double-precision floating-point format.  The conversion
4565 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4566 Arithmetic.
4567 -------------------------------------------------------------------------------
4568 */
4569 float64 float128_to_float64( float128 a )
4570 {
4571     flag aSign;
4572     int32 aExp;
4573     bits64 aSig0, aSig1;
4574 
4575     aSig1 = extractFloat128Frac1( a );
4576     aSig0 = extractFloat128Frac0( a );
4577     aExp = extractFloat128Exp( a );
4578     aSign = extractFloat128Sign( a );
4579     if ( aExp == 0x7FFF ) {
4580         if ( aSig0 | aSig1 ) {
4581             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4582         }
4583         return packFloat64( aSign, 0x7FF, 0 );
4584     }
4585     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4586     aSig0 |= ( aSig1 != 0 );
4587     if ( aExp || aSig0 ) {
4588         aSig0 |= LIT64( 0x4000000000000000 );
4589         aExp -= 0x3C01;
4590     }
4591     return roundAndPackFloat64( aSign, aExp, aSig0 );
4592 
4593 }
4594 
4595 #ifdef FLOATX80
4596 
4597 /*
4598 -------------------------------------------------------------------------------
4599 Returns the result of converting the quadruple-precision floating-point
4600 value `a' to the extended double-precision floating-point format.  The
4601 conversion is performed according to the IEC/IEEE Standard for Binary
4602 Floating-Point Arithmetic.
4603 -------------------------------------------------------------------------------
4604 */
4605 floatx80 float128_to_floatx80( float128 a )
4606 {
4607     flag aSign;
4608     int32 aExp;
4609     bits64 aSig0, aSig1;
4610 
4611     aSig1 = extractFloat128Frac1( a );
4612     aSig0 = extractFloat128Frac0( a );
4613     aExp = extractFloat128Exp( a );
4614     aSign = extractFloat128Sign( a );
4615     if ( aExp == 0x7FFF ) {
4616         if ( aSig0 | aSig1 ) {
4617             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4618         }
4619         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4620     }
4621     if ( aExp == 0 ) {
4622         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4623         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4624     }
4625     else {
4626         aSig0 |= LIT64( 0x0001000000000000 );
4627     }
4628     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4629     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4630 
4631 }
4632 
4633 #endif
4634 
4635 /*
4636 -------------------------------------------------------------------------------
4637 Rounds the quadruple-precision floating-point value `a' to an integer, and
4638 returns the result as a quadruple-precision floating-point value.  The
4639 operation is performed according to the IEC/IEEE Standard for Binary
4640 Floating-Point Arithmetic.
4641 -------------------------------------------------------------------------------
4642 */
4643 float128 float128_round_to_int( float128 a )
4644 {
4645     flag aSign;
4646     int32 aExp;
4647     bits64 lastBitMask, roundBitsMask;
4648     int8 roundingMode;
4649     float128 z;
4650 
4651     aExp = extractFloat128Exp( a );
4652     if ( 0x402F <= aExp ) {
4653         if ( 0x406F <= aExp ) {
4654             if (    ( aExp == 0x7FFF )
4655                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4656                ) {
4657                 return propagateFloat128NaN( a, a );
4658             }
4659             return a;
4660         }
4661         lastBitMask = 1;
4662         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4663         roundBitsMask = lastBitMask - 1;
4664         z = a;
4665         roundingMode = float_rounding_mode();
4666         if ( roundingMode == float_round_nearest_even ) {
4667             if ( lastBitMask ) {
4668                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4669                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4670             }
4671             else {
4672                 if ( (sbits64) z.low < 0 ) {
4673                     ++z.high;
4674                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4675                 }
4676             }
4677         }
4678         else if ( roundingMode != float_round_to_zero ) {
4679             if (   extractFloat128Sign( z )
4680                  ^ ( roundingMode == float_round_up ) ) {
4681                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4682             }
4683         }
4684         z.low &= ~ roundBitsMask;
4685     }
4686     else {
4687         if ( aExp < 0x3FFF ) {
4688             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4689             float_set_inexact();
4690             aSign = extractFloat128Sign( a );
4691             switch ( float_rounding_mode() ) {
4692              case float_round_nearest_even:
4693                 if (    ( aExp == 0x3FFE )
4694                      && (   extractFloat128Frac0( a )
4695                           | extractFloat128Frac1( a ) )
4696                    ) {
4697                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4698                 }
4699                 break;
4700              case float_round_down:
4701                 return
4702                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4703                     : packFloat128( 0, 0, 0, 0 );
4704              case float_round_up:
4705                 return
4706                       aSign ? packFloat128( 1, 0, 0, 0 )
4707                     : packFloat128( 0, 0x3FFF, 0, 0 );
4708             }
4709             return packFloat128( aSign, 0, 0, 0 );
4710         }
4711         lastBitMask = 1;
4712         lastBitMask <<= 0x402F - aExp;
4713         roundBitsMask = lastBitMask - 1;
4714         z.low = 0;
4715         z.high = a.high;
4716         roundingMode = float_rounding_mode();
4717         if ( roundingMode == float_round_nearest_even ) {
4718             z.high += lastBitMask>>1;
4719             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4720                 z.high &= ~ lastBitMask;
4721             }
4722         }
4723         else if ( roundingMode != float_round_to_zero ) {
4724             if (   extractFloat128Sign( z )
4725                  ^ ( roundingMode == float_round_up ) ) {
4726                 z.high |= ( a.low != 0 );
4727                 z.high += roundBitsMask;
4728             }
4729         }
4730         z.high &= ~ roundBitsMask;
4731     }
4732     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4733         float_set_inexact();
4734     }
4735     return z;
4736 
4737 }
4738 
4739 /*
4740 -------------------------------------------------------------------------------
4741 Returns the result of adding the absolute values of the quadruple-precision
4742 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4743 before being returned.  `zSign' is ignored if the result is a NaN.
4744 The addition is performed according to the IEC/IEEE Standard for Binary
4745 Floating-Point Arithmetic.
4746 -------------------------------------------------------------------------------
4747 */
4748 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4749 {
4750     int32 aExp, bExp, zExp;
4751     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4752     int32 expDiff;
4753 
4754     aSig1 = extractFloat128Frac1( a );
4755     aSig0 = extractFloat128Frac0( a );
4756     aExp = extractFloat128Exp( a );
4757     bSig1 = extractFloat128Frac1( b );
4758     bSig0 = extractFloat128Frac0( b );
4759     bExp = extractFloat128Exp( b );
4760     expDiff = aExp - bExp;
4761     if ( 0 < expDiff ) {
4762         if ( aExp == 0x7FFF ) {
4763             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4764             return a;
4765         }
4766         if ( bExp == 0 ) {
4767             --expDiff;
4768         }
4769         else {
4770             bSig0 |= LIT64( 0x0001000000000000 );
4771         }
4772         shift128ExtraRightJamming(
4773             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4774         zExp = aExp;
4775     }
4776     else if ( expDiff < 0 ) {
4777         if ( bExp == 0x7FFF ) {
4778             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4779             return packFloat128( zSign, 0x7FFF, 0, 0 );
4780         }
4781         if ( aExp == 0 ) {
4782             ++expDiff;
4783         }
4784         else {
4785             aSig0 |= LIT64( 0x0001000000000000 );
4786         }
4787         shift128ExtraRightJamming(
4788             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4789         zExp = bExp;
4790     }
4791     else {
4792         if ( aExp == 0x7FFF ) {
4793             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4794                 return propagateFloat128NaN( a, b );
4795             }
4796             return a;
4797         }
4798         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4799         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4800         zSig2 = 0;
4801         zSig0 |= LIT64( 0x0002000000000000 );
4802         zExp = aExp;
4803         goto shiftRight1;
4804     }
4805     aSig0 |= LIT64( 0x0001000000000000 );
4806     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4807     --zExp;
4808     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4809     ++zExp;
4810  shiftRight1:
4811     shift128ExtraRightJamming(
4812         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4813  roundAndPack:
4814     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4815 
4816 }
4817 
4818 /*
4819 -------------------------------------------------------------------------------
4820 Returns the result of subtracting the absolute values of the quadruple-
4821 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4822 difference is negated before being returned.  `zSign' is ignored if the
4823 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4824 Standard for Binary Floating-Point Arithmetic.
4825 -------------------------------------------------------------------------------
4826 */
4827 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4828 {
4829     int32 aExp, bExp, zExp;
4830     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4831     int32 expDiff;
4832     float128 z;
4833 
4834     aSig1 = extractFloat128Frac1( a );
4835     aSig0 = extractFloat128Frac0( a );
4836     aExp = extractFloat128Exp( a );
4837     bSig1 = extractFloat128Frac1( b );
4838     bSig0 = extractFloat128Frac0( b );
4839     bExp = extractFloat128Exp( b );
4840     expDiff = aExp - bExp;
4841     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4842     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4843     if ( 0 < expDiff ) goto aExpBigger;
4844     if ( expDiff < 0 ) goto bExpBigger;
4845     if ( aExp == 0x7FFF ) {
4846         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4847             return propagateFloat128NaN( a, b );
4848         }
4849         float_raise( float_flag_invalid );
4850         z.low = float128_default_nan_low;
4851         z.high = float128_default_nan_high;
4852         return z;
4853     }
4854     if ( aExp == 0 ) {
4855         aExp = 1;
4856         bExp = 1;
4857     }
4858     if ( bSig0 < aSig0 ) goto aBigger;
4859     if ( aSig0 < bSig0 ) goto bBigger;
4860     if ( bSig1 < aSig1 ) goto aBigger;
4861     if ( aSig1 < bSig1 ) goto bBigger;
4862     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4863  bExpBigger:
4864     if ( bExp == 0x7FFF ) {
4865         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4866         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4867     }
4868     if ( aExp == 0 ) {
4869         ++expDiff;
4870     }
4871     else {
4872         aSig0 |= LIT64( 0x4000000000000000 );
4873     }
4874     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4875     bSig0 |= LIT64( 0x4000000000000000 );
4876  bBigger:
4877     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4878     zExp = bExp;
4879     zSign ^= 1;
4880     goto normalizeRoundAndPack;
4881  aExpBigger:
4882     if ( aExp == 0x7FFF ) {
4883         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4884         return a;
4885     }
4886     if ( bExp == 0 ) {
4887         --expDiff;
4888     }
4889     else {
4890         bSig0 |= LIT64( 0x4000000000000000 );
4891     }
4892     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4893     aSig0 |= LIT64( 0x4000000000000000 );
4894  aBigger:
4895     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4896     zExp = aExp;
4897  normalizeRoundAndPack:
4898     --zExp;
4899     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4900 
4901 }
4902 
4903 /*
4904 -------------------------------------------------------------------------------
4905 Returns the result of adding the quadruple-precision floating-point values
4906 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4907 for Binary Floating-Point Arithmetic.
4908 -------------------------------------------------------------------------------
4909 */
4910 float128 float128_add( float128 a, float128 b )
4911 {
4912     flag aSign, bSign;
4913 
4914     aSign = extractFloat128Sign( a );
4915     bSign = extractFloat128Sign( b );
4916     if ( aSign == bSign ) {
4917         return addFloat128Sigs( a, b, aSign );
4918     }
4919     else {
4920         return subFloat128Sigs( a, b, aSign );
4921     }
4922 
4923 }
4924 
4925 /*
4926 -------------------------------------------------------------------------------
4927 Returns the result of subtracting the quadruple-precision floating-point
4928 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4929 Standard for Binary Floating-Point Arithmetic.
4930 -------------------------------------------------------------------------------
4931 */
4932 float128 float128_sub( float128 a, float128 b )
4933 {
4934     flag aSign, bSign;
4935 
4936     aSign = extractFloat128Sign( a );
4937     bSign = extractFloat128Sign( b );
4938     if ( aSign == bSign ) {
4939         return subFloat128Sigs( a, b, aSign );
4940     }
4941     else {
4942         return addFloat128Sigs( a, b, aSign );
4943     }
4944 
4945 }
4946 
4947 /*
4948 -------------------------------------------------------------------------------
4949 Returns the result of multiplying the quadruple-precision floating-point
4950 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4951 Standard for Binary Floating-Point Arithmetic.
4952 -------------------------------------------------------------------------------
4953 */
4954 float128 float128_mul( float128 a, float128 b )
4955 {
4956     flag aSign, bSign, zSign;
4957     int32 aExp, bExp, zExp;
4958     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4959     float128 z;
4960 
4961     aSig1 = extractFloat128Frac1( a );
4962     aSig0 = extractFloat128Frac0( a );
4963     aExp = extractFloat128Exp( a );
4964     aSign = extractFloat128Sign( a );
4965     bSig1 = extractFloat128Frac1( b );
4966     bSig0 = extractFloat128Frac0( b );
4967     bExp = extractFloat128Exp( b );
4968     bSign = extractFloat128Sign( b );
4969     zSign = aSign ^ bSign;
4970     if ( aExp == 0x7FFF ) {
4971         if (    ( aSig0 | aSig1 )
4972              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4973             return propagateFloat128NaN( a, b );
4974         }
4975         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4976         return packFloat128( zSign, 0x7FFF, 0, 0 );
4977     }
4978     if ( bExp == 0x7FFF ) {
4979         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4980         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4981  invalid:
4982             float_raise( float_flag_invalid );
4983             z.low = float128_default_nan_low;
4984             z.high = float128_default_nan_high;
4985             return z;
4986         }
4987         return packFloat128( zSign, 0x7FFF, 0, 0 );
4988     }
4989     if ( aExp == 0 ) {
4990         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4991         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4992     }
4993     if ( bExp == 0 ) {
4994         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4995         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4996     }
4997     zExp = aExp + bExp - 0x4000;
4998     aSig0 |= LIT64( 0x0001000000000000 );
4999     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5000     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5001     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5002     zSig2 |= ( zSig3 != 0 );
5003     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5004         shift128ExtraRightJamming(
5005             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5006         ++zExp;
5007     }
5008     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5009 
5010 }
5011 
5012 /*
5013 -------------------------------------------------------------------------------
5014 Returns the result of dividing the quadruple-precision floating-point value
5015 `a' by the corresponding value `b'.  The operation is performed according to
5016 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5017 -------------------------------------------------------------------------------
5018 */
5019 float128 float128_div( float128 a, float128 b )
5020 {
5021     flag aSign, bSign, zSign;
5022     int32 aExp, bExp, zExp;
5023     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5024     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5025     float128 z;
5026 
5027     aSig1 = extractFloat128Frac1( a );
5028     aSig0 = extractFloat128Frac0( a );
5029     aExp = extractFloat128Exp( a );
5030     aSign = extractFloat128Sign( a );
5031     bSig1 = extractFloat128Frac1( b );
5032     bSig0 = extractFloat128Frac0( b );
5033     bExp = extractFloat128Exp( b );
5034     bSign = extractFloat128Sign( b );
5035     zSign = aSign ^ bSign;
5036     if ( aExp == 0x7FFF ) {
5037         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5038         if ( bExp == 0x7FFF ) {
5039             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5040             goto invalid;
5041         }
5042         return packFloat128( zSign, 0x7FFF, 0, 0 );
5043     }
5044     if ( bExp == 0x7FFF ) {
5045         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5046         return packFloat128( zSign, 0, 0, 0 );
5047     }
5048     if ( bExp == 0 ) {
5049         if ( ( bSig0 | bSig1 ) == 0 ) {
5050             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5051  invalid:
5052                 float_raise( float_flag_invalid );
5053                 z.low = float128_default_nan_low;
5054                 z.high = float128_default_nan_high;
5055                 return z;
5056             }
5057             float_raise( float_flag_divbyzero );
5058             return packFloat128( zSign, 0x7FFF, 0, 0 );
5059         }
5060         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5061     }
5062     if ( aExp == 0 ) {
5063         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5064         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5065     }
5066     zExp = aExp - bExp + 0x3FFD;
5067     shortShift128Left(
5068         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5069     shortShift128Left(
5070         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5071     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5072         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5073         ++zExp;
5074     }
5075     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5076     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5077     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5078     while ( (sbits64) rem0 < 0 ) {
5079         --zSig0;
5080         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5081     }
5082     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5083     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5084         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5085         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5086         while ( (sbits64) rem1 < 0 ) {
5087             --zSig1;
5088             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5089         }
5090         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5091     }
5092     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5093     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5094 
5095 }
5096 
5097 /*
5098 -------------------------------------------------------------------------------
5099 Returns the remainder of the quadruple-precision floating-point value `a'
5100 with respect to the corresponding value `b'.  The operation is performed
5101 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5102 -------------------------------------------------------------------------------
5103 */
5104 float128 float128_rem( float128 a, float128 b )
5105 {
5106     flag aSign, bSign, zSign;
5107     int32 aExp, bExp, expDiff;
5108     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5109     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5110     sbits64 sigMean0;
5111     float128 z;
5112 
5113     aSig1 = extractFloat128Frac1( a );
5114     aSig0 = extractFloat128Frac0( a );
5115     aExp = extractFloat128Exp( a );
5116     aSign = extractFloat128Sign( a );
5117     bSig1 = extractFloat128Frac1( b );
5118     bSig0 = extractFloat128Frac0( b );
5119     bExp = extractFloat128Exp( b );
5120     bSign = extractFloat128Sign( b );
5121     if ( aExp == 0x7FFF ) {
5122         if (    ( aSig0 | aSig1 )
5123              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5124             return propagateFloat128NaN( a, b );
5125         }
5126         goto invalid;
5127     }
5128     if ( bExp == 0x7FFF ) {
5129         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5130         return a;
5131     }
5132     if ( bExp == 0 ) {
5133         if ( ( bSig0 | bSig1 ) == 0 ) {
5134  invalid:
5135             float_raise( float_flag_invalid );
5136             z.low = float128_default_nan_low;
5137             z.high = float128_default_nan_high;
5138             return z;
5139         }
5140         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5141     }
5142     if ( aExp == 0 ) {
5143         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5144         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5145     }
5146     expDiff = aExp - bExp;
5147     if ( expDiff < -1 ) return a;
5148     shortShift128Left(
5149         aSig0 | LIT64( 0x0001000000000000 ),
5150         aSig1,
5151         15 - ( expDiff < 0 ),
5152         &aSig0,
5153         &aSig1
5154     );
5155     shortShift128Left(
5156         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5157     q = le128( bSig0, bSig1, aSig0, aSig1 );
5158     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5159     expDiff -= 64;
5160     while ( 0 < expDiff ) {
5161         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5162         q = ( 4 < q ) ? q - 4 : 0;
5163         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5164         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5165         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5166         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5167         expDiff -= 61;
5168     }
5169     if ( -64 < expDiff ) {
5170         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5171         q = ( 4 < q ) ? q - 4 : 0;
5172         q >>= - expDiff;
5173         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5174         expDiff += 52;
5175         if ( expDiff < 0 ) {
5176             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5177         }
5178         else {
5179             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5180         }
5181         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5182         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5183     }
5184     else {
5185         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5186         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5187     }
5188     do {
5189         alternateASig0 = aSig0;
5190         alternateASig1 = aSig1;
5191         ++q;
5192         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5193     } while ( 0 <= (sbits64) aSig0 );
5194     add128(
5195         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5196     if (    ( sigMean0 < 0 )
5197          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5198         aSig0 = alternateASig0;
5199         aSig1 = alternateASig1;
5200     }
5201     zSign = ( (sbits64) aSig0 < 0 );
5202     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5203     return
5204         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5205 
5206 }
5207 
5208 /*
5209 -------------------------------------------------------------------------------
5210 Returns the square root of the quadruple-precision floating-point value `a'.
5211 The operation is performed according to the IEC/IEEE Standard for Binary
5212 Floating-Point Arithmetic.
5213 -------------------------------------------------------------------------------
5214 */
5215 float128 float128_sqrt( float128 a )
5216 {
5217     flag aSign;
5218     int32 aExp, zExp;
5219     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5220     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5221     float128 z;
5222 
5223     aSig1 = extractFloat128Frac1( a );
5224     aSig0 = extractFloat128Frac0( a );
5225     aExp = extractFloat128Exp( a );
5226     aSign = extractFloat128Sign( a );
5227     if ( aExp == 0x7FFF ) {
5228         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5229         if ( ! aSign ) return a;
5230         goto invalid;
5231     }
5232     if ( aSign ) {
5233         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5234  invalid:
5235         float_raise( float_flag_invalid );
5236         z.low = float128_default_nan_low;
5237         z.high = float128_default_nan_high;
5238         return z;
5239     }
5240     if ( aExp == 0 ) {
5241         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5242         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5243     }
5244     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5245     aSig0 |= LIT64( 0x0001000000000000 );
5246     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5247     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5248     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5249     doubleZSig0 = zSig0<<1;
5250     mul64To128( zSig0, zSig0, &term0, &term1 );
5251     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5252     while ( (sbits64) rem0 < 0 ) {
5253         --zSig0;
5254         doubleZSig0 -= 2;
5255         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5256     }
5257     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5258     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5259         if ( zSig1 == 0 ) zSig1 = 1;
5260         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5261         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5262         mul64To128( zSig1, zSig1, &term2, &term3 );
5263         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5264         while ( (sbits64) rem1 < 0 ) {
5265             --zSig1;
5266             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5267             term3 |= 1;
5268             term2 |= doubleZSig0;
5269             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5270         }
5271         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5272     }
5273     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5274     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5275 
5276 }
5277 
5278 /*
5279 -------------------------------------------------------------------------------
5280 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5281 the corresponding value `b', and 0 otherwise.  The comparison is performed
5282 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5283 -------------------------------------------------------------------------------
5284 */
5285 flag float128_eq( float128 a, float128 b )
5286 {
5287 
5288     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5289               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5290          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5291               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5292        ) {
5293         if (    float128_is_signaling_nan( a )
5294              || float128_is_signaling_nan( b ) ) {
5295             float_raise( float_flag_invalid );
5296         }
5297         return 0;
5298     }
5299     return
5300            ( a.low == b.low )
5301         && (    ( a.high == b.high )
5302              || (    ( a.low == 0 )
5303                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5304            );
5305 
5306 }
5307 
5308 /*
5309 -------------------------------------------------------------------------------
5310 Returns 1 if the quadruple-precision floating-point value `a' is less than
5311 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5312 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5313 Arithmetic.
5314 -------------------------------------------------------------------------------
5315 */
5316 flag float128_le( float128 a, float128 b )
5317 {
5318     flag aSign, bSign;
5319 
5320     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5321               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5322          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5323               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5324        ) {
5325         float_raise( float_flag_invalid );
5326         return 0;
5327     }
5328     aSign = extractFloat128Sign( a );
5329     bSign = extractFloat128Sign( b );
5330     if ( aSign != bSign ) {
5331         return
5332                aSign
5333             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5334                  == 0 );
5335     }
5336     return
5337           aSign ? le128( b.high, b.low, a.high, a.low )
5338         : le128( a.high, a.low, b.high, b.low );
5339 
5340 }
5341 
5342 /*
5343 -------------------------------------------------------------------------------
5344 Returns 1 if the quadruple-precision floating-point value `a' is less than
5345 the corresponding value `b', and 0 otherwise.  The comparison is performed
5346 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5347 -------------------------------------------------------------------------------
5348 */
5349 flag float128_lt( float128 a, float128 b )
5350 {
5351     flag aSign, bSign;
5352 
5353     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5354               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5355          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5356               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5357        ) {
5358         float_raise( float_flag_invalid );
5359         return 0;
5360     }
5361     aSign = extractFloat128Sign( a );
5362     bSign = extractFloat128Sign( b );
5363     if ( aSign != bSign ) {
5364         return
5365                aSign
5366             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5367                  != 0 );
5368     }
5369     return
5370           aSign ? lt128( b.high, b.low, a.high, a.low )
5371         : lt128( a.high, a.low, b.high, b.low );
5372 
5373 }
5374 
5375 /*
5376 -------------------------------------------------------------------------------
5377 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5378 the corresponding value `b', and 0 otherwise.  The invalid exception is
5379 raised if either operand is a NaN.  Otherwise, the comparison is performed
5380 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5381 -------------------------------------------------------------------------------
5382 */
5383 flag float128_eq_signaling( float128 a, float128 b )
5384 {
5385 
5386     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5387               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5388          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5389               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5390        ) {
5391         float_raise( float_flag_invalid );
5392         return 0;
5393     }
5394     return
5395            ( a.low == b.low )
5396         && (    ( a.high == b.high )
5397              || (    ( a.low == 0 )
5398                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5399            );
5400 
5401 }
5402 
5403 /*
5404 -------------------------------------------------------------------------------
5405 Returns 1 if the quadruple-precision floating-point value `a' is less than
5406 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5407 cause an exception.  Otherwise, the comparison is performed according to the
5408 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5409 -------------------------------------------------------------------------------
5410 */
5411 flag float128_le_quiet( float128 a, float128 b )
5412 {
5413     flag aSign, bSign;
5414 
5415     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5416               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5417          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5418               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5419        ) {
5420         if (    float128_is_signaling_nan( a )
5421              || float128_is_signaling_nan( b ) ) {
5422             float_raise( float_flag_invalid );
5423         }
5424         return 0;
5425     }
5426     aSign = extractFloat128Sign( a );
5427     bSign = extractFloat128Sign( b );
5428     if ( aSign != bSign ) {
5429         return
5430                aSign
5431             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5432                  == 0 );
5433     }
5434     return
5435           aSign ? le128( b.high, b.low, a.high, a.low )
5436         : le128( a.high, a.low, b.high, b.low );
5437 
5438 }
5439 
5440 /*
5441 -------------------------------------------------------------------------------
5442 Returns 1 if the quadruple-precision floating-point value `a' is less than
5443 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5444 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5445 Standard for Binary Floating-Point Arithmetic.
5446 -------------------------------------------------------------------------------
5447 */
5448 flag float128_lt_quiet( float128 a, float128 b )
5449 {
5450     flag aSign, bSign;
5451 
5452     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5453               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5454          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5455               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5456        ) {
5457         if (    float128_is_signaling_nan( a )
5458              || float128_is_signaling_nan( b ) ) {
5459             float_raise( float_flag_invalid );
5460         }
5461         return 0;
5462     }
5463     aSign = extractFloat128Sign( a );
5464     bSign = extractFloat128Sign( b );
5465     if ( aSign != bSign ) {
5466         return
5467                aSign
5468             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5469                  != 0 );
5470     }
5471     return
5472           aSign ? lt128( b.high, b.low, a.high, a.low )
5473         : lt128( a.high, a.low, b.high, b.low );
5474 
5475 }
5476 
5477 #endif
5478 
5479 
5480 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5481 
5482 /*
5483  * These two routines are not part of the original softfloat distribution.
5484  *
5485  * They are based on the corresponding conversions to integer but return
5486  * unsigned numbers instead since these functions are required by GCC.
5487  *
5488  * Added by Mark Brinicombe <mark@netbsd.org>	27/09/97
5489  *
5490  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5491  */
5492 
5493 /*
5494 -------------------------------------------------------------------------------
5495 Returns the result of converting the double-precision floating-point value
5496 `a' to the 32-bit unsigned integer format.  The conversion is
5497 performed according to the IEC/IEEE Standard for Binary Floating-point
5498 Arithmetic, except that the conversion is always rounded toward zero.  If
5499 `a' is a NaN, the largest positive integer is returned.  If the conversion
5500 overflows, the largest integer positive is returned.
5501 -------------------------------------------------------------------------------
5502 */
5503 uint32 float64_to_uint32_round_to_zero( float64 a )
5504 {
5505     flag aSign;
5506     int16 aExp, shiftCount;
5507     bits64 aSig, savedASig;
5508     uint32 z;
5509 
5510     aSig = extractFloat64Frac( a );
5511     aExp = extractFloat64Exp( a );
5512     aSign = extractFloat64Sign( a );
5513 
5514     if (aSign) {
5515         float_raise( float_flag_invalid );
5516     	return(0);
5517     }
5518 
5519     if ( 0x41E < aExp ) {
5520         float_raise( float_flag_invalid );
5521         return 0xffffffff;
5522     }
5523     else if ( aExp < 0x3FF ) {
5524         if ( aExp || aSig ) float_set_inexact();
5525         return 0;
5526     }
5527     aSig |= LIT64( 0x0010000000000000 );
5528     shiftCount = 0x433 - aExp;
5529     savedASig = aSig;
5530     aSig >>= shiftCount;
5531     z = aSig;
5532     if ( ( aSig<<shiftCount ) != savedASig ) {
5533         float_set_inexact();
5534     }
5535     return z;
5536 
5537 }
5538 
5539 /*
5540 -------------------------------------------------------------------------------
5541 Returns the result of converting the single-precision floating-point value
5542 `a' to the 32-bit unsigned integer format.  The conversion is
5543 performed according to the IEC/IEEE Standard for Binary Floating-point
5544 Arithmetic, except that the conversion is always rounded toward zero.  If
5545 `a' is a NaN, the largest positive integer is returned.  If the conversion
5546 overflows, the largest positive integer is returned.
5547 -------------------------------------------------------------------------------
5548 */
5549 uint32 float32_to_uint32_round_to_zero( float32 a )
5550 {
5551     flag aSign;
5552     int16 aExp, shiftCount;
5553     bits32 aSig;
5554     uint32 z;
5555 
5556     aSig = extractFloat32Frac( a );
5557     aExp = extractFloat32Exp( a );
5558     aSign = extractFloat32Sign( a );
5559     shiftCount = aExp - 0x9E;
5560 
5561     if (aSign) {
5562         float_raise( float_flag_invalid );
5563     	return(0);
5564     }
5565     if ( 0 < shiftCount ) {
5566         float_raise( float_flag_invalid );
5567         return 0xFFFFFFFF;
5568     }
5569     else if ( aExp <= 0x7E ) {
5570         if ( aExp | aSig ) float_set_inexact();
5571         return 0;
5572     }
5573     aSig = ( aSig | 0x800000 )<<8;
5574     z = aSig>>( - shiftCount );
5575     if ( aSig<<( shiftCount & 31 ) ) {
5576         float_set_inexact();
5577     }
5578     return z;
5579 
5580 }
5581 
5582 #endif
5583