xref: /openbsd-src/sys/lib/libkern/softfloat.c (revision a28daedfc357b214be5c701aa8ba8adb29a7f1c2)
1 /*	$OpenBSD: softfloat.c,v 1.3 2008/10/07 22:06:29 martynas Exp $	*/
2 /*	$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $	*/
3 
4 /*
5  * This version hacked for use with gcc -msoft-float by bjh21.
6  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
7  *  itself).
8  */
9 
10 /*
11  * Things you may want to define:
12  *
13  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
14  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
15  *   properly renamed.
16  */
17 
18 /*
19 ===============================================================================
20 
21 This C source file is part of the SoftFloat IEC/IEEE Floating-point
22 Arithmetic Package, Release 2a.
23 
24 Written by John R. Hauser.  This work was made possible in part by the
25 International Computer Science Institute, located at Suite 600, 1947 Center
26 Street, Berkeley, California 94704.  Funding was partially provided by the
27 National Science Foundation under grant MIP-9311980.  The original version
28 of this code was written as part of a project to build a fixed-point vector
29 processor in collaboration with the University of California at Berkeley,
30 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
31 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
32 arithmetic/SoftFloat.html'.
33 
34 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable
35 effort has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT
36 WILL AT TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS
37 RESTRICTED TO PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL
38 RESPONSIBILITY FOR ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM
39 THEIR OWN USE OF THE SOFTWARE, AND WHO ALSO EFFECTIVELY INDEMNIFY
40 (possibly via similar legal warning) JOHN HAUSER AND THE INTERNATIONAL
41 COMPUTER SCIENCE INSTITUTE AGAINST ALL LOSSES, COSTS, OR OTHER PROBLEMS
42 ARISING FROM THE USE OF THE SOFTWARE BY THEIR CUSTOMERS AND CLIENTS.
43 
44 Derivative works are acceptable, even for commercial purposes, so long as
45 (1) they include prominent notice that the work is derivative, and (2) they
46 include prominent notice akin to these four paragraphs for those parts of
47 this code that are retained.
48 
49 ===============================================================================
50 */
51 
52 #include <sys/cdefs.h>
53 #if defined(LIBC_SCCS) && !defined(lint)
54 __RCSID("$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $");
55 #endif /* LIBC_SCCS and not lint */
56 
57 #ifdef SOFTFLOAT_FOR_GCC
58 #include "softfloat-for-gcc.h"
59 #endif
60 
61 #include "milieu.h"
62 #include "softfloat.h"
63 
64 /*
65  * Conversions between floats as stored in memory and floats as
66  * SoftFloat uses them
67  */
68 #ifndef FLOAT64_DEMANGLE
69 #define FLOAT64_DEMANGLE(a)	(a)
70 #endif
71 #ifndef FLOAT64_MANGLE
72 #define FLOAT64_MANGLE(a)	(a)
73 #endif
74 
75 /*
76 -------------------------------------------------------------------------------
77 Floating-point rounding mode, extended double-precision rounding precision,
78 and exception flags.
79 -------------------------------------------------------------------------------
80 */
81 
82 /*
83  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
84  * 	Right now, it does not.  I've removed all other dynamic global
85  * 	variables. [ross]
86  */
87 #ifdef FLOATX80
88 int8 floatx80_rounding_precision = 80;
89 #endif
90 
91 /*
92 -------------------------------------------------------------------------------
93 Primitive arithmetic functions, including multi-word arithmetic, and
94 division and square root approximations.  (Can be specialized to target if
95 desired.)
96 -------------------------------------------------------------------------------
97 */
98 #include "softfloat-macros.h"
99 
100 /*
101 -------------------------------------------------------------------------------
102 Functions and definitions to determine:  (1) whether tininess for underflow
103 is detected before or after rounding by default, (2) what (if anything)
104 happens when exceptions are raised, (3) how signaling NaNs are distinguished
105 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
106 are propagated from function inputs to output.  These details are target-
107 specific.
108 -------------------------------------------------------------------------------
109 */
110 #include "softfloat-specialize.h"
111 
112 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
113 /*
114 -------------------------------------------------------------------------------
115 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
116 and 7, and returns the properly rounded 32-bit integer corresponding to the
117 input.  If `zSign' is 1, the input is negated before being converted to an
118 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
119 is simply rounded to an integer, with the inexact exception raised if the
120 input cannot be represented exactly as an integer.  However, if the fixed-
121 point input is too large, the invalid exception is raised and the largest
122 positive or negative integer is returned.
123 -------------------------------------------------------------------------------
124 */
125 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
126 {
127     int8 roundingMode;
128     flag roundNearestEven;
129     int8 roundIncrement, roundBits;
130     int32 z;
131 
132     roundingMode = float_rounding_mode();
133     roundNearestEven = ( roundingMode == float_round_nearest_even );
134     roundIncrement = 0x40;
135     if ( ! roundNearestEven ) {
136         if ( roundingMode == float_round_to_zero ) {
137             roundIncrement = 0;
138         }
139         else {
140             roundIncrement = 0x7F;
141             if ( zSign ) {
142                 if ( roundingMode == float_round_up ) roundIncrement = 0;
143             }
144             else {
145                 if ( roundingMode == float_round_down ) roundIncrement = 0;
146             }
147         }
148     }
149     roundBits = absZ & 0x7F;
150     absZ = ( absZ + roundIncrement )>>7;
151     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
152     z = absZ;
153     if ( zSign ) z = - z;
154     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
155         float_raise( float_flag_invalid );
156         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
157     }
158     if ( roundBits ) float_set_inexact();
159     return z;
160 
161 }
162 
163 /*
164 -------------------------------------------------------------------------------
165 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
166 `absZ1', with binary point between bits 63 and 64 (between the input words),
167 and returns the properly rounded 64-bit integer corresponding to the input.
168 If `zSign' is 1, the input is negated before being converted to an integer.
169 Ordinarily, the fixed-point input is simply rounded to an integer, with
170 the inexact exception raised if the input cannot be represented exactly as
171 an integer.  However, if the fixed-point input is too large, the invalid
172 exception is raised and the largest positive or negative integer is
173 returned.
174 -------------------------------------------------------------------------------
175 */
176 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
177 {
178     int8 roundingMode;
179     flag roundNearestEven, increment;
180     int64 z;
181 
182     roundingMode = float_rounding_mode();
183     roundNearestEven = ( roundingMode == float_round_nearest_even );
184     increment = ( (sbits64) absZ1 < 0 );
185     if ( ! roundNearestEven ) {
186         if ( roundingMode == float_round_to_zero ) {
187             increment = 0;
188         }
189         else {
190             if ( zSign ) {
191                 increment = ( roundingMode == float_round_down ) && absZ1;
192             }
193             else {
194                 increment = ( roundingMode == float_round_up ) && absZ1;
195             }
196         }
197     }
198     if ( increment ) {
199         ++absZ0;
200         if ( absZ0 == 0 ) goto overflow;
201         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
202     }
203     z = absZ0;
204     if ( zSign ) z = - z;
205     if ( z && ( ( z < 0 ) ^ zSign ) ) {
206  overflow:
207         float_raise( float_flag_invalid );
208         return
209               zSign ? (sbits64) LIT64( 0x8000000000000000 )
210             : LIT64( 0x7FFFFFFFFFFFFFFF );
211     }
212     if ( absZ1 ) float_set_inexact();
213     return z;
214 
215 }
216 
217 #ifdef __alpha__
218 /*
219 -------------------------------------------------------------------------------
220 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
221 `absZ1', with binary point between bits 63 and 64 (between the input words),
222 and returns the properly rounded 64-bit integer corresponding to the input.
223 If `zSign' is 1, the input is negated before being converted to an integer.
224 Ordinarily, the fixed-point input is simply rounded to an integer, with
225 the inexact exception raised if the input cannot be represented exactly as
226 an integer.
227 -------------------------------------------------------------------------------
228 */
229 static int64 roundAndPackInt64NoOverflow( flag zSign, bits64 absZ0,
230     bits64 absZ1 )
231 {
232     int8 roundingMode;
233     flag roundNearestEven, increment;
234     int64 z;
235 
236     roundingMode = float_rounding_mode();
237     roundNearestEven = ( roundingMode == float_round_nearest_even );
238     increment = ( (sbits64) absZ1 < 0 );
239     if ( ! roundNearestEven ) {
240         if ( roundingMode == float_round_to_zero ) {
241             increment = 0;
242         }
243         else {
244             if ( zSign ) {
245                 increment = ( roundingMode == float_round_down ) && absZ1;
246             }
247             else {
248                 increment = ( roundingMode == float_round_up ) && absZ1;
249             }
250         }
251     }
252     if ( increment ) {
253         ++absZ0;
254         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
255     }
256     z = absZ0;
257     if ( zSign ) z = - z;
258     if ( absZ1 ) float_set_inexact();
259     return z;
260 
261 }
262 #endif /* __alpha__ */
263 #endif
264 
265 /*
266 -------------------------------------------------------------------------------
267 Returns the fraction bits of the single-precision floating-point value `a'.
268 -------------------------------------------------------------------------------
269 */
270 INLINE bits32 extractFloat32Frac( float32 a )
271 {
272 
273     return a & 0x007FFFFF;
274 
275 }
276 
277 /*
278 -------------------------------------------------------------------------------
279 Returns the exponent bits of the single-precision floating-point value `a'.
280 -------------------------------------------------------------------------------
281 */
282 INLINE int16 extractFloat32Exp( float32 a )
283 {
284 
285     return ( a>>23 ) & 0xFF;
286 
287 }
288 
289 /*
290 -------------------------------------------------------------------------------
291 Returns the sign bit of the single-precision floating-point value `a'.
292 -------------------------------------------------------------------------------
293 */
294 INLINE flag extractFloat32Sign( float32 a )
295 {
296 
297     return a>>31;
298 
299 }
300 
301 /*
302 -------------------------------------------------------------------------------
303 Normalizes the subnormal single-precision floating-point value represented
304 by the denormalized significand `aSig'.  The normalized exponent and
305 significand are stored at the locations pointed to by `zExpPtr' and
306 `zSigPtr', respectively.
307 -------------------------------------------------------------------------------
308 */
309 static void
310  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
311 {
312     int8 shiftCount;
313 
314     shiftCount = countLeadingZeros32( aSig ) - 8;
315     *zSigPtr = aSig<<shiftCount;
316     *zExpPtr = 1 - shiftCount;
317 
318 }
319 
320 /*
321 -------------------------------------------------------------------------------
322 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
323 single-precision floating-point value, returning the result.  After being
324 shifted into the proper positions, the three fields are simply added
325 together to form the result.  This means that any integer portion of `zSig'
326 will be added into the exponent.  Since a properly normalized significand
327 will have an integer portion equal to 1, the `zExp' input should be 1 less
328 than the desired result exponent whenever `zSig' is a complete, normalized
329 significand.
330 -------------------------------------------------------------------------------
331 */
332 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
333 {
334 
335     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
336 
337 }
338 
339 /*
340 -------------------------------------------------------------------------------
341 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
342 and significand `zSig', and returns the proper single-precision floating-
343 point value corresponding to the abstract input.  Ordinarily, the abstract
344 value is simply rounded and packed into the single-precision format, with
345 the inexact exception raised if the abstract input cannot be represented
346 exactly.  However, if the abstract value is too large, the overflow and
347 inexact exceptions are raised and an infinity or maximal finite value is
348 returned.  If the abstract value is too small, the input value is rounded to
349 a subnormal number, and the underflow and inexact exceptions are raised if
350 the abstract input cannot be represented exactly as a subnormal single-
351 precision floating-point number.
352     The input significand `zSig' has its binary point between bits 30
353 and 29, which is 7 bits to the left of the usual location.  This shifted
354 significand must be normalized or smaller.  If `zSig' is not normalized,
355 `zExp' must be 0; in that case, the result returned is a subnormal number,
356 and it must not require rounding.  In the usual case that `zSig' is
357 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
358 The handling of underflow and overflow follows the IEC/IEEE Standard for
359 Binary Floating-Point Arithmetic.
360 -------------------------------------------------------------------------------
361 */
362 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
363 {
364     int8 roundingMode;
365     flag roundNearestEven;
366     int8 roundIncrement, roundBits;
367     flag isTiny;
368 
369     roundingMode = float_rounding_mode();
370     roundNearestEven = ( roundingMode == float_round_nearest_even );
371     roundIncrement = 0x40;
372     if ( ! roundNearestEven ) {
373         if ( roundingMode == float_round_to_zero ) {
374             roundIncrement = 0;
375         }
376         else {
377             roundIncrement = 0x7F;
378             if ( zSign ) {
379                 if ( roundingMode == float_round_up ) roundIncrement = 0;
380             }
381             else {
382                 if ( roundingMode == float_round_down ) roundIncrement = 0;
383             }
384         }
385     }
386     roundBits = zSig & 0x7F;
387     if ( 0xFD <= (bits16) zExp ) {
388         if (    ( 0xFD < zExp )
389              || (    ( zExp == 0xFD )
390                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
391            ) {
392             float_raise( float_flag_overflow | float_flag_inexact );
393             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
394         }
395         if ( zExp < 0 ) {
396             isTiny =
397                    ( float_detect_tininess == float_tininess_before_rounding )
398                 || ( zExp < -1 )
399                 || ( zSig + roundIncrement < 0x80000000 );
400             shift32RightJamming( zSig, - zExp, &zSig );
401             zExp = 0;
402             roundBits = zSig & 0x7F;
403             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
404         }
405     }
406     if ( roundBits ) float_set_inexact();
407     zSig = ( zSig + roundIncrement )>>7;
408     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
409     if ( zSig == 0 ) zExp = 0;
410     return packFloat32( zSign, zExp, zSig );
411 
412 }
413 
414 /*
415 -------------------------------------------------------------------------------
416 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
417 and significand `zSig', and returns the proper single-precision floating-
418 point value corresponding to the abstract input.  This routine is just like
419 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
420 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
421 floating-point exponent.
422 -------------------------------------------------------------------------------
423 */
424 static float32
425  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
426 {
427     int8 shiftCount;
428 
429     shiftCount = countLeadingZeros32( zSig ) - 1;
430     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
431 
432 }
433 
434 /*
435 -------------------------------------------------------------------------------
436 Returns the fraction bits of the double-precision floating-point value `a'.
437 -------------------------------------------------------------------------------
438 */
439 INLINE bits64 extractFloat64Frac( float64 a )
440 {
441 
442     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
443 
444 }
445 
446 /*
447 -------------------------------------------------------------------------------
448 Returns the exponent bits of the double-precision floating-point value `a'.
449 -------------------------------------------------------------------------------
450 */
451 INLINE int16 extractFloat64Exp( float64 a )
452 {
453 
454     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
455 
456 }
457 
458 /*
459 -------------------------------------------------------------------------------
460 Returns the sign bit of the double-precision floating-point value `a'.
461 -------------------------------------------------------------------------------
462 */
463 INLINE flag extractFloat64Sign( float64 a )
464 {
465 
466     return FLOAT64_DEMANGLE(a)>>63;
467 
468 }
469 
470 /*
471 -------------------------------------------------------------------------------
472 Normalizes the subnormal double-precision floating-point value represented
473 by the denormalized significand `aSig'.  The normalized exponent and
474 significand are stored at the locations pointed to by `zExpPtr' and
475 `zSigPtr', respectively.
476 -------------------------------------------------------------------------------
477 */
478 static void
479  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
480 {
481     int8 shiftCount;
482 
483     shiftCount = countLeadingZeros64( aSig ) - 11;
484     *zSigPtr = aSig<<shiftCount;
485     *zExpPtr = 1 - shiftCount;
486 
487 }
488 
489 /*
490 -------------------------------------------------------------------------------
491 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
492 double-precision floating-point value, returning the result.  After being
493 shifted into the proper positions, the three fields are simply added
494 together to form the result.  This means that any integer portion of `zSig'
495 will be added into the exponent.  Since a properly normalized significand
496 will have an integer portion equal to 1, the `zExp' input should be 1 less
497 than the desired result exponent whenever `zSig' is a complete, normalized
498 significand.
499 -------------------------------------------------------------------------------
500 */
501 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
502 {
503 
504     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
505 			   ( ( (bits64) zExp )<<52 ) + zSig );
506 
507 }
508 
509 /*
510 -------------------------------------------------------------------------------
511 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
512 and significand `zSig', and returns the proper double-precision floating-
513 point value corresponding to the abstract input.  Ordinarily, the abstract
514 value is simply rounded and packed into the double-precision format, with
515 the inexact exception raised if the abstract input cannot be represented
516 exactly.  However, if the abstract value is too large, the overflow and
517 inexact exceptions are raised and an infinity or maximal finite value is
518 returned.  If the abstract value is too small, the input value is rounded to
519 a subnormal number, and the underflow and inexact exceptions are raised if
520 the abstract input cannot be represented exactly as a subnormal double-
521 precision floating-point number.
522     The input significand `zSig' has its binary point between bits 62
523 and 61, which is 10 bits to the left of the usual location.  This shifted
524 significand must be normalized or smaller.  If `zSig' is not normalized,
525 `zExp' must be 0; in that case, the result returned is a subnormal number,
526 and it must not require rounding.  In the usual case that `zSig' is
527 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
528 The handling of underflow and overflow follows the IEC/IEEE Standard for
529 Binary Floating-Point Arithmetic.
530 -------------------------------------------------------------------------------
531 */
532 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
533 {
534     int8 roundingMode;
535     flag roundNearestEven;
536     int16 roundIncrement, roundBits;
537     flag isTiny;
538 
539     roundingMode = float_rounding_mode();
540     roundNearestEven = ( roundingMode == float_round_nearest_even );
541     roundIncrement = 0x200;
542     if ( ! roundNearestEven ) {
543         if ( roundingMode == float_round_to_zero ) {
544             roundIncrement = 0;
545         }
546         else {
547             roundIncrement = 0x3FF;
548             if ( zSign ) {
549                 if ( roundingMode == float_round_up ) roundIncrement = 0;
550             }
551             else {
552                 if ( roundingMode == float_round_down ) roundIncrement = 0;
553             }
554         }
555     }
556     roundBits = zSig & 0x3FF;
557     if ( 0x7FD <= (bits16) zExp ) {
558         if (    ( 0x7FD < zExp )
559              || (    ( zExp == 0x7FD )
560                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
561            ) {
562             float_raise( float_flag_overflow | float_flag_inexact );
563             return FLOAT64_MANGLE(
564 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
565 		( roundIncrement == 0 ));
566         }
567         if ( zExp < 0 ) {
568             isTiny =
569                    ( float_detect_tininess == float_tininess_before_rounding )
570                 || ( zExp < -1 )
571                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
572             shift64RightJamming( zSig, - zExp, &zSig );
573             zExp = 0;
574             roundBits = zSig & 0x3FF;
575             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
576         }
577     }
578     if ( roundBits ) float_set_inexact();
579     zSig = ( zSig + roundIncrement )>>10;
580     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
581     if ( zSig == 0 ) zExp = 0;
582     return packFloat64( zSign, zExp, zSig );
583 
584 }
585 
586 /*
587 -------------------------------------------------------------------------------
588 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
589 and significand `zSig', and returns the proper double-precision floating-
590 point value corresponding to the abstract input.  This routine is just like
591 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
592 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
593 floating-point exponent.
594 -------------------------------------------------------------------------------
595 */
596 static float64
597  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
598 {
599     int8 shiftCount;
600 
601     shiftCount = countLeadingZeros64( zSig ) - 1;
602     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
603 
604 }
605 
606 #ifdef FLOATX80
607 
608 /*
609 -------------------------------------------------------------------------------
610 Returns the fraction bits of the extended double-precision floating-point
611 value `a'.
612 -------------------------------------------------------------------------------
613 */
614 INLINE bits64 extractFloatx80Frac( floatx80 a )
615 {
616 
617     return a.low;
618 
619 }
620 
621 /*
622 -------------------------------------------------------------------------------
623 Returns the exponent bits of the extended double-precision floating-point
624 value `a'.
625 -------------------------------------------------------------------------------
626 */
627 INLINE int32 extractFloatx80Exp( floatx80 a )
628 {
629 
630     return a.high & 0x7FFF;
631 
632 }
633 
634 /*
635 -------------------------------------------------------------------------------
636 Returns the sign bit of the extended double-precision floating-point value
637 `a'.
638 -------------------------------------------------------------------------------
639 */
640 INLINE flag extractFloatx80Sign( floatx80 a )
641 {
642 
643     return a.high>>15;
644 
645 }
646 
647 /*
648 -------------------------------------------------------------------------------
649 Normalizes the subnormal extended double-precision floating-point value
650 represented by the denormalized significand `aSig'.  The normalized exponent
651 and significand are stored at the locations pointed to by `zExpPtr' and
652 `zSigPtr', respectively.
653 -------------------------------------------------------------------------------
654 */
655 static void
656  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
657 {
658     int8 shiftCount;
659 
660     shiftCount = countLeadingZeros64( aSig );
661     *zSigPtr = aSig<<shiftCount;
662     *zExpPtr = 1 - shiftCount;
663 
664 }
665 
666 /*
667 -------------------------------------------------------------------------------
668 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
669 extended double-precision floating-point value, returning the result.
670 -------------------------------------------------------------------------------
671 */
672 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
673 {
674     floatx80 z;
675 
676     z.low = zSig;
677     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
678     return z;
679 
680 }
681 
682 /*
683 -------------------------------------------------------------------------------
684 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
685 and extended significand formed by the concatenation of `zSig0' and `zSig1',
686 and returns the proper extended double-precision floating-point value
687 corresponding to the abstract input.  Ordinarily, the abstract value is
688 rounded and packed into the extended double-precision format, with the
689 inexact exception raised if the abstract input cannot be represented
690 exactly.  However, if the abstract value is too large, the overflow and
691 inexact exceptions are raised and an infinity or maximal finite value is
692 returned.  If the abstract value is too small, the input value is rounded to
693 a subnormal number, and the underflow and inexact exceptions are raised if
694 the abstract input cannot be represented exactly as a subnormal extended
695 double-precision floating-point number.
696     If `roundingPrecision' is 32 or 64, the result is rounded to the same
697 number of bits as single or double precision, respectively.  Otherwise, the
698 result is rounded to the full precision of the extended double-precision
699 format.
700     The input significand must be normalized or smaller.  If the input
701 significand is not normalized, `zExp' must be 0; in that case, the result
702 returned is a subnormal number, and it must not require rounding.  The
703 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
704 Floating-Point Arithmetic.
705 -------------------------------------------------------------------------------
706 */
707 static floatx80
708  roundAndPackFloatx80(
709      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
710  )
711 {
712     int8 roundingMode;
713     flag roundNearestEven, increment, isTiny;
714     int64 roundIncrement, roundMask, roundBits;
715 
716     roundingMode = float_rounding_mode();
717     roundNearestEven = ( roundingMode == float_round_nearest_even );
718     if ( roundingPrecision == 80 ) goto precision80;
719     if ( roundingPrecision == 64 ) {
720         roundIncrement = LIT64( 0x0000000000000400 );
721         roundMask = LIT64( 0x00000000000007FF );
722     }
723     else if ( roundingPrecision == 32 ) {
724         roundIncrement = LIT64( 0x0000008000000000 );
725         roundMask = LIT64( 0x000000FFFFFFFFFF );
726     }
727     else {
728         goto precision80;
729     }
730     zSig0 |= ( zSig1 != 0 );
731     if ( ! roundNearestEven ) {
732         if ( roundingMode == float_round_to_zero ) {
733             roundIncrement = 0;
734         }
735         else {
736             roundIncrement = roundMask;
737             if ( zSign ) {
738                 if ( roundingMode == float_round_up ) roundIncrement = 0;
739             }
740             else {
741                 if ( roundingMode == float_round_down ) roundIncrement = 0;
742             }
743         }
744     }
745     roundBits = zSig0 & roundMask;
746     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
747         if (    ( 0x7FFE < zExp )
748              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
749            ) {
750             goto overflow;
751         }
752         if ( zExp <= 0 ) {
753             isTiny =
754                    ( float_detect_tininess == float_tininess_before_rounding )
755                 || ( zExp < 0 )
756                 || ( zSig0 <= zSig0 + roundIncrement );
757             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
758             zExp = 0;
759             roundBits = zSig0 & roundMask;
760             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
761             if ( roundBits ) float_set_inexact();
762             zSig0 += roundIncrement;
763             if ( (sbits64) zSig0 < 0 ) zExp = 1;
764             roundIncrement = roundMask + 1;
765             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
766                 roundMask |= roundIncrement;
767             }
768             zSig0 &= ~ roundMask;
769             return packFloatx80( zSign, zExp, zSig0 );
770         }
771     }
772     if ( roundBits ) float_set_inexact();
773     zSig0 += roundIncrement;
774     if ( zSig0 < roundIncrement ) {
775         ++zExp;
776         zSig0 = LIT64( 0x8000000000000000 );
777     }
778     roundIncrement = roundMask + 1;
779     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
780         roundMask |= roundIncrement;
781     }
782     zSig0 &= ~ roundMask;
783     if ( zSig0 == 0 ) zExp = 0;
784     return packFloatx80( zSign, zExp, zSig0 );
785  precision80:
786     increment = ( (sbits64) zSig1 < 0 );
787     if ( ! roundNearestEven ) {
788         if ( roundingMode == float_round_to_zero ) {
789             increment = 0;
790         }
791         else {
792             if ( zSign ) {
793                 increment = ( roundingMode == float_round_down ) && zSig1;
794             }
795             else {
796                 increment = ( roundingMode == float_round_up ) && zSig1;
797             }
798         }
799     }
800     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
801         if (    ( 0x7FFE < zExp )
802              || (    ( zExp == 0x7FFE )
803                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
804                   && increment
805                 )
806            ) {
807             roundMask = 0;
808  overflow:
809             float_raise( float_flag_overflow | float_flag_inexact );
810             if (    ( roundingMode == float_round_to_zero )
811                  || ( zSign && ( roundingMode == float_round_up ) )
812                  || ( ! zSign && ( roundingMode == float_round_down ) )
813                ) {
814                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
815             }
816             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
817         }
818         if ( zExp <= 0 ) {
819             isTiny =
820                    ( float_detect_tininess == float_tininess_before_rounding )
821                 || ( zExp < 0 )
822                 || ! increment
823                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
824             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
825             zExp = 0;
826             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
827             if ( zSig1 ) float_set_inexact();
828             if ( roundNearestEven ) {
829                 increment = ( (sbits64) zSig1 < 0 );
830             }
831             else {
832                 if ( zSign ) {
833                     increment = ( roundingMode == float_round_down ) && zSig1;
834                 }
835                 else {
836                     increment = ( roundingMode == float_round_up ) && zSig1;
837                 }
838             }
839             if ( increment ) {
840                 ++zSig0;
841                 zSig0 &=
842                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
843                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
844             }
845             return packFloatx80( zSign, zExp, zSig0 );
846         }
847     }
848     if ( zSig1 ) float_set_inexact();
849     if ( increment ) {
850         ++zSig0;
851         if ( zSig0 == 0 ) {
852             ++zExp;
853             zSig0 = LIT64( 0x8000000000000000 );
854         }
855         else {
856             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
857         }
858     }
859     else {
860         if ( zSig0 == 0 ) zExp = 0;
861     }
862     return packFloatx80( zSign, zExp, zSig0 );
863 
864 }
865 
866 /*
867 -------------------------------------------------------------------------------
868 Takes an abstract floating-point value having sign `zSign', exponent
869 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
870 and returns the proper extended double-precision floating-point value
871 corresponding to the abstract input.  This routine is just like
872 `roundAndPackFloatx80' except that the input significand does not have to be
873 normalized.
874 -------------------------------------------------------------------------------
875 */
876 static floatx80
877  normalizeRoundAndPackFloatx80(
878      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
879  )
880 {
881     int8 shiftCount;
882 
883     if ( zSig0 == 0 ) {
884         zSig0 = zSig1;
885         zSig1 = 0;
886         zExp -= 64;
887     }
888     shiftCount = countLeadingZeros64( zSig0 );
889     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
890     zExp -= shiftCount;
891     return
892         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
893 
894 }
895 
896 #endif
897 
898 #ifdef FLOAT128
899 
900 /*
901 -------------------------------------------------------------------------------
902 Returns the least-significant 64 fraction bits of the quadruple-precision
903 floating-point value `a'.
904 -------------------------------------------------------------------------------
905 */
906 INLINE bits64 extractFloat128Frac1( float128 a )
907 {
908 
909     return a.low;
910 
911 }
912 
913 /*
914 -------------------------------------------------------------------------------
915 Returns the most-significant 48 fraction bits of the quadruple-precision
916 floating-point value `a'.
917 -------------------------------------------------------------------------------
918 */
919 INLINE bits64 extractFloat128Frac0( float128 a )
920 {
921 
922     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
923 
924 }
925 
926 /*
927 -------------------------------------------------------------------------------
928 Returns the exponent bits of the quadruple-precision floating-point value
929 `a'.
930 -------------------------------------------------------------------------------
931 */
932 INLINE int32 extractFloat128Exp( float128 a )
933 {
934 
935     return ( a.high>>48 ) & 0x7FFF;
936 
937 }
938 
939 /*
940 -------------------------------------------------------------------------------
941 Returns the sign bit of the quadruple-precision floating-point value `a'.
942 -------------------------------------------------------------------------------
943 */
944 INLINE flag extractFloat128Sign( float128 a )
945 {
946 
947     return a.high>>63;
948 
949 }
950 
951 /*
952 -------------------------------------------------------------------------------
953 Normalizes the subnormal quadruple-precision floating-point value
954 represented by the denormalized significand formed by the concatenation of
955 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
956 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
957 significand are stored at the location pointed to by `zSig0Ptr', and the
958 least significant 64 bits of the normalized significand are stored at the
959 location pointed to by `zSig1Ptr'.
960 -------------------------------------------------------------------------------
961 */
962 static void
963  normalizeFloat128Subnormal(
964      bits64 aSig0,
965      bits64 aSig1,
966      int32 *zExpPtr,
967      bits64 *zSig0Ptr,
968      bits64 *zSig1Ptr
969  )
970 {
971     int8 shiftCount;
972 
973     if ( aSig0 == 0 ) {
974         shiftCount = countLeadingZeros64( aSig1 ) - 15;
975         if ( shiftCount < 0 ) {
976             *zSig0Ptr = aSig1>>( - shiftCount );
977             *zSig1Ptr = aSig1<<( shiftCount & 63 );
978         }
979         else {
980             *zSig0Ptr = aSig1<<shiftCount;
981             *zSig1Ptr = 0;
982         }
983         *zExpPtr = - shiftCount - 63;
984     }
985     else {
986         shiftCount = countLeadingZeros64( aSig0 ) - 15;
987         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
988         *zExpPtr = 1 - shiftCount;
989     }
990 
991 }
992 
993 /*
994 -------------------------------------------------------------------------------
995 Packs the sign `zSign', the exponent `zExp', and the significand formed
996 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
997 floating-point value, returning the result.  After being shifted into the
998 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
999 added together to form the most significant 32 bits of the result.  This
1000 means that any integer portion of `zSig0' will be added into the exponent.
1001 Since a properly normalized significand will have an integer portion equal
1002 to 1, the `zExp' input should be 1 less than the desired result exponent
1003 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1004 significand.
1005 -------------------------------------------------------------------------------
1006 */
1007 INLINE float128
1008  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1009 {
1010     float128 z;
1011 
1012     z.low = zSig1;
1013     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
1014     return z;
1015 
1016 }
1017 
1018 /*
1019 -------------------------------------------------------------------------------
1020 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1021 and extended significand formed by the concatenation of `zSig0', `zSig1',
1022 and `zSig2', and returns the proper quadruple-precision floating-point value
1023 corresponding to the abstract input.  Ordinarily, the abstract value is
1024 simply rounded and packed into the quadruple-precision format, with the
1025 inexact exception raised if the abstract input cannot be represented
1026 exactly.  However, if the abstract value is too large, the overflow and
1027 inexact exceptions are raised and an infinity or maximal finite value is
1028 returned.  If the abstract value is too small, the input value is rounded to
1029 a subnormal number, and the underflow and inexact exceptions are raised if
1030 the abstract input cannot be represented exactly as a subnormal quadruple-
1031 precision floating-point number.
1032     The input significand must be normalized or smaller.  If the input
1033 significand is not normalized, `zExp' must be 0; in that case, the result
1034 returned is a subnormal number, and it must not require rounding.  In the
1035 usual case that the input significand is normalized, `zExp' must be 1 less
1036 than the ``true'' floating-point exponent.  The handling of underflow and
1037 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1038 -------------------------------------------------------------------------------
1039 */
1040 static float128
1041  roundAndPackFloat128(
1042      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
1043 {
1044     int8 roundingMode;
1045     flag roundNearestEven, increment, isTiny;
1046 
1047     roundingMode = float_rounding_mode();
1048     roundNearestEven = ( roundingMode == float_round_nearest_even );
1049     increment = ( (sbits64) zSig2 < 0 );
1050     if ( ! roundNearestEven ) {
1051         if ( roundingMode == float_round_to_zero ) {
1052             increment = 0;
1053         }
1054         else {
1055             if ( zSign ) {
1056                 increment = ( roundingMode == float_round_down ) && zSig2;
1057             }
1058             else {
1059                 increment = ( roundingMode == float_round_up ) && zSig2;
1060             }
1061         }
1062     }
1063     if ( 0x7FFD <= (bits32) zExp ) {
1064         if (    ( 0x7FFD < zExp )
1065              || (    ( zExp == 0x7FFD )
1066                   && eq128(
1067                          LIT64( 0x0001FFFFFFFFFFFF ),
1068                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1069                          zSig0,
1070                          zSig1
1071                      )
1072                   && increment
1073                 )
1074            ) {
1075             float_raise( float_flag_overflow | float_flag_inexact );
1076             if (    ( roundingMode == float_round_to_zero )
1077                  || ( zSign && ( roundingMode == float_round_up ) )
1078                  || ( ! zSign && ( roundingMode == float_round_down ) )
1079                ) {
1080                 return
1081                     packFloat128(
1082                         zSign,
1083                         0x7FFE,
1084                         LIT64( 0x0000FFFFFFFFFFFF ),
1085                         LIT64( 0xFFFFFFFFFFFFFFFF )
1086                     );
1087             }
1088             return packFloat128( zSign, 0x7FFF, 0, 0 );
1089         }
1090         if ( zExp < 0 ) {
1091             isTiny =
1092                    ( float_detect_tininess == float_tininess_before_rounding )
1093                 || ( zExp < -1 )
1094                 || ! increment
1095                 || lt128(
1096                        zSig0,
1097                        zSig1,
1098                        LIT64( 0x0001FFFFFFFFFFFF ),
1099                        LIT64( 0xFFFFFFFFFFFFFFFF )
1100                    );
1101             shift128ExtraRightJamming(
1102                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1103             zExp = 0;
1104             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1105             if ( roundNearestEven ) {
1106                 increment = ( (sbits64) zSig2 < 0 );
1107             }
1108             else {
1109                 if ( zSign ) {
1110                     increment = ( roundingMode == float_round_down ) && zSig2;
1111                 }
1112                 else {
1113                     increment = ( roundingMode == float_round_up ) && zSig2;
1114                 }
1115             }
1116         }
1117     }
1118     if ( zSig2 ) float_set_inexact();
1119     if ( increment ) {
1120         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1121         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1122     }
1123     else {
1124         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1125     }
1126     return packFloat128( zSign, zExp, zSig0, zSig1 );
1127 
1128 }
1129 
1130 /*
1131 -------------------------------------------------------------------------------
1132 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1133 and significand formed by the concatenation of `zSig0' and `zSig1', and
1134 returns the proper quadruple-precision floating-point value corresponding
1135 to the abstract input.  This routine is just like `roundAndPackFloat128'
1136 except that the input significand has fewer bits and does not have to be
1137 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1138 point exponent.
1139 -------------------------------------------------------------------------------
1140 */
1141 static float128
1142  normalizeRoundAndPackFloat128(
1143      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1144 {
1145     int8 shiftCount;
1146     bits64 zSig2;
1147 
1148     if ( zSig0 == 0 ) {
1149         zSig0 = zSig1;
1150         zSig1 = 0;
1151         zExp -= 64;
1152     }
1153     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1154     if ( 0 <= shiftCount ) {
1155         zSig2 = 0;
1156         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1157     }
1158     else {
1159         shift128ExtraRightJamming(
1160             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1161     }
1162     zExp -= shiftCount;
1163     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1164 
1165 }
1166 
1167 #endif
1168 
1169 /*
1170 -------------------------------------------------------------------------------
1171 Returns the result of converting the 32-bit two's complement integer `a'
1172 to the single-precision floating-point format.  The conversion is performed
1173 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1174 -------------------------------------------------------------------------------
1175 */
1176 float32 int32_to_float32( int32 a )
1177 {
1178     flag zSign;
1179 
1180     if ( a == 0 ) return 0;
1181     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1182     zSign = ( a < 0 );
1183     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1184 
1185 }
1186 
1187 /*
1188 -------------------------------------------------------------------------------
1189 Returns the result of converting the 32-bit two's complement integer `a'
1190 to the double-precision floating-point format.  The conversion is performed
1191 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1192 -------------------------------------------------------------------------------
1193 */
1194 float64 int32_to_float64( int32 a )
1195 {
1196     flag zSign;
1197     uint32 absA;
1198     int8 shiftCount;
1199     bits64 zSig;
1200 
1201     if ( a == 0 ) return 0;
1202     zSign = ( a < 0 );
1203     absA = zSign ? - a : a;
1204     shiftCount = countLeadingZeros32( absA ) + 21;
1205     zSig = absA;
1206     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1207 
1208 }
1209 
1210 #ifdef FLOATX80
1211 
1212 /*
1213 -------------------------------------------------------------------------------
1214 Returns the result of converting the 32-bit two's complement integer `a'
1215 to the extended double-precision floating-point format.  The conversion
1216 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1217 Arithmetic.
1218 -------------------------------------------------------------------------------
1219 */
1220 floatx80 int32_to_floatx80( int32 a )
1221 {
1222     flag zSign;
1223     uint32 absA;
1224     int8 shiftCount;
1225     bits64 zSig;
1226 
1227     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1228     zSign = ( a < 0 );
1229     absA = zSign ? - a : a;
1230     shiftCount = countLeadingZeros32( absA ) + 32;
1231     zSig = absA;
1232     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1233 
1234 }
1235 
1236 #endif
1237 
1238 #ifdef FLOAT128
1239 
1240 /*
1241 -------------------------------------------------------------------------------
1242 Returns the result of converting the 32-bit two's complement integer `a' to
1243 the quadruple-precision floating-point format.  The conversion is performed
1244 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1245 -------------------------------------------------------------------------------
1246 */
1247 float128 int32_to_float128( int32 a )
1248 {
1249     flag zSign;
1250     uint32 absA;
1251     int8 shiftCount;
1252     bits64 zSig0;
1253 
1254     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1255     zSign = ( a < 0 );
1256     absA = zSign ? - a : a;
1257     shiftCount = countLeadingZeros32( absA ) + 17;
1258     zSig0 = absA;
1259     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1260 
1261 }
1262 
1263 #endif
1264 
1265 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1266 /*
1267 -------------------------------------------------------------------------------
1268 Returns the result of converting the 64-bit two's complement integer `a'
1269 to the single-precision floating-point format.  The conversion is performed
1270 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1271 -------------------------------------------------------------------------------
1272 */
1273 float32 int64_to_float32( int64 a )
1274 {
1275     flag zSign;
1276     uint64 absA;
1277     int8 shiftCount;
1278 
1279     if ( a == 0 ) return 0;
1280     zSign = ( a < 0 );
1281     absA = zSign ? - a : a;
1282     shiftCount = countLeadingZeros64( absA ) - 40;
1283     if ( 0 <= shiftCount ) {
1284         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1285     }
1286     else {
1287         shiftCount += 7;
1288         if ( shiftCount < 0 ) {
1289             shift64RightJamming( absA, - shiftCount, &absA );
1290         }
1291         else {
1292             absA <<= shiftCount;
1293         }
1294         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1295     }
1296 
1297 }
1298 
1299 /*
1300 -------------------------------------------------------------------------------
1301 Returns the result of converting the 64-bit two's complement integer `a'
1302 to the double-precision floating-point format.  The conversion is performed
1303 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1304 -------------------------------------------------------------------------------
1305 */
1306 float64 int64_to_float64( int64 a )
1307 {
1308     flag zSign;
1309 
1310     if ( a == 0 ) return 0;
1311     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1312         return packFloat64( 1, 0x43E, 0 );
1313     }
1314     zSign = ( a < 0 );
1315     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1316 
1317 }
1318 
1319 #ifdef FLOATX80
1320 
1321 /*
1322 -------------------------------------------------------------------------------
1323 Returns the result of converting the 64-bit two's complement integer `a'
1324 to the extended double-precision floating-point format.  The conversion
1325 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1326 Arithmetic.
1327 -------------------------------------------------------------------------------
1328 */
1329 floatx80 int64_to_floatx80( int64 a )
1330 {
1331     flag zSign;
1332     uint64 absA;
1333     int8 shiftCount;
1334 
1335     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1336     zSign = ( a < 0 );
1337     absA = zSign ? - a : a;
1338     shiftCount = countLeadingZeros64( absA );
1339     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1340 
1341 }
1342 
1343 #endif
1344 
1345 #ifdef FLOAT128
1346 
1347 /*
1348 -------------------------------------------------------------------------------
1349 Returns the result of converting the 64-bit two's complement integer `a' to
1350 the quadruple-precision floating-point format.  The conversion is performed
1351 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1352 -------------------------------------------------------------------------------
1353 */
1354 float128 int64_to_float128( int64 a )
1355 {
1356     flag zSign;
1357     uint64 absA;
1358     int8 shiftCount;
1359     int32 zExp;
1360     bits64 zSig0, zSig1;
1361 
1362     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1363     zSign = ( a < 0 );
1364     absA = zSign ? - a : a;
1365     shiftCount = countLeadingZeros64( absA ) + 49;
1366     zExp = 0x406E - shiftCount;
1367     if ( 64 <= shiftCount ) {
1368         zSig1 = 0;
1369         zSig0 = absA;
1370         shiftCount -= 64;
1371     }
1372     else {
1373         zSig1 = absA;
1374         zSig0 = 0;
1375     }
1376     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1377     return packFloat128( zSign, zExp, zSig0, zSig1 );
1378 
1379 }
1380 
1381 #endif
1382 #endif /* !SOFTFLOAT_FOR_GCC */
1383 
1384 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1385 /*
1386 -------------------------------------------------------------------------------
1387 Returns the result of converting the single-precision floating-point value
1388 `a' to the 32-bit two's complement integer format.  The conversion is
1389 performed according to the IEC/IEEE Standard for Binary Floating-Point
1390 Arithmetic---which means in particular that the conversion is rounded
1391 according to the current rounding mode.  If `a' is a NaN, the largest
1392 positive integer is returned.  Otherwise, if the conversion overflows, the
1393 largest integer with the same sign as `a' is returned.
1394 -------------------------------------------------------------------------------
1395 */
1396 int32 float32_to_int32( float32 a )
1397 {
1398     flag aSign;
1399     int16 aExp, shiftCount;
1400     bits32 aSig;
1401     bits64 aSig64;
1402 
1403     aSig = extractFloat32Frac( a );
1404     aExp = extractFloat32Exp( a );
1405     aSign = extractFloat32Sign( a );
1406     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1407     if ( aExp ) aSig |= 0x00800000;
1408     shiftCount = 0xAF - aExp;
1409     aSig64 = aSig;
1410     aSig64 <<= 32;
1411     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1412     return roundAndPackInt32( aSign, aSig64 );
1413 
1414 }
1415 #endif /* !SOFTFLOAT_FOR_GCC */
1416 
1417 /*
1418 -------------------------------------------------------------------------------
1419 Returns the result of converting the single-precision floating-point value
1420 `a' to the 32-bit two's complement integer format.  The conversion is
1421 performed according to the IEC/IEEE Standard for Binary Floating-Point
1422 Arithmetic, except that the conversion is always rounded toward zero.
1423 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1424 the conversion overflows, the largest integer with the same sign as `a' is
1425 returned.
1426 -------------------------------------------------------------------------------
1427 */
1428 int32 float32_to_int32_round_to_zero( float32 a )
1429 {
1430     flag aSign;
1431     int16 aExp, shiftCount;
1432     bits32 aSig;
1433     int32 z;
1434 
1435     aSig = extractFloat32Frac( a );
1436     aExp = extractFloat32Exp( a );
1437     aSign = extractFloat32Sign( a );
1438     shiftCount = aExp - 0x9E;
1439     if ( 0 <= shiftCount ) {
1440         if ( a != 0xCF000000 ) {
1441             float_raise( float_flag_invalid );
1442             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1443         }
1444         return (sbits32) 0x80000000;
1445     }
1446     else if ( aExp <= 0x7E ) {
1447         if ( aExp | aSig ) float_set_inexact();
1448         return 0;
1449     }
1450     aSig = ( aSig | 0x00800000 )<<8;
1451     z = aSig>>( - shiftCount );
1452     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1453         float_set_inexact();
1454     }
1455     if ( aSign ) z = - z;
1456     return z;
1457 
1458 }
1459 
1460 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1461 /*
1462 -------------------------------------------------------------------------------
1463 Returns the result of converting the single-precision floating-point value
1464 `a' to the 64-bit two's complement integer format.  The conversion is
1465 performed according to the IEC/IEEE Standard for Binary Floating-Point
1466 Arithmetic---which means in particular that the conversion is rounded
1467 according to the current rounding mode.  If `a' is a NaN, the largest
1468 positive integer is returned.  Otherwise, if the conversion overflows, the
1469 largest integer with the same sign as `a' is returned.
1470 -------------------------------------------------------------------------------
1471 */
1472 int64 float32_to_int64( float32 a )
1473 {
1474     flag aSign;
1475     int16 aExp, shiftCount;
1476     bits32 aSig;
1477     bits64 aSig64, aSigExtra;
1478 
1479     aSig = extractFloat32Frac( a );
1480     aExp = extractFloat32Exp( a );
1481     aSign = extractFloat32Sign( a );
1482     shiftCount = 0xBE - aExp;
1483     if ( shiftCount < 0 ) {
1484         float_raise( float_flag_invalid );
1485         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1486             return LIT64( 0x7FFFFFFFFFFFFFFF );
1487         }
1488         return (sbits64) LIT64( 0x8000000000000000 );
1489     }
1490     if ( aExp ) aSig |= 0x00800000;
1491     aSig64 = aSig;
1492     aSig64 <<= 40;
1493     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1494     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1495 
1496 }
1497 
1498 /*
1499 -------------------------------------------------------------------------------
1500 Returns the result of converting the single-precision floating-point value
1501 `a' to the 64-bit two's complement integer format.  The conversion is
1502 performed according to the IEC/IEEE Standard for Binary Floating-Point
1503 Arithmetic, except that the conversion is always rounded toward zero.  If
1504 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1505 conversion overflows, the largest integer with the same sign as `a' is
1506 returned.
1507 -------------------------------------------------------------------------------
1508 */
1509 int64 float32_to_int64_round_to_zero( float32 a )
1510 {
1511     flag aSign;
1512     int16 aExp, shiftCount;
1513     bits32 aSig;
1514     bits64 aSig64;
1515     int64 z;
1516 
1517     aSig = extractFloat32Frac( a );
1518     aExp = extractFloat32Exp( a );
1519     aSign = extractFloat32Sign( a );
1520     shiftCount = aExp - 0xBE;
1521     if ( 0 <= shiftCount ) {
1522         if ( a != 0xDF000000 ) {
1523             float_raise( float_flag_invalid );
1524             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1525                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1526             }
1527         }
1528         return (sbits64) LIT64( 0x8000000000000000 );
1529     }
1530     else if ( aExp <= 0x7E ) {
1531         if ( aExp | aSig ) float_set_inexact();
1532         return 0;
1533     }
1534     aSig64 = aSig | 0x00800000;
1535     aSig64 <<= 40;
1536     z = aSig64>>( - shiftCount );
1537     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1538         float_set_inexact();
1539     }
1540     if ( aSign ) z = - z;
1541     return z;
1542 
1543 }
1544 #endif /* !SOFTFLOAT_FOR_GCC */
1545 
1546 /*
1547 -------------------------------------------------------------------------------
1548 Returns the result of converting the single-precision floating-point value
1549 `a' to the double-precision floating-point format.  The conversion is
1550 performed according to the IEC/IEEE Standard for Binary Floating-Point
1551 Arithmetic.
1552 -------------------------------------------------------------------------------
1553 */
1554 float64 float32_to_float64( float32 a )
1555 {
1556     flag aSign;
1557     int16 aExp;
1558     bits32 aSig;
1559 
1560     aSig = extractFloat32Frac( a );
1561     aExp = extractFloat32Exp( a );
1562     aSign = extractFloat32Sign( a );
1563     if ( aExp == 0xFF ) {
1564         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1565         return packFloat64( aSign, 0x7FF, 0 );
1566     }
1567     if ( aExp == 0 ) {
1568         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1569         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1570         --aExp;
1571     }
1572     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1573 
1574 }
1575 
1576 #ifdef FLOATX80
1577 
1578 /*
1579 -------------------------------------------------------------------------------
1580 Returns the result of converting the single-precision floating-point value
1581 `a' to the extended double-precision floating-point format.  The conversion
1582 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1583 Arithmetic.
1584 -------------------------------------------------------------------------------
1585 */
1586 floatx80 float32_to_floatx80( float32 a )
1587 {
1588     flag aSign;
1589     int16 aExp;
1590     bits32 aSig;
1591 
1592     aSig = extractFloat32Frac( a );
1593     aExp = extractFloat32Exp( a );
1594     aSign = extractFloat32Sign( a );
1595     if ( aExp == 0xFF ) {
1596         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1597         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1598     }
1599     if ( aExp == 0 ) {
1600         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1601         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1602     }
1603     aSig |= 0x00800000;
1604     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1605 
1606 }
1607 
1608 #endif
1609 
1610 #ifdef FLOAT128
1611 
1612 /*
1613 -------------------------------------------------------------------------------
1614 Returns the result of converting the single-precision floating-point value
1615 `a' to the double-precision floating-point format.  The conversion is
1616 performed according to the IEC/IEEE Standard for Binary Floating-Point
1617 Arithmetic.
1618 -------------------------------------------------------------------------------
1619 */
1620 float128 float32_to_float128( float32 a )
1621 {
1622     flag aSign;
1623     int16 aExp;
1624     bits32 aSig;
1625 
1626     aSig = extractFloat32Frac( a );
1627     aExp = extractFloat32Exp( a );
1628     aSign = extractFloat32Sign( a );
1629     if ( aExp == 0xFF ) {
1630         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1631         return packFloat128( aSign, 0x7FFF, 0, 0 );
1632     }
1633     if ( aExp == 0 ) {
1634         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1635         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1636         --aExp;
1637     }
1638     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1639 
1640 }
1641 
1642 #endif
1643 
1644 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1645 /*
1646 -------------------------------------------------------------------------------
1647 Rounds the single-precision floating-point value `a' to an integer, and
1648 returns the result as a single-precision floating-point value.  The
1649 operation is performed according to the IEC/IEEE Standard for Binary
1650 Floating-Point Arithmetic.
1651 -------------------------------------------------------------------------------
1652 */
1653 float32 float32_round_to_int( float32 a )
1654 {
1655     flag aSign;
1656     int16 aExp;
1657     bits32 lastBitMask, roundBitsMask;
1658     int8 roundingMode;
1659     float32 z;
1660 
1661     aExp = extractFloat32Exp( a );
1662     if ( 0x96 <= aExp ) {
1663         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1664             return propagateFloat32NaN( a, a );
1665         }
1666         return a;
1667     }
1668     if ( aExp <= 0x7E ) {
1669         if ( (bits32) ( a<<1 ) == 0 ) return a;
1670         float_set_inexact();
1671         aSign = extractFloat32Sign( a );
1672         switch ( float_rounding_mode() ) {
1673          case float_round_nearest_even:
1674             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1675                 return packFloat32( aSign, 0x7F, 0 );
1676             }
1677             break;
1678          case float_round_down:
1679             return aSign ? 0xBF800000 : 0;
1680          case float_round_up:
1681             return aSign ? 0x80000000 : 0x3F800000;
1682         }
1683         return packFloat32( aSign, 0, 0 );
1684     }
1685     lastBitMask = 1;
1686     lastBitMask <<= 0x96 - aExp;
1687     roundBitsMask = lastBitMask - 1;
1688     z = a;
1689     roundingMode = float_rounding_mode();
1690     if ( roundingMode == float_round_nearest_even ) {
1691         z += lastBitMask>>1;
1692         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1693     }
1694     else if ( roundingMode != float_round_to_zero ) {
1695         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1696             z += roundBitsMask;
1697         }
1698     }
1699     z &= ~ roundBitsMask;
1700     if ( z != a ) float_set_inexact();
1701     return z;
1702 
1703 }
1704 #endif /* !SOFTFLOAT_FOR_GCC */
1705 
1706 /*
1707 -------------------------------------------------------------------------------
1708 Returns the result of adding the absolute values of the single-precision
1709 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1710 before being returned.  `zSign' is ignored if the result is a NaN.
1711 The addition is performed according to the IEC/IEEE Standard for Binary
1712 Floating-Point Arithmetic.
1713 -------------------------------------------------------------------------------
1714 */
1715 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1716 {
1717     int16 aExp, bExp, zExp;
1718     bits32 aSig, bSig, zSig;
1719     int16 expDiff;
1720 
1721     aSig = extractFloat32Frac( a );
1722     aExp = extractFloat32Exp( a );
1723     bSig = extractFloat32Frac( b );
1724     bExp = extractFloat32Exp( b );
1725     expDiff = aExp - bExp;
1726     aSig <<= 6;
1727     bSig <<= 6;
1728     if ( 0 < expDiff ) {
1729         if ( aExp == 0xFF ) {
1730             if ( aSig ) return propagateFloat32NaN( a, b );
1731             return a;
1732         }
1733         if ( bExp == 0 ) {
1734             --expDiff;
1735         }
1736         else {
1737             bSig |= 0x20000000;
1738         }
1739         shift32RightJamming( bSig, expDiff, &bSig );
1740         zExp = aExp;
1741     }
1742     else if ( expDiff < 0 ) {
1743         if ( bExp == 0xFF ) {
1744             if ( bSig ) return propagateFloat32NaN( a, b );
1745             return packFloat32( zSign, 0xFF, 0 );
1746         }
1747         if ( aExp == 0 ) {
1748             ++expDiff;
1749         }
1750         else {
1751             aSig |= 0x20000000;
1752         }
1753         shift32RightJamming( aSig, - expDiff, &aSig );
1754         zExp = bExp;
1755     }
1756     else {
1757         if ( aExp == 0xFF ) {
1758             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1759             return a;
1760         }
1761         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1762         zSig = 0x40000000 + aSig + bSig;
1763         zExp = aExp;
1764         goto roundAndPack;
1765     }
1766     aSig |= 0x20000000;
1767     zSig = ( aSig + bSig )<<1;
1768     --zExp;
1769     if ( (sbits32) zSig < 0 ) {
1770         zSig = aSig + bSig;
1771         ++zExp;
1772     }
1773  roundAndPack:
1774     return roundAndPackFloat32( zSign, zExp, zSig );
1775 
1776 }
1777 
1778 /*
1779 -------------------------------------------------------------------------------
1780 Returns the result of subtracting the absolute values of the single-
1781 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1782 difference is negated before being returned.  `zSign' is ignored if the
1783 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1784 Standard for Binary Floating-Point Arithmetic.
1785 -------------------------------------------------------------------------------
1786 */
1787 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1788 {
1789     int16 aExp, bExp, zExp;
1790     bits32 aSig, bSig, zSig;
1791     int16 expDiff;
1792 
1793     aSig = extractFloat32Frac( a );
1794     aExp = extractFloat32Exp( a );
1795     bSig = extractFloat32Frac( b );
1796     bExp = extractFloat32Exp( b );
1797     expDiff = aExp - bExp;
1798     aSig <<= 7;
1799     bSig <<= 7;
1800     if ( 0 < expDiff ) goto aExpBigger;
1801     if ( expDiff < 0 ) goto bExpBigger;
1802     if ( aExp == 0xFF ) {
1803         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1804         float_raise( float_flag_invalid );
1805         return float32_default_nan;
1806     }
1807     if ( aExp == 0 ) {
1808         aExp = 1;
1809         bExp = 1;
1810     }
1811     if ( bSig < aSig ) goto aBigger;
1812     if ( aSig < bSig ) goto bBigger;
1813     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1814  bExpBigger:
1815     if ( bExp == 0xFF ) {
1816         if ( bSig ) return propagateFloat32NaN( a, b );
1817         return packFloat32( zSign ^ 1, 0xFF, 0 );
1818     }
1819     if ( aExp == 0 ) {
1820         ++expDiff;
1821     }
1822     else {
1823         aSig |= 0x40000000;
1824     }
1825     shift32RightJamming( aSig, - expDiff, &aSig );
1826     bSig |= 0x40000000;
1827  bBigger:
1828     zSig = bSig - aSig;
1829     zExp = bExp;
1830     zSign ^= 1;
1831     goto normalizeRoundAndPack;
1832  aExpBigger:
1833     if ( aExp == 0xFF ) {
1834         if ( aSig ) return propagateFloat32NaN( a, b );
1835         return a;
1836     }
1837     if ( bExp == 0 ) {
1838         --expDiff;
1839     }
1840     else {
1841         bSig |= 0x40000000;
1842     }
1843     shift32RightJamming( bSig, expDiff, &bSig );
1844     aSig |= 0x40000000;
1845  aBigger:
1846     zSig = aSig - bSig;
1847     zExp = aExp;
1848  normalizeRoundAndPack:
1849     --zExp;
1850     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1851 
1852 }
1853 
1854 /*
1855 -------------------------------------------------------------------------------
1856 Returns the result of adding the single-precision floating-point values `a'
1857 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1858 Binary Floating-Point Arithmetic.
1859 -------------------------------------------------------------------------------
1860 */
1861 float32 float32_add( float32 a, float32 b )
1862 {
1863     flag aSign, bSign;
1864 
1865     aSign = extractFloat32Sign( a );
1866     bSign = extractFloat32Sign( b );
1867     if ( aSign == bSign ) {
1868         return addFloat32Sigs( a, b, aSign );
1869     }
1870     else {
1871         return subFloat32Sigs( a, b, aSign );
1872     }
1873 
1874 }
1875 
1876 /*
1877 -------------------------------------------------------------------------------
1878 Returns the result of subtracting the single-precision floating-point values
1879 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1880 for Binary Floating-Point Arithmetic.
1881 -------------------------------------------------------------------------------
1882 */
1883 float32 float32_sub( float32 a, float32 b )
1884 {
1885     flag aSign, bSign;
1886 
1887     aSign = extractFloat32Sign( a );
1888     bSign = extractFloat32Sign( b );
1889     if ( aSign == bSign ) {
1890         return subFloat32Sigs( a, b, aSign );
1891     }
1892     else {
1893         return addFloat32Sigs( a, b, aSign );
1894     }
1895 
1896 }
1897 
1898 /*
1899 -------------------------------------------------------------------------------
1900 Returns the result of multiplying the single-precision floating-point values
1901 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1902 for Binary Floating-Point Arithmetic.
1903 -------------------------------------------------------------------------------
1904 */
1905 float32 float32_mul( float32 a, float32 b )
1906 {
1907     flag aSign, bSign, zSign;
1908     int16 aExp, bExp, zExp;
1909     bits32 aSig, bSig;
1910     bits64 zSig64;
1911     bits32 zSig;
1912 
1913     aSig = extractFloat32Frac( a );
1914     aExp = extractFloat32Exp( a );
1915     aSign = extractFloat32Sign( a );
1916     bSig = extractFloat32Frac( b );
1917     bExp = extractFloat32Exp( b );
1918     bSign = extractFloat32Sign( b );
1919     zSign = aSign ^ bSign;
1920     if ( aExp == 0xFF ) {
1921         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1922             return propagateFloat32NaN( a, b );
1923         }
1924         if ( ( bExp | bSig ) == 0 ) {
1925             float_raise( float_flag_invalid );
1926             return float32_default_nan;
1927         }
1928         return packFloat32( zSign, 0xFF, 0 );
1929     }
1930     if ( bExp == 0xFF ) {
1931         if ( bSig ) return propagateFloat32NaN( a, b );
1932         if ( ( aExp | aSig ) == 0 ) {
1933             float_raise( float_flag_invalid );
1934             return float32_default_nan;
1935         }
1936         return packFloat32( zSign, 0xFF, 0 );
1937     }
1938     if ( aExp == 0 ) {
1939         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1940         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1941     }
1942     if ( bExp == 0 ) {
1943         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1944         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1945     }
1946     zExp = aExp + bExp - 0x7F;
1947     aSig = ( aSig | 0x00800000 )<<7;
1948     bSig = ( bSig | 0x00800000 )<<8;
1949     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1950     zSig = zSig64;
1951     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1952         zSig <<= 1;
1953         --zExp;
1954     }
1955     return roundAndPackFloat32( zSign, zExp, zSig );
1956 
1957 }
1958 
1959 /*
1960 -------------------------------------------------------------------------------
1961 Returns the result of dividing the single-precision floating-point value `a'
1962 by the corresponding value `b'.  The operation is performed according to the
1963 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1964 -------------------------------------------------------------------------------
1965 */
1966 float32 float32_div( float32 a, float32 b )
1967 {
1968     flag aSign, bSign, zSign;
1969     int16 aExp, bExp, zExp;
1970     bits32 aSig, bSig, zSig;
1971 
1972     aSig = extractFloat32Frac( a );
1973     aExp = extractFloat32Exp( a );
1974     aSign = extractFloat32Sign( a );
1975     bSig = extractFloat32Frac( b );
1976     bExp = extractFloat32Exp( b );
1977     bSign = extractFloat32Sign( b );
1978     zSign = aSign ^ bSign;
1979     if ( aExp == 0xFF ) {
1980         if ( aSig ) return propagateFloat32NaN( a, b );
1981         if ( bExp == 0xFF ) {
1982             if ( bSig ) return propagateFloat32NaN( a, b );
1983             float_raise( float_flag_invalid );
1984             return float32_default_nan;
1985         }
1986         return packFloat32( zSign, 0xFF, 0 );
1987     }
1988     if ( bExp == 0xFF ) {
1989         if ( bSig ) return propagateFloat32NaN( a, b );
1990         return packFloat32( zSign, 0, 0 );
1991     }
1992     if ( bExp == 0 ) {
1993         if ( bSig == 0 ) {
1994             if ( ( aExp | aSig ) == 0 ) {
1995                 float_raise( float_flag_invalid );
1996                 return float32_default_nan;
1997             }
1998             float_raise( float_flag_divbyzero );
1999             return packFloat32( zSign, 0xFF, 0 );
2000         }
2001         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2002     }
2003     if ( aExp == 0 ) {
2004         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2005         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2006     }
2007     zExp = aExp - bExp + 0x7D;
2008     aSig = ( aSig | 0x00800000 )<<7;
2009     bSig = ( bSig | 0x00800000 )<<8;
2010     if ( bSig <= ( aSig + aSig ) ) {
2011         aSig >>= 1;
2012         ++zExp;
2013     }
2014     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2015     if ( ( zSig & 0x3F ) == 0 ) {
2016         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2017     }
2018     return roundAndPackFloat32( zSign, zExp, zSig );
2019 
2020 }
2021 
2022 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2023 /*
2024 -------------------------------------------------------------------------------
2025 Returns the remainder of the single-precision floating-point value `a'
2026 with respect to the corresponding value `b'.  The operation is performed
2027 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2028 -------------------------------------------------------------------------------
2029 */
2030 float32 float32_rem( float32 a, float32 b )
2031 {
2032     flag aSign, bSign, zSign;
2033     int16 aExp, bExp, expDiff;
2034     bits32 aSig, bSig;
2035     bits32 q;
2036     bits64 aSig64, bSig64, q64;
2037     bits32 alternateASig;
2038     sbits32 sigMean;
2039 
2040     aSig = extractFloat32Frac( a );
2041     aExp = extractFloat32Exp( a );
2042     aSign = extractFloat32Sign( a );
2043     bSig = extractFloat32Frac( b );
2044     bExp = extractFloat32Exp( b );
2045     bSign = extractFloat32Sign( b );
2046     if ( aExp == 0xFF ) {
2047         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2048             return propagateFloat32NaN( a, b );
2049         }
2050         float_raise( float_flag_invalid );
2051         return float32_default_nan;
2052     }
2053     if ( bExp == 0xFF ) {
2054         if ( bSig ) return propagateFloat32NaN( a, b );
2055         return a;
2056     }
2057     if ( bExp == 0 ) {
2058         if ( bSig == 0 ) {
2059             float_raise( float_flag_invalid );
2060             return float32_default_nan;
2061         }
2062         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2063     }
2064     if ( aExp == 0 ) {
2065         if ( aSig == 0 ) return a;
2066         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2067     }
2068     expDiff = aExp - bExp;
2069     aSig |= 0x00800000;
2070     bSig |= 0x00800000;
2071     if ( expDiff < 32 ) {
2072         aSig <<= 8;
2073         bSig <<= 8;
2074         if ( expDiff < 0 ) {
2075             if ( expDiff < -1 ) return a;
2076             aSig >>= 1;
2077         }
2078         q = ( bSig <= aSig );
2079         if ( q ) aSig -= bSig;
2080         if ( 0 < expDiff ) {
2081             q = ( ( (bits64) aSig )<<32 ) / bSig;
2082             q >>= 32 - expDiff;
2083             bSig >>= 2;
2084             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2085         }
2086         else {
2087             aSig >>= 2;
2088             bSig >>= 2;
2089         }
2090     }
2091     else {
2092         if ( bSig <= aSig ) aSig -= bSig;
2093         aSig64 = ( (bits64) aSig )<<40;
2094         bSig64 = ( (bits64) bSig )<<40;
2095         expDiff -= 64;
2096         while ( 0 < expDiff ) {
2097             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2098             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2099             aSig64 = - ( ( bSig * q64 )<<38 );
2100             expDiff -= 62;
2101         }
2102         expDiff += 64;
2103         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2104         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2105         q = q64>>( 64 - expDiff );
2106         bSig <<= 6;
2107         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2108     }
2109     do {
2110         alternateASig = aSig;
2111         ++q;
2112         aSig -= bSig;
2113     } while ( 0 <= (sbits32) aSig );
2114     sigMean = aSig + alternateASig;
2115     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2116         aSig = alternateASig;
2117     }
2118     zSign = ( (sbits32) aSig < 0 );
2119     if ( zSign ) aSig = - aSig;
2120     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2121 
2122 }
2123 #endif /* !SOFTFLOAT_FOR_GCC */
2124 
2125 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2126 /*
2127 -------------------------------------------------------------------------------
2128 Returns the square root of the single-precision floating-point value `a'.
2129 The operation is performed according to the IEC/IEEE Standard for Binary
2130 Floating-Point Arithmetic.
2131 -------------------------------------------------------------------------------
2132 */
2133 float32 float32_sqrt( float32 a )
2134 {
2135     flag aSign;
2136     int16 aExp, zExp;
2137     bits32 aSig, zSig;
2138     bits64 rem, term;
2139 
2140     aSig = extractFloat32Frac( a );
2141     aExp = extractFloat32Exp( a );
2142     aSign = extractFloat32Sign( a );
2143     if ( aExp == 0xFF ) {
2144         if ( aSig ) return propagateFloat32NaN( a, 0 );
2145         if ( ! aSign ) return a;
2146         float_raise( float_flag_invalid );
2147         return float32_default_nan;
2148     }
2149     if ( aSign ) {
2150         if ( ( aExp | aSig ) == 0 ) return a;
2151         float_raise( float_flag_invalid );
2152         return float32_default_nan;
2153     }
2154     if ( aExp == 0 ) {
2155         if ( aSig == 0 ) return 0;
2156         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2157     }
2158     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2159     aSig = ( aSig | 0x00800000 )<<8;
2160     zSig = estimateSqrt32( aExp, aSig ) + 2;
2161     if ( ( zSig & 0x7F ) <= 5 ) {
2162         if ( zSig < 2 ) {
2163             zSig = 0x7FFFFFFF;
2164             goto roundAndPack;
2165         }
2166         aSig >>= aExp & 1;
2167         term = ( (bits64) zSig ) * zSig;
2168         rem = ( ( (bits64) aSig )<<32 ) - term;
2169         while ( (sbits64) rem < 0 ) {
2170             --zSig;
2171             rem += ( ( (bits64) zSig )<<1 ) | 1;
2172         }
2173         zSig |= ( rem != 0 );
2174     }
2175     shift32RightJamming( zSig, 1, &zSig );
2176  roundAndPack:
2177     return roundAndPackFloat32( 0, zExp, zSig );
2178 
2179 }
2180 #endif /* !SOFTFLOAT_FOR_GCC */
2181 
2182 /*
2183 -------------------------------------------------------------------------------
2184 Returns 1 if the single-precision floating-point value `a' is equal to
2185 the corresponding value `b', and 0 otherwise.  The comparison is performed
2186 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2187 -------------------------------------------------------------------------------
2188 */
2189 flag float32_eq( float32 a, float32 b )
2190 {
2191 
2192     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2193          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2194        ) {
2195         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2196             float_raise( float_flag_invalid );
2197         }
2198         return 0;
2199     }
2200     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2201 
2202 }
2203 
2204 /*
2205 -------------------------------------------------------------------------------
2206 Returns 1 if the single-precision floating-point value `a' is less than
2207 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2208 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2209 Arithmetic.
2210 -------------------------------------------------------------------------------
2211 */
2212 flag float32_le( float32 a, float32 b )
2213 {
2214     flag aSign, bSign;
2215 
2216     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2217          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2218        ) {
2219         float_raise( float_flag_invalid );
2220         return 0;
2221     }
2222     aSign = extractFloat32Sign( a );
2223     bSign = extractFloat32Sign( b );
2224     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2225     return ( a == b ) || ( aSign ^ ( a < b ) );
2226 
2227 }
2228 
2229 /*
2230 -------------------------------------------------------------------------------
2231 Returns 1 if the single-precision floating-point value `a' is less than
2232 the corresponding value `b', and 0 otherwise.  The comparison is performed
2233 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2234 -------------------------------------------------------------------------------
2235 */
2236 flag float32_lt( float32 a, float32 b )
2237 {
2238     flag aSign, bSign;
2239 
2240     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2241          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2242        ) {
2243         float_raise( float_flag_invalid );
2244         return 0;
2245     }
2246     aSign = extractFloat32Sign( a );
2247     bSign = extractFloat32Sign( b );
2248     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2249     return ( a != b ) && ( aSign ^ ( a < b ) );
2250 
2251 }
2252 
2253 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2254 /*
2255 -------------------------------------------------------------------------------
2256 Returns 1 if the single-precision floating-point value `a' is equal to
2257 the corresponding value `b', and 0 otherwise.  The invalid exception is
2258 raised if either operand is a NaN.  Otherwise, the comparison is performed
2259 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2260 -------------------------------------------------------------------------------
2261 */
2262 flag float32_eq_signaling( float32 a, float32 b )
2263 {
2264 
2265     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2266          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2267        ) {
2268         float_raise( float_flag_invalid );
2269         return 0;
2270     }
2271     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2272 
2273 }
2274 
2275 /*
2276 -------------------------------------------------------------------------------
2277 Returns 1 if the single-precision floating-point value `a' is less than or
2278 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2279 cause an exception.  Otherwise, the comparison is performed according to the
2280 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2281 -------------------------------------------------------------------------------
2282 */
2283 flag float32_le_quiet( float32 a, float32 b )
2284 {
2285     flag aSign, bSign;
2286 
2287     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2288          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2289        ) {
2290         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2291             float_raise( float_flag_invalid );
2292         }
2293         return 0;
2294     }
2295     aSign = extractFloat32Sign( a );
2296     bSign = extractFloat32Sign( b );
2297     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2298     return ( a == b ) || ( aSign ^ ( a < b ) );
2299 
2300 }
2301 
2302 /*
2303 -------------------------------------------------------------------------------
2304 Returns 1 if the single-precision floating-point value `a' is less than
2305 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2306 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2307 Standard for Binary Floating-Point Arithmetic.
2308 -------------------------------------------------------------------------------
2309 */
2310 flag float32_lt_quiet( float32 a, float32 b )
2311 {
2312     flag aSign, bSign;
2313 
2314     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2315          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2316        ) {
2317         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2318             float_raise( float_flag_invalid );
2319         }
2320         return 0;
2321     }
2322     aSign = extractFloat32Sign( a );
2323     bSign = extractFloat32Sign( b );
2324     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2325     return ( a != b ) && ( aSign ^ ( a < b ) );
2326 
2327 }
2328 #endif /* !SOFTFLOAT_FOR_GCC */
2329 
2330 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2331 /*
2332 -------------------------------------------------------------------------------
2333 Returns the result of converting the double-precision floating-point value
2334 `a' to the 32-bit two's complement integer format.  The conversion is
2335 performed according to the IEC/IEEE Standard for Binary Floating-Point
2336 Arithmetic---which means in particular that the conversion is rounded
2337 according to the current rounding mode.  If `a' is a NaN, the largest
2338 positive integer is returned.  Otherwise, if the conversion overflows, the
2339 largest integer with the same sign as `a' is returned.
2340 -------------------------------------------------------------------------------
2341 */
2342 int32 float64_to_int32( float64 a )
2343 {
2344     flag aSign;
2345     int16 aExp, shiftCount;
2346     bits64 aSig;
2347 
2348     aSig = extractFloat64Frac( a );
2349     aExp = extractFloat64Exp( a );
2350     aSign = extractFloat64Sign( a );
2351     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2352     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2353     shiftCount = 0x42C - aExp;
2354     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2355     return roundAndPackInt32( aSign, aSig );
2356 
2357 }
2358 #endif /* !SOFTFLOAT_FOR_GCC */
2359 
2360 /*
2361 -------------------------------------------------------------------------------
2362 Returns the result of converting the double-precision floating-point value
2363 `a' to the 32-bit two's complement integer format.  The conversion is
2364 performed according to the IEC/IEEE Standard for Binary Floating-Point
2365 Arithmetic, except that the conversion is always rounded toward zero.
2366 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2367 the conversion overflows, the largest integer with the same sign as `a' is
2368 returned.
2369 -------------------------------------------------------------------------------
2370 */
2371 int32 float64_to_int32_round_to_zero( float64 a )
2372 {
2373     flag aSign;
2374     int16 aExp, shiftCount;
2375     bits64 aSig, savedASig;
2376     int32 z;
2377 
2378     aSig = extractFloat64Frac( a );
2379     aExp = extractFloat64Exp( a );
2380     aSign = extractFloat64Sign( a );
2381     if ( 0x41E < aExp ) {
2382         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2383         goto invalid;
2384     }
2385     else if ( aExp < 0x3FF ) {
2386         if ( aExp || aSig ) float_set_inexact();
2387         return 0;
2388     }
2389     aSig |= LIT64( 0x0010000000000000 );
2390     shiftCount = 0x433 - aExp;
2391     savedASig = aSig;
2392     aSig >>= shiftCount;
2393     z = aSig;
2394     if ( aSign ) z = - z;
2395     if ( ( z < 0 ) ^ aSign ) {
2396  invalid:
2397         float_raise( float_flag_invalid );
2398         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2399     }
2400     if ( ( aSig<<shiftCount ) != savedASig ) {
2401         float_set_inexact();
2402     }
2403     return z;
2404 
2405 }
2406 
2407 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2408 /*
2409 -------------------------------------------------------------------------------
2410 Returns the result of converting the double-precision floating-point value
2411 `a' to the 64-bit two's complement integer format.  The conversion is
2412 performed according to the IEC/IEEE Standard for Binary Floating-Point
2413 Arithmetic---which means in particular that the conversion is rounded
2414 according to the current rounding mode.  If `a' is a NaN, the largest
2415 positive integer is returned.  Otherwise, if the conversion overflows, the
2416 largest integer with the same sign as `a' is returned.
2417 -------------------------------------------------------------------------------
2418 */
2419 int64 float64_to_int64( float64 a )
2420 {
2421     flag aSign;
2422     int16 aExp, shiftCount;
2423     bits64 aSig, aSigExtra;
2424 
2425     aSig = extractFloat64Frac( a );
2426     aExp = extractFloat64Exp( a );
2427     aSign = extractFloat64Sign( a );
2428     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2429     shiftCount = 0x433 - aExp;
2430     if ( shiftCount <= 0 ) {
2431         if ( 0x43E < aExp ) {
2432             float_raise( float_flag_invalid );
2433             if (    ! aSign
2434                  || (    ( aExp == 0x7FF )
2435                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2436                ) {
2437                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2438             }
2439             return (sbits64) LIT64( 0x8000000000000000 );
2440         }
2441         aSigExtra = 0;
2442         aSig <<= - shiftCount;
2443     }
2444     else {
2445         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2446     }
2447     return roundAndPackInt64( aSign, aSig, aSigExtra );
2448 
2449 }
2450 
2451 #ifdef __alpha__
2452 /*
2453 -------------------------------------------------------------------------------
2454 Returns the result of converting the double-precision floating-point value
2455 `a' to the 64-bit two's complement integer format.  The conversion is
2456 performed according to the IEC/IEEE Standard for Binary Floating-Point
2457 Arithmetic---which means in particular that the conversion is rounded
2458 according to the current rounding mode.  If `a' is a NaN, the invalid
2459 exception is raised and zero is returned.
2460 -------------------------------------------------------------------------------
2461 */
2462 int64 float64_to_int64_no_overflow( float64 a )
2463 {
2464     flag aSign;
2465     int16 aExp, shiftCount;
2466     bits64 aSig, aSigExtra;
2467 
2468     aSig = extractFloat64Frac( a );
2469     aExp = extractFloat64Exp( a );
2470     aSign = extractFloat64Sign( a );
2471     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2472     shiftCount = 0x433 - aExp;
2473     if ( shiftCount <= 0 ) {
2474         if ( 0x43E < aExp ) {
2475             float_raise( float_flag_invalid );
2476             return 0;
2477         }
2478         aSigExtra = 0;
2479         aSig <<= - shiftCount;
2480     }
2481     else {
2482         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2483     }
2484     return roundAndPackInt64NoOverflow( aSign, aSig, aSigExtra );
2485 
2486 }
2487 #endif /* __alpha__ */
2488 
2489 /*
2490 -------------------------------------------------------------------------------
2491 Returns the result of converting the double-precision floating-point value
2492 `a' to the 64-bit two's complement integer format.  The conversion is
2493 performed according to the IEC/IEEE Standard for Binary Floating-Point
2494 Arithmetic, except that the conversion is always rounded toward zero.
2495 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2496 the conversion overflows, the largest integer with the same sign as `a' is
2497 returned.
2498 -------------------------------------------------------------------------------
2499 */
2500 int64 float64_to_int64_round_to_zero( float64 a )
2501 {
2502     flag aSign;
2503     int16 aExp, shiftCount;
2504     bits64 aSig;
2505     int64 z;
2506 
2507     aSig = extractFloat64Frac( a );
2508     aExp = extractFloat64Exp( a );
2509     aSign = extractFloat64Sign( a );
2510     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2511     shiftCount = aExp - 0x433;
2512     if ( 0 <= shiftCount ) {
2513         if ( 0x43E <= aExp ) {
2514             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2515                 float_raise( float_flag_invalid );
2516                 if (    ! aSign
2517                      || (    ( aExp == 0x7FF )
2518                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2519                    ) {
2520                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2521                 }
2522             }
2523             return (sbits64) LIT64( 0x8000000000000000 );
2524         }
2525         z = aSig<<shiftCount;
2526     }
2527     else {
2528         if ( aExp < 0x3FE ) {
2529             if ( aExp | aSig ) float_set_inexact();
2530             return 0;
2531         }
2532         z = aSig>>( - shiftCount );
2533         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2534             float_set_inexact();
2535         }
2536     }
2537     if ( aSign ) z = - z;
2538     return z;
2539 
2540 }
2541 #endif /* !SOFTFLOAT_FOR_GCC */
2542 
2543 /*
2544 -------------------------------------------------------------------------------
2545 Returns the result of converting the double-precision floating-point value
2546 `a' to the single-precision floating-point format.  The conversion is
2547 performed according to the IEC/IEEE Standard for Binary Floating-Point
2548 Arithmetic.
2549 -------------------------------------------------------------------------------
2550 */
2551 float32 float64_to_float32( float64 a )
2552 {
2553     flag aSign;
2554     int16 aExp;
2555     bits64 aSig;
2556     bits32 zSig;
2557 
2558     aSig = extractFloat64Frac( a );
2559     aExp = extractFloat64Exp( a );
2560     aSign = extractFloat64Sign( a );
2561     if ( aExp == 0x7FF ) {
2562         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2563         return packFloat32( aSign, 0xFF, 0 );
2564     }
2565     shift64RightJamming( aSig, 22, &aSig );
2566     zSig = aSig;
2567     if ( aExp || zSig ) {
2568         zSig |= 0x40000000;
2569         aExp -= 0x381;
2570     }
2571     return roundAndPackFloat32( aSign, aExp, zSig );
2572 
2573 }
2574 
2575 #ifdef FLOATX80
2576 
2577 /*
2578 -------------------------------------------------------------------------------
2579 Returns the result of converting the double-precision floating-point value
2580 `a' to the extended double-precision floating-point format.  The conversion
2581 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2582 Arithmetic.
2583 -------------------------------------------------------------------------------
2584 */
2585 floatx80 float64_to_floatx80( float64 a )
2586 {
2587     flag aSign;
2588     int16 aExp;
2589     bits64 aSig;
2590 
2591     aSig = extractFloat64Frac( a );
2592     aExp = extractFloat64Exp( a );
2593     aSign = extractFloat64Sign( a );
2594     if ( aExp == 0x7FF ) {
2595         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2596         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2597     }
2598     if ( aExp == 0 ) {
2599         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2600         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2601     }
2602     return
2603         packFloatx80(
2604             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2605 
2606 }
2607 
2608 #endif
2609 
2610 #ifdef FLOAT128
2611 
2612 /*
2613 -------------------------------------------------------------------------------
2614 Returns the result of converting the double-precision floating-point value
2615 `a' to the quadruple-precision floating-point format.  The conversion is
2616 performed according to the IEC/IEEE Standard for Binary Floating-Point
2617 Arithmetic.
2618 -------------------------------------------------------------------------------
2619 */
2620 float128 float64_to_float128( float64 a )
2621 {
2622     flag aSign;
2623     int16 aExp;
2624     bits64 aSig, zSig0, zSig1;
2625 
2626     aSig = extractFloat64Frac( a );
2627     aExp = extractFloat64Exp( a );
2628     aSign = extractFloat64Sign( a );
2629     if ( aExp == 0x7FF ) {
2630         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2631         return packFloat128( aSign, 0x7FFF, 0, 0 );
2632     }
2633     if ( aExp == 0 ) {
2634         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2635         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2636         --aExp;
2637     }
2638     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2639     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2640 
2641 }
2642 
2643 #endif
2644 
2645 #ifndef SOFTFLOAT_FOR_GCC
2646 /*
2647 -------------------------------------------------------------------------------
2648 Rounds the double-precision floating-point value `a' to an integer, and
2649 returns the result as a double-precision floating-point value.  The
2650 operation is performed according to the IEC/IEEE Standard for Binary
2651 Floating-Point Arithmetic.
2652 -------------------------------------------------------------------------------
2653 */
2654 float64 float64_round_to_int( float64 a )
2655 {
2656     flag aSign;
2657     int16 aExp;
2658     bits64 lastBitMask, roundBitsMask;
2659     int8 roundingMode;
2660     float64 z;
2661 
2662     aExp = extractFloat64Exp( a );
2663     if ( 0x433 <= aExp ) {
2664         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2665             return propagateFloat64NaN( a, a );
2666         }
2667         return a;
2668     }
2669     if ( aExp < 0x3FF ) {
2670         if ( (bits64) ( a<<1 ) == 0 ) return a;
2671         float_set_inexact();
2672         aSign = extractFloat64Sign( a );
2673         switch ( float_rounding_mode() ) {
2674          case float_round_nearest_even:
2675             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2676                 return packFloat64( aSign, 0x3FF, 0 );
2677             }
2678             break;
2679          case float_round_down:
2680             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2681          case float_round_up:
2682             return
2683             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2684         }
2685         return packFloat64( aSign, 0, 0 );
2686     }
2687     lastBitMask = 1;
2688     lastBitMask <<= 0x433 - aExp;
2689     roundBitsMask = lastBitMask - 1;
2690     z = a;
2691     roundingMode = float_rounding_mode();
2692     if ( roundingMode == float_round_nearest_even ) {
2693         z += lastBitMask>>1;
2694         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2695     }
2696     else if ( roundingMode != float_round_to_zero ) {
2697         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2698             z += roundBitsMask;
2699         }
2700     }
2701     z &= ~ roundBitsMask;
2702     if ( z != a ) float_set_inexact();
2703     return z;
2704 
2705 }
2706 #endif
2707 
2708 /*
2709 -------------------------------------------------------------------------------
2710 Returns the result of adding the absolute values of the double-precision
2711 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2712 before being returned.  `zSign' is ignored if the result is a NaN.
2713 The addition is performed according to the IEC/IEEE Standard for Binary
2714 Floating-Point Arithmetic.
2715 -------------------------------------------------------------------------------
2716 */
2717 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2718 {
2719     int16 aExp, bExp, zExp;
2720     bits64 aSig, bSig, zSig;
2721     int16 expDiff;
2722 
2723     aSig = extractFloat64Frac( a );
2724     aExp = extractFloat64Exp( a );
2725     bSig = extractFloat64Frac( b );
2726     bExp = extractFloat64Exp( b );
2727     expDiff = aExp - bExp;
2728     aSig <<= 9;
2729     bSig <<= 9;
2730     if ( 0 < expDiff ) {
2731         if ( aExp == 0x7FF ) {
2732             if ( aSig ) return propagateFloat64NaN( a, b );
2733             return a;
2734         }
2735         if ( bExp == 0 ) {
2736             --expDiff;
2737         }
2738         else {
2739             bSig |= LIT64( 0x2000000000000000 );
2740         }
2741         shift64RightJamming( bSig, expDiff, &bSig );
2742         zExp = aExp;
2743     }
2744     else if ( expDiff < 0 ) {
2745         if ( bExp == 0x7FF ) {
2746             if ( bSig ) return propagateFloat64NaN( a, b );
2747             return packFloat64( zSign, 0x7FF, 0 );
2748         }
2749         if ( aExp == 0 ) {
2750             ++expDiff;
2751         }
2752         else {
2753             aSig |= LIT64( 0x2000000000000000 );
2754         }
2755         shift64RightJamming( aSig, - expDiff, &aSig );
2756         zExp = bExp;
2757     }
2758     else {
2759         if ( aExp == 0x7FF ) {
2760             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2761             return a;
2762         }
2763         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2764         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2765         zExp = aExp;
2766         goto roundAndPack;
2767     }
2768     aSig |= LIT64( 0x2000000000000000 );
2769     zSig = ( aSig + bSig )<<1;
2770     --zExp;
2771     if ( (sbits64) zSig < 0 ) {
2772         zSig = aSig + bSig;
2773         ++zExp;
2774     }
2775  roundAndPack:
2776     return roundAndPackFloat64( zSign, zExp, zSig );
2777 
2778 }
2779 
2780 /*
2781 -------------------------------------------------------------------------------
2782 Returns the result of subtracting the absolute values of the double-
2783 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2784 difference is negated before being returned.  `zSign' is ignored if the
2785 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2786 Standard for Binary Floating-Point Arithmetic.
2787 -------------------------------------------------------------------------------
2788 */
2789 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2790 {
2791     int16 aExp, bExp, zExp;
2792     bits64 aSig, bSig, zSig;
2793     int16 expDiff;
2794 
2795     aSig = extractFloat64Frac( a );
2796     aExp = extractFloat64Exp( a );
2797     bSig = extractFloat64Frac( b );
2798     bExp = extractFloat64Exp( b );
2799     expDiff = aExp - bExp;
2800     aSig <<= 10;
2801     bSig <<= 10;
2802     if ( 0 < expDiff ) goto aExpBigger;
2803     if ( expDiff < 0 ) goto bExpBigger;
2804     if ( aExp == 0x7FF ) {
2805         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2806         float_raise( float_flag_invalid );
2807         return float64_default_nan;
2808     }
2809     if ( aExp == 0 ) {
2810         aExp = 1;
2811         bExp = 1;
2812     }
2813     if ( bSig < aSig ) goto aBigger;
2814     if ( aSig < bSig ) goto bBigger;
2815     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2816  bExpBigger:
2817     if ( bExp == 0x7FF ) {
2818         if ( bSig ) return propagateFloat64NaN( a, b );
2819         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2820     }
2821     if ( aExp == 0 ) {
2822         ++expDiff;
2823     }
2824     else {
2825         aSig |= LIT64( 0x4000000000000000 );
2826     }
2827     shift64RightJamming( aSig, - expDiff, &aSig );
2828     bSig |= LIT64( 0x4000000000000000 );
2829  bBigger:
2830     zSig = bSig - aSig;
2831     zExp = bExp;
2832     zSign ^= 1;
2833     goto normalizeRoundAndPack;
2834  aExpBigger:
2835     if ( aExp == 0x7FF ) {
2836         if ( aSig ) return propagateFloat64NaN( a, b );
2837         return a;
2838     }
2839     if ( bExp == 0 ) {
2840         --expDiff;
2841     }
2842     else {
2843         bSig |= LIT64( 0x4000000000000000 );
2844     }
2845     shift64RightJamming( bSig, expDiff, &bSig );
2846     aSig |= LIT64( 0x4000000000000000 );
2847  aBigger:
2848     zSig = aSig - bSig;
2849     zExp = aExp;
2850  normalizeRoundAndPack:
2851     --zExp;
2852     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2853 
2854 }
2855 
2856 /*
2857 -------------------------------------------------------------------------------
2858 Returns the result of adding the double-precision floating-point values `a'
2859 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2860 Binary Floating-Point Arithmetic.
2861 -------------------------------------------------------------------------------
2862 */
2863 float64 float64_add( float64 a, float64 b )
2864 {
2865     flag aSign, bSign;
2866 
2867     aSign = extractFloat64Sign( a );
2868     bSign = extractFloat64Sign( b );
2869     if ( aSign == bSign ) {
2870         return addFloat64Sigs( a, b, aSign );
2871     }
2872     else {
2873         return subFloat64Sigs( a, b, aSign );
2874     }
2875 
2876 }
2877 
2878 /*
2879 -------------------------------------------------------------------------------
2880 Returns the result of subtracting the double-precision floating-point values
2881 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2882 for Binary Floating-Point Arithmetic.
2883 -------------------------------------------------------------------------------
2884 */
2885 float64 float64_sub( float64 a, float64 b )
2886 {
2887     flag aSign, bSign;
2888 
2889     aSign = extractFloat64Sign( a );
2890     bSign = extractFloat64Sign( b );
2891     if ( aSign == bSign ) {
2892         return subFloat64Sigs( a, b, aSign );
2893     }
2894     else {
2895         return addFloat64Sigs( a, b, aSign );
2896     }
2897 
2898 }
2899 
2900 /*
2901 -------------------------------------------------------------------------------
2902 Returns the result of multiplying the double-precision floating-point values
2903 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2904 for Binary Floating-Point Arithmetic.
2905 -------------------------------------------------------------------------------
2906 */
2907 float64 float64_mul( float64 a, float64 b )
2908 {
2909     flag aSign, bSign, zSign;
2910     int16 aExp, bExp, zExp;
2911     bits64 aSig, bSig, zSig0, zSig1;
2912 
2913     aSig = extractFloat64Frac( a );
2914     aExp = extractFloat64Exp( a );
2915     aSign = extractFloat64Sign( a );
2916     bSig = extractFloat64Frac( b );
2917     bExp = extractFloat64Exp( b );
2918     bSign = extractFloat64Sign( b );
2919     zSign = aSign ^ bSign;
2920     if ( aExp == 0x7FF ) {
2921         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2922             return propagateFloat64NaN( a, b );
2923         }
2924         if ( ( bExp | bSig ) == 0 ) {
2925             float_raise( float_flag_invalid );
2926             return float64_default_nan;
2927         }
2928         return packFloat64( zSign, 0x7FF, 0 );
2929     }
2930     if ( bExp == 0x7FF ) {
2931         if ( bSig ) return propagateFloat64NaN( a, b );
2932         if ( ( aExp | aSig ) == 0 ) {
2933             float_raise( float_flag_invalid );
2934             return float64_default_nan;
2935         }
2936         return packFloat64( zSign, 0x7FF, 0 );
2937     }
2938     if ( aExp == 0 ) {
2939         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2940         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2941     }
2942     if ( bExp == 0 ) {
2943         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2944         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2945     }
2946     zExp = aExp + bExp - 0x3FF;
2947     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2948     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2949     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2950     zSig0 |= ( zSig1 != 0 );
2951     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2952         zSig0 <<= 1;
2953         --zExp;
2954     }
2955     return roundAndPackFloat64( zSign, zExp, zSig0 );
2956 
2957 }
2958 
2959 /*
2960 -------------------------------------------------------------------------------
2961 Returns the result of dividing the double-precision floating-point value `a'
2962 by the corresponding value `b'.  The operation is performed according to
2963 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2964 -------------------------------------------------------------------------------
2965 */
2966 float64 float64_div( float64 a, float64 b )
2967 {
2968     flag aSign, bSign, zSign;
2969     int16 aExp, bExp, zExp;
2970     bits64 aSig, bSig, zSig;
2971     bits64 rem0, rem1;
2972     bits64 term0, term1;
2973 
2974     aSig = extractFloat64Frac( a );
2975     aExp = extractFloat64Exp( a );
2976     aSign = extractFloat64Sign( a );
2977     bSig = extractFloat64Frac( b );
2978     bExp = extractFloat64Exp( b );
2979     bSign = extractFloat64Sign( b );
2980     zSign = aSign ^ bSign;
2981     if ( aExp == 0x7FF ) {
2982         if ( aSig ) return propagateFloat64NaN( a, b );
2983         if ( bExp == 0x7FF ) {
2984             if ( bSig ) return propagateFloat64NaN( a, b );
2985             float_raise( float_flag_invalid );
2986             return float64_default_nan;
2987         }
2988         return packFloat64( zSign, 0x7FF, 0 );
2989     }
2990     if ( bExp == 0x7FF ) {
2991         if ( bSig ) return propagateFloat64NaN( a, b );
2992         return packFloat64( zSign, 0, 0 );
2993     }
2994     if ( bExp == 0 ) {
2995         if ( bSig == 0 ) {
2996             if ( ( aExp | aSig ) == 0 ) {
2997                 float_raise( float_flag_invalid );
2998                 return float64_default_nan;
2999             }
3000             float_raise( float_flag_divbyzero );
3001             return packFloat64( zSign, 0x7FF, 0 );
3002         }
3003         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3004     }
3005     if ( aExp == 0 ) {
3006         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3007         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3008     }
3009     zExp = aExp - bExp + 0x3FD;
3010     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3011     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3012     if ( bSig <= ( aSig + aSig ) ) {
3013         aSig >>= 1;
3014         ++zExp;
3015     }
3016     zSig = estimateDiv128To64( aSig, 0, bSig );
3017     if ( ( zSig & 0x1FF ) <= 2 ) {
3018         mul64To128( bSig, zSig, &term0, &term1 );
3019         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3020         while ( (sbits64) rem0 < 0 ) {
3021             --zSig;
3022             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3023         }
3024         zSig |= ( rem1 != 0 );
3025     }
3026     return roundAndPackFloat64( zSign, zExp, zSig );
3027 
3028 }
3029 
3030 #ifndef SOFTFLOAT_FOR_GCC
3031 /*
3032 -------------------------------------------------------------------------------
3033 Returns the remainder of the double-precision floating-point value `a'
3034 with respect to the corresponding value `b'.  The operation is performed
3035 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3036 -------------------------------------------------------------------------------
3037 */
3038 float64 float64_rem( float64 a, float64 b )
3039 {
3040     flag aSign, bSign, zSign;
3041     int16 aExp, bExp, expDiff;
3042     bits64 aSig, bSig;
3043     bits64 q, alternateASig;
3044     sbits64 sigMean;
3045 
3046     aSig = extractFloat64Frac( a );
3047     aExp = extractFloat64Exp( a );
3048     aSign = extractFloat64Sign( a );
3049     bSig = extractFloat64Frac( b );
3050     bExp = extractFloat64Exp( b );
3051     bSign = extractFloat64Sign( b );
3052     if ( aExp == 0x7FF ) {
3053         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3054             return propagateFloat64NaN( a, b );
3055         }
3056         float_raise( float_flag_invalid );
3057         return float64_default_nan;
3058     }
3059     if ( bExp == 0x7FF ) {
3060         if ( bSig ) return propagateFloat64NaN( a, b );
3061         return a;
3062     }
3063     if ( bExp == 0 ) {
3064         if ( bSig == 0 ) {
3065             float_raise( float_flag_invalid );
3066             return float64_default_nan;
3067         }
3068         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3069     }
3070     if ( aExp == 0 ) {
3071         if ( aSig == 0 ) return a;
3072         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3073     }
3074     expDiff = aExp - bExp;
3075     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3076     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3077     if ( expDiff < 0 ) {
3078         if ( expDiff < -1 ) return a;
3079         aSig >>= 1;
3080     }
3081     q = ( bSig <= aSig );
3082     if ( q ) aSig -= bSig;
3083     expDiff -= 64;
3084     while ( 0 < expDiff ) {
3085         q = estimateDiv128To64( aSig, 0, bSig );
3086         q = ( 2 < q ) ? q - 2 : 0;
3087         aSig = - ( ( bSig>>2 ) * q );
3088         expDiff -= 62;
3089     }
3090     expDiff += 64;
3091     if ( 0 < expDiff ) {
3092         q = estimateDiv128To64( aSig, 0, bSig );
3093         q = ( 2 < q ) ? q - 2 : 0;
3094         q >>= 64 - expDiff;
3095         bSig >>= 2;
3096         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3097     }
3098     else {
3099         aSig >>= 2;
3100         bSig >>= 2;
3101     }
3102     do {
3103         alternateASig = aSig;
3104         ++q;
3105         aSig -= bSig;
3106     } while ( 0 <= (sbits64) aSig );
3107     sigMean = aSig + alternateASig;
3108     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3109         aSig = alternateASig;
3110     }
3111     zSign = ( (sbits64) aSig < 0 );
3112     if ( zSign ) aSig = - aSig;
3113     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3114 
3115 }
3116 
3117 /*
3118 -------------------------------------------------------------------------------
3119 Returns the square root of the double-precision floating-point value `a'.
3120 The operation is performed according to the IEC/IEEE Standard for Binary
3121 Floating-Point Arithmetic.
3122 -------------------------------------------------------------------------------
3123 */
3124 float64 float64_sqrt( float64 a )
3125 {
3126     flag aSign;
3127     int16 aExp, zExp;
3128     bits64 aSig, zSig, doubleZSig;
3129     bits64 rem0, rem1, term0, term1;
3130 
3131     aSig = extractFloat64Frac( a );
3132     aExp = extractFloat64Exp( a );
3133     aSign = extractFloat64Sign( a );
3134     if ( aExp == 0x7FF ) {
3135         if ( aSig ) return propagateFloat64NaN( a, a );
3136         if ( ! aSign ) return a;
3137         float_raise( float_flag_invalid );
3138         return float64_default_nan;
3139     }
3140     if ( aSign ) {
3141         if ( ( aExp | aSig ) == 0 ) return a;
3142         float_raise( float_flag_invalid );
3143         return float64_default_nan;
3144     }
3145     if ( aExp == 0 ) {
3146         if ( aSig == 0 ) return 0;
3147         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3148     }
3149     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3150     aSig |= LIT64( 0x0010000000000000 );
3151     zSig = estimateSqrt32( aExp, aSig>>21 );
3152     aSig <<= 9 - ( aExp & 1 );
3153     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3154     if ( ( zSig & 0x1FF ) <= 5 ) {
3155         doubleZSig = zSig<<1;
3156         mul64To128( zSig, zSig, &term0, &term1 );
3157         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3158         while ( (sbits64) rem0 < 0 ) {
3159             --zSig;
3160             doubleZSig -= 2;
3161             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3162         }
3163         zSig |= ( ( rem0 | rem1 ) != 0 );
3164     }
3165     return roundAndPackFloat64( 0, zExp, zSig );
3166 
3167 }
3168 #endif
3169 
3170 /*
3171 -------------------------------------------------------------------------------
3172 Returns 1 if the double-precision floating-point value `a' is equal to the
3173 corresponding value `b', and 0 otherwise.  The comparison is performed
3174 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3175 -------------------------------------------------------------------------------
3176 */
3177 flag float64_eq( float64 a, float64 b )
3178 {
3179 
3180     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3181          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3182        ) {
3183         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3184             float_raise( float_flag_invalid );
3185         }
3186         return 0;
3187     }
3188     return ( a == b ) ||
3189 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3190 
3191 }
3192 
3193 /*
3194 -------------------------------------------------------------------------------
3195 Returns 1 if the double-precision floating-point value `a' is less than or
3196 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3197 performed according to the IEC/IEEE Standard for Binary Floating-Point
3198 Arithmetic.
3199 -------------------------------------------------------------------------------
3200 */
3201 flag float64_le( float64 a, float64 b )
3202 {
3203     flag aSign, bSign;
3204 
3205     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3206          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3207        ) {
3208         float_raise( float_flag_invalid );
3209         return 0;
3210     }
3211     aSign = extractFloat64Sign( a );
3212     bSign = extractFloat64Sign( b );
3213     if ( aSign != bSign )
3214 	return aSign ||
3215 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3216 	      0 );
3217     return ( a == b ) ||
3218 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3219 
3220 }
3221 
3222 /*
3223 -------------------------------------------------------------------------------
3224 Returns 1 if the double-precision floating-point value `a' is less than
3225 the corresponding value `b', and 0 otherwise.  The comparison is performed
3226 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3227 -------------------------------------------------------------------------------
3228 */
3229 flag float64_lt( float64 a, float64 b )
3230 {
3231     flag aSign, bSign;
3232 
3233     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3234          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3235        ) {
3236         float_raise( float_flag_invalid );
3237         return 0;
3238     }
3239     aSign = extractFloat64Sign( a );
3240     bSign = extractFloat64Sign( b );
3241     if ( aSign != bSign )
3242 	return aSign &&
3243 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3244 	      0 );
3245     return ( a != b ) &&
3246 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3247 
3248 }
3249 
3250 #ifndef SOFTFLOAT_FOR_GCC
3251 /*
3252 -------------------------------------------------------------------------------
3253 Returns 1 if the double-precision floating-point value `a' is equal to the
3254 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3255 if either operand is a NaN.  Otherwise, the comparison is performed
3256 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3257 -------------------------------------------------------------------------------
3258 */
3259 flag float64_eq_signaling( float64 a, float64 b )
3260 {
3261 
3262     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3263          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3264        ) {
3265         float_raise( float_flag_invalid );
3266         return 0;
3267     }
3268     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3269 
3270 }
3271 
3272 /*
3273 -------------------------------------------------------------------------------
3274 Returns 1 if the double-precision floating-point value `a' is less than or
3275 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3276 cause an exception.  Otherwise, the comparison is performed according to the
3277 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3278 -------------------------------------------------------------------------------
3279 */
3280 flag float64_le_quiet( float64 a, float64 b )
3281 {
3282     flag aSign, bSign;
3283 
3284     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3285          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3286        ) {
3287         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3288             float_raise( float_flag_invalid );
3289         }
3290         return 0;
3291     }
3292     aSign = extractFloat64Sign( a );
3293     bSign = extractFloat64Sign( b );
3294     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3295     return ( a == b ) || ( aSign ^ ( a < b ) );
3296 
3297 }
3298 
3299 /*
3300 -------------------------------------------------------------------------------
3301 Returns 1 if the double-precision floating-point value `a' is less than
3302 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3303 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3304 Standard for Binary Floating-Point Arithmetic.
3305 -------------------------------------------------------------------------------
3306 */
3307 flag float64_lt_quiet( float64 a, float64 b )
3308 {
3309     flag aSign, bSign;
3310 
3311     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3312          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3313        ) {
3314         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3315             float_raise( float_flag_invalid );
3316         }
3317         return 0;
3318     }
3319     aSign = extractFloat64Sign( a );
3320     bSign = extractFloat64Sign( b );
3321     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3322     return ( a != b ) && ( aSign ^ ( a < b ) );
3323 
3324 }
3325 #endif
3326 
3327 #ifdef FLOATX80
3328 
3329 /*
3330 -------------------------------------------------------------------------------
3331 Returns the result of converting the extended double-precision floating-
3332 point value `a' to the 32-bit two's complement integer format.  The
3333 conversion is performed according to the IEC/IEEE Standard for Binary
3334 Floating-Point Arithmetic---which means in particular that the conversion
3335 is rounded according to the current rounding mode.  If `a' is a NaN, the
3336 largest positive integer is returned.  Otherwise, if the conversion
3337 overflows, the largest integer with the same sign as `a' is returned.
3338 -------------------------------------------------------------------------------
3339 */
3340 int32 floatx80_to_int32( floatx80 a )
3341 {
3342     flag aSign;
3343     int32 aExp, shiftCount;
3344     bits64 aSig;
3345 
3346     aSig = extractFloatx80Frac( a );
3347     aExp = extractFloatx80Exp( a );
3348     aSign = extractFloatx80Sign( a );
3349     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3350     shiftCount = 0x4037 - aExp;
3351     if ( shiftCount <= 0 ) shiftCount = 1;
3352     shift64RightJamming( aSig, shiftCount, &aSig );
3353     return roundAndPackInt32( aSign, aSig );
3354 
3355 }
3356 
3357 /*
3358 -------------------------------------------------------------------------------
3359 Returns the result of converting the extended double-precision floating-
3360 point value `a' to the 32-bit two's complement integer format.  The
3361 conversion is performed according to the IEC/IEEE Standard for Binary
3362 Floating-Point Arithmetic, except that the conversion is always rounded
3363 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3364 Otherwise, if the conversion overflows, the largest integer with the same
3365 sign as `a' is returned.
3366 -------------------------------------------------------------------------------
3367 */
3368 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3369 {
3370     flag aSign;
3371     int32 aExp, shiftCount;
3372     bits64 aSig, savedASig;
3373     int32 z;
3374 
3375     aSig = extractFloatx80Frac( a );
3376     aExp = extractFloatx80Exp( a );
3377     aSign = extractFloatx80Sign( a );
3378     if ( 0x401E < aExp ) {
3379         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3380         goto invalid;
3381     }
3382     else if ( aExp < 0x3FFF ) {
3383         if ( aExp || aSig ) float_set_inexact();
3384         return 0;
3385     }
3386     shiftCount = 0x403E - aExp;
3387     savedASig = aSig;
3388     aSig >>= shiftCount;
3389     z = aSig;
3390     if ( aSign ) z = - z;
3391     if ( ( z < 0 ) ^ aSign ) {
3392  invalid:
3393         float_raise( float_flag_invalid );
3394         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3395     }
3396     if ( ( aSig<<shiftCount ) != savedASig ) {
3397         float_set_inexact();
3398     }
3399     return z;
3400 
3401 }
3402 
3403 /*
3404 -------------------------------------------------------------------------------
3405 Returns the result of converting the extended double-precision floating-
3406 point value `a' to the 64-bit two's complement integer format.  The
3407 conversion is performed according to the IEC/IEEE Standard for Binary
3408 Floating-Point Arithmetic---which means in particular that the conversion
3409 is rounded according to the current rounding mode.  If `a' is a NaN,
3410 the largest positive integer is returned.  Otherwise, if the conversion
3411 overflows, the largest integer with the same sign as `a' is returned.
3412 -------------------------------------------------------------------------------
3413 */
3414 int64 floatx80_to_int64( floatx80 a )
3415 {
3416     flag aSign;
3417     int32 aExp, shiftCount;
3418     bits64 aSig, aSigExtra;
3419 
3420     aSig = extractFloatx80Frac( a );
3421     aExp = extractFloatx80Exp( a );
3422     aSign = extractFloatx80Sign( a );
3423     shiftCount = 0x403E - aExp;
3424     if ( shiftCount <= 0 ) {
3425         if ( shiftCount ) {
3426             float_raise( float_flag_invalid );
3427             if (    ! aSign
3428                  || (    ( aExp == 0x7FFF )
3429                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3430                ) {
3431                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3432             }
3433             return (sbits64) LIT64( 0x8000000000000000 );
3434         }
3435         aSigExtra = 0;
3436     }
3437     else {
3438         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3439     }
3440     return roundAndPackInt64( aSign, aSig, aSigExtra );
3441 
3442 }
3443 
3444 /*
3445 -------------------------------------------------------------------------------
3446 Returns the result of converting the extended double-precision floating-
3447 point value `a' to the 64-bit two's complement integer format.  The
3448 conversion is performed according to the IEC/IEEE Standard for Binary
3449 Floating-Point Arithmetic, except that the conversion is always rounded
3450 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3451 Otherwise, if the conversion overflows, the largest integer with the same
3452 sign as `a' is returned.
3453 -------------------------------------------------------------------------------
3454 */
3455 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3456 {
3457     flag aSign;
3458     int32 aExp, shiftCount;
3459     bits64 aSig;
3460     int64 z;
3461 
3462     aSig = extractFloatx80Frac( a );
3463     aExp = extractFloatx80Exp( a );
3464     aSign = extractFloatx80Sign( a );
3465     shiftCount = aExp - 0x403E;
3466     if ( 0 <= shiftCount ) {
3467         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3468         if ( ( a.high != 0xC03E ) || aSig ) {
3469             float_raise( float_flag_invalid );
3470             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3471                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3472             }
3473         }
3474         return (sbits64) LIT64( 0x8000000000000000 );
3475     }
3476     else if ( aExp < 0x3FFF ) {
3477         if ( aExp | aSig ) float_set_inexact();
3478         return 0;
3479     }
3480     z = aSig>>( - shiftCount );
3481     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3482         float_set_inexact();
3483     }
3484     if ( aSign ) z = - z;
3485     return z;
3486 
3487 }
3488 
3489 /*
3490 -------------------------------------------------------------------------------
3491 Returns the result of converting the extended double-precision floating-
3492 point value `a' to the single-precision floating-point format.  The
3493 conversion is performed according to the IEC/IEEE Standard for Binary
3494 Floating-Point Arithmetic.
3495 -------------------------------------------------------------------------------
3496 */
3497 float32 floatx80_to_float32( floatx80 a )
3498 {
3499     flag aSign;
3500     int32 aExp;
3501     bits64 aSig;
3502 
3503     aSig = extractFloatx80Frac( a );
3504     aExp = extractFloatx80Exp( a );
3505     aSign = extractFloatx80Sign( a );
3506     if ( aExp == 0x7FFF ) {
3507         if ( (bits64) ( aSig<<1 ) ) {
3508             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3509         }
3510         return packFloat32( aSign, 0xFF, 0 );
3511     }
3512     shift64RightJamming( aSig, 33, &aSig );
3513     if ( aExp || aSig ) aExp -= 0x3F81;
3514     return roundAndPackFloat32( aSign, aExp, aSig );
3515 
3516 }
3517 
3518 /*
3519 -------------------------------------------------------------------------------
3520 Returns the result of converting the extended double-precision floating-
3521 point value `a' to the double-precision floating-point format.  The
3522 conversion is performed according to the IEC/IEEE Standard for Binary
3523 Floating-Point Arithmetic.
3524 -------------------------------------------------------------------------------
3525 */
3526 float64 floatx80_to_float64( floatx80 a )
3527 {
3528     flag aSign;
3529     int32 aExp;
3530     bits64 aSig, zSig;
3531 
3532     aSig = extractFloatx80Frac( a );
3533     aExp = extractFloatx80Exp( a );
3534     aSign = extractFloatx80Sign( a );
3535     if ( aExp == 0x7FFF ) {
3536         if ( (bits64) ( aSig<<1 ) ) {
3537             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3538         }
3539         return packFloat64( aSign, 0x7FF, 0 );
3540     }
3541     shift64RightJamming( aSig, 1, &zSig );
3542     if ( aExp || aSig ) aExp -= 0x3C01;
3543     return roundAndPackFloat64( aSign, aExp, zSig );
3544 
3545 }
3546 
3547 #ifdef FLOAT128
3548 
3549 /*
3550 -------------------------------------------------------------------------------
3551 Returns the result of converting the extended double-precision floating-
3552 point value `a' to the quadruple-precision floating-point format.  The
3553 conversion is performed according to the IEC/IEEE Standard for Binary
3554 Floating-Point Arithmetic.
3555 -------------------------------------------------------------------------------
3556 */
3557 float128 floatx80_to_float128( floatx80 a )
3558 {
3559     flag aSign;
3560     int16 aExp;
3561     bits64 aSig, zSig0, zSig1;
3562 
3563     aSig = extractFloatx80Frac( a );
3564     aExp = extractFloatx80Exp( a );
3565     aSign = extractFloatx80Sign( a );
3566     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3567         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3568     }
3569     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3570     return packFloat128( aSign, aExp, zSig0, zSig1 );
3571 
3572 }
3573 
3574 #endif
3575 
3576 /*
3577 -------------------------------------------------------------------------------
3578 Rounds the extended double-precision floating-point value `a' to an integer,
3579 and returns the result as an extended quadruple-precision floating-point
3580 value.  The operation is performed according to the IEC/IEEE Standard for
3581 Binary Floating-Point Arithmetic.
3582 -------------------------------------------------------------------------------
3583 */
3584 floatx80 floatx80_round_to_int( floatx80 a )
3585 {
3586     flag aSign;
3587     int32 aExp;
3588     bits64 lastBitMask, roundBitsMask;
3589     int8 roundingMode;
3590     floatx80 z;
3591 
3592     aExp = extractFloatx80Exp( a );
3593     if ( 0x403E <= aExp ) {
3594         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3595             return propagateFloatx80NaN( a, a );
3596         }
3597         return a;
3598     }
3599     if ( aExp < 0x3FFF ) {
3600         if (    ( aExp == 0 )
3601              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3602             return a;
3603         }
3604         float_set_inexact();
3605         aSign = extractFloatx80Sign( a );
3606         switch ( float_rounding_mode() ) {
3607          case float_round_nearest_even:
3608             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3609                ) {
3610                 return
3611                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3612             }
3613             break;
3614          case float_round_down:
3615             return
3616                   aSign ?
3617                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3618                 : packFloatx80( 0, 0, 0 );
3619          case float_round_up:
3620             return
3621                   aSign ? packFloatx80( 1, 0, 0 )
3622                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3623         }
3624         return packFloatx80( aSign, 0, 0 );
3625     }
3626     lastBitMask = 1;
3627     lastBitMask <<= 0x403E - aExp;
3628     roundBitsMask = lastBitMask - 1;
3629     z = a;
3630     roundingMode = float_rounding_mode();
3631     if ( roundingMode == float_round_nearest_even ) {
3632         z.low += lastBitMask>>1;
3633         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3634     }
3635     else if ( roundingMode != float_round_to_zero ) {
3636         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3637             z.low += roundBitsMask;
3638         }
3639     }
3640     z.low &= ~ roundBitsMask;
3641     if ( z.low == 0 ) {
3642         ++z.high;
3643         z.low = LIT64( 0x8000000000000000 );
3644     }
3645     if ( z.low != a.low ) float_set_inexact();
3646     return z;
3647 
3648 }
3649 
3650 /*
3651 -------------------------------------------------------------------------------
3652 Returns the result of adding the absolute values of the extended double-
3653 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3654 negated before being returned.  `zSign' is ignored if the result is a NaN.
3655 The addition is performed according to the IEC/IEEE Standard for Binary
3656 Floating-Point Arithmetic.
3657 -------------------------------------------------------------------------------
3658 */
3659 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3660 {
3661     int32 aExp, bExp, zExp;
3662     bits64 aSig, bSig, zSig0, zSig1;
3663     int32 expDiff;
3664 
3665     aSig = extractFloatx80Frac( a );
3666     aExp = extractFloatx80Exp( a );
3667     bSig = extractFloatx80Frac( b );
3668     bExp = extractFloatx80Exp( b );
3669     expDiff = aExp - bExp;
3670     if ( 0 < expDiff ) {
3671         if ( aExp == 0x7FFF ) {
3672             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3673             return a;
3674         }
3675         if ( bExp == 0 ) --expDiff;
3676         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3677         zExp = aExp;
3678     }
3679     else if ( expDiff < 0 ) {
3680         if ( bExp == 0x7FFF ) {
3681             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3682             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3683         }
3684         if ( aExp == 0 ) ++expDiff;
3685         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3686         zExp = bExp;
3687     }
3688     else {
3689         if ( aExp == 0x7FFF ) {
3690             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3691                 return propagateFloatx80NaN( a, b );
3692             }
3693             return a;
3694         }
3695         zSig1 = 0;
3696         zSig0 = aSig + bSig;
3697         if ( aExp == 0 ) {
3698             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3699             goto roundAndPack;
3700         }
3701         zExp = aExp;
3702         goto shiftRight1;
3703     }
3704     zSig0 = aSig + bSig;
3705     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3706  shiftRight1:
3707     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3708     zSig0 |= LIT64( 0x8000000000000000 );
3709     ++zExp;
3710  roundAndPack:
3711     return
3712         roundAndPackFloatx80(
3713             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3714 
3715 }
3716 
3717 /*
3718 -------------------------------------------------------------------------------
3719 Returns the result of subtracting the absolute values of the extended
3720 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3721 difference is negated before being returned.  `zSign' is ignored if the
3722 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3723 Standard for Binary Floating-Point Arithmetic.
3724 -------------------------------------------------------------------------------
3725 */
3726 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3727 {
3728     int32 aExp, bExp, zExp;
3729     bits64 aSig, bSig, zSig0, zSig1;
3730     int32 expDiff;
3731     floatx80 z;
3732 
3733     aSig = extractFloatx80Frac( a );
3734     aExp = extractFloatx80Exp( a );
3735     bSig = extractFloatx80Frac( b );
3736     bExp = extractFloatx80Exp( b );
3737     expDiff = aExp - bExp;
3738     if ( 0 < expDiff ) goto aExpBigger;
3739     if ( expDiff < 0 ) goto bExpBigger;
3740     if ( aExp == 0x7FFF ) {
3741         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3742             return propagateFloatx80NaN( a, b );
3743         }
3744         float_raise( float_flag_invalid );
3745         z.low = floatx80_default_nan_low;
3746         z.high = floatx80_default_nan_high;
3747         return z;
3748     }
3749     if ( aExp == 0 ) {
3750         aExp = 1;
3751         bExp = 1;
3752     }
3753     zSig1 = 0;
3754     if ( bSig < aSig ) goto aBigger;
3755     if ( aSig < bSig ) goto bBigger;
3756     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3757  bExpBigger:
3758     if ( bExp == 0x7FFF ) {
3759         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3760         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3761     }
3762     if ( aExp == 0 ) ++expDiff;
3763     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3764  bBigger:
3765     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3766     zExp = bExp;
3767     zSign ^= 1;
3768     goto normalizeRoundAndPack;
3769  aExpBigger:
3770     if ( aExp == 0x7FFF ) {
3771         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3772         return a;
3773     }
3774     if ( bExp == 0 ) --expDiff;
3775     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3776  aBigger:
3777     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3778     zExp = aExp;
3779  normalizeRoundAndPack:
3780     return
3781         normalizeRoundAndPackFloatx80(
3782             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3783 
3784 }
3785 
3786 /*
3787 -------------------------------------------------------------------------------
3788 Returns the result of adding the extended double-precision floating-point
3789 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3790 Standard for Binary Floating-Point Arithmetic.
3791 -------------------------------------------------------------------------------
3792 */
3793 floatx80 floatx80_add( floatx80 a, floatx80 b )
3794 {
3795     flag aSign, bSign;
3796 
3797     aSign = extractFloatx80Sign( a );
3798     bSign = extractFloatx80Sign( b );
3799     if ( aSign == bSign ) {
3800         return addFloatx80Sigs( a, b, aSign );
3801     }
3802     else {
3803         return subFloatx80Sigs( a, b, aSign );
3804     }
3805 
3806 }
3807 
3808 /*
3809 -------------------------------------------------------------------------------
3810 Returns the result of subtracting the extended double-precision floating-
3811 point values `a' and `b'.  The operation is performed according to the
3812 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3813 -------------------------------------------------------------------------------
3814 */
3815 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3816 {
3817     flag aSign, bSign;
3818 
3819     aSign = extractFloatx80Sign( a );
3820     bSign = extractFloatx80Sign( b );
3821     if ( aSign == bSign ) {
3822         return subFloatx80Sigs( a, b, aSign );
3823     }
3824     else {
3825         return addFloatx80Sigs( a, b, aSign );
3826     }
3827 
3828 }
3829 
3830 /*
3831 -------------------------------------------------------------------------------
3832 Returns the result of multiplying the extended double-precision floating-
3833 point values `a' and `b'.  The operation is performed according to the
3834 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3835 -------------------------------------------------------------------------------
3836 */
3837 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3838 {
3839     flag aSign, bSign, zSign;
3840     int32 aExp, bExp, zExp;
3841     bits64 aSig, bSig, zSig0, zSig1;
3842     floatx80 z;
3843 
3844     aSig = extractFloatx80Frac( a );
3845     aExp = extractFloatx80Exp( a );
3846     aSign = extractFloatx80Sign( a );
3847     bSig = extractFloatx80Frac( b );
3848     bExp = extractFloatx80Exp( b );
3849     bSign = extractFloatx80Sign( b );
3850     zSign = aSign ^ bSign;
3851     if ( aExp == 0x7FFF ) {
3852         if (    (bits64) ( aSig<<1 )
3853              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3854             return propagateFloatx80NaN( a, b );
3855         }
3856         if ( ( bExp | bSig ) == 0 ) goto invalid;
3857         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3858     }
3859     if ( bExp == 0x7FFF ) {
3860         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3861         if ( ( aExp | aSig ) == 0 ) {
3862  invalid:
3863             float_raise( float_flag_invalid );
3864             z.low = floatx80_default_nan_low;
3865             z.high = floatx80_default_nan_high;
3866             return z;
3867         }
3868         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3869     }
3870     if ( aExp == 0 ) {
3871         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3872         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3873     }
3874     if ( bExp == 0 ) {
3875         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3876         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3877     }
3878     zExp = aExp + bExp - 0x3FFE;
3879     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3880     if ( 0 < (sbits64) zSig0 ) {
3881         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3882         --zExp;
3883     }
3884     return
3885         roundAndPackFloatx80(
3886             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3887 
3888 }
3889 
3890 /*
3891 -------------------------------------------------------------------------------
3892 Returns the result of dividing the extended double-precision floating-point
3893 value `a' by the corresponding value `b'.  The operation is performed
3894 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3895 -------------------------------------------------------------------------------
3896 */
3897 floatx80 floatx80_div( floatx80 a, floatx80 b )
3898 {
3899     flag aSign, bSign, zSign;
3900     int32 aExp, bExp, zExp;
3901     bits64 aSig, bSig, zSig0, zSig1;
3902     bits64 rem0, rem1, rem2, term0, term1, term2;
3903     floatx80 z;
3904 
3905     aSig = extractFloatx80Frac( a );
3906     aExp = extractFloatx80Exp( a );
3907     aSign = extractFloatx80Sign( a );
3908     bSig = extractFloatx80Frac( b );
3909     bExp = extractFloatx80Exp( b );
3910     bSign = extractFloatx80Sign( b );
3911     zSign = aSign ^ bSign;
3912     if ( aExp == 0x7FFF ) {
3913         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3914         if ( bExp == 0x7FFF ) {
3915             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3916             goto invalid;
3917         }
3918         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3919     }
3920     if ( bExp == 0x7FFF ) {
3921         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3922         return packFloatx80( zSign, 0, 0 );
3923     }
3924     if ( bExp == 0 ) {
3925         if ( bSig == 0 ) {
3926             if ( ( aExp | aSig ) == 0 ) {
3927  invalid:
3928                 float_raise( float_flag_invalid );
3929                 z.low = floatx80_default_nan_low;
3930                 z.high = floatx80_default_nan_high;
3931                 return z;
3932             }
3933             float_raise( float_flag_divbyzero );
3934             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3935         }
3936         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3937     }
3938     if ( aExp == 0 ) {
3939         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3940         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3941     }
3942     zExp = aExp - bExp + 0x3FFE;
3943     rem1 = 0;
3944     if ( bSig <= aSig ) {
3945         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3946         ++zExp;
3947     }
3948     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3949     mul64To128( bSig, zSig0, &term0, &term1 );
3950     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3951     while ( (sbits64) rem0 < 0 ) {
3952         --zSig0;
3953         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3954     }
3955     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3956     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3957         mul64To128( bSig, zSig1, &term1, &term2 );
3958         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3959         while ( (sbits64) rem1 < 0 ) {
3960             --zSig1;
3961             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3962         }
3963         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3964     }
3965     return
3966         roundAndPackFloatx80(
3967             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3968 
3969 }
3970 
3971 /*
3972 -------------------------------------------------------------------------------
3973 Returns the remainder of the extended double-precision floating-point value
3974 `a' with respect to the corresponding value `b'.  The operation is performed
3975 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3976 -------------------------------------------------------------------------------
3977 */
3978 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3979 {
3980     flag aSign, bSign, zSign;
3981     int32 aExp, bExp, expDiff;
3982     bits64 aSig0, aSig1, bSig;
3983     bits64 q, term0, term1, alternateASig0, alternateASig1;
3984     floatx80 z;
3985 
3986     aSig0 = extractFloatx80Frac( a );
3987     aExp = extractFloatx80Exp( a );
3988     aSign = extractFloatx80Sign( a );
3989     bSig = extractFloatx80Frac( b );
3990     bExp = extractFloatx80Exp( b );
3991     bSign = extractFloatx80Sign( b );
3992     if ( aExp == 0x7FFF ) {
3993         if (    (bits64) ( aSig0<<1 )
3994              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3995             return propagateFloatx80NaN( a, b );
3996         }
3997         goto invalid;
3998     }
3999     if ( bExp == 0x7FFF ) {
4000         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
4001         return a;
4002     }
4003     if ( bExp == 0 ) {
4004         if ( bSig == 0 ) {
4005  invalid:
4006             float_raise( float_flag_invalid );
4007             z.low = floatx80_default_nan_low;
4008             z.high = floatx80_default_nan_high;
4009             return z;
4010         }
4011         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4012     }
4013     if ( aExp == 0 ) {
4014         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4015         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4016     }
4017     bSig |= LIT64( 0x8000000000000000 );
4018     zSign = aSign;
4019     expDiff = aExp - bExp;
4020     aSig1 = 0;
4021     if ( expDiff < 0 ) {
4022         if ( expDiff < -1 ) return a;
4023         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4024         expDiff = 0;
4025     }
4026     q = ( bSig <= aSig0 );
4027     if ( q ) aSig0 -= bSig;
4028     expDiff -= 64;
4029     while ( 0 < expDiff ) {
4030         q = estimateDiv128To64( aSig0, aSig1, bSig );
4031         q = ( 2 < q ) ? q - 2 : 0;
4032         mul64To128( bSig, q, &term0, &term1 );
4033         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4034         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4035         expDiff -= 62;
4036     }
4037     expDiff += 64;
4038     if ( 0 < expDiff ) {
4039         q = estimateDiv128To64( aSig0, aSig1, bSig );
4040         q = ( 2 < q ) ? q - 2 : 0;
4041         q >>= 64 - expDiff;
4042         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4043         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4044         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4045         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4046             ++q;
4047             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4048         }
4049     }
4050     else {
4051         term1 = 0;
4052         term0 = bSig;
4053     }
4054     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4055     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4056          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4057               && ( q & 1 ) )
4058        ) {
4059         aSig0 = alternateASig0;
4060         aSig1 = alternateASig1;
4061         zSign = ! zSign;
4062     }
4063     return
4064         normalizeRoundAndPackFloatx80(
4065             80, zSign, bExp + expDiff, aSig0, aSig1 );
4066 
4067 }
4068 
4069 /*
4070 -------------------------------------------------------------------------------
4071 Returns the square root of the extended double-precision floating-point
4072 value `a'.  The operation is performed according to the IEC/IEEE Standard
4073 for Binary Floating-Point Arithmetic.
4074 -------------------------------------------------------------------------------
4075 */
4076 floatx80 floatx80_sqrt( floatx80 a )
4077 {
4078     flag aSign;
4079     int32 aExp, zExp;
4080     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4081     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4082     floatx80 z;
4083 
4084     aSig0 = extractFloatx80Frac( a );
4085     aExp = extractFloatx80Exp( a );
4086     aSign = extractFloatx80Sign( a );
4087     if ( aExp == 0x7FFF ) {
4088         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4089         if ( ! aSign ) return a;
4090         goto invalid;
4091     }
4092     if ( aSign ) {
4093         if ( ( aExp | aSig0 ) == 0 ) return a;
4094  invalid:
4095         float_raise( float_flag_invalid );
4096         z.low = floatx80_default_nan_low;
4097         z.high = floatx80_default_nan_high;
4098         return z;
4099     }
4100     if ( aExp == 0 ) {
4101         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4102         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4103     }
4104     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4105     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4106     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4107     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4108     doubleZSig0 = zSig0<<1;
4109     mul64To128( zSig0, zSig0, &term0, &term1 );
4110     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4111     while ( (sbits64) rem0 < 0 ) {
4112         --zSig0;
4113         doubleZSig0 -= 2;
4114         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4115     }
4116     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4117     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4118         if ( zSig1 == 0 ) zSig1 = 1;
4119         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4120         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4121         mul64To128( zSig1, zSig1, &term2, &term3 );
4122         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4123         while ( (sbits64) rem1 < 0 ) {
4124             --zSig1;
4125             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4126             term3 |= 1;
4127             term2 |= doubleZSig0;
4128             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4129         }
4130         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4131     }
4132     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4133     zSig0 |= doubleZSig0;
4134     return
4135         roundAndPackFloatx80(
4136             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4137 
4138 }
4139 
4140 /*
4141 -------------------------------------------------------------------------------
4142 Returns 1 if the extended double-precision floating-point value `a' is
4143 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4144 performed according to the IEC/IEEE Standard for Binary Floating-Point
4145 Arithmetic.
4146 -------------------------------------------------------------------------------
4147 */
4148 flag floatx80_eq( floatx80 a, floatx80 b )
4149 {
4150 
4151     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4152               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4153          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4154               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4155        ) {
4156         if (    floatx80_is_signaling_nan( a )
4157              || floatx80_is_signaling_nan( b ) ) {
4158             float_raise( float_flag_invalid );
4159         }
4160         return 0;
4161     }
4162     return
4163            ( a.low == b.low )
4164         && (    ( a.high == b.high )
4165              || (    ( a.low == 0 )
4166                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4167            );
4168 
4169 }
4170 
4171 /*
4172 -------------------------------------------------------------------------------
4173 Returns 1 if the extended double-precision floating-point value `a' is
4174 less than or equal to the corresponding value `b', and 0 otherwise.  The
4175 comparison is performed according to the IEC/IEEE Standard for Binary
4176 Floating-Point Arithmetic.
4177 -------------------------------------------------------------------------------
4178 */
4179 flag floatx80_le( floatx80 a, floatx80 b )
4180 {
4181     flag aSign, bSign;
4182 
4183     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4184               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4185          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4186               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4187        ) {
4188         float_raise( float_flag_invalid );
4189         return 0;
4190     }
4191     aSign = extractFloatx80Sign( a );
4192     bSign = extractFloatx80Sign( b );
4193     if ( aSign != bSign ) {
4194         return
4195                aSign
4196             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4197                  == 0 );
4198     }
4199     return
4200           aSign ? le128( b.high, b.low, a.high, a.low )
4201         : le128( a.high, a.low, b.high, b.low );
4202 
4203 }
4204 
4205 /*
4206 -------------------------------------------------------------------------------
4207 Returns 1 if the extended double-precision floating-point value `a' is
4208 less than the corresponding value `b', and 0 otherwise.  The comparison
4209 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4210 Arithmetic.
4211 -------------------------------------------------------------------------------
4212 */
4213 flag floatx80_lt( floatx80 a, floatx80 b )
4214 {
4215     flag aSign, bSign;
4216 
4217     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4218               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4219          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4220               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4221        ) {
4222         float_raise( float_flag_invalid );
4223         return 0;
4224     }
4225     aSign = extractFloatx80Sign( a );
4226     bSign = extractFloatx80Sign( b );
4227     if ( aSign != bSign ) {
4228         return
4229                aSign
4230             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4231                  != 0 );
4232     }
4233     return
4234           aSign ? lt128( b.high, b.low, a.high, a.low )
4235         : lt128( a.high, a.low, b.high, b.low );
4236 
4237 }
4238 
4239 /*
4240 -------------------------------------------------------------------------------
4241 Returns 1 if the extended double-precision floating-point value `a' is equal
4242 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4243 raised if either operand is a NaN.  Otherwise, the comparison is performed
4244 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4245 -------------------------------------------------------------------------------
4246 */
4247 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4248 {
4249 
4250     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4251               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4252          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4253               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4254        ) {
4255         float_raise( float_flag_invalid );
4256         return 0;
4257     }
4258     return
4259            ( a.low == b.low )
4260         && (    ( a.high == b.high )
4261              || (    ( a.low == 0 )
4262                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4263            );
4264 
4265 }
4266 
4267 /*
4268 -------------------------------------------------------------------------------
4269 Returns 1 if the extended double-precision floating-point value `a' is less
4270 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4271 do not cause an exception.  Otherwise, the comparison is performed according
4272 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4273 -------------------------------------------------------------------------------
4274 */
4275 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4276 {
4277     flag aSign, bSign;
4278 
4279     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4280               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4281          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4282               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4283        ) {
4284         if (    floatx80_is_signaling_nan( a )
4285              || floatx80_is_signaling_nan( b ) ) {
4286             float_raise( float_flag_invalid );
4287         }
4288         return 0;
4289     }
4290     aSign = extractFloatx80Sign( a );
4291     bSign = extractFloatx80Sign( b );
4292     if ( aSign != bSign ) {
4293         return
4294                aSign
4295             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4296                  == 0 );
4297     }
4298     return
4299           aSign ? le128( b.high, b.low, a.high, a.low )
4300         : le128( a.high, a.low, b.high, b.low );
4301 
4302 }
4303 
4304 /*
4305 -------------------------------------------------------------------------------
4306 Returns 1 if the extended double-precision floating-point value `a' is less
4307 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4308 an exception.  Otherwise, the comparison is performed according to the
4309 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4310 -------------------------------------------------------------------------------
4311 */
4312 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4313 {
4314     flag aSign, bSign;
4315 
4316     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4317               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4318          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4319               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4320        ) {
4321         if (    floatx80_is_signaling_nan( a )
4322              || floatx80_is_signaling_nan( b ) ) {
4323             float_raise( float_flag_invalid );
4324         }
4325         return 0;
4326     }
4327     aSign = extractFloatx80Sign( a );
4328     bSign = extractFloatx80Sign( b );
4329     if ( aSign != bSign ) {
4330         return
4331                aSign
4332             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4333                  != 0 );
4334     }
4335     return
4336           aSign ? lt128( b.high, b.low, a.high, a.low )
4337         : lt128( a.high, a.low, b.high, b.low );
4338 
4339 }
4340 
4341 #endif
4342 
4343 #ifdef FLOAT128
4344 
4345 /*
4346 -------------------------------------------------------------------------------
4347 Returns the result of converting the quadruple-precision floating-point
4348 value `a' to the 32-bit two's complement integer format.  The conversion
4349 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4350 Arithmetic---which means in particular that the conversion is rounded
4351 according to the current rounding mode.  If `a' is a NaN, the largest
4352 positive integer is returned.  Otherwise, if the conversion overflows, the
4353 largest integer with the same sign as `a' is returned.
4354 -------------------------------------------------------------------------------
4355 */
4356 int32 float128_to_int32( float128 a )
4357 {
4358     flag aSign;
4359     int32 aExp, shiftCount;
4360     bits64 aSig0, aSig1;
4361 
4362     aSig1 = extractFloat128Frac1( a );
4363     aSig0 = extractFloat128Frac0( a );
4364     aExp = extractFloat128Exp( a );
4365     aSign = extractFloat128Sign( a );
4366     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4367     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4368     aSig0 |= ( aSig1 != 0 );
4369     shiftCount = 0x4028 - aExp;
4370     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4371     return roundAndPackInt32( aSign, aSig0 );
4372 
4373 }
4374 
4375 /*
4376 -------------------------------------------------------------------------------
4377 Returns the result of converting the quadruple-precision floating-point
4378 value `a' to the 32-bit two's complement integer format.  The conversion
4379 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4380 Arithmetic, except that the conversion is always rounded toward zero.  If
4381 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4382 conversion overflows, the largest integer with the same sign as `a' is
4383 returned.
4384 -------------------------------------------------------------------------------
4385 */
4386 int32 float128_to_int32_round_to_zero( float128 a )
4387 {
4388     flag aSign;
4389     int32 aExp, shiftCount;
4390     bits64 aSig0, aSig1, savedASig;
4391     int32 z;
4392 
4393     aSig1 = extractFloat128Frac1( a );
4394     aSig0 = extractFloat128Frac0( a );
4395     aExp = extractFloat128Exp( a );
4396     aSign = extractFloat128Sign( a );
4397     aSig0 |= ( aSig1 != 0 );
4398     if ( 0x401E < aExp ) {
4399         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4400         goto invalid;
4401     }
4402     else if ( aExp < 0x3FFF ) {
4403         if ( aExp || aSig0 ) float_set_inexact();
4404         return 0;
4405     }
4406     aSig0 |= LIT64( 0x0001000000000000 );
4407     shiftCount = 0x402F - aExp;
4408     savedASig = aSig0;
4409     aSig0 >>= shiftCount;
4410     z = aSig0;
4411     if ( aSign ) z = - z;
4412     if ( ( z < 0 ) ^ aSign ) {
4413  invalid:
4414         float_raise( float_flag_invalid );
4415         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4416     }
4417     if ( ( aSig0<<shiftCount ) != savedASig ) {
4418         float_set_inexact();
4419     }
4420     return z;
4421 
4422 }
4423 
4424 /*
4425 -------------------------------------------------------------------------------
4426 Returns the result of converting the quadruple-precision floating-point
4427 value `a' to the 64-bit two's complement integer format.  The conversion
4428 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4429 Arithmetic---which means in particular that the conversion is rounded
4430 according to the current rounding mode.  If `a' is a NaN, the largest
4431 positive integer is returned.  Otherwise, if the conversion overflows, the
4432 largest integer with the same sign as `a' is returned.
4433 -------------------------------------------------------------------------------
4434 */
4435 int64 float128_to_int64( float128 a )
4436 {
4437     flag aSign;
4438     int32 aExp, shiftCount;
4439     bits64 aSig0, aSig1;
4440 
4441     aSig1 = extractFloat128Frac1( a );
4442     aSig0 = extractFloat128Frac0( a );
4443     aExp = extractFloat128Exp( a );
4444     aSign = extractFloat128Sign( a );
4445     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4446     shiftCount = 0x402F - aExp;
4447     if ( shiftCount <= 0 ) {
4448         if ( 0x403E < aExp ) {
4449             float_raise( float_flag_invalid );
4450             if (    ! aSign
4451                  || (    ( aExp == 0x7FFF )
4452                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4453                     )
4454                ) {
4455                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4456             }
4457             return (sbits64) LIT64( 0x8000000000000000 );
4458         }
4459         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4460     }
4461     else {
4462         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4463     }
4464     return roundAndPackInt64( aSign, aSig0, aSig1 );
4465 
4466 }
4467 
4468 /*
4469 -------------------------------------------------------------------------------
4470 Returns the result of converting the quadruple-precision floating-point
4471 value `a' to the 64-bit two's complement integer format.  The conversion
4472 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4473 Arithmetic, except that the conversion is always rounded toward zero.
4474 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4475 the conversion overflows, the largest integer with the same sign as `a' is
4476 returned.
4477 -------------------------------------------------------------------------------
4478 */
4479 int64 float128_to_int64_round_to_zero( float128 a )
4480 {
4481     flag aSign;
4482     int32 aExp, shiftCount;
4483     bits64 aSig0, aSig1;
4484     int64 z;
4485 
4486     aSig1 = extractFloat128Frac1( a );
4487     aSig0 = extractFloat128Frac0( a );
4488     aExp = extractFloat128Exp( a );
4489     aSign = extractFloat128Sign( a );
4490     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4491     shiftCount = aExp - 0x402F;
4492     if ( 0 < shiftCount ) {
4493         if ( 0x403E <= aExp ) {
4494             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4495             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4496                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4497                 if ( aSig1 ) float_set_inexact();
4498             }
4499             else {
4500                 float_raise( float_flag_invalid );
4501                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4502                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4503                 }
4504             }
4505             return (sbits64) LIT64( 0x8000000000000000 );
4506         }
4507         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4508         if ( (bits64) ( aSig1<<shiftCount ) ) {
4509             float_set_inexact();
4510         }
4511     }
4512     else {
4513         if ( aExp < 0x3FFF ) {
4514             if ( aExp | aSig0 | aSig1 ) {
4515                 float_set_inexact();
4516             }
4517             return 0;
4518         }
4519         z = aSig0>>( - shiftCount );
4520         if (    aSig1
4521              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4522             float_set_inexact();
4523         }
4524     }
4525     if ( aSign ) z = - z;
4526     return z;
4527 
4528 }
4529 
4530 /*
4531 -------------------------------------------------------------------------------
4532 Returns the result of converting the quadruple-precision floating-point
4533 value `a' to the single-precision floating-point format.  The conversion
4534 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4535 Arithmetic.
4536 -------------------------------------------------------------------------------
4537 */
4538 float32 float128_to_float32( float128 a )
4539 {
4540     flag aSign;
4541     int32 aExp;
4542     bits64 aSig0, aSig1;
4543     bits32 zSig;
4544 
4545     aSig1 = extractFloat128Frac1( a );
4546     aSig0 = extractFloat128Frac0( a );
4547     aExp = extractFloat128Exp( a );
4548     aSign = extractFloat128Sign( a );
4549     if ( aExp == 0x7FFF ) {
4550         if ( aSig0 | aSig1 ) {
4551             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4552         }
4553         return packFloat32( aSign, 0xFF, 0 );
4554     }
4555     aSig0 |= ( aSig1 != 0 );
4556     shift64RightJamming( aSig0, 18, &aSig0 );
4557     zSig = aSig0;
4558     if ( aExp || zSig ) {
4559         zSig |= 0x40000000;
4560         aExp -= 0x3F81;
4561     }
4562     return roundAndPackFloat32( aSign, aExp, zSig );
4563 
4564 }
4565 
4566 /*
4567 -------------------------------------------------------------------------------
4568 Returns the result of converting the quadruple-precision floating-point
4569 value `a' to the double-precision floating-point format.  The conversion
4570 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4571 Arithmetic.
4572 -------------------------------------------------------------------------------
4573 */
4574 float64 float128_to_float64( float128 a )
4575 {
4576     flag aSign;
4577     int32 aExp;
4578     bits64 aSig0, aSig1;
4579 
4580     aSig1 = extractFloat128Frac1( a );
4581     aSig0 = extractFloat128Frac0( a );
4582     aExp = extractFloat128Exp( a );
4583     aSign = extractFloat128Sign( a );
4584     if ( aExp == 0x7FFF ) {
4585         if ( aSig0 | aSig1 ) {
4586             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4587         }
4588         return packFloat64( aSign, 0x7FF, 0 );
4589     }
4590     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4591     aSig0 |= ( aSig1 != 0 );
4592     if ( aExp || aSig0 ) {
4593         aSig0 |= LIT64( 0x4000000000000000 );
4594         aExp -= 0x3C01;
4595     }
4596     return roundAndPackFloat64( aSign, aExp, aSig0 );
4597 
4598 }
4599 
4600 #ifdef FLOATX80
4601 
4602 /*
4603 -------------------------------------------------------------------------------
4604 Returns the result of converting the quadruple-precision floating-point
4605 value `a' to the extended double-precision floating-point format.  The
4606 conversion is performed according to the IEC/IEEE Standard for Binary
4607 Floating-Point Arithmetic.
4608 -------------------------------------------------------------------------------
4609 */
4610 floatx80 float128_to_floatx80( float128 a )
4611 {
4612     flag aSign;
4613     int32 aExp;
4614     bits64 aSig0, aSig1;
4615 
4616     aSig1 = extractFloat128Frac1( a );
4617     aSig0 = extractFloat128Frac0( a );
4618     aExp = extractFloat128Exp( a );
4619     aSign = extractFloat128Sign( a );
4620     if ( aExp == 0x7FFF ) {
4621         if ( aSig0 | aSig1 ) {
4622             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4623         }
4624         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4625     }
4626     if ( aExp == 0 ) {
4627         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4628         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4629     }
4630     else {
4631         aSig0 |= LIT64( 0x0001000000000000 );
4632     }
4633     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4634     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4635 
4636 }
4637 
4638 #endif
4639 
4640 /*
4641 -------------------------------------------------------------------------------
4642 Rounds the quadruple-precision floating-point value `a' to an integer, and
4643 returns the result as a quadruple-precision floating-point value.  The
4644 operation is performed according to the IEC/IEEE Standard for Binary
4645 Floating-Point Arithmetic.
4646 -------------------------------------------------------------------------------
4647 */
4648 float128 float128_round_to_int( float128 a )
4649 {
4650     flag aSign;
4651     int32 aExp;
4652     bits64 lastBitMask, roundBitsMask;
4653     int8 roundingMode;
4654     float128 z;
4655 
4656     aExp = extractFloat128Exp( a );
4657     if ( 0x402F <= aExp ) {
4658         if ( 0x406F <= aExp ) {
4659             if (    ( aExp == 0x7FFF )
4660                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4661                ) {
4662                 return propagateFloat128NaN( a, a );
4663             }
4664             return a;
4665         }
4666         lastBitMask = 1;
4667         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4668         roundBitsMask = lastBitMask - 1;
4669         z = a;
4670         roundingMode = float_rounding_mode();
4671         if ( roundingMode == float_round_nearest_even ) {
4672             if ( lastBitMask ) {
4673                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4674                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4675             }
4676             else {
4677                 if ( (sbits64) z.low < 0 ) {
4678                     ++z.high;
4679                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4680                 }
4681             }
4682         }
4683         else if ( roundingMode != float_round_to_zero ) {
4684             if (   extractFloat128Sign( z )
4685                  ^ ( roundingMode == float_round_up ) ) {
4686                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4687             }
4688         }
4689         z.low &= ~ roundBitsMask;
4690     }
4691     else {
4692         if ( aExp < 0x3FFF ) {
4693             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4694             float_set_inexact();
4695             aSign = extractFloat128Sign( a );
4696             switch ( float_rounding_mode() ) {
4697              case float_round_nearest_even:
4698                 if (    ( aExp == 0x3FFE )
4699                      && (   extractFloat128Frac0( a )
4700                           | extractFloat128Frac1( a ) )
4701                    ) {
4702                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4703                 }
4704                 break;
4705              case float_round_down:
4706                 return
4707                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4708                     : packFloat128( 0, 0, 0, 0 );
4709              case float_round_up:
4710                 return
4711                       aSign ? packFloat128( 1, 0, 0, 0 )
4712                     : packFloat128( 0, 0x3FFF, 0, 0 );
4713             }
4714             return packFloat128( aSign, 0, 0, 0 );
4715         }
4716         lastBitMask = 1;
4717         lastBitMask <<= 0x402F - aExp;
4718         roundBitsMask = lastBitMask - 1;
4719         z.low = 0;
4720         z.high = a.high;
4721         roundingMode = float_rounding_mode();
4722         if ( roundingMode == float_round_nearest_even ) {
4723             z.high += lastBitMask>>1;
4724             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4725                 z.high &= ~ lastBitMask;
4726             }
4727         }
4728         else if ( roundingMode != float_round_to_zero ) {
4729             if (   extractFloat128Sign( z )
4730                  ^ ( roundingMode == float_round_up ) ) {
4731                 z.high |= ( a.low != 0 );
4732                 z.high += roundBitsMask;
4733             }
4734         }
4735         z.high &= ~ roundBitsMask;
4736     }
4737     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4738         float_set_inexact();
4739     }
4740     return z;
4741 
4742 }
4743 
4744 /*
4745 -------------------------------------------------------------------------------
4746 Returns the result of adding the absolute values of the quadruple-precision
4747 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4748 before being returned.  `zSign' is ignored if the result is a NaN.
4749 The addition is performed according to the IEC/IEEE Standard for Binary
4750 Floating-Point Arithmetic.
4751 -------------------------------------------------------------------------------
4752 */
4753 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4754 {
4755     int32 aExp, bExp, zExp;
4756     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4757     int32 expDiff;
4758 
4759     aSig1 = extractFloat128Frac1( a );
4760     aSig0 = extractFloat128Frac0( a );
4761     aExp = extractFloat128Exp( a );
4762     bSig1 = extractFloat128Frac1( b );
4763     bSig0 = extractFloat128Frac0( b );
4764     bExp = extractFloat128Exp( b );
4765     expDiff = aExp - bExp;
4766     if ( 0 < expDiff ) {
4767         if ( aExp == 0x7FFF ) {
4768             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4769             return a;
4770         }
4771         if ( bExp == 0 ) {
4772             --expDiff;
4773         }
4774         else {
4775             bSig0 |= LIT64( 0x0001000000000000 );
4776         }
4777         shift128ExtraRightJamming(
4778             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4779         zExp = aExp;
4780     }
4781     else if ( expDiff < 0 ) {
4782         if ( bExp == 0x7FFF ) {
4783             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4784             return packFloat128( zSign, 0x7FFF, 0, 0 );
4785         }
4786         if ( aExp == 0 ) {
4787             ++expDiff;
4788         }
4789         else {
4790             aSig0 |= LIT64( 0x0001000000000000 );
4791         }
4792         shift128ExtraRightJamming(
4793             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4794         zExp = bExp;
4795     }
4796     else {
4797         if ( aExp == 0x7FFF ) {
4798             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4799                 return propagateFloat128NaN( a, b );
4800             }
4801             return a;
4802         }
4803         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4804         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4805         zSig2 = 0;
4806         zSig0 |= LIT64( 0x0002000000000000 );
4807         zExp = aExp;
4808         goto shiftRight1;
4809     }
4810     aSig0 |= LIT64( 0x0001000000000000 );
4811     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4812     --zExp;
4813     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4814     ++zExp;
4815  shiftRight1:
4816     shift128ExtraRightJamming(
4817         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4818  roundAndPack:
4819     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4820 
4821 }
4822 
4823 /*
4824 -------------------------------------------------------------------------------
4825 Returns the result of subtracting the absolute values of the quadruple-
4826 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4827 difference is negated before being returned.  `zSign' is ignored if the
4828 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4829 Standard for Binary Floating-Point Arithmetic.
4830 -------------------------------------------------------------------------------
4831 */
4832 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4833 {
4834     int32 aExp, bExp, zExp;
4835     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4836     int32 expDiff;
4837     float128 z;
4838 
4839     aSig1 = extractFloat128Frac1( a );
4840     aSig0 = extractFloat128Frac0( a );
4841     aExp = extractFloat128Exp( a );
4842     bSig1 = extractFloat128Frac1( b );
4843     bSig0 = extractFloat128Frac0( b );
4844     bExp = extractFloat128Exp( b );
4845     expDiff = aExp - bExp;
4846     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4847     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4848     if ( 0 < expDiff ) goto aExpBigger;
4849     if ( expDiff < 0 ) goto bExpBigger;
4850     if ( aExp == 0x7FFF ) {
4851         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4852             return propagateFloat128NaN( a, b );
4853         }
4854         float_raise( float_flag_invalid );
4855         z.low = float128_default_nan_low;
4856         z.high = float128_default_nan_high;
4857         return z;
4858     }
4859     if ( aExp == 0 ) {
4860         aExp = 1;
4861         bExp = 1;
4862     }
4863     if ( bSig0 < aSig0 ) goto aBigger;
4864     if ( aSig0 < bSig0 ) goto bBigger;
4865     if ( bSig1 < aSig1 ) goto aBigger;
4866     if ( aSig1 < bSig1 ) goto bBigger;
4867     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4868  bExpBigger:
4869     if ( bExp == 0x7FFF ) {
4870         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4871         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4872     }
4873     if ( aExp == 0 ) {
4874         ++expDiff;
4875     }
4876     else {
4877         aSig0 |= LIT64( 0x4000000000000000 );
4878     }
4879     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4880     bSig0 |= LIT64( 0x4000000000000000 );
4881  bBigger:
4882     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4883     zExp = bExp;
4884     zSign ^= 1;
4885     goto normalizeRoundAndPack;
4886  aExpBigger:
4887     if ( aExp == 0x7FFF ) {
4888         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4889         return a;
4890     }
4891     if ( bExp == 0 ) {
4892         --expDiff;
4893     }
4894     else {
4895         bSig0 |= LIT64( 0x4000000000000000 );
4896     }
4897     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4898     aSig0 |= LIT64( 0x4000000000000000 );
4899  aBigger:
4900     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4901     zExp = aExp;
4902  normalizeRoundAndPack:
4903     --zExp;
4904     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4905 
4906 }
4907 
4908 /*
4909 -------------------------------------------------------------------------------
4910 Returns the result of adding the quadruple-precision floating-point values
4911 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4912 for Binary Floating-Point Arithmetic.
4913 -------------------------------------------------------------------------------
4914 */
4915 float128 float128_add( float128 a, float128 b )
4916 {
4917     flag aSign, bSign;
4918 
4919     aSign = extractFloat128Sign( a );
4920     bSign = extractFloat128Sign( b );
4921     if ( aSign == bSign ) {
4922         return addFloat128Sigs( a, b, aSign );
4923     }
4924     else {
4925         return subFloat128Sigs( a, b, aSign );
4926     }
4927 
4928 }
4929 
4930 /*
4931 -------------------------------------------------------------------------------
4932 Returns the result of subtracting the quadruple-precision floating-point
4933 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4934 Standard for Binary Floating-Point Arithmetic.
4935 -------------------------------------------------------------------------------
4936 */
4937 float128 float128_sub( float128 a, float128 b )
4938 {
4939     flag aSign, bSign;
4940 
4941     aSign = extractFloat128Sign( a );
4942     bSign = extractFloat128Sign( b );
4943     if ( aSign == bSign ) {
4944         return subFloat128Sigs( a, b, aSign );
4945     }
4946     else {
4947         return addFloat128Sigs( a, b, aSign );
4948     }
4949 
4950 }
4951 
4952 /*
4953 -------------------------------------------------------------------------------
4954 Returns the result of multiplying the quadruple-precision floating-point
4955 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4956 Standard for Binary Floating-Point Arithmetic.
4957 -------------------------------------------------------------------------------
4958 */
4959 float128 float128_mul( float128 a, float128 b )
4960 {
4961     flag aSign, bSign, zSign;
4962     int32 aExp, bExp, zExp;
4963     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4964     float128 z;
4965 
4966     aSig1 = extractFloat128Frac1( a );
4967     aSig0 = extractFloat128Frac0( a );
4968     aExp = extractFloat128Exp( a );
4969     aSign = extractFloat128Sign( a );
4970     bSig1 = extractFloat128Frac1( b );
4971     bSig0 = extractFloat128Frac0( b );
4972     bExp = extractFloat128Exp( b );
4973     bSign = extractFloat128Sign( b );
4974     zSign = aSign ^ bSign;
4975     if ( aExp == 0x7FFF ) {
4976         if (    ( aSig0 | aSig1 )
4977              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4978             return propagateFloat128NaN( a, b );
4979         }
4980         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4981         return packFloat128( zSign, 0x7FFF, 0, 0 );
4982     }
4983     if ( bExp == 0x7FFF ) {
4984         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4985         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4986  invalid:
4987             float_raise( float_flag_invalid );
4988             z.low = float128_default_nan_low;
4989             z.high = float128_default_nan_high;
4990             return z;
4991         }
4992         return packFloat128( zSign, 0x7FFF, 0, 0 );
4993     }
4994     if ( aExp == 0 ) {
4995         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4996         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4997     }
4998     if ( bExp == 0 ) {
4999         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5000         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5001     }
5002     zExp = aExp + bExp - 0x4000;
5003     aSig0 |= LIT64( 0x0001000000000000 );
5004     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5005     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5006     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5007     zSig2 |= ( zSig3 != 0 );
5008     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5009         shift128ExtraRightJamming(
5010             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5011         ++zExp;
5012     }
5013     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5014 
5015 }
5016 
5017 /*
5018 -------------------------------------------------------------------------------
5019 Returns the result of dividing the quadruple-precision floating-point value
5020 `a' by the corresponding value `b'.  The operation is performed according to
5021 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5022 -------------------------------------------------------------------------------
5023 */
5024 float128 float128_div( float128 a, float128 b )
5025 {
5026     flag aSign, bSign, zSign;
5027     int32 aExp, bExp, zExp;
5028     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5029     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5030     float128 z;
5031 
5032     aSig1 = extractFloat128Frac1( a );
5033     aSig0 = extractFloat128Frac0( a );
5034     aExp = extractFloat128Exp( a );
5035     aSign = extractFloat128Sign( a );
5036     bSig1 = extractFloat128Frac1( b );
5037     bSig0 = extractFloat128Frac0( b );
5038     bExp = extractFloat128Exp( b );
5039     bSign = extractFloat128Sign( b );
5040     zSign = aSign ^ bSign;
5041     if ( aExp == 0x7FFF ) {
5042         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5043         if ( bExp == 0x7FFF ) {
5044             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5045             goto invalid;
5046         }
5047         return packFloat128( zSign, 0x7FFF, 0, 0 );
5048     }
5049     if ( bExp == 0x7FFF ) {
5050         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5051         return packFloat128( zSign, 0, 0, 0 );
5052     }
5053     if ( bExp == 0 ) {
5054         if ( ( bSig0 | bSig1 ) == 0 ) {
5055             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5056  invalid:
5057                 float_raise( float_flag_invalid );
5058                 z.low = float128_default_nan_low;
5059                 z.high = float128_default_nan_high;
5060                 return z;
5061             }
5062             float_raise( float_flag_divbyzero );
5063             return packFloat128( zSign, 0x7FFF, 0, 0 );
5064         }
5065         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5066     }
5067     if ( aExp == 0 ) {
5068         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5069         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5070     }
5071     zExp = aExp - bExp + 0x3FFD;
5072     shortShift128Left(
5073         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5074     shortShift128Left(
5075         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5076     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5077         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5078         ++zExp;
5079     }
5080     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5081     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5082     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5083     while ( (sbits64) rem0 < 0 ) {
5084         --zSig0;
5085         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5086     }
5087     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5088     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5089         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5090         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5091         while ( (sbits64) rem1 < 0 ) {
5092             --zSig1;
5093             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5094         }
5095         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5096     }
5097     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5098     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5099 
5100 }
5101 
5102 /*
5103 -------------------------------------------------------------------------------
5104 Returns the remainder of the quadruple-precision floating-point value `a'
5105 with respect to the corresponding value `b'.  The operation is performed
5106 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5107 -------------------------------------------------------------------------------
5108 */
5109 float128 float128_rem( float128 a, float128 b )
5110 {
5111     flag aSign, bSign, zSign;
5112     int32 aExp, bExp, expDiff;
5113     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5114     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5115     sbits64 sigMean0;
5116     float128 z;
5117 
5118     aSig1 = extractFloat128Frac1( a );
5119     aSig0 = extractFloat128Frac0( a );
5120     aExp = extractFloat128Exp( a );
5121     aSign = extractFloat128Sign( a );
5122     bSig1 = extractFloat128Frac1( b );
5123     bSig0 = extractFloat128Frac0( b );
5124     bExp = extractFloat128Exp( b );
5125     bSign = extractFloat128Sign( b );
5126     if ( aExp == 0x7FFF ) {
5127         if (    ( aSig0 | aSig1 )
5128              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5129             return propagateFloat128NaN( a, b );
5130         }
5131         goto invalid;
5132     }
5133     if ( bExp == 0x7FFF ) {
5134         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5135         return a;
5136     }
5137     if ( bExp == 0 ) {
5138         if ( ( bSig0 | bSig1 ) == 0 ) {
5139  invalid:
5140             float_raise( float_flag_invalid );
5141             z.low = float128_default_nan_low;
5142             z.high = float128_default_nan_high;
5143             return z;
5144         }
5145         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5146     }
5147     if ( aExp == 0 ) {
5148         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5149         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5150     }
5151     expDiff = aExp - bExp;
5152     if ( expDiff < -1 ) return a;
5153     shortShift128Left(
5154         aSig0 | LIT64( 0x0001000000000000 ),
5155         aSig1,
5156         15 - ( expDiff < 0 ),
5157         &aSig0,
5158         &aSig1
5159     );
5160     shortShift128Left(
5161         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5162     q = le128( bSig0, bSig1, aSig0, aSig1 );
5163     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5164     expDiff -= 64;
5165     while ( 0 < expDiff ) {
5166         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5167         q = ( 4 < q ) ? q - 4 : 0;
5168         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5169         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5170         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5171         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5172         expDiff -= 61;
5173     }
5174     if ( -64 < expDiff ) {
5175         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5176         q = ( 4 < q ) ? q - 4 : 0;
5177         q >>= - expDiff;
5178         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5179         expDiff += 52;
5180         if ( expDiff < 0 ) {
5181             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5182         }
5183         else {
5184             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5185         }
5186         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5187         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5188     }
5189     else {
5190         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5191         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5192     }
5193     do {
5194         alternateASig0 = aSig0;
5195         alternateASig1 = aSig1;
5196         ++q;
5197         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5198     } while ( 0 <= (sbits64) aSig0 );
5199     add128(
5200         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5201     if (    ( sigMean0 < 0 )
5202          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5203         aSig0 = alternateASig0;
5204         aSig1 = alternateASig1;
5205     }
5206     zSign = ( (sbits64) aSig0 < 0 );
5207     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5208     return
5209         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5210 
5211 }
5212 
5213 /*
5214 -------------------------------------------------------------------------------
5215 Returns the square root of the quadruple-precision floating-point value `a'.
5216 The operation is performed according to the IEC/IEEE Standard for Binary
5217 Floating-Point Arithmetic.
5218 -------------------------------------------------------------------------------
5219 */
5220 float128 float128_sqrt( float128 a )
5221 {
5222     flag aSign;
5223     int32 aExp, zExp;
5224     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5225     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5226     float128 z;
5227 
5228     aSig1 = extractFloat128Frac1( a );
5229     aSig0 = extractFloat128Frac0( a );
5230     aExp = extractFloat128Exp( a );
5231     aSign = extractFloat128Sign( a );
5232     if ( aExp == 0x7FFF ) {
5233         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5234         if ( ! aSign ) return a;
5235         goto invalid;
5236     }
5237     if ( aSign ) {
5238         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5239  invalid:
5240         float_raise( float_flag_invalid );
5241         z.low = float128_default_nan_low;
5242         z.high = float128_default_nan_high;
5243         return z;
5244     }
5245     if ( aExp == 0 ) {
5246         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5247         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5248     }
5249     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5250     aSig0 |= LIT64( 0x0001000000000000 );
5251     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5252     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5253     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5254     doubleZSig0 = zSig0<<1;
5255     mul64To128( zSig0, zSig0, &term0, &term1 );
5256     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5257     while ( (sbits64) rem0 < 0 ) {
5258         --zSig0;
5259         doubleZSig0 -= 2;
5260         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5261     }
5262     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5263     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5264         if ( zSig1 == 0 ) zSig1 = 1;
5265         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5266         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5267         mul64To128( zSig1, zSig1, &term2, &term3 );
5268         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5269         while ( (sbits64) rem1 < 0 ) {
5270             --zSig1;
5271             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5272             term3 |= 1;
5273             term2 |= doubleZSig0;
5274             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5275         }
5276         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5277     }
5278     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5279     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5280 
5281 }
5282 
5283 /*
5284 -------------------------------------------------------------------------------
5285 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5286 the corresponding value `b', and 0 otherwise.  The comparison is performed
5287 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5288 -------------------------------------------------------------------------------
5289 */
5290 flag float128_eq( float128 a, float128 b )
5291 {
5292 
5293     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5294               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5295          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5296               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5297        ) {
5298         if (    float128_is_signaling_nan( a )
5299              || float128_is_signaling_nan( b ) ) {
5300             float_raise( float_flag_invalid );
5301         }
5302         return 0;
5303     }
5304     return
5305            ( a.low == b.low )
5306         && (    ( a.high == b.high )
5307              || (    ( a.low == 0 )
5308                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5309            );
5310 
5311 }
5312 
5313 /*
5314 -------------------------------------------------------------------------------
5315 Returns 1 if the quadruple-precision floating-point value `a' is less than
5316 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5317 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5318 Arithmetic.
5319 -------------------------------------------------------------------------------
5320 */
5321 flag float128_le( float128 a, float128 b )
5322 {
5323     flag aSign, bSign;
5324 
5325     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5326               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5327          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5328               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5329        ) {
5330         float_raise( float_flag_invalid );
5331         return 0;
5332     }
5333     aSign = extractFloat128Sign( a );
5334     bSign = extractFloat128Sign( b );
5335     if ( aSign != bSign ) {
5336         return
5337                aSign
5338             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5339                  == 0 );
5340     }
5341     return
5342           aSign ? le128( b.high, b.low, a.high, a.low )
5343         : le128( a.high, a.low, b.high, b.low );
5344 
5345 }
5346 
5347 /*
5348 -------------------------------------------------------------------------------
5349 Returns 1 if the quadruple-precision floating-point value `a' is less than
5350 the corresponding value `b', and 0 otherwise.  The comparison is performed
5351 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5352 -------------------------------------------------------------------------------
5353 */
5354 flag float128_lt( float128 a, float128 b )
5355 {
5356     flag aSign, bSign;
5357 
5358     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5359               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5360          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5361               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5362        ) {
5363         float_raise( float_flag_invalid );
5364         return 0;
5365     }
5366     aSign = extractFloat128Sign( a );
5367     bSign = extractFloat128Sign( b );
5368     if ( aSign != bSign ) {
5369         return
5370                aSign
5371             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5372                  != 0 );
5373     }
5374     return
5375           aSign ? lt128( b.high, b.low, a.high, a.low )
5376         : lt128( a.high, a.low, b.high, b.low );
5377 
5378 }
5379 
5380 /*
5381 -------------------------------------------------------------------------------
5382 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5383 the corresponding value `b', and 0 otherwise.  The invalid exception is
5384 raised if either operand is a NaN.  Otherwise, the comparison is performed
5385 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5386 -------------------------------------------------------------------------------
5387 */
5388 flag float128_eq_signaling( float128 a, float128 b )
5389 {
5390 
5391     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5392               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5393          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5394               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5395        ) {
5396         float_raise( float_flag_invalid );
5397         return 0;
5398     }
5399     return
5400            ( a.low == b.low )
5401         && (    ( a.high == b.high )
5402              || (    ( a.low == 0 )
5403                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5404            );
5405 
5406 }
5407 
5408 /*
5409 -------------------------------------------------------------------------------
5410 Returns 1 if the quadruple-precision floating-point value `a' is less than
5411 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5412 cause an exception.  Otherwise, the comparison is performed according to the
5413 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5414 -------------------------------------------------------------------------------
5415 */
5416 flag float128_le_quiet( float128 a, float128 b )
5417 {
5418     flag aSign, bSign;
5419 
5420     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5421               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5422          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5423               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5424        ) {
5425         if (    float128_is_signaling_nan( a )
5426              || float128_is_signaling_nan( b ) ) {
5427             float_raise( float_flag_invalid );
5428         }
5429         return 0;
5430     }
5431     aSign = extractFloat128Sign( a );
5432     bSign = extractFloat128Sign( b );
5433     if ( aSign != bSign ) {
5434         return
5435                aSign
5436             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5437                  == 0 );
5438     }
5439     return
5440           aSign ? le128( b.high, b.low, a.high, a.low )
5441         : le128( a.high, a.low, b.high, b.low );
5442 
5443 }
5444 
5445 /*
5446 -------------------------------------------------------------------------------
5447 Returns 1 if the quadruple-precision floating-point value `a' is less than
5448 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5449 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5450 Standard for Binary Floating-Point Arithmetic.
5451 -------------------------------------------------------------------------------
5452 */
5453 flag float128_lt_quiet( float128 a, float128 b )
5454 {
5455     flag aSign, bSign;
5456 
5457     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5458               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5459          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5460               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5461        ) {
5462         if (    float128_is_signaling_nan( a )
5463              || float128_is_signaling_nan( b ) ) {
5464             float_raise( float_flag_invalid );
5465         }
5466         return 0;
5467     }
5468     aSign = extractFloat128Sign( a );
5469     bSign = extractFloat128Sign( b );
5470     if ( aSign != bSign ) {
5471         return
5472                aSign
5473             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5474                  != 0 );
5475     }
5476     return
5477           aSign ? lt128( b.high, b.low, a.high, a.low )
5478         : lt128( a.high, a.low, b.high, b.low );
5479 
5480 }
5481 
5482 #endif
5483 
5484 
5485 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5486 
5487 /*
5488  * These two routines are not part of the original softfloat distribution.
5489  *
5490  * They are based on the corresponding conversions to integer but return
5491  * unsigned numbers instead since these functions are required by GCC.
5492  *
5493  * Added by Mark Brinicombe <mark@netbsd.org>	27/09/97
5494  *
5495  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5496  */
5497 
5498 /*
5499 -------------------------------------------------------------------------------
5500 Returns the result of converting the double-precision floating-point value
5501 `a' to the 32-bit unsigned integer format.  The conversion is
5502 performed according to the IEC/IEEE Standard for Binary Floating-point
5503 Arithmetic, except that the conversion is always rounded toward zero.  If
5504 `a' is a NaN, the largest positive integer is returned.  If the conversion
5505 overflows, the largest integer positive is returned.
5506 -------------------------------------------------------------------------------
5507 */
5508 uint32 float64_to_uint32_round_to_zero( float64 a )
5509 {
5510     flag aSign;
5511     int16 aExp, shiftCount;
5512     bits64 aSig, savedASig;
5513     uint32 z;
5514 
5515     aSig = extractFloat64Frac( a );
5516     aExp = extractFloat64Exp( a );
5517     aSign = extractFloat64Sign( a );
5518 
5519     if (aSign) {
5520         float_raise( float_flag_invalid );
5521     	return(0);
5522     }
5523 
5524     if ( 0x41E < aExp ) {
5525         float_raise( float_flag_invalid );
5526         return 0xffffffff;
5527     }
5528     else if ( aExp < 0x3FF ) {
5529         if ( aExp || aSig ) float_set_inexact();
5530         return 0;
5531     }
5532     aSig |= LIT64( 0x0010000000000000 );
5533     shiftCount = 0x433 - aExp;
5534     savedASig = aSig;
5535     aSig >>= shiftCount;
5536     z = aSig;
5537     if ( ( aSig<<shiftCount ) != savedASig ) {
5538         float_set_inexact();
5539     }
5540     return z;
5541 
5542 }
5543 
5544 /*
5545 -------------------------------------------------------------------------------
5546 Returns the result of converting the single-precision floating-point value
5547 `a' to the 32-bit unsigned integer format.  The conversion is
5548 performed according to the IEC/IEEE Standard for Binary Floating-point
5549 Arithmetic, except that the conversion is always rounded toward zero.  If
5550 `a' is a NaN, the largest positive integer is returned.  If the conversion
5551 overflows, the largest positive integer is returned.
5552 -------------------------------------------------------------------------------
5553 */
5554 uint32 float32_to_uint32_round_to_zero( float32 a )
5555 {
5556     flag aSign;
5557     int16 aExp, shiftCount;
5558     bits32 aSig;
5559     uint32 z;
5560 
5561     aSig = extractFloat32Frac( a );
5562     aExp = extractFloat32Exp( a );
5563     aSign = extractFloat32Sign( a );
5564     shiftCount = aExp - 0x9E;
5565 
5566     if (aSign) {
5567         float_raise( float_flag_invalid );
5568     	return(0);
5569     }
5570     if ( 0 < shiftCount ) {
5571         float_raise( float_flag_invalid );
5572         return 0xFFFFFFFF;
5573     }
5574     else if ( aExp <= 0x7E ) {
5575         if ( aExp | aSig ) float_set_inexact();
5576         return 0;
5577     }
5578     aSig = ( aSig | 0x800000 )<<8;
5579     z = aSig>>( - shiftCount );
5580     if ( aSig<<( shiftCount & 31 ) ) {
5581         float_set_inexact();
5582     }
5583     return z;
5584 
5585 }
5586 
5587 #endif
5588