xref: /openbsd-src/sys/lib/libkern/softfloat.c (revision d874cce4b1d9fe6b41c9e4f2117a77d8a4a37b92)
1 /*	$OpenBSD: softfloat.c,v 1.2 2007/12/29 16:59:16 miod Exp $	*/
2 /*	$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $	*/
3 
4 /*
5  * This version hacked for use with gcc -msoft-float by bjh21.
6  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
7  *  itself).
8  */
9 
10 /*
11  * Things you may want to define:
12  *
13  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
14  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
15  *   properly renamed.
16  */
17 
18 /*
19 ===============================================================================
20 
21 This C source file is part of the SoftFloat IEC/IEEE Floating-point
22 Arithmetic Package, Release 2a.
23 
24 Written by John R. Hauser.  This work was made possible in part by the
25 International Computer Science Institute, located at Suite 600, 1947 Center
26 Street, Berkeley, California 94704.  Funding was partially provided by the
27 National Science Foundation under grant MIP-9311980.  The original version
28 of this code was written as part of a project to build a fixed-point vector
29 processor in collaboration with the University of California at Berkeley,
30 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
31 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
32 arithmetic/SoftFloat.html'.
33 
34 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable
35 effort has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT
36 WILL AT TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS
37 RESTRICTED TO PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL
38 RESPONSIBILITY FOR ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM
39 THEIR OWN USE OF THE SOFTWARE, AND WHO ALSO EFFECTIVELY INDEMNIFY
40 (possibly via similar legal warning) JOHN HAUSER AND THE INTERNATIONAL
41 COMPUTER SCIENCE INSTITUTE AGAINST ALL LOSSES, COSTS, OR OTHER PROBLEMS
42 ARISING FROM THE USE OF THE SOFTWARE BY THEIR CUSTOMERS AND CLIENTS.
43 
44 Derivative works are acceptable, even for commercial purposes, so long as
45 (1) they include prominent notice that the work is derivative, and (2) they
46 include prominent notice akin to these four paragraphs for those parts of
47 this code that are retained.
48 
49 ===============================================================================
50 */
51 
52 #include <sys/cdefs.h>
53 #if defined(LIBC_SCCS) && !defined(lint)
54 __RCSID("$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $");
55 #endif /* LIBC_SCCS and not lint */
56 
57 #ifdef SOFTFLOAT_FOR_GCC
58 #include "softfloat-for-gcc.h"
59 #endif
60 
61 #include "milieu.h"
62 #include "softfloat.h"
63 
64 /*
65  * Conversions between floats as stored in memory and floats as
66  * SoftFloat uses them
67  */
68 #ifndef FLOAT64_DEMANGLE
69 #define FLOAT64_DEMANGLE(a)	(a)
70 #endif
71 #ifndef FLOAT64_MANGLE
72 #define FLOAT64_MANGLE(a)	(a)
73 #endif
74 
75 /*
76 -------------------------------------------------------------------------------
77 Floating-point rounding mode, extended double-precision rounding precision,
78 and exception flags.
79 -------------------------------------------------------------------------------
80 */
81 
82 /*
83  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
84  * 	Right now, it does not.  I've removed all other dynamic global
85  * 	variables. [ross]
86  */
87 #ifdef FLOATX80
88 int8 floatx80_rounding_precision = 80;
89 #endif
90 
91 /*
92 -------------------------------------------------------------------------------
93 Primitive arithmetic functions, including multi-word arithmetic, and
94 division and square root approximations.  (Can be specialized to target if
95 desired.)
96 -------------------------------------------------------------------------------
97 */
98 #include "softfloat-macros.h"
99 
100 /*
101 -------------------------------------------------------------------------------
102 Functions and definitions to determine:  (1) whether tininess for underflow
103 is detected before or after rounding by default, (2) what (if anything)
104 happens when exceptions are raised, (3) how signaling NaNs are distinguished
105 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
106 are propagated from function inputs to output.  These details are target-
107 specific.
108 -------------------------------------------------------------------------------
109 */
110 #include "softfloat-specialize.h"
111 
112 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
113 /*
114 -------------------------------------------------------------------------------
115 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
116 and 7, and returns the properly rounded 32-bit integer corresponding to the
117 input.  If `zSign' is 1, the input is negated before being converted to an
118 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
119 is simply rounded to an integer, with the inexact exception raised if the
120 input cannot be represented exactly as an integer.  However, if the fixed-
121 point input is too large, the invalid exception is raised and the largest
122 positive or negative integer is returned.
123 -------------------------------------------------------------------------------
124 */
125 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
126 {
127     int8 roundingMode;
128     flag roundNearestEven;
129     int8 roundIncrement, roundBits;
130     int32 z;
131 
132     roundingMode = float_rounding_mode();
133     roundNearestEven = ( roundingMode == float_round_nearest_even );
134     roundIncrement = 0x40;
135     if ( ! roundNearestEven ) {
136         if ( roundingMode == float_round_to_zero ) {
137             roundIncrement = 0;
138         }
139         else {
140             roundIncrement = 0x7F;
141             if ( zSign ) {
142                 if ( roundingMode == float_round_up ) roundIncrement = 0;
143             }
144             else {
145                 if ( roundingMode == float_round_down ) roundIncrement = 0;
146             }
147         }
148     }
149     roundBits = absZ & 0x7F;
150     absZ = ( absZ + roundIncrement )>>7;
151     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
152     z = absZ;
153     if ( zSign ) z = - z;
154     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
155         float_raise( float_flag_invalid );
156         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
157     }
158     if ( roundBits ) float_set_inexact();
159     return z;
160 
161 }
162 
163 /*
164 -------------------------------------------------------------------------------
165 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
166 `absZ1', with binary point between bits 63 and 64 (between the input words),
167 and returns the properly rounded 64-bit integer corresponding to the input.
168 If `zSign' is 1, the input is negated before being converted to an integer.
169 Ordinarily, the fixed-point input is simply rounded to an integer, with
170 the inexact exception raised if the input cannot be represented exactly as
171 an integer.  However, if the fixed-point input is too large, the invalid
172 exception is raised and the largest positive or negative integer is
173 returned.
174 -------------------------------------------------------------------------------
175 */
176 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
177 {
178     int8 roundingMode;
179     flag roundNearestEven, increment;
180     int64 z;
181 
182     roundingMode = float_rounding_mode();
183     roundNearestEven = ( roundingMode == float_round_nearest_even );
184     increment = ( (sbits64) absZ1 < 0 );
185     if ( ! roundNearestEven ) {
186         if ( roundingMode == float_round_to_zero ) {
187             increment = 0;
188         }
189         else {
190             if ( zSign ) {
191                 increment = ( roundingMode == float_round_down ) && absZ1;
192             }
193             else {
194                 increment = ( roundingMode == float_round_up ) && absZ1;
195             }
196         }
197     }
198     if ( increment ) {
199         ++absZ0;
200         if ( absZ0 == 0 ) goto overflow;
201         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
202     }
203     z = absZ0;
204     if ( zSign ) z = - z;
205     if ( z && ( ( z < 0 ) ^ zSign ) ) {
206  overflow:
207         float_raise( float_flag_invalid );
208         return
209               zSign ? (sbits64) LIT64( 0x8000000000000000 )
210             : LIT64( 0x7FFFFFFFFFFFFFFF );
211     }
212     if ( absZ1 ) float_set_inexact();
213     return z;
214 
215 }
216 #endif
217 
218 /*
219 -------------------------------------------------------------------------------
220 Returns the fraction bits of the single-precision floating-point value `a'.
221 -------------------------------------------------------------------------------
222 */
223 INLINE bits32 extractFloat32Frac( float32 a )
224 {
225 
226     return a & 0x007FFFFF;
227 
228 }
229 
230 /*
231 -------------------------------------------------------------------------------
232 Returns the exponent bits of the single-precision floating-point value `a'.
233 -------------------------------------------------------------------------------
234 */
235 INLINE int16 extractFloat32Exp( float32 a )
236 {
237 
238     return ( a>>23 ) & 0xFF;
239 
240 }
241 
242 /*
243 -------------------------------------------------------------------------------
244 Returns the sign bit of the single-precision floating-point value `a'.
245 -------------------------------------------------------------------------------
246 */
247 INLINE flag extractFloat32Sign( float32 a )
248 {
249 
250     return a>>31;
251 
252 }
253 
254 /*
255 -------------------------------------------------------------------------------
256 Normalizes the subnormal single-precision floating-point value represented
257 by the denormalized significand `aSig'.  The normalized exponent and
258 significand are stored at the locations pointed to by `zExpPtr' and
259 `zSigPtr', respectively.
260 -------------------------------------------------------------------------------
261 */
262 static void
263  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
264 {
265     int8 shiftCount;
266 
267     shiftCount = countLeadingZeros32( aSig ) - 8;
268     *zSigPtr = aSig<<shiftCount;
269     *zExpPtr = 1 - shiftCount;
270 
271 }
272 
273 /*
274 -------------------------------------------------------------------------------
275 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
276 single-precision floating-point value, returning the result.  After being
277 shifted into the proper positions, the three fields are simply added
278 together to form the result.  This means that any integer portion of `zSig'
279 will be added into the exponent.  Since a properly normalized significand
280 will have an integer portion equal to 1, the `zExp' input should be 1 less
281 than the desired result exponent whenever `zSig' is a complete, normalized
282 significand.
283 -------------------------------------------------------------------------------
284 */
285 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
286 {
287 
288     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
289 
290 }
291 
292 /*
293 -------------------------------------------------------------------------------
294 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
295 and significand `zSig', and returns the proper single-precision floating-
296 point value corresponding to the abstract input.  Ordinarily, the abstract
297 value is simply rounded and packed into the single-precision format, with
298 the inexact exception raised if the abstract input cannot be represented
299 exactly.  However, if the abstract value is too large, the overflow and
300 inexact exceptions are raised and an infinity or maximal finite value is
301 returned.  If the abstract value is too small, the input value is rounded to
302 a subnormal number, and the underflow and inexact exceptions are raised if
303 the abstract input cannot be represented exactly as a subnormal single-
304 precision floating-point number.
305     The input significand `zSig' has its binary point between bits 30
306 and 29, which is 7 bits to the left of the usual location.  This shifted
307 significand must be normalized or smaller.  If `zSig' is not normalized,
308 `zExp' must be 0; in that case, the result returned is a subnormal number,
309 and it must not require rounding.  In the usual case that `zSig' is
310 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
311 The handling of underflow and overflow follows the IEC/IEEE Standard for
312 Binary Floating-Point Arithmetic.
313 -------------------------------------------------------------------------------
314 */
315 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
316 {
317     int8 roundingMode;
318     flag roundNearestEven;
319     int8 roundIncrement, roundBits;
320     flag isTiny;
321 
322     roundingMode = float_rounding_mode();
323     roundNearestEven = ( roundingMode == float_round_nearest_even );
324     roundIncrement = 0x40;
325     if ( ! roundNearestEven ) {
326         if ( roundingMode == float_round_to_zero ) {
327             roundIncrement = 0;
328         }
329         else {
330             roundIncrement = 0x7F;
331             if ( zSign ) {
332                 if ( roundingMode == float_round_up ) roundIncrement = 0;
333             }
334             else {
335                 if ( roundingMode == float_round_down ) roundIncrement = 0;
336             }
337         }
338     }
339     roundBits = zSig & 0x7F;
340     if ( 0xFD <= (bits16) zExp ) {
341         if (    ( 0xFD < zExp )
342              || (    ( zExp == 0xFD )
343                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
344            ) {
345             float_raise( float_flag_overflow | float_flag_inexact );
346             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
347         }
348         if ( zExp < 0 ) {
349             isTiny =
350                    ( float_detect_tininess == float_tininess_before_rounding )
351                 || ( zExp < -1 )
352                 || ( zSig + roundIncrement < 0x80000000 );
353             shift32RightJamming( zSig, - zExp, &zSig );
354             zExp = 0;
355             roundBits = zSig & 0x7F;
356             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
357         }
358     }
359     if ( roundBits ) float_set_inexact();
360     zSig = ( zSig + roundIncrement )>>7;
361     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
362     if ( zSig == 0 ) zExp = 0;
363     return packFloat32( zSign, zExp, zSig );
364 
365 }
366 
367 /*
368 -------------------------------------------------------------------------------
369 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
370 and significand `zSig', and returns the proper single-precision floating-
371 point value corresponding to the abstract input.  This routine is just like
372 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
373 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
374 floating-point exponent.
375 -------------------------------------------------------------------------------
376 */
377 static float32
378  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
379 {
380     int8 shiftCount;
381 
382     shiftCount = countLeadingZeros32( zSig ) - 1;
383     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
384 
385 }
386 
387 /*
388 -------------------------------------------------------------------------------
389 Returns the fraction bits of the double-precision floating-point value `a'.
390 -------------------------------------------------------------------------------
391 */
392 INLINE bits64 extractFloat64Frac( float64 a )
393 {
394 
395     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
396 
397 }
398 
399 /*
400 -------------------------------------------------------------------------------
401 Returns the exponent bits of the double-precision floating-point value `a'.
402 -------------------------------------------------------------------------------
403 */
404 INLINE int16 extractFloat64Exp( float64 a )
405 {
406 
407     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
408 
409 }
410 
411 /*
412 -------------------------------------------------------------------------------
413 Returns the sign bit of the double-precision floating-point value `a'.
414 -------------------------------------------------------------------------------
415 */
416 INLINE flag extractFloat64Sign( float64 a )
417 {
418 
419     return FLOAT64_DEMANGLE(a)>>63;
420 
421 }
422 
423 /*
424 -------------------------------------------------------------------------------
425 Normalizes the subnormal double-precision floating-point value represented
426 by the denormalized significand `aSig'.  The normalized exponent and
427 significand are stored at the locations pointed to by `zExpPtr' and
428 `zSigPtr', respectively.
429 -------------------------------------------------------------------------------
430 */
431 static void
432  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
433 {
434     int8 shiftCount;
435 
436     shiftCount = countLeadingZeros64( aSig ) - 11;
437     *zSigPtr = aSig<<shiftCount;
438     *zExpPtr = 1 - shiftCount;
439 
440 }
441 
442 /*
443 -------------------------------------------------------------------------------
444 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
445 double-precision floating-point value, returning the result.  After being
446 shifted into the proper positions, the three fields are simply added
447 together to form the result.  This means that any integer portion of `zSig'
448 will be added into the exponent.  Since a properly normalized significand
449 will have an integer portion equal to 1, the `zExp' input should be 1 less
450 than the desired result exponent whenever `zSig' is a complete, normalized
451 significand.
452 -------------------------------------------------------------------------------
453 */
454 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
455 {
456 
457     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
458 			   ( ( (bits64) zExp )<<52 ) + zSig );
459 
460 }
461 
462 /*
463 -------------------------------------------------------------------------------
464 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
465 and significand `zSig', and returns the proper double-precision floating-
466 point value corresponding to the abstract input.  Ordinarily, the abstract
467 value is simply rounded and packed into the double-precision format, with
468 the inexact exception raised if the abstract input cannot be represented
469 exactly.  However, if the abstract value is too large, the overflow and
470 inexact exceptions are raised and an infinity or maximal finite value is
471 returned.  If the abstract value is too small, the input value is rounded to
472 a subnormal number, and the underflow and inexact exceptions are raised if
473 the abstract input cannot be represented exactly as a subnormal double-
474 precision floating-point number.
475     The input significand `zSig' has its binary point between bits 62
476 and 61, which is 10 bits to the left of the usual location.  This shifted
477 significand must be normalized or smaller.  If `zSig' is not normalized,
478 `zExp' must be 0; in that case, the result returned is a subnormal number,
479 and it must not require rounding.  In the usual case that `zSig' is
480 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
481 The handling of underflow and overflow follows the IEC/IEEE Standard for
482 Binary Floating-Point Arithmetic.
483 -------------------------------------------------------------------------------
484 */
485 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
486 {
487     int8 roundingMode;
488     flag roundNearestEven;
489     int16 roundIncrement, roundBits;
490     flag isTiny;
491 
492     roundingMode = float_rounding_mode();
493     roundNearestEven = ( roundingMode == float_round_nearest_even );
494     roundIncrement = 0x200;
495     if ( ! roundNearestEven ) {
496         if ( roundingMode == float_round_to_zero ) {
497             roundIncrement = 0;
498         }
499         else {
500             roundIncrement = 0x3FF;
501             if ( zSign ) {
502                 if ( roundingMode == float_round_up ) roundIncrement = 0;
503             }
504             else {
505                 if ( roundingMode == float_round_down ) roundIncrement = 0;
506             }
507         }
508     }
509     roundBits = zSig & 0x3FF;
510     if ( 0x7FD <= (bits16) zExp ) {
511         if (    ( 0x7FD < zExp )
512              || (    ( zExp == 0x7FD )
513                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
514            ) {
515             float_raise( float_flag_overflow | float_flag_inexact );
516             return FLOAT64_MANGLE(
517 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
518 		( roundIncrement == 0 ));
519         }
520         if ( zExp < 0 ) {
521             isTiny =
522                    ( float_detect_tininess == float_tininess_before_rounding )
523                 || ( zExp < -1 )
524                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
525             shift64RightJamming( zSig, - zExp, &zSig );
526             zExp = 0;
527             roundBits = zSig & 0x3FF;
528             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
529         }
530     }
531     if ( roundBits ) float_set_inexact();
532     zSig = ( zSig + roundIncrement )>>10;
533     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
534     if ( zSig == 0 ) zExp = 0;
535     return packFloat64( zSign, zExp, zSig );
536 
537 }
538 
539 /*
540 -------------------------------------------------------------------------------
541 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
542 and significand `zSig', and returns the proper double-precision floating-
543 point value corresponding to the abstract input.  This routine is just like
544 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
545 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
546 floating-point exponent.
547 -------------------------------------------------------------------------------
548 */
549 static float64
550  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
551 {
552     int8 shiftCount;
553 
554     shiftCount = countLeadingZeros64( zSig ) - 1;
555     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
556 
557 }
558 
559 #ifdef FLOATX80
560 
561 /*
562 -------------------------------------------------------------------------------
563 Returns the fraction bits of the extended double-precision floating-point
564 value `a'.
565 -------------------------------------------------------------------------------
566 */
567 INLINE bits64 extractFloatx80Frac( floatx80 a )
568 {
569 
570     return a.low;
571 
572 }
573 
574 /*
575 -------------------------------------------------------------------------------
576 Returns the exponent bits of the extended double-precision floating-point
577 value `a'.
578 -------------------------------------------------------------------------------
579 */
580 INLINE int32 extractFloatx80Exp( floatx80 a )
581 {
582 
583     return a.high & 0x7FFF;
584 
585 }
586 
587 /*
588 -------------------------------------------------------------------------------
589 Returns the sign bit of the extended double-precision floating-point value
590 `a'.
591 -------------------------------------------------------------------------------
592 */
593 INLINE flag extractFloatx80Sign( floatx80 a )
594 {
595 
596     return a.high>>15;
597 
598 }
599 
600 /*
601 -------------------------------------------------------------------------------
602 Normalizes the subnormal extended double-precision floating-point value
603 represented by the denormalized significand `aSig'.  The normalized exponent
604 and significand are stored at the locations pointed to by `zExpPtr' and
605 `zSigPtr', respectively.
606 -------------------------------------------------------------------------------
607 */
608 static void
609  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
610 {
611     int8 shiftCount;
612 
613     shiftCount = countLeadingZeros64( aSig );
614     *zSigPtr = aSig<<shiftCount;
615     *zExpPtr = 1 - shiftCount;
616 
617 }
618 
619 /*
620 -------------------------------------------------------------------------------
621 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
622 extended double-precision floating-point value, returning the result.
623 -------------------------------------------------------------------------------
624 */
625 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
626 {
627     floatx80 z;
628 
629     z.low = zSig;
630     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
631     return z;
632 
633 }
634 
635 /*
636 -------------------------------------------------------------------------------
637 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
638 and extended significand formed by the concatenation of `zSig0' and `zSig1',
639 and returns the proper extended double-precision floating-point value
640 corresponding to the abstract input.  Ordinarily, the abstract value is
641 rounded and packed into the extended double-precision format, with the
642 inexact exception raised if the abstract input cannot be represented
643 exactly.  However, if the abstract value is too large, the overflow and
644 inexact exceptions are raised and an infinity or maximal finite value is
645 returned.  If the abstract value is too small, the input value is rounded to
646 a subnormal number, and the underflow and inexact exceptions are raised if
647 the abstract input cannot be represented exactly as a subnormal extended
648 double-precision floating-point number.
649     If `roundingPrecision' is 32 or 64, the result is rounded to the same
650 number of bits as single or double precision, respectively.  Otherwise, the
651 result is rounded to the full precision of the extended double-precision
652 format.
653     The input significand must be normalized or smaller.  If the input
654 significand is not normalized, `zExp' must be 0; in that case, the result
655 returned is a subnormal number, and it must not require rounding.  The
656 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
657 Floating-Point Arithmetic.
658 -------------------------------------------------------------------------------
659 */
660 static floatx80
661  roundAndPackFloatx80(
662      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
663  )
664 {
665     int8 roundingMode;
666     flag roundNearestEven, increment, isTiny;
667     int64 roundIncrement, roundMask, roundBits;
668 
669     roundingMode = float_rounding_mode();
670     roundNearestEven = ( roundingMode == float_round_nearest_even );
671     if ( roundingPrecision == 80 ) goto precision80;
672     if ( roundingPrecision == 64 ) {
673         roundIncrement = LIT64( 0x0000000000000400 );
674         roundMask = LIT64( 0x00000000000007FF );
675     }
676     else if ( roundingPrecision == 32 ) {
677         roundIncrement = LIT64( 0x0000008000000000 );
678         roundMask = LIT64( 0x000000FFFFFFFFFF );
679     }
680     else {
681         goto precision80;
682     }
683     zSig0 |= ( zSig1 != 0 );
684     if ( ! roundNearestEven ) {
685         if ( roundingMode == float_round_to_zero ) {
686             roundIncrement = 0;
687         }
688         else {
689             roundIncrement = roundMask;
690             if ( zSign ) {
691                 if ( roundingMode == float_round_up ) roundIncrement = 0;
692             }
693             else {
694                 if ( roundingMode == float_round_down ) roundIncrement = 0;
695             }
696         }
697     }
698     roundBits = zSig0 & roundMask;
699     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
700         if (    ( 0x7FFE < zExp )
701              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
702            ) {
703             goto overflow;
704         }
705         if ( zExp <= 0 ) {
706             isTiny =
707                    ( float_detect_tininess == float_tininess_before_rounding )
708                 || ( zExp < 0 )
709                 || ( zSig0 <= zSig0 + roundIncrement );
710             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
711             zExp = 0;
712             roundBits = zSig0 & roundMask;
713             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
714             if ( roundBits ) float_set_inexact();
715             zSig0 += roundIncrement;
716             if ( (sbits64) zSig0 < 0 ) zExp = 1;
717             roundIncrement = roundMask + 1;
718             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
719                 roundMask |= roundIncrement;
720             }
721             zSig0 &= ~ roundMask;
722             return packFloatx80( zSign, zExp, zSig0 );
723         }
724     }
725     if ( roundBits ) float_set_inexact();
726     zSig0 += roundIncrement;
727     if ( zSig0 < roundIncrement ) {
728         ++zExp;
729         zSig0 = LIT64( 0x8000000000000000 );
730     }
731     roundIncrement = roundMask + 1;
732     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
733         roundMask |= roundIncrement;
734     }
735     zSig0 &= ~ roundMask;
736     if ( zSig0 == 0 ) zExp = 0;
737     return packFloatx80( zSign, zExp, zSig0 );
738  precision80:
739     increment = ( (sbits64) zSig1 < 0 );
740     if ( ! roundNearestEven ) {
741         if ( roundingMode == float_round_to_zero ) {
742             increment = 0;
743         }
744         else {
745             if ( zSign ) {
746                 increment = ( roundingMode == float_round_down ) && zSig1;
747             }
748             else {
749                 increment = ( roundingMode == float_round_up ) && zSig1;
750             }
751         }
752     }
753     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
754         if (    ( 0x7FFE < zExp )
755              || (    ( zExp == 0x7FFE )
756                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
757                   && increment
758                 )
759            ) {
760             roundMask = 0;
761  overflow:
762             float_raise( float_flag_overflow | float_flag_inexact );
763             if (    ( roundingMode == float_round_to_zero )
764                  || ( zSign && ( roundingMode == float_round_up ) )
765                  || ( ! zSign && ( roundingMode == float_round_down ) )
766                ) {
767                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
768             }
769             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
770         }
771         if ( zExp <= 0 ) {
772             isTiny =
773                    ( float_detect_tininess == float_tininess_before_rounding )
774                 || ( zExp < 0 )
775                 || ! increment
776                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
777             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
778             zExp = 0;
779             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
780             if ( zSig1 ) float_set_inexact();
781             if ( roundNearestEven ) {
782                 increment = ( (sbits64) zSig1 < 0 );
783             }
784             else {
785                 if ( zSign ) {
786                     increment = ( roundingMode == float_round_down ) && zSig1;
787                 }
788                 else {
789                     increment = ( roundingMode == float_round_up ) && zSig1;
790                 }
791             }
792             if ( increment ) {
793                 ++zSig0;
794                 zSig0 &=
795                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
796                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
797             }
798             return packFloatx80( zSign, zExp, zSig0 );
799         }
800     }
801     if ( zSig1 ) float_set_inexact();
802     if ( increment ) {
803         ++zSig0;
804         if ( zSig0 == 0 ) {
805             ++zExp;
806             zSig0 = LIT64( 0x8000000000000000 );
807         }
808         else {
809             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
810         }
811     }
812     else {
813         if ( zSig0 == 0 ) zExp = 0;
814     }
815     return packFloatx80( zSign, zExp, zSig0 );
816 
817 }
818 
819 /*
820 -------------------------------------------------------------------------------
821 Takes an abstract floating-point value having sign `zSign', exponent
822 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
823 and returns the proper extended double-precision floating-point value
824 corresponding to the abstract input.  This routine is just like
825 `roundAndPackFloatx80' except that the input significand does not have to be
826 normalized.
827 -------------------------------------------------------------------------------
828 */
829 static floatx80
830  normalizeRoundAndPackFloatx80(
831      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
832  )
833 {
834     int8 shiftCount;
835 
836     if ( zSig0 == 0 ) {
837         zSig0 = zSig1;
838         zSig1 = 0;
839         zExp -= 64;
840     }
841     shiftCount = countLeadingZeros64( zSig0 );
842     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
843     zExp -= shiftCount;
844     return
845         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
846 
847 }
848 
849 #endif
850 
851 #ifdef FLOAT128
852 
853 /*
854 -------------------------------------------------------------------------------
855 Returns the least-significant 64 fraction bits of the quadruple-precision
856 floating-point value `a'.
857 -------------------------------------------------------------------------------
858 */
859 INLINE bits64 extractFloat128Frac1( float128 a )
860 {
861 
862     return a.low;
863 
864 }
865 
866 /*
867 -------------------------------------------------------------------------------
868 Returns the most-significant 48 fraction bits of the quadruple-precision
869 floating-point value `a'.
870 -------------------------------------------------------------------------------
871 */
872 INLINE bits64 extractFloat128Frac0( float128 a )
873 {
874 
875     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
876 
877 }
878 
879 /*
880 -------------------------------------------------------------------------------
881 Returns the exponent bits of the quadruple-precision floating-point value
882 `a'.
883 -------------------------------------------------------------------------------
884 */
885 INLINE int32 extractFloat128Exp( float128 a )
886 {
887 
888     return ( a.high>>48 ) & 0x7FFF;
889 
890 }
891 
892 /*
893 -------------------------------------------------------------------------------
894 Returns the sign bit of the quadruple-precision floating-point value `a'.
895 -------------------------------------------------------------------------------
896 */
897 INLINE flag extractFloat128Sign( float128 a )
898 {
899 
900     return a.high>>63;
901 
902 }
903 
904 /*
905 -------------------------------------------------------------------------------
906 Normalizes the subnormal quadruple-precision floating-point value
907 represented by the denormalized significand formed by the concatenation of
908 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
909 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
910 significand are stored at the location pointed to by `zSig0Ptr', and the
911 least significant 64 bits of the normalized significand are stored at the
912 location pointed to by `zSig1Ptr'.
913 -------------------------------------------------------------------------------
914 */
915 static void
916  normalizeFloat128Subnormal(
917      bits64 aSig0,
918      bits64 aSig1,
919      int32 *zExpPtr,
920      bits64 *zSig0Ptr,
921      bits64 *zSig1Ptr
922  )
923 {
924     int8 shiftCount;
925 
926     if ( aSig0 == 0 ) {
927         shiftCount = countLeadingZeros64( aSig1 ) - 15;
928         if ( shiftCount < 0 ) {
929             *zSig0Ptr = aSig1>>( - shiftCount );
930             *zSig1Ptr = aSig1<<( shiftCount & 63 );
931         }
932         else {
933             *zSig0Ptr = aSig1<<shiftCount;
934             *zSig1Ptr = 0;
935         }
936         *zExpPtr = - shiftCount - 63;
937     }
938     else {
939         shiftCount = countLeadingZeros64( aSig0 ) - 15;
940         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
941         *zExpPtr = 1 - shiftCount;
942     }
943 
944 }
945 
946 /*
947 -------------------------------------------------------------------------------
948 Packs the sign `zSign', the exponent `zExp', and the significand formed
949 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
950 floating-point value, returning the result.  After being shifted into the
951 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
952 added together to form the most significant 32 bits of the result.  This
953 means that any integer portion of `zSig0' will be added into the exponent.
954 Since a properly normalized significand will have an integer portion equal
955 to 1, the `zExp' input should be 1 less than the desired result exponent
956 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
957 significand.
958 -------------------------------------------------------------------------------
959 */
960 INLINE float128
961  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
962 {
963     float128 z;
964 
965     z.low = zSig1;
966     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
967     return z;
968 
969 }
970 
971 /*
972 -------------------------------------------------------------------------------
973 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
974 and extended significand formed by the concatenation of `zSig0', `zSig1',
975 and `zSig2', and returns the proper quadruple-precision floating-point value
976 corresponding to the abstract input.  Ordinarily, the abstract value is
977 simply rounded and packed into the quadruple-precision format, with the
978 inexact exception raised if the abstract input cannot be represented
979 exactly.  However, if the abstract value is too large, the overflow and
980 inexact exceptions are raised and an infinity or maximal finite value is
981 returned.  If the abstract value is too small, the input value is rounded to
982 a subnormal number, and the underflow and inexact exceptions are raised if
983 the abstract input cannot be represented exactly as a subnormal quadruple-
984 precision floating-point number.
985     The input significand must be normalized or smaller.  If the input
986 significand is not normalized, `zExp' must be 0; in that case, the result
987 returned is a subnormal number, and it must not require rounding.  In the
988 usual case that the input significand is normalized, `zExp' must be 1 less
989 than the ``true'' floating-point exponent.  The handling of underflow and
990 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
991 -------------------------------------------------------------------------------
992 */
993 static float128
994  roundAndPackFloat128(
995      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
996 {
997     int8 roundingMode;
998     flag roundNearestEven, increment, isTiny;
999 
1000     roundingMode = float_rounding_mode();
1001     roundNearestEven = ( roundingMode == float_round_nearest_even );
1002     increment = ( (sbits64) zSig2 < 0 );
1003     if ( ! roundNearestEven ) {
1004         if ( roundingMode == float_round_to_zero ) {
1005             increment = 0;
1006         }
1007         else {
1008             if ( zSign ) {
1009                 increment = ( roundingMode == float_round_down ) && zSig2;
1010             }
1011             else {
1012                 increment = ( roundingMode == float_round_up ) && zSig2;
1013             }
1014         }
1015     }
1016     if ( 0x7FFD <= (bits32) zExp ) {
1017         if (    ( 0x7FFD < zExp )
1018              || (    ( zExp == 0x7FFD )
1019                   && eq128(
1020                          LIT64( 0x0001FFFFFFFFFFFF ),
1021                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1022                          zSig0,
1023                          zSig1
1024                      )
1025                   && increment
1026                 )
1027            ) {
1028             float_raise( float_flag_overflow | float_flag_inexact );
1029             if (    ( roundingMode == float_round_to_zero )
1030                  || ( zSign && ( roundingMode == float_round_up ) )
1031                  || ( ! zSign && ( roundingMode == float_round_down ) )
1032                ) {
1033                 return
1034                     packFloat128(
1035                         zSign,
1036                         0x7FFE,
1037                         LIT64( 0x0000FFFFFFFFFFFF ),
1038                         LIT64( 0xFFFFFFFFFFFFFFFF )
1039                     );
1040             }
1041             return packFloat128( zSign, 0x7FFF, 0, 0 );
1042         }
1043         if ( zExp < 0 ) {
1044             isTiny =
1045                    ( float_detect_tininess == float_tininess_before_rounding )
1046                 || ( zExp < -1 )
1047                 || ! increment
1048                 || lt128(
1049                        zSig0,
1050                        zSig1,
1051                        LIT64( 0x0001FFFFFFFFFFFF ),
1052                        LIT64( 0xFFFFFFFFFFFFFFFF )
1053                    );
1054             shift128ExtraRightJamming(
1055                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1056             zExp = 0;
1057             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1058             if ( roundNearestEven ) {
1059                 increment = ( (sbits64) zSig2 < 0 );
1060             }
1061             else {
1062                 if ( zSign ) {
1063                     increment = ( roundingMode == float_round_down ) && zSig2;
1064                 }
1065                 else {
1066                     increment = ( roundingMode == float_round_up ) && zSig2;
1067                 }
1068             }
1069         }
1070     }
1071     if ( zSig2 ) float_set_inexact();
1072     if ( increment ) {
1073         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1074         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1075     }
1076     else {
1077         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1078     }
1079     return packFloat128( zSign, zExp, zSig0, zSig1 );
1080 
1081 }
1082 
1083 /*
1084 -------------------------------------------------------------------------------
1085 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1086 and significand formed by the concatenation of `zSig0' and `zSig1', and
1087 returns the proper quadruple-precision floating-point value corresponding
1088 to the abstract input.  This routine is just like `roundAndPackFloat128'
1089 except that the input significand has fewer bits and does not have to be
1090 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1091 point exponent.
1092 -------------------------------------------------------------------------------
1093 */
1094 static float128
1095  normalizeRoundAndPackFloat128(
1096      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1097 {
1098     int8 shiftCount;
1099     bits64 zSig2;
1100 
1101     if ( zSig0 == 0 ) {
1102         zSig0 = zSig1;
1103         zSig1 = 0;
1104         zExp -= 64;
1105     }
1106     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1107     if ( 0 <= shiftCount ) {
1108         zSig2 = 0;
1109         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1110     }
1111     else {
1112         shift128ExtraRightJamming(
1113             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1114     }
1115     zExp -= shiftCount;
1116     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1117 
1118 }
1119 
1120 #endif
1121 
1122 /*
1123 -------------------------------------------------------------------------------
1124 Returns the result of converting the 32-bit two's complement integer `a'
1125 to the single-precision floating-point format.  The conversion is performed
1126 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 -------------------------------------------------------------------------------
1128 */
1129 float32 int32_to_float32( int32 a )
1130 {
1131     flag zSign;
1132 
1133     if ( a == 0 ) return 0;
1134     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1135     zSign = ( a < 0 );
1136     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1137 
1138 }
1139 
1140 /*
1141 -------------------------------------------------------------------------------
1142 Returns the result of converting the 32-bit two's complement integer `a'
1143 to the double-precision floating-point format.  The conversion is performed
1144 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1145 -------------------------------------------------------------------------------
1146 */
1147 float64 int32_to_float64( int32 a )
1148 {
1149     flag zSign;
1150     uint32 absA;
1151     int8 shiftCount;
1152     bits64 zSig;
1153 
1154     if ( a == 0 ) return 0;
1155     zSign = ( a < 0 );
1156     absA = zSign ? - a : a;
1157     shiftCount = countLeadingZeros32( absA ) + 21;
1158     zSig = absA;
1159     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1160 
1161 }
1162 
1163 #ifdef FLOATX80
1164 
1165 /*
1166 -------------------------------------------------------------------------------
1167 Returns the result of converting the 32-bit two's complement integer `a'
1168 to the extended double-precision floating-point format.  The conversion
1169 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1170 Arithmetic.
1171 -------------------------------------------------------------------------------
1172 */
1173 floatx80 int32_to_floatx80( int32 a )
1174 {
1175     flag zSign;
1176     uint32 absA;
1177     int8 shiftCount;
1178     bits64 zSig;
1179 
1180     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1181     zSign = ( a < 0 );
1182     absA = zSign ? - a : a;
1183     shiftCount = countLeadingZeros32( absA ) + 32;
1184     zSig = absA;
1185     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1186 
1187 }
1188 
1189 #endif
1190 
1191 #ifdef FLOAT128
1192 
1193 /*
1194 -------------------------------------------------------------------------------
1195 Returns the result of converting the 32-bit two's complement integer `a' to
1196 the quadruple-precision floating-point format.  The conversion is performed
1197 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1198 -------------------------------------------------------------------------------
1199 */
1200 float128 int32_to_float128( int32 a )
1201 {
1202     flag zSign;
1203     uint32 absA;
1204     int8 shiftCount;
1205     bits64 zSig0;
1206 
1207     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1208     zSign = ( a < 0 );
1209     absA = zSign ? - a : a;
1210     shiftCount = countLeadingZeros32( absA ) + 17;
1211     zSig0 = absA;
1212     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1213 
1214 }
1215 
1216 #endif
1217 
1218 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1219 /*
1220 -------------------------------------------------------------------------------
1221 Returns the result of converting the 64-bit two's complement integer `a'
1222 to the single-precision floating-point format.  The conversion is performed
1223 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1224 -------------------------------------------------------------------------------
1225 */
1226 float32 int64_to_float32( int64 a )
1227 {
1228     flag zSign;
1229     uint64 absA;
1230     int8 shiftCount;
1231 
1232     if ( a == 0 ) return 0;
1233     zSign = ( a < 0 );
1234     absA = zSign ? - a : a;
1235     shiftCount = countLeadingZeros64( absA ) - 40;
1236     if ( 0 <= shiftCount ) {
1237         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1238     }
1239     else {
1240         shiftCount += 7;
1241         if ( shiftCount < 0 ) {
1242             shift64RightJamming( absA, - shiftCount, &absA );
1243         }
1244         else {
1245             absA <<= shiftCount;
1246         }
1247         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1248     }
1249 
1250 }
1251 
1252 /*
1253 -------------------------------------------------------------------------------
1254 Returns the result of converting the 64-bit two's complement integer `a'
1255 to the double-precision floating-point format.  The conversion is performed
1256 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1257 -------------------------------------------------------------------------------
1258 */
1259 float64 int64_to_float64( int64 a )
1260 {
1261     flag zSign;
1262 
1263     if ( a == 0 ) return 0;
1264     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1265         return packFloat64( 1, 0x43E, 0 );
1266     }
1267     zSign = ( a < 0 );
1268     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1269 
1270 }
1271 
1272 #ifdef FLOATX80
1273 
1274 /*
1275 -------------------------------------------------------------------------------
1276 Returns the result of converting the 64-bit two's complement integer `a'
1277 to the extended double-precision floating-point format.  The conversion
1278 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1279 Arithmetic.
1280 -------------------------------------------------------------------------------
1281 */
1282 floatx80 int64_to_floatx80( int64 a )
1283 {
1284     flag zSign;
1285     uint64 absA;
1286     int8 shiftCount;
1287 
1288     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1289     zSign = ( a < 0 );
1290     absA = zSign ? - a : a;
1291     shiftCount = countLeadingZeros64( absA );
1292     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1293 
1294 }
1295 
1296 #endif
1297 
1298 #ifdef FLOAT128
1299 
1300 /*
1301 -------------------------------------------------------------------------------
1302 Returns the result of converting the 64-bit two's complement integer `a' to
1303 the quadruple-precision floating-point format.  The conversion is performed
1304 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1305 -------------------------------------------------------------------------------
1306 */
1307 float128 int64_to_float128( int64 a )
1308 {
1309     flag zSign;
1310     uint64 absA;
1311     int8 shiftCount;
1312     int32 zExp;
1313     bits64 zSig0, zSig1;
1314 
1315     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1316     zSign = ( a < 0 );
1317     absA = zSign ? - a : a;
1318     shiftCount = countLeadingZeros64( absA ) + 49;
1319     zExp = 0x406E - shiftCount;
1320     if ( 64 <= shiftCount ) {
1321         zSig1 = 0;
1322         zSig0 = absA;
1323         shiftCount -= 64;
1324     }
1325     else {
1326         zSig1 = absA;
1327         zSig0 = 0;
1328     }
1329     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1330     return packFloat128( zSign, zExp, zSig0, zSig1 );
1331 
1332 }
1333 
1334 #endif
1335 #endif /* !SOFTFLOAT_FOR_GCC */
1336 
1337 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1338 /*
1339 -------------------------------------------------------------------------------
1340 Returns the result of converting the single-precision floating-point value
1341 `a' to the 32-bit two's complement integer format.  The conversion is
1342 performed according to the IEC/IEEE Standard for Binary Floating-Point
1343 Arithmetic---which means in particular that the conversion is rounded
1344 according to the current rounding mode.  If `a' is a NaN, the largest
1345 positive integer is returned.  Otherwise, if the conversion overflows, the
1346 largest integer with the same sign as `a' is returned.
1347 -------------------------------------------------------------------------------
1348 */
1349 int32 float32_to_int32( float32 a )
1350 {
1351     flag aSign;
1352     int16 aExp, shiftCount;
1353     bits32 aSig;
1354     bits64 aSig64;
1355 
1356     aSig = extractFloat32Frac( a );
1357     aExp = extractFloat32Exp( a );
1358     aSign = extractFloat32Sign( a );
1359     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1360     if ( aExp ) aSig |= 0x00800000;
1361     shiftCount = 0xAF - aExp;
1362     aSig64 = aSig;
1363     aSig64 <<= 32;
1364     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1365     return roundAndPackInt32( aSign, aSig64 );
1366 
1367 }
1368 #endif /* !SOFTFLOAT_FOR_GCC */
1369 
1370 /*
1371 -------------------------------------------------------------------------------
1372 Returns the result of converting the single-precision floating-point value
1373 `a' to the 32-bit two's complement integer format.  The conversion is
1374 performed according to the IEC/IEEE Standard for Binary Floating-Point
1375 Arithmetic, except that the conversion is always rounded toward zero.
1376 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1377 the conversion overflows, the largest integer with the same sign as `a' is
1378 returned.
1379 -------------------------------------------------------------------------------
1380 */
1381 int32 float32_to_int32_round_to_zero( float32 a )
1382 {
1383     flag aSign;
1384     int16 aExp, shiftCount;
1385     bits32 aSig;
1386     int32 z;
1387 
1388     aSig = extractFloat32Frac( a );
1389     aExp = extractFloat32Exp( a );
1390     aSign = extractFloat32Sign( a );
1391     shiftCount = aExp - 0x9E;
1392     if ( 0 <= shiftCount ) {
1393         if ( a != 0xCF000000 ) {
1394             float_raise( float_flag_invalid );
1395             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1396         }
1397         return (sbits32) 0x80000000;
1398     }
1399     else if ( aExp <= 0x7E ) {
1400         if ( aExp | aSig ) float_set_inexact();
1401         return 0;
1402     }
1403     aSig = ( aSig | 0x00800000 )<<8;
1404     z = aSig>>( - shiftCount );
1405     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1406         float_set_inexact();
1407     }
1408     if ( aSign ) z = - z;
1409     return z;
1410 
1411 }
1412 
1413 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1414 /*
1415 -------------------------------------------------------------------------------
1416 Returns the result of converting the single-precision floating-point value
1417 `a' to the 64-bit two's complement integer format.  The conversion is
1418 performed according to the IEC/IEEE Standard for Binary Floating-Point
1419 Arithmetic---which means in particular that the conversion is rounded
1420 according to the current rounding mode.  If `a' is a NaN, the largest
1421 positive integer is returned.  Otherwise, if the conversion overflows, the
1422 largest integer with the same sign as `a' is returned.
1423 -------------------------------------------------------------------------------
1424 */
1425 int64 float32_to_int64( float32 a )
1426 {
1427     flag aSign;
1428     int16 aExp, shiftCount;
1429     bits32 aSig;
1430     bits64 aSig64, aSigExtra;
1431 
1432     aSig = extractFloat32Frac( a );
1433     aExp = extractFloat32Exp( a );
1434     aSign = extractFloat32Sign( a );
1435     shiftCount = 0xBE - aExp;
1436     if ( shiftCount < 0 ) {
1437         float_raise( float_flag_invalid );
1438         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1439             return LIT64( 0x7FFFFFFFFFFFFFFF );
1440         }
1441         return (sbits64) LIT64( 0x8000000000000000 );
1442     }
1443     if ( aExp ) aSig |= 0x00800000;
1444     aSig64 = aSig;
1445     aSig64 <<= 40;
1446     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1447     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1448 
1449 }
1450 
1451 /*
1452 -------------------------------------------------------------------------------
1453 Returns the result of converting the single-precision floating-point value
1454 `a' to the 64-bit two's complement integer format.  The conversion is
1455 performed according to the IEC/IEEE Standard for Binary Floating-Point
1456 Arithmetic, except that the conversion is always rounded toward zero.  If
1457 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1458 conversion overflows, the largest integer with the same sign as `a' is
1459 returned.
1460 -------------------------------------------------------------------------------
1461 */
1462 int64 float32_to_int64_round_to_zero( float32 a )
1463 {
1464     flag aSign;
1465     int16 aExp, shiftCount;
1466     bits32 aSig;
1467     bits64 aSig64;
1468     int64 z;
1469 
1470     aSig = extractFloat32Frac( a );
1471     aExp = extractFloat32Exp( a );
1472     aSign = extractFloat32Sign( a );
1473     shiftCount = aExp - 0xBE;
1474     if ( 0 <= shiftCount ) {
1475         if ( a != 0xDF000000 ) {
1476             float_raise( float_flag_invalid );
1477             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1478                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1479             }
1480         }
1481         return (sbits64) LIT64( 0x8000000000000000 );
1482     }
1483     else if ( aExp <= 0x7E ) {
1484         if ( aExp | aSig ) float_set_inexact();
1485         return 0;
1486     }
1487     aSig64 = aSig | 0x00800000;
1488     aSig64 <<= 40;
1489     z = aSig64>>( - shiftCount );
1490     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1491         float_set_inexact();
1492     }
1493     if ( aSign ) z = - z;
1494     return z;
1495 
1496 }
1497 #endif /* !SOFTFLOAT_FOR_GCC */
1498 
1499 /*
1500 -------------------------------------------------------------------------------
1501 Returns the result of converting the single-precision floating-point value
1502 `a' to the double-precision floating-point format.  The conversion is
1503 performed according to the IEC/IEEE Standard for Binary Floating-Point
1504 Arithmetic.
1505 -------------------------------------------------------------------------------
1506 */
1507 float64 float32_to_float64( float32 a )
1508 {
1509     flag aSign;
1510     int16 aExp;
1511     bits32 aSig;
1512 
1513     aSig = extractFloat32Frac( a );
1514     aExp = extractFloat32Exp( a );
1515     aSign = extractFloat32Sign( a );
1516     if ( aExp == 0xFF ) {
1517         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1518         return packFloat64( aSign, 0x7FF, 0 );
1519     }
1520     if ( aExp == 0 ) {
1521         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1522         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1523         --aExp;
1524     }
1525     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1526 
1527 }
1528 
1529 #ifdef FLOATX80
1530 
1531 /*
1532 -------------------------------------------------------------------------------
1533 Returns the result of converting the single-precision floating-point value
1534 `a' to the extended double-precision floating-point format.  The conversion
1535 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1536 Arithmetic.
1537 -------------------------------------------------------------------------------
1538 */
1539 floatx80 float32_to_floatx80( float32 a )
1540 {
1541     flag aSign;
1542     int16 aExp;
1543     bits32 aSig;
1544 
1545     aSig = extractFloat32Frac( a );
1546     aExp = extractFloat32Exp( a );
1547     aSign = extractFloat32Sign( a );
1548     if ( aExp == 0xFF ) {
1549         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1550         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1551     }
1552     if ( aExp == 0 ) {
1553         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1554         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1555     }
1556     aSig |= 0x00800000;
1557     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1558 
1559 }
1560 
1561 #endif
1562 
1563 #ifdef FLOAT128
1564 
1565 /*
1566 -------------------------------------------------------------------------------
1567 Returns the result of converting the single-precision floating-point value
1568 `a' to the double-precision floating-point format.  The conversion is
1569 performed according to the IEC/IEEE Standard for Binary Floating-Point
1570 Arithmetic.
1571 -------------------------------------------------------------------------------
1572 */
1573 float128 float32_to_float128( float32 a )
1574 {
1575     flag aSign;
1576     int16 aExp;
1577     bits32 aSig;
1578 
1579     aSig = extractFloat32Frac( a );
1580     aExp = extractFloat32Exp( a );
1581     aSign = extractFloat32Sign( a );
1582     if ( aExp == 0xFF ) {
1583         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1584         return packFloat128( aSign, 0x7FFF, 0, 0 );
1585     }
1586     if ( aExp == 0 ) {
1587         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1588         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1589         --aExp;
1590     }
1591     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1592 
1593 }
1594 
1595 #endif
1596 
1597 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1598 /*
1599 -------------------------------------------------------------------------------
1600 Rounds the single-precision floating-point value `a' to an integer, and
1601 returns the result as a single-precision floating-point value.  The
1602 operation is performed according to the IEC/IEEE Standard for Binary
1603 Floating-Point Arithmetic.
1604 -------------------------------------------------------------------------------
1605 */
1606 float32 float32_round_to_int( float32 a )
1607 {
1608     flag aSign;
1609     int16 aExp;
1610     bits32 lastBitMask, roundBitsMask;
1611     int8 roundingMode;
1612     float32 z;
1613 
1614     aExp = extractFloat32Exp( a );
1615     if ( 0x96 <= aExp ) {
1616         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1617             return propagateFloat32NaN( a, a );
1618         }
1619         return a;
1620     }
1621     if ( aExp <= 0x7E ) {
1622         if ( (bits32) ( a<<1 ) == 0 ) return a;
1623         float_set_inexact();
1624         aSign = extractFloat32Sign( a );
1625         switch ( float_rounding_mode() ) {
1626          case float_round_nearest_even:
1627             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1628                 return packFloat32( aSign, 0x7F, 0 );
1629             }
1630             break;
1631          case float_round_down:
1632             return aSign ? 0xBF800000 : 0;
1633          case float_round_up:
1634             return aSign ? 0x80000000 : 0x3F800000;
1635         }
1636         return packFloat32( aSign, 0, 0 );
1637     }
1638     lastBitMask = 1;
1639     lastBitMask <<= 0x96 - aExp;
1640     roundBitsMask = lastBitMask - 1;
1641     z = a;
1642     roundingMode = float_rounding_mode();
1643     if ( roundingMode == float_round_nearest_even ) {
1644         z += lastBitMask>>1;
1645         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1646     }
1647     else if ( roundingMode != float_round_to_zero ) {
1648         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1649             z += roundBitsMask;
1650         }
1651     }
1652     z &= ~ roundBitsMask;
1653     if ( z != a ) float_set_inexact();
1654     return z;
1655 
1656 }
1657 #endif /* !SOFTFLOAT_FOR_GCC */
1658 
1659 /*
1660 -------------------------------------------------------------------------------
1661 Returns the result of adding the absolute values of the single-precision
1662 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1663 before being returned.  `zSign' is ignored if the result is a NaN.
1664 The addition is performed according to the IEC/IEEE Standard for Binary
1665 Floating-Point Arithmetic.
1666 -------------------------------------------------------------------------------
1667 */
1668 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1669 {
1670     int16 aExp, bExp, zExp;
1671     bits32 aSig, bSig, zSig;
1672     int16 expDiff;
1673 
1674     aSig = extractFloat32Frac( a );
1675     aExp = extractFloat32Exp( a );
1676     bSig = extractFloat32Frac( b );
1677     bExp = extractFloat32Exp( b );
1678     expDiff = aExp - bExp;
1679     aSig <<= 6;
1680     bSig <<= 6;
1681     if ( 0 < expDiff ) {
1682         if ( aExp == 0xFF ) {
1683             if ( aSig ) return propagateFloat32NaN( a, b );
1684             return a;
1685         }
1686         if ( bExp == 0 ) {
1687             --expDiff;
1688         }
1689         else {
1690             bSig |= 0x20000000;
1691         }
1692         shift32RightJamming( bSig, expDiff, &bSig );
1693         zExp = aExp;
1694     }
1695     else if ( expDiff < 0 ) {
1696         if ( bExp == 0xFF ) {
1697             if ( bSig ) return propagateFloat32NaN( a, b );
1698             return packFloat32( zSign, 0xFF, 0 );
1699         }
1700         if ( aExp == 0 ) {
1701             ++expDiff;
1702         }
1703         else {
1704             aSig |= 0x20000000;
1705         }
1706         shift32RightJamming( aSig, - expDiff, &aSig );
1707         zExp = bExp;
1708     }
1709     else {
1710         if ( aExp == 0xFF ) {
1711             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1712             return a;
1713         }
1714         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1715         zSig = 0x40000000 + aSig + bSig;
1716         zExp = aExp;
1717         goto roundAndPack;
1718     }
1719     aSig |= 0x20000000;
1720     zSig = ( aSig + bSig )<<1;
1721     --zExp;
1722     if ( (sbits32) zSig < 0 ) {
1723         zSig = aSig + bSig;
1724         ++zExp;
1725     }
1726  roundAndPack:
1727     return roundAndPackFloat32( zSign, zExp, zSig );
1728 
1729 }
1730 
1731 /*
1732 -------------------------------------------------------------------------------
1733 Returns the result of subtracting the absolute values of the single-
1734 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1735 difference is negated before being returned.  `zSign' is ignored if the
1736 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1737 Standard for Binary Floating-Point Arithmetic.
1738 -------------------------------------------------------------------------------
1739 */
1740 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1741 {
1742     int16 aExp, bExp, zExp;
1743     bits32 aSig, bSig, zSig;
1744     int16 expDiff;
1745 
1746     aSig = extractFloat32Frac( a );
1747     aExp = extractFloat32Exp( a );
1748     bSig = extractFloat32Frac( b );
1749     bExp = extractFloat32Exp( b );
1750     expDiff = aExp - bExp;
1751     aSig <<= 7;
1752     bSig <<= 7;
1753     if ( 0 < expDiff ) goto aExpBigger;
1754     if ( expDiff < 0 ) goto bExpBigger;
1755     if ( aExp == 0xFF ) {
1756         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1757         float_raise( float_flag_invalid );
1758         return float32_default_nan;
1759     }
1760     if ( aExp == 0 ) {
1761         aExp = 1;
1762         bExp = 1;
1763     }
1764     if ( bSig < aSig ) goto aBigger;
1765     if ( aSig < bSig ) goto bBigger;
1766     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1767  bExpBigger:
1768     if ( bExp == 0xFF ) {
1769         if ( bSig ) return propagateFloat32NaN( a, b );
1770         return packFloat32( zSign ^ 1, 0xFF, 0 );
1771     }
1772     if ( aExp == 0 ) {
1773         ++expDiff;
1774     }
1775     else {
1776         aSig |= 0x40000000;
1777     }
1778     shift32RightJamming( aSig, - expDiff, &aSig );
1779     bSig |= 0x40000000;
1780  bBigger:
1781     zSig = bSig - aSig;
1782     zExp = bExp;
1783     zSign ^= 1;
1784     goto normalizeRoundAndPack;
1785  aExpBigger:
1786     if ( aExp == 0xFF ) {
1787         if ( aSig ) return propagateFloat32NaN( a, b );
1788         return a;
1789     }
1790     if ( bExp == 0 ) {
1791         --expDiff;
1792     }
1793     else {
1794         bSig |= 0x40000000;
1795     }
1796     shift32RightJamming( bSig, expDiff, &bSig );
1797     aSig |= 0x40000000;
1798  aBigger:
1799     zSig = aSig - bSig;
1800     zExp = aExp;
1801  normalizeRoundAndPack:
1802     --zExp;
1803     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1804 
1805 }
1806 
1807 /*
1808 -------------------------------------------------------------------------------
1809 Returns the result of adding the single-precision floating-point values `a'
1810 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1811 Binary Floating-Point Arithmetic.
1812 -------------------------------------------------------------------------------
1813 */
1814 float32 float32_add( float32 a, float32 b )
1815 {
1816     flag aSign, bSign;
1817 
1818     aSign = extractFloat32Sign( a );
1819     bSign = extractFloat32Sign( b );
1820     if ( aSign == bSign ) {
1821         return addFloat32Sigs( a, b, aSign );
1822     }
1823     else {
1824         return subFloat32Sigs( a, b, aSign );
1825     }
1826 
1827 }
1828 
1829 /*
1830 -------------------------------------------------------------------------------
1831 Returns the result of subtracting the single-precision floating-point values
1832 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1833 for Binary Floating-Point Arithmetic.
1834 -------------------------------------------------------------------------------
1835 */
1836 float32 float32_sub( float32 a, float32 b )
1837 {
1838     flag aSign, bSign;
1839 
1840     aSign = extractFloat32Sign( a );
1841     bSign = extractFloat32Sign( b );
1842     if ( aSign == bSign ) {
1843         return subFloat32Sigs( a, b, aSign );
1844     }
1845     else {
1846         return addFloat32Sigs( a, b, aSign );
1847     }
1848 
1849 }
1850 
1851 /*
1852 -------------------------------------------------------------------------------
1853 Returns the result of multiplying the single-precision floating-point values
1854 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1855 for Binary Floating-Point Arithmetic.
1856 -------------------------------------------------------------------------------
1857 */
1858 float32 float32_mul( float32 a, float32 b )
1859 {
1860     flag aSign, bSign, zSign;
1861     int16 aExp, bExp, zExp;
1862     bits32 aSig, bSig;
1863     bits64 zSig64;
1864     bits32 zSig;
1865 
1866     aSig = extractFloat32Frac( a );
1867     aExp = extractFloat32Exp( a );
1868     aSign = extractFloat32Sign( a );
1869     bSig = extractFloat32Frac( b );
1870     bExp = extractFloat32Exp( b );
1871     bSign = extractFloat32Sign( b );
1872     zSign = aSign ^ bSign;
1873     if ( aExp == 0xFF ) {
1874         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1875             return propagateFloat32NaN( a, b );
1876         }
1877         if ( ( bExp | bSig ) == 0 ) {
1878             float_raise( float_flag_invalid );
1879             return float32_default_nan;
1880         }
1881         return packFloat32( zSign, 0xFF, 0 );
1882     }
1883     if ( bExp == 0xFF ) {
1884         if ( bSig ) return propagateFloat32NaN( a, b );
1885         if ( ( aExp | aSig ) == 0 ) {
1886             float_raise( float_flag_invalid );
1887             return float32_default_nan;
1888         }
1889         return packFloat32( zSign, 0xFF, 0 );
1890     }
1891     if ( aExp == 0 ) {
1892         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1893         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1894     }
1895     if ( bExp == 0 ) {
1896         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1897         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1898     }
1899     zExp = aExp + bExp - 0x7F;
1900     aSig = ( aSig | 0x00800000 )<<7;
1901     bSig = ( bSig | 0x00800000 )<<8;
1902     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1903     zSig = zSig64;
1904     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1905         zSig <<= 1;
1906         --zExp;
1907     }
1908     return roundAndPackFloat32( zSign, zExp, zSig );
1909 
1910 }
1911 
1912 /*
1913 -------------------------------------------------------------------------------
1914 Returns the result of dividing the single-precision floating-point value `a'
1915 by the corresponding value `b'.  The operation is performed according to the
1916 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1917 -------------------------------------------------------------------------------
1918 */
1919 float32 float32_div( float32 a, float32 b )
1920 {
1921     flag aSign, bSign, zSign;
1922     int16 aExp, bExp, zExp;
1923     bits32 aSig, bSig, zSig;
1924 
1925     aSig = extractFloat32Frac( a );
1926     aExp = extractFloat32Exp( a );
1927     aSign = extractFloat32Sign( a );
1928     bSig = extractFloat32Frac( b );
1929     bExp = extractFloat32Exp( b );
1930     bSign = extractFloat32Sign( b );
1931     zSign = aSign ^ bSign;
1932     if ( aExp == 0xFF ) {
1933         if ( aSig ) return propagateFloat32NaN( a, b );
1934         if ( bExp == 0xFF ) {
1935             if ( bSig ) return propagateFloat32NaN( a, b );
1936             float_raise( float_flag_invalid );
1937             return float32_default_nan;
1938         }
1939         return packFloat32( zSign, 0xFF, 0 );
1940     }
1941     if ( bExp == 0xFF ) {
1942         if ( bSig ) return propagateFloat32NaN( a, b );
1943         return packFloat32( zSign, 0, 0 );
1944     }
1945     if ( bExp == 0 ) {
1946         if ( bSig == 0 ) {
1947             if ( ( aExp | aSig ) == 0 ) {
1948                 float_raise( float_flag_invalid );
1949                 return float32_default_nan;
1950             }
1951             float_raise( float_flag_divbyzero );
1952             return packFloat32( zSign, 0xFF, 0 );
1953         }
1954         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1955     }
1956     if ( aExp == 0 ) {
1957         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1958         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1959     }
1960     zExp = aExp - bExp + 0x7D;
1961     aSig = ( aSig | 0x00800000 )<<7;
1962     bSig = ( bSig | 0x00800000 )<<8;
1963     if ( bSig <= ( aSig + aSig ) ) {
1964         aSig >>= 1;
1965         ++zExp;
1966     }
1967     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1968     if ( ( zSig & 0x3F ) == 0 ) {
1969         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1970     }
1971     return roundAndPackFloat32( zSign, zExp, zSig );
1972 
1973 }
1974 
1975 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1976 /*
1977 -------------------------------------------------------------------------------
1978 Returns the remainder of the single-precision floating-point value `a'
1979 with respect to the corresponding value `b'.  The operation is performed
1980 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1981 -------------------------------------------------------------------------------
1982 */
1983 float32 float32_rem( float32 a, float32 b )
1984 {
1985     flag aSign, bSign, zSign;
1986     int16 aExp, bExp, expDiff;
1987     bits32 aSig, bSig;
1988     bits32 q;
1989     bits64 aSig64, bSig64, q64;
1990     bits32 alternateASig;
1991     sbits32 sigMean;
1992 
1993     aSig = extractFloat32Frac( a );
1994     aExp = extractFloat32Exp( a );
1995     aSign = extractFloat32Sign( a );
1996     bSig = extractFloat32Frac( b );
1997     bExp = extractFloat32Exp( b );
1998     bSign = extractFloat32Sign( b );
1999     if ( aExp == 0xFF ) {
2000         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2001             return propagateFloat32NaN( a, b );
2002         }
2003         float_raise( float_flag_invalid );
2004         return float32_default_nan;
2005     }
2006     if ( bExp == 0xFF ) {
2007         if ( bSig ) return propagateFloat32NaN( a, b );
2008         return a;
2009     }
2010     if ( bExp == 0 ) {
2011         if ( bSig == 0 ) {
2012             float_raise( float_flag_invalid );
2013             return float32_default_nan;
2014         }
2015         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2016     }
2017     if ( aExp == 0 ) {
2018         if ( aSig == 0 ) return a;
2019         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2020     }
2021     expDiff = aExp - bExp;
2022     aSig |= 0x00800000;
2023     bSig |= 0x00800000;
2024     if ( expDiff < 32 ) {
2025         aSig <<= 8;
2026         bSig <<= 8;
2027         if ( expDiff < 0 ) {
2028             if ( expDiff < -1 ) return a;
2029             aSig >>= 1;
2030         }
2031         q = ( bSig <= aSig );
2032         if ( q ) aSig -= bSig;
2033         if ( 0 < expDiff ) {
2034             q = ( ( (bits64) aSig )<<32 ) / bSig;
2035             q >>= 32 - expDiff;
2036             bSig >>= 2;
2037             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2038         }
2039         else {
2040             aSig >>= 2;
2041             bSig >>= 2;
2042         }
2043     }
2044     else {
2045         if ( bSig <= aSig ) aSig -= bSig;
2046         aSig64 = ( (bits64) aSig )<<40;
2047         bSig64 = ( (bits64) bSig )<<40;
2048         expDiff -= 64;
2049         while ( 0 < expDiff ) {
2050             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2051             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2052             aSig64 = - ( ( bSig * q64 )<<38 );
2053             expDiff -= 62;
2054         }
2055         expDiff += 64;
2056         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2057         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2058         q = q64>>( 64 - expDiff );
2059         bSig <<= 6;
2060         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2061     }
2062     do {
2063         alternateASig = aSig;
2064         ++q;
2065         aSig -= bSig;
2066     } while ( 0 <= (sbits32) aSig );
2067     sigMean = aSig + alternateASig;
2068     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2069         aSig = alternateASig;
2070     }
2071     zSign = ( (sbits32) aSig < 0 );
2072     if ( zSign ) aSig = - aSig;
2073     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2074 
2075 }
2076 #endif /* !SOFTFLOAT_FOR_GCC */
2077 
2078 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2079 /*
2080 -------------------------------------------------------------------------------
2081 Returns the square root of the single-precision floating-point value `a'.
2082 The operation is performed according to the IEC/IEEE Standard for Binary
2083 Floating-Point Arithmetic.
2084 -------------------------------------------------------------------------------
2085 */
2086 float32 float32_sqrt( float32 a )
2087 {
2088     flag aSign;
2089     int16 aExp, zExp;
2090     bits32 aSig, zSig;
2091     bits64 rem, term;
2092 
2093     aSig = extractFloat32Frac( a );
2094     aExp = extractFloat32Exp( a );
2095     aSign = extractFloat32Sign( a );
2096     if ( aExp == 0xFF ) {
2097         if ( aSig ) return propagateFloat32NaN( a, 0 );
2098         if ( ! aSign ) return a;
2099         float_raise( float_flag_invalid );
2100         return float32_default_nan;
2101     }
2102     if ( aSign ) {
2103         if ( ( aExp | aSig ) == 0 ) return a;
2104         float_raise( float_flag_invalid );
2105         return float32_default_nan;
2106     }
2107     if ( aExp == 0 ) {
2108         if ( aSig == 0 ) return 0;
2109         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2110     }
2111     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2112     aSig = ( aSig | 0x00800000 )<<8;
2113     zSig = estimateSqrt32( aExp, aSig ) + 2;
2114     if ( ( zSig & 0x7F ) <= 5 ) {
2115         if ( zSig < 2 ) {
2116             zSig = 0x7FFFFFFF;
2117             goto roundAndPack;
2118         }
2119         aSig >>= aExp & 1;
2120         term = ( (bits64) zSig ) * zSig;
2121         rem = ( ( (bits64) aSig )<<32 ) - term;
2122         while ( (sbits64) rem < 0 ) {
2123             --zSig;
2124             rem += ( ( (bits64) zSig )<<1 ) | 1;
2125         }
2126         zSig |= ( rem != 0 );
2127     }
2128     shift32RightJamming( zSig, 1, &zSig );
2129  roundAndPack:
2130     return roundAndPackFloat32( 0, zExp, zSig );
2131 
2132 }
2133 #endif /* !SOFTFLOAT_FOR_GCC */
2134 
2135 /*
2136 -------------------------------------------------------------------------------
2137 Returns 1 if the single-precision floating-point value `a' is equal to
2138 the corresponding value `b', and 0 otherwise.  The comparison is performed
2139 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2140 -------------------------------------------------------------------------------
2141 */
2142 flag float32_eq( float32 a, float32 b )
2143 {
2144 
2145     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2146          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2147        ) {
2148         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2149             float_raise( float_flag_invalid );
2150         }
2151         return 0;
2152     }
2153     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2154 
2155 }
2156 
2157 /*
2158 -------------------------------------------------------------------------------
2159 Returns 1 if the single-precision floating-point value `a' is less than
2160 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2161 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2162 Arithmetic.
2163 -------------------------------------------------------------------------------
2164 */
2165 flag float32_le( float32 a, float32 b )
2166 {
2167     flag aSign, bSign;
2168 
2169     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2170          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2171        ) {
2172         float_raise( float_flag_invalid );
2173         return 0;
2174     }
2175     aSign = extractFloat32Sign( a );
2176     bSign = extractFloat32Sign( b );
2177     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2178     return ( a == b ) || ( aSign ^ ( a < b ) );
2179 
2180 }
2181 
2182 /*
2183 -------------------------------------------------------------------------------
2184 Returns 1 if the single-precision floating-point value `a' is less than
2185 the corresponding value `b', and 0 otherwise.  The comparison is performed
2186 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2187 -------------------------------------------------------------------------------
2188 */
2189 flag float32_lt( float32 a, float32 b )
2190 {
2191     flag aSign, bSign;
2192 
2193     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2194          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2195        ) {
2196         float_raise( float_flag_invalid );
2197         return 0;
2198     }
2199     aSign = extractFloat32Sign( a );
2200     bSign = extractFloat32Sign( b );
2201     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2202     return ( a != b ) && ( aSign ^ ( a < b ) );
2203 
2204 }
2205 
2206 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2207 /*
2208 -------------------------------------------------------------------------------
2209 Returns 1 if the single-precision floating-point value `a' is equal to
2210 the corresponding value `b', and 0 otherwise.  The invalid exception is
2211 raised if either operand is a NaN.  Otherwise, the comparison is performed
2212 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2213 -------------------------------------------------------------------------------
2214 */
2215 flag float32_eq_signaling( float32 a, float32 b )
2216 {
2217 
2218     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2219          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2220        ) {
2221         float_raise( float_flag_invalid );
2222         return 0;
2223     }
2224     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2225 
2226 }
2227 
2228 /*
2229 -------------------------------------------------------------------------------
2230 Returns 1 if the single-precision floating-point value `a' is less than or
2231 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2232 cause an exception.  Otherwise, the comparison is performed according to the
2233 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2234 -------------------------------------------------------------------------------
2235 */
2236 flag float32_le_quiet( float32 a, float32 b )
2237 {
2238     flag aSign, bSign;
2239 
2240     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2241          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2242        ) {
2243         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2244             float_raise( float_flag_invalid );
2245         }
2246         return 0;
2247     }
2248     aSign = extractFloat32Sign( a );
2249     bSign = extractFloat32Sign( b );
2250     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2251     return ( a == b ) || ( aSign ^ ( a < b ) );
2252 
2253 }
2254 
2255 /*
2256 -------------------------------------------------------------------------------
2257 Returns 1 if the single-precision floating-point value `a' is less than
2258 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2259 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2260 Standard for Binary Floating-Point Arithmetic.
2261 -------------------------------------------------------------------------------
2262 */
2263 flag float32_lt_quiet( float32 a, float32 b )
2264 {
2265     flag aSign, bSign;
2266 
2267     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2268          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2269        ) {
2270         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2271             float_raise( float_flag_invalid );
2272         }
2273         return 0;
2274     }
2275     aSign = extractFloat32Sign( a );
2276     bSign = extractFloat32Sign( b );
2277     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2278     return ( a != b ) && ( aSign ^ ( a < b ) );
2279 
2280 }
2281 #endif /* !SOFTFLOAT_FOR_GCC */
2282 
2283 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2284 /*
2285 -------------------------------------------------------------------------------
2286 Returns the result of converting the double-precision floating-point value
2287 `a' to the 32-bit two's complement integer format.  The conversion is
2288 performed according to the IEC/IEEE Standard for Binary Floating-Point
2289 Arithmetic---which means in particular that the conversion is rounded
2290 according to the current rounding mode.  If `a' is a NaN, the largest
2291 positive integer is returned.  Otherwise, if the conversion overflows, the
2292 largest integer with the same sign as `a' is returned.
2293 -------------------------------------------------------------------------------
2294 */
2295 int32 float64_to_int32( float64 a )
2296 {
2297     flag aSign;
2298     int16 aExp, shiftCount;
2299     bits64 aSig;
2300 
2301     aSig = extractFloat64Frac( a );
2302     aExp = extractFloat64Exp( a );
2303     aSign = extractFloat64Sign( a );
2304     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2305     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2306     shiftCount = 0x42C - aExp;
2307     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2308     return roundAndPackInt32( aSign, aSig );
2309 
2310 }
2311 #endif /* !SOFTFLOAT_FOR_GCC */
2312 
2313 /*
2314 -------------------------------------------------------------------------------
2315 Returns the result of converting the double-precision floating-point value
2316 `a' to the 32-bit two's complement integer format.  The conversion is
2317 performed according to the IEC/IEEE Standard for Binary Floating-Point
2318 Arithmetic, except that the conversion is always rounded toward zero.
2319 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2320 the conversion overflows, the largest integer with the same sign as `a' is
2321 returned.
2322 -------------------------------------------------------------------------------
2323 */
2324 int32 float64_to_int32_round_to_zero( float64 a )
2325 {
2326     flag aSign;
2327     int16 aExp, shiftCount;
2328     bits64 aSig, savedASig;
2329     int32 z;
2330 
2331     aSig = extractFloat64Frac( a );
2332     aExp = extractFloat64Exp( a );
2333     aSign = extractFloat64Sign( a );
2334     if ( 0x41E < aExp ) {
2335         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2336         goto invalid;
2337     }
2338     else if ( aExp < 0x3FF ) {
2339         if ( aExp || aSig ) float_set_inexact();
2340         return 0;
2341     }
2342     aSig |= LIT64( 0x0010000000000000 );
2343     shiftCount = 0x433 - aExp;
2344     savedASig = aSig;
2345     aSig >>= shiftCount;
2346     z = aSig;
2347     if ( aSign ) z = - z;
2348     if ( ( z < 0 ) ^ aSign ) {
2349  invalid:
2350         float_raise( float_flag_invalid );
2351         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2352     }
2353     if ( ( aSig<<shiftCount ) != savedASig ) {
2354         float_set_inexact();
2355     }
2356     return z;
2357 
2358 }
2359 
2360 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2361 /*
2362 -------------------------------------------------------------------------------
2363 Returns the result of converting the double-precision floating-point value
2364 `a' to the 64-bit two's complement integer format.  The conversion is
2365 performed according to the IEC/IEEE Standard for Binary Floating-Point
2366 Arithmetic---which means in particular that the conversion is rounded
2367 according to the current rounding mode.  If `a' is a NaN, the largest
2368 positive integer is returned.  Otherwise, if the conversion overflows, the
2369 largest integer with the same sign as `a' is returned.
2370 -------------------------------------------------------------------------------
2371 */
2372 int64 float64_to_int64( float64 a )
2373 {
2374     flag aSign;
2375     int16 aExp, shiftCount;
2376     bits64 aSig, aSigExtra;
2377 
2378     aSig = extractFloat64Frac( a );
2379     aExp = extractFloat64Exp( a );
2380     aSign = extractFloat64Sign( a );
2381     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2382     shiftCount = 0x433 - aExp;
2383     if ( shiftCount <= 0 ) {
2384         if ( 0x43E < aExp ) {
2385             float_raise( float_flag_invalid );
2386             if (    ! aSign
2387                  || (    ( aExp == 0x7FF )
2388                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2389                ) {
2390                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2391             }
2392             return (sbits64) LIT64( 0x8000000000000000 );
2393         }
2394         aSigExtra = 0;
2395         aSig <<= - shiftCount;
2396     }
2397     else {
2398         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2399     }
2400     return roundAndPackInt64( aSign, aSig, aSigExtra );
2401 
2402 }
2403 
2404 /*
2405 -------------------------------------------------------------------------------
2406 Returns the result of converting the double-precision floating-point value
2407 `a' to the 64-bit two's complement integer format.  The conversion is
2408 performed according to the IEC/IEEE Standard for Binary Floating-Point
2409 Arithmetic, except that the conversion is always rounded toward zero.
2410 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2411 the conversion overflows, the largest integer with the same sign as `a' is
2412 returned.
2413 -------------------------------------------------------------------------------
2414 */
2415 int64 float64_to_int64_round_to_zero( float64 a )
2416 {
2417     flag aSign;
2418     int16 aExp, shiftCount;
2419     bits64 aSig;
2420     int64 z;
2421 
2422     aSig = extractFloat64Frac( a );
2423     aExp = extractFloat64Exp( a );
2424     aSign = extractFloat64Sign( a );
2425     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2426     shiftCount = aExp - 0x433;
2427     if ( 0 <= shiftCount ) {
2428         if ( 0x43E <= aExp ) {
2429             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2430                 float_raise( float_flag_invalid );
2431                 if (    ! aSign
2432                      || (    ( aExp == 0x7FF )
2433                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2434                    ) {
2435                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2436                 }
2437             }
2438             return (sbits64) LIT64( 0x8000000000000000 );
2439         }
2440         z = aSig<<shiftCount;
2441     }
2442     else {
2443         if ( aExp < 0x3FE ) {
2444             if ( aExp | aSig ) float_set_inexact();
2445             return 0;
2446         }
2447         z = aSig>>( - shiftCount );
2448         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2449             float_set_inexact();
2450         }
2451     }
2452     if ( aSign ) z = - z;
2453     return z;
2454 
2455 }
2456 #endif /* !SOFTFLOAT_FOR_GCC */
2457 
2458 /*
2459 -------------------------------------------------------------------------------
2460 Returns the result of converting the double-precision floating-point value
2461 `a' to the single-precision floating-point format.  The conversion is
2462 performed according to the IEC/IEEE Standard for Binary Floating-Point
2463 Arithmetic.
2464 -------------------------------------------------------------------------------
2465 */
2466 float32 float64_to_float32( float64 a )
2467 {
2468     flag aSign;
2469     int16 aExp;
2470     bits64 aSig;
2471     bits32 zSig;
2472 
2473     aSig = extractFloat64Frac( a );
2474     aExp = extractFloat64Exp( a );
2475     aSign = extractFloat64Sign( a );
2476     if ( aExp == 0x7FF ) {
2477         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2478         return packFloat32( aSign, 0xFF, 0 );
2479     }
2480     shift64RightJamming( aSig, 22, &aSig );
2481     zSig = aSig;
2482     if ( aExp || zSig ) {
2483         zSig |= 0x40000000;
2484         aExp -= 0x381;
2485     }
2486     return roundAndPackFloat32( aSign, aExp, zSig );
2487 
2488 }
2489 
2490 #ifdef FLOATX80
2491 
2492 /*
2493 -------------------------------------------------------------------------------
2494 Returns the result of converting the double-precision floating-point value
2495 `a' to the extended double-precision floating-point format.  The conversion
2496 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2497 Arithmetic.
2498 -------------------------------------------------------------------------------
2499 */
2500 floatx80 float64_to_floatx80( float64 a )
2501 {
2502     flag aSign;
2503     int16 aExp;
2504     bits64 aSig;
2505 
2506     aSig = extractFloat64Frac( a );
2507     aExp = extractFloat64Exp( a );
2508     aSign = extractFloat64Sign( a );
2509     if ( aExp == 0x7FF ) {
2510         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2511         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2512     }
2513     if ( aExp == 0 ) {
2514         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2515         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2516     }
2517     return
2518         packFloatx80(
2519             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2520 
2521 }
2522 
2523 #endif
2524 
2525 #ifdef FLOAT128
2526 
2527 /*
2528 -------------------------------------------------------------------------------
2529 Returns the result of converting the double-precision floating-point value
2530 `a' to the quadruple-precision floating-point format.  The conversion is
2531 performed according to the IEC/IEEE Standard for Binary Floating-Point
2532 Arithmetic.
2533 -------------------------------------------------------------------------------
2534 */
2535 float128 float64_to_float128( float64 a )
2536 {
2537     flag aSign;
2538     int16 aExp;
2539     bits64 aSig, zSig0, zSig1;
2540 
2541     aSig = extractFloat64Frac( a );
2542     aExp = extractFloat64Exp( a );
2543     aSign = extractFloat64Sign( a );
2544     if ( aExp == 0x7FF ) {
2545         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2546         return packFloat128( aSign, 0x7FFF, 0, 0 );
2547     }
2548     if ( aExp == 0 ) {
2549         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2550         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2551         --aExp;
2552     }
2553     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2554     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2555 
2556 }
2557 
2558 #endif
2559 
2560 #ifndef SOFTFLOAT_FOR_GCC
2561 /*
2562 -------------------------------------------------------------------------------
2563 Rounds the double-precision floating-point value `a' to an integer, and
2564 returns the result as a double-precision floating-point value.  The
2565 operation is performed according to the IEC/IEEE Standard for Binary
2566 Floating-Point Arithmetic.
2567 -------------------------------------------------------------------------------
2568 */
2569 float64 float64_round_to_int( float64 a )
2570 {
2571     flag aSign;
2572     int16 aExp;
2573     bits64 lastBitMask, roundBitsMask;
2574     int8 roundingMode;
2575     float64 z;
2576 
2577     aExp = extractFloat64Exp( a );
2578     if ( 0x433 <= aExp ) {
2579         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2580             return propagateFloat64NaN( a, a );
2581         }
2582         return a;
2583     }
2584     if ( aExp < 0x3FF ) {
2585         if ( (bits64) ( a<<1 ) == 0 ) return a;
2586         float_set_inexact();
2587         aSign = extractFloat64Sign( a );
2588         switch ( float_rounding_mode() ) {
2589          case float_round_nearest_even:
2590             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2591                 return packFloat64( aSign, 0x3FF, 0 );
2592             }
2593             break;
2594          case float_round_down:
2595             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2596          case float_round_up:
2597             return
2598             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2599         }
2600         return packFloat64( aSign, 0, 0 );
2601     }
2602     lastBitMask = 1;
2603     lastBitMask <<= 0x433 - aExp;
2604     roundBitsMask = lastBitMask - 1;
2605     z = a;
2606     roundingMode = float_rounding_mode();
2607     if ( roundingMode == float_round_nearest_even ) {
2608         z += lastBitMask>>1;
2609         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2610     }
2611     else if ( roundingMode != float_round_to_zero ) {
2612         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2613             z += roundBitsMask;
2614         }
2615     }
2616     z &= ~ roundBitsMask;
2617     if ( z != a ) float_set_inexact();
2618     return z;
2619 
2620 }
2621 #endif
2622 
2623 /*
2624 -------------------------------------------------------------------------------
2625 Returns the result of adding the absolute values of the double-precision
2626 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2627 before being returned.  `zSign' is ignored if the result is a NaN.
2628 The addition is performed according to the IEC/IEEE Standard for Binary
2629 Floating-Point Arithmetic.
2630 -------------------------------------------------------------------------------
2631 */
2632 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2633 {
2634     int16 aExp, bExp, zExp;
2635     bits64 aSig, bSig, zSig;
2636     int16 expDiff;
2637 
2638     aSig = extractFloat64Frac( a );
2639     aExp = extractFloat64Exp( a );
2640     bSig = extractFloat64Frac( b );
2641     bExp = extractFloat64Exp( b );
2642     expDiff = aExp - bExp;
2643     aSig <<= 9;
2644     bSig <<= 9;
2645     if ( 0 < expDiff ) {
2646         if ( aExp == 0x7FF ) {
2647             if ( aSig ) return propagateFloat64NaN( a, b );
2648             return a;
2649         }
2650         if ( bExp == 0 ) {
2651             --expDiff;
2652         }
2653         else {
2654             bSig |= LIT64( 0x2000000000000000 );
2655         }
2656         shift64RightJamming( bSig, expDiff, &bSig );
2657         zExp = aExp;
2658     }
2659     else if ( expDiff < 0 ) {
2660         if ( bExp == 0x7FF ) {
2661             if ( bSig ) return propagateFloat64NaN( a, b );
2662             return packFloat64( zSign, 0x7FF, 0 );
2663         }
2664         if ( aExp == 0 ) {
2665             ++expDiff;
2666         }
2667         else {
2668             aSig |= LIT64( 0x2000000000000000 );
2669         }
2670         shift64RightJamming( aSig, - expDiff, &aSig );
2671         zExp = bExp;
2672     }
2673     else {
2674         if ( aExp == 0x7FF ) {
2675             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2676             return a;
2677         }
2678         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2679         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2680         zExp = aExp;
2681         goto roundAndPack;
2682     }
2683     aSig |= LIT64( 0x2000000000000000 );
2684     zSig = ( aSig + bSig )<<1;
2685     --zExp;
2686     if ( (sbits64) zSig < 0 ) {
2687         zSig = aSig + bSig;
2688         ++zExp;
2689     }
2690  roundAndPack:
2691     return roundAndPackFloat64( zSign, zExp, zSig );
2692 
2693 }
2694 
2695 /*
2696 -------------------------------------------------------------------------------
2697 Returns the result of subtracting the absolute values of the double-
2698 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2699 difference is negated before being returned.  `zSign' is ignored if the
2700 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2701 Standard for Binary Floating-Point Arithmetic.
2702 -------------------------------------------------------------------------------
2703 */
2704 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2705 {
2706     int16 aExp, bExp, zExp;
2707     bits64 aSig, bSig, zSig;
2708     int16 expDiff;
2709 
2710     aSig = extractFloat64Frac( a );
2711     aExp = extractFloat64Exp( a );
2712     bSig = extractFloat64Frac( b );
2713     bExp = extractFloat64Exp( b );
2714     expDiff = aExp - bExp;
2715     aSig <<= 10;
2716     bSig <<= 10;
2717     if ( 0 < expDiff ) goto aExpBigger;
2718     if ( expDiff < 0 ) goto bExpBigger;
2719     if ( aExp == 0x7FF ) {
2720         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2721         float_raise( float_flag_invalid );
2722         return float64_default_nan;
2723     }
2724     if ( aExp == 0 ) {
2725         aExp = 1;
2726         bExp = 1;
2727     }
2728     if ( bSig < aSig ) goto aBigger;
2729     if ( aSig < bSig ) goto bBigger;
2730     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2731  bExpBigger:
2732     if ( bExp == 0x7FF ) {
2733         if ( bSig ) return propagateFloat64NaN( a, b );
2734         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2735     }
2736     if ( aExp == 0 ) {
2737         ++expDiff;
2738     }
2739     else {
2740         aSig |= LIT64( 0x4000000000000000 );
2741     }
2742     shift64RightJamming( aSig, - expDiff, &aSig );
2743     bSig |= LIT64( 0x4000000000000000 );
2744  bBigger:
2745     zSig = bSig - aSig;
2746     zExp = bExp;
2747     zSign ^= 1;
2748     goto normalizeRoundAndPack;
2749  aExpBigger:
2750     if ( aExp == 0x7FF ) {
2751         if ( aSig ) return propagateFloat64NaN( a, b );
2752         return a;
2753     }
2754     if ( bExp == 0 ) {
2755         --expDiff;
2756     }
2757     else {
2758         bSig |= LIT64( 0x4000000000000000 );
2759     }
2760     shift64RightJamming( bSig, expDiff, &bSig );
2761     aSig |= LIT64( 0x4000000000000000 );
2762  aBigger:
2763     zSig = aSig - bSig;
2764     zExp = aExp;
2765  normalizeRoundAndPack:
2766     --zExp;
2767     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2768 
2769 }
2770 
2771 /*
2772 -------------------------------------------------------------------------------
2773 Returns the result of adding the double-precision floating-point values `a'
2774 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2775 Binary Floating-Point Arithmetic.
2776 -------------------------------------------------------------------------------
2777 */
2778 float64 float64_add( float64 a, float64 b )
2779 {
2780     flag aSign, bSign;
2781 
2782     aSign = extractFloat64Sign( a );
2783     bSign = extractFloat64Sign( b );
2784     if ( aSign == bSign ) {
2785         return addFloat64Sigs( a, b, aSign );
2786     }
2787     else {
2788         return subFloat64Sigs( a, b, aSign );
2789     }
2790 
2791 }
2792 
2793 /*
2794 -------------------------------------------------------------------------------
2795 Returns the result of subtracting the double-precision floating-point values
2796 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2797 for Binary Floating-Point Arithmetic.
2798 -------------------------------------------------------------------------------
2799 */
2800 float64 float64_sub( float64 a, float64 b )
2801 {
2802     flag aSign, bSign;
2803 
2804     aSign = extractFloat64Sign( a );
2805     bSign = extractFloat64Sign( b );
2806     if ( aSign == bSign ) {
2807         return subFloat64Sigs( a, b, aSign );
2808     }
2809     else {
2810         return addFloat64Sigs( a, b, aSign );
2811     }
2812 
2813 }
2814 
2815 /*
2816 -------------------------------------------------------------------------------
2817 Returns the result of multiplying the double-precision floating-point values
2818 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2819 for Binary Floating-Point Arithmetic.
2820 -------------------------------------------------------------------------------
2821 */
2822 float64 float64_mul( float64 a, float64 b )
2823 {
2824     flag aSign, bSign, zSign;
2825     int16 aExp, bExp, zExp;
2826     bits64 aSig, bSig, zSig0, zSig1;
2827 
2828     aSig = extractFloat64Frac( a );
2829     aExp = extractFloat64Exp( a );
2830     aSign = extractFloat64Sign( a );
2831     bSig = extractFloat64Frac( b );
2832     bExp = extractFloat64Exp( b );
2833     bSign = extractFloat64Sign( b );
2834     zSign = aSign ^ bSign;
2835     if ( aExp == 0x7FF ) {
2836         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2837             return propagateFloat64NaN( a, b );
2838         }
2839         if ( ( bExp | bSig ) == 0 ) {
2840             float_raise( float_flag_invalid );
2841             return float64_default_nan;
2842         }
2843         return packFloat64( zSign, 0x7FF, 0 );
2844     }
2845     if ( bExp == 0x7FF ) {
2846         if ( bSig ) return propagateFloat64NaN( a, b );
2847         if ( ( aExp | aSig ) == 0 ) {
2848             float_raise( float_flag_invalid );
2849             return float64_default_nan;
2850         }
2851         return packFloat64( zSign, 0x7FF, 0 );
2852     }
2853     if ( aExp == 0 ) {
2854         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2855         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2856     }
2857     if ( bExp == 0 ) {
2858         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2859         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2860     }
2861     zExp = aExp + bExp - 0x3FF;
2862     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2863     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2864     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2865     zSig0 |= ( zSig1 != 0 );
2866     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2867         zSig0 <<= 1;
2868         --zExp;
2869     }
2870     return roundAndPackFloat64( zSign, zExp, zSig0 );
2871 
2872 }
2873 
2874 /*
2875 -------------------------------------------------------------------------------
2876 Returns the result of dividing the double-precision floating-point value `a'
2877 by the corresponding value `b'.  The operation is performed according to
2878 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2879 -------------------------------------------------------------------------------
2880 */
2881 float64 float64_div( float64 a, float64 b )
2882 {
2883     flag aSign, bSign, zSign;
2884     int16 aExp, bExp, zExp;
2885     bits64 aSig, bSig, zSig;
2886     bits64 rem0, rem1;
2887     bits64 term0, term1;
2888 
2889     aSig = extractFloat64Frac( a );
2890     aExp = extractFloat64Exp( a );
2891     aSign = extractFloat64Sign( a );
2892     bSig = extractFloat64Frac( b );
2893     bExp = extractFloat64Exp( b );
2894     bSign = extractFloat64Sign( b );
2895     zSign = aSign ^ bSign;
2896     if ( aExp == 0x7FF ) {
2897         if ( aSig ) return propagateFloat64NaN( a, b );
2898         if ( bExp == 0x7FF ) {
2899             if ( bSig ) return propagateFloat64NaN( a, b );
2900             float_raise( float_flag_invalid );
2901             return float64_default_nan;
2902         }
2903         return packFloat64( zSign, 0x7FF, 0 );
2904     }
2905     if ( bExp == 0x7FF ) {
2906         if ( bSig ) return propagateFloat64NaN( a, b );
2907         return packFloat64( zSign, 0, 0 );
2908     }
2909     if ( bExp == 0 ) {
2910         if ( bSig == 0 ) {
2911             if ( ( aExp | aSig ) == 0 ) {
2912                 float_raise( float_flag_invalid );
2913                 return float64_default_nan;
2914             }
2915             float_raise( float_flag_divbyzero );
2916             return packFloat64( zSign, 0x7FF, 0 );
2917         }
2918         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2919     }
2920     if ( aExp == 0 ) {
2921         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2922         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2923     }
2924     zExp = aExp - bExp + 0x3FD;
2925     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2926     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2927     if ( bSig <= ( aSig + aSig ) ) {
2928         aSig >>= 1;
2929         ++zExp;
2930     }
2931     zSig = estimateDiv128To64( aSig, 0, bSig );
2932     if ( ( zSig & 0x1FF ) <= 2 ) {
2933         mul64To128( bSig, zSig, &term0, &term1 );
2934         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2935         while ( (sbits64) rem0 < 0 ) {
2936             --zSig;
2937             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2938         }
2939         zSig |= ( rem1 != 0 );
2940     }
2941     return roundAndPackFloat64( zSign, zExp, zSig );
2942 
2943 }
2944 
2945 #ifndef SOFTFLOAT_FOR_GCC
2946 /*
2947 -------------------------------------------------------------------------------
2948 Returns the remainder of the double-precision floating-point value `a'
2949 with respect to the corresponding value `b'.  The operation is performed
2950 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2951 -------------------------------------------------------------------------------
2952 */
2953 float64 float64_rem( float64 a, float64 b )
2954 {
2955     flag aSign, bSign, zSign;
2956     int16 aExp, bExp, expDiff;
2957     bits64 aSig, bSig;
2958     bits64 q, alternateASig;
2959     sbits64 sigMean;
2960 
2961     aSig = extractFloat64Frac( a );
2962     aExp = extractFloat64Exp( a );
2963     aSign = extractFloat64Sign( a );
2964     bSig = extractFloat64Frac( b );
2965     bExp = extractFloat64Exp( b );
2966     bSign = extractFloat64Sign( b );
2967     if ( aExp == 0x7FF ) {
2968         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2969             return propagateFloat64NaN( a, b );
2970         }
2971         float_raise( float_flag_invalid );
2972         return float64_default_nan;
2973     }
2974     if ( bExp == 0x7FF ) {
2975         if ( bSig ) return propagateFloat64NaN( a, b );
2976         return a;
2977     }
2978     if ( bExp == 0 ) {
2979         if ( bSig == 0 ) {
2980             float_raise( float_flag_invalid );
2981             return float64_default_nan;
2982         }
2983         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2984     }
2985     if ( aExp == 0 ) {
2986         if ( aSig == 0 ) return a;
2987         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2988     }
2989     expDiff = aExp - bExp;
2990     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2991     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2992     if ( expDiff < 0 ) {
2993         if ( expDiff < -1 ) return a;
2994         aSig >>= 1;
2995     }
2996     q = ( bSig <= aSig );
2997     if ( q ) aSig -= bSig;
2998     expDiff -= 64;
2999     while ( 0 < expDiff ) {
3000         q = estimateDiv128To64( aSig, 0, bSig );
3001         q = ( 2 < q ) ? q - 2 : 0;
3002         aSig = - ( ( bSig>>2 ) * q );
3003         expDiff -= 62;
3004     }
3005     expDiff += 64;
3006     if ( 0 < expDiff ) {
3007         q = estimateDiv128To64( aSig, 0, bSig );
3008         q = ( 2 < q ) ? q - 2 : 0;
3009         q >>= 64 - expDiff;
3010         bSig >>= 2;
3011         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3012     }
3013     else {
3014         aSig >>= 2;
3015         bSig >>= 2;
3016     }
3017     do {
3018         alternateASig = aSig;
3019         ++q;
3020         aSig -= bSig;
3021     } while ( 0 <= (sbits64) aSig );
3022     sigMean = aSig + alternateASig;
3023     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3024         aSig = alternateASig;
3025     }
3026     zSign = ( (sbits64) aSig < 0 );
3027     if ( zSign ) aSig = - aSig;
3028     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3029 
3030 }
3031 
3032 /*
3033 -------------------------------------------------------------------------------
3034 Returns the square root of the double-precision floating-point value `a'.
3035 The operation is performed according to the IEC/IEEE Standard for Binary
3036 Floating-Point Arithmetic.
3037 -------------------------------------------------------------------------------
3038 */
3039 float64 float64_sqrt( float64 a )
3040 {
3041     flag aSign;
3042     int16 aExp, zExp;
3043     bits64 aSig, zSig, doubleZSig;
3044     bits64 rem0, rem1, term0, term1;
3045 
3046     aSig = extractFloat64Frac( a );
3047     aExp = extractFloat64Exp( a );
3048     aSign = extractFloat64Sign( a );
3049     if ( aExp == 0x7FF ) {
3050         if ( aSig ) return propagateFloat64NaN( a, a );
3051         if ( ! aSign ) return a;
3052         float_raise( float_flag_invalid );
3053         return float64_default_nan;
3054     }
3055     if ( aSign ) {
3056         if ( ( aExp | aSig ) == 0 ) return a;
3057         float_raise( float_flag_invalid );
3058         return float64_default_nan;
3059     }
3060     if ( aExp == 0 ) {
3061         if ( aSig == 0 ) return 0;
3062         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3063     }
3064     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3065     aSig |= LIT64( 0x0010000000000000 );
3066     zSig = estimateSqrt32( aExp, aSig>>21 );
3067     aSig <<= 9 - ( aExp & 1 );
3068     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3069     if ( ( zSig & 0x1FF ) <= 5 ) {
3070         doubleZSig = zSig<<1;
3071         mul64To128( zSig, zSig, &term0, &term1 );
3072         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3073         while ( (sbits64) rem0 < 0 ) {
3074             --zSig;
3075             doubleZSig -= 2;
3076             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3077         }
3078         zSig |= ( ( rem0 | rem1 ) != 0 );
3079     }
3080     return roundAndPackFloat64( 0, zExp, zSig );
3081 
3082 }
3083 #endif
3084 
3085 /*
3086 -------------------------------------------------------------------------------
3087 Returns 1 if the double-precision floating-point value `a' is equal to the
3088 corresponding value `b', and 0 otherwise.  The comparison is performed
3089 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3090 -------------------------------------------------------------------------------
3091 */
3092 flag float64_eq( float64 a, float64 b )
3093 {
3094 
3095     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3096          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3097        ) {
3098         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3099             float_raise( float_flag_invalid );
3100         }
3101         return 0;
3102     }
3103     return ( a == b ) ||
3104 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3105 
3106 }
3107 
3108 /*
3109 -------------------------------------------------------------------------------
3110 Returns 1 if the double-precision floating-point value `a' is less than or
3111 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3112 performed according to the IEC/IEEE Standard for Binary Floating-Point
3113 Arithmetic.
3114 -------------------------------------------------------------------------------
3115 */
3116 flag float64_le( float64 a, float64 b )
3117 {
3118     flag aSign, bSign;
3119 
3120     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3121          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3122        ) {
3123         float_raise( float_flag_invalid );
3124         return 0;
3125     }
3126     aSign = extractFloat64Sign( a );
3127     bSign = extractFloat64Sign( b );
3128     if ( aSign != bSign )
3129 	return aSign ||
3130 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3131 	      0 );
3132     return ( a == b ) ||
3133 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3134 
3135 }
3136 
3137 /*
3138 -------------------------------------------------------------------------------
3139 Returns 1 if the double-precision floating-point value `a' is less than
3140 the corresponding value `b', and 0 otherwise.  The comparison is performed
3141 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3142 -------------------------------------------------------------------------------
3143 */
3144 flag float64_lt( float64 a, float64 b )
3145 {
3146     flag aSign, bSign;
3147 
3148     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3149          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3150        ) {
3151         float_raise( float_flag_invalid );
3152         return 0;
3153     }
3154     aSign = extractFloat64Sign( a );
3155     bSign = extractFloat64Sign( b );
3156     if ( aSign != bSign )
3157 	return aSign &&
3158 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3159 	      0 );
3160     return ( a != b ) &&
3161 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3162 
3163 }
3164 
3165 #ifndef SOFTFLOAT_FOR_GCC
3166 /*
3167 -------------------------------------------------------------------------------
3168 Returns 1 if the double-precision floating-point value `a' is equal to the
3169 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3170 if either operand is a NaN.  Otherwise, the comparison is performed
3171 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3172 -------------------------------------------------------------------------------
3173 */
3174 flag float64_eq_signaling( float64 a, float64 b )
3175 {
3176 
3177     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3178          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3179        ) {
3180         float_raise( float_flag_invalid );
3181         return 0;
3182     }
3183     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3184 
3185 }
3186 
3187 /*
3188 -------------------------------------------------------------------------------
3189 Returns 1 if the double-precision floating-point value `a' is less than or
3190 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3191 cause an exception.  Otherwise, the comparison is performed according to the
3192 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3193 -------------------------------------------------------------------------------
3194 */
3195 flag float64_le_quiet( float64 a, float64 b )
3196 {
3197     flag aSign, bSign;
3198 
3199     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3200          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3201        ) {
3202         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3203             float_raise( float_flag_invalid );
3204         }
3205         return 0;
3206     }
3207     aSign = extractFloat64Sign( a );
3208     bSign = extractFloat64Sign( b );
3209     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3210     return ( a == b ) || ( aSign ^ ( a < b ) );
3211 
3212 }
3213 
3214 /*
3215 -------------------------------------------------------------------------------
3216 Returns 1 if the double-precision floating-point value `a' is less than
3217 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3218 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3219 Standard for Binary Floating-Point Arithmetic.
3220 -------------------------------------------------------------------------------
3221 */
3222 flag float64_lt_quiet( float64 a, float64 b )
3223 {
3224     flag aSign, bSign;
3225 
3226     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3227          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3228        ) {
3229         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3230             float_raise( float_flag_invalid );
3231         }
3232         return 0;
3233     }
3234     aSign = extractFloat64Sign( a );
3235     bSign = extractFloat64Sign( b );
3236     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3237     return ( a != b ) && ( aSign ^ ( a < b ) );
3238 
3239 }
3240 #endif
3241 
3242 #ifdef FLOATX80
3243 
3244 /*
3245 -------------------------------------------------------------------------------
3246 Returns the result of converting the extended double-precision floating-
3247 point value `a' to the 32-bit two's complement integer format.  The
3248 conversion is performed according to the IEC/IEEE Standard for Binary
3249 Floating-Point Arithmetic---which means in particular that the conversion
3250 is rounded according to the current rounding mode.  If `a' is a NaN, the
3251 largest positive integer is returned.  Otherwise, if the conversion
3252 overflows, the largest integer with the same sign as `a' is returned.
3253 -------------------------------------------------------------------------------
3254 */
3255 int32 floatx80_to_int32( floatx80 a )
3256 {
3257     flag aSign;
3258     int32 aExp, shiftCount;
3259     bits64 aSig;
3260 
3261     aSig = extractFloatx80Frac( a );
3262     aExp = extractFloatx80Exp( a );
3263     aSign = extractFloatx80Sign( a );
3264     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3265     shiftCount = 0x4037 - aExp;
3266     if ( shiftCount <= 0 ) shiftCount = 1;
3267     shift64RightJamming( aSig, shiftCount, &aSig );
3268     return roundAndPackInt32( aSign, aSig );
3269 
3270 }
3271 
3272 /*
3273 -------------------------------------------------------------------------------
3274 Returns the result of converting the extended double-precision floating-
3275 point value `a' to the 32-bit two's complement integer format.  The
3276 conversion is performed according to the IEC/IEEE Standard for Binary
3277 Floating-Point Arithmetic, except that the conversion is always rounded
3278 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3279 Otherwise, if the conversion overflows, the largest integer with the same
3280 sign as `a' is returned.
3281 -------------------------------------------------------------------------------
3282 */
3283 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3284 {
3285     flag aSign;
3286     int32 aExp, shiftCount;
3287     bits64 aSig, savedASig;
3288     int32 z;
3289 
3290     aSig = extractFloatx80Frac( a );
3291     aExp = extractFloatx80Exp( a );
3292     aSign = extractFloatx80Sign( a );
3293     if ( 0x401E < aExp ) {
3294         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3295         goto invalid;
3296     }
3297     else if ( aExp < 0x3FFF ) {
3298         if ( aExp || aSig ) float_set_inexact();
3299         return 0;
3300     }
3301     shiftCount = 0x403E - aExp;
3302     savedASig = aSig;
3303     aSig >>= shiftCount;
3304     z = aSig;
3305     if ( aSign ) z = - z;
3306     if ( ( z < 0 ) ^ aSign ) {
3307  invalid:
3308         float_raise( float_flag_invalid );
3309         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3310     }
3311     if ( ( aSig<<shiftCount ) != savedASig ) {
3312         float_set_inexact();
3313     }
3314     return z;
3315 
3316 }
3317 
3318 /*
3319 -------------------------------------------------------------------------------
3320 Returns the result of converting the extended double-precision floating-
3321 point value `a' to the 64-bit two's complement integer format.  The
3322 conversion is performed according to the IEC/IEEE Standard for Binary
3323 Floating-Point Arithmetic---which means in particular that the conversion
3324 is rounded according to the current rounding mode.  If `a' is a NaN,
3325 the largest positive integer is returned.  Otherwise, if the conversion
3326 overflows, the largest integer with the same sign as `a' is returned.
3327 -------------------------------------------------------------------------------
3328 */
3329 int64 floatx80_to_int64( floatx80 a )
3330 {
3331     flag aSign;
3332     int32 aExp, shiftCount;
3333     bits64 aSig, aSigExtra;
3334 
3335     aSig = extractFloatx80Frac( a );
3336     aExp = extractFloatx80Exp( a );
3337     aSign = extractFloatx80Sign( a );
3338     shiftCount = 0x403E - aExp;
3339     if ( shiftCount <= 0 ) {
3340         if ( shiftCount ) {
3341             float_raise( float_flag_invalid );
3342             if (    ! aSign
3343                  || (    ( aExp == 0x7FFF )
3344                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3345                ) {
3346                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3347             }
3348             return (sbits64) LIT64( 0x8000000000000000 );
3349         }
3350         aSigExtra = 0;
3351     }
3352     else {
3353         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3354     }
3355     return roundAndPackInt64( aSign, aSig, aSigExtra );
3356 
3357 }
3358 
3359 /*
3360 -------------------------------------------------------------------------------
3361 Returns the result of converting the extended double-precision floating-
3362 point value `a' to the 64-bit two's complement integer format.  The
3363 conversion is performed according to the IEC/IEEE Standard for Binary
3364 Floating-Point Arithmetic, except that the conversion is always rounded
3365 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3366 Otherwise, if the conversion overflows, the largest integer with the same
3367 sign as `a' is returned.
3368 -------------------------------------------------------------------------------
3369 */
3370 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3371 {
3372     flag aSign;
3373     int32 aExp, shiftCount;
3374     bits64 aSig;
3375     int64 z;
3376 
3377     aSig = extractFloatx80Frac( a );
3378     aExp = extractFloatx80Exp( a );
3379     aSign = extractFloatx80Sign( a );
3380     shiftCount = aExp - 0x403E;
3381     if ( 0 <= shiftCount ) {
3382         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3383         if ( ( a.high != 0xC03E ) || aSig ) {
3384             float_raise( float_flag_invalid );
3385             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3386                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3387             }
3388         }
3389         return (sbits64) LIT64( 0x8000000000000000 );
3390     }
3391     else if ( aExp < 0x3FFF ) {
3392         if ( aExp | aSig ) float_set_inexact();
3393         return 0;
3394     }
3395     z = aSig>>( - shiftCount );
3396     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3397         float_set_inexact();
3398     }
3399     if ( aSign ) z = - z;
3400     return z;
3401 
3402 }
3403 
3404 /*
3405 -------------------------------------------------------------------------------
3406 Returns the result of converting the extended double-precision floating-
3407 point value `a' to the single-precision floating-point format.  The
3408 conversion is performed according to the IEC/IEEE Standard for Binary
3409 Floating-Point Arithmetic.
3410 -------------------------------------------------------------------------------
3411 */
3412 float32 floatx80_to_float32( floatx80 a )
3413 {
3414     flag aSign;
3415     int32 aExp;
3416     bits64 aSig;
3417 
3418     aSig = extractFloatx80Frac( a );
3419     aExp = extractFloatx80Exp( a );
3420     aSign = extractFloatx80Sign( a );
3421     if ( aExp == 0x7FFF ) {
3422         if ( (bits64) ( aSig<<1 ) ) {
3423             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3424         }
3425         return packFloat32( aSign, 0xFF, 0 );
3426     }
3427     shift64RightJamming( aSig, 33, &aSig );
3428     if ( aExp || aSig ) aExp -= 0x3F81;
3429     return roundAndPackFloat32( aSign, aExp, aSig );
3430 
3431 }
3432 
3433 /*
3434 -------------------------------------------------------------------------------
3435 Returns the result of converting the extended double-precision floating-
3436 point value `a' to the double-precision floating-point format.  The
3437 conversion is performed according to the IEC/IEEE Standard for Binary
3438 Floating-Point Arithmetic.
3439 -------------------------------------------------------------------------------
3440 */
3441 float64 floatx80_to_float64( floatx80 a )
3442 {
3443     flag aSign;
3444     int32 aExp;
3445     bits64 aSig, zSig;
3446 
3447     aSig = extractFloatx80Frac( a );
3448     aExp = extractFloatx80Exp( a );
3449     aSign = extractFloatx80Sign( a );
3450     if ( aExp == 0x7FFF ) {
3451         if ( (bits64) ( aSig<<1 ) ) {
3452             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3453         }
3454         return packFloat64( aSign, 0x7FF, 0 );
3455     }
3456     shift64RightJamming( aSig, 1, &zSig );
3457     if ( aExp || aSig ) aExp -= 0x3C01;
3458     return roundAndPackFloat64( aSign, aExp, zSig );
3459 
3460 }
3461 
3462 #ifdef FLOAT128
3463 
3464 /*
3465 -------------------------------------------------------------------------------
3466 Returns the result of converting the extended double-precision floating-
3467 point value `a' to the quadruple-precision floating-point format.  The
3468 conversion is performed according to the IEC/IEEE Standard for Binary
3469 Floating-Point Arithmetic.
3470 -------------------------------------------------------------------------------
3471 */
3472 float128 floatx80_to_float128( floatx80 a )
3473 {
3474     flag aSign;
3475     int16 aExp;
3476     bits64 aSig, zSig0, zSig1;
3477 
3478     aSig = extractFloatx80Frac( a );
3479     aExp = extractFloatx80Exp( a );
3480     aSign = extractFloatx80Sign( a );
3481     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3482         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3483     }
3484     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3485     return packFloat128( aSign, aExp, zSig0, zSig1 );
3486 
3487 }
3488 
3489 #endif
3490 
3491 /*
3492 -------------------------------------------------------------------------------
3493 Rounds the extended double-precision floating-point value `a' to an integer,
3494 and returns the result as an extended quadruple-precision floating-point
3495 value.  The operation is performed according to the IEC/IEEE Standard for
3496 Binary Floating-Point Arithmetic.
3497 -------------------------------------------------------------------------------
3498 */
3499 floatx80 floatx80_round_to_int( floatx80 a )
3500 {
3501     flag aSign;
3502     int32 aExp;
3503     bits64 lastBitMask, roundBitsMask;
3504     int8 roundingMode;
3505     floatx80 z;
3506 
3507     aExp = extractFloatx80Exp( a );
3508     if ( 0x403E <= aExp ) {
3509         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3510             return propagateFloatx80NaN( a, a );
3511         }
3512         return a;
3513     }
3514     if ( aExp < 0x3FFF ) {
3515         if (    ( aExp == 0 )
3516              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3517             return a;
3518         }
3519         float_set_inexact();
3520         aSign = extractFloatx80Sign( a );
3521         switch ( float_rounding_mode() ) {
3522          case float_round_nearest_even:
3523             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3524                ) {
3525                 return
3526                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3527             }
3528             break;
3529          case float_round_down:
3530             return
3531                   aSign ?
3532                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3533                 : packFloatx80( 0, 0, 0 );
3534          case float_round_up:
3535             return
3536                   aSign ? packFloatx80( 1, 0, 0 )
3537                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3538         }
3539         return packFloatx80( aSign, 0, 0 );
3540     }
3541     lastBitMask = 1;
3542     lastBitMask <<= 0x403E - aExp;
3543     roundBitsMask = lastBitMask - 1;
3544     z = a;
3545     roundingMode = float_rounding_mode();
3546     if ( roundingMode == float_round_nearest_even ) {
3547         z.low += lastBitMask>>1;
3548         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3549     }
3550     else if ( roundingMode != float_round_to_zero ) {
3551         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3552             z.low += roundBitsMask;
3553         }
3554     }
3555     z.low &= ~ roundBitsMask;
3556     if ( z.low == 0 ) {
3557         ++z.high;
3558         z.low = LIT64( 0x8000000000000000 );
3559     }
3560     if ( z.low != a.low ) float_set_inexact();
3561     return z;
3562 
3563 }
3564 
3565 /*
3566 -------------------------------------------------------------------------------
3567 Returns the result of adding the absolute values of the extended double-
3568 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3569 negated before being returned.  `zSign' is ignored if the result is a NaN.
3570 The addition is performed according to the IEC/IEEE Standard for Binary
3571 Floating-Point Arithmetic.
3572 -------------------------------------------------------------------------------
3573 */
3574 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3575 {
3576     int32 aExp, bExp, zExp;
3577     bits64 aSig, bSig, zSig0, zSig1;
3578     int32 expDiff;
3579 
3580     aSig = extractFloatx80Frac( a );
3581     aExp = extractFloatx80Exp( a );
3582     bSig = extractFloatx80Frac( b );
3583     bExp = extractFloatx80Exp( b );
3584     expDiff = aExp - bExp;
3585     if ( 0 < expDiff ) {
3586         if ( aExp == 0x7FFF ) {
3587             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3588             return a;
3589         }
3590         if ( bExp == 0 ) --expDiff;
3591         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3592         zExp = aExp;
3593     }
3594     else if ( expDiff < 0 ) {
3595         if ( bExp == 0x7FFF ) {
3596             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3597             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3598         }
3599         if ( aExp == 0 ) ++expDiff;
3600         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3601         zExp = bExp;
3602     }
3603     else {
3604         if ( aExp == 0x7FFF ) {
3605             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3606                 return propagateFloatx80NaN( a, b );
3607             }
3608             return a;
3609         }
3610         zSig1 = 0;
3611         zSig0 = aSig + bSig;
3612         if ( aExp == 0 ) {
3613             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3614             goto roundAndPack;
3615         }
3616         zExp = aExp;
3617         goto shiftRight1;
3618     }
3619     zSig0 = aSig + bSig;
3620     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3621  shiftRight1:
3622     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3623     zSig0 |= LIT64( 0x8000000000000000 );
3624     ++zExp;
3625  roundAndPack:
3626     return
3627         roundAndPackFloatx80(
3628             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3629 
3630 }
3631 
3632 /*
3633 -------------------------------------------------------------------------------
3634 Returns the result of subtracting the absolute values of the extended
3635 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3636 difference is negated before being returned.  `zSign' is ignored if the
3637 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3638 Standard for Binary Floating-Point Arithmetic.
3639 -------------------------------------------------------------------------------
3640 */
3641 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3642 {
3643     int32 aExp, bExp, zExp;
3644     bits64 aSig, bSig, zSig0, zSig1;
3645     int32 expDiff;
3646     floatx80 z;
3647 
3648     aSig = extractFloatx80Frac( a );
3649     aExp = extractFloatx80Exp( a );
3650     bSig = extractFloatx80Frac( b );
3651     bExp = extractFloatx80Exp( b );
3652     expDiff = aExp - bExp;
3653     if ( 0 < expDiff ) goto aExpBigger;
3654     if ( expDiff < 0 ) goto bExpBigger;
3655     if ( aExp == 0x7FFF ) {
3656         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3657             return propagateFloatx80NaN( a, b );
3658         }
3659         float_raise( float_flag_invalid );
3660         z.low = floatx80_default_nan_low;
3661         z.high = floatx80_default_nan_high;
3662         return z;
3663     }
3664     if ( aExp == 0 ) {
3665         aExp = 1;
3666         bExp = 1;
3667     }
3668     zSig1 = 0;
3669     if ( bSig < aSig ) goto aBigger;
3670     if ( aSig < bSig ) goto bBigger;
3671     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3672  bExpBigger:
3673     if ( bExp == 0x7FFF ) {
3674         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3675         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3676     }
3677     if ( aExp == 0 ) ++expDiff;
3678     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3679  bBigger:
3680     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3681     zExp = bExp;
3682     zSign ^= 1;
3683     goto normalizeRoundAndPack;
3684  aExpBigger:
3685     if ( aExp == 0x7FFF ) {
3686         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3687         return a;
3688     }
3689     if ( bExp == 0 ) --expDiff;
3690     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3691  aBigger:
3692     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3693     zExp = aExp;
3694  normalizeRoundAndPack:
3695     return
3696         normalizeRoundAndPackFloatx80(
3697             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3698 
3699 }
3700 
3701 /*
3702 -------------------------------------------------------------------------------
3703 Returns the result of adding the extended double-precision floating-point
3704 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3705 Standard for Binary Floating-Point Arithmetic.
3706 -------------------------------------------------------------------------------
3707 */
3708 floatx80 floatx80_add( floatx80 a, floatx80 b )
3709 {
3710     flag aSign, bSign;
3711 
3712     aSign = extractFloatx80Sign( a );
3713     bSign = extractFloatx80Sign( b );
3714     if ( aSign == bSign ) {
3715         return addFloatx80Sigs( a, b, aSign );
3716     }
3717     else {
3718         return subFloatx80Sigs( a, b, aSign );
3719     }
3720 
3721 }
3722 
3723 /*
3724 -------------------------------------------------------------------------------
3725 Returns the result of subtracting the extended double-precision floating-
3726 point values `a' and `b'.  The operation is performed according to the
3727 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3728 -------------------------------------------------------------------------------
3729 */
3730 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3731 {
3732     flag aSign, bSign;
3733 
3734     aSign = extractFloatx80Sign( a );
3735     bSign = extractFloatx80Sign( b );
3736     if ( aSign == bSign ) {
3737         return subFloatx80Sigs( a, b, aSign );
3738     }
3739     else {
3740         return addFloatx80Sigs( a, b, aSign );
3741     }
3742 
3743 }
3744 
3745 /*
3746 -------------------------------------------------------------------------------
3747 Returns the result of multiplying the extended double-precision floating-
3748 point values `a' and `b'.  The operation is performed according to the
3749 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3750 -------------------------------------------------------------------------------
3751 */
3752 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3753 {
3754     flag aSign, bSign, zSign;
3755     int32 aExp, bExp, zExp;
3756     bits64 aSig, bSig, zSig0, zSig1;
3757     floatx80 z;
3758 
3759     aSig = extractFloatx80Frac( a );
3760     aExp = extractFloatx80Exp( a );
3761     aSign = extractFloatx80Sign( a );
3762     bSig = extractFloatx80Frac( b );
3763     bExp = extractFloatx80Exp( b );
3764     bSign = extractFloatx80Sign( b );
3765     zSign = aSign ^ bSign;
3766     if ( aExp == 0x7FFF ) {
3767         if (    (bits64) ( aSig<<1 )
3768              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3769             return propagateFloatx80NaN( a, b );
3770         }
3771         if ( ( bExp | bSig ) == 0 ) goto invalid;
3772         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3773     }
3774     if ( bExp == 0x7FFF ) {
3775         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3776         if ( ( aExp | aSig ) == 0 ) {
3777  invalid:
3778             float_raise( float_flag_invalid );
3779             z.low = floatx80_default_nan_low;
3780             z.high = floatx80_default_nan_high;
3781             return z;
3782         }
3783         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3784     }
3785     if ( aExp == 0 ) {
3786         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3787         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3788     }
3789     if ( bExp == 0 ) {
3790         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3791         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3792     }
3793     zExp = aExp + bExp - 0x3FFE;
3794     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3795     if ( 0 < (sbits64) zSig0 ) {
3796         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3797         --zExp;
3798     }
3799     return
3800         roundAndPackFloatx80(
3801             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3802 
3803 }
3804 
3805 /*
3806 -------------------------------------------------------------------------------
3807 Returns the result of dividing the extended double-precision floating-point
3808 value `a' by the corresponding value `b'.  The operation is performed
3809 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3810 -------------------------------------------------------------------------------
3811 */
3812 floatx80 floatx80_div( floatx80 a, floatx80 b )
3813 {
3814     flag aSign, bSign, zSign;
3815     int32 aExp, bExp, zExp;
3816     bits64 aSig, bSig, zSig0, zSig1;
3817     bits64 rem0, rem1, rem2, term0, term1, term2;
3818     floatx80 z;
3819 
3820     aSig = extractFloatx80Frac( a );
3821     aExp = extractFloatx80Exp( a );
3822     aSign = extractFloatx80Sign( a );
3823     bSig = extractFloatx80Frac( b );
3824     bExp = extractFloatx80Exp( b );
3825     bSign = extractFloatx80Sign( b );
3826     zSign = aSign ^ bSign;
3827     if ( aExp == 0x7FFF ) {
3828         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3829         if ( bExp == 0x7FFF ) {
3830             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3831             goto invalid;
3832         }
3833         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3834     }
3835     if ( bExp == 0x7FFF ) {
3836         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3837         return packFloatx80( zSign, 0, 0 );
3838     }
3839     if ( bExp == 0 ) {
3840         if ( bSig == 0 ) {
3841             if ( ( aExp | aSig ) == 0 ) {
3842  invalid:
3843                 float_raise( float_flag_invalid );
3844                 z.low = floatx80_default_nan_low;
3845                 z.high = floatx80_default_nan_high;
3846                 return z;
3847             }
3848             float_raise( float_flag_divbyzero );
3849             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3850         }
3851         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3852     }
3853     if ( aExp == 0 ) {
3854         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3855         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3856     }
3857     zExp = aExp - bExp + 0x3FFE;
3858     rem1 = 0;
3859     if ( bSig <= aSig ) {
3860         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3861         ++zExp;
3862     }
3863     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3864     mul64To128( bSig, zSig0, &term0, &term1 );
3865     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3866     while ( (sbits64) rem0 < 0 ) {
3867         --zSig0;
3868         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3869     }
3870     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3871     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3872         mul64To128( bSig, zSig1, &term1, &term2 );
3873         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3874         while ( (sbits64) rem1 < 0 ) {
3875             --zSig1;
3876             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3877         }
3878         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3879     }
3880     return
3881         roundAndPackFloatx80(
3882             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3883 
3884 }
3885 
3886 /*
3887 -------------------------------------------------------------------------------
3888 Returns the remainder of the extended double-precision floating-point value
3889 `a' with respect to the corresponding value `b'.  The operation is performed
3890 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3891 -------------------------------------------------------------------------------
3892 */
3893 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3894 {
3895     flag aSign, bSign, zSign;
3896     int32 aExp, bExp, expDiff;
3897     bits64 aSig0, aSig1, bSig;
3898     bits64 q, term0, term1, alternateASig0, alternateASig1;
3899     floatx80 z;
3900 
3901     aSig0 = extractFloatx80Frac( a );
3902     aExp = extractFloatx80Exp( a );
3903     aSign = extractFloatx80Sign( a );
3904     bSig = extractFloatx80Frac( b );
3905     bExp = extractFloatx80Exp( b );
3906     bSign = extractFloatx80Sign( b );
3907     if ( aExp == 0x7FFF ) {
3908         if (    (bits64) ( aSig0<<1 )
3909              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3910             return propagateFloatx80NaN( a, b );
3911         }
3912         goto invalid;
3913     }
3914     if ( bExp == 0x7FFF ) {
3915         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3916         return a;
3917     }
3918     if ( bExp == 0 ) {
3919         if ( bSig == 0 ) {
3920  invalid:
3921             float_raise( float_flag_invalid );
3922             z.low = floatx80_default_nan_low;
3923             z.high = floatx80_default_nan_high;
3924             return z;
3925         }
3926         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3927     }
3928     if ( aExp == 0 ) {
3929         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3930         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3931     }
3932     bSig |= LIT64( 0x8000000000000000 );
3933     zSign = aSign;
3934     expDiff = aExp - bExp;
3935     aSig1 = 0;
3936     if ( expDiff < 0 ) {
3937         if ( expDiff < -1 ) return a;
3938         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3939         expDiff = 0;
3940     }
3941     q = ( bSig <= aSig0 );
3942     if ( q ) aSig0 -= bSig;
3943     expDiff -= 64;
3944     while ( 0 < expDiff ) {
3945         q = estimateDiv128To64( aSig0, aSig1, bSig );
3946         q = ( 2 < q ) ? q - 2 : 0;
3947         mul64To128( bSig, q, &term0, &term1 );
3948         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3949         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3950         expDiff -= 62;
3951     }
3952     expDiff += 64;
3953     if ( 0 < expDiff ) {
3954         q = estimateDiv128To64( aSig0, aSig1, bSig );
3955         q = ( 2 < q ) ? q - 2 : 0;
3956         q >>= 64 - expDiff;
3957         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3958         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3959         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3960         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3961             ++q;
3962             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3963         }
3964     }
3965     else {
3966         term1 = 0;
3967         term0 = bSig;
3968     }
3969     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3970     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3971          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3972               && ( q & 1 ) )
3973        ) {
3974         aSig0 = alternateASig0;
3975         aSig1 = alternateASig1;
3976         zSign = ! zSign;
3977     }
3978     return
3979         normalizeRoundAndPackFloatx80(
3980             80, zSign, bExp + expDiff, aSig0, aSig1 );
3981 
3982 }
3983 
3984 /*
3985 -------------------------------------------------------------------------------
3986 Returns the square root of the extended double-precision floating-point
3987 value `a'.  The operation is performed according to the IEC/IEEE Standard
3988 for Binary Floating-Point Arithmetic.
3989 -------------------------------------------------------------------------------
3990 */
3991 floatx80 floatx80_sqrt( floatx80 a )
3992 {
3993     flag aSign;
3994     int32 aExp, zExp;
3995     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3996     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3997     floatx80 z;
3998 
3999     aSig0 = extractFloatx80Frac( a );
4000     aExp = extractFloatx80Exp( a );
4001     aSign = extractFloatx80Sign( a );
4002     if ( aExp == 0x7FFF ) {
4003         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4004         if ( ! aSign ) return a;
4005         goto invalid;
4006     }
4007     if ( aSign ) {
4008         if ( ( aExp | aSig0 ) == 0 ) return a;
4009  invalid:
4010         float_raise( float_flag_invalid );
4011         z.low = floatx80_default_nan_low;
4012         z.high = floatx80_default_nan_high;
4013         return z;
4014     }
4015     if ( aExp == 0 ) {
4016         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4017         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4018     }
4019     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4020     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4021     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4022     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4023     doubleZSig0 = zSig0<<1;
4024     mul64To128( zSig0, zSig0, &term0, &term1 );
4025     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4026     while ( (sbits64) rem0 < 0 ) {
4027         --zSig0;
4028         doubleZSig0 -= 2;
4029         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4030     }
4031     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4032     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4033         if ( zSig1 == 0 ) zSig1 = 1;
4034         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4035         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4036         mul64To128( zSig1, zSig1, &term2, &term3 );
4037         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4038         while ( (sbits64) rem1 < 0 ) {
4039             --zSig1;
4040             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4041             term3 |= 1;
4042             term2 |= doubleZSig0;
4043             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4044         }
4045         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4046     }
4047     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4048     zSig0 |= doubleZSig0;
4049     return
4050         roundAndPackFloatx80(
4051             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4052 
4053 }
4054 
4055 /*
4056 -------------------------------------------------------------------------------
4057 Returns 1 if the extended double-precision floating-point value `a' is
4058 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4059 performed according to the IEC/IEEE Standard for Binary Floating-Point
4060 Arithmetic.
4061 -------------------------------------------------------------------------------
4062 */
4063 flag floatx80_eq( floatx80 a, floatx80 b )
4064 {
4065 
4066     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4067               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4068          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4069               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4070        ) {
4071         if (    floatx80_is_signaling_nan( a )
4072              || floatx80_is_signaling_nan( b ) ) {
4073             float_raise( float_flag_invalid );
4074         }
4075         return 0;
4076     }
4077     return
4078            ( a.low == b.low )
4079         && (    ( a.high == b.high )
4080              || (    ( a.low == 0 )
4081                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4082            );
4083 
4084 }
4085 
4086 /*
4087 -------------------------------------------------------------------------------
4088 Returns 1 if the extended double-precision floating-point value `a' is
4089 less than or equal to the corresponding value `b', and 0 otherwise.  The
4090 comparison is performed according to the IEC/IEEE Standard for Binary
4091 Floating-Point Arithmetic.
4092 -------------------------------------------------------------------------------
4093 */
4094 flag floatx80_le( floatx80 a, floatx80 b )
4095 {
4096     flag aSign, bSign;
4097 
4098     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4099               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4100          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4101               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4102        ) {
4103         float_raise( float_flag_invalid );
4104         return 0;
4105     }
4106     aSign = extractFloatx80Sign( a );
4107     bSign = extractFloatx80Sign( b );
4108     if ( aSign != bSign ) {
4109         return
4110                aSign
4111             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4112                  == 0 );
4113     }
4114     return
4115           aSign ? le128( b.high, b.low, a.high, a.low )
4116         : le128( a.high, a.low, b.high, b.low );
4117 
4118 }
4119 
4120 /*
4121 -------------------------------------------------------------------------------
4122 Returns 1 if the extended double-precision floating-point value `a' is
4123 less than the corresponding value `b', and 0 otherwise.  The comparison
4124 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4125 Arithmetic.
4126 -------------------------------------------------------------------------------
4127 */
4128 flag floatx80_lt( floatx80 a, floatx80 b )
4129 {
4130     flag aSign, bSign;
4131 
4132     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4133               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4134          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4135               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4136        ) {
4137         float_raise( float_flag_invalid );
4138         return 0;
4139     }
4140     aSign = extractFloatx80Sign( a );
4141     bSign = extractFloatx80Sign( b );
4142     if ( aSign != bSign ) {
4143         return
4144                aSign
4145             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4146                  != 0 );
4147     }
4148     return
4149           aSign ? lt128( b.high, b.low, a.high, a.low )
4150         : lt128( a.high, a.low, b.high, b.low );
4151 
4152 }
4153 
4154 /*
4155 -------------------------------------------------------------------------------
4156 Returns 1 if the extended double-precision floating-point value `a' is equal
4157 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4158 raised if either operand is a NaN.  Otherwise, the comparison is performed
4159 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4160 -------------------------------------------------------------------------------
4161 */
4162 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4163 {
4164 
4165     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4166               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4167          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4168               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4169        ) {
4170         float_raise( float_flag_invalid );
4171         return 0;
4172     }
4173     return
4174            ( a.low == b.low )
4175         && (    ( a.high == b.high )
4176              || (    ( a.low == 0 )
4177                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4178            );
4179 
4180 }
4181 
4182 /*
4183 -------------------------------------------------------------------------------
4184 Returns 1 if the extended double-precision floating-point value `a' is less
4185 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4186 do not cause an exception.  Otherwise, the comparison is performed according
4187 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4188 -------------------------------------------------------------------------------
4189 */
4190 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4191 {
4192     flag aSign, bSign;
4193 
4194     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4195               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4196          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4197               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4198        ) {
4199         if (    floatx80_is_signaling_nan( a )
4200              || floatx80_is_signaling_nan( b ) ) {
4201             float_raise( float_flag_invalid );
4202         }
4203         return 0;
4204     }
4205     aSign = extractFloatx80Sign( a );
4206     bSign = extractFloatx80Sign( b );
4207     if ( aSign != bSign ) {
4208         return
4209                aSign
4210             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4211                  == 0 );
4212     }
4213     return
4214           aSign ? le128( b.high, b.low, a.high, a.low )
4215         : le128( a.high, a.low, b.high, b.low );
4216 
4217 }
4218 
4219 /*
4220 -------------------------------------------------------------------------------
4221 Returns 1 if the extended double-precision floating-point value `a' is less
4222 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4223 an exception.  Otherwise, the comparison is performed according to the
4224 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4225 -------------------------------------------------------------------------------
4226 */
4227 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4228 {
4229     flag aSign, bSign;
4230 
4231     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4232               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4233          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4234               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4235        ) {
4236         if (    floatx80_is_signaling_nan( a )
4237              || floatx80_is_signaling_nan( b ) ) {
4238             float_raise( float_flag_invalid );
4239         }
4240         return 0;
4241     }
4242     aSign = extractFloatx80Sign( a );
4243     bSign = extractFloatx80Sign( b );
4244     if ( aSign != bSign ) {
4245         return
4246                aSign
4247             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4248                  != 0 );
4249     }
4250     return
4251           aSign ? lt128( b.high, b.low, a.high, a.low )
4252         : lt128( a.high, a.low, b.high, b.low );
4253 
4254 }
4255 
4256 #endif
4257 
4258 #ifdef FLOAT128
4259 
4260 /*
4261 -------------------------------------------------------------------------------
4262 Returns the result of converting the quadruple-precision floating-point
4263 value `a' to the 32-bit two's complement integer format.  The conversion
4264 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4265 Arithmetic---which means in particular that the conversion is rounded
4266 according to the current rounding mode.  If `a' is a NaN, the largest
4267 positive integer is returned.  Otherwise, if the conversion overflows, the
4268 largest integer with the same sign as `a' is returned.
4269 -------------------------------------------------------------------------------
4270 */
4271 int32 float128_to_int32( float128 a )
4272 {
4273     flag aSign;
4274     int32 aExp, shiftCount;
4275     bits64 aSig0, aSig1;
4276 
4277     aSig1 = extractFloat128Frac1( a );
4278     aSig0 = extractFloat128Frac0( a );
4279     aExp = extractFloat128Exp( a );
4280     aSign = extractFloat128Sign( a );
4281     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4282     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4283     aSig0 |= ( aSig1 != 0 );
4284     shiftCount = 0x4028 - aExp;
4285     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4286     return roundAndPackInt32( aSign, aSig0 );
4287 
4288 }
4289 
4290 /*
4291 -------------------------------------------------------------------------------
4292 Returns the result of converting the quadruple-precision floating-point
4293 value `a' to the 32-bit two's complement integer format.  The conversion
4294 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4295 Arithmetic, except that the conversion is always rounded toward zero.  If
4296 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4297 conversion overflows, the largest integer with the same sign as `a' is
4298 returned.
4299 -------------------------------------------------------------------------------
4300 */
4301 int32 float128_to_int32_round_to_zero( float128 a )
4302 {
4303     flag aSign;
4304     int32 aExp, shiftCount;
4305     bits64 aSig0, aSig1, savedASig;
4306     int32 z;
4307 
4308     aSig1 = extractFloat128Frac1( a );
4309     aSig0 = extractFloat128Frac0( a );
4310     aExp = extractFloat128Exp( a );
4311     aSign = extractFloat128Sign( a );
4312     aSig0 |= ( aSig1 != 0 );
4313     if ( 0x401E < aExp ) {
4314         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4315         goto invalid;
4316     }
4317     else if ( aExp < 0x3FFF ) {
4318         if ( aExp || aSig0 ) float_set_inexact();
4319         return 0;
4320     }
4321     aSig0 |= LIT64( 0x0001000000000000 );
4322     shiftCount = 0x402F - aExp;
4323     savedASig = aSig0;
4324     aSig0 >>= shiftCount;
4325     z = aSig0;
4326     if ( aSign ) z = - z;
4327     if ( ( z < 0 ) ^ aSign ) {
4328  invalid:
4329         float_raise( float_flag_invalid );
4330         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4331     }
4332     if ( ( aSig0<<shiftCount ) != savedASig ) {
4333         float_set_inexact();
4334     }
4335     return z;
4336 
4337 }
4338 
4339 /*
4340 -------------------------------------------------------------------------------
4341 Returns the result of converting the quadruple-precision floating-point
4342 value `a' to the 64-bit two's complement integer format.  The conversion
4343 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4344 Arithmetic---which means in particular that the conversion is rounded
4345 according to the current rounding mode.  If `a' is a NaN, the largest
4346 positive integer is returned.  Otherwise, if the conversion overflows, the
4347 largest integer with the same sign as `a' is returned.
4348 -------------------------------------------------------------------------------
4349 */
4350 int64 float128_to_int64( float128 a )
4351 {
4352     flag aSign;
4353     int32 aExp, shiftCount;
4354     bits64 aSig0, aSig1;
4355 
4356     aSig1 = extractFloat128Frac1( a );
4357     aSig0 = extractFloat128Frac0( a );
4358     aExp = extractFloat128Exp( a );
4359     aSign = extractFloat128Sign( a );
4360     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4361     shiftCount = 0x402F - aExp;
4362     if ( shiftCount <= 0 ) {
4363         if ( 0x403E < aExp ) {
4364             float_raise( float_flag_invalid );
4365             if (    ! aSign
4366                  || (    ( aExp == 0x7FFF )
4367                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4368                     )
4369                ) {
4370                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4371             }
4372             return (sbits64) LIT64( 0x8000000000000000 );
4373         }
4374         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4375     }
4376     else {
4377         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4378     }
4379     return roundAndPackInt64( aSign, aSig0, aSig1 );
4380 
4381 }
4382 
4383 /*
4384 -------------------------------------------------------------------------------
4385 Returns the result of converting the quadruple-precision floating-point
4386 value `a' to the 64-bit two's complement integer format.  The conversion
4387 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4388 Arithmetic, except that the conversion is always rounded toward zero.
4389 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4390 the conversion overflows, the largest integer with the same sign as `a' is
4391 returned.
4392 -------------------------------------------------------------------------------
4393 */
4394 int64 float128_to_int64_round_to_zero( float128 a )
4395 {
4396     flag aSign;
4397     int32 aExp, shiftCount;
4398     bits64 aSig0, aSig1;
4399     int64 z;
4400 
4401     aSig1 = extractFloat128Frac1( a );
4402     aSig0 = extractFloat128Frac0( a );
4403     aExp = extractFloat128Exp( a );
4404     aSign = extractFloat128Sign( a );
4405     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4406     shiftCount = aExp - 0x402F;
4407     if ( 0 < shiftCount ) {
4408         if ( 0x403E <= aExp ) {
4409             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4410             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4411                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4412                 if ( aSig1 ) float_set_inexact();
4413             }
4414             else {
4415                 float_raise( float_flag_invalid );
4416                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4417                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4418                 }
4419             }
4420             return (sbits64) LIT64( 0x8000000000000000 );
4421         }
4422         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4423         if ( (bits64) ( aSig1<<shiftCount ) ) {
4424             float_set_inexact();
4425         }
4426     }
4427     else {
4428         if ( aExp < 0x3FFF ) {
4429             if ( aExp | aSig0 | aSig1 ) {
4430                 float_set_inexact();
4431             }
4432             return 0;
4433         }
4434         z = aSig0>>( - shiftCount );
4435         if (    aSig1
4436              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4437             float_set_inexact();
4438         }
4439     }
4440     if ( aSign ) z = - z;
4441     return z;
4442 
4443 }
4444 
4445 /*
4446 -------------------------------------------------------------------------------
4447 Returns the result of converting the quadruple-precision floating-point
4448 value `a' to the single-precision floating-point format.  The conversion
4449 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4450 Arithmetic.
4451 -------------------------------------------------------------------------------
4452 */
4453 float32 float128_to_float32( float128 a )
4454 {
4455     flag aSign;
4456     int32 aExp;
4457     bits64 aSig0, aSig1;
4458     bits32 zSig;
4459 
4460     aSig1 = extractFloat128Frac1( a );
4461     aSig0 = extractFloat128Frac0( a );
4462     aExp = extractFloat128Exp( a );
4463     aSign = extractFloat128Sign( a );
4464     if ( aExp == 0x7FFF ) {
4465         if ( aSig0 | aSig1 ) {
4466             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4467         }
4468         return packFloat32( aSign, 0xFF, 0 );
4469     }
4470     aSig0 |= ( aSig1 != 0 );
4471     shift64RightJamming( aSig0, 18, &aSig0 );
4472     zSig = aSig0;
4473     if ( aExp || zSig ) {
4474         zSig |= 0x40000000;
4475         aExp -= 0x3F81;
4476     }
4477     return roundAndPackFloat32( aSign, aExp, zSig );
4478 
4479 }
4480 
4481 /*
4482 -------------------------------------------------------------------------------
4483 Returns the result of converting the quadruple-precision floating-point
4484 value `a' to the double-precision floating-point format.  The conversion
4485 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4486 Arithmetic.
4487 -------------------------------------------------------------------------------
4488 */
4489 float64 float128_to_float64( float128 a )
4490 {
4491     flag aSign;
4492     int32 aExp;
4493     bits64 aSig0, aSig1;
4494 
4495     aSig1 = extractFloat128Frac1( a );
4496     aSig0 = extractFloat128Frac0( a );
4497     aExp = extractFloat128Exp( a );
4498     aSign = extractFloat128Sign( a );
4499     if ( aExp == 0x7FFF ) {
4500         if ( aSig0 | aSig1 ) {
4501             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4502         }
4503         return packFloat64( aSign, 0x7FF, 0 );
4504     }
4505     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4506     aSig0 |= ( aSig1 != 0 );
4507     if ( aExp || aSig0 ) {
4508         aSig0 |= LIT64( 0x4000000000000000 );
4509         aExp -= 0x3C01;
4510     }
4511     return roundAndPackFloat64( aSign, aExp, aSig0 );
4512 
4513 }
4514 
4515 #ifdef FLOATX80
4516 
4517 /*
4518 -------------------------------------------------------------------------------
4519 Returns the result of converting the quadruple-precision floating-point
4520 value `a' to the extended double-precision floating-point format.  The
4521 conversion is performed according to the IEC/IEEE Standard for Binary
4522 Floating-Point Arithmetic.
4523 -------------------------------------------------------------------------------
4524 */
4525 floatx80 float128_to_floatx80( float128 a )
4526 {
4527     flag aSign;
4528     int32 aExp;
4529     bits64 aSig0, aSig1;
4530 
4531     aSig1 = extractFloat128Frac1( a );
4532     aSig0 = extractFloat128Frac0( a );
4533     aExp = extractFloat128Exp( a );
4534     aSign = extractFloat128Sign( a );
4535     if ( aExp == 0x7FFF ) {
4536         if ( aSig0 | aSig1 ) {
4537             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4538         }
4539         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4540     }
4541     if ( aExp == 0 ) {
4542         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4543         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4544     }
4545     else {
4546         aSig0 |= LIT64( 0x0001000000000000 );
4547     }
4548     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4549     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4550 
4551 }
4552 
4553 #endif
4554 
4555 /*
4556 -------------------------------------------------------------------------------
4557 Rounds the quadruple-precision floating-point value `a' to an integer, and
4558 returns the result as a quadruple-precision floating-point value.  The
4559 operation is performed according to the IEC/IEEE Standard for Binary
4560 Floating-Point Arithmetic.
4561 -------------------------------------------------------------------------------
4562 */
4563 float128 float128_round_to_int( float128 a )
4564 {
4565     flag aSign;
4566     int32 aExp;
4567     bits64 lastBitMask, roundBitsMask;
4568     int8 roundingMode;
4569     float128 z;
4570 
4571     aExp = extractFloat128Exp( a );
4572     if ( 0x402F <= aExp ) {
4573         if ( 0x406F <= aExp ) {
4574             if (    ( aExp == 0x7FFF )
4575                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4576                ) {
4577                 return propagateFloat128NaN( a, a );
4578             }
4579             return a;
4580         }
4581         lastBitMask = 1;
4582         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4583         roundBitsMask = lastBitMask - 1;
4584         z = a;
4585         roundingMode = float_rounding_mode();
4586         if ( roundingMode == float_round_nearest_even ) {
4587             if ( lastBitMask ) {
4588                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4589                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4590             }
4591             else {
4592                 if ( (sbits64) z.low < 0 ) {
4593                     ++z.high;
4594                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4595                 }
4596             }
4597         }
4598         else if ( roundingMode != float_round_to_zero ) {
4599             if (   extractFloat128Sign( z )
4600                  ^ ( roundingMode == float_round_up ) ) {
4601                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4602             }
4603         }
4604         z.low &= ~ roundBitsMask;
4605     }
4606     else {
4607         if ( aExp < 0x3FFF ) {
4608             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4609             float_set_inexact();
4610             aSign = extractFloat128Sign( a );
4611             switch ( float_rounding_mode() ) {
4612              case float_round_nearest_even:
4613                 if (    ( aExp == 0x3FFE )
4614                      && (   extractFloat128Frac0( a )
4615                           | extractFloat128Frac1( a ) )
4616                    ) {
4617                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4618                 }
4619                 break;
4620              case float_round_down:
4621                 return
4622                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4623                     : packFloat128( 0, 0, 0, 0 );
4624              case float_round_up:
4625                 return
4626                       aSign ? packFloat128( 1, 0, 0, 0 )
4627                     : packFloat128( 0, 0x3FFF, 0, 0 );
4628             }
4629             return packFloat128( aSign, 0, 0, 0 );
4630         }
4631         lastBitMask = 1;
4632         lastBitMask <<= 0x402F - aExp;
4633         roundBitsMask = lastBitMask - 1;
4634         z.low = 0;
4635         z.high = a.high;
4636         roundingMode = float_rounding_mode();
4637         if ( roundingMode == float_round_nearest_even ) {
4638             z.high += lastBitMask>>1;
4639             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4640                 z.high &= ~ lastBitMask;
4641             }
4642         }
4643         else if ( roundingMode != float_round_to_zero ) {
4644             if (   extractFloat128Sign( z )
4645                  ^ ( roundingMode == float_round_up ) ) {
4646                 z.high |= ( a.low != 0 );
4647                 z.high += roundBitsMask;
4648             }
4649         }
4650         z.high &= ~ roundBitsMask;
4651     }
4652     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4653         float_set_inexact();
4654     }
4655     return z;
4656 
4657 }
4658 
4659 /*
4660 -------------------------------------------------------------------------------
4661 Returns the result of adding the absolute values of the quadruple-precision
4662 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4663 before being returned.  `zSign' is ignored if the result is a NaN.
4664 The addition is performed according to the IEC/IEEE Standard for Binary
4665 Floating-Point Arithmetic.
4666 -------------------------------------------------------------------------------
4667 */
4668 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4669 {
4670     int32 aExp, bExp, zExp;
4671     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4672     int32 expDiff;
4673 
4674     aSig1 = extractFloat128Frac1( a );
4675     aSig0 = extractFloat128Frac0( a );
4676     aExp = extractFloat128Exp( a );
4677     bSig1 = extractFloat128Frac1( b );
4678     bSig0 = extractFloat128Frac0( b );
4679     bExp = extractFloat128Exp( b );
4680     expDiff = aExp - bExp;
4681     if ( 0 < expDiff ) {
4682         if ( aExp == 0x7FFF ) {
4683             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4684             return a;
4685         }
4686         if ( bExp == 0 ) {
4687             --expDiff;
4688         }
4689         else {
4690             bSig0 |= LIT64( 0x0001000000000000 );
4691         }
4692         shift128ExtraRightJamming(
4693             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4694         zExp = aExp;
4695     }
4696     else if ( expDiff < 0 ) {
4697         if ( bExp == 0x7FFF ) {
4698             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4699             return packFloat128( zSign, 0x7FFF, 0, 0 );
4700         }
4701         if ( aExp == 0 ) {
4702             ++expDiff;
4703         }
4704         else {
4705             aSig0 |= LIT64( 0x0001000000000000 );
4706         }
4707         shift128ExtraRightJamming(
4708             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4709         zExp = bExp;
4710     }
4711     else {
4712         if ( aExp == 0x7FFF ) {
4713             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4714                 return propagateFloat128NaN( a, b );
4715             }
4716             return a;
4717         }
4718         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4719         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4720         zSig2 = 0;
4721         zSig0 |= LIT64( 0x0002000000000000 );
4722         zExp = aExp;
4723         goto shiftRight1;
4724     }
4725     aSig0 |= LIT64( 0x0001000000000000 );
4726     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4727     --zExp;
4728     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4729     ++zExp;
4730  shiftRight1:
4731     shift128ExtraRightJamming(
4732         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4733  roundAndPack:
4734     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4735 
4736 }
4737 
4738 /*
4739 -------------------------------------------------------------------------------
4740 Returns the result of subtracting the absolute values of the quadruple-
4741 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4742 difference is negated before being returned.  `zSign' is ignored if the
4743 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4744 Standard for Binary Floating-Point Arithmetic.
4745 -------------------------------------------------------------------------------
4746 */
4747 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4748 {
4749     int32 aExp, bExp, zExp;
4750     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4751     int32 expDiff;
4752     float128 z;
4753 
4754     aSig1 = extractFloat128Frac1( a );
4755     aSig0 = extractFloat128Frac0( a );
4756     aExp = extractFloat128Exp( a );
4757     bSig1 = extractFloat128Frac1( b );
4758     bSig0 = extractFloat128Frac0( b );
4759     bExp = extractFloat128Exp( b );
4760     expDiff = aExp - bExp;
4761     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4762     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4763     if ( 0 < expDiff ) goto aExpBigger;
4764     if ( expDiff < 0 ) goto bExpBigger;
4765     if ( aExp == 0x7FFF ) {
4766         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4767             return propagateFloat128NaN( a, b );
4768         }
4769         float_raise( float_flag_invalid );
4770         z.low = float128_default_nan_low;
4771         z.high = float128_default_nan_high;
4772         return z;
4773     }
4774     if ( aExp == 0 ) {
4775         aExp = 1;
4776         bExp = 1;
4777     }
4778     if ( bSig0 < aSig0 ) goto aBigger;
4779     if ( aSig0 < bSig0 ) goto bBigger;
4780     if ( bSig1 < aSig1 ) goto aBigger;
4781     if ( aSig1 < bSig1 ) goto bBigger;
4782     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4783  bExpBigger:
4784     if ( bExp == 0x7FFF ) {
4785         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4786         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4787     }
4788     if ( aExp == 0 ) {
4789         ++expDiff;
4790     }
4791     else {
4792         aSig0 |= LIT64( 0x4000000000000000 );
4793     }
4794     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4795     bSig0 |= LIT64( 0x4000000000000000 );
4796  bBigger:
4797     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4798     zExp = bExp;
4799     zSign ^= 1;
4800     goto normalizeRoundAndPack;
4801  aExpBigger:
4802     if ( aExp == 0x7FFF ) {
4803         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4804         return a;
4805     }
4806     if ( bExp == 0 ) {
4807         --expDiff;
4808     }
4809     else {
4810         bSig0 |= LIT64( 0x4000000000000000 );
4811     }
4812     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4813     aSig0 |= LIT64( 0x4000000000000000 );
4814  aBigger:
4815     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4816     zExp = aExp;
4817  normalizeRoundAndPack:
4818     --zExp;
4819     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4820 
4821 }
4822 
4823 /*
4824 -------------------------------------------------------------------------------
4825 Returns the result of adding the quadruple-precision floating-point values
4826 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4827 for Binary Floating-Point Arithmetic.
4828 -------------------------------------------------------------------------------
4829 */
4830 float128 float128_add( float128 a, float128 b )
4831 {
4832     flag aSign, bSign;
4833 
4834     aSign = extractFloat128Sign( a );
4835     bSign = extractFloat128Sign( b );
4836     if ( aSign == bSign ) {
4837         return addFloat128Sigs( a, b, aSign );
4838     }
4839     else {
4840         return subFloat128Sigs( a, b, aSign );
4841     }
4842 
4843 }
4844 
4845 /*
4846 -------------------------------------------------------------------------------
4847 Returns the result of subtracting the quadruple-precision floating-point
4848 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4849 Standard for Binary Floating-Point Arithmetic.
4850 -------------------------------------------------------------------------------
4851 */
4852 float128 float128_sub( float128 a, float128 b )
4853 {
4854     flag aSign, bSign;
4855 
4856     aSign = extractFloat128Sign( a );
4857     bSign = extractFloat128Sign( b );
4858     if ( aSign == bSign ) {
4859         return subFloat128Sigs( a, b, aSign );
4860     }
4861     else {
4862         return addFloat128Sigs( a, b, aSign );
4863     }
4864 
4865 }
4866 
4867 /*
4868 -------------------------------------------------------------------------------
4869 Returns the result of multiplying the quadruple-precision floating-point
4870 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4871 Standard for Binary Floating-Point Arithmetic.
4872 -------------------------------------------------------------------------------
4873 */
4874 float128 float128_mul( float128 a, float128 b )
4875 {
4876     flag aSign, bSign, zSign;
4877     int32 aExp, bExp, zExp;
4878     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4879     float128 z;
4880 
4881     aSig1 = extractFloat128Frac1( a );
4882     aSig0 = extractFloat128Frac0( a );
4883     aExp = extractFloat128Exp( a );
4884     aSign = extractFloat128Sign( a );
4885     bSig1 = extractFloat128Frac1( b );
4886     bSig0 = extractFloat128Frac0( b );
4887     bExp = extractFloat128Exp( b );
4888     bSign = extractFloat128Sign( b );
4889     zSign = aSign ^ bSign;
4890     if ( aExp == 0x7FFF ) {
4891         if (    ( aSig0 | aSig1 )
4892              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4893             return propagateFloat128NaN( a, b );
4894         }
4895         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4896         return packFloat128( zSign, 0x7FFF, 0, 0 );
4897     }
4898     if ( bExp == 0x7FFF ) {
4899         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4900         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4901  invalid:
4902             float_raise( float_flag_invalid );
4903             z.low = float128_default_nan_low;
4904             z.high = float128_default_nan_high;
4905             return z;
4906         }
4907         return packFloat128( zSign, 0x7FFF, 0, 0 );
4908     }
4909     if ( aExp == 0 ) {
4910         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4911         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4912     }
4913     if ( bExp == 0 ) {
4914         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4915         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4916     }
4917     zExp = aExp + bExp - 0x4000;
4918     aSig0 |= LIT64( 0x0001000000000000 );
4919     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4920     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4921     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4922     zSig2 |= ( zSig3 != 0 );
4923     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4924         shift128ExtraRightJamming(
4925             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4926         ++zExp;
4927     }
4928     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4929 
4930 }
4931 
4932 /*
4933 -------------------------------------------------------------------------------
4934 Returns the result of dividing the quadruple-precision floating-point value
4935 `a' by the corresponding value `b'.  The operation is performed according to
4936 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4937 -------------------------------------------------------------------------------
4938 */
4939 float128 float128_div( float128 a, float128 b )
4940 {
4941     flag aSign, bSign, zSign;
4942     int32 aExp, bExp, zExp;
4943     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4944     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4945     float128 z;
4946 
4947     aSig1 = extractFloat128Frac1( a );
4948     aSig0 = extractFloat128Frac0( a );
4949     aExp = extractFloat128Exp( a );
4950     aSign = extractFloat128Sign( a );
4951     bSig1 = extractFloat128Frac1( b );
4952     bSig0 = extractFloat128Frac0( b );
4953     bExp = extractFloat128Exp( b );
4954     bSign = extractFloat128Sign( b );
4955     zSign = aSign ^ bSign;
4956     if ( aExp == 0x7FFF ) {
4957         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4958         if ( bExp == 0x7FFF ) {
4959             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4960             goto invalid;
4961         }
4962         return packFloat128( zSign, 0x7FFF, 0, 0 );
4963     }
4964     if ( bExp == 0x7FFF ) {
4965         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4966         return packFloat128( zSign, 0, 0, 0 );
4967     }
4968     if ( bExp == 0 ) {
4969         if ( ( bSig0 | bSig1 ) == 0 ) {
4970             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4971  invalid:
4972                 float_raise( float_flag_invalid );
4973                 z.low = float128_default_nan_low;
4974                 z.high = float128_default_nan_high;
4975                 return z;
4976             }
4977             float_raise( float_flag_divbyzero );
4978             return packFloat128( zSign, 0x7FFF, 0, 0 );
4979         }
4980         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4981     }
4982     if ( aExp == 0 ) {
4983         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4984         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4985     }
4986     zExp = aExp - bExp + 0x3FFD;
4987     shortShift128Left(
4988         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4989     shortShift128Left(
4990         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4991     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4992         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4993         ++zExp;
4994     }
4995     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4996     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4997     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4998     while ( (sbits64) rem0 < 0 ) {
4999         --zSig0;
5000         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5001     }
5002     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5003     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5004         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5005         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5006         while ( (sbits64) rem1 < 0 ) {
5007             --zSig1;
5008             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5009         }
5010         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5011     }
5012     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5013     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5014 
5015 }
5016 
5017 /*
5018 -------------------------------------------------------------------------------
5019 Returns the remainder of the quadruple-precision floating-point value `a'
5020 with respect to the corresponding value `b'.  The operation is performed
5021 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5022 -------------------------------------------------------------------------------
5023 */
5024 float128 float128_rem( float128 a, float128 b )
5025 {
5026     flag aSign, bSign, zSign;
5027     int32 aExp, bExp, expDiff;
5028     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5029     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5030     sbits64 sigMean0;
5031     float128 z;
5032 
5033     aSig1 = extractFloat128Frac1( a );
5034     aSig0 = extractFloat128Frac0( a );
5035     aExp = extractFloat128Exp( a );
5036     aSign = extractFloat128Sign( a );
5037     bSig1 = extractFloat128Frac1( b );
5038     bSig0 = extractFloat128Frac0( b );
5039     bExp = extractFloat128Exp( b );
5040     bSign = extractFloat128Sign( b );
5041     if ( aExp == 0x7FFF ) {
5042         if (    ( aSig0 | aSig1 )
5043              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5044             return propagateFloat128NaN( a, b );
5045         }
5046         goto invalid;
5047     }
5048     if ( bExp == 0x7FFF ) {
5049         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5050         return a;
5051     }
5052     if ( bExp == 0 ) {
5053         if ( ( bSig0 | bSig1 ) == 0 ) {
5054  invalid:
5055             float_raise( float_flag_invalid );
5056             z.low = float128_default_nan_low;
5057             z.high = float128_default_nan_high;
5058             return z;
5059         }
5060         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5061     }
5062     if ( aExp == 0 ) {
5063         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5064         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5065     }
5066     expDiff = aExp - bExp;
5067     if ( expDiff < -1 ) return a;
5068     shortShift128Left(
5069         aSig0 | LIT64( 0x0001000000000000 ),
5070         aSig1,
5071         15 - ( expDiff < 0 ),
5072         &aSig0,
5073         &aSig1
5074     );
5075     shortShift128Left(
5076         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5077     q = le128( bSig0, bSig1, aSig0, aSig1 );
5078     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5079     expDiff -= 64;
5080     while ( 0 < expDiff ) {
5081         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5082         q = ( 4 < q ) ? q - 4 : 0;
5083         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5084         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5085         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5086         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5087         expDiff -= 61;
5088     }
5089     if ( -64 < expDiff ) {
5090         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5091         q = ( 4 < q ) ? q - 4 : 0;
5092         q >>= - expDiff;
5093         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5094         expDiff += 52;
5095         if ( expDiff < 0 ) {
5096             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5097         }
5098         else {
5099             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5100         }
5101         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5102         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5103     }
5104     else {
5105         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5106         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5107     }
5108     do {
5109         alternateASig0 = aSig0;
5110         alternateASig1 = aSig1;
5111         ++q;
5112         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5113     } while ( 0 <= (sbits64) aSig0 );
5114     add128(
5115         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5116     if (    ( sigMean0 < 0 )
5117          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5118         aSig0 = alternateASig0;
5119         aSig1 = alternateASig1;
5120     }
5121     zSign = ( (sbits64) aSig0 < 0 );
5122     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5123     return
5124         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5125 
5126 }
5127 
5128 /*
5129 -------------------------------------------------------------------------------
5130 Returns the square root of the quadruple-precision floating-point value `a'.
5131 The operation is performed according to the IEC/IEEE Standard for Binary
5132 Floating-Point Arithmetic.
5133 -------------------------------------------------------------------------------
5134 */
5135 float128 float128_sqrt( float128 a )
5136 {
5137     flag aSign;
5138     int32 aExp, zExp;
5139     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5140     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5141     float128 z;
5142 
5143     aSig1 = extractFloat128Frac1( a );
5144     aSig0 = extractFloat128Frac0( a );
5145     aExp = extractFloat128Exp( a );
5146     aSign = extractFloat128Sign( a );
5147     if ( aExp == 0x7FFF ) {
5148         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5149         if ( ! aSign ) return a;
5150         goto invalid;
5151     }
5152     if ( aSign ) {
5153         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5154  invalid:
5155         float_raise( float_flag_invalid );
5156         z.low = float128_default_nan_low;
5157         z.high = float128_default_nan_high;
5158         return z;
5159     }
5160     if ( aExp == 0 ) {
5161         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5162         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5163     }
5164     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5165     aSig0 |= LIT64( 0x0001000000000000 );
5166     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5167     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5168     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5169     doubleZSig0 = zSig0<<1;
5170     mul64To128( zSig0, zSig0, &term0, &term1 );
5171     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5172     while ( (sbits64) rem0 < 0 ) {
5173         --zSig0;
5174         doubleZSig0 -= 2;
5175         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5176     }
5177     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5178     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5179         if ( zSig1 == 0 ) zSig1 = 1;
5180         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5181         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5182         mul64To128( zSig1, zSig1, &term2, &term3 );
5183         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5184         while ( (sbits64) rem1 < 0 ) {
5185             --zSig1;
5186             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5187             term3 |= 1;
5188             term2 |= doubleZSig0;
5189             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5190         }
5191         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5192     }
5193     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5194     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5195 
5196 }
5197 
5198 /*
5199 -------------------------------------------------------------------------------
5200 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5201 the corresponding value `b', and 0 otherwise.  The comparison is performed
5202 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5203 -------------------------------------------------------------------------------
5204 */
5205 flag float128_eq( float128 a, float128 b )
5206 {
5207 
5208     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5209               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5210          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5211               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5212        ) {
5213         if (    float128_is_signaling_nan( a )
5214              || float128_is_signaling_nan( b ) ) {
5215             float_raise( float_flag_invalid );
5216         }
5217         return 0;
5218     }
5219     return
5220            ( a.low == b.low )
5221         && (    ( a.high == b.high )
5222              || (    ( a.low == 0 )
5223                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5224            );
5225 
5226 }
5227 
5228 /*
5229 -------------------------------------------------------------------------------
5230 Returns 1 if the quadruple-precision floating-point value `a' is less than
5231 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5232 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5233 Arithmetic.
5234 -------------------------------------------------------------------------------
5235 */
5236 flag float128_le( float128 a, float128 b )
5237 {
5238     flag aSign, bSign;
5239 
5240     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5241               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5242          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5243               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5244        ) {
5245         float_raise( float_flag_invalid );
5246         return 0;
5247     }
5248     aSign = extractFloat128Sign( a );
5249     bSign = extractFloat128Sign( b );
5250     if ( aSign != bSign ) {
5251         return
5252                aSign
5253             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5254                  == 0 );
5255     }
5256     return
5257           aSign ? le128( b.high, b.low, a.high, a.low )
5258         : le128( a.high, a.low, b.high, b.low );
5259 
5260 }
5261 
5262 /*
5263 -------------------------------------------------------------------------------
5264 Returns 1 if the quadruple-precision floating-point value `a' is less than
5265 the corresponding value `b', and 0 otherwise.  The comparison is performed
5266 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5267 -------------------------------------------------------------------------------
5268 */
5269 flag float128_lt( float128 a, float128 b )
5270 {
5271     flag aSign, bSign;
5272 
5273     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5274               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5275          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5276               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5277        ) {
5278         float_raise( float_flag_invalid );
5279         return 0;
5280     }
5281     aSign = extractFloat128Sign( a );
5282     bSign = extractFloat128Sign( b );
5283     if ( aSign != bSign ) {
5284         return
5285                aSign
5286             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5287                  != 0 );
5288     }
5289     return
5290           aSign ? lt128( b.high, b.low, a.high, a.low )
5291         : lt128( a.high, a.low, b.high, b.low );
5292 
5293 }
5294 
5295 /*
5296 -------------------------------------------------------------------------------
5297 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5298 the corresponding value `b', and 0 otherwise.  The invalid exception is
5299 raised if either operand is a NaN.  Otherwise, the comparison is performed
5300 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5301 -------------------------------------------------------------------------------
5302 */
5303 flag float128_eq_signaling( float128 a, float128 b )
5304 {
5305 
5306     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5307               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5308          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5309               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5310        ) {
5311         float_raise( float_flag_invalid );
5312         return 0;
5313     }
5314     return
5315            ( a.low == b.low )
5316         && (    ( a.high == b.high )
5317              || (    ( a.low == 0 )
5318                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5319            );
5320 
5321 }
5322 
5323 /*
5324 -------------------------------------------------------------------------------
5325 Returns 1 if the quadruple-precision floating-point value `a' is less than
5326 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5327 cause an exception.  Otherwise, the comparison is performed according to the
5328 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5329 -------------------------------------------------------------------------------
5330 */
5331 flag float128_le_quiet( float128 a, float128 b )
5332 {
5333     flag aSign, bSign;
5334 
5335     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5336               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5337          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5338               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5339        ) {
5340         if (    float128_is_signaling_nan( a )
5341              || float128_is_signaling_nan( b ) ) {
5342             float_raise( float_flag_invalid );
5343         }
5344         return 0;
5345     }
5346     aSign = extractFloat128Sign( a );
5347     bSign = extractFloat128Sign( b );
5348     if ( aSign != bSign ) {
5349         return
5350                aSign
5351             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5352                  == 0 );
5353     }
5354     return
5355           aSign ? le128( b.high, b.low, a.high, a.low )
5356         : le128( a.high, a.low, b.high, b.low );
5357 
5358 }
5359 
5360 /*
5361 -------------------------------------------------------------------------------
5362 Returns 1 if the quadruple-precision floating-point value `a' is less than
5363 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5364 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5365 Standard for Binary Floating-Point Arithmetic.
5366 -------------------------------------------------------------------------------
5367 */
5368 flag float128_lt_quiet( float128 a, float128 b )
5369 {
5370     flag aSign, bSign;
5371 
5372     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5373               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5374          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5375               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5376        ) {
5377         if (    float128_is_signaling_nan( a )
5378              || float128_is_signaling_nan( b ) ) {
5379             float_raise( float_flag_invalid );
5380         }
5381         return 0;
5382     }
5383     aSign = extractFloat128Sign( a );
5384     bSign = extractFloat128Sign( b );
5385     if ( aSign != bSign ) {
5386         return
5387                aSign
5388             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5389                  != 0 );
5390     }
5391     return
5392           aSign ? lt128( b.high, b.low, a.high, a.low )
5393         : lt128( a.high, a.low, b.high, b.low );
5394 
5395 }
5396 
5397 #endif
5398 
5399 
5400 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5401 
5402 /*
5403  * These two routines are not part of the original softfloat distribution.
5404  *
5405  * They are based on the corresponding conversions to integer but return
5406  * unsigned numbers instead since these functions are required by GCC.
5407  *
5408  * Added by Mark Brinicombe <mark@netbsd.org>	27/09/97
5409  *
5410  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5411  */
5412 
5413 /*
5414 -------------------------------------------------------------------------------
5415 Returns the result of converting the double-precision floating-point value
5416 `a' to the 32-bit unsigned integer format.  The conversion is
5417 performed according to the IEC/IEEE Standard for Binary Floating-point
5418 Arithmetic, except that the conversion is always rounded toward zero.  If
5419 `a' is a NaN, the largest positive integer is returned.  If the conversion
5420 overflows, the largest integer positive is returned.
5421 -------------------------------------------------------------------------------
5422 */
5423 uint32 float64_to_uint32_round_to_zero( float64 a )
5424 {
5425     flag aSign;
5426     int16 aExp, shiftCount;
5427     bits64 aSig, savedASig;
5428     uint32 z;
5429 
5430     aSig = extractFloat64Frac( a );
5431     aExp = extractFloat64Exp( a );
5432     aSign = extractFloat64Sign( a );
5433 
5434     if (aSign) {
5435         float_raise( float_flag_invalid );
5436     	return(0);
5437     }
5438 
5439     if ( 0x41E < aExp ) {
5440         float_raise( float_flag_invalid );
5441         return 0xffffffff;
5442     }
5443     else if ( aExp < 0x3FF ) {
5444         if ( aExp || aSig ) float_set_inexact();
5445         return 0;
5446     }
5447     aSig |= LIT64( 0x0010000000000000 );
5448     shiftCount = 0x433 - aExp;
5449     savedASig = aSig;
5450     aSig >>= shiftCount;
5451     z = aSig;
5452     if ( ( aSig<<shiftCount ) != savedASig ) {
5453         float_set_inexact();
5454     }
5455     return z;
5456 
5457 }
5458 
5459 /*
5460 -------------------------------------------------------------------------------
5461 Returns the result of converting the single-precision floating-point value
5462 `a' to the 32-bit unsigned integer format.  The conversion is
5463 performed according to the IEC/IEEE Standard for Binary Floating-point
5464 Arithmetic, except that the conversion is always rounded toward zero.  If
5465 `a' is a NaN, the largest positive integer is returned.  If the conversion
5466 overflows, the largest positive integer is returned.
5467 -------------------------------------------------------------------------------
5468 */
5469 uint32 float32_to_uint32_round_to_zero( float32 a )
5470 {
5471     flag aSign;
5472     int16 aExp, shiftCount;
5473     bits32 aSig;
5474     uint32 z;
5475 
5476     aSig = extractFloat32Frac( a );
5477     aExp = extractFloat32Exp( a );
5478     aSign = extractFloat32Sign( a );
5479     shiftCount = aExp - 0x9E;
5480 
5481     if (aSign) {
5482         float_raise( float_flag_invalid );
5483     	return(0);
5484     }
5485     if ( 0 < shiftCount ) {
5486         float_raise( float_flag_invalid );
5487         return 0xFFFFFFFF;
5488     }
5489     else if ( aExp <= 0x7E ) {
5490         if ( aExp | aSig ) float_set_inexact();
5491         return 0;
5492     }
5493     aSig = ( aSig | 0x800000 )<<8;
5494     z = aSig>>( - shiftCount );
5495     if ( aSig<<( shiftCount & 31 ) ) {
5496         float_set_inexact();
5497     }
5498     return z;
5499 
5500 }
5501 
5502 #endif
5503