xref: /netbsd-src/lib/libc/softfloat/bits64/softfloat.c (revision 88fcb00c0357f2d7c1774f86a352637bfda96184)
1 /* $NetBSD: softfloat.c,v 1.6 2011/07/04 08:02:35 matt Exp $ */
2 
3 /*
4  * This version hacked for use with gcc -msoft-float by bjh21.
5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6  *  itself).
7  */
8 
9 /*
10  * Things you may want to define:
11  *
12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
14  *   properly renamed.
15  */
16 
17 /*
18 ===============================================================================
19 
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 #include <sys/cdefs.h>
48 #if defined(LIBC_SCCS) && !defined(lint)
49 __RCSID("$NetBSD: softfloat.c,v 1.6 2011/07/04 08:02:35 matt Exp $");
50 #endif /* LIBC_SCCS and not lint */
51 
52 #ifdef SOFTFLOAT_FOR_GCC
53 #include "softfloat-for-gcc.h"
54 #endif
55 
56 #include "milieu.h"
57 #include "softfloat.h"
58 
59 /*
60  * Conversions between floats as stored in memory and floats as
61  * SoftFloat uses them
62  */
63 #ifndef FLOAT64_DEMANGLE
64 #define FLOAT64_DEMANGLE(a)	(a)
65 #endif
66 #ifndef FLOAT64_MANGLE
67 #define FLOAT64_MANGLE(a)	(a)
68 #endif
69 
70 /*
71 -------------------------------------------------------------------------------
72 Floating-point rounding mode, extended double-precision rounding precision,
73 and exception flags.
74 -------------------------------------------------------------------------------
75 */
76 fp_rnd float_rounding_mode = float_round_nearest_even;
77 fp_except float_exception_flags = 0;
78 #ifdef FLOATX80
79 int8 floatx80_rounding_precision = 80;
80 #endif
81 
82 /*
83 -------------------------------------------------------------------------------
84 Primitive arithmetic functions, including multi-word arithmetic, and
85 division and square root approximations.  (Can be specialized to target if
86 desired.)
87 -------------------------------------------------------------------------------
88 */
89 #include "softfloat-macros"
90 
91 /*
92 -------------------------------------------------------------------------------
93 Functions and definitions to determine:  (1) whether tininess for underflow
94 is detected before or after rounding by default, (2) what (if anything)
95 happens when exceptions are raised, (3) how signaling NaNs are distinguished
96 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
97 are propagated from function inputs to output.  These details are target-
98 specific.
99 -------------------------------------------------------------------------------
100 */
101 #include "softfloat-specialize"
102 
103 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
104 /*
105 -------------------------------------------------------------------------------
106 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
107 and 7, and returns the properly rounded 32-bit integer corresponding to the
108 input.  If `zSign' is 1, the input is negated before being converted to an
109 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
110 is simply rounded to an integer, with the inexact exception raised if the
111 input cannot be represented exactly as an integer.  However, if the fixed-
112 point input is too large, the invalid exception is raised and the largest
113 positive or negative integer is returned.
114 -------------------------------------------------------------------------------
115 */
116 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
117 {
118     int8 roundingMode;
119     flag roundNearestEven;
120     int8 roundIncrement, roundBits;
121     int32 z;
122 
123     roundingMode = float_rounding_mode;
124     roundNearestEven = ( roundingMode == float_round_nearest_even );
125     roundIncrement = 0x40;
126     if ( ! roundNearestEven ) {
127         if ( roundingMode == float_round_to_zero ) {
128             roundIncrement = 0;
129         }
130         else {
131             roundIncrement = 0x7F;
132             if ( zSign ) {
133                 if ( roundingMode == float_round_up ) roundIncrement = 0;
134             }
135             else {
136                 if ( roundingMode == float_round_down ) roundIncrement = 0;
137             }
138         }
139     }
140     roundBits = absZ & 0x7F;
141     absZ = ( absZ + roundIncrement )>>7;
142     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
143     z = absZ;
144     if ( zSign ) z = - z;
145     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
146         float_raise( float_flag_invalid );
147         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
148     }
149     if ( roundBits ) float_exception_flags |= float_flag_inexact;
150     return z;
151 
152 }
153 
154 /*
155 -------------------------------------------------------------------------------
156 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
157 `absZ1', with binary point between bits 63 and 64 (between the input words),
158 and returns the properly rounded 64-bit integer corresponding to the input.
159 If `zSign' is 1, the input is negated before being converted to an integer.
160 Ordinarily, the fixed-point input is simply rounded to an integer, with
161 the inexact exception raised if the input cannot be represented exactly as
162 an integer.  However, if the fixed-point input is too large, the invalid
163 exception is raised and the largest positive or negative integer is
164 returned.
165 -------------------------------------------------------------------------------
166 */
167 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
168 {
169     int8 roundingMode;
170     flag roundNearestEven, increment;
171     int64 z;
172 
173     roundingMode = float_rounding_mode;
174     roundNearestEven = ( roundingMode == float_round_nearest_even );
175     increment = ( (sbits64) absZ1 < 0 );
176     if ( ! roundNearestEven ) {
177         if ( roundingMode == float_round_to_zero ) {
178             increment = 0;
179         }
180         else {
181             if ( zSign ) {
182                 increment = ( roundingMode == float_round_down ) && absZ1;
183             }
184             else {
185                 increment = ( roundingMode == float_round_up ) && absZ1;
186             }
187         }
188     }
189     if ( increment ) {
190         ++absZ0;
191         if ( absZ0 == 0 ) goto overflow;
192         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
193     }
194     z = absZ0;
195     if ( zSign ) z = - z;
196     if ( z && ( ( z < 0 ) ^ zSign ) ) {
197  overflow:
198         float_raise( float_flag_invalid );
199         return
200               zSign ? (sbits64) LIT64( 0x8000000000000000 )
201             : LIT64( 0x7FFFFFFFFFFFFFFF );
202     }
203     if ( absZ1 ) float_exception_flags |= float_flag_inexact;
204     return z;
205 
206 }
207 #endif
208 
209 /*
210 -------------------------------------------------------------------------------
211 Returns the fraction bits of the single-precision floating-point value `a'.
212 -------------------------------------------------------------------------------
213 */
214 INLINE bits32 extractFloat32Frac( float32 a )
215 {
216 
217     return a & 0x007FFFFF;
218 
219 }
220 
221 /*
222 -------------------------------------------------------------------------------
223 Returns the exponent bits of the single-precision floating-point value `a'.
224 -------------------------------------------------------------------------------
225 */
226 INLINE int16 extractFloat32Exp( float32 a )
227 {
228 
229     return ( a>>23 ) & 0xFF;
230 
231 }
232 
233 /*
234 -------------------------------------------------------------------------------
235 Returns the sign bit of the single-precision floating-point value `a'.
236 -------------------------------------------------------------------------------
237 */
238 INLINE flag extractFloat32Sign( float32 a )
239 {
240 
241     return a>>31;
242 
243 }
244 
245 /*
246 -------------------------------------------------------------------------------
247 Normalizes the subnormal single-precision floating-point value represented
248 by the denormalized significand `aSig'.  The normalized exponent and
249 significand are stored at the locations pointed to by `zExpPtr' and
250 `zSigPtr', respectively.
251 -------------------------------------------------------------------------------
252 */
253 static void
254  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
255 {
256     int8 shiftCount;
257 
258     shiftCount = countLeadingZeros32( aSig ) - 8;
259     *zSigPtr = aSig<<shiftCount;
260     *zExpPtr = 1 - shiftCount;
261 
262 }
263 
264 /*
265 -------------------------------------------------------------------------------
266 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
267 single-precision floating-point value, returning the result.  After being
268 shifted into the proper positions, the three fields are simply added
269 together to form the result.  This means that any integer portion of `zSig'
270 will be added into the exponent.  Since a properly normalized significand
271 will have an integer portion equal to 1, the `zExp' input should be 1 less
272 than the desired result exponent whenever `zSig' is a complete, normalized
273 significand.
274 -------------------------------------------------------------------------------
275 */
276 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
277 {
278 
279     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
280 
281 }
282 
283 /*
284 -------------------------------------------------------------------------------
285 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
286 and significand `zSig', and returns the proper single-precision floating-
287 point value corresponding to the abstract input.  Ordinarily, the abstract
288 value is simply rounded and packed into the single-precision format, with
289 the inexact exception raised if the abstract input cannot be represented
290 exactly.  However, if the abstract value is too large, the overflow and
291 inexact exceptions are raised and an infinity or maximal finite value is
292 returned.  If the abstract value is too small, the input value is rounded to
293 a subnormal number, and the underflow and inexact exceptions are raised if
294 the abstract input cannot be represented exactly as a subnormal single-
295 precision floating-point number.
296     The input significand `zSig' has its binary point between bits 30
297 and 29, which is 7 bits to the left of the usual location.  This shifted
298 significand must be normalized or smaller.  If `zSig' is not normalized,
299 `zExp' must be 0; in that case, the result returned is a subnormal number,
300 and it must not require rounding.  In the usual case that `zSig' is
301 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
302 The handling of underflow and overflow follows the IEC/IEEE Standard for
303 Binary Floating-Point Arithmetic.
304 -------------------------------------------------------------------------------
305 */
306 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
307 {
308     int8 roundingMode;
309     flag roundNearestEven;
310     int8 roundIncrement, roundBits;
311     flag isTiny;
312 
313     roundingMode = float_rounding_mode;
314     roundNearestEven = ( roundingMode == float_round_nearest_even );
315     roundIncrement = 0x40;
316     if ( ! roundNearestEven ) {
317         if ( roundingMode == float_round_to_zero ) {
318             roundIncrement = 0;
319         }
320         else {
321             roundIncrement = 0x7F;
322             if ( zSign ) {
323                 if ( roundingMode == float_round_up ) roundIncrement = 0;
324             }
325             else {
326                 if ( roundingMode == float_round_down ) roundIncrement = 0;
327             }
328         }
329     }
330     roundBits = zSig & 0x7F;
331     if ( 0xFD <= (bits16) zExp ) {
332         if (    ( 0xFD < zExp )
333              || (    ( zExp == 0xFD )
334                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
335            ) {
336             float_raise( float_flag_overflow | float_flag_inexact );
337             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
338         }
339         if ( zExp < 0 ) {
340             isTiny =
341                    ( float_detect_tininess == float_tininess_before_rounding )
342                 || ( zExp < -1 )
343                 || ( zSig + roundIncrement < 0x80000000 );
344             shift32RightJamming( zSig, - zExp, &zSig );
345             zExp = 0;
346             roundBits = zSig & 0x7F;
347             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
348         }
349     }
350     if ( roundBits ) float_exception_flags |= float_flag_inexact;
351     zSig = ( zSig + roundIncrement )>>7;
352     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
353     if ( zSig == 0 ) zExp = 0;
354     return packFloat32( zSign, zExp, zSig );
355 
356 }
357 
358 /*
359 -------------------------------------------------------------------------------
360 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
361 and significand `zSig', and returns the proper single-precision floating-
362 point value corresponding to the abstract input.  This routine is just like
363 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
364 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
365 floating-point exponent.
366 -------------------------------------------------------------------------------
367 */
368 static float32
369  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
370 {
371     int8 shiftCount;
372 
373     shiftCount = countLeadingZeros32( zSig ) - 1;
374     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
375 
376 }
377 
378 /*
379 -------------------------------------------------------------------------------
380 Returns the fraction bits of the double-precision floating-point value `a'.
381 -------------------------------------------------------------------------------
382 */
383 INLINE bits64 extractFloat64Frac( float64 a )
384 {
385 
386     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
387 
388 }
389 
390 /*
391 -------------------------------------------------------------------------------
392 Returns the exponent bits of the double-precision floating-point value `a'.
393 -------------------------------------------------------------------------------
394 */
395 INLINE int16 extractFloat64Exp( float64 a )
396 {
397 
398     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
399 
400 }
401 
402 /*
403 -------------------------------------------------------------------------------
404 Returns the sign bit of the double-precision floating-point value `a'.
405 -------------------------------------------------------------------------------
406 */
407 INLINE flag extractFloat64Sign( float64 a )
408 {
409 
410     return FLOAT64_DEMANGLE(a)>>63;
411 
412 }
413 
414 /*
415 -------------------------------------------------------------------------------
416 Normalizes the subnormal double-precision floating-point value represented
417 by the denormalized significand `aSig'.  The normalized exponent and
418 significand are stored at the locations pointed to by `zExpPtr' and
419 `zSigPtr', respectively.
420 -------------------------------------------------------------------------------
421 */
422 static void
423  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
424 {
425     int8 shiftCount;
426 
427     shiftCount = countLeadingZeros64( aSig ) - 11;
428     *zSigPtr = aSig<<shiftCount;
429     *zExpPtr = 1 - shiftCount;
430 
431 }
432 
433 /*
434 -------------------------------------------------------------------------------
435 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
436 double-precision floating-point value, returning the result.  After being
437 shifted into the proper positions, the three fields are simply added
438 together to form the result.  This means that any integer portion of `zSig'
439 will be added into the exponent.  Since a properly normalized significand
440 will have an integer portion equal to 1, the `zExp' input should be 1 less
441 than the desired result exponent whenever `zSig' is a complete, normalized
442 significand.
443 -------------------------------------------------------------------------------
444 */
445 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
446 {
447 
448     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
449 			   ( ( (bits64) zExp )<<52 ) + zSig );
450 
451 }
452 
453 /*
454 -------------------------------------------------------------------------------
455 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
456 and significand `zSig', and returns the proper double-precision floating-
457 point value corresponding to the abstract input.  Ordinarily, the abstract
458 value is simply rounded and packed into the double-precision format, with
459 the inexact exception raised if the abstract input cannot be represented
460 exactly.  However, if the abstract value is too large, the overflow and
461 inexact exceptions are raised and an infinity or maximal finite value is
462 returned.  If the abstract value is too small, the input value is rounded to
463 a subnormal number, and the underflow and inexact exceptions are raised if
464 the abstract input cannot be represented exactly as a subnormal double-
465 precision floating-point number.
466     The input significand `zSig' has its binary point between bits 62
467 and 61, which is 10 bits to the left of the usual location.  This shifted
468 significand must be normalized or smaller.  If `zSig' is not normalized,
469 `zExp' must be 0; in that case, the result returned is a subnormal number,
470 and it must not require rounding.  In the usual case that `zSig' is
471 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
472 The handling of underflow and overflow follows the IEC/IEEE Standard for
473 Binary Floating-Point Arithmetic.
474 -------------------------------------------------------------------------------
475 */
476 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
477 {
478     int8 roundingMode;
479     flag roundNearestEven;
480     int16 roundIncrement, roundBits;
481     flag isTiny;
482 
483     roundingMode = float_rounding_mode;
484     roundNearestEven = ( roundingMode == float_round_nearest_even );
485     roundIncrement = 0x200;
486     if ( ! roundNearestEven ) {
487         if ( roundingMode == float_round_to_zero ) {
488             roundIncrement = 0;
489         }
490         else {
491             roundIncrement = 0x3FF;
492             if ( zSign ) {
493                 if ( roundingMode == float_round_up ) roundIncrement = 0;
494             }
495             else {
496                 if ( roundingMode == float_round_down ) roundIncrement = 0;
497             }
498         }
499     }
500     roundBits = zSig & 0x3FF;
501     if ( 0x7FD <= (bits16) zExp ) {
502         if (    ( 0x7FD < zExp )
503              || (    ( zExp == 0x7FD )
504                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
505            ) {
506             float_raise( float_flag_overflow | float_flag_inexact );
507             return FLOAT64_MANGLE(
508 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
509 		( roundIncrement == 0 ));
510         }
511         if ( zExp < 0 ) {
512             isTiny =
513                    ( float_detect_tininess == float_tininess_before_rounding )
514                 || ( zExp < -1 )
515                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
516             shift64RightJamming( zSig, - zExp, &zSig );
517             zExp = 0;
518             roundBits = zSig & 0x3FF;
519             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
520         }
521     }
522     if ( roundBits ) float_exception_flags |= float_flag_inexact;
523     zSig = ( zSig + roundIncrement )>>10;
524     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
525     if ( zSig == 0 ) zExp = 0;
526     return packFloat64( zSign, zExp, zSig );
527 
528 }
529 
530 /*
531 -------------------------------------------------------------------------------
532 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
533 and significand `zSig', and returns the proper double-precision floating-
534 point value corresponding to the abstract input.  This routine is just like
535 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
536 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
537 floating-point exponent.
538 -------------------------------------------------------------------------------
539 */
540 static float64
541  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
542 {
543     int8 shiftCount;
544 
545     shiftCount = countLeadingZeros64( zSig ) - 1;
546     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
547 
548 }
549 
550 #ifdef FLOATX80
551 
552 /*
553 -------------------------------------------------------------------------------
554 Returns the fraction bits of the extended double-precision floating-point
555 value `a'.
556 -------------------------------------------------------------------------------
557 */
558 INLINE bits64 extractFloatx80Frac( floatx80 a )
559 {
560 
561     return a.low;
562 
563 }
564 
565 /*
566 -------------------------------------------------------------------------------
567 Returns the exponent bits of the extended double-precision floating-point
568 value `a'.
569 -------------------------------------------------------------------------------
570 */
571 INLINE int32 extractFloatx80Exp( floatx80 a )
572 {
573 
574     return a.high & 0x7FFF;
575 
576 }
577 
578 /*
579 -------------------------------------------------------------------------------
580 Returns the sign bit of the extended double-precision floating-point value
581 `a'.
582 -------------------------------------------------------------------------------
583 */
584 INLINE flag extractFloatx80Sign( floatx80 a )
585 {
586 
587     return a.high>>15;
588 
589 }
590 
591 /*
592 -------------------------------------------------------------------------------
593 Normalizes the subnormal extended double-precision floating-point value
594 represented by the denormalized significand `aSig'.  The normalized exponent
595 and significand are stored at the locations pointed to by `zExpPtr' and
596 `zSigPtr', respectively.
597 -------------------------------------------------------------------------------
598 */
599 static void
600  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
601 {
602     int8 shiftCount;
603 
604     shiftCount = countLeadingZeros64( aSig );
605     *zSigPtr = aSig<<shiftCount;
606     *zExpPtr = 1 - shiftCount;
607 
608 }
609 
610 /*
611 -------------------------------------------------------------------------------
612 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
613 extended double-precision floating-point value, returning the result.
614 -------------------------------------------------------------------------------
615 */
616 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
617 {
618     floatx80 z;
619 
620     z.low = zSig;
621     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
622     return z;
623 
624 }
625 
626 /*
627 -------------------------------------------------------------------------------
628 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
629 and extended significand formed by the concatenation of `zSig0' and `zSig1',
630 and returns the proper extended double-precision floating-point value
631 corresponding to the abstract input.  Ordinarily, the abstract value is
632 rounded and packed into the extended double-precision format, with the
633 inexact exception raised if the abstract input cannot be represented
634 exactly.  However, if the abstract value is too large, the overflow and
635 inexact exceptions are raised and an infinity or maximal finite value is
636 returned.  If the abstract value is too small, the input value is rounded to
637 a subnormal number, and the underflow and inexact exceptions are raised if
638 the abstract input cannot be represented exactly as a subnormal extended
639 double-precision floating-point number.
640     If `roundingPrecision' is 32 or 64, the result is rounded to the same
641 number of bits as single or double precision, respectively.  Otherwise, the
642 result is rounded to the full precision of the extended double-precision
643 format.
644     The input significand must be normalized or smaller.  If the input
645 significand is not normalized, `zExp' must be 0; in that case, the result
646 returned is a subnormal number, and it must not require rounding.  The
647 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
648 Floating-Point Arithmetic.
649 -------------------------------------------------------------------------------
650 */
651 static floatx80
652  roundAndPackFloatx80(
653      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
654  )
655 {
656     int8 roundingMode;
657     flag roundNearestEven, increment, isTiny;
658     int64 roundIncrement, roundMask, roundBits;
659 
660     roundingMode = float_rounding_mode;
661     roundNearestEven = ( roundingMode == float_round_nearest_even );
662     if ( roundingPrecision == 80 ) goto precision80;
663     if ( roundingPrecision == 64 ) {
664         roundIncrement = LIT64( 0x0000000000000400 );
665         roundMask = LIT64( 0x00000000000007FF );
666     }
667     else if ( roundingPrecision == 32 ) {
668         roundIncrement = LIT64( 0x0000008000000000 );
669         roundMask = LIT64( 0x000000FFFFFFFFFF );
670     }
671     else {
672         goto precision80;
673     }
674     zSig0 |= ( zSig1 != 0 );
675     if ( ! roundNearestEven ) {
676         if ( roundingMode == float_round_to_zero ) {
677             roundIncrement = 0;
678         }
679         else {
680             roundIncrement = roundMask;
681             if ( zSign ) {
682                 if ( roundingMode == float_round_up ) roundIncrement = 0;
683             }
684             else {
685                 if ( roundingMode == float_round_down ) roundIncrement = 0;
686             }
687         }
688     }
689     roundBits = zSig0 & roundMask;
690     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691         if (    ( 0x7FFE < zExp )
692              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
693            ) {
694             goto overflow;
695         }
696         if ( zExp <= 0 ) {
697             isTiny =
698                    ( float_detect_tininess == float_tininess_before_rounding )
699                 || ( zExp < 0 )
700                 || ( zSig0 <= zSig0 + roundIncrement );
701             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
702             zExp = 0;
703             roundBits = zSig0 & roundMask;
704             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
705             if ( roundBits ) float_exception_flags |= float_flag_inexact;
706             zSig0 += roundIncrement;
707             if ( (sbits64) zSig0 < 0 ) zExp = 1;
708             roundIncrement = roundMask + 1;
709             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
710                 roundMask |= roundIncrement;
711             }
712             zSig0 &= ~ roundMask;
713             return packFloatx80( zSign, zExp, zSig0 );
714         }
715     }
716     if ( roundBits ) float_exception_flags |= float_flag_inexact;
717     zSig0 += roundIncrement;
718     if ( zSig0 < roundIncrement ) {
719         ++zExp;
720         zSig0 = LIT64( 0x8000000000000000 );
721     }
722     roundIncrement = roundMask + 1;
723     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
724         roundMask |= roundIncrement;
725     }
726     zSig0 &= ~ roundMask;
727     if ( zSig0 == 0 ) zExp = 0;
728     return packFloatx80( zSign, zExp, zSig0 );
729  precision80:
730     increment = ( (sbits64) zSig1 < 0 );
731     if ( ! roundNearestEven ) {
732         if ( roundingMode == float_round_to_zero ) {
733             increment = 0;
734         }
735         else {
736             if ( zSign ) {
737                 increment = ( roundingMode == float_round_down ) && zSig1;
738             }
739             else {
740                 increment = ( roundingMode == float_round_up ) && zSig1;
741             }
742         }
743     }
744     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
745         if (    ( 0x7FFE < zExp )
746              || (    ( zExp == 0x7FFE )
747                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
748                   && increment
749                 )
750            ) {
751             roundMask = 0;
752  overflow:
753             float_raise( float_flag_overflow | float_flag_inexact );
754             if (    ( roundingMode == float_round_to_zero )
755                  || ( zSign && ( roundingMode == float_round_up ) )
756                  || ( ! zSign && ( roundingMode == float_round_down ) )
757                ) {
758                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
759             }
760             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
761         }
762         if ( zExp <= 0 ) {
763             isTiny =
764                    ( float_detect_tininess == float_tininess_before_rounding )
765                 || ( zExp < 0 )
766                 || ! increment
767                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
768             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
769             zExp = 0;
770             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
771             if ( zSig1 ) float_exception_flags |= float_flag_inexact;
772             if ( roundNearestEven ) {
773                 increment = ( (sbits64) zSig1 < 0 );
774             }
775             else {
776                 if ( zSign ) {
777                     increment = ( roundingMode == float_round_down ) && zSig1;
778                 }
779                 else {
780                     increment = ( roundingMode == float_round_up ) && zSig1;
781                 }
782             }
783             if ( increment ) {
784                 ++zSig0;
785                 zSig0 &=
786                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
787                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
788             }
789             return packFloatx80( zSign, zExp, zSig0 );
790         }
791     }
792     if ( zSig1 ) float_exception_flags |= float_flag_inexact;
793     if ( increment ) {
794         ++zSig0;
795         if ( zSig0 == 0 ) {
796             ++zExp;
797             zSig0 = LIT64( 0x8000000000000000 );
798         }
799         else {
800             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
801         }
802     }
803     else {
804         if ( zSig0 == 0 ) zExp = 0;
805     }
806     return packFloatx80( zSign, zExp, zSig0 );
807 
808 }
809 
810 /*
811 -------------------------------------------------------------------------------
812 Takes an abstract floating-point value having sign `zSign', exponent
813 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
814 and returns the proper extended double-precision floating-point value
815 corresponding to the abstract input.  This routine is just like
816 `roundAndPackFloatx80' except that the input significand does not have to be
817 normalized.
818 -------------------------------------------------------------------------------
819 */
820 static floatx80
821  normalizeRoundAndPackFloatx80(
822      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
823  )
824 {
825     int8 shiftCount;
826 
827     if ( zSig0 == 0 ) {
828         zSig0 = zSig1;
829         zSig1 = 0;
830         zExp -= 64;
831     }
832     shiftCount = countLeadingZeros64( zSig0 );
833     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
834     zExp -= shiftCount;
835     return
836         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
837 
838 }
839 
840 #endif
841 
842 #ifdef FLOAT128
843 
844 /*
845 -------------------------------------------------------------------------------
846 Returns the least-significant 64 fraction bits of the quadruple-precision
847 floating-point value `a'.
848 -------------------------------------------------------------------------------
849 */
850 INLINE bits64 extractFloat128Frac1( float128 a )
851 {
852 
853     return a.low;
854 
855 }
856 
857 /*
858 -------------------------------------------------------------------------------
859 Returns the most-significant 48 fraction bits of the quadruple-precision
860 floating-point value `a'.
861 -------------------------------------------------------------------------------
862 */
863 INLINE bits64 extractFloat128Frac0( float128 a )
864 {
865 
866     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
867 
868 }
869 
870 /*
871 -------------------------------------------------------------------------------
872 Returns the exponent bits of the quadruple-precision floating-point value
873 `a'.
874 -------------------------------------------------------------------------------
875 */
876 INLINE int32 extractFloat128Exp( float128 a )
877 {
878 
879     return ( a.high>>48 ) & 0x7FFF;
880 
881 }
882 
883 /*
884 -------------------------------------------------------------------------------
885 Returns the sign bit of the quadruple-precision floating-point value `a'.
886 -------------------------------------------------------------------------------
887 */
888 INLINE flag extractFloat128Sign( float128 a )
889 {
890 
891     return a.high>>63;
892 
893 }
894 
895 /*
896 -------------------------------------------------------------------------------
897 Normalizes the subnormal quadruple-precision floating-point value
898 represented by the denormalized significand formed by the concatenation of
899 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
900 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
901 significand are stored at the location pointed to by `zSig0Ptr', and the
902 least significant 64 bits of the normalized significand are stored at the
903 location pointed to by `zSig1Ptr'.
904 -------------------------------------------------------------------------------
905 */
906 static void
907  normalizeFloat128Subnormal(
908      bits64 aSig0,
909      bits64 aSig1,
910      int32 *zExpPtr,
911      bits64 *zSig0Ptr,
912      bits64 *zSig1Ptr
913  )
914 {
915     int8 shiftCount;
916 
917     if ( aSig0 == 0 ) {
918         shiftCount = countLeadingZeros64( aSig1 ) - 15;
919         if ( shiftCount < 0 ) {
920             *zSig0Ptr = aSig1>>( - shiftCount );
921             *zSig1Ptr = aSig1<<( shiftCount & 63 );
922         }
923         else {
924             *zSig0Ptr = aSig1<<shiftCount;
925             *zSig1Ptr = 0;
926         }
927         *zExpPtr = - shiftCount - 63;
928     }
929     else {
930         shiftCount = countLeadingZeros64( aSig0 ) - 15;
931         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
932         *zExpPtr = 1 - shiftCount;
933     }
934 
935 }
936 
937 /*
938 -------------------------------------------------------------------------------
939 Packs the sign `zSign', the exponent `zExp', and the significand formed
940 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
941 floating-point value, returning the result.  After being shifted into the
942 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
943 added together to form the most significant 32 bits of the result.  This
944 means that any integer portion of `zSig0' will be added into the exponent.
945 Since a properly normalized significand will have an integer portion equal
946 to 1, the `zExp' input should be 1 less than the desired result exponent
947 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
948 significand.
949 -------------------------------------------------------------------------------
950 */
951 INLINE float128
952  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
953 {
954     float128 z;
955 
956     z.low = zSig1;
957     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
958     return z;
959 
960 }
961 
962 /*
963 -------------------------------------------------------------------------------
964 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
965 and extended significand formed by the concatenation of `zSig0', `zSig1',
966 and `zSig2', and returns the proper quadruple-precision floating-point value
967 corresponding to the abstract input.  Ordinarily, the abstract value is
968 simply rounded and packed into the quadruple-precision format, with the
969 inexact exception raised if the abstract input cannot be represented
970 exactly.  However, if the abstract value is too large, the overflow and
971 inexact exceptions are raised and an infinity or maximal finite value is
972 returned.  If the abstract value is too small, the input value is rounded to
973 a subnormal number, and the underflow and inexact exceptions are raised if
974 the abstract input cannot be represented exactly as a subnormal quadruple-
975 precision floating-point number.
976     The input significand must be normalized or smaller.  If the input
977 significand is not normalized, `zExp' must be 0; in that case, the result
978 returned is a subnormal number, and it must not require rounding.  In the
979 usual case that the input significand is normalized, `zExp' must be 1 less
980 than the ``true'' floating-point exponent.  The handling of underflow and
981 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
982 -------------------------------------------------------------------------------
983 */
984 static float128
985  roundAndPackFloat128(
986      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
987 {
988     int8 roundingMode;
989     flag roundNearestEven, increment, isTiny;
990 
991     roundingMode = float_rounding_mode;
992     roundNearestEven = ( roundingMode == float_round_nearest_even );
993     increment = ( (sbits64) zSig2 < 0 );
994     if ( ! roundNearestEven ) {
995         if ( roundingMode == float_round_to_zero ) {
996             increment = 0;
997         }
998         else {
999             if ( zSign ) {
1000                 increment = ( roundingMode == float_round_down ) && zSig2;
1001             }
1002             else {
1003                 increment = ( roundingMode == float_round_up ) && zSig2;
1004             }
1005         }
1006     }
1007     if ( 0x7FFD <= (bits32) zExp ) {
1008         if (    ( 0x7FFD < zExp )
1009              || (    ( zExp == 0x7FFD )
1010                   && eq128(
1011                          LIT64( 0x0001FFFFFFFFFFFF ),
1012                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1013                          zSig0,
1014                          zSig1
1015                      )
1016                   && increment
1017                 )
1018            ) {
1019             float_raise( float_flag_overflow | float_flag_inexact );
1020             if (    ( roundingMode == float_round_to_zero )
1021                  || ( zSign && ( roundingMode == float_round_up ) )
1022                  || ( ! zSign && ( roundingMode == float_round_down ) )
1023                ) {
1024                 return
1025                     packFloat128(
1026                         zSign,
1027                         0x7FFE,
1028                         LIT64( 0x0000FFFFFFFFFFFF ),
1029                         LIT64( 0xFFFFFFFFFFFFFFFF )
1030                     );
1031             }
1032             return packFloat128( zSign, 0x7FFF, 0, 0 );
1033         }
1034         if ( zExp < 0 ) {
1035             isTiny =
1036                    ( float_detect_tininess == float_tininess_before_rounding )
1037                 || ( zExp < -1 )
1038                 || ! increment
1039                 || lt128(
1040                        zSig0,
1041                        zSig1,
1042                        LIT64( 0x0001FFFFFFFFFFFF ),
1043                        LIT64( 0xFFFFFFFFFFFFFFFF )
1044                    );
1045             shift128ExtraRightJamming(
1046                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1047             zExp = 0;
1048             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1049             if ( roundNearestEven ) {
1050                 increment = ( (sbits64) zSig2 < 0 );
1051             }
1052             else {
1053                 if ( zSign ) {
1054                     increment = ( roundingMode == float_round_down ) && zSig2;
1055                 }
1056                 else {
1057                     increment = ( roundingMode == float_round_up ) && zSig2;
1058                 }
1059             }
1060         }
1061     }
1062     if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1063     if ( increment ) {
1064         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1065         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1066     }
1067     else {
1068         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1069     }
1070     return packFloat128( zSign, zExp, zSig0, zSig1 );
1071 
1072 }
1073 
1074 /*
1075 -------------------------------------------------------------------------------
1076 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1077 and significand formed by the concatenation of `zSig0' and `zSig1', and
1078 returns the proper quadruple-precision floating-point value corresponding
1079 to the abstract input.  This routine is just like `roundAndPackFloat128'
1080 except that the input significand has fewer bits and does not have to be
1081 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1082 point exponent.
1083 -------------------------------------------------------------------------------
1084 */
1085 static float128
1086  normalizeRoundAndPackFloat128(
1087      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1088 {
1089     int8 shiftCount;
1090     bits64 zSig2;
1091 
1092     if ( zSig0 == 0 ) {
1093         zSig0 = zSig1;
1094         zSig1 = 0;
1095         zExp -= 64;
1096     }
1097     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1098     if ( 0 <= shiftCount ) {
1099         zSig2 = 0;
1100         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1101     }
1102     else {
1103         shift128ExtraRightJamming(
1104             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1105     }
1106     zExp -= shiftCount;
1107     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1108 
1109 }
1110 
1111 #endif
1112 
1113 /*
1114 -------------------------------------------------------------------------------
1115 Returns the result of converting the 32-bit two's complement integer `a'
1116 to the single-precision floating-point format.  The conversion is performed
1117 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1118 -------------------------------------------------------------------------------
1119 */
1120 float32 int32_to_float32( int32 a )
1121 {
1122     flag zSign;
1123 
1124     if ( a == 0 ) return 0;
1125     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1126     zSign = ( a < 0 );
1127     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1128 
1129 }
1130 
1131 float32 uint32_to_float32( uint32 a )
1132 {
1133     if ( a == 0 ) return 0;
1134     if ( a & (bits32) 0x80000000 )
1135 	return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1136     return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1137 }
1138 
1139 
1140 /*
1141 -------------------------------------------------------------------------------
1142 Returns the result of converting the 32-bit two's complement integer `a'
1143 to the double-precision floating-point format.  The conversion is performed
1144 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1145 -------------------------------------------------------------------------------
1146 */
1147 float64 int32_to_float64( int32 a )
1148 {
1149     flag zSign;
1150     uint32 absA;
1151     int8 shiftCount;
1152     bits64 zSig;
1153 
1154     if ( a == 0 ) return 0;
1155     zSign = ( a < 0 );
1156     absA = zSign ? - a : a;
1157     shiftCount = countLeadingZeros32( absA ) + 21;
1158     zSig = absA;
1159     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1160 
1161 }
1162 
1163 float64 uint32_to_float64( uint32 a )
1164 {
1165     int8 shiftCount;
1166     bits64 zSig = a;
1167 
1168     if ( a == 0 ) return 0;
1169     shiftCount = countLeadingZeros32( a ) + 21;
1170     return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1171 
1172 }
1173 
1174 #ifdef FLOATX80
1175 
1176 /*
1177 -------------------------------------------------------------------------------
1178 Returns the result of converting the 32-bit two's complement integer `a'
1179 to the extended double-precision floating-point format.  The conversion
1180 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1181 Arithmetic.
1182 -------------------------------------------------------------------------------
1183 */
1184 floatx80 int32_to_floatx80( int32 a )
1185 {
1186     flag zSign;
1187     uint32 absA;
1188     int8 shiftCount;
1189     bits64 zSig;
1190 
1191     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1192     zSign = ( a < 0 );
1193     absA = zSign ? - a : a;
1194     shiftCount = countLeadingZeros32( absA ) + 32;
1195     zSig = absA;
1196     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1197 
1198 }
1199 
1200 floatx80 uint32_to_floatx80( uint32 a )
1201 {
1202     int8 shiftCount;
1203     bits64 zSig = a;
1204 
1205     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1206     shiftCount = countLeadingZeros32( a ) + 32;
1207     return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1208 
1209 }
1210 
1211 #endif
1212 
1213 #ifdef FLOAT128
1214 
1215 /*
1216 -------------------------------------------------------------------------------
1217 Returns the result of converting the 32-bit two's complement integer `a' to
1218 the quadruple-precision floating-point format.  The conversion is performed
1219 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1220 -------------------------------------------------------------------------------
1221 */
1222 float128 int32_to_float128( int32 a )
1223 {
1224     flag zSign;
1225     uint32 absA;
1226     int8 shiftCount;
1227     bits64 zSig0;
1228 
1229     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1230     zSign = ( a < 0 );
1231     absA = zSign ? - a : a;
1232     shiftCount = countLeadingZeros32( absA ) + 17;
1233     zSig0 = absA;
1234     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1235 
1236 }
1237 
1238 float128 uint32_to_float128( uint32 a )
1239 {
1240     int8 shiftCount;
1241     bits64 zSig0 = a;
1242 
1243     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1244     shiftCount = countLeadingZeros32( a ) + 17;
1245     return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1246 
1247 }
1248 
1249 #endif
1250 
1251 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1252 /*
1253 -------------------------------------------------------------------------------
1254 Returns the result of converting the 64-bit two's complement integer `a'
1255 to the single-precision floating-point format.  The conversion is performed
1256 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1257 -------------------------------------------------------------------------------
1258 */
1259 float32 int64_to_float32( int64 a )
1260 {
1261     flag zSign;
1262     uint64 absA;
1263     int8 shiftCount;
1264 
1265     if ( a == 0 ) return 0;
1266     zSign = ( a < 0 );
1267     absA = zSign ? - a : a;
1268     shiftCount = countLeadingZeros64( absA ) - 40;
1269     if ( 0 <= shiftCount ) {
1270         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1271     }
1272     else {
1273         shiftCount += 7;
1274         if ( shiftCount < 0 ) {
1275             shift64RightJamming( absA, - shiftCount, &absA );
1276         }
1277         else {
1278             absA <<= shiftCount;
1279         }
1280         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1281     }
1282 
1283 }
1284 
1285 /*
1286 -------------------------------------------------------------------------------
1287 Returns the result of converting the 64-bit two's complement integer `a'
1288 to the double-precision floating-point format.  The conversion is performed
1289 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1290 -------------------------------------------------------------------------------
1291 */
1292 float64 int64_to_float64( int64 a )
1293 {
1294     flag zSign;
1295 
1296     if ( a == 0 ) return 0;
1297     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1298         return packFloat64( 1, 0x43E, 0 );
1299     }
1300     zSign = ( a < 0 );
1301     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1302 
1303 }
1304 
1305 #ifdef FLOATX80
1306 
1307 /*
1308 -------------------------------------------------------------------------------
1309 Returns the result of converting the 64-bit two's complement integer `a'
1310 to the extended double-precision floating-point format.  The conversion
1311 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1312 Arithmetic.
1313 -------------------------------------------------------------------------------
1314 */
1315 floatx80 int64_to_floatx80( int64 a )
1316 {
1317     flag zSign;
1318     uint64 absA;
1319     int8 shiftCount;
1320 
1321     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1322     zSign = ( a < 0 );
1323     absA = zSign ? - a : a;
1324     shiftCount = countLeadingZeros64( absA );
1325     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1326 
1327 }
1328 
1329 #endif
1330 
1331 #endif /* !SOFTFLOAT_FOR_GCC */
1332 
1333 #ifdef FLOAT128
1334 
1335 /*
1336 -------------------------------------------------------------------------------
1337 Returns the result of converting the 64-bit two's complement integer `a' to
1338 the quadruple-precision floating-point format.  The conversion is performed
1339 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1340 -------------------------------------------------------------------------------
1341 */
1342 float128 int64_to_float128( int64 a )
1343 {
1344     flag zSign;
1345     uint64 absA;
1346     int8 shiftCount;
1347     int32 zExp;
1348     bits64 zSig0, zSig1;
1349 
1350     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1351     zSign = ( a < 0 );
1352     absA = zSign ? - a : a;
1353     shiftCount = countLeadingZeros64( absA ) + 49;
1354     zExp = 0x406E - shiftCount;
1355     if ( 64 <= shiftCount ) {
1356         zSig1 = 0;
1357         zSig0 = absA;
1358         shiftCount -= 64;
1359     }
1360     else {
1361         zSig1 = absA;
1362         zSig0 = 0;
1363     }
1364     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1365     return packFloat128( zSign, zExp, zSig0, zSig1 );
1366 
1367 }
1368 
1369 #endif
1370 
1371 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1372 /*
1373 -------------------------------------------------------------------------------
1374 Returns the result of converting the single-precision floating-point value
1375 `a' to the 32-bit two's complement integer format.  The conversion is
1376 performed according to the IEC/IEEE Standard for Binary Floating-Point
1377 Arithmetic---which means in particular that the conversion is rounded
1378 according to the current rounding mode.  If `a' is a NaN, the largest
1379 positive integer is returned.  Otherwise, if the conversion overflows, the
1380 largest integer with the same sign as `a' is returned.
1381 -------------------------------------------------------------------------------
1382 */
1383 int32 float32_to_int32( float32 a )
1384 {
1385     flag aSign;
1386     int16 aExp, shiftCount;
1387     bits32 aSig;
1388     bits64 aSig64;
1389 
1390     aSig = extractFloat32Frac( a );
1391     aExp = extractFloat32Exp( a );
1392     aSign = extractFloat32Sign( a );
1393     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1394     if ( aExp ) aSig |= 0x00800000;
1395     shiftCount = 0xAF - aExp;
1396     aSig64 = aSig;
1397     aSig64 <<= 32;
1398     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1399     return roundAndPackInt32( aSign, aSig64 );
1400 
1401 }
1402 #endif /* !SOFTFLOAT_FOR_GCC */
1403 
1404 /*
1405 -------------------------------------------------------------------------------
1406 Returns the result of converting the single-precision floating-point value
1407 `a' to the 32-bit two's complement integer format.  The conversion is
1408 performed according to the IEC/IEEE Standard for Binary Floating-Point
1409 Arithmetic, except that the conversion is always rounded toward zero.
1410 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1411 the conversion overflows, the largest integer with the same sign as `a' is
1412 returned.
1413 -------------------------------------------------------------------------------
1414 */
1415 int32 float32_to_int32_round_to_zero( float32 a )
1416 {
1417     flag aSign;
1418     int16 aExp, shiftCount;
1419     bits32 aSig;
1420     int32 z;
1421 
1422     aSig = extractFloat32Frac( a );
1423     aExp = extractFloat32Exp( a );
1424     aSign = extractFloat32Sign( a );
1425     shiftCount = aExp - 0x9E;
1426     if ( 0 <= shiftCount ) {
1427         if ( a != 0xCF000000 ) {
1428             float_raise( float_flag_invalid );
1429             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1430         }
1431         return (sbits32) 0x80000000;
1432     }
1433     else if ( aExp <= 0x7E ) {
1434         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1435         return 0;
1436     }
1437     aSig = ( aSig | 0x00800000 )<<8;
1438     z = aSig>>( - shiftCount );
1439     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1440         float_exception_flags |= float_flag_inexact;
1441     }
1442     if ( aSign ) z = - z;
1443     return z;
1444 
1445 }
1446 
1447 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1448 /*
1449 -------------------------------------------------------------------------------
1450 Returns the result of converting the single-precision floating-point value
1451 `a' to the 64-bit two's complement integer format.  The conversion is
1452 performed according to the IEC/IEEE Standard for Binary Floating-Point
1453 Arithmetic---which means in particular that the conversion is rounded
1454 according to the current rounding mode.  If `a' is a NaN, the largest
1455 positive integer is returned.  Otherwise, if the conversion overflows, the
1456 largest integer with the same sign as `a' is returned.
1457 -------------------------------------------------------------------------------
1458 */
1459 int64 float32_to_int64( float32 a )
1460 {
1461     flag aSign;
1462     int16 aExp, shiftCount;
1463     bits32 aSig;
1464     bits64 aSig64, aSigExtra;
1465 
1466     aSig = extractFloat32Frac( a );
1467     aExp = extractFloat32Exp( a );
1468     aSign = extractFloat32Sign( a );
1469     shiftCount = 0xBE - aExp;
1470     if ( shiftCount < 0 ) {
1471         float_raise( float_flag_invalid );
1472         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1473             return LIT64( 0x7FFFFFFFFFFFFFFF );
1474         }
1475         return (sbits64) LIT64( 0x8000000000000000 );
1476     }
1477     if ( aExp ) aSig |= 0x00800000;
1478     aSig64 = aSig;
1479     aSig64 <<= 40;
1480     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1481     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1482 
1483 }
1484 
1485 /*
1486 -------------------------------------------------------------------------------
1487 Returns the result of converting the single-precision floating-point value
1488 `a' to the 64-bit two's complement integer format.  The conversion is
1489 performed according to the IEC/IEEE Standard for Binary Floating-Point
1490 Arithmetic, except that the conversion is always rounded toward zero.  If
1491 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1492 conversion overflows, the largest integer with the same sign as `a' is
1493 returned.
1494 -------------------------------------------------------------------------------
1495 */
1496 int64 float32_to_int64_round_to_zero( float32 a )
1497 {
1498     flag aSign;
1499     int16 aExp, shiftCount;
1500     bits32 aSig;
1501     bits64 aSig64;
1502     int64 z;
1503 
1504     aSig = extractFloat32Frac( a );
1505     aExp = extractFloat32Exp( a );
1506     aSign = extractFloat32Sign( a );
1507     shiftCount = aExp - 0xBE;
1508     if ( 0 <= shiftCount ) {
1509         if ( a != 0xDF000000 ) {
1510             float_raise( float_flag_invalid );
1511             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1512                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1513             }
1514         }
1515         return (sbits64) LIT64( 0x8000000000000000 );
1516     }
1517     else if ( aExp <= 0x7E ) {
1518         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1519         return 0;
1520     }
1521     aSig64 = aSig | 0x00800000;
1522     aSig64 <<= 40;
1523     z = aSig64>>( - shiftCount );
1524     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1525         float_exception_flags |= float_flag_inexact;
1526     }
1527     if ( aSign ) z = - z;
1528     return z;
1529 
1530 }
1531 #endif /* !SOFTFLOAT_FOR_GCC */
1532 
1533 /*
1534 -------------------------------------------------------------------------------
1535 Returns the result of converting the single-precision floating-point value
1536 `a' to the double-precision floating-point format.  The conversion is
1537 performed according to the IEC/IEEE Standard for Binary Floating-Point
1538 Arithmetic.
1539 -------------------------------------------------------------------------------
1540 */
1541 float64 float32_to_float64( float32 a )
1542 {
1543     flag aSign;
1544     int16 aExp;
1545     bits32 aSig;
1546 
1547     aSig = extractFloat32Frac( a );
1548     aExp = extractFloat32Exp( a );
1549     aSign = extractFloat32Sign( a );
1550     if ( aExp == 0xFF ) {
1551         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1552         return packFloat64( aSign, 0x7FF, 0 );
1553     }
1554     if ( aExp == 0 ) {
1555         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1556         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1557         --aExp;
1558     }
1559     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1560 
1561 }
1562 
1563 #ifdef FLOATX80
1564 
1565 /*
1566 -------------------------------------------------------------------------------
1567 Returns the result of converting the single-precision floating-point value
1568 `a' to the extended double-precision floating-point format.  The conversion
1569 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1570 Arithmetic.
1571 -------------------------------------------------------------------------------
1572 */
1573 floatx80 float32_to_floatx80( float32 a )
1574 {
1575     flag aSign;
1576     int16 aExp;
1577     bits32 aSig;
1578 
1579     aSig = extractFloat32Frac( a );
1580     aExp = extractFloat32Exp( a );
1581     aSign = extractFloat32Sign( a );
1582     if ( aExp == 0xFF ) {
1583         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1584         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1585     }
1586     if ( aExp == 0 ) {
1587         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1588         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1589     }
1590     aSig |= 0x00800000;
1591     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1592 
1593 }
1594 
1595 #endif
1596 
1597 #ifdef FLOAT128
1598 
1599 /*
1600 -------------------------------------------------------------------------------
1601 Returns the result of converting the single-precision floating-point value
1602 `a' to the double-precision floating-point format.  The conversion is
1603 performed according to the IEC/IEEE Standard for Binary Floating-Point
1604 Arithmetic.
1605 -------------------------------------------------------------------------------
1606 */
1607 float128 float32_to_float128( float32 a )
1608 {
1609     flag aSign;
1610     int16 aExp;
1611     bits32 aSig;
1612 
1613     aSig = extractFloat32Frac( a );
1614     aExp = extractFloat32Exp( a );
1615     aSign = extractFloat32Sign( a );
1616     if ( aExp == 0xFF ) {
1617         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1618         return packFloat128( aSign, 0x7FFF, 0, 0 );
1619     }
1620     if ( aExp == 0 ) {
1621         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1622         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1623         --aExp;
1624     }
1625     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1626 
1627 }
1628 
1629 #endif
1630 
1631 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1632 /*
1633 -------------------------------------------------------------------------------
1634 Rounds the single-precision floating-point value `a' to an integer, and
1635 returns the result as a single-precision floating-point value.  The
1636 operation is performed according to the IEC/IEEE Standard for Binary
1637 Floating-Point Arithmetic.
1638 -------------------------------------------------------------------------------
1639 */
1640 float32 float32_round_to_int( float32 a )
1641 {
1642     flag aSign;
1643     int16 aExp;
1644     bits32 lastBitMask, roundBitsMask;
1645     int8 roundingMode;
1646     float32 z;
1647 
1648     aExp = extractFloat32Exp( a );
1649     if ( 0x96 <= aExp ) {
1650         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1651             return propagateFloat32NaN( a, a );
1652         }
1653         return a;
1654     }
1655     if ( aExp <= 0x7E ) {
1656         if ( (bits32) ( a<<1 ) == 0 ) return a;
1657         float_exception_flags |= float_flag_inexact;
1658         aSign = extractFloat32Sign( a );
1659         switch ( float_rounding_mode ) {
1660          case float_round_nearest_even:
1661             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1662                 return packFloat32( aSign, 0x7F, 0 );
1663             }
1664             break;
1665 	 case float_round_to_zero:
1666 	    break;
1667          case float_round_down:
1668             return aSign ? 0xBF800000 : 0;
1669          case float_round_up:
1670             return aSign ? 0x80000000 : 0x3F800000;
1671         }
1672         return packFloat32( aSign, 0, 0 );
1673     }
1674     lastBitMask = 1;
1675     lastBitMask <<= 0x96 - aExp;
1676     roundBitsMask = lastBitMask - 1;
1677     z = a;
1678     roundingMode = float_rounding_mode;
1679     if ( roundingMode == float_round_nearest_even ) {
1680         z += lastBitMask>>1;
1681         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1682     }
1683     else if ( roundingMode != float_round_to_zero ) {
1684         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1685             z += roundBitsMask;
1686         }
1687     }
1688     z &= ~ roundBitsMask;
1689     if ( z != a ) float_exception_flags |= float_flag_inexact;
1690     return z;
1691 
1692 }
1693 #endif /* !SOFTFLOAT_FOR_GCC */
1694 
1695 /*
1696 -------------------------------------------------------------------------------
1697 Returns the result of adding the absolute values of the single-precision
1698 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1699 before being returned.  `zSign' is ignored if the result is a NaN.
1700 The addition is performed according to the IEC/IEEE Standard for Binary
1701 Floating-Point Arithmetic.
1702 -------------------------------------------------------------------------------
1703 */
1704 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1705 {
1706     int16 aExp, bExp, zExp;
1707     bits32 aSig, bSig, zSig;
1708     int16 expDiff;
1709 
1710     aSig = extractFloat32Frac( a );
1711     aExp = extractFloat32Exp( a );
1712     bSig = extractFloat32Frac( b );
1713     bExp = extractFloat32Exp( b );
1714     expDiff = aExp - bExp;
1715     aSig <<= 6;
1716     bSig <<= 6;
1717     if ( 0 < expDiff ) {
1718         if ( aExp == 0xFF ) {
1719             if ( aSig ) return propagateFloat32NaN( a, b );
1720             return a;
1721         }
1722         if ( bExp == 0 ) {
1723             --expDiff;
1724         }
1725         else {
1726             bSig |= 0x20000000;
1727         }
1728         shift32RightJamming( bSig, expDiff, &bSig );
1729         zExp = aExp;
1730     }
1731     else if ( expDiff < 0 ) {
1732         if ( bExp == 0xFF ) {
1733             if ( bSig ) return propagateFloat32NaN( a, b );
1734             return packFloat32( zSign, 0xFF, 0 );
1735         }
1736         if ( aExp == 0 ) {
1737             ++expDiff;
1738         }
1739         else {
1740             aSig |= 0x20000000;
1741         }
1742         shift32RightJamming( aSig, - expDiff, &aSig );
1743         zExp = bExp;
1744     }
1745     else {
1746         if ( aExp == 0xFF ) {
1747             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1748             return a;
1749         }
1750         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1751         zSig = 0x40000000 + aSig + bSig;
1752         zExp = aExp;
1753         goto roundAndPack;
1754     }
1755     aSig |= 0x20000000;
1756     zSig = ( aSig + bSig )<<1;
1757     --zExp;
1758     if ( (sbits32) zSig < 0 ) {
1759         zSig = aSig + bSig;
1760         ++zExp;
1761     }
1762  roundAndPack:
1763     return roundAndPackFloat32( zSign, zExp, zSig );
1764 
1765 }
1766 
1767 /*
1768 -------------------------------------------------------------------------------
1769 Returns the result of subtracting the absolute values of the single-
1770 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1771 difference is negated before being returned.  `zSign' is ignored if the
1772 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1773 Standard for Binary Floating-Point Arithmetic.
1774 -------------------------------------------------------------------------------
1775 */
1776 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1777 {
1778     int16 aExp, bExp, zExp;
1779     bits32 aSig, bSig, zSig;
1780     int16 expDiff;
1781 
1782     aSig = extractFloat32Frac( a );
1783     aExp = extractFloat32Exp( a );
1784     bSig = extractFloat32Frac( b );
1785     bExp = extractFloat32Exp( b );
1786     expDiff = aExp - bExp;
1787     aSig <<= 7;
1788     bSig <<= 7;
1789     if ( 0 < expDiff ) goto aExpBigger;
1790     if ( expDiff < 0 ) goto bExpBigger;
1791     if ( aExp == 0xFF ) {
1792         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1793         float_raise( float_flag_invalid );
1794         return float32_default_nan;
1795     }
1796     if ( aExp == 0 ) {
1797         aExp = 1;
1798         bExp = 1;
1799     }
1800     if ( bSig < aSig ) goto aBigger;
1801     if ( aSig < bSig ) goto bBigger;
1802     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1803  bExpBigger:
1804     if ( bExp == 0xFF ) {
1805         if ( bSig ) return propagateFloat32NaN( a, b );
1806         return packFloat32( zSign ^ 1, 0xFF, 0 );
1807     }
1808     if ( aExp == 0 ) {
1809         ++expDiff;
1810     }
1811     else {
1812         aSig |= 0x40000000;
1813     }
1814     shift32RightJamming( aSig, - expDiff, &aSig );
1815     bSig |= 0x40000000;
1816  bBigger:
1817     zSig = bSig - aSig;
1818     zExp = bExp;
1819     zSign ^= 1;
1820     goto normalizeRoundAndPack;
1821  aExpBigger:
1822     if ( aExp == 0xFF ) {
1823         if ( aSig ) return propagateFloat32NaN( a, b );
1824         return a;
1825     }
1826     if ( bExp == 0 ) {
1827         --expDiff;
1828     }
1829     else {
1830         bSig |= 0x40000000;
1831     }
1832     shift32RightJamming( bSig, expDiff, &bSig );
1833     aSig |= 0x40000000;
1834  aBigger:
1835     zSig = aSig - bSig;
1836     zExp = aExp;
1837  normalizeRoundAndPack:
1838     --zExp;
1839     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1840 
1841 }
1842 
1843 /*
1844 -------------------------------------------------------------------------------
1845 Returns the result of adding the single-precision floating-point values `a'
1846 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1847 Binary Floating-Point Arithmetic.
1848 -------------------------------------------------------------------------------
1849 */
1850 float32 float32_add( float32 a, float32 b )
1851 {
1852     flag aSign, bSign;
1853 
1854     aSign = extractFloat32Sign( a );
1855     bSign = extractFloat32Sign( b );
1856     if ( aSign == bSign ) {
1857         return addFloat32Sigs( a, b, aSign );
1858     }
1859     else {
1860         return subFloat32Sigs( a, b, aSign );
1861     }
1862 
1863 }
1864 
1865 /*
1866 -------------------------------------------------------------------------------
1867 Returns the result of subtracting the single-precision floating-point values
1868 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1869 for Binary Floating-Point Arithmetic.
1870 -------------------------------------------------------------------------------
1871 */
1872 float32 float32_sub( float32 a, float32 b )
1873 {
1874     flag aSign, bSign;
1875 
1876     aSign = extractFloat32Sign( a );
1877     bSign = extractFloat32Sign( b );
1878     if ( aSign == bSign ) {
1879         return subFloat32Sigs( a, b, aSign );
1880     }
1881     else {
1882         return addFloat32Sigs( a, b, aSign );
1883     }
1884 
1885 }
1886 
1887 /*
1888 -------------------------------------------------------------------------------
1889 Returns the result of multiplying the single-precision floating-point values
1890 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1891 for Binary Floating-Point Arithmetic.
1892 -------------------------------------------------------------------------------
1893 */
1894 float32 float32_mul( float32 a, float32 b )
1895 {
1896     flag aSign, bSign, zSign;
1897     int16 aExp, bExp, zExp;
1898     bits32 aSig, bSig;
1899     bits64 zSig64;
1900     bits32 zSig;
1901 
1902     aSig = extractFloat32Frac( a );
1903     aExp = extractFloat32Exp( a );
1904     aSign = extractFloat32Sign( a );
1905     bSig = extractFloat32Frac( b );
1906     bExp = extractFloat32Exp( b );
1907     bSign = extractFloat32Sign( b );
1908     zSign = aSign ^ bSign;
1909     if ( aExp == 0xFF ) {
1910         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1911             return propagateFloat32NaN( a, b );
1912         }
1913         if ( ( bExp | bSig ) == 0 ) {
1914             float_raise( float_flag_invalid );
1915             return float32_default_nan;
1916         }
1917         return packFloat32( zSign, 0xFF, 0 );
1918     }
1919     if ( bExp == 0xFF ) {
1920         if ( bSig ) return propagateFloat32NaN( a, b );
1921         if ( ( aExp | aSig ) == 0 ) {
1922             float_raise( float_flag_invalid );
1923             return float32_default_nan;
1924         }
1925         return packFloat32( zSign, 0xFF, 0 );
1926     }
1927     if ( aExp == 0 ) {
1928         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1929         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1930     }
1931     if ( bExp == 0 ) {
1932         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1933         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1934     }
1935     zExp = aExp + bExp - 0x7F;
1936     aSig = ( aSig | 0x00800000 )<<7;
1937     bSig = ( bSig | 0x00800000 )<<8;
1938     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1939     zSig = zSig64;
1940     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1941         zSig <<= 1;
1942         --zExp;
1943     }
1944     return roundAndPackFloat32( zSign, zExp, zSig );
1945 
1946 }
1947 
1948 /*
1949 -------------------------------------------------------------------------------
1950 Returns the result of dividing the single-precision floating-point value `a'
1951 by the corresponding value `b'.  The operation is performed according to the
1952 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1953 -------------------------------------------------------------------------------
1954 */
1955 float32 float32_div( float32 a, float32 b )
1956 {
1957     flag aSign, bSign, zSign;
1958     int16 aExp, bExp, zExp;
1959     bits32 aSig, bSig, zSig;
1960 
1961     aSig = extractFloat32Frac( a );
1962     aExp = extractFloat32Exp( a );
1963     aSign = extractFloat32Sign( a );
1964     bSig = extractFloat32Frac( b );
1965     bExp = extractFloat32Exp( b );
1966     bSign = extractFloat32Sign( b );
1967     zSign = aSign ^ bSign;
1968     if ( aExp == 0xFF ) {
1969         if ( aSig ) return propagateFloat32NaN( a, b );
1970         if ( bExp == 0xFF ) {
1971             if ( bSig ) return propagateFloat32NaN( a, b );
1972             float_raise( float_flag_invalid );
1973             return float32_default_nan;
1974         }
1975         return packFloat32( zSign, 0xFF, 0 );
1976     }
1977     if ( bExp == 0xFF ) {
1978         if ( bSig ) return propagateFloat32NaN( a, b );
1979         return packFloat32( zSign, 0, 0 );
1980     }
1981     if ( bExp == 0 ) {
1982         if ( bSig == 0 ) {
1983             if ( ( aExp | aSig ) == 0 ) {
1984                 float_raise( float_flag_invalid );
1985                 return float32_default_nan;
1986             }
1987             float_raise( float_flag_divbyzero );
1988             return packFloat32( zSign, 0xFF, 0 );
1989         }
1990         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1991     }
1992     if ( aExp == 0 ) {
1993         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1994         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1995     }
1996     zExp = aExp - bExp + 0x7D;
1997     aSig = ( aSig | 0x00800000 )<<7;
1998     bSig = ( bSig | 0x00800000 )<<8;
1999     if ( bSig <= ( aSig + aSig ) ) {
2000         aSig >>= 1;
2001         ++zExp;
2002     }
2003     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2004     if ( ( zSig & 0x3F ) == 0 ) {
2005         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2006     }
2007     return roundAndPackFloat32( zSign, zExp, zSig );
2008 
2009 }
2010 
2011 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2012 /*
2013 -------------------------------------------------------------------------------
2014 Returns the remainder of the single-precision floating-point value `a'
2015 with respect to the corresponding value `b'.  The operation is performed
2016 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2017 -------------------------------------------------------------------------------
2018 */
2019 float32 float32_rem( float32 a, float32 b )
2020 {
2021     flag aSign, bSign, zSign;
2022     int16 aExp, bExp, expDiff;
2023     bits32 aSig, bSig;
2024     bits32 q;
2025     bits64 aSig64, bSig64, q64;
2026     bits32 alternateASig;
2027     sbits32 sigMean;
2028 
2029     aSig = extractFloat32Frac( a );
2030     aExp = extractFloat32Exp( a );
2031     aSign = extractFloat32Sign( a );
2032     bSig = extractFloat32Frac( b );
2033     bExp = extractFloat32Exp( b );
2034     bSign = extractFloat32Sign( b );
2035     if ( aExp == 0xFF ) {
2036         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2037             return propagateFloat32NaN( a, b );
2038         }
2039         float_raise( float_flag_invalid );
2040         return float32_default_nan;
2041     }
2042     if ( bExp == 0xFF ) {
2043         if ( bSig ) return propagateFloat32NaN( a, b );
2044         return a;
2045     }
2046     if ( bExp == 0 ) {
2047         if ( bSig == 0 ) {
2048             float_raise( float_flag_invalid );
2049             return float32_default_nan;
2050         }
2051         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2052     }
2053     if ( aExp == 0 ) {
2054         if ( aSig == 0 ) return a;
2055         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2056     }
2057     expDiff = aExp - bExp;
2058     aSig |= 0x00800000;
2059     bSig |= 0x00800000;
2060     if ( expDiff < 32 ) {
2061         aSig <<= 8;
2062         bSig <<= 8;
2063         if ( expDiff < 0 ) {
2064             if ( expDiff < -1 ) return a;
2065             aSig >>= 1;
2066         }
2067         q = ( bSig <= aSig );
2068         if ( q ) aSig -= bSig;
2069         if ( 0 < expDiff ) {
2070             q = ( ( (bits64) aSig )<<32 ) / bSig;
2071             q >>= 32 - expDiff;
2072             bSig >>= 2;
2073             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2074         }
2075         else {
2076             aSig >>= 2;
2077             bSig >>= 2;
2078         }
2079     }
2080     else {
2081         if ( bSig <= aSig ) aSig -= bSig;
2082         aSig64 = ( (bits64) aSig )<<40;
2083         bSig64 = ( (bits64) bSig )<<40;
2084         expDiff -= 64;
2085         while ( 0 < expDiff ) {
2086             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2087             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2088             aSig64 = - ( ( bSig * q64 )<<38 );
2089             expDiff -= 62;
2090         }
2091         expDiff += 64;
2092         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2093         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2094         q = q64>>( 64 - expDiff );
2095         bSig <<= 6;
2096         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2097     }
2098     do {
2099         alternateASig = aSig;
2100         ++q;
2101         aSig -= bSig;
2102     } while ( 0 <= (sbits32) aSig );
2103     sigMean = aSig + alternateASig;
2104     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2105         aSig = alternateASig;
2106     }
2107     zSign = ( (sbits32) aSig < 0 );
2108     if ( zSign ) aSig = - aSig;
2109     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2110 
2111 }
2112 #endif /* !SOFTFLOAT_FOR_GCC */
2113 
2114 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2115 /*
2116 -------------------------------------------------------------------------------
2117 Returns the square root of the single-precision floating-point value `a'.
2118 The operation is performed according to the IEC/IEEE Standard for Binary
2119 Floating-Point Arithmetic.
2120 -------------------------------------------------------------------------------
2121 */
2122 float32 float32_sqrt( float32 a )
2123 {
2124     flag aSign;
2125     int16 aExp, zExp;
2126     bits32 aSig, zSig;
2127     bits64 rem, term;
2128 
2129     aSig = extractFloat32Frac( a );
2130     aExp = extractFloat32Exp( a );
2131     aSign = extractFloat32Sign( a );
2132     if ( aExp == 0xFF ) {
2133         if ( aSig ) return propagateFloat32NaN( a, 0 );
2134         if ( ! aSign ) return a;
2135         float_raise( float_flag_invalid );
2136         return float32_default_nan;
2137     }
2138     if ( aSign ) {
2139         if ( ( aExp | aSig ) == 0 ) return a;
2140         float_raise( float_flag_invalid );
2141         return float32_default_nan;
2142     }
2143     if ( aExp == 0 ) {
2144         if ( aSig == 0 ) return 0;
2145         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2146     }
2147     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2148     aSig = ( aSig | 0x00800000 )<<8;
2149     zSig = estimateSqrt32( aExp, aSig ) + 2;
2150     if ( ( zSig & 0x7F ) <= 5 ) {
2151         if ( zSig < 2 ) {
2152             zSig = 0x7FFFFFFF;
2153             goto roundAndPack;
2154         }
2155         aSig >>= aExp & 1;
2156         term = ( (bits64) zSig ) * zSig;
2157         rem = ( ( (bits64) aSig )<<32 ) - term;
2158         while ( (sbits64) rem < 0 ) {
2159             --zSig;
2160             rem += ( ( (bits64) zSig )<<1 ) | 1;
2161         }
2162         zSig |= ( rem != 0 );
2163     }
2164     shift32RightJamming( zSig, 1, &zSig );
2165  roundAndPack:
2166     return roundAndPackFloat32( 0, zExp, zSig );
2167 
2168 }
2169 #endif /* !SOFTFLOAT_FOR_GCC */
2170 
2171 /*
2172 -------------------------------------------------------------------------------
2173 Returns 1 if the single-precision floating-point value `a' is equal to
2174 the corresponding value `b', and 0 otherwise.  The comparison is performed
2175 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2176 -------------------------------------------------------------------------------
2177 */
2178 flag float32_eq( float32 a, float32 b )
2179 {
2180 
2181     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2182          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2183        ) {
2184         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2185             float_raise( float_flag_invalid );
2186         }
2187         return 0;
2188     }
2189     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2190 
2191 }
2192 
2193 /*
2194 -------------------------------------------------------------------------------
2195 Returns 1 if the single-precision floating-point value `a' is less than
2196 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2197 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2198 Arithmetic.
2199 -------------------------------------------------------------------------------
2200 */
2201 flag float32_le( float32 a, float32 b )
2202 {
2203     flag aSign, bSign;
2204 
2205     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2206          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2207        ) {
2208         float_raise( float_flag_invalid );
2209         return 0;
2210     }
2211     aSign = extractFloat32Sign( a );
2212     bSign = extractFloat32Sign( b );
2213     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2214     return ( a == b ) || ( aSign ^ ( a < b ) );
2215 
2216 }
2217 
2218 /*
2219 -------------------------------------------------------------------------------
2220 Returns 1 if the single-precision floating-point value `a' is less than
2221 the corresponding value `b', and 0 otherwise.  The comparison is performed
2222 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2223 -------------------------------------------------------------------------------
2224 */
2225 flag float32_lt( float32 a, float32 b )
2226 {
2227     flag aSign, bSign;
2228 
2229     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2230          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2231        ) {
2232         float_raise( float_flag_invalid );
2233         return 0;
2234     }
2235     aSign = extractFloat32Sign( a );
2236     bSign = extractFloat32Sign( b );
2237     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2238     return ( a != b ) && ( aSign ^ ( a < b ) );
2239 
2240 }
2241 
2242 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2243 /*
2244 -------------------------------------------------------------------------------
2245 Returns 1 if the single-precision floating-point value `a' is equal to
2246 the corresponding value `b', and 0 otherwise.  The invalid exception is
2247 raised if either operand is a NaN.  Otherwise, the comparison is performed
2248 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2249 -------------------------------------------------------------------------------
2250 */
2251 flag float32_eq_signaling( float32 a, float32 b )
2252 {
2253 
2254     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2255          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2256        ) {
2257         float_raise( float_flag_invalid );
2258         return 0;
2259     }
2260     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2261 
2262 }
2263 
2264 /*
2265 -------------------------------------------------------------------------------
2266 Returns 1 if the single-precision floating-point value `a' is less than or
2267 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2268 cause an exception.  Otherwise, the comparison is performed according to the
2269 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2270 -------------------------------------------------------------------------------
2271 */
2272 flag float32_le_quiet( float32 a, float32 b )
2273 {
2274     flag aSign, bSign;
2275 
2276     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2277          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2278        ) {
2279         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2280             float_raise( float_flag_invalid );
2281         }
2282         return 0;
2283     }
2284     aSign = extractFloat32Sign( a );
2285     bSign = extractFloat32Sign( b );
2286     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2287     return ( a == b ) || ( aSign ^ ( a < b ) );
2288 
2289 }
2290 
2291 /*
2292 -------------------------------------------------------------------------------
2293 Returns 1 if the single-precision floating-point value `a' is less than
2294 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2295 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2296 Standard for Binary Floating-Point Arithmetic.
2297 -------------------------------------------------------------------------------
2298 */
2299 flag float32_lt_quiet( float32 a, float32 b )
2300 {
2301     flag aSign, bSign;
2302 
2303     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2304          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2305        ) {
2306         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2307             float_raise( float_flag_invalid );
2308         }
2309         return 0;
2310     }
2311     aSign = extractFloat32Sign( a );
2312     bSign = extractFloat32Sign( b );
2313     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2314     return ( a != b ) && ( aSign ^ ( a < b ) );
2315 
2316 }
2317 #endif /* !SOFTFLOAT_FOR_GCC */
2318 
2319 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2320 /*
2321 -------------------------------------------------------------------------------
2322 Returns the result of converting the double-precision floating-point value
2323 `a' to the 32-bit two's complement integer format.  The conversion is
2324 performed according to the IEC/IEEE Standard for Binary Floating-Point
2325 Arithmetic---which means in particular that the conversion is rounded
2326 according to the current rounding mode.  If `a' is a NaN, the largest
2327 positive integer is returned.  Otherwise, if the conversion overflows, the
2328 largest integer with the same sign as `a' is returned.
2329 -------------------------------------------------------------------------------
2330 */
2331 int32 float64_to_int32( float64 a )
2332 {
2333     flag aSign;
2334     int16 aExp, shiftCount;
2335     bits64 aSig;
2336 
2337     aSig = extractFloat64Frac( a );
2338     aExp = extractFloat64Exp( a );
2339     aSign = extractFloat64Sign( a );
2340     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2341     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2342     shiftCount = 0x42C - aExp;
2343     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2344     return roundAndPackInt32( aSign, aSig );
2345 
2346 }
2347 #endif /* !SOFTFLOAT_FOR_GCC */
2348 
2349 /*
2350 -------------------------------------------------------------------------------
2351 Returns the result of converting the double-precision floating-point value
2352 `a' to the 32-bit two's complement integer format.  The conversion is
2353 performed according to the IEC/IEEE Standard for Binary Floating-Point
2354 Arithmetic, except that the conversion is always rounded toward zero.
2355 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2356 the conversion overflows, the largest integer with the same sign as `a' is
2357 returned.
2358 -------------------------------------------------------------------------------
2359 */
2360 int32 float64_to_int32_round_to_zero( float64 a )
2361 {
2362     flag aSign;
2363     int16 aExp, shiftCount;
2364     bits64 aSig, savedASig;
2365     int32 z;
2366 
2367     aSig = extractFloat64Frac( a );
2368     aExp = extractFloat64Exp( a );
2369     aSign = extractFloat64Sign( a );
2370     if ( 0x41E < aExp ) {
2371         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2372         goto invalid;
2373     }
2374     else if ( aExp < 0x3FF ) {
2375         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2376         return 0;
2377     }
2378     aSig |= LIT64( 0x0010000000000000 );
2379     shiftCount = 0x433 - aExp;
2380     savedASig = aSig;
2381     aSig >>= shiftCount;
2382     z = aSig;
2383     if ( aSign ) z = - z;
2384     if ( ( z < 0 ) ^ aSign ) {
2385  invalid:
2386         float_raise( float_flag_invalid );
2387         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2388     }
2389     if ( ( aSig<<shiftCount ) != savedASig ) {
2390         float_exception_flags |= float_flag_inexact;
2391     }
2392     return z;
2393 
2394 }
2395 
2396 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2397 /*
2398 -------------------------------------------------------------------------------
2399 Returns the result of converting the double-precision floating-point value
2400 `a' to the 64-bit two's complement integer format.  The conversion is
2401 performed according to the IEC/IEEE Standard for Binary Floating-Point
2402 Arithmetic---which means in particular that the conversion is rounded
2403 according to the current rounding mode.  If `a' is a NaN, the largest
2404 positive integer is returned.  Otherwise, if the conversion overflows, the
2405 largest integer with the same sign as `a' is returned.
2406 -------------------------------------------------------------------------------
2407 */
2408 int64 float64_to_int64( float64 a )
2409 {
2410     flag aSign;
2411     int16 aExp, shiftCount;
2412     bits64 aSig, aSigExtra;
2413 
2414     aSig = extractFloat64Frac( a );
2415     aExp = extractFloat64Exp( a );
2416     aSign = extractFloat64Sign( a );
2417     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2418     shiftCount = 0x433 - aExp;
2419     if ( shiftCount <= 0 ) {
2420         if ( 0x43E < aExp ) {
2421             float_raise( float_flag_invalid );
2422             if (    ! aSign
2423                  || (    ( aExp == 0x7FF )
2424                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2425                ) {
2426                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2427             }
2428             return (sbits64) LIT64( 0x8000000000000000 );
2429         }
2430         aSigExtra = 0;
2431         aSig <<= - shiftCount;
2432     }
2433     else {
2434         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2435     }
2436     return roundAndPackInt64( aSign, aSig, aSigExtra );
2437 
2438 }
2439 
2440 /*
2441 -------------------------------------------------------------------------------
2442 Returns the result of converting the double-precision floating-point value
2443 `a' to the 64-bit two's complement integer format.  The conversion is
2444 performed according to the IEC/IEEE Standard for Binary Floating-Point
2445 Arithmetic, except that the conversion is always rounded toward zero.
2446 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2447 the conversion overflows, the largest integer with the same sign as `a' is
2448 returned.
2449 -------------------------------------------------------------------------------
2450 */
2451 int64 float64_to_int64_round_to_zero( float64 a )
2452 {
2453     flag aSign;
2454     int16 aExp, shiftCount;
2455     bits64 aSig;
2456     int64 z;
2457 
2458     aSig = extractFloat64Frac( a );
2459     aExp = extractFloat64Exp( a );
2460     aSign = extractFloat64Sign( a );
2461     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2462     shiftCount = aExp - 0x433;
2463     if ( 0 <= shiftCount ) {
2464         if ( 0x43E <= aExp ) {
2465             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2466                 float_raise( float_flag_invalid );
2467                 if (    ! aSign
2468                      || (    ( aExp == 0x7FF )
2469                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2470                    ) {
2471                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2472                 }
2473             }
2474             return (sbits64) LIT64( 0x8000000000000000 );
2475         }
2476         z = aSig<<shiftCount;
2477     }
2478     else {
2479         if ( aExp < 0x3FE ) {
2480             if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2481             return 0;
2482         }
2483         z = aSig>>( - shiftCount );
2484         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2485             float_exception_flags |= float_flag_inexact;
2486         }
2487     }
2488     if ( aSign ) z = - z;
2489     return z;
2490 
2491 }
2492 #endif /* !SOFTFLOAT_FOR_GCC */
2493 
2494 /*
2495 -------------------------------------------------------------------------------
2496 Returns the result of converting the double-precision floating-point value
2497 `a' to the single-precision floating-point format.  The conversion is
2498 performed according to the IEC/IEEE Standard for Binary Floating-Point
2499 Arithmetic.
2500 -------------------------------------------------------------------------------
2501 */
2502 float32 float64_to_float32( float64 a )
2503 {
2504     flag aSign;
2505     int16 aExp;
2506     bits64 aSig;
2507     bits32 zSig;
2508 
2509     aSig = extractFloat64Frac( a );
2510     aExp = extractFloat64Exp( a );
2511     aSign = extractFloat64Sign( a );
2512     if ( aExp == 0x7FF ) {
2513         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2514         return packFloat32( aSign, 0xFF, 0 );
2515     }
2516     shift64RightJamming( aSig, 22, &aSig );
2517     zSig = aSig;
2518     if ( aExp || zSig ) {
2519         zSig |= 0x40000000;
2520         aExp -= 0x381;
2521     }
2522     return roundAndPackFloat32( aSign, aExp, zSig );
2523 
2524 }
2525 
2526 #ifdef FLOATX80
2527 
2528 /*
2529 -------------------------------------------------------------------------------
2530 Returns the result of converting the double-precision floating-point value
2531 `a' to the extended double-precision floating-point format.  The conversion
2532 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2533 Arithmetic.
2534 -------------------------------------------------------------------------------
2535 */
2536 floatx80 float64_to_floatx80( float64 a )
2537 {
2538     flag aSign;
2539     int16 aExp;
2540     bits64 aSig;
2541 
2542     aSig = extractFloat64Frac( a );
2543     aExp = extractFloat64Exp( a );
2544     aSign = extractFloat64Sign( a );
2545     if ( aExp == 0x7FF ) {
2546         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2547         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2548     }
2549     if ( aExp == 0 ) {
2550         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2551         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2552     }
2553     return
2554         packFloatx80(
2555             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2556 
2557 }
2558 
2559 #endif
2560 
2561 #ifdef FLOAT128
2562 
2563 /*
2564 -------------------------------------------------------------------------------
2565 Returns the result of converting the double-precision floating-point value
2566 `a' to the quadruple-precision floating-point format.  The conversion is
2567 performed according to the IEC/IEEE Standard for Binary Floating-Point
2568 Arithmetic.
2569 -------------------------------------------------------------------------------
2570 */
2571 float128 float64_to_float128( float64 a )
2572 {
2573     flag aSign;
2574     int16 aExp;
2575     bits64 aSig, zSig0, zSig1;
2576 
2577     aSig = extractFloat64Frac( a );
2578     aExp = extractFloat64Exp( a );
2579     aSign = extractFloat64Sign( a );
2580     if ( aExp == 0x7FF ) {
2581         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2582         return packFloat128( aSign, 0x7FFF, 0, 0 );
2583     }
2584     if ( aExp == 0 ) {
2585         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2586         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2587         --aExp;
2588     }
2589     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2590     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2591 
2592 }
2593 
2594 #endif
2595 
2596 #ifndef SOFTFLOAT_FOR_GCC
2597 /*
2598 -------------------------------------------------------------------------------
2599 Rounds the double-precision floating-point value `a' to an integer, and
2600 returns the result as a double-precision floating-point value.  The
2601 operation is performed according to the IEC/IEEE Standard for Binary
2602 Floating-Point Arithmetic.
2603 -------------------------------------------------------------------------------
2604 */
2605 float64 float64_round_to_int( float64 a )
2606 {
2607     flag aSign;
2608     int16 aExp;
2609     bits64 lastBitMask, roundBitsMask;
2610     int8 roundingMode;
2611     float64 z;
2612 
2613     aExp = extractFloat64Exp( a );
2614     if ( 0x433 <= aExp ) {
2615         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2616             return propagateFloat64NaN( a, a );
2617         }
2618         return a;
2619     }
2620     if ( aExp < 0x3FF ) {
2621         if ( (bits64) ( a<<1 ) == 0 ) return a;
2622         float_exception_flags |= float_flag_inexact;
2623         aSign = extractFloat64Sign( a );
2624         switch ( float_rounding_mode ) {
2625          case float_round_nearest_even:
2626             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2627                 return packFloat64( aSign, 0x3FF, 0 );
2628             }
2629             break;
2630 	 case float_round_to_zero:
2631 	    break;
2632          case float_round_down:
2633             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2634          case float_round_up:
2635             return
2636             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2637         }
2638         return packFloat64( aSign, 0, 0 );
2639     }
2640     lastBitMask = 1;
2641     lastBitMask <<= 0x433 - aExp;
2642     roundBitsMask = lastBitMask - 1;
2643     z = a;
2644     roundingMode = float_rounding_mode;
2645     if ( roundingMode == float_round_nearest_even ) {
2646         z += lastBitMask>>1;
2647         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2648     }
2649     else if ( roundingMode != float_round_to_zero ) {
2650         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2651             z += roundBitsMask;
2652         }
2653     }
2654     z &= ~ roundBitsMask;
2655     if ( z != a ) float_exception_flags |= float_flag_inexact;
2656     return z;
2657 
2658 }
2659 #endif
2660 
2661 /*
2662 -------------------------------------------------------------------------------
2663 Returns the result of adding the absolute values of the double-precision
2664 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2665 before being returned.  `zSign' is ignored if the result is a NaN.
2666 The addition is performed according to the IEC/IEEE Standard for Binary
2667 Floating-Point Arithmetic.
2668 -------------------------------------------------------------------------------
2669 */
2670 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2671 {
2672     int16 aExp, bExp, zExp;
2673     bits64 aSig, bSig, zSig;
2674     int16 expDiff;
2675 
2676     aSig = extractFloat64Frac( a );
2677     aExp = extractFloat64Exp( a );
2678     bSig = extractFloat64Frac( b );
2679     bExp = extractFloat64Exp( b );
2680     expDiff = aExp - bExp;
2681     aSig <<= 9;
2682     bSig <<= 9;
2683     if ( 0 < expDiff ) {
2684         if ( aExp == 0x7FF ) {
2685             if ( aSig ) return propagateFloat64NaN( a, b );
2686             return a;
2687         }
2688         if ( bExp == 0 ) {
2689             --expDiff;
2690         }
2691         else {
2692             bSig |= LIT64( 0x2000000000000000 );
2693         }
2694         shift64RightJamming( bSig, expDiff, &bSig );
2695         zExp = aExp;
2696     }
2697     else if ( expDiff < 0 ) {
2698         if ( bExp == 0x7FF ) {
2699             if ( bSig ) return propagateFloat64NaN( a, b );
2700             return packFloat64( zSign, 0x7FF, 0 );
2701         }
2702         if ( aExp == 0 ) {
2703             ++expDiff;
2704         }
2705         else {
2706             aSig |= LIT64( 0x2000000000000000 );
2707         }
2708         shift64RightJamming( aSig, - expDiff, &aSig );
2709         zExp = bExp;
2710     }
2711     else {
2712         if ( aExp == 0x7FF ) {
2713             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2714             return a;
2715         }
2716         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2717         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2718         zExp = aExp;
2719         goto roundAndPack;
2720     }
2721     aSig |= LIT64( 0x2000000000000000 );
2722     zSig = ( aSig + bSig )<<1;
2723     --zExp;
2724     if ( (sbits64) zSig < 0 ) {
2725         zSig = aSig + bSig;
2726         ++zExp;
2727     }
2728  roundAndPack:
2729     return roundAndPackFloat64( zSign, zExp, zSig );
2730 
2731 }
2732 
2733 /*
2734 -------------------------------------------------------------------------------
2735 Returns the result of subtracting the absolute values of the double-
2736 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2737 difference is negated before being returned.  `zSign' is ignored if the
2738 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2739 Standard for Binary Floating-Point Arithmetic.
2740 -------------------------------------------------------------------------------
2741 */
2742 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2743 {
2744     int16 aExp, bExp, zExp;
2745     bits64 aSig, bSig, zSig;
2746     int16 expDiff;
2747 
2748     aSig = extractFloat64Frac( a );
2749     aExp = extractFloat64Exp( a );
2750     bSig = extractFloat64Frac( b );
2751     bExp = extractFloat64Exp( b );
2752     expDiff = aExp - bExp;
2753     aSig <<= 10;
2754     bSig <<= 10;
2755     if ( 0 < expDiff ) goto aExpBigger;
2756     if ( expDiff < 0 ) goto bExpBigger;
2757     if ( aExp == 0x7FF ) {
2758         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2759         float_raise( float_flag_invalid );
2760         return float64_default_nan;
2761     }
2762     if ( aExp == 0 ) {
2763         aExp = 1;
2764         bExp = 1;
2765     }
2766     if ( bSig < aSig ) goto aBigger;
2767     if ( aSig < bSig ) goto bBigger;
2768     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2769  bExpBigger:
2770     if ( bExp == 0x7FF ) {
2771         if ( bSig ) return propagateFloat64NaN( a, b );
2772         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2773     }
2774     if ( aExp == 0 ) {
2775         ++expDiff;
2776     }
2777     else {
2778         aSig |= LIT64( 0x4000000000000000 );
2779     }
2780     shift64RightJamming( aSig, - expDiff, &aSig );
2781     bSig |= LIT64( 0x4000000000000000 );
2782  bBigger:
2783     zSig = bSig - aSig;
2784     zExp = bExp;
2785     zSign ^= 1;
2786     goto normalizeRoundAndPack;
2787  aExpBigger:
2788     if ( aExp == 0x7FF ) {
2789         if ( aSig ) return propagateFloat64NaN( a, b );
2790         return a;
2791     }
2792     if ( bExp == 0 ) {
2793         --expDiff;
2794     }
2795     else {
2796         bSig |= LIT64( 0x4000000000000000 );
2797     }
2798     shift64RightJamming( bSig, expDiff, &bSig );
2799     aSig |= LIT64( 0x4000000000000000 );
2800  aBigger:
2801     zSig = aSig - bSig;
2802     zExp = aExp;
2803  normalizeRoundAndPack:
2804     --zExp;
2805     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2806 
2807 }
2808 
2809 /*
2810 -------------------------------------------------------------------------------
2811 Returns the result of adding the double-precision floating-point values `a'
2812 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2813 Binary Floating-Point Arithmetic.
2814 -------------------------------------------------------------------------------
2815 */
2816 float64 float64_add( float64 a, float64 b )
2817 {
2818     flag aSign, bSign;
2819 
2820     aSign = extractFloat64Sign( a );
2821     bSign = extractFloat64Sign( b );
2822     if ( aSign == bSign ) {
2823         return addFloat64Sigs( a, b, aSign );
2824     }
2825     else {
2826         return subFloat64Sigs( a, b, aSign );
2827     }
2828 
2829 }
2830 
2831 /*
2832 -------------------------------------------------------------------------------
2833 Returns the result of subtracting the double-precision floating-point values
2834 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2835 for Binary Floating-Point Arithmetic.
2836 -------------------------------------------------------------------------------
2837 */
2838 float64 float64_sub( float64 a, float64 b )
2839 {
2840     flag aSign, bSign;
2841 
2842     aSign = extractFloat64Sign( a );
2843     bSign = extractFloat64Sign( b );
2844     if ( aSign == bSign ) {
2845         return subFloat64Sigs( a, b, aSign );
2846     }
2847     else {
2848         return addFloat64Sigs( a, b, aSign );
2849     }
2850 
2851 }
2852 
2853 /*
2854 -------------------------------------------------------------------------------
2855 Returns the result of multiplying the double-precision floating-point values
2856 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2857 for Binary Floating-Point Arithmetic.
2858 -------------------------------------------------------------------------------
2859 */
2860 float64 float64_mul( float64 a, float64 b )
2861 {
2862     flag aSign, bSign, zSign;
2863     int16 aExp, bExp, zExp;
2864     bits64 aSig, bSig, zSig0, zSig1;
2865 
2866     aSig = extractFloat64Frac( a );
2867     aExp = extractFloat64Exp( a );
2868     aSign = extractFloat64Sign( a );
2869     bSig = extractFloat64Frac( b );
2870     bExp = extractFloat64Exp( b );
2871     bSign = extractFloat64Sign( b );
2872     zSign = aSign ^ bSign;
2873     if ( aExp == 0x7FF ) {
2874         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2875             return propagateFloat64NaN( a, b );
2876         }
2877         if ( ( bExp | bSig ) == 0 ) {
2878             float_raise( float_flag_invalid );
2879             return float64_default_nan;
2880         }
2881         return packFloat64( zSign, 0x7FF, 0 );
2882     }
2883     if ( bExp == 0x7FF ) {
2884         if ( bSig ) return propagateFloat64NaN( a, b );
2885         if ( ( aExp | aSig ) == 0 ) {
2886             float_raise( float_flag_invalid );
2887             return float64_default_nan;
2888         }
2889         return packFloat64( zSign, 0x7FF, 0 );
2890     }
2891     if ( aExp == 0 ) {
2892         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2893         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2894     }
2895     if ( bExp == 0 ) {
2896         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2897         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2898     }
2899     zExp = aExp + bExp - 0x3FF;
2900     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2901     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2902     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2903     zSig0 |= ( zSig1 != 0 );
2904     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2905         zSig0 <<= 1;
2906         --zExp;
2907     }
2908     return roundAndPackFloat64( zSign, zExp, zSig0 );
2909 
2910 }
2911 
2912 /*
2913 -------------------------------------------------------------------------------
2914 Returns the result of dividing the double-precision floating-point value `a'
2915 by the corresponding value `b'.  The operation is performed according to
2916 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2917 -------------------------------------------------------------------------------
2918 */
2919 float64 float64_div( float64 a, float64 b )
2920 {
2921     flag aSign, bSign, zSign;
2922     int16 aExp, bExp, zExp;
2923     bits64 aSig, bSig, zSig;
2924     bits64 rem0, rem1;
2925     bits64 term0, term1;
2926 
2927     aSig = extractFloat64Frac( a );
2928     aExp = extractFloat64Exp( a );
2929     aSign = extractFloat64Sign( a );
2930     bSig = extractFloat64Frac( b );
2931     bExp = extractFloat64Exp( b );
2932     bSign = extractFloat64Sign( b );
2933     zSign = aSign ^ bSign;
2934     if ( aExp == 0x7FF ) {
2935         if ( aSig ) return propagateFloat64NaN( a, b );
2936         if ( bExp == 0x7FF ) {
2937             if ( bSig ) return propagateFloat64NaN( a, b );
2938             float_raise( float_flag_invalid );
2939             return float64_default_nan;
2940         }
2941         return packFloat64( zSign, 0x7FF, 0 );
2942     }
2943     if ( bExp == 0x7FF ) {
2944         if ( bSig ) return propagateFloat64NaN( a, b );
2945         return packFloat64( zSign, 0, 0 );
2946     }
2947     if ( bExp == 0 ) {
2948         if ( bSig == 0 ) {
2949             if ( ( aExp | aSig ) == 0 ) {
2950                 float_raise( float_flag_invalid );
2951                 return float64_default_nan;
2952             }
2953             float_raise( float_flag_divbyzero );
2954             return packFloat64( zSign, 0x7FF, 0 );
2955         }
2956         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2957     }
2958     if ( aExp == 0 ) {
2959         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2960         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2961     }
2962     zExp = aExp - bExp + 0x3FD;
2963     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2964     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2965     if ( bSig <= ( aSig + aSig ) ) {
2966         aSig >>= 1;
2967         ++zExp;
2968     }
2969     zSig = estimateDiv128To64( aSig, 0, bSig );
2970     if ( ( zSig & 0x1FF ) <= 2 ) {
2971         mul64To128( bSig, zSig, &term0, &term1 );
2972         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2973         while ( (sbits64) rem0 < 0 ) {
2974             --zSig;
2975             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2976         }
2977         zSig |= ( rem1 != 0 );
2978     }
2979     return roundAndPackFloat64( zSign, zExp, zSig );
2980 
2981 }
2982 
2983 #ifndef SOFTFLOAT_FOR_GCC
2984 /*
2985 -------------------------------------------------------------------------------
2986 Returns the remainder of the double-precision floating-point value `a'
2987 with respect to the corresponding value `b'.  The operation is performed
2988 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2989 -------------------------------------------------------------------------------
2990 */
2991 float64 float64_rem( float64 a, float64 b )
2992 {
2993     flag aSign, bSign, zSign;
2994     int16 aExp, bExp, expDiff;
2995     bits64 aSig, bSig;
2996     bits64 q, alternateASig;
2997     sbits64 sigMean;
2998 
2999     aSig = extractFloat64Frac( a );
3000     aExp = extractFloat64Exp( a );
3001     aSign = extractFloat64Sign( a );
3002     bSig = extractFloat64Frac( b );
3003     bExp = extractFloat64Exp( b );
3004     bSign = extractFloat64Sign( b );
3005     if ( aExp == 0x7FF ) {
3006         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3007             return propagateFloat64NaN( a, b );
3008         }
3009         float_raise( float_flag_invalid );
3010         return float64_default_nan;
3011     }
3012     if ( bExp == 0x7FF ) {
3013         if ( bSig ) return propagateFloat64NaN( a, b );
3014         return a;
3015     }
3016     if ( bExp == 0 ) {
3017         if ( bSig == 0 ) {
3018             float_raise( float_flag_invalid );
3019             return float64_default_nan;
3020         }
3021         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3022     }
3023     if ( aExp == 0 ) {
3024         if ( aSig == 0 ) return a;
3025         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3026     }
3027     expDiff = aExp - bExp;
3028     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3029     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3030     if ( expDiff < 0 ) {
3031         if ( expDiff < -1 ) return a;
3032         aSig >>= 1;
3033     }
3034     q = ( bSig <= aSig );
3035     if ( q ) aSig -= bSig;
3036     expDiff -= 64;
3037     while ( 0 < expDiff ) {
3038         q = estimateDiv128To64( aSig, 0, bSig );
3039         q = ( 2 < q ) ? q - 2 : 0;
3040         aSig = - ( ( bSig>>2 ) * q );
3041         expDiff -= 62;
3042     }
3043     expDiff += 64;
3044     if ( 0 < expDiff ) {
3045         q = estimateDiv128To64( aSig, 0, bSig );
3046         q = ( 2 < q ) ? q - 2 : 0;
3047         q >>= 64 - expDiff;
3048         bSig >>= 2;
3049         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3050     }
3051     else {
3052         aSig >>= 2;
3053         bSig >>= 2;
3054     }
3055     do {
3056         alternateASig = aSig;
3057         ++q;
3058         aSig -= bSig;
3059     } while ( 0 <= (sbits64) aSig );
3060     sigMean = aSig + alternateASig;
3061     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3062         aSig = alternateASig;
3063     }
3064     zSign = ( (sbits64) aSig < 0 );
3065     if ( zSign ) aSig = - aSig;
3066     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3067 
3068 }
3069 
3070 /*
3071 -------------------------------------------------------------------------------
3072 Returns the square root of the double-precision floating-point value `a'.
3073 The operation is performed according to the IEC/IEEE Standard for Binary
3074 Floating-Point Arithmetic.
3075 -------------------------------------------------------------------------------
3076 */
3077 float64 float64_sqrt( float64 a )
3078 {
3079     flag aSign;
3080     int16 aExp, zExp;
3081     bits64 aSig, zSig, doubleZSig;
3082     bits64 rem0, rem1, term0, term1;
3083 
3084     aSig = extractFloat64Frac( a );
3085     aExp = extractFloat64Exp( a );
3086     aSign = extractFloat64Sign( a );
3087     if ( aExp == 0x7FF ) {
3088         if ( aSig ) return propagateFloat64NaN( a, a );
3089         if ( ! aSign ) return a;
3090         float_raise( float_flag_invalid );
3091         return float64_default_nan;
3092     }
3093     if ( aSign ) {
3094         if ( ( aExp | aSig ) == 0 ) return a;
3095         float_raise( float_flag_invalid );
3096         return float64_default_nan;
3097     }
3098     if ( aExp == 0 ) {
3099         if ( aSig == 0 ) return 0;
3100         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3101     }
3102     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3103     aSig |= LIT64( 0x0010000000000000 );
3104     zSig = estimateSqrt32( aExp, aSig>>21 );
3105     aSig <<= 9 - ( aExp & 1 );
3106     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3107     if ( ( zSig & 0x1FF ) <= 5 ) {
3108         doubleZSig = zSig<<1;
3109         mul64To128( zSig, zSig, &term0, &term1 );
3110         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3111         while ( (sbits64) rem0 < 0 ) {
3112             --zSig;
3113             doubleZSig -= 2;
3114             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3115         }
3116         zSig |= ( ( rem0 | rem1 ) != 0 );
3117     }
3118     return roundAndPackFloat64( 0, zExp, zSig );
3119 
3120 }
3121 #endif
3122 
3123 /*
3124 -------------------------------------------------------------------------------
3125 Returns 1 if the double-precision floating-point value `a' is equal to the
3126 corresponding value `b', and 0 otherwise.  The comparison is performed
3127 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3128 -------------------------------------------------------------------------------
3129 */
3130 flag float64_eq( float64 a, float64 b )
3131 {
3132 
3133     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3134          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3135        ) {
3136         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3137             float_raise( float_flag_invalid );
3138         }
3139         return 0;
3140     }
3141     return ( a == b ) ||
3142 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3143 
3144 }
3145 
3146 /*
3147 -------------------------------------------------------------------------------
3148 Returns 1 if the double-precision floating-point value `a' is less than or
3149 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3150 performed according to the IEC/IEEE Standard for Binary Floating-Point
3151 Arithmetic.
3152 -------------------------------------------------------------------------------
3153 */
3154 flag float64_le( float64 a, float64 b )
3155 {
3156     flag aSign, bSign;
3157 
3158     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3159          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3160        ) {
3161         float_raise( float_flag_invalid );
3162         return 0;
3163     }
3164     aSign = extractFloat64Sign( a );
3165     bSign = extractFloat64Sign( b );
3166     if ( aSign != bSign )
3167 	return aSign ||
3168 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3169 	      0 );
3170     return ( a == b ) ||
3171 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3172 
3173 }
3174 
3175 /*
3176 -------------------------------------------------------------------------------
3177 Returns 1 if the double-precision floating-point value `a' is less than
3178 the corresponding value `b', and 0 otherwise.  The comparison is performed
3179 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3180 -------------------------------------------------------------------------------
3181 */
3182 flag float64_lt( float64 a, float64 b )
3183 {
3184     flag aSign, bSign;
3185 
3186     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3187          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3188        ) {
3189         float_raise( float_flag_invalid );
3190         return 0;
3191     }
3192     aSign = extractFloat64Sign( a );
3193     bSign = extractFloat64Sign( b );
3194     if ( aSign != bSign )
3195 	return aSign &&
3196 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3197 	      0 );
3198     return ( a != b ) &&
3199 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3200 
3201 }
3202 
3203 #ifndef SOFTFLOAT_FOR_GCC
3204 /*
3205 -------------------------------------------------------------------------------
3206 Returns 1 if the double-precision floating-point value `a' is equal to the
3207 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3208 if either operand is a NaN.  Otherwise, the comparison is performed
3209 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3210 -------------------------------------------------------------------------------
3211 */
3212 flag float64_eq_signaling( float64 a, float64 b )
3213 {
3214 
3215     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3216          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3217        ) {
3218         float_raise( float_flag_invalid );
3219         return 0;
3220     }
3221     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3222 
3223 }
3224 
3225 /*
3226 -------------------------------------------------------------------------------
3227 Returns 1 if the double-precision floating-point value `a' is less than or
3228 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3229 cause an exception.  Otherwise, the comparison is performed according to the
3230 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3231 -------------------------------------------------------------------------------
3232 */
3233 flag float64_le_quiet( float64 a, float64 b )
3234 {
3235     flag aSign, bSign;
3236 
3237     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3238          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3239        ) {
3240         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3241             float_raise( float_flag_invalid );
3242         }
3243         return 0;
3244     }
3245     aSign = extractFloat64Sign( a );
3246     bSign = extractFloat64Sign( b );
3247     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3248     return ( a == b ) || ( aSign ^ ( a < b ) );
3249 
3250 }
3251 
3252 /*
3253 -------------------------------------------------------------------------------
3254 Returns 1 if the double-precision floating-point value `a' is less than
3255 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3256 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3257 Standard for Binary Floating-Point Arithmetic.
3258 -------------------------------------------------------------------------------
3259 */
3260 flag float64_lt_quiet( float64 a, float64 b )
3261 {
3262     flag aSign, bSign;
3263 
3264     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3265          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3266        ) {
3267         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3268             float_raise( float_flag_invalid );
3269         }
3270         return 0;
3271     }
3272     aSign = extractFloat64Sign( a );
3273     bSign = extractFloat64Sign( b );
3274     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3275     return ( a != b ) && ( aSign ^ ( a < b ) );
3276 
3277 }
3278 #endif
3279 
3280 #ifdef FLOATX80
3281 
3282 /*
3283 -------------------------------------------------------------------------------
3284 Returns the result of converting the extended double-precision floating-
3285 point value `a' to the 32-bit two's complement integer format.  The
3286 conversion is performed according to the IEC/IEEE Standard for Binary
3287 Floating-Point Arithmetic---which means in particular that the conversion
3288 is rounded according to the current rounding mode.  If `a' is a NaN, the
3289 largest positive integer is returned.  Otherwise, if the conversion
3290 overflows, the largest integer with the same sign as `a' is returned.
3291 -------------------------------------------------------------------------------
3292 */
3293 int32 floatx80_to_int32( floatx80 a )
3294 {
3295     flag aSign;
3296     int32 aExp, shiftCount;
3297     bits64 aSig;
3298 
3299     aSig = extractFloatx80Frac( a );
3300     aExp = extractFloatx80Exp( a );
3301     aSign = extractFloatx80Sign( a );
3302     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3303     shiftCount = 0x4037 - aExp;
3304     if ( shiftCount <= 0 ) shiftCount = 1;
3305     shift64RightJamming( aSig, shiftCount, &aSig );
3306     return roundAndPackInt32( aSign, aSig );
3307 
3308 }
3309 
3310 /*
3311 -------------------------------------------------------------------------------
3312 Returns the result of converting the extended double-precision floating-
3313 point value `a' to the 32-bit two's complement integer format.  The
3314 conversion is performed according to the IEC/IEEE Standard for Binary
3315 Floating-Point Arithmetic, except that the conversion is always rounded
3316 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3317 Otherwise, if the conversion overflows, the largest integer with the same
3318 sign as `a' is returned.
3319 -------------------------------------------------------------------------------
3320 */
3321 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3322 {
3323     flag aSign;
3324     int32 aExp, shiftCount;
3325     bits64 aSig, savedASig;
3326     int32 z;
3327 
3328     aSig = extractFloatx80Frac( a );
3329     aExp = extractFloatx80Exp( a );
3330     aSign = extractFloatx80Sign( a );
3331     if ( 0x401E < aExp ) {
3332         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3333         goto invalid;
3334     }
3335     else if ( aExp < 0x3FFF ) {
3336         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3337         return 0;
3338     }
3339     shiftCount = 0x403E - aExp;
3340     savedASig = aSig;
3341     aSig >>= shiftCount;
3342     z = aSig;
3343     if ( aSign ) z = - z;
3344     if ( ( z < 0 ) ^ aSign ) {
3345  invalid:
3346         float_raise( float_flag_invalid );
3347         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3348     }
3349     if ( ( aSig<<shiftCount ) != savedASig ) {
3350         float_exception_flags |= float_flag_inexact;
3351     }
3352     return z;
3353 
3354 }
3355 
3356 /*
3357 -------------------------------------------------------------------------------
3358 Returns the result of converting the extended double-precision floating-
3359 point value `a' to the 64-bit two's complement integer format.  The
3360 conversion is performed according to the IEC/IEEE Standard for Binary
3361 Floating-Point Arithmetic---which means in particular that the conversion
3362 is rounded according to the current rounding mode.  If `a' is a NaN,
3363 the largest positive integer is returned.  Otherwise, if the conversion
3364 overflows, the largest integer with the same sign as `a' is returned.
3365 -------------------------------------------------------------------------------
3366 */
3367 int64 floatx80_to_int64( floatx80 a )
3368 {
3369     flag aSign;
3370     int32 aExp, shiftCount;
3371     bits64 aSig, aSigExtra;
3372 
3373     aSig = extractFloatx80Frac( a );
3374     aExp = extractFloatx80Exp( a );
3375     aSign = extractFloatx80Sign( a );
3376     shiftCount = 0x403E - aExp;
3377     if ( shiftCount <= 0 ) {
3378         if ( shiftCount ) {
3379             float_raise( float_flag_invalid );
3380             if (    ! aSign
3381                  || (    ( aExp == 0x7FFF )
3382                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3383                ) {
3384                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3385             }
3386             return (sbits64) LIT64( 0x8000000000000000 );
3387         }
3388         aSigExtra = 0;
3389     }
3390     else {
3391         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3392     }
3393     return roundAndPackInt64( aSign, aSig, aSigExtra );
3394 
3395 }
3396 
3397 /*
3398 -------------------------------------------------------------------------------
3399 Returns the result of converting the extended double-precision floating-
3400 point value `a' to the 64-bit two's complement integer format.  The
3401 conversion is performed according to the IEC/IEEE Standard for Binary
3402 Floating-Point Arithmetic, except that the conversion is always rounded
3403 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3404 Otherwise, if the conversion overflows, the largest integer with the same
3405 sign as `a' is returned.
3406 -------------------------------------------------------------------------------
3407 */
3408 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3409 {
3410     flag aSign;
3411     int32 aExp, shiftCount;
3412     bits64 aSig;
3413     int64 z;
3414 
3415     aSig = extractFloatx80Frac( a );
3416     aExp = extractFloatx80Exp( a );
3417     aSign = extractFloatx80Sign( a );
3418     shiftCount = aExp - 0x403E;
3419     if ( 0 <= shiftCount ) {
3420         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3421         if ( ( a.high != 0xC03E ) || aSig ) {
3422             float_raise( float_flag_invalid );
3423             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3424                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3425             }
3426         }
3427         return (sbits64) LIT64( 0x8000000000000000 );
3428     }
3429     else if ( aExp < 0x3FFF ) {
3430         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3431         return 0;
3432     }
3433     z = aSig>>( - shiftCount );
3434     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3435         float_exception_flags |= float_flag_inexact;
3436     }
3437     if ( aSign ) z = - z;
3438     return z;
3439 
3440 }
3441 
3442 /*
3443 -------------------------------------------------------------------------------
3444 Returns the result of converting the extended double-precision floating-
3445 point value `a' to the single-precision floating-point format.  The
3446 conversion is performed according to the IEC/IEEE Standard for Binary
3447 Floating-Point Arithmetic.
3448 -------------------------------------------------------------------------------
3449 */
3450 float32 floatx80_to_float32( floatx80 a )
3451 {
3452     flag aSign;
3453     int32 aExp;
3454     bits64 aSig;
3455 
3456     aSig = extractFloatx80Frac( a );
3457     aExp = extractFloatx80Exp( a );
3458     aSign = extractFloatx80Sign( a );
3459     if ( aExp == 0x7FFF ) {
3460         if ( (bits64) ( aSig<<1 ) ) {
3461             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3462         }
3463         return packFloat32( aSign, 0xFF, 0 );
3464     }
3465     shift64RightJamming( aSig, 33, &aSig );
3466     if ( aExp || aSig ) aExp -= 0x3F81;
3467     return roundAndPackFloat32( aSign, aExp, aSig );
3468 
3469 }
3470 
3471 /*
3472 -------------------------------------------------------------------------------
3473 Returns the result of converting the extended double-precision floating-
3474 point value `a' to the double-precision floating-point format.  The
3475 conversion is performed according to the IEC/IEEE Standard for Binary
3476 Floating-Point Arithmetic.
3477 -------------------------------------------------------------------------------
3478 */
3479 float64 floatx80_to_float64( floatx80 a )
3480 {
3481     flag aSign;
3482     int32 aExp;
3483     bits64 aSig, zSig;
3484 
3485     aSig = extractFloatx80Frac( a );
3486     aExp = extractFloatx80Exp( a );
3487     aSign = extractFloatx80Sign( a );
3488     if ( aExp == 0x7FFF ) {
3489         if ( (bits64) ( aSig<<1 ) ) {
3490             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3491         }
3492         return packFloat64( aSign, 0x7FF, 0 );
3493     }
3494     shift64RightJamming( aSig, 1, &zSig );
3495     if ( aExp || aSig ) aExp -= 0x3C01;
3496     return roundAndPackFloat64( aSign, aExp, zSig );
3497 
3498 }
3499 
3500 #ifdef FLOAT128
3501 
3502 /*
3503 -------------------------------------------------------------------------------
3504 Returns the result of converting the extended double-precision floating-
3505 point value `a' to the quadruple-precision floating-point format.  The
3506 conversion is performed according to the IEC/IEEE Standard for Binary
3507 Floating-Point Arithmetic.
3508 -------------------------------------------------------------------------------
3509 */
3510 float128 floatx80_to_float128( floatx80 a )
3511 {
3512     flag aSign;
3513     int16 aExp;
3514     bits64 aSig, zSig0, zSig1;
3515 
3516     aSig = extractFloatx80Frac( a );
3517     aExp = extractFloatx80Exp( a );
3518     aSign = extractFloatx80Sign( a );
3519     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3520         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3521     }
3522     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3523     return packFloat128( aSign, aExp, zSig0, zSig1 );
3524 
3525 }
3526 
3527 #endif
3528 
3529 /*
3530 -------------------------------------------------------------------------------
3531 Rounds the extended double-precision floating-point value `a' to an integer,
3532 and returns the result as an extended quadruple-precision floating-point
3533 value.  The operation is performed according to the IEC/IEEE Standard for
3534 Binary Floating-Point Arithmetic.
3535 -------------------------------------------------------------------------------
3536 */
3537 floatx80 floatx80_round_to_int( floatx80 a )
3538 {
3539     flag aSign;
3540     int32 aExp;
3541     bits64 lastBitMask, roundBitsMask;
3542     int8 roundingMode;
3543     floatx80 z;
3544 
3545     aExp = extractFloatx80Exp( a );
3546     if ( 0x403E <= aExp ) {
3547         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3548             return propagateFloatx80NaN( a, a );
3549         }
3550         return a;
3551     }
3552     if ( aExp < 0x3FFF ) {
3553         if (    ( aExp == 0 )
3554              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3555             return a;
3556         }
3557         float_exception_flags |= float_flag_inexact;
3558         aSign = extractFloatx80Sign( a );
3559         switch ( float_rounding_mode ) {
3560          case float_round_nearest_even:
3561             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3562                ) {
3563                 return
3564                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3565             }
3566             break;
3567 	 case float_round_to_zero:
3568 	    break;
3569          case float_round_down:
3570             return
3571                   aSign ?
3572                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3573                 : packFloatx80( 0, 0, 0 );
3574          case float_round_up:
3575             return
3576                   aSign ? packFloatx80( 1, 0, 0 )
3577                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3578         }
3579         return packFloatx80( aSign, 0, 0 );
3580     }
3581     lastBitMask = 1;
3582     lastBitMask <<= 0x403E - aExp;
3583     roundBitsMask = lastBitMask - 1;
3584     z = a;
3585     roundingMode = float_rounding_mode;
3586     if ( roundingMode == float_round_nearest_even ) {
3587         z.low += lastBitMask>>1;
3588         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3589     }
3590     else if ( roundingMode != float_round_to_zero ) {
3591         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3592             z.low += roundBitsMask;
3593         }
3594     }
3595     z.low &= ~ roundBitsMask;
3596     if ( z.low == 0 ) {
3597         ++z.high;
3598         z.low = LIT64( 0x8000000000000000 );
3599     }
3600     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3601     return z;
3602 
3603 }
3604 
3605 /*
3606 -------------------------------------------------------------------------------
3607 Returns the result of adding the absolute values of the extended double-
3608 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3609 negated before being returned.  `zSign' is ignored if the result is a NaN.
3610 The addition is performed according to the IEC/IEEE Standard for Binary
3611 Floating-Point Arithmetic.
3612 -------------------------------------------------------------------------------
3613 */
3614 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3615 {
3616     int32 aExp, bExp, zExp;
3617     bits64 aSig, bSig, zSig0, zSig1;
3618     int32 expDiff;
3619 
3620     aSig = extractFloatx80Frac( a );
3621     aExp = extractFloatx80Exp( a );
3622     bSig = extractFloatx80Frac( b );
3623     bExp = extractFloatx80Exp( b );
3624     expDiff = aExp - bExp;
3625     if ( 0 < expDiff ) {
3626         if ( aExp == 0x7FFF ) {
3627             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3628             return a;
3629         }
3630         if ( bExp == 0 ) --expDiff;
3631         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3632         zExp = aExp;
3633     }
3634     else if ( expDiff < 0 ) {
3635         if ( bExp == 0x7FFF ) {
3636             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3637             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3638         }
3639         if ( aExp == 0 ) ++expDiff;
3640         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3641         zExp = bExp;
3642     }
3643     else {
3644         if ( aExp == 0x7FFF ) {
3645             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3646                 return propagateFloatx80NaN( a, b );
3647             }
3648             return a;
3649         }
3650         zSig1 = 0;
3651         zSig0 = aSig + bSig;
3652         if ( aExp == 0 ) {
3653             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3654             goto roundAndPack;
3655         }
3656         zExp = aExp;
3657         goto shiftRight1;
3658     }
3659     zSig0 = aSig + bSig;
3660     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3661  shiftRight1:
3662     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3663     zSig0 |= LIT64( 0x8000000000000000 );
3664     ++zExp;
3665  roundAndPack:
3666     return
3667         roundAndPackFloatx80(
3668             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3669 
3670 }
3671 
3672 /*
3673 -------------------------------------------------------------------------------
3674 Returns the result of subtracting the absolute values of the extended
3675 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3676 difference is negated before being returned.  `zSign' is ignored if the
3677 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3678 Standard for Binary Floating-Point Arithmetic.
3679 -------------------------------------------------------------------------------
3680 */
3681 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3682 {
3683     int32 aExp, bExp, zExp;
3684     bits64 aSig, bSig, zSig0, zSig1;
3685     int32 expDiff;
3686     floatx80 z;
3687 
3688     aSig = extractFloatx80Frac( a );
3689     aExp = extractFloatx80Exp( a );
3690     bSig = extractFloatx80Frac( b );
3691     bExp = extractFloatx80Exp( b );
3692     expDiff = aExp - bExp;
3693     if ( 0 < expDiff ) goto aExpBigger;
3694     if ( expDiff < 0 ) goto bExpBigger;
3695     if ( aExp == 0x7FFF ) {
3696         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3697             return propagateFloatx80NaN( a, b );
3698         }
3699         float_raise( float_flag_invalid );
3700         z.low = floatx80_default_nan_low;
3701         z.high = floatx80_default_nan_high;
3702         return z;
3703     }
3704     if ( aExp == 0 ) {
3705         aExp = 1;
3706         bExp = 1;
3707     }
3708     zSig1 = 0;
3709     if ( bSig < aSig ) goto aBigger;
3710     if ( aSig < bSig ) goto bBigger;
3711     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3712  bExpBigger:
3713     if ( bExp == 0x7FFF ) {
3714         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3715         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3716     }
3717     if ( aExp == 0 ) ++expDiff;
3718     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3719  bBigger:
3720     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3721     zExp = bExp;
3722     zSign ^= 1;
3723     goto normalizeRoundAndPack;
3724  aExpBigger:
3725     if ( aExp == 0x7FFF ) {
3726         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3727         return a;
3728     }
3729     if ( bExp == 0 ) --expDiff;
3730     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3731  aBigger:
3732     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3733     zExp = aExp;
3734  normalizeRoundAndPack:
3735     return
3736         normalizeRoundAndPackFloatx80(
3737             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3738 
3739 }
3740 
3741 /*
3742 -------------------------------------------------------------------------------
3743 Returns the result of adding the extended double-precision floating-point
3744 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3745 Standard for Binary Floating-Point Arithmetic.
3746 -------------------------------------------------------------------------------
3747 */
3748 floatx80 floatx80_add( floatx80 a, floatx80 b )
3749 {
3750     flag aSign, bSign;
3751 
3752     aSign = extractFloatx80Sign( a );
3753     bSign = extractFloatx80Sign( b );
3754     if ( aSign == bSign ) {
3755         return addFloatx80Sigs( a, b, aSign );
3756     }
3757     else {
3758         return subFloatx80Sigs( a, b, aSign );
3759     }
3760 
3761 }
3762 
3763 /*
3764 -------------------------------------------------------------------------------
3765 Returns the result of subtracting the extended double-precision floating-
3766 point values `a' and `b'.  The operation is performed according to the
3767 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3768 -------------------------------------------------------------------------------
3769 */
3770 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3771 {
3772     flag aSign, bSign;
3773 
3774     aSign = extractFloatx80Sign( a );
3775     bSign = extractFloatx80Sign( b );
3776     if ( aSign == bSign ) {
3777         return subFloatx80Sigs( a, b, aSign );
3778     }
3779     else {
3780         return addFloatx80Sigs( a, b, aSign );
3781     }
3782 
3783 }
3784 
3785 /*
3786 -------------------------------------------------------------------------------
3787 Returns the result of multiplying the extended double-precision floating-
3788 point values `a' and `b'.  The operation is performed according to the
3789 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3790 -------------------------------------------------------------------------------
3791 */
3792 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3793 {
3794     flag aSign, bSign, zSign;
3795     int32 aExp, bExp, zExp;
3796     bits64 aSig, bSig, zSig0, zSig1;
3797     floatx80 z;
3798 
3799     aSig = extractFloatx80Frac( a );
3800     aExp = extractFloatx80Exp( a );
3801     aSign = extractFloatx80Sign( a );
3802     bSig = extractFloatx80Frac( b );
3803     bExp = extractFloatx80Exp( b );
3804     bSign = extractFloatx80Sign( b );
3805     zSign = aSign ^ bSign;
3806     if ( aExp == 0x7FFF ) {
3807         if (    (bits64) ( aSig<<1 )
3808              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3809             return propagateFloatx80NaN( a, b );
3810         }
3811         if ( ( bExp | bSig ) == 0 ) goto invalid;
3812         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3813     }
3814     if ( bExp == 0x7FFF ) {
3815         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3816         if ( ( aExp | aSig ) == 0 ) {
3817  invalid:
3818             float_raise( float_flag_invalid );
3819             z.low = floatx80_default_nan_low;
3820             z.high = floatx80_default_nan_high;
3821             return z;
3822         }
3823         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3824     }
3825     if ( aExp == 0 ) {
3826         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3827         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3828     }
3829     if ( bExp == 0 ) {
3830         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3831         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3832     }
3833     zExp = aExp + bExp - 0x3FFE;
3834     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3835     if ( 0 < (sbits64) zSig0 ) {
3836         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3837         --zExp;
3838     }
3839     return
3840         roundAndPackFloatx80(
3841             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3842 
3843 }
3844 
3845 /*
3846 -------------------------------------------------------------------------------
3847 Returns the result of dividing the extended double-precision floating-point
3848 value `a' by the corresponding value `b'.  The operation is performed
3849 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3850 -------------------------------------------------------------------------------
3851 */
3852 floatx80 floatx80_div( floatx80 a, floatx80 b )
3853 {
3854     flag aSign, bSign, zSign;
3855     int32 aExp, bExp, zExp;
3856     bits64 aSig, bSig, zSig0, zSig1;
3857     bits64 rem0, rem1, rem2, term0, term1, term2;
3858     floatx80 z;
3859 
3860     aSig = extractFloatx80Frac( a );
3861     aExp = extractFloatx80Exp( a );
3862     aSign = extractFloatx80Sign( a );
3863     bSig = extractFloatx80Frac( b );
3864     bExp = extractFloatx80Exp( b );
3865     bSign = extractFloatx80Sign( b );
3866     zSign = aSign ^ bSign;
3867     if ( aExp == 0x7FFF ) {
3868         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3869         if ( bExp == 0x7FFF ) {
3870             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3871             goto invalid;
3872         }
3873         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3874     }
3875     if ( bExp == 0x7FFF ) {
3876         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3877         return packFloatx80( zSign, 0, 0 );
3878     }
3879     if ( bExp == 0 ) {
3880         if ( bSig == 0 ) {
3881             if ( ( aExp | aSig ) == 0 ) {
3882  invalid:
3883                 float_raise( float_flag_invalid );
3884                 z.low = floatx80_default_nan_low;
3885                 z.high = floatx80_default_nan_high;
3886                 return z;
3887             }
3888             float_raise( float_flag_divbyzero );
3889             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3890         }
3891         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3892     }
3893     if ( aExp == 0 ) {
3894         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3895         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3896     }
3897     zExp = aExp - bExp + 0x3FFE;
3898     rem1 = 0;
3899     if ( bSig <= aSig ) {
3900         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3901         ++zExp;
3902     }
3903     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3904     mul64To128( bSig, zSig0, &term0, &term1 );
3905     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3906     while ( (sbits64) rem0 < 0 ) {
3907         --zSig0;
3908         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3909     }
3910     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3911     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3912         mul64To128( bSig, zSig1, &term1, &term2 );
3913         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3914         while ( (sbits64) rem1 < 0 ) {
3915             --zSig1;
3916             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3917         }
3918         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3919     }
3920     return
3921         roundAndPackFloatx80(
3922             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3923 
3924 }
3925 
3926 /*
3927 -------------------------------------------------------------------------------
3928 Returns the remainder of the extended double-precision floating-point value
3929 `a' with respect to the corresponding value `b'.  The operation is performed
3930 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3931 -------------------------------------------------------------------------------
3932 */
3933 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3934 {
3935     flag aSign, bSign, zSign;
3936     int32 aExp, bExp, expDiff;
3937     bits64 aSig0, aSig1, bSig;
3938     bits64 q, term0, term1, alternateASig0, alternateASig1;
3939     floatx80 z;
3940 
3941     aSig0 = extractFloatx80Frac( a );
3942     aExp = extractFloatx80Exp( a );
3943     aSign = extractFloatx80Sign( a );
3944     bSig = extractFloatx80Frac( b );
3945     bExp = extractFloatx80Exp( b );
3946     bSign = extractFloatx80Sign( b );
3947     if ( aExp == 0x7FFF ) {
3948         if (    (bits64) ( aSig0<<1 )
3949              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3950             return propagateFloatx80NaN( a, b );
3951         }
3952         goto invalid;
3953     }
3954     if ( bExp == 0x7FFF ) {
3955         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3956         return a;
3957     }
3958     if ( bExp == 0 ) {
3959         if ( bSig == 0 ) {
3960  invalid:
3961             float_raise( float_flag_invalid );
3962             z.low = floatx80_default_nan_low;
3963             z.high = floatx80_default_nan_high;
3964             return z;
3965         }
3966         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3967     }
3968     if ( aExp == 0 ) {
3969         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3970         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3971     }
3972     bSig |= LIT64( 0x8000000000000000 );
3973     zSign = aSign;
3974     expDiff = aExp - bExp;
3975     aSig1 = 0;
3976     if ( expDiff < 0 ) {
3977         if ( expDiff < -1 ) return a;
3978         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3979         expDiff = 0;
3980     }
3981     q = ( bSig <= aSig0 );
3982     if ( q ) aSig0 -= bSig;
3983     expDiff -= 64;
3984     while ( 0 < expDiff ) {
3985         q = estimateDiv128To64( aSig0, aSig1, bSig );
3986         q = ( 2 < q ) ? q - 2 : 0;
3987         mul64To128( bSig, q, &term0, &term1 );
3988         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3989         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3990         expDiff -= 62;
3991     }
3992     expDiff += 64;
3993     if ( 0 < expDiff ) {
3994         q = estimateDiv128To64( aSig0, aSig1, bSig );
3995         q = ( 2 < q ) ? q - 2 : 0;
3996         q >>= 64 - expDiff;
3997         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3998         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3999         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4000         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4001             ++q;
4002             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4003         }
4004     }
4005     else {
4006         term1 = 0;
4007         term0 = bSig;
4008     }
4009     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4010     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4011          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4012               && ( q & 1 ) )
4013        ) {
4014         aSig0 = alternateASig0;
4015         aSig1 = alternateASig1;
4016         zSign = ! zSign;
4017     }
4018     return
4019         normalizeRoundAndPackFloatx80(
4020             80, zSign, bExp + expDiff, aSig0, aSig1 );
4021 
4022 }
4023 
4024 /*
4025 -------------------------------------------------------------------------------
4026 Returns the square root of the extended double-precision floating-point
4027 value `a'.  The operation is performed according to the IEC/IEEE Standard
4028 for Binary Floating-Point Arithmetic.
4029 -------------------------------------------------------------------------------
4030 */
4031 floatx80 floatx80_sqrt( floatx80 a )
4032 {
4033     flag aSign;
4034     int32 aExp, zExp;
4035     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4036     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4037     floatx80 z;
4038 
4039     aSig0 = extractFloatx80Frac( a );
4040     aExp = extractFloatx80Exp( a );
4041     aSign = extractFloatx80Sign( a );
4042     if ( aExp == 0x7FFF ) {
4043         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4044         if ( ! aSign ) return a;
4045         goto invalid;
4046     }
4047     if ( aSign ) {
4048         if ( ( aExp | aSig0 ) == 0 ) return a;
4049  invalid:
4050         float_raise( float_flag_invalid );
4051         z.low = floatx80_default_nan_low;
4052         z.high = floatx80_default_nan_high;
4053         return z;
4054     }
4055     if ( aExp == 0 ) {
4056         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4057         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4058     }
4059     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4060     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4061     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4062     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4063     doubleZSig0 = zSig0<<1;
4064     mul64To128( zSig0, zSig0, &term0, &term1 );
4065     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4066     while ( (sbits64) rem0 < 0 ) {
4067         --zSig0;
4068         doubleZSig0 -= 2;
4069         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4070     }
4071     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4072     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4073         if ( zSig1 == 0 ) zSig1 = 1;
4074         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4075         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4076         mul64To128( zSig1, zSig1, &term2, &term3 );
4077         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4078         while ( (sbits64) rem1 < 0 ) {
4079             --zSig1;
4080             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4081             term3 |= 1;
4082             term2 |= doubleZSig0;
4083             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4084         }
4085         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4086     }
4087     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4088     zSig0 |= doubleZSig0;
4089     return
4090         roundAndPackFloatx80(
4091             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4092 
4093 }
4094 
4095 /*
4096 -------------------------------------------------------------------------------
4097 Returns 1 if the extended double-precision floating-point value `a' is
4098 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4099 performed according to the IEC/IEEE Standard for Binary Floating-Point
4100 Arithmetic.
4101 -------------------------------------------------------------------------------
4102 */
4103 flag floatx80_eq( floatx80 a, floatx80 b )
4104 {
4105 
4106     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4107               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4108          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4109               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4110        ) {
4111         if (    floatx80_is_signaling_nan( a )
4112              || floatx80_is_signaling_nan( b ) ) {
4113             float_raise( float_flag_invalid );
4114         }
4115         return 0;
4116     }
4117     return
4118            ( a.low == b.low )
4119         && (    ( a.high == b.high )
4120              || (    ( a.low == 0 )
4121                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4122            );
4123 
4124 }
4125 
4126 /*
4127 -------------------------------------------------------------------------------
4128 Returns 1 if the extended double-precision floating-point value `a' is
4129 less than or equal to the corresponding value `b', and 0 otherwise.  The
4130 comparison is performed according to the IEC/IEEE Standard for Binary
4131 Floating-Point Arithmetic.
4132 -------------------------------------------------------------------------------
4133 */
4134 flag floatx80_le( floatx80 a, floatx80 b )
4135 {
4136     flag aSign, bSign;
4137 
4138     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4139               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4140          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4141               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4142        ) {
4143         float_raise( float_flag_invalid );
4144         return 0;
4145     }
4146     aSign = extractFloatx80Sign( a );
4147     bSign = extractFloatx80Sign( b );
4148     if ( aSign != bSign ) {
4149         return
4150                aSign
4151             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4152                  == 0 );
4153     }
4154     return
4155           aSign ? le128( b.high, b.low, a.high, a.low )
4156         : le128( a.high, a.low, b.high, b.low );
4157 
4158 }
4159 
4160 /*
4161 -------------------------------------------------------------------------------
4162 Returns 1 if the extended double-precision floating-point value `a' is
4163 less than the corresponding value `b', and 0 otherwise.  The comparison
4164 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4165 Arithmetic.
4166 -------------------------------------------------------------------------------
4167 */
4168 flag floatx80_lt( floatx80 a, floatx80 b )
4169 {
4170     flag aSign, bSign;
4171 
4172     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4173               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4174          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4175               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4176        ) {
4177         float_raise( float_flag_invalid );
4178         return 0;
4179     }
4180     aSign = extractFloatx80Sign( a );
4181     bSign = extractFloatx80Sign( b );
4182     if ( aSign != bSign ) {
4183         return
4184                aSign
4185             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4186                  != 0 );
4187     }
4188     return
4189           aSign ? lt128( b.high, b.low, a.high, a.low )
4190         : lt128( a.high, a.low, b.high, b.low );
4191 
4192 }
4193 
4194 /*
4195 -------------------------------------------------------------------------------
4196 Returns 1 if the extended double-precision floating-point value `a' is equal
4197 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4198 raised if either operand is a NaN.  Otherwise, the comparison is performed
4199 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4200 -------------------------------------------------------------------------------
4201 */
4202 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4203 {
4204 
4205     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4206               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4207          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4208               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4209        ) {
4210         float_raise( float_flag_invalid );
4211         return 0;
4212     }
4213     return
4214            ( a.low == b.low )
4215         && (    ( a.high == b.high )
4216              || (    ( a.low == 0 )
4217                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4218            );
4219 
4220 }
4221 
4222 /*
4223 -------------------------------------------------------------------------------
4224 Returns 1 if the extended double-precision floating-point value `a' is less
4225 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4226 do not cause an exception.  Otherwise, the comparison is performed according
4227 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4228 -------------------------------------------------------------------------------
4229 */
4230 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4231 {
4232     flag aSign, bSign;
4233 
4234     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4235               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4236          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4237               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4238        ) {
4239         if (    floatx80_is_signaling_nan( a )
4240              || floatx80_is_signaling_nan( b ) ) {
4241             float_raise( float_flag_invalid );
4242         }
4243         return 0;
4244     }
4245     aSign = extractFloatx80Sign( a );
4246     bSign = extractFloatx80Sign( b );
4247     if ( aSign != bSign ) {
4248         return
4249                aSign
4250             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4251                  == 0 );
4252     }
4253     return
4254           aSign ? le128( b.high, b.low, a.high, a.low )
4255         : le128( a.high, a.low, b.high, b.low );
4256 
4257 }
4258 
4259 /*
4260 -------------------------------------------------------------------------------
4261 Returns 1 if the extended double-precision floating-point value `a' is less
4262 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4263 an exception.  Otherwise, the comparison is performed according to the
4264 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4265 -------------------------------------------------------------------------------
4266 */
4267 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4268 {
4269     flag aSign, bSign;
4270 
4271     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4272               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4273          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4274               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4275        ) {
4276         if (    floatx80_is_signaling_nan( a )
4277              || floatx80_is_signaling_nan( b ) ) {
4278             float_raise( float_flag_invalid );
4279         }
4280         return 0;
4281     }
4282     aSign = extractFloatx80Sign( a );
4283     bSign = extractFloatx80Sign( b );
4284     if ( aSign != bSign ) {
4285         return
4286                aSign
4287             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4288                  != 0 );
4289     }
4290     return
4291           aSign ? lt128( b.high, b.low, a.high, a.low )
4292         : lt128( a.high, a.low, b.high, b.low );
4293 
4294 }
4295 
4296 #endif
4297 
4298 #ifdef FLOAT128
4299 
4300 /*
4301 -------------------------------------------------------------------------------
4302 Returns the result of converting the quadruple-precision floating-point
4303 value `a' to the 32-bit two's complement integer format.  The conversion
4304 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4305 Arithmetic---which means in particular that the conversion is rounded
4306 according to the current rounding mode.  If `a' is a NaN, the largest
4307 positive integer is returned.  Otherwise, if the conversion overflows, the
4308 largest integer with the same sign as `a' is returned.
4309 -------------------------------------------------------------------------------
4310 */
4311 int32 float128_to_int32( float128 a )
4312 {
4313     flag aSign;
4314     int32 aExp, shiftCount;
4315     bits64 aSig0, aSig1;
4316 
4317     aSig1 = extractFloat128Frac1( a );
4318     aSig0 = extractFloat128Frac0( a );
4319     aExp = extractFloat128Exp( a );
4320     aSign = extractFloat128Sign( a );
4321     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4322     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4323     aSig0 |= ( aSig1 != 0 );
4324     shiftCount = 0x4028 - aExp;
4325     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4326     return roundAndPackInt32( aSign, aSig0 );
4327 
4328 }
4329 
4330 /*
4331 -------------------------------------------------------------------------------
4332 Returns the result of converting the quadruple-precision floating-point
4333 value `a' to the 32-bit two's complement integer format.  The conversion
4334 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4335 Arithmetic, except that the conversion is always rounded toward zero.  If
4336 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4337 conversion overflows, the largest integer with the same sign as `a' is
4338 returned.
4339 -------------------------------------------------------------------------------
4340 */
4341 int32 float128_to_int32_round_to_zero( float128 a )
4342 {
4343     flag aSign;
4344     int32 aExp, shiftCount;
4345     bits64 aSig0, aSig1, savedASig;
4346     int32 z;
4347 
4348     aSig1 = extractFloat128Frac1( a );
4349     aSig0 = extractFloat128Frac0( a );
4350     aExp = extractFloat128Exp( a );
4351     aSign = extractFloat128Sign( a );
4352     aSig0 |= ( aSig1 != 0 );
4353     if ( 0x401E < aExp ) {
4354         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4355         goto invalid;
4356     }
4357     else if ( aExp < 0x3FFF ) {
4358         if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4359         return 0;
4360     }
4361     aSig0 |= LIT64( 0x0001000000000000 );
4362     shiftCount = 0x402F - aExp;
4363     savedASig = aSig0;
4364     aSig0 >>= shiftCount;
4365     z = aSig0;
4366     if ( aSign ) z = - z;
4367     if ( ( z < 0 ) ^ aSign ) {
4368  invalid:
4369         float_raise( float_flag_invalid );
4370         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4371     }
4372     if ( ( aSig0<<shiftCount ) != savedASig ) {
4373         float_exception_flags |= float_flag_inexact;
4374     }
4375     return z;
4376 
4377 }
4378 
4379 /*
4380 -------------------------------------------------------------------------------
4381 Returns the result of converting the quadruple-precision floating-point
4382 value `a' to the 64-bit two's complement integer format.  The conversion
4383 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4384 Arithmetic---which means in particular that the conversion is rounded
4385 according to the current rounding mode.  If `a' is a NaN, the largest
4386 positive integer is returned.  Otherwise, if the conversion overflows, the
4387 largest integer with the same sign as `a' is returned.
4388 -------------------------------------------------------------------------------
4389 */
4390 int64 float128_to_int64( float128 a )
4391 {
4392     flag aSign;
4393     int32 aExp, shiftCount;
4394     bits64 aSig0, aSig1;
4395 
4396     aSig1 = extractFloat128Frac1( a );
4397     aSig0 = extractFloat128Frac0( a );
4398     aExp = extractFloat128Exp( a );
4399     aSign = extractFloat128Sign( a );
4400     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4401     shiftCount = 0x402F - aExp;
4402     if ( shiftCount <= 0 ) {
4403         if ( 0x403E < aExp ) {
4404             float_raise( float_flag_invalid );
4405             if (    ! aSign
4406                  || (    ( aExp == 0x7FFF )
4407                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4408                     )
4409                ) {
4410                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4411             }
4412             return (sbits64) LIT64( 0x8000000000000000 );
4413         }
4414         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4415     }
4416     else {
4417         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4418     }
4419     return roundAndPackInt64( aSign, aSig0, aSig1 );
4420 
4421 }
4422 
4423 /*
4424 -------------------------------------------------------------------------------
4425 Returns the result of converting the quadruple-precision floating-point
4426 value `a' to the 64-bit two's complement integer format.  The conversion
4427 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4428 Arithmetic, except that the conversion is always rounded toward zero.
4429 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4430 the conversion overflows, the largest integer with the same sign as `a' is
4431 returned.
4432 -------------------------------------------------------------------------------
4433 */
4434 int64 float128_to_int64_round_to_zero( float128 a )
4435 {
4436     flag aSign;
4437     int32 aExp, shiftCount;
4438     bits64 aSig0, aSig1;
4439     int64 z;
4440 
4441     aSig1 = extractFloat128Frac1( a );
4442     aSig0 = extractFloat128Frac0( a );
4443     aExp = extractFloat128Exp( a );
4444     aSign = extractFloat128Sign( a );
4445     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4446     shiftCount = aExp - 0x402F;
4447     if ( 0 < shiftCount ) {
4448         if ( 0x403E <= aExp ) {
4449             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4450             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4451                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4452                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4453             }
4454             else {
4455                 float_raise( float_flag_invalid );
4456                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4457                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4458                 }
4459             }
4460             return (sbits64) LIT64( 0x8000000000000000 );
4461         }
4462         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4463         if ( (bits64) ( aSig1<<shiftCount ) ) {
4464             float_exception_flags |= float_flag_inexact;
4465         }
4466     }
4467     else {
4468         if ( aExp < 0x3FFF ) {
4469             if ( aExp | aSig0 | aSig1 ) {
4470                 float_exception_flags |= float_flag_inexact;
4471             }
4472             return 0;
4473         }
4474         z = aSig0>>( - shiftCount );
4475         if (    aSig1
4476              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4477             float_exception_flags |= float_flag_inexact;
4478         }
4479     }
4480     if ( aSign ) z = - z;
4481     return z;
4482 
4483 }
4484 
4485 /*
4486  * just like above - but do not care for overflow of signed results
4487  */
4488 uint64 float128_to_uint64_round_to_zero( float128 a )
4489 {
4490     flag aSign;
4491     int32 aExp, shiftCount;
4492     bits64 aSig0, aSig1;
4493     uint64 z;
4494 
4495     aSig1 = extractFloat128Frac1( a );
4496     aSig0 = extractFloat128Frac0( a );
4497     aExp = extractFloat128Exp( a );
4498     aSign = extractFloat128Sign( a );
4499     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4500     shiftCount = aExp - 0x402F;
4501     if ( 0 < shiftCount ) {
4502         if ( 0x403F <= aExp ) {
4503             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4504             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4505                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4506                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4507             }
4508             else {
4509                 float_raise( float_flag_invalid );
4510             }
4511             return LIT64( 0xFFFFFFFFFFFFFFFF );
4512         }
4513         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4514         if ( (bits64) ( aSig1<<shiftCount ) ) {
4515             float_exception_flags |= float_flag_inexact;
4516         }
4517     }
4518     else {
4519         if ( aExp < 0x3FFF ) {
4520             if ( aExp | aSig0 | aSig1 ) {
4521                 float_exception_flags |= float_flag_inexact;
4522             }
4523             return 0;
4524         }
4525         z = aSig0>>( - shiftCount );
4526         if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4527             float_exception_flags |= float_flag_inexact;
4528         }
4529     }
4530     if ( aSign ) z = - z;
4531     return z;
4532 
4533 }
4534 
4535 /*
4536 -------------------------------------------------------------------------------
4537 Returns the result of converting the quadruple-precision floating-point
4538 value `a' to the single-precision floating-point format.  The conversion
4539 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4540 Arithmetic.
4541 -------------------------------------------------------------------------------
4542 */
4543 float32 float128_to_float32( float128 a )
4544 {
4545     flag aSign;
4546     int32 aExp;
4547     bits64 aSig0, aSig1;
4548     bits32 zSig;
4549 
4550     aSig1 = extractFloat128Frac1( a );
4551     aSig0 = extractFloat128Frac0( a );
4552     aExp = extractFloat128Exp( a );
4553     aSign = extractFloat128Sign( a );
4554     if ( aExp == 0x7FFF ) {
4555         if ( aSig0 | aSig1 ) {
4556             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4557         }
4558         return packFloat32( aSign, 0xFF, 0 );
4559     }
4560     aSig0 |= ( aSig1 != 0 );
4561     shift64RightJamming( aSig0, 18, &aSig0 );
4562     zSig = aSig0;
4563     if ( aExp || zSig ) {
4564         zSig |= 0x40000000;
4565         aExp -= 0x3F81;
4566     }
4567     return roundAndPackFloat32( aSign, aExp, zSig );
4568 
4569 }
4570 
4571 /*
4572 -------------------------------------------------------------------------------
4573 Returns the result of converting the quadruple-precision floating-point
4574 value `a' to the double-precision floating-point format.  The conversion
4575 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4576 Arithmetic.
4577 -------------------------------------------------------------------------------
4578 */
4579 float64 float128_to_float64( float128 a )
4580 {
4581     flag aSign;
4582     int32 aExp;
4583     bits64 aSig0, aSig1;
4584 
4585     aSig1 = extractFloat128Frac1( a );
4586     aSig0 = extractFloat128Frac0( a );
4587     aExp = extractFloat128Exp( a );
4588     aSign = extractFloat128Sign( a );
4589     if ( aExp == 0x7FFF ) {
4590         if ( aSig0 | aSig1 ) {
4591             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4592         }
4593         return packFloat64( aSign, 0x7FF, 0 );
4594     }
4595     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4596     aSig0 |= ( aSig1 != 0 );
4597     if ( aExp || aSig0 ) {
4598         aSig0 |= LIT64( 0x4000000000000000 );
4599         aExp -= 0x3C01;
4600     }
4601     return roundAndPackFloat64( aSign, aExp, aSig0 );
4602 
4603 }
4604 
4605 #ifdef FLOATX80
4606 
4607 /*
4608 -------------------------------------------------------------------------------
4609 Returns the result of converting the quadruple-precision floating-point
4610 value `a' to the extended double-precision floating-point format.  The
4611 conversion is performed according to the IEC/IEEE Standard for Binary
4612 Floating-Point Arithmetic.
4613 -------------------------------------------------------------------------------
4614 */
4615 floatx80 float128_to_floatx80( float128 a )
4616 {
4617     flag aSign;
4618     int32 aExp;
4619     bits64 aSig0, aSig1;
4620 
4621     aSig1 = extractFloat128Frac1( a );
4622     aSig0 = extractFloat128Frac0( a );
4623     aExp = extractFloat128Exp( a );
4624     aSign = extractFloat128Sign( a );
4625     if ( aExp == 0x7FFF ) {
4626         if ( aSig0 | aSig1 ) {
4627             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4628         }
4629         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4630     }
4631     if ( aExp == 0 ) {
4632         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4633         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4634     }
4635     else {
4636         aSig0 |= LIT64( 0x0001000000000000 );
4637     }
4638     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4639     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4640 
4641 }
4642 
4643 #endif
4644 
4645 /*
4646 -------------------------------------------------------------------------------
4647 Rounds the quadruple-precision floating-point value `a' to an integer, and
4648 returns the result as a quadruple-precision floating-point value.  The
4649 operation is performed according to the IEC/IEEE Standard for Binary
4650 Floating-Point Arithmetic.
4651 -------------------------------------------------------------------------------
4652 */
4653 float128 float128_round_to_int( float128 a )
4654 {
4655     flag aSign;
4656     int32 aExp;
4657     bits64 lastBitMask, roundBitsMask;
4658     int8 roundingMode;
4659     float128 z;
4660 
4661     aExp = extractFloat128Exp( a );
4662     if ( 0x402F <= aExp ) {
4663         if ( 0x406F <= aExp ) {
4664             if (    ( aExp == 0x7FFF )
4665                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4666                ) {
4667                 return propagateFloat128NaN( a, a );
4668             }
4669             return a;
4670         }
4671         lastBitMask = 1;
4672         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4673         roundBitsMask = lastBitMask - 1;
4674         z = a;
4675         roundingMode = float_rounding_mode;
4676         if ( roundingMode == float_round_nearest_even ) {
4677             if ( lastBitMask ) {
4678                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4679                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4680             }
4681             else {
4682                 if ( (sbits64) z.low < 0 ) {
4683                     ++z.high;
4684                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4685                 }
4686             }
4687         }
4688         else if ( roundingMode != float_round_to_zero ) {
4689             if (   extractFloat128Sign( z )
4690                  ^ ( roundingMode == float_round_up ) ) {
4691                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4692             }
4693         }
4694         z.low &= ~ roundBitsMask;
4695     }
4696     else {
4697         if ( aExp < 0x3FFF ) {
4698             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4699             float_exception_flags |= float_flag_inexact;
4700             aSign = extractFloat128Sign( a );
4701             switch ( float_rounding_mode ) {
4702              case float_round_nearest_even:
4703                 if (    ( aExp == 0x3FFE )
4704                      && (   extractFloat128Frac0( a )
4705                           | extractFloat128Frac1( a ) )
4706                    ) {
4707                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4708                 }
4709                 break;
4710 	     case float_round_to_zero:
4711 		break;
4712              case float_round_down:
4713                 return
4714                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4715                     : packFloat128( 0, 0, 0, 0 );
4716              case float_round_up:
4717                 return
4718                       aSign ? packFloat128( 1, 0, 0, 0 )
4719                     : packFloat128( 0, 0x3FFF, 0, 0 );
4720             }
4721             return packFloat128( aSign, 0, 0, 0 );
4722         }
4723         lastBitMask = 1;
4724         lastBitMask <<= 0x402F - aExp;
4725         roundBitsMask = lastBitMask - 1;
4726         z.low = 0;
4727         z.high = a.high;
4728         roundingMode = float_rounding_mode;
4729         if ( roundingMode == float_round_nearest_even ) {
4730             z.high += lastBitMask>>1;
4731             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4732                 z.high &= ~ lastBitMask;
4733             }
4734         }
4735         else if ( roundingMode != float_round_to_zero ) {
4736             if (   extractFloat128Sign( z )
4737                  ^ ( roundingMode == float_round_up ) ) {
4738                 z.high |= ( a.low != 0 );
4739                 z.high += roundBitsMask;
4740             }
4741         }
4742         z.high &= ~ roundBitsMask;
4743     }
4744     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4745         float_exception_flags |= float_flag_inexact;
4746     }
4747     return z;
4748 
4749 }
4750 
4751 /*
4752 -------------------------------------------------------------------------------
4753 Returns the result of adding the absolute values of the quadruple-precision
4754 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4755 before being returned.  `zSign' is ignored if the result is a NaN.
4756 The addition is performed according to the IEC/IEEE Standard for Binary
4757 Floating-Point Arithmetic.
4758 -------------------------------------------------------------------------------
4759 */
4760 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4761 {
4762     int32 aExp, bExp, zExp;
4763     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4764     int32 expDiff;
4765 
4766     aSig1 = extractFloat128Frac1( a );
4767     aSig0 = extractFloat128Frac0( a );
4768     aExp = extractFloat128Exp( a );
4769     bSig1 = extractFloat128Frac1( b );
4770     bSig0 = extractFloat128Frac0( b );
4771     bExp = extractFloat128Exp( b );
4772     expDiff = aExp - bExp;
4773     if ( 0 < expDiff ) {
4774         if ( aExp == 0x7FFF ) {
4775             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4776             return a;
4777         }
4778         if ( bExp == 0 ) {
4779             --expDiff;
4780         }
4781         else {
4782             bSig0 |= LIT64( 0x0001000000000000 );
4783         }
4784         shift128ExtraRightJamming(
4785             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4786         zExp = aExp;
4787     }
4788     else if ( expDiff < 0 ) {
4789         if ( bExp == 0x7FFF ) {
4790             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4791             return packFloat128( zSign, 0x7FFF, 0, 0 );
4792         }
4793         if ( aExp == 0 ) {
4794             ++expDiff;
4795         }
4796         else {
4797             aSig0 |= LIT64( 0x0001000000000000 );
4798         }
4799         shift128ExtraRightJamming(
4800             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4801         zExp = bExp;
4802     }
4803     else {
4804         if ( aExp == 0x7FFF ) {
4805             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4806                 return propagateFloat128NaN( a, b );
4807             }
4808             return a;
4809         }
4810         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4811         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4812         zSig2 = 0;
4813         zSig0 |= LIT64( 0x0002000000000000 );
4814         zExp = aExp;
4815         goto shiftRight1;
4816     }
4817     aSig0 |= LIT64( 0x0001000000000000 );
4818     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4819     --zExp;
4820     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4821     ++zExp;
4822  shiftRight1:
4823     shift128ExtraRightJamming(
4824         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4825  roundAndPack:
4826     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4827 
4828 }
4829 
4830 /*
4831 -------------------------------------------------------------------------------
4832 Returns the result of subtracting the absolute values of the quadruple-
4833 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4834 difference is negated before being returned.  `zSign' is ignored if the
4835 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4836 Standard for Binary Floating-Point Arithmetic.
4837 -------------------------------------------------------------------------------
4838 */
4839 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4840 {
4841     int32 aExp, bExp, zExp;
4842     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4843     int32 expDiff;
4844     float128 z;
4845 
4846     aSig1 = extractFloat128Frac1( a );
4847     aSig0 = extractFloat128Frac0( a );
4848     aExp = extractFloat128Exp( a );
4849     bSig1 = extractFloat128Frac1( b );
4850     bSig0 = extractFloat128Frac0( b );
4851     bExp = extractFloat128Exp( b );
4852     expDiff = aExp - bExp;
4853     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4854     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4855     if ( 0 < expDiff ) goto aExpBigger;
4856     if ( expDiff < 0 ) goto bExpBigger;
4857     if ( aExp == 0x7FFF ) {
4858         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4859             return propagateFloat128NaN( a, b );
4860         }
4861         float_raise( float_flag_invalid );
4862         z.low = float128_default_nan_low;
4863         z.high = float128_default_nan_high;
4864         return z;
4865     }
4866     if ( aExp == 0 ) {
4867         aExp = 1;
4868         bExp = 1;
4869     }
4870     if ( bSig0 < aSig0 ) goto aBigger;
4871     if ( aSig0 < bSig0 ) goto bBigger;
4872     if ( bSig1 < aSig1 ) goto aBigger;
4873     if ( aSig1 < bSig1 ) goto bBigger;
4874     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4875  bExpBigger:
4876     if ( bExp == 0x7FFF ) {
4877         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4878         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4879     }
4880     if ( aExp == 0 ) {
4881         ++expDiff;
4882     }
4883     else {
4884         aSig0 |= LIT64( 0x4000000000000000 );
4885     }
4886     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4887     bSig0 |= LIT64( 0x4000000000000000 );
4888  bBigger:
4889     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4890     zExp = bExp;
4891     zSign ^= 1;
4892     goto normalizeRoundAndPack;
4893  aExpBigger:
4894     if ( aExp == 0x7FFF ) {
4895         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4896         return a;
4897     }
4898     if ( bExp == 0 ) {
4899         --expDiff;
4900     }
4901     else {
4902         bSig0 |= LIT64( 0x4000000000000000 );
4903     }
4904     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4905     aSig0 |= LIT64( 0x4000000000000000 );
4906  aBigger:
4907     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4908     zExp = aExp;
4909  normalizeRoundAndPack:
4910     --zExp;
4911     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4912 
4913 }
4914 
4915 /*
4916 -------------------------------------------------------------------------------
4917 Returns the result of adding the quadruple-precision floating-point values
4918 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4919 for Binary Floating-Point Arithmetic.
4920 -------------------------------------------------------------------------------
4921 */
4922 float128 float128_add( float128 a, float128 b )
4923 {
4924     flag aSign, bSign;
4925 
4926     aSign = extractFloat128Sign( a );
4927     bSign = extractFloat128Sign( b );
4928     if ( aSign == bSign ) {
4929         return addFloat128Sigs( a, b, aSign );
4930     }
4931     else {
4932         return subFloat128Sigs( a, b, aSign );
4933     }
4934 
4935 }
4936 
4937 /*
4938 -------------------------------------------------------------------------------
4939 Returns the result of subtracting the quadruple-precision floating-point
4940 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4941 Standard for Binary Floating-Point Arithmetic.
4942 -------------------------------------------------------------------------------
4943 */
4944 float128 float128_sub( float128 a, float128 b )
4945 {
4946     flag aSign, bSign;
4947 
4948     aSign = extractFloat128Sign( a );
4949     bSign = extractFloat128Sign( b );
4950     if ( aSign == bSign ) {
4951         return subFloat128Sigs( a, b, aSign );
4952     }
4953     else {
4954         return addFloat128Sigs( a, b, aSign );
4955     }
4956 
4957 }
4958 
4959 /*
4960 -------------------------------------------------------------------------------
4961 Returns the result of multiplying the quadruple-precision floating-point
4962 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4963 Standard for Binary Floating-Point Arithmetic.
4964 -------------------------------------------------------------------------------
4965 */
4966 float128 float128_mul( float128 a, float128 b )
4967 {
4968     flag aSign, bSign, zSign;
4969     int32 aExp, bExp, zExp;
4970     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4971     float128 z;
4972 
4973     aSig1 = extractFloat128Frac1( a );
4974     aSig0 = extractFloat128Frac0( a );
4975     aExp = extractFloat128Exp( a );
4976     aSign = extractFloat128Sign( a );
4977     bSig1 = extractFloat128Frac1( b );
4978     bSig0 = extractFloat128Frac0( b );
4979     bExp = extractFloat128Exp( b );
4980     bSign = extractFloat128Sign( b );
4981     zSign = aSign ^ bSign;
4982     if ( aExp == 0x7FFF ) {
4983         if (    ( aSig0 | aSig1 )
4984              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4985             return propagateFloat128NaN( a, b );
4986         }
4987         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4988         return packFloat128( zSign, 0x7FFF, 0, 0 );
4989     }
4990     if ( bExp == 0x7FFF ) {
4991         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4992         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4993  invalid:
4994             float_raise( float_flag_invalid );
4995             z.low = float128_default_nan_low;
4996             z.high = float128_default_nan_high;
4997             return z;
4998         }
4999         return packFloat128( zSign, 0x7FFF, 0, 0 );
5000     }
5001     if ( aExp == 0 ) {
5002         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5003         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5004     }
5005     if ( bExp == 0 ) {
5006         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5007         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5008     }
5009     zExp = aExp + bExp - 0x4000;
5010     aSig0 |= LIT64( 0x0001000000000000 );
5011     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5012     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5013     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5014     zSig2 |= ( zSig3 != 0 );
5015     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5016         shift128ExtraRightJamming(
5017             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5018         ++zExp;
5019     }
5020     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5021 
5022 }
5023 
5024 /*
5025 -------------------------------------------------------------------------------
5026 Returns the result of dividing the quadruple-precision floating-point value
5027 `a' by the corresponding value `b'.  The operation is performed according to
5028 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5029 -------------------------------------------------------------------------------
5030 */
5031 float128 float128_div( float128 a, float128 b )
5032 {
5033     flag aSign, bSign, zSign;
5034     int32 aExp, bExp, zExp;
5035     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5036     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5037     float128 z;
5038 
5039     aSig1 = extractFloat128Frac1( a );
5040     aSig0 = extractFloat128Frac0( a );
5041     aExp = extractFloat128Exp( a );
5042     aSign = extractFloat128Sign( a );
5043     bSig1 = extractFloat128Frac1( b );
5044     bSig0 = extractFloat128Frac0( b );
5045     bExp = extractFloat128Exp( b );
5046     bSign = extractFloat128Sign( b );
5047     zSign = aSign ^ bSign;
5048     if ( aExp == 0x7FFF ) {
5049         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5050         if ( bExp == 0x7FFF ) {
5051             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5052             goto invalid;
5053         }
5054         return packFloat128( zSign, 0x7FFF, 0, 0 );
5055     }
5056     if ( bExp == 0x7FFF ) {
5057         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5058         return packFloat128( zSign, 0, 0, 0 );
5059     }
5060     if ( bExp == 0 ) {
5061         if ( ( bSig0 | bSig1 ) == 0 ) {
5062             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5063  invalid:
5064                 float_raise( float_flag_invalid );
5065                 z.low = float128_default_nan_low;
5066                 z.high = float128_default_nan_high;
5067                 return z;
5068             }
5069             float_raise( float_flag_divbyzero );
5070             return packFloat128( zSign, 0x7FFF, 0, 0 );
5071         }
5072         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5073     }
5074     if ( aExp == 0 ) {
5075         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5076         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5077     }
5078     zExp = aExp - bExp + 0x3FFD;
5079     shortShift128Left(
5080         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5081     shortShift128Left(
5082         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5083     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5084         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5085         ++zExp;
5086     }
5087     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5088     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5089     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5090     while ( (sbits64) rem0 < 0 ) {
5091         --zSig0;
5092         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5093     }
5094     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5095     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5096         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5097         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5098         while ( (sbits64) rem1 < 0 ) {
5099             --zSig1;
5100             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5101         }
5102         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5103     }
5104     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5105     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5106 
5107 }
5108 
5109 /*
5110 -------------------------------------------------------------------------------
5111 Returns the remainder of the quadruple-precision floating-point value `a'
5112 with respect to the corresponding value `b'.  The operation is performed
5113 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5114 -------------------------------------------------------------------------------
5115 */
5116 float128 float128_rem( float128 a, float128 b )
5117 {
5118     flag aSign, bSign, zSign;
5119     int32 aExp, bExp, expDiff;
5120     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5121     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5122     sbits64 sigMean0;
5123     float128 z;
5124 
5125     aSig1 = extractFloat128Frac1( a );
5126     aSig0 = extractFloat128Frac0( a );
5127     aExp = extractFloat128Exp( a );
5128     aSign = extractFloat128Sign( a );
5129     bSig1 = extractFloat128Frac1( b );
5130     bSig0 = extractFloat128Frac0( b );
5131     bExp = extractFloat128Exp( b );
5132     bSign = extractFloat128Sign( b );
5133     if ( aExp == 0x7FFF ) {
5134         if (    ( aSig0 | aSig1 )
5135              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5136             return propagateFloat128NaN( a, b );
5137         }
5138         goto invalid;
5139     }
5140     if ( bExp == 0x7FFF ) {
5141         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5142         return a;
5143     }
5144     if ( bExp == 0 ) {
5145         if ( ( bSig0 | bSig1 ) == 0 ) {
5146  invalid:
5147             float_raise( float_flag_invalid );
5148             z.low = float128_default_nan_low;
5149             z.high = float128_default_nan_high;
5150             return z;
5151         }
5152         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5153     }
5154     if ( aExp == 0 ) {
5155         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5156         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5157     }
5158     expDiff = aExp - bExp;
5159     if ( expDiff < -1 ) return a;
5160     shortShift128Left(
5161         aSig0 | LIT64( 0x0001000000000000 ),
5162         aSig1,
5163         15 - ( expDiff < 0 ),
5164         &aSig0,
5165         &aSig1
5166     );
5167     shortShift128Left(
5168         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5169     q = le128( bSig0, bSig1, aSig0, aSig1 );
5170     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5171     expDiff -= 64;
5172     while ( 0 < expDiff ) {
5173         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5174         q = ( 4 < q ) ? q - 4 : 0;
5175         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5176         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5177         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5178         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5179         expDiff -= 61;
5180     }
5181     if ( -64 < expDiff ) {
5182         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5183         q = ( 4 < q ) ? q - 4 : 0;
5184         q >>= - expDiff;
5185         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5186         expDiff += 52;
5187         if ( expDiff < 0 ) {
5188             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5189         }
5190         else {
5191             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5192         }
5193         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5194         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5195     }
5196     else {
5197         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5198         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5199     }
5200     do {
5201         alternateASig0 = aSig0;
5202         alternateASig1 = aSig1;
5203         ++q;
5204         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5205     } while ( 0 <= (sbits64) aSig0 );
5206     add128(
5207         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5208     if (    ( sigMean0 < 0 )
5209          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5210         aSig0 = alternateASig0;
5211         aSig1 = alternateASig1;
5212     }
5213     zSign = ( (sbits64) aSig0 < 0 );
5214     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5215     return
5216         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5217 
5218 }
5219 
5220 /*
5221 -------------------------------------------------------------------------------
5222 Returns the square root of the quadruple-precision floating-point value `a'.
5223 The operation is performed according to the IEC/IEEE Standard for Binary
5224 Floating-Point Arithmetic.
5225 -------------------------------------------------------------------------------
5226 */
5227 float128 float128_sqrt( float128 a )
5228 {
5229     flag aSign;
5230     int32 aExp, zExp;
5231     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5232     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5233     float128 z;
5234 
5235     aSig1 = extractFloat128Frac1( a );
5236     aSig0 = extractFloat128Frac0( a );
5237     aExp = extractFloat128Exp( a );
5238     aSign = extractFloat128Sign( a );
5239     if ( aExp == 0x7FFF ) {
5240         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5241         if ( ! aSign ) return a;
5242         goto invalid;
5243     }
5244     if ( aSign ) {
5245         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5246  invalid:
5247         float_raise( float_flag_invalid );
5248         z.low = float128_default_nan_low;
5249         z.high = float128_default_nan_high;
5250         return z;
5251     }
5252     if ( aExp == 0 ) {
5253         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5254         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5255     }
5256     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5257     aSig0 |= LIT64( 0x0001000000000000 );
5258     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5259     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5260     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5261     doubleZSig0 = zSig0<<1;
5262     mul64To128( zSig0, zSig0, &term0, &term1 );
5263     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5264     while ( (sbits64) rem0 < 0 ) {
5265         --zSig0;
5266         doubleZSig0 -= 2;
5267         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5268     }
5269     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5270     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5271         if ( zSig1 == 0 ) zSig1 = 1;
5272         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5273         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5274         mul64To128( zSig1, zSig1, &term2, &term3 );
5275         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5276         while ( (sbits64) rem1 < 0 ) {
5277             --zSig1;
5278             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5279             term3 |= 1;
5280             term2 |= doubleZSig0;
5281             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5282         }
5283         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5284     }
5285     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5286     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5287 
5288 }
5289 
5290 /*
5291 -------------------------------------------------------------------------------
5292 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5293 the corresponding value `b', and 0 otherwise.  The comparison is performed
5294 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5295 -------------------------------------------------------------------------------
5296 */
5297 flag float128_eq( float128 a, float128 b )
5298 {
5299 
5300     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5301               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5302          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5303               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5304        ) {
5305         if (    float128_is_signaling_nan( a )
5306              || float128_is_signaling_nan( b ) ) {
5307             float_raise( float_flag_invalid );
5308         }
5309         return 0;
5310     }
5311     return
5312            ( a.low == b.low )
5313         && (    ( a.high == b.high )
5314              || (    ( a.low == 0 )
5315                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5316            );
5317 
5318 }
5319 
5320 /*
5321 -------------------------------------------------------------------------------
5322 Returns 1 if the quadruple-precision floating-point value `a' is less than
5323 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5324 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5325 Arithmetic.
5326 -------------------------------------------------------------------------------
5327 */
5328 flag float128_le( float128 a, float128 b )
5329 {
5330     flag aSign, bSign;
5331 
5332     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5333               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5334          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5335               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5336        ) {
5337         float_raise( float_flag_invalid );
5338         return 0;
5339     }
5340     aSign = extractFloat128Sign( a );
5341     bSign = extractFloat128Sign( b );
5342     if ( aSign != bSign ) {
5343         return
5344                aSign
5345             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5346                  == 0 );
5347     }
5348     return
5349           aSign ? le128( b.high, b.low, a.high, a.low )
5350         : le128( a.high, a.low, b.high, b.low );
5351 
5352 }
5353 
5354 /*
5355 -------------------------------------------------------------------------------
5356 Returns 1 if the quadruple-precision floating-point value `a' is less than
5357 the corresponding value `b', and 0 otherwise.  The comparison is performed
5358 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5359 -------------------------------------------------------------------------------
5360 */
5361 flag float128_lt( float128 a, float128 b )
5362 {
5363     flag aSign, bSign;
5364 
5365     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5366               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5367          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5368               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5369        ) {
5370         float_raise( float_flag_invalid );
5371         return 0;
5372     }
5373     aSign = extractFloat128Sign( a );
5374     bSign = extractFloat128Sign( b );
5375     if ( aSign != bSign ) {
5376         return
5377                aSign
5378             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5379                  != 0 );
5380     }
5381     return
5382           aSign ? lt128( b.high, b.low, a.high, a.low )
5383         : lt128( a.high, a.low, b.high, b.low );
5384 
5385 }
5386 
5387 /*
5388 -------------------------------------------------------------------------------
5389 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5390 the corresponding value `b', and 0 otherwise.  The invalid exception is
5391 raised if either operand is a NaN.  Otherwise, the comparison is performed
5392 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5393 -------------------------------------------------------------------------------
5394 */
5395 flag float128_eq_signaling( float128 a, float128 b )
5396 {
5397 
5398     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5399               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5400          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5401               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5402        ) {
5403         float_raise( float_flag_invalid );
5404         return 0;
5405     }
5406     return
5407            ( a.low == b.low )
5408         && (    ( a.high == b.high )
5409              || (    ( a.low == 0 )
5410                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5411            );
5412 
5413 }
5414 
5415 /*
5416 -------------------------------------------------------------------------------
5417 Returns 1 if the quadruple-precision floating-point value `a' is less than
5418 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5419 cause an exception.  Otherwise, the comparison is performed according to the
5420 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5421 -------------------------------------------------------------------------------
5422 */
5423 flag float128_le_quiet( float128 a, float128 b )
5424 {
5425     flag aSign, bSign;
5426 
5427     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5428               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5429          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5430               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5431        ) {
5432         if (    float128_is_signaling_nan( a )
5433              || float128_is_signaling_nan( b ) ) {
5434             float_raise( float_flag_invalid );
5435         }
5436         return 0;
5437     }
5438     aSign = extractFloat128Sign( a );
5439     bSign = extractFloat128Sign( b );
5440     if ( aSign != bSign ) {
5441         return
5442                aSign
5443             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5444                  == 0 );
5445     }
5446     return
5447           aSign ? le128( b.high, b.low, a.high, a.low )
5448         : le128( a.high, a.low, b.high, b.low );
5449 
5450 }
5451 
5452 /*
5453 -------------------------------------------------------------------------------
5454 Returns 1 if the quadruple-precision floating-point value `a' is less than
5455 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5456 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5457 Standard for Binary Floating-Point Arithmetic.
5458 -------------------------------------------------------------------------------
5459 */
5460 flag float128_lt_quiet( float128 a, float128 b )
5461 {
5462     flag aSign, bSign;
5463 
5464     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5465               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5466          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5467               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5468        ) {
5469         if (    float128_is_signaling_nan( a )
5470              || float128_is_signaling_nan( b ) ) {
5471             float_raise( float_flag_invalid );
5472         }
5473         return 0;
5474     }
5475     aSign = extractFloat128Sign( a );
5476     bSign = extractFloat128Sign( b );
5477     if ( aSign != bSign ) {
5478         return
5479                aSign
5480             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5481                  != 0 );
5482     }
5483     return
5484           aSign ? lt128( b.high, b.low, a.high, a.low )
5485         : lt128( a.high, a.low, b.high, b.low );
5486 
5487 }
5488 
5489 #endif
5490 
5491 
5492 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5493 
5494 /*
5495  * These two routines are not part of the original softfloat distribution.
5496  *
5497  * They are based on the corresponding conversions to integer but return
5498  * unsigned numbers instead since these functions are required by GCC.
5499  *
5500  * Added by Mark Brinicombe <mark@NetBSD.org>	27/09/97
5501  *
5502  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5503  */
5504 
5505 /*
5506 -------------------------------------------------------------------------------
5507 Returns the result of converting the double-precision floating-point value
5508 `a' to the 32-bit unsigned integer format.  The conversion is
5509 performed according to the IEC/IEEE Standard for Binary Floating-point
5510 Arithmetic, except that the conversion is always rounded toward zero.  If
5511 `a' is a NaN, the largest positive integer is returned.  If the conversion
5512 overflows, the largest integer positive is returned.
5513 -------------------------------------------------------------------------------
5514 */
5515 uint32 float64_to_uint32_round_to_zero( float64 a )
5516 {
5517     flag aSign;
5518     int16 aExp, shiftCount;
5519     bits64 aSig, savedASig;
5520     uint32 z;
5521 
5522     aSig = extractFloat64Frac( a );
5523     aExp = extractFloat64Exp( a );
5524     aSign = extractFloat64Sign( a );
5525 
5526     if (aSign) {
5527         float_raise( float_flag_invalid );
5528     	return(0);
5529     }
5530 
5531     if ( 0x41E < aExp ) {
5532         float_raise( float_flag_invalid );
5533         return 0xffffffff;
5534     }
5535     else if ( aExp < 0x3FF ) {
5536         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5537         return 0;
5538     }
5539     aSig |= LIT64( 0x0010000000000000 );
5540     shiftCount = 0x433 - aExp;
5541     savedASig = aSig;
5542     aSig >>= shiftCount;
5543     z = aSig;
5544     if ( ( aSig<<shiftCount ) != savedASig ) {
5545         float_exception_flags |= float_flag_inexact;
5546     }
5547     return z;
5548 
5549 }
5550 
5551 /*
5552 -------------------------------------------------------------------------------
5553 Returns the result of converting the single-precision floating-point value
5554 `a' to the 32-bit unsigned integer format.  The conversion is
5555 performed according to the IEC/IEEE Standard for Binary Floating-point
5556 Arithmetic, except that the conversion is always rounded toward zero.  If
5557 `a' is a NaN, the largest positive integer is returned.  If the conversion
5558 overflows, the largest positive integer is returned.
5559 -------------------------------------------------------------------------------
5560 */
5561 uint32 float32_to_uint32_round_to_zero( float32 a )
5562 {
5563     flag aSign;
5564     int16 aExp, shiftCount;
5565     bits32 aSig;
5566     uint32 z;
5567 
5568     aSig = extractFloat32Frac( a );
5569     aExp = extractFloat32Exp( a );
5570     aSign = extractFloat32Sign( a );
5571     shiftCount = aExp - 0x9E;
5572 
5573     if (aSign) {
5574         float_raise( float_flag_invalid );
5575     	return(0);
5576     }
5577     if ( 0 < shiftCount ) {
5578         float_raise( float_flag_invalid );
5579         return 0xFFFFFFFF;
5580     }
5581     else if ( aExp <= 0x7E ) {
5582         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5583         return 0;
5584     }
5585     aSig = ( aSig | 0x800000 )<<8;
5586     z = aSig>>( - shiftCount );
5587     if ( aSig<<( shiftCount & 31 ) ) {
5588         float_exception_flags |= float_flag_inexact;
5589     }
5590     return z;
5591 
5592 }
5593 
5594 #endif
5595