xref: /netbsd-src/sys/lib/libkern/softfloat.c (revision 3dad9f31b4a0e260e2f1594878e7d25d868b132f)
1 /* $NetBSD: softfloat.c,v 1.7 2020/09/02 03:45:54 thorpej Exp $ */
2 
3 /*
4  * This version hacked for use with gcc -msoft-float by bjh21.
5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6  *  itself).
7  */
8 
9 /*
10  * Things you may want to define:
11  *
12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
14  *   properly renamed.
15  */
16 
17 /*============================================================================
18 
19 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
20 Package, Release 2b.
21 
22 Written by John R. Hauser.  This work was made possible in part by the
23 International Computer Science Institute, located at Suite 600, 1947 Center
24 Street, Berkeley, California 94704.  Funding was partially provided by the
25 National Science Foundation under grant MIP-9311980.  The original version
26 of this code was written as part of a project to build a fixed-point vector
27 processor in collaboration with the University of California at Berkeley,
28 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
29 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
30 arithmetic/SoftFloat.html'.
31 
32 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
33 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
34 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
35 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
36 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
37 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
38 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
39 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
40 
41 Derivative works are acceptable, even for commercial purposes, so long as
42 (1) the source code for the derivative work includes prominent notice that
43 the work is derivative, and (2) the source code includes prominent notice with
44 these four paragraphs for those parts of this code that are retained.
45 
46 =============================================================================*/
47 
48 /* If you need this in a boot program, you have bigger problems... */
49 #ifndef _STANDALONE
50 
51 #include <sys/cdefs.h>
52 #if defined(LIBC_SCCS) && !defined(lint)
53 __RCSID("$NetBSD: softfloat.c,v 1.7 2020/09/02 03:45:54 thorpej Exp $");
54 #endif /* LIBC_SCCS and not lint */
55 
56 #ifdef SOFTFLOAT_FOR_GCC
57 #include "softfloat-for-gcc.h"
58 #endif
59 
60 #include "milieu.h"
61 #include "softfloat.h"
62 
63 /*
64  * Conversions between floats as stored in memory and floats as
65  * SoftFloat uses them
66  */
67 #ifndef FLOAT64_DEMANGLE
68 #define FLOAT64_DEMANGLE(a)	(a)
69 #endif
70 #ifndef FLOAT64_MANGLE
71 #define FLOAT64_MANGLE(a)	(a)
72 #endif
73 
74 /*----------------------------------------------------------------------------
75 | Floating-point rounding mode, extended double-precision rounding precision,
76 | and exception flags.
77 *----------------------------------------------------------------------------*/
78 /*
79  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
80  * 	Right now, it does not.  I've removed all other dynamic global
81  * 	variables. [ross]
82  */
83 #ifdef FLOATX80
84 int8 floatx80_rounding_precision = 80;
85 #endif
86 
87 /*----------------------------------------------------------------------------
88 | Primitive arithmetic functions, including multi-word arithmetic, and
89 | division and square root approximations.  (Can be specialized to target if
90 | desired.)
91 *----------------------------------------------------------------------------*/
92 #include "softfloat-macros.h"
93 
94 /*----------------------------------------------------------------------------
95 | Functions and definitions to determine:  (1) whether tininess for underflow
96 | is detected before or after rounding by default, (2) what (if anything)
97 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
98 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
99 | are propagated from function inputs to output.  These details are target-
100 | specific.
101 *----------------------------------------------------------------------------*/
102 #include "softfloat-specialize.h"
103 
104 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
105 /*----------------------------------------------------------------------------
106 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
107 | and 7, and returns the properly rounded 32-bit integer corresponding to the
108 | input.  If `zSign' is 1, the input is negated before being converted to an
109 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
110 | is simply rounded to an integer, with the inexact exception raised if the
111 | input cannot be represented exactly as an integer.  However, if the fixed-
112 | point input is too large, the invalid exception is raised and the largest
113 | positive or negative integer is returned.
114 *----------------------------------------------------------------------------*/
115 
116 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
117 {
118     int8 roundingMode;
119     flag roundNearestEven;
120     int8 roundIncrement, roundBits;
121     int32 z;
122 
123     roundingMode = float_rounding_mode();
124     roundNearestEven = ( roundingMode == float_round_nearest_even );
125     roundIncrement = 0x40;
126     if ( ! roundNearestEven ) {
127         if ( roundingMode == float_round_to_zero ) {
128             roundIncrement = 0;
129         }
130         else {
131             roundIncrement = 0x7F;
132             if ( zSign ) {
133                 if ( roundingMode == float_round_up ) roundIncrement = 0;
134             }
135             else {
136                 if ( roundingMode == float_round_down ) roundIncrement = 0;
137             }
138         }
139     }
140     roundBits = absZ & 0x7F;
141     absZ = ( absZ + roundIncrement )>>7;
142     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
143     z = absZ;
144     if ( zSign ) z = - z;
145     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
146         float_raise( float_flag_invalid );
147         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
148     }
149     if ( roundBits ) float_set_inexact();
150     return z;
151 
152 }
153 
154 /*----------------------------------------------------------------------------
155 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
156 | `absZ1', with binary point between bits 63 and 64 (between the input words),
157 | and returns the properly rounded 64-bit integer corresponding to the input.
158 | If `zSign' is 1, the input is negated before being converted to an integer.
159 | Ordinarily, the fixed-point input is simply rounded to an integer, with
160 | the inexact exception raised if the input cannot be represented exactly as
161 | an integer.  However, if the fixed-point input is too large, the invalid
162 | exception is raised and the largest positive or negative integer is
163 | returned.
164 *----------------------------------------------------------------------------*/
165 
166 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
167 {
168     int8 roundingMode;
169     flag roundNearestEven, increment;
170     int64 z;
171 
172     roundingMode = float_rounding_mode();
173     roundNearestEven = ( roundingMode == float_round_nearest_even );
174     increment = ( (sbits64) absZ1 < 0 );
175     if ( ! roundNearestEven ) {
176         if ( roundingMode == float_round_to_zero ) {
177             increment = 0;
178         }
179         else {
180             if ( zSign ) {
181                 increment = ( roundingMode == float_round_down ) && absZ1;
182             }
183             else {
184                 increment = ( roundingMode == float_round_up ) && absZ1;
185             }
186         }
187     }
188     if ( increment ) {
189         ++absZ0;
190         if ( absZ0 == 0 ) goto overflow;
191         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
192     }
193     z = absZ0;
194     if ( zSign ) z = - z;
195     if ( z && ( ( z < 0 ) ^ zSign ) ) {
196  overflow:
197         float_raise( float_flag_invalid );
198         return
199               zSign ? (sbits64) LIT64( 0x8000000000000000 )
200             : LIT64( 0x7FFFFFFFFFFFFFFF );
201     }
202     if ( absZ1 ) float_set_inexact();
203     return z;
204 
205 }
206 
207 /* same as above, but for unsigned values */
208 static uint64 roundAndPackUInt64( bits64 absZ0, bits64 absZ1 )
209 {
210     int8 roundingMode;
211     flag roundNearestEven, increment;
212     uint64 z;
213 
214     roundingMode = float_rounding_mode();
215     roundNearestEven = ( roundingMode == float_round_nearest_even );
216     increment = ( (sbits64) absZ1 < 0 );
217     if ( ! roundNearestEven ) {
218         if ( roundingMode == float_round_to_zero ) {
219             increment = 0;
220         }
221         else {
222             increment = ( roundingMode == float_round_up ) && absZ1;
223         }
224     }
225     if ( increment ) {
226         ++absZ0;
227         if ( absZ0 == 0 ) {
228             float_raise( float_flag_invalid );
229             return LIT64( 0x7FFFFFFFFFFFFFFF );
230 	}
231         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
232     }
233     z = absZ0;
234     if ( absZ1 ) float_set_inexact();
235     return z;
236 
237 }
238 #endif /* SOFTFLOAT_FOR_GCC */
239 
240 /*----------------------------------------------------------------------------
241 | Returns the fraction bits of the single-precision floating-point value `a'.
242 *----------------------------------------------------------------------------*/
243 
244 INLINE bits32 extractFloat32Frac( float32 a )
245 {
246 
247     return a & 0x007FFFFF;
248 
249 }
250 
251 /*----------------------------------------------------------------------------
252 | Returns the exponent bits of the single-precision floating-point value `a'.
253 *----------------------------------------------------------------------------*/
254 
255 INLINE int16 extractFloat32Exp( float32 a )
256 {
257 
258     return ( a>>23 ) & 0xFF;
259 
260 }
261 
262 /*----------------------------------------------------------------------------
263 | Returns the sign bit of the single-precision floating-point value `a'.
264 *----------------------------------------------------------------------------*/
265 
266 INLINE flag extractFloat32Sign( float32 a )
267 {
268 
269     return a>>31;
270 
271 }
272 
273 /*----------------------------------------------------------------------------
274 | Normalizes the subnormal single-precision floating-point value represented
275 | by the denormalized significand `aSig'.  The normalized exponent and
276 | significand are stored at the locations pointed to by `zExpPtr' and
277 | `zSigPtr', respectively.
278 *----------------------------------------------------------------------------*/
279 
280 static void
281  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
282 {
283     int8 shiftCount;
284 
285     shiftCount = countLeadingZeros32( aSig ) - 8;
286     *zSigPtr = aSig<<shiftCount;
287     *zExpPtr = 1 - shiftCount;
288 
289 }
290 
291 /*----------------------------------------------------------------------------
292 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
293 | single-precision floating-point value, returning the result.  After being
294 | shifted into the proper positions, the three fields are simply added
295 | together to form the result.  This means that any integer portion of `zSig'
296 | will be added into the exponent.  Since a properly normalized significand
297 | will have an integer portion equal to 1, the `zExp' input should be 1 less
298 | than the desired result exponent whenever `zSig' is a complete, normalized
299 | significand.
300 *----------------------------------------------------------------------------*/
301 
302 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
303 {
304 
305     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
306 
307 }
308 
309 /*----------------------------------------------------------------------------
310 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
311 | and significand `zSig', and returns the proper single-precision floating-
312 | point value corresponding to the abstract input.  Ordinarily, the abstract
313 | value is simply rounded and packed into the single-precision format, with
314 | the inexact exception raised if the abstract input cannot be represented
315 | exactly.  However, if the abstract value is too large, the overflow and
316 | inexact exceptions are raised and an infinity or maximal finite value is
317 | returned.  If the abstract value is too small, the input value is rounded to
318 | a subnormal number, and the underflow and inexact exceptions are raised if
319 | the abstract input cannot be represented exactly as a subnormal single-
320 | precision floating-point number.
321 |     The input significand `zSig' has its binary point between bits 30
322 | and 29, which is 7 bits to the left of the usual location.  This shifted
323 | significand must be normalized or smaller.  If `zSig' is not normalized,
324 | `zExp' must be 0; in that case, the result returned is a subnormal number,
325 | and it must not require rounding.  In the usual case that `zSig' is
326 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
327 | The handling of underflow and overflow follows the IEC/IEEE Standard for
328 | Binary Floating-Point Arithmetic.
329 *----------------------------------------------------------------------------*/
330 
331 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
332 {
333     int8 roundingMode;
334     flag roundNearestEven;
335     int8 roundIncrement, roundBits;
336     flag isTiny;
337 
338     roundingMode = float_rounding_mode();
339     roundNearestEven = ( roundingMode == float_round_nearest_even );
340     roundIncrement = 0x40;
341     if ( ! roundNearestEven ) {
342         if ( roundingMode == float_round_to_zero ) {
343             roundIncrement = 0;
344         }
345         else {
346             roundIncrement = 0x7F;
347             if ( zSign ) {
348                 if ( roundingMode == float_round_up ) roundIncrement = 0;
349             }
350             else {
351                 if ( roundingMode == float_round_down ) roundIncrement = 0;
352             }
353         }
354     }
355     roundBits = zSig & 0x7F;
356     if ( 0xFD <= (bits16) zExp ) {
357         if (    ( 0xFD < zExp )
358              || (    ( zExp == 0xFD )
359                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
360            ) {
361             float_raise( float_flag_overflow | float_flag_inexact );
362             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
363         }
364         if ( zExp < 0 ) {
365             isTiny =
366                    ( float_detect_tininess == float_tininess_before_rounding )
367                 || ( zExp < -1 )
368                 || ( zSig + roundIncrement < 0x80000000 );
369             shift32RightJamming( zSig, - zExp, &zSig );
370             zExp = 0;
371             roundBits = zSig & 0x7F;
372             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
373         }
374     }
375     if ( roundBits ) float_set_inexact();
376     zSig = ( zSig + roundIncrement )>>7;
377     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
378     if ( zSig == 0 ) zExp = 0;
379     return packFloat32( zSign, zExp, zSig );
380 
381 }
382 
383 /*----------------------------------------------------------------------------
384 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
385 | and significand `zSig', and returns the proper single-precision floating-
386 | point value corresponding to the abstract input.  This routine is just like
387 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
388 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
389 | floating-point exponent.
390 *----------------------------------------------------------------------------*/
391 
392 static float32
393  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
394 {
395     int8 shiftCount;
396 
397     shiftCount = countLeadingZeros32( zSig ) - 1;
398     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
399 
400 }
401 
402 /*----------------------------------------------------------------------------
403 | Returns the fraction bits of the double-precision floating-point value `a'.
404 *----------------------------------------------------------------------------*/
405 
406 INLINE bits64 extractFloat64Frac( float64 a )
407 {
408 
409     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
410 
411 }
412 
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the double-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416 
417 INLINE int16 extractFloat64Exp( float64 a )
418 {
419 
420     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
421 
422 }
423 /*----------------------------------------------------------------------------
424 | Returns the sign bit of the double-precision floating-point value `a'.
425 *----------------------------------------------------------------------------*/
426 
427 INLINE flag extractFloat64Sign( float64 a )
428 {
429 
430     return FLOAT64_DEMANGLE(a)>>63;
431 
432 }
433 
434 /*----------------------------------------------------------------------------
435 | Normalizes the subnormal double-precision floating-point value represented
436 | by the denormalized significand `aSig'.  The normalized exponent and
437 | significand are stored at the locations pointed to by `zExpPtr' and
438 | `zSigPtr', respectively.
439 *----------------------------------------------------------------------------*/
440 
441 static void
442  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
443 {
444     int8 shiftCount;
445 
446     shiftCount = countLeadingZeros64( aSig ) - 11;
447     *zSigPtr = aSig<<shiftCount;
448     *zExpPtr = 1 - shiftCount;
449 
450 }
451 
452 /*----------------------------------------------------------------------------
453 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
454 | double-precision floating-point value, returning the result.  After being
455 | shifted into the proper positions, the three fields are simply added
456 | together to form the result.  This means that any integer portion of `zSig'
457 | will be added into the exponent.  Since a properly normalized significand
458 | will have an integer portion equal to 1, the `zExp' input should be 1 less
459 | than the desired result exponent whenever `zSig' is a complete, normalized
460 | significand.
461 *----------------------------------------------------------------------------*/
462 
463 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
464 {
465 
466     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
467 			   ( ( (bits64) zExp )<<52 ) + zSig );
468 
469 }
470 
471 /*----------------------------------------------------------------------------
472 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
473 | and significand `zSig', and returns the proper double-precision floating-
474 | point value corresponding to the abstract input.  Ordinarily, the abstract
475 | value is simply rounded and packed into the double-precision format, with
476 | the inexact exception raised if the abstract input cannot be represented
477 | exactly.  However, if the abstract value is too large, the overflow and
478 | inexact exceptions are raised and an infinity or maximal finite value is
479 | returned.  If the abstract value is too small, the input value is rounded
480 | to a subnormal number, and the underflow and inexact exceptions are raised
481 | if the abstract input cannot be represented exactly as a subnormal double-
482 | precision floating-point number.
483 |     The input significand `zSig' has its binary point between bits 62
484 | and 61, which is 10 bits to the left of the usual location.  This shifted
485 | significand must be normalized or smaller.  If `zSig' is not normalized,
486 | `zExp' must be 0; in that case, the result returned is a subnormal number,
487 | and it must not require rounding.  In the usual case that `zSig' is
488 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
489 | The handling of underflow and overflow follows the IEC/IEEE Standard for
490 | Binary Floating-Point Arithmetic.
491 *----------------------------------------------------------------------------*/
492 
493 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
494 {
495     int8 roundingMode;
496     flag roundNearestEven;
497     int16 roundIncrement, roundBits;
498     flag isTiny;
499 
500     roundingMode = float_rounding_mode();
501     roundNearestEven = ( roundingMode == float_round_nearest_even );
502     roundIncrement = 0x200;
503     if ( ! roundNearestEven ) {
504         if ( roundingMode == float_round_to_zero ) {
505             roundIncrement = 0;
506         }
507         else {
508             roundIncrement = 0x3FF;
509             if ( zSign ) {
510                 if ( roundingMode == float_round_up ) roundIncrement = 0;
511             }
512             else {
513                 if ( roundingMode == float_round_down ) roundIncrement = 0;
514             }
515         }
516     }
517     roundBits = zSig & 0x3FF;
518     if ( 0x7FD <= (bits16) zExp ) {
519         if (    ( 0x7FD < zExp )
520              || (    ( zExp == 0x7FD )
521                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
522            ) {
523             float_raise( float_flag_overflow | float_flag_inexact );
524             return FLOAT64_MANGLE(
525 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
526 		( roundIncrement == 0 ));
527         }
528         if ( zExp < 0 ) {
529             isTiny =
530                    ( float_detect_tininess == float_tininess_before_rounding )
531                 || ( zExp < -1 )
532                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
533             shift64RightJamming( zSig, - zExp, &zSig );
534             zExp = 0;
535             roundBits = zSig & 0x3FF;
536             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
537         }
538     }
539     if ( roundBits ) float_set_inexact();
540     zSig = ( zSig + roundIncrement )>>10;
541     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
542     if ( zSig == 0 ) zExp = 0;
543     return packFloat64( zSign, zExp, zSig );
544 
545 }
546 
547 /*----------------------------------------------------------------------------
548 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
549 | and significand `zSig', and returns the proper double-precision floating-
550 | point value corresponding to the abstract input.  This routine is just like
551 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
552 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
553 | floating-point exponent.
554 *----------------------------------------------------------------------------*/
555 
556 static float64
557  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
558 {
559     int8 shiftCount;
560 
561     shiftCount = countLeadingZeros64( zSig ) - 1;
562     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
563 
564 }
565 
566 #ifdef FLOATX80
567 
568 /*----------------------------------------------------------------------------
569 | Returns the fraction bits of the extended double-precision floating-point
570 | value `a'.
571 *----------------------------------------------------------------------------*/
572 
573 INLINE bits64 extractFloatx80Frac( floatx80 a )
574 {
575 
576     return a.low;
577 
578 }
579 
580 /*----------------------------------------------------------------------------
581 | Returns the exponent bits of the extended double-precision floating-point
582 | value `a'.
583 *----------------------------------------------------------------------------*/
584 
585 INLINE int32 extractFloatx80Exp( floatx80 a )
586 {
587 
588     return a.high & 0x7FFF;
589 
590 }
591 
592 /*----------------------------------------------------------------------------
593 | Returns the sign bit of the extended double-precision floating-point value
594 | `a'.
595 *----------------------------------------------------------------------------*/
596 
597 INLINE flag extractFloatx80Sign( floatx80 a )
598 {
599 
600     return a.high>>15;
601 
602 }
603 
604 /*----------------------------------------------------------------------------
605 | Normalizes the subnormal extended double-precision floating-point value
606 | represented by the denormalized significand `aSig'.  The normalized exponent
607 | and significand are stored at the locations pointed to by `zExpPtr' and
608 | `zSigPtr', respectively.
609 *----------------------------------------------------------------------------*/
610 
611 static void
612  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
613 {
614     int8 shiftCount;
615 
616     shiftCount = countLeadingZeros64( aSig );
617     *zSigPtr = aSig<<shiftCount;
618     *zExpPtr = 1 - shiftCount;
619 
620 }
621 
622 /*----------------------------------------------------------------------------
623 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
624 | extended double-precision floating-point value, returning the result.
625 *----------------------------------------------------------------------------*/
626 
627 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
628 {
629     floatx80 z;
630 
631     z.low = zSig;
632     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
633     return z;
634 
635 }
636 
637 /*----------------------------------------------------------------------------
638 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
639 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
640 | and returns the proper extended double-precision floating-point value
641 | corresponding to the abstract input.  Ordinarily, the abstract value is
642 | rounded and packed into the extended double-precision format, with the
643 | inexact exception raised if the abstract input cannot be represented
644 | exactly.  However, if the abstract value is too large, the overflow and
645 | inexact exceptions are raised and an infinity or maximal finite value is
646 | returned.  If the abstract value is too small, the input value is rounded to
647 | a subnormal number, and the underflow and inexact exceptions are raised if
648 | the abstract input cannot be represented exactly as a subnormal extended
649 | double-precision floating-point number.
650 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
651 | number of bits as single or double precision, respectively.  Otherwise, the
652 | result is rounded to the full precision of the extended double-precision
653 | format.
654 |     The input significand must be normalized or smaller.  If the input
655 | significand is not normalized, `zExp' must be 0; in that case, the result
656 | returned is a subnormal number, and it must not require rounding.  The
657 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
658 | Floating-Point Arithmetic.
659 *----------------------------------------------------------------------------*/
660 
661 static floatx80
662  roundAndPackFloatx80(
663      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
664  )
665 {
666     int8 roundingMode;
667     flag roundNearestEven, increment, isTiny;
668     int64 roundIncrement, roundMask, roundBits;
669 
670     roundingMode = float_rounding_mode();
671     roundNearestEven = ( roundingMode == float_round_nearest_even );
672     if ( roundingPrecision == 80 ) goto precision80;
673     if ( roundingPrecision == 64 ) {
674         roundIncrement = LIT64( 0x0000000000000400 );
675         roundMask = LIT64( 0x00000000000007FF );
676     }
677     else if ( roundingPrecision == 32 ) {
678         roundIncrement = LIT64( 0x0000008000000000 );
679         roundMask = LIT64( 0x000000FFFFFFFFFF );
680     }
681     else {
682         goto precision80;
683     }
684     zSig0 |= ( zSig1 != 0 );
685     if ( ! roundNearestEven ) {
686         if ( roundingMode == float_round_to_zero ) {
687             roundIncrement = 0;
688         }
689         else {
690             roundIncrement = roundMask;
691             if ( zSign ) {
692                 if ( roundingMode == float_round_up ) roundIncrement = 0;
693             }
694             else {
695                 if ( roundingMode == float_round_down ) roundIncrement = 0;
696             }
697         }
698     }
699     roundBits = zSig0 & roundMask;
700     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
701         if (    ( 0x7FFE < zExp )
702              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
703            ) {
704             goto overflow;
705         }
706         if ( zExp <= 0 ) {
707             isTiny =
708                    ( float_detect_tininess == float_tininess_before_rounding )
709                 || ( zExp < 0 )
710                 || ( zSig0 <= zSig0 + roundIncrement );
711             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
712             zExp = 0;
713             roundBits = zSig0 & roundMask;
714             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
715             if ( roundBits ) float_set_inexact();
716             zSig0 += roundIncrement;
717             if ( (sbits64) zSig0 < 0 ) zExp = 1;
718             roundIncrement = roundMask + 1;
719             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
720                 roundMask |= roundIncrement;
721             }
722             zSig0 &= ~ roundMask;
723             return packFloatx80( zSign, zExp, zSig0 );
724         }
725     }
726     if ( roundBits ) float_set_inexact();
727     zSig0 += roundIncrement;
728     if ( zSig0 < roundIncrement ) {
729         ++zExp;
730         zSig0 = LIT64( 0x8000000000000000 );
731     }
732     roundIncrement = roundMask + 1;
733     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
734         roundMask |= roundIncrement;
735     }
736     zSig0 &= ~ roundMask;
737     if ( zSig0 == 0 ) zExp = 0;
738     return packFloatx80( zSign, zExp, zSig0 );
739  precision80:
740     increment = ( (sbits64) zSig1 < 0 );
741     if ( ! roundNearestEven ) {
742         if ( roundingMode == float_round_to_zero ) {
743             increment = 0;
744         }
745         else {
746             if ( zSign ) {
747                 increment = ( roundingMode == float_round_down ) && zSig1;
748             }
749             else {
750                 increment = ( roundingMode == float_round_up ) && zSig1;
751             }
752         }
753     }
754     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
755         if (    ( 0x7FFE < zExp )
756              || (    ( zExp == 0x7FFE )
757                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
758                   && increment
759                 )
760            ) {
761             roundMask = 0;
762  overflow:
763             float_raise( float_flag_overflow | float_flag_inexact );
764             if (    ( roundingMode == float_round_to_zero )
765                  || ( zSign && ( roundingMode == float_round_up ) )
766                  || ( ! zSign && ( roundingMode == float_round_down ) )
767                ) {
768                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
769             }
770             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
771         }
772         if ( zExp <= 0 ) {
773             isTiny =
774                    ( float_detect_tininess == float_tininess_before_rounding )
775                 || ( zExp < 0 )
776                 || ! increment
777                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
778             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
779             zExp = 0;
780             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
781             if ( zSig1 ) float_set_inexact();
782             if ( roundNearestEven ) {
783                 increment = ( (sbits64) zSig1 < 0 );
784             }
785             else {
786                 if ( zSign ) {
787                     increment = ( roundingMode == float_round_down ) && zSig1;
788                 }
789                 else {
790                     increment = ( roundingMode == float_round_up ) && zSig1;
791                 }
792             }
793             if ( increment ) {
794                 ++zSig0;
795                 zSig0 &=
796                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
797                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
798             }
799             return packFloatx80( zSign, zExp, zSig0 );
800         }
801     }
802     if ( zSig1 ) float_set_inexact();
803     if ( increment ) {
804         ++zSig0;
805         if ( zSig0 == 0 ) {
806             ++zExp;
807             zSig0 = LIT64( 0x8000000000000000 );
808         }
809         else {
810             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
811         }
812     }
813     else {
814         if ( zSig0 == 0 ) zExp = 0;
815     }
816     return packFloatx80( zSign, zExp, zSig0 );
817 
818 }
819 
820 /*----------------------------------------------------------------------------
821 | Takes an abstract floating-point value having sign `zSign', exponent
822 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
823 | and returns the proper extended double-precision floating-point value
824 | corresponding to the abstract input.  This routine is just like
825 | `roundAndPackFloatx80' except that the input significand does not have to be
826 | normalized.
827 *----------------------------------------------------------------------------*/
828 
829 static floatx80
830  normalizeRoundAndPackFloatx80(
831      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
832  )
833 {
834     int8 shiftCount;
835 
836     if ( zSig0 == 0 ) {
837         zSig0 = zSig1;
838         zSig1 = 0;
839         zExp -= 64;
840     }
841     shiftCount = countLeadingZeros64( zSig0 );
842     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
843     zExp -= shiftCount;
844     return
845         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
846 
847 }
848 
849 #endif
850 
851 #ifdef FLOAT128
852 
853 /*----------------------------------------------------------------------------
854 | Returns the least-significant 64 fraction bits of the quadruple-precision
855 | floating-point value `a'.
856 *----------------------------------------------------------------------------*/
857 
858 INLINE bits64 extractFloat128Frac1( float128 a )
859 {
860 
861     return a.low;
862 
863 }
864 
865 /*----------------------------------------------------------------------------
866 | Returns the most-significant 48 fraction bits of the quadruple-precision
867 | floating-point value `a'.
868 *----------------------------------------------------------------------------*/
869 
870 INLINE bits64 extractFloat128Frac0( float128 a )
871 {
872 
873     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
874 
875 }
876 
877 /*----------------------------------------------------------------------------
878 | Returns the exponent bits of the quadruple-precision floating-point value
879 | `a'.
880 *----------------------------------------------------------------------------*/
881 
882 INLINE int32 extractFloat128Exp( float128 a )
883 {
884 
885     return ( a.high>>48 ) & 0x7FFF;
886 
887 }
888 
889 
890 /*----------------------------------------------------------------------------
891 | Returns the sign bit of the quadruple-precision floating-point value `a'.
892 *----------------------------------------------------------------------------*/
893 
894 INLINE flag extractFloat128Sign( float128 a )
895 {
896 
897     return a.high>>63;
898 
899 }
900 
901 /*----------------------------------------------------------------------------
902 | Normalizes the subnormal quadruple-precision floating-point value
903 | represented by the denormalized significand formed by the concatenation of
904 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
905 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
906 | significand are stored at the location pointed to by `zSig0Ptr', and the
907 | least significant 64 bits of the normalized significand are stored at the
908 | location pointed to by `zSig1Ptr'.
909 *----------------------------------------------------------------------------*/
910 
911 static void
912  normalizeFloat128Subnormal(
913      bits64 aSig0,
914      bits64 aSig1,
915      int32 *zExpPtr,
916      bits64 *zSig0Ptr,
917      bits64 *zSig1Ptr
918  )
919 {
920     int8 shiftCount;
921 
922     if ( aSig0 == 0 ) {
923         shiftCount = countLeadingZeros64( aSig1 ) - 15;
924         if ( shiftCount < 0 ) {
925             *zSig0Ptr = aSig1>>( - shiftCount );
926             *zSig1Ptr = aSig1<<( shiftCount & 63 );
927         }
928         else {
929             *zSig0Ptr = aSig1<<shiftCount;
930             *zSig1Ptr = 0;
931         }
932         *zExpPtr = - shiftCount - 63;
933     }
934     else {
935         shiftCount = countLeadingZeros64( aSig0 ) - 15;
936         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
937         *zExpPtr = 1 - shiftCount;
938     }
939 
940 }
941 
942 /*----------------------------------------------------------------------------
943 | Packs the sign `zSign', the exponent `zExp', and the significand formed
944 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
945 | floating-point value, returning the result.  After being shifted into the
946 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
947 | added together to form the most significant 32 bits of the result.  This
948 | means that any integer portion of `zSig0' will be added into the exponent.
949 | Since a properly normalized significand will have an integer portion equal
950 | to 1, the `zExp' input should be 1 less than the desired result exponent
951 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
952 | significand.
953 *----------------------------------------------------------------------------*/
954 
955 INLINE float128
956  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
957 {
958     float128 z;
959 
960     z.low = zSig1;
961     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
962     return z;
963 
964 }
965 
966 /*----------------------------------------------------------------------------
967 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
968 | and extended significand formed by the concatenation of `zSig0', `zSig1',
969 | and `zSig2', and returns the proper quadruple-precision floating-point value
970 | corresponding to the abstract input.  Ordinarily, the abstract value is
971 | simply rounded and packed into the quadruple-precision format, with the
972 | inexact exception raised if the abstract input cannot be represented
973 | exactly.  However, if the abstract value is too large, the overflow and
974 | inexact exceptions are raised and an infinity or maximal finite value is
975 | returned.  If the abstract value is too small, the input value is rounded to
976 | a subnormal number, and the underflow and inexact exceptions are raised if
977 | the abstract input cannot be represented exactly as a subnormal quadruple-
978 | precision floating-point number.
979 |     The input significand must be normalized or smaller.  If the input
980 | significand is not normalized, `zExp' must be 0; in that case, the result
981 | returned is a subnormal number, and it must not require rounding.  In the
982 | usual case that the input significand is normalized, `zExp' must be 1 less
983 | than the ``true'' floating-point exponent.  The handling of underflow and
984 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
985 *----------------------------------------------------------------------------*/
986 
987 static float128
988  roundAndPackFloat128(
989      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
990 {
991     int8 roundingMode;
992     flag roundNearestEven, increment, isTiny;
993 
994     roundingMode = float_rounding_mode();
995     roundNearestEven = ( roundingMode == float_round_nearest_even );
996     increment = ( (sbits64) zSig2 < 0 );
997     if ( ! roundNearestEven ) {
998         if ( roundingMode == float_round_to_zero ) {
999             increment = 0;
1000         }
1001         else {
1002             if ( zSign ) {
1003                 increment = ( roundingMode == float_round_down ) && zSig2;
1004             }
1005             else {
1006                 increment = ( roundingMode == float_round_up ) && zSig2;
1007             }
1008         }
1009     }
1010     if ( 0x7FFD <= (bits32) zExp ) {
1011         if (    ( 0x7FFD < zExp )
1012              || (    ( zExp == 0x7FFD )
1013                   && eq128(
1014                          LIT64( 0x0001FFFFFFFFFFFF ),
1015                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1016                          zSig0,
1017                          zSig1
1018                      )
1019                   && increment
1020                 )
1021            ) {
1022             float_raise( float_flag_overflow | float_flag_inexact );
1023             if (    ( roundingMode == float_round_to_zero )
1024                  || ( zSign && ( roundingMode == float_round_up ) )
1025                  || ( ! zSign && ( roundingMode == float_round_down ) )
1026                ) {
1027                 return
1028                     packFloat128(
1029                         zSign,
1030                         0x7FFE,
1031                         LIT64( 0x0000FFFFFFFFFFFF ),
1032                         LIT64( 0xFFFFFFFFFFFFFFFF )
1033                     );
1034             }
1035             return packFloat128( zSign, 0x7FFF, 0, 0 );
1036         }
1037         if ( zExp < 0 ) {
1038             isTiny =
1039                    ( float_detect_tininess == float_tininess_before_rounding )
1040                 || ( zExp < -1 )
1041                 || ! increment
1042                 || lt128(
1043                        zSig0,
1044                        zSig1,
1045                        LIT64( 0x0001FFFFFFFFFFFF ),
1046                        LIT64( 0xFFFFFFFFFFFFFFFF )
1047                    );
1048             shift128ExtraRightJamming(
1049                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1050             zExp = 0;
1051             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1052             if ( roundNearestEven ) {
1053                 increment = ( (sbits64) zSig2 < 0 );
1054             }
1055             else {
1056                 if ( zSign ) {
1057                     increment = ( roundingMode == float_round_down ) && zSig2;
1058                 }
1059                 else {
1060                     increment = ( roundingMode == float_round_up ) && zSig2;
1061                 }
1062             }
1063         }
1064     }
1065     if ( zSig2 ) float_set_inexact();
1066     if ( increment ) {
1067         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1068         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1069     }
1070     else {
1071         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1072     }
1073     return packFloat128( zSign, zExp, zSig0, zSig1 );
1074 
1075 }
1076 
1077 /*----------------------------------------------------------------------------
1078 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1079 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1080 | returns the proper quadruple-precision floating-point value corresponding
1081 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1082 | except that the input significand has fewer bits and does not have to be
1083 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1084 | point exponent.
1085 *----------------------------------------------------------------------------*/
1086 
1087 static float128
1088  normalizeRoundAndPackFloat128(
1089      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1090 {
1091     int8 shiftCount;
1092     bits64 zSig2;
1093 
1094     if ( zSig0 == 0 ) {
1095         zSig0 = zSig1;
1096         zSig1 = 0;
1097         zExp -= 64;
1098     }
1099     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1100     if ( 0 <= shiftCount ) {
1101         zSig2 = 0;
1102         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1103     }
1104     else {
1105         shift128ExtraRightJamming(
1106             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1107     }
1108     zExp -= shiftCount;
1109     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1110 
1111 }
1112 
1113 #endif
1114 
1115 /*----------------------------------------------------------------------------
1116 | Returns the result of converting the 32-bit two's complement integer `a'
1117 | to the single-precision floating-point format.  The conversion is performed
1118 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1119 *----------------------------------------------------------------------------*/
1120 
1121 float32 int32_to_float32( int32 a )
1122 {
1123     flag zSign;
1124 
1125     if ( a == 0 ) return 0;
1126     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1127     zSign = ( a < 0 );
1128     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1129 
1130 }
1131 
1132 /*----------------------------------------------------------------------------
1133 | Returns the result of converting the 32-bit two's complement integer `a'
1134 | to the double-precision floating-point format.  The conversion is performed
1135 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1136 *----------------------------------------------------------------------------*/
1137 
1138 float64 int32_to_float64( int32 a )
1139 {
1140     flag zSign;
1141     uint32 absA;
1142     int8 shiftCount;
1143     bits64 zSig;
1144 
1145     if ( a == 0 ) return 0;
1146     zSign = ( a < 0 );
1147     absA = zSign ? - a : a;
1148     shiftCount = countLeadingZeros32( absA ) + 21;
1149     zSig = absA;
1150     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1151 
1152 }
1153 
1154 #ifdef FLOATX80
1155 
1156 /*----------------------------------------------------------------------------
1157 | Returns the result of converting the 32-bit two's complement integer `a'
1158 | to the extended double-precision floating-point format.  The conversion
1159 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1160 | Arithmetic.
1161 *----------------------------------------------------------------------------*/
1162 
1163 floatx80 int32_to_floatx80( int32 a )
1164 {
1165     flag zSign;
1166     uint32 absA;
1167     int8 shiftCount;
1168     bits64 zSig;
1169 
1170     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1171     zSign = ( a < 0 );
1172     absA = zSign ? - a : a;
1173     shiftCount = countLeadingZeros32( absA ) + 32;
1174     zSig = absA;
1175     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1176 
1177 }
1178 
1179 #endif
1180 
1181 #ifdef FLOAT128
1182 
1183 /*----------------------------------------------------------------------------
1184 | Returns the result of converting the 32-bit two's complement integer `a' to
1185 | the quadruple-precision floating-point format.  The conversion is performed
1186 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1187 *----------------------------------------------------------------------------*/
1188 
1189 float128 int32_to_float128( int32 a )
1190 {
1191     flag zSign;
1192     uint32 absA;
1193     int8 shiftCount;
1194     bits64 zSig0;
1195 
1196     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1197     zSign = ( a < 0 );
1198     absA = zSign ? - a : a;
1199     shiftCount = countLeadingZeros32( absA ) + 17;
1200     zSig0 = absA;
1201     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1202 
1203 }
1204 
1205 #endif
1206 
1207 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1208 /*----------------------------------------------------------------------------
1209 | Returns the result of converting the 64-bit two's complement integer `a'
1210 | to the single-precision floating-point format.  The conversion is performed
1211 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212 *----------------------------------------------------------------------------*/
1213 
1214 float32 int64_to_float32( int64 a )
1215 {
1216     flag zSign;
1217     uint64 absA;
1218     int8 shiftCount;
1219 
1220     if ( a == 0 ) return 0;
1221     zSign = ( a < 0 );
1222     absA = zSign ? - a : a;
1223     shiftCount = countLeadingZeros64( absA ) - 40;
1224     if ( 0 <= shiftCount ) {
1225         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1226     }
1227     else {
1228         shiftCount += 7;
1229         if ( shiftCount < 0 ) {
1230             shift64RightJamming( absA, - shiftCount, &absA );
1231         }
1232         else {
1233             absA <<= shiftCount;
1234         }
1235         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1236     }
1237 
1238 }
1239 
1240 /*----------------------------------------------------------------------------
1241 | Returns the result of converting the 64-bit two's complement integer `a'
1242 | to the double-precision floating-point format.  The conversion is performed
1243 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1244 *----------------------------------------------------------------------------*/
1245 
1246 float64 int64_to_float64( int64 a )
1247 {
1248     flag zSign;
1249 
1250     if ( a == 0 ) return 0;
1251     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1252         return packFloat64( 1, 0x43E, 0 );
1253     }
1254     zSign = ( a < 0 );
1255     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1256 
1257 }
1258 
1259 #ifdef FLOATX80
1260 
1261 /*----------------------------------------------------------------------------
1262 | Returns the result of converting the 64-bit two's complement integer `a'
1263 | to the extended double-precision floating-point format.  The conversion
1264 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1265 | Arithmetic.
1266 *----------------------------------------------------------------------------*/
1267 
1268 floatx80 int64_to_floatx80( int64 a )
1269 {
1270     flag zSign;
1271     uint64 absA;
1272     int8 shiftCount;
1273 
1274     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1275     zSign = ( a < 0 );
1276     absA = zSign ? - a : a;
1277     shiftCount = countLeadingZeros64( absA );
1278     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1279 
1280 }
1281 
1282 #endif
1283 
1284 #ifdef FLOAT128
1285 
1286 /*----------------------------------------------------------------------------
1287 | Returns the result of converting the 64-bit two's complement integer `a' to
1288 | the quadruple-precision floating-point format.  The conversion is performed
1289 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1290 *----------------------------------------------------------------------------*/
1291 
1292 float128 int64_to_float128( int64 a )
1293 {
1294     flag zSign;
1295     uint64 absA;
1296     int8 shiftCount;
1297     int32 zExp;
1298     bits64 zSig0, zSig1;
1299 
1300     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1301     zSign = ( a < 0 );
1302     absA = zSign ? - a : a;
1303     shiftCount = countLeadingZeros64( absA ) + 49;
1304     zExp = 0x406E - shiftCount;
1305     if ( 64 <= shiftCount ) {
1306         zSig1 = 0;
1307         zSig0 = absA;
1308         shiftCount -= 64;
1309     }
1310     else {
1311         zSig1 = absA;
1312         zSig0 = 0;
1313     }
1314     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1315     return packFloat128( zSign, zExp, zSig0, zSig1 );
1316 
1317 }
1318 
1319 #endif
1320 #endif /* !SOFTFLOAT_FOR_GCC */
1321 
1322 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1323 /*----------------------------------------------------------------------------
1324 | Returns the result of converting the single-precision floating-point value
1325 | `a' to the 32-bit two's complement integer format.  The conversion is
1326 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1327 | Arithmetic---which means in particular that the conversion is rounded
1328 | according to the current rounding mode.  If `a' is a NaN, the largest
1329 | positive integer is returned.  Otherwise, if the conversion overflows, the
1330 | largest integer with the same sign as `a' is returned.
1331 *----------------------------------------------------------------------------*/
1332 
1333 int32 float32_to_int32( float32 a )
1334 {
1335     flag aSign;
1336     int16 aExp, shiftCount;
1337     bits32 aSig;
1338     bits64 aSig64;
1339 
1340     aSig = extractFloat32Frac( a );
1341     aExp = extractFloat32Exp( a );
1342     aSign = extractFloat32Sign( a );
1343     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1344     if ( aExp ) aSig |= 0x00800000;
1345     shiftCount = 0xAF - aExp;
1346     aSig64 = aSig;
1347     aSig64 <<= 32;
1348     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1349     return roundAndPackInt32( aSign, aSig64 );
1350 
1351 }
1352 #endif /* !SOFTFLOAT_FOR_GCC */
1353 
1354 /*----------------------------------------------------------------------------
1355 | Returns the result of converting the single-precision floating-point value
1356 | `a' to the 32-bit two's complement integer format.  The conversion is
1357 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1358 | Arithmetic, except that the conversion is always rounded toward zero.
1359 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1360 | the conversion overflows, the largest integer with the same sign as `a' is
1361 | returned.
1362 *----------------------------------------------------------------------------*/
1363 
1364 int32 float32_to_int32_round_to_zero( float32 a )
1365 {
1366     flag aSign;
1367     int16 aExp, shiftCount;
1368     bits32 aSig;
1369     int32 z;
1370 
1371     aSig = extractFloat32Frac( a );
1372     aExp = extractFloat32Exp( a );
1373     aSign = extractFloat32Sign( a );
1374     shiftCount = aExp - 0x9E;
1375     if ( 0 <= shiftCount ) {
1376         if ( a != 0xCF000000 ) {
1377             float_raise( float_flag_invalid );
1378             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1379         }
1380         return (sbits32) 0x80000000;
1381     }
1382     else if ( aExp <= 0x7E ) {
1383         if ( aExp | aSig ) float_set_inexact();
1384         return 0;
1385     }
1386     aSig = ( aSig | 0x00800000 )<<8;
1387     z = aSig>>( - shiftCount );
1388     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1389         float_set_inexact();
1390     }
1391     if ( aSign ) z = - z;
1392     return z;
1393 
1394 }
1395 
1396 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1397 /*----------------------------------------------------------------------------
1398 | Returns the result of converting the single-precision floating-point value
1399 | `a' to the 64-bit two's complement integer format.  The conversion is
1400 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1401 | Arithmetic---which means in particular that the conversion is rounded
1402 | according to the current rounding mode.  If `a' is a NaN, the largest
1403 | positive integer is returned.  Otherwise, if the conversion overflows, the
1404 | largest integer with the same sign as `a' is returned.
1405 *----------------------------------------------------------------------------*/
1406 
1407 int64 float32_to_int64( float32 a )
1408 {
1409     flag aSign;
1410     int16 aExp, shiftCount;
1411     bits32 aSig;
1412     bits64 aSig64, aSigExtra;
1413 
1414     aSig = extractFloat32Frac( a );
1415     aExp = extractFloat32Exp( a );
1416     aSign = extractFloat32Sign( a );
1417     shiftCount = 0xBE - aExp;
1418     if ( shiftCount < 0 ) {
1419         float_raise( float_flag_invalid );
1420         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1421             return LIT64( 0x7FFFFFFFFFFFFFFF );
1422         }
1423         return (sbits64) LIT64( 0x8000000000000000 );
1424     }
1425     if ( aExp ) aSig |= 0x00800000;
1426     aSig64 = aSig;
1427     aSig64 <<= 40;
1428     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1429     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1430 
1431 }
1432 
1433 /*----------------------------------------------------------------------------
1434 | Returns the result of converting the single-precision floating-point value
1435 | `a' to the 64-bit two's complement integer format.  The conversion is
1436 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1437 | Arithmetic, except that the conversion is always rounded toward zero.  If
1438 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1439 | conversion overflows, the largest integer with the same sign as `a' is
1440 | returned.
1441 *----------------------------------------------------------------------------*/
1442 
1443 int64 float32_to_int64_round_to_zero( float32 a )
1444 {
1445     flag aSign;
1446     int16 aExp, shiftCount;
1447     bits32 aSig;
1448     bits64 aSig64;
1449     int64 z;
1450 
1451     aSig = extractFloat32Frac( a );
1452     aExp = extractFloat32Exp( a );
1453     aSign = extractFloat32Sign( a );
1454     shiftCount = aExp - 0xBE;
1455     if ( 0 <= shiftCount ) {
1456         if ( a != 0xDF000000 ) {
1457             float_raise( float_flag_invalid );
1458             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1459                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1460             }
1461         }
1462         return (sbits64) LIT64( 0x8000000000000000 );
1463     }
1464     else if ( aExp <= 0x7E ) {
1465         if ( aExp | aSig ) float_set_inexact();
1466         return 0;
1467     }
1468     aSig64 = aSig | 0x00800000;
1469     aSig64 <<= 40;
1470     z = aSig64>>( - shiftCount );
1471     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1472         float_set_inexact();
1473     }
1474     if ( aSign ) z = - z;
1475     return z;
1476 
1477 }
1478 #endif /* !SOFTFLOAT_FOR_GCC */
1479 
1480 /*----------------------------------------------------------------------------
1481 | Returns the result of converting the single-precision floating-point value
1482 | `a' to the double-precision floating-point format.  The conversion is
1483 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1484 | Arithmetic.
1485 *----------------------------------------------------------------------------*/
1486 
1487 float64 float32_to_float64( float32 a )
1488 {
1489     flag aSign;
1490     int16 aExp;
1491     bits32 aSig;
1492 
1493     aSig = extractFloat32Frac( a );
1494     aExp = extractFloat32Exp( a );
1495     aSign = extractFloat32Sign( a );
1496     if ( aExp == 0xFF ) {
1497         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1498         return packFloat64( aSign, 0x7FF, 0 );
1499     }
1500     if ( aExp == 0 ) {
1501         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1502         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1503         --aExp;
1504     }
1505     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1506 
1507 }
1508 
1509 #ifdef FLOATX80
1510 
1511 /*----------------------------------------------------------------------------
1512 | Returns the result of converting the single-precision floating-point value
1513 | `a' to the extended double-precision floating-point format.  The conversion
1514 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1515 | Arithmetic.
1516 *----------------------------------------------------------------------------*/
1517 
1518 floatx80 float32_to_floatx80( float32 a )
1519 {
1520     flag aSign;
1521     int16 aExp;
1522     bits32 aSig;
1523 
1524     aSig = extractFloat32Frac( a );
1525     aExp = extractFloat32Exp( a );
1526     aSign = extractFloat32Sign( a );
1527     if ( aExp == 0xFF ) {
1528         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1529         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1530     }
1531     if ( aExp == 0 ) {
1532         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1533         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1534     }
1535     aSig |= 0x00800000;
1536     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1537 
1538 }
1539 
1540 #endif
1541 
1542 #ifdef FLOAT128
1543 
1544 /*----------------------------------------------------------------------------
1545 | Returns the result of converting the single-precision floating-point value
1546 | `a' to the double-precision floating-point format.  The conversion is
1547 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1548 | Arithmetic.
1549 *----------------------------------------------------------------------------*/
1550 
1551 float128 float32_to_float128( float32 a )
1552 {
1553     flag aSign;
1554     int16 aExp;
1555     bits32 aSig;
1556 
1557     aSig = extractFloat32Frac( a );
1558     aExp = extractFloat32Exp( a );
1559     aSign = extractFloat32Sign( a );
1560     if ( aExp == 0xFF ) {
1561         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1562         return packFloat128( aSign, 0x7FFF, 0, 0 );
1563     }
1564     if ( aExp == 0 ) {
1565         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1566         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1567         --aExp;
1568     }
1569     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1570 
1571 }
1572 
1573 #endif
1574 
1575 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1576 /*----------------------------------------------------------------------------
1577 | Rounds the single-precision floating-point value `a' to an integer, and
1578 | returns the result as a single-precision floating-point value.  The
1579 | operation is performed according to the IEC/IEEE Standard for Binary
1580 | Floating-Point Arithmetic.
1581 *----------------------------------------------------------------------------*/
1582 
1583 float32 float32_round_to_int( float32 a )
1584 {
1585     flag aSign;
1586     int16 aExp;
1587     bits32 lastBitMask, roundBitsMask;
1588     int8 roundingMode;
1589     float32 z;
1590 
1591     aExp = extractFloat32Exp( a );
1592     if ( 0x96 <= aExp ) {
1593         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1594             return propagateFloat32NaN( a, a );
1595         }
1596         return a;
1597     }
1598     if ( aExp <= 0x7E ) {
1599         if ( (bits32) ( a<<1 ) == 0 ) return a;
1600         float_set_inexact();
1601         aSign = extractFloat32Sign( a );
1602         switch ( float_rounding_mode() ) {
1603          case float_round_nearest_even:
1604             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1605                 return packFloat32( aSign, 0x7F, 0 );
1606             }
1607             break;
1608          case float_round_down:
1609             return aSign ? 0xBF800000 : 0;
1610          case float_round_up:
1611             return aSign ? 0x80000000 : 0x3F800000;
1612         }
1613         return packFloat32( aSign, 0, 0 );
1614     }
1615     lastBitMask = 1;
1616     lastBitMask <<= 0x96 - aExp;
1617     roundBitsMask = lastBitMask - 1;
1618     z = a;
1619     roundingMode = float_rounding_mode();
1620     if ( roundingMode == float_round_nearest_even ) {
1621         z += lastBitMask>>1;
1622         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1623     }
1624     else if ( roundingMode != float_round_to_zero ) {
1625         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1626             z += roundBitsMask;
1627         }
1628     }
1629     z &= ~ roundBitsMask;
1630     if ( z != a ) float_set_inexact();
1631     return z;
1632 
1633 }
1634 #endif /* !SOFTFLOAT_FOR_GCC */
1635 
1636 /*----------------------------------------------------------------------------
1637 | Returns the result of adding the absolute values of the single-precision
1638 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1639 | before being returned.  `zSign' is ignored if the result is a NaN.
1640 | The addition is performed according to the IEC/IEEE Standard for Binary
1641 | Floating-Point Arithmetic.
1642 *----------------------------------------------------------------------------*/
1643 
1644 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1645 {
1646     int16 aExp, bExp, zExp;
1647     bits32 aSig, bSig, zSig;
1648     int16 expDiff;
1649 
1650     aSig = extractFloat32Frac( a );
1651     aExp = extractFloat32Exp( a );
1652     bSig = extractFloat32Frac( b );
1653     bExp = extractFloat32Exp( b );
1654     expDiff = aExp - bExp;
1655     aSig <<= 6;
1656     bSig <<= 6;
1657     if ( 0 < expDiff ) {
1658         if ( aExp == 0xFF ) {
1659             if ( aSig ) return propagateFloat32NaN( a, b );
1660             return a;
1661         }
1662         if ( bExp == 0 ) {
1663             --expDiff;
1664         }
1665         else {
1666             bSig |= 0x20000000;
1667         }
1668         shift32RightJamming( bSig, expDiff, &bSig );
1669         zExp = aExp;
1670     }
1671     else if ( expDiff < 0 ) {
1672         if ( bExp == 0xFF ) {
1673             if ( bSig ) return propagateFloat32NaN( a, b );
1674             return packFloat32( zSign, 0xFF, 0 );
1675         }
1676         if ( aExp == 0 ) {
1677             ++expDiff;
1678         }
1679         else {
1680             aSig |= 0x20000000;
1681         }
1682         shift32RightJamming( aSig, - expDiff, &aSig );
1683         zExp = bExp;
1684     }
1685     else {
1686         if ( aExp == 0xFF ) {
1687             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1688             return a;
1689         }
1690         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1691         zSig = 0x40000000 + aSig + bSig;
1692         zExp = aExp;
1693         goto roundAndPack;
1694     }
1695     aSig |= 0x20000000;
1696     zSig = ( aSig + bSig )<<1;
1697     --zExp;
1698     if ( (sbits32) zSig < 0 ) {
1699         zSig = aSig + bSig;
1700         ++zExp;
1701     }
1702  roundAndPack:
1703     return roundAndPackFloat32( zSign, zExp, zSig );
1704 
1705 }
1706 
1707 /*----------------------------------------------------------------------------
1708 | Returns the result of subtracting the absolute values of the single-
1709 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1710 | difference is negated before being returned.  `zSign' is ignored if the
1711 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1712 | Standard for Binary Floating-Point Arithmetic.
1713 *----------------------------------------------------------------------------*/
1714 
1715 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1716 {
1717     int16 aExp, bExp, zExp;
1718     bits32 aSig, bSig, zSig;
1719     int16 expDiff;
1720 
1721     aSig = extractFloat32Frac( a );
1722     aExp = extractFloat32Exp( a );
1723     bSig = extractFloat32Frac( b );
1724     bExp = extractFloat32Exp( b );
1725     expDiff = aExp - bExp;
1726     aSig <<= 7;
1727     bSig <<= 7;
1728     if ( 0 < expDiff ) goto aExpBigger;
1729     if ( expDiff < 0 ) goto bExpBigger;
1730     if ( aExp == 0xFF ) {
1731         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1732         float_raise( float_flag_invalid );
1733         return float32_default_nan;
1734     }
1735     if ( aExp == 0 ) {
1736         aExp = 1;
1737         bExp = 1;
1738     }
1739     if ( bSig < aSig ) goto aBigger;
1740     if ( aSig < bSig ) goto bBigger;
1741     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1742  bExpBigger:
1743     if ( bExp == 0xFF ) {
1744         if ( bSig ) return propagateFloat32NaN( a, b );
1745         return packFloat32( zSign ^ 1, 0xFF, 0 );
1746     }
1747     if ( aExp == 0 ) {
1748         ++expDiff;
1749     }
1750     else {
1751         aSig |= 0x40000000;
1752     }
1753     shift32RightJamming( aSig, - expDiff, &aSig );
1754     bSig |= 0x40000000;
1755  bBigger:
1756     zSig = bSig - aSig;
1757     zExp = bExp;
1758     zSign ^= 1;
1759     goto normalizeRoundAndPack;
1760  aExpBigger:
1761     if ( aExp == 0xFF ) {
1762         if ( aSig ) return propagateFloat32NaN( a, b );
1763         return a;
1764     }
1765     if ( bExp == 0 ) {
1766         --expDiff;
1767     }
1768     else {
1769         bSig |= 0x40000000;
1770     }
1771     shift32RightJamming( bSig, expDiff, &bSig );
1772     aSig |= 0x40000000;
1773  aBigger:
1774     zSig = aSig - bSig;
1775     zExp = aExp;
1776  normalizeRoundAndPack:
1777     --zExp;
1778     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1779 
1780 }
1781 
1782 /*----------------------------------------------------------------------------
1783 | Returns the result of adding the single-precision floating-point values `a'
1784 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1785 | Binary Floating-Point Arithmetic.
1786 *----------------------------------------------------------------------------*/
1787 
1788 float32 float32_add( float32 a, float32 b )
1789 {
1790     flag aSign, bSign;
1791 
1792     aSign = extractFloat32Sign( a );
1793     bSign = extractFloat32Sign( b );
1794     if ( aSign == bSign ) {
1795         return addFloat32Sigs( a, b, aSign );
1796     }
1797     else {
1798         return subFloat32Sigs( a, b, aSign );
1799     }
1800 
1801 }
1802 
1803 /*----------------------------------------------------------------------------
1804 | Returns the result of subtracting the single-precision floating-point values
1805 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1806 | for Binary Floating-Point Arithmetic.
1807 *----------------------------------------------------------------------------*/
1808 
1809 float32 float32_sub( float32 a, float32 b )
1810 {
1811     flag aSign, bSign;
1812 
1813     aSign = extractFloat32Sign( a );
1814     bSign = extractFloat32Sign( b );
1815     if ( aSign == bSign ) {
1816         return subFloat32Sigs( a, b, aSign );
1817     }
1818     else {
1819         return addFloat32Sigs( a, b, aSign );
1820     }
1821 
1822 }
1823 
1824 /*----------------------------------------------------------------------------
1825 | Returns the result of multiplying the single-precision floating-point values
1826 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1827 | for Binary Floating-Point Arithmetic.
1828 *----------------------------------------------------------------------------*/
1829 
1830 float32 float32_mul( float32 a, float32 b )
1831 {
1832     flag aSign, bSign, zSign;
1833     int16 aExp, bExp, zExp;
1834     bits32 aSig, bSig;
1835     bits64 zSig64;
1836     bits32 zSig;
1837 
1838     aSig = extractFloat32Frac( a );
1839     aExp = extractFloat32Exp( a );
1840     aSign = extractFloat32Sign( a );
1841     bSig = extractFloat32Frac( b );
1842     bExp = extractFloat32Exp( b );
1843     bSign = extractFloat32Sign( b );
1844     zSign = aSign ^ bSign;
1845     if ( aExp == 0xFF ) {
1846         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1847             return propagateFloat32NaN( a, b );
1848         }
1849         if ( ( bExp | bSig ) == 0 ) {
1850             float_raise( float_flag_invalid );
1851             return float32_default_nan;
1852         }
1853         return packFloat32( zSign, 0xFF, 0 );
1854     }
1855     if ( bExp == 0xFF ) {
1856         if ( bSig ) return propagateFloat32NaN( a, b );
1857         if ( ( aExp | aSig ) == 0 ) {
1858             float_raise( float_flag_invalid );
1859             return float32_default_nan;
1860         }
1861         return packFloat32( zSign, 0xFF, 0 );
1862     }
1863     if ( aExp == 0 ) {
1864         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1865         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1866     }
1867     if ( bExp == 0 ) {
1868         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1869         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1870     }
1871     zExp = aExp + bExp - 0x7F;
1872     aSig = ( aSig | 0x00800000 )<<7;
1873     bSig = ( bSig | 0x00800000 )<<8;
1874     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1875     zSig = zSig64;
1876     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1877         zSig <<= 1;
1878         --zExp;
1879     }
1880     return roundAndPackFloat32( zSign, zExp, zSig );
1881 
1882 }
1883 
1884 /*----------------------------------------------------------------------------
1885 | Returns the result of dividing the single-precision floating-point value `a'
1886 | by the corresponding value `b'.  The operation is performed according to the
1887 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1888 *----------------------------------------------------------------------------*/
1889 
1890 float32 float32_div( float32 a, float32 b )
1891 {
1892     flag aSign, bSign, zSign;
1893     int16 aExp, bExp, zExp;
1894     bits32 aSig, bSig, zSig;
1895 
1896     aSig = extractFloat32Frac( a );
1897     aExp = extractFloat32Exp( a );
1898     aSign = extractFloat32Sign( a );
1899     bSig = extractFloat32Frac( b );
1900     bExp = extractFloat32Exp( b );
1901     bSign = extractFloat32Sign( b );
1902     zSign = aSign ^ bSign;
1903     if ( aExp == 0xFF ) {
1904         if ( aSig ) return propagateFloat32NaN( a, b );
1905         if ( bExp == 0xFF ) {
1906             if ( bSig ) return propagateFloat32NaN( a, b );
1907             float_raise( float_flag_invalid );
1908             return float32_default_nan;
1909         }
1910         return packFloat32( zSign, 0xFF, 0 );
1911     }
1912     if ( bExp == 0xFF ) {
1913         if ( bSig ) return propagateFloat32NaN( a, b );
1914         return packFloat32( zSign, 0, 0 );
1915     }
1916     if ( bExp == 0 ) {
1917         if ( bSig == 0 ) {
1918             if ( ( aExp | aSig ) == 0 ) {
1919                 float_raise( float_flag_invalid );
1920                 return float32_default_nan;
1921             }
1922             float_raise( float_flag_divbyzero );
1923             return packFloat32( zSign, 0xFF, 0 );
1924         }
1925         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1926     }
1927     if ( aExp == 0 ) {
1928         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1929         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1930     }
1931     zExp = aExp - bExp + 0x7D;
1932     aSig = ( aSig | 0x00800000 )<<7;
1933     bSig = ( bSig | 0x00800000 )<<8;
1934     if ( bSig <= ( aSig + aSig ) ) {
1935         aSig >>= 1;
1936         ++zExp;
1937     }
1938     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1939     if ( ( zSig & 0x3F ) == 0 ) {
1940         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1941     }
1942     return roundAndPackFloat32( zSign, zExp, zSig );
1943 
1944 }
1945 
1946 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1947 /*----------------------------------------------------------------------------
1948 | Returns the remainder of the single-precision floating-point value `a'
1949 | with respect to the corresponding value `b'.  The operation is performed
1950 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1951 *----------------------------------------------------------------------------*/
1952 
1953 float32 float32_rem( float32 a, float32 b )
1954 {
1955     flag aSign, bSign __unused, zSign;
1956     int16 aExp, bExp, expDiff;
1957     bits32 aSig, bSig;
1958     bits32 q;
1959     bits64 aSig64, bSig64, q64;
1960     bits32 alternateASig;
1961     sbits32 sigMean;
1962 
1963     aSig = extractFloat32Frac( a );
1964     aExp = extractFloat32Exp( a );
1965     aSign = extractFloat32Sign( a );
1966     bSig = extractFloat32Frac( b );
1967     bExp = extractFloat32Exp( b );
1968     bSign = extractFloat32Sign( b );
1969     if ( aExp == 0xFF ) {
1970         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1971             return propagateFloat32NaN( a, b );
1972         }
1973         float_raise( float_flag_invalid );
1974         return float32_default_nan;
1975     }
1976     if ( bExp == 0xFF ) {
1977         if ( bSig ) return propagateFloat32NaN( a, b );
1978         return a;
1979     }
1980     if ( bExp == 0 ) {
1981         if ( bSig == 0 ) {
1982             float_raise( float_flag_invalid );
1983             return float32_default_nan;
1984         }
1985         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1986     }
1987     if ( aExp == 0 ) {
1988         if ( aSig == 0 ) return a;
1989         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1990     }
1991     expDiff = aExp - bExp;
1992     aSig |= 0x00800000;
1993     bSig |= 0x00800000;
1994     if ( expDiff < 32 ) {
1995         aSig <<= 8;
1996         bSig <<= 8;
1997         if ( expDiff < 0 ) {
1998             if ( expDiff < -1 ) return a;
1999             aSig >>= 1;
2000         }
2001         q = ( bSig <= aSig );
2002         if ( q ) aSig -= bSig;
2003         if ( 0 < expDiff ) {
2004             q = ( ( (bits64) aSig )<<32 ) / bSig;
2005             q >>= 32 - expDiff;
2006             bSig >>= 2;
2007             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2008         }
2009         else {
2010             aSig >>= 2;
2011             bSig >>= 2;
2012         }
2013     }
2014     else {
2015         if ( bSig <= aSig ) aSig -= bSig;
2016         aSig64 = ( (bits64) aSig )<<40;
2017         bSig64 = ( (bits64) bSig )<<40;
2018         expDiff -= 64;
2019         while ( 0 < expDiff ) {
2020             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2021             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2022             aSig64 = - ( ( bSig * q64 )<<38 );
2023             expDiff -= 62;
2024         }
2025         expDiff += 64;
2026         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2027         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2028         q = q64>>( 64 - expDiff );
2029         bSig <<= 6;
2030         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2031     }
2032     do {
2033         alternateASig = aSig;
2034         ++q;
2035         aSig -= bSig;
2036     } while ( 0 <= (sbits32) aSig );
2037     sigMean = aSig + alternateASig;
2038     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2039         aSig = alternateASig;
2040     }
2041     zSign = ( (sbits32) aSig < 0 );
2042     if ( zSign ) aSig = - aSig;
2043     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2044 
2045 }
2046 #endif /* !SOFTFLOAT_FOR_GCC */
2047 
2048 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2049 
2050 /*----------------------------------------------------------------------------
2051 | Returns the square root of the single-precision floating-point value `a'.
2052 | The operation is performed according to the IEC/IEEE Standard for Binary
2053 | Floating-Point Arithmetic.
2054 *----------------------------------------------------------------------------*/
2055 
2056 float32 float32_sqrt( float32 a )
2057 {
2058     flag aSign;
2059     int16 aExp, zExp;
2060     bits32 aSig, zSig;
2061     bits64 rem, term;
2062 
2063     aSig = extractFloat32Frac( a );
2064     aExp = extractFloat32Exp( a );
2065     aSign = extractFloat32Sign( a );
2066     if ( aExp == 0xFF ) {
2067         if ( aSig ) return propagateFloat32NaN( a, 0 );
2068         if ( ! aSign ) return a;
2069         float_raise( float_flag_invalid );
2070         return float32_default_nan;
2071     }
2072     if ( aSign ) {
2073         if ( ( aExp | aSig ) == 0 ) return a;
2074         float_raise( float_flag_invalid );
2075         return float32_default_nan;
2076     }
2077     if ( aExp == 0 ) {
2078         if ( aSig == 0 ) return 0;
2079         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2080     }
2081     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2082     aSig = ( aSig | 0x00800000 )<<8;
2083     zSig = estimateSqrt32( aExp, aSig ) + 2;
2084     if ( ( zSig & 0x7F ) <= 5 ) {
2085         if ( zSig < 2 ) {
2086             zSig = 0x7FFFFFFF;
2087             goto roundAndPack;
2088         }
2089         aSig >>= aExp & 1;
2090         term = ( (bits64) zSig ) * zSig;
2091         rem = ( ( (bits64) aSig )<<32 ) - term;
2092         while ( (sbits64) rem < 0 ) {
2093             --zSig;
2094             rem += ( ( (bits64) zSig )<<1 ) | 1;
2095         }
2096         zSig |= ( rem != 0 );
2097     }
2098     shift32RightJamming( zSig, 1, &zSig );
2099  roundAndPack:
2100     return roundAndPackFloat32( 0, zExp, zSig );
2101 
2102 }
2103 #endif /* !SOFTFLOAT_FOR_GCC */
2104 
2105 /*----------------------------------------------------------------------------
2106 | Returns 1 if the single-precision floating-point value `a' is equal to
2107 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2108 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2109 *----------------------------------------------------------------------------*/
2110 
2111 flag float32_eq( float32 a, float32 b )
2112 {
2113 
2114     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2115          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2116        ) {
2117         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2118             float_raise( float_flag_invalid );
2119         }
2120         return 0;
2121     }
2122     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2123 
2124 }
2125 
2126 /*----------------------------------------------------------------------------
2127 | Returns 1 if the single-precision floating-point value `a' is less than
2128 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2129 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2130 | Arithmetic.
2131 *----------------------------------------------------------------------------*/
2132 
2133 flag float32_le( float32 a, float32 b )
2134 {
2135     flag aSign, bSign;
2136 
2137     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2138          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2139        ) {
2140         float_raise( float_flag_invalid );
2141         return 0;
2142     }
2143     aSign = extractFloat32Sign( a );
2144     bSign = extractFloat32Sign( b );
2145     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2146     return ( a == b ) || ( aSign ^ ( a < b ) );
2147 
2148 }
2149 
2150 /*----------------------------------------------------------------------------
2151 | Returns 1 if the single-precision floating-point value `a' is less than
2152 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2153 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2154 *----------------------------------------------------------------------------*/
2155 
2156 flag float32_lt( float32 a, float32 b )
2157 {
2158     flag aSign, bSign;
2159 
2160     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2161          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2162        ) {
2163         float_raise( float_flag_invalid );
2164         return 0;
2165     }
2166     aSign = extractFloat32Sign( a );
2167     bSign = extractFloat32Sign( b );
2168     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2169     return ( a != b ) && ( aSign ^ ( a < b ) );
2170 
2171 }
2172 
2173 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2174 /*----------------------------------------------------------------------------
2175 | Returns 1 if the single-precision floating-point value `a' is equal to
2176 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2177 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2178 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2179 *----------------------------------------------------------------------------*/
2180 
2181 flag float32_eq_signaling( float32 a, float32 b )
2182 {
2183 
2184     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2185          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2186        ) {
2187         float_raise( float_flag_invalid );
2188         return 0;
2189     }
2190     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2191 
2192 }
2193 
2194 /*----------------------------------------------------------------------------
2195 | Returns 1 if the single-precision floating-point value `a' is less than or
2196 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2197 | cause an exception.  Otherwise, the comparison is performed according to the
2198 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2199 *----------------------------------------------------------------------------*/
2200 
2201 flag float32_le_quiet( float32 a, float32 b )
2202 {
2203     flag aSign, bSign;
2204 
2205     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2206          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2207        ) {
2208         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2209             float_raise( float_flag_invalid );
2210         }
2211         return 0;
2212     }
2213     aSign = extractFloat32Sign( a );
2214     bSign = extractFloat32Sign( b );
2215     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2216     return ( a == b ) || ( aSign ^ ( a < b ) );
2217 
2218 }
2219 
2220 /*----------------------------------------------------------------------------
2221 | Returns 1 if the single-precision floating-point value `a' is less than
2222 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2223 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2224 | Standard for Binary Floating-Point Arithmetic.
2225 *----------------------------------------------------------------------------*/
2226 
2227 flag float32_lt_quiet( float32 a, float32 b )
2228 {
2229     flag aSign, bSign;
2230 
2231     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2232          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2233        ) {
2234         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2235             float_raise( float_flag_invalid );
2236         }
2237         return 0;
2238     }
2239     aSign = extractFloat32Sign( a );
2240     bSign = extractFloat32Sign( b );
2241     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2242     return ( a != b ) && ( aSign ^ ( a < b ) );
2243 
2244 }
2245 #endif /* !SOFTFLOAT_FOR_GCC */
2246 
2247 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2248 /*----------------------------------------------------------------------------
2249 | Returns the result of converting the double-precision floating-point value
2250 | `a' to the 32-bit two's complement integer format.  The conversion is
2251 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2252 | Arithmetic---which means in particular that the conversion is rounded
2253 | according to the current rounding mode.  If `a' is a NaN, the largest
2254 | positive integer is returned.  Otherwise, if the conversion overflows, the
2255 | largest integer with the same sign as `a' is returned.
2256 *----------------------------------------------------------------------------*/
2257 
2258 int32 float64_to_int32( float64 a )
2259 {
2260     flag aSign;
2261     int16 aExp, shiftCount;
2262     bits64 aSig;
2263 
2264     aSig = extractFloat64Frac( a );
2265     aExp = extractFloat64Exp( a );
2266     aSign = extractFloat64Sign( a );
2267     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2268     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2269     shiftCount = 0x42C - aExp;
2270     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2271     return roundAndPackInt32( aSign, aSig );
2272 
2273 }
2274 #endif /* !SOFTFLOAT_FOR_GCC */
2275 
2276 /*----------------------------------------------------------------------------
2277 | Returns the result of converting the double-precision floating-point value
2278 | `a' to the 32-bit two's complement integer format.  The conversion is
2279 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2280 | Arithmetic, except that the conversion is always rounded toward zero.
2281 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2282 | the conversion overflows, the largest integer with the same sign as `a' is
2283 | returned.
2284 *----------------------------------------------------------------------------*/
2285 
2286 int32 float64_to_int32_round_to_zero( float64 a )
2287 {
2288     flag aSign;
2289     int16 aExp, shiftCount;
2290     bits64 aSig, savedASig;
2291     int32 z;
2292 
2293     aSig = extractFloat64Frac( a );
2294     aExp = extractFloat64Exp( a );
2295     aSign = extractFloat64Sign( a );
2296     if ( 0x41E < aExp ) {
2297         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2298         goto invalid;
2299     }
2300     else if ( aExp < 0x3FF ) {
2301         if ( aExp || aSig ) float_set_inexact();
2302         return 0;
2303     }
2304     aSig |= LIT64( 0x0010000000000000 );
2305     shiftCount = 0x433 - aExp;
2306     savedASig = aSig;
2307     aSig >>= shiftCount;
2308     z = aSig;
2309     if ( aSign ) z = - z;
2310     if ( ( z < 0 ) ^ aSign ) {
2311  invalid:
2312         float_raise( float_flag_invalid );
2313         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2314     }
2315     if ( ( aSig<<shiftCount ) != savedASig ) {
2316         float_set_inexact();
2317     }
2318     return z;
2319 
2320 }
2321 
2322 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2323 /*----------------------------------------------------------------------------
2324 | Returns the result of converting the double-precision floating-point value
2325 | `a' to the 64-bit two's complement integer format.  The conversion is
2326 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2327 | Arithmetic---which means in particular that the conversion is rounded
2328 | according to the current rounding mode.  If `a' is a NaN, the largest
2329 | positive integer is returned.  Otherwise, if the conversion overflows, the
2330 | largest integer with the same sign as `a' is returned.
2331 *----------------------------------------------------------------------------*/
2332 
2333 int64 float64_to_int64( float64 a )
2334 {
2335     flag aSign;
2336     int16 aExp, shiftCount;
2337     bits64 aSig, aSigExtra;
2338 
2339     aSig = extractFloat64Frac( a );
2340     aExp = extractFloat64Exp( a );
2341     aSign = extractFloat64Sign( a );
2342     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2343     shiftCount = 0x433 - aExp;
2344     if ( shiftCount <= 0 ) {
2345         if ( 0x43E < aExp ) {
2346             float_raise( float_flag_invalid );
2347             if (    ! aSign
2348                  || (    ( aExp == 0x7FF )
2349                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2350                ) {
2351                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2352             }
2353             return (sbits64) LIT64( 0x8000000000000000 );
2354         }
2355         aSigExtra = 0;
2356         aSig <<= - shiftCount;
2357     }
2358     else {
2359         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2360     }
2361     return roundAndPackInt64( aSign, aSig, aSigExtra );
2362 
2363 }
2364 
2365 /* like above, but result is unsigned */
2366 uint64 float64_to_uint64( float64 a )
2367 {
2368     flag aSign;
2369     int16 aExp, shiftCount;
2370     bits64 aSig, aSigExtra;
2371 
2372     aSig = extractFloat64Frac( a );
2373     aExp = extractFloat64Exp( a );
2374     aSign = extractFloat64Sign( a );
2375 
2376     if (aSign) {
2377 	return float64_to_int64(a);
2378     }
2379 
2380     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2381     shiftCount = 0x433 - aExp;
2382     if ( shiftCount <= 0 ) {
2383         if ( 0x43E < aExp ) {
2384             float_raise( float_flag_invalid );
2385             if (    ! aSign
2386                  || (    ( aExp == 0x7FF )
2387                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2388                ) {
2389                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2390             }
2391             return (sbits64) LIT64( 0x8000000000000000 );
2392         }
2393         aSigExtra = 0;
2394         aSig <<= - shiftCount;
2395     }
2396     else {
2397         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2398     }
2399     return roundAndPackUInt64( aSig, aSigExtra );
2400 
2401 }
2402 
2403 /*----------------------------------------------------------------------------
2404 | Returns the result of converting the double-precision floating-point value
2405 | `a' to the 64-bit two's complement integer format.  The conversion is
2406 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2407 | Arithmetic, except that the conversion is always rounded toward zero.
2408 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2409 | the conversion overflows, the largest integer with the same sign as `a' is
2410 | returned.
2411 *----------------------------------------------------------------------------*/
2412 
2413 int64 float64_to_int64_round_to_zero( float64 a )
2414 {
2415     flag aSign;
2416     int16 aExp, shiftCount;
2417     bits64 aSig;
2418     int64 z;
2419 
2420     aSig = extractFloat64Frac( a );
2421     aExp = extractFloat64Exp( a );
2422     aSign = extractFloat64Sign( a );
2423     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2424     shiftCount = aExp - 0x433;
2425     if ( 0 <= shiftCount ) {
2426         if ( 0x43E <= aExp ) {
2427             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2428                 float_raise( float_flag_invalid );
2429                 if (    ! aSign
2430                      || (    ( aExp == 0x7FF )
2431                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2432                    ) {
2433                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2434                 }
2435             }
2436             return (sbits64) LIT64( 0x8000000000000000 );
2437         }
2438         z = aSig<<shiftCount;
2439     }
2440     else {
2441         if ( aExp < 0x3FE ) {
2442             if ( aExp | aSig ) float_set_inexact();
2443             return 0;
2444         }
2445         z = aSig>>( - shiftCount );
2446         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2447             float_set_inexact();
2448         }
2449     }
2450     if ( aSign ) z = - z;
2451     return z;
2452 
2453 }
2454 #endif /* !SOFTFLOAT_FOR_GCC */
2455 
2456 /*----------------------------------------------------------------------------
2457 | Returns the result of converting the double-precision floating-point value
2458 | `a' to the single-precision floating-point format.  The conversion is
2459 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2460 | Arithmetic.
2461 *----------------------------------------------------------------------------*/
2462 
2463 float32 float64_to_float32( float64 a )
2464 {
2465     flag aSign;
2466     int16 aExp;
2467     bits64 aSig;
2468     bits32 zSig;
2469 
2470     aSig = extractFloat64Frac( a );
2471     aExp = extractFloat64Exp( a );
2472     aSign = extractFloat64Sign( a );
2473     if ( aExp == 0x7FF ) {
2474         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2475         return packFloat32( aSign, 0xFF, 0 );
2476     }
2477     shift64RightJamming( aSig, 22, &aSig );
2478     zSig = aSig;
2479     if ( aExp || zSig ) {
2480         zSig |= 0x40000000;
2481         aExp -= 0x381;
2482     }
2483     return roundAndPackFloat32( aSign, aExp, zSig );
2484 
2485 }
2486 
2487 #ifdef FLOATX80
2488 
2489 /*----------------------------------------------------------------------------
2490 | Returns the result of converting the double-precision floating-point value
2491 | `a' to the extended double-precision floating-point format.  The conversion
2492 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2493 | Arithmetic.
2494 *----------------------------------------------------------------------------*/
2495 
2496 floatx80 float64_to_floatx80( float64 a )
2497 {
2498     flag aSign;
2499     int16 aExp;
2500     bits64 aSig;
2501 
2502     aSig = extractFloat64Frac( a );
2503     aExp = extractFloat64Exp( a );
2504     aSign = extractFloat64Sign( a );
2505     if ( aExp == 0x7FF ) {
2506         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2507         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2508     }
2509     if ( aExp == 0 ) {
2510         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2511         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2512     }
2513     return
2514         packFloatx80(
2515             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2516 
2517 }
2518 
2519 #endif
2520 
2521 #ifdef FLOAT128
2522 
2523 /*----------------------------------------------------------------------------
2524 | Returns the result of converting the double-precision floating-point value
2525 | `a' to the quadruple-precision floating-point format.  The conversion is
2526 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2527 | Arithmetic.
2528 *----------------------------------------------------------------------------*/
2529 
2530 float128 float64_to_float128( float64 a )
2531 {
2532     flag aSign;
2533     int16 aExp;
2534     bits64 aSig, zSig0, zSig1;
2535 
2536     aSig = extractFloat64Frac( a );
2537     aExp = extractFloat64Exp( a );
2538     aSign = extractFloat64Sign( a );
2539     if ( aExp == 0x7FF ) {
2540         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2541         return packFloat128( aSign, 0x7FFF, 0, 0 );
2542     }
2543     if ( aExp == 0 ) {
2544         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2545         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2546         --aExp;
2547     }
2548     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2549     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2550 
2551 }
2552 
2553 #endif
2554 
2555 #ifndef SOFTFLOAT_FOR_GCC
2556 /*----------------------------------------------------------------------------
2557 | Rounds the double-precision floating-point value `a' to an integer, and
2558 | returns the result as a double-precision floating-point value.  The
2559 | operation is performed according to the IEC/IEEE Standard for Binary
2560 | Floating-Point Arithmetic.
2561 *----------------------------------------------------------------------------*/
2562 
2563 float64 float64_round_to_int( float64 a )
2564 {
2565     flag aSign;
2566     int16 aExp;
2567     bits64 lastBitMask, roundBitsMask;
2568     int8 roundingMode;
2569     float64 z;
2570 
2571     aExp = extractFloat64Exp( a );
2572     if ( 0x433 <= aExp ) {
2573         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2574             return propagateFloat64NaN( a, a );
2575         }
2576         return a;
2577     }
2578     if ( aExp < 0x3FF ) {
2579         if ( (bits64) ( a<<1 ) == 0 ) return a;
2580         float_set_inexact();
2581         aSign = extractFloat64Sign( a );
2582         switch ( float_rounding_mode() ) {
2583          case float_round_nearest_even:
2584             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2585                 return packFloat64( aSign, 0x3FF, 0 );
2586             }
2587             break;
2588          case float_round_down:
2589             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2590          case float_round_up:
2591             return
2592             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2593         }
2594         return packFloat64( aSign, 0, 0 );
2595     }
2596     lastBitMask = 1;
2597     lastBitMask <<= 0x433 - aExp;
2598     roundBitsMask = lastBitMask - 1;
2599     z = a;
2600     roundingMode = float_rounding_mode();
2601     if ( roundingMode == float_round_nearest_even ) {
2602         z += lastBitMask>>1;
2603         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2604     }
2605     else if ( roundingMode != float_round_to_zero ) {
2606         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2607             z += roundBitsMask;
2608         }
2609     }
2610     z &= ~ roundBitsMask;
2611     if ( z != a ) float_set_inexact();
2612     return z;
2613 
2614 }
2615 #endif
2616 
2617 /*----------------------------------------------------------------------------
2618 | Returns the result of adding the absolute values of the double-precision
2619 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2620 | before being returned.  `zSign' is ignored if the result is a NaN.
2621 | The addition is performed according to the IEC/IEEE Standard for Binary
2622 | Floating-Point Arithmetic.
2623 *----------------------------------------------------------------------------*/
2624 
2625 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2626 {
2627     int16 aExp, bExp, zExp;
2628     bits64 aSig, bSig, zSig;
2629     int16 expDiff;
2630 
2631     aSig = extractFloat64Frac( a );
2632     aExp = extractFloat64Exp( a );
2633     bSig = extractFloat64Frac( b );
2634     bExp = extractFloat64Exp( b );
2635     expDiff = aExp - bExp;
2636     aSig <<= 9;
2637     bSig <<= 9;
2638     if ( 0 < expDiff ) {
2639         if ( aExp == 0x7FF ) {
2640             if ( aSig ) return propagateFloat64NaN( a, b );
2641             return a;
2642         }
2643         if ( bExp == 0 ) {
2644             --expDiff;
2645         }
2646         else {
2647             bSig |= LIT64( 0x2000000000000000 );
2648         }
2649         shift64RightJamming( bSig, expDiff, &bSig );
2650         zExp = aExp;
2651     }
2652     else if ( expDiff < 0 ) {
2653         if ( bExp == 0x7FF ) {
2654             if ( bSig ) return propagateFloat64NaN( a, b );
2655             return packFloat64( zSign, 0x7FF, 0 );
2656         }
2657         if ( aExp == 0 ) {
2658             ++expDiff;
2659         }
2660         else {
2661             aSig |= LIT64( 0x2000000000000000 );
2662         }
2663         shift64RightJamming( aSig, - expDiff, &aSig );
2664         zExp = bExp;
2665     }
2666     else {
2667         if ( aExp == 0x7FF ) {
2668             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2669             return a;
2670         }
2671         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2672         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2673         zExp = aExp;
2674         goto roundAndPack;
2675     }
2676     aSig |= LIT64( 0x2000000000000000 );
2677     zSig = ( aSig + bSig )<<1;
2678     --zExp;
2679     if ( (sbits64) zSig < 0 ) {
2680         zSig = aSig + bSig;
2681         ++zExp;
2682     }
2683  roundAndPack:
2684     return roundAndPackFloat64( zSign, zExp, zSig );
2685 
2686 }
2687 
2688 /*----------------------------------------------------------------------------
2689 | Returns the result of subtracting the absolute values of the double-
2690 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2691 | difference is negated before being returned.  `zSign' is ignored if the
2692 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2693 | Standard for Binary Floating-Point Arithmetic.
2694 *----------------------------------------------------------------------------*/
2695 
2696 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2697 {
2698     int16 aExp, bExp, zExp;
2699     bits64 aSig, bSig, zSig;
2700     int16 expDiff;
2701 
2702     aSig = extractFloat64Frac( a );
2703     aExp = extractFloat64Exp( a );
2704     bSig = extractFloat64Frac( b );
2705     bExp = extractFloat64Exp( b );
2706     expDiff = aExp - bExp;
2707     aSig <<= 10;
2708     bSig <<= 10;
2709     if ( 0 < expDiff ) goto aExpBigger;
2710     if ( expDiff < 0 ) goto bExpBigger;
2711     if ( aExp == 0x7FF ) {
2712         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2713         float_raise( float_flag_invalid );
2714         return float64_default_nan;
2715     }
2716     if ( aExp == 0 ) {
2717         aExp = 1;
2718         bExp = 1;
2719     }
2720     if ( bSig < aSig ) goto aBigger;
2721     if ( aSig < bSig ) goto bBigger;
2722     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2723  bExpBigger:
2724     if ( bExp == 0x7FF ) {
2725         if ( bSig ) return propagateFloat64NaN( a, b );
2726         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2727     }
2728     if ( aExp == 0 ) {
2729         ++expDiff;
2730     }
2731     else {
2732         aSig |= LIT64( 0x4000000000000000 );
2733     }
2734     shift64RightJamming( aSig, - expDiff, &aSig );
2735     bSig |= LIT64( 0x4000000000000000 );
2736  bBigger:
2737     zSig = bSig - aSig;
2738     zExp = bExp;
2739     zSign ^= 1;
2740     goto normalizeRoundAndPack;
2741  aExpBigger:
2742     if ( aExp == 0x7FF ) {
2743         if ( aSig ) return propagateFloat64NaN( a, b );
2744         return a;
2745     }
2746     if ( bExp == 0 ) {
2747         --expDiff;
2748     }
2749     else {
2750         bSig |= LIT64( 0x4000000000000000 );
2751     }
2752     shift64RightJamming( bSig, expDiff, &bSig );
2753     aSig |= LIT64( 0x4000000000000000 );
2754  aBigger:
2755     zSig = aSig - bSig;
2756     zExp = aExp;
2757  normalizeRoundAndPack:
2758     --zExp;
2759     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2760 
2761 }
2762 
2763 /*----------------------------------------------------------------------------
2764 | Returns the result of adding the double-precision floating-point values `a'
2765 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2766 | Binary Floating-Point Arithmetic.
2767 *----------------------------------------------------------------------------*/
2768 
2769 float64 float64_add( float64 a, float64 b )
2770 {
2771     flag aSign, bSign;
2772 
2773     aSign = extractFloat64Sign( a );
2774     bSign = extractFloat64Sign( b );
2775     if ( aSign == bSign ) {
2776         return addFloat64Sigs( a, b, aSign );
2777     }
2778     else {
2779         return subFloat64Sigs( a, b, aSign );
2780     }
2781 
2782 }
2783 
2784 /*----------------------------------------------------------------------------
2785 | Returns the result of subtracting the double-precision floating-point values
2786 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2787 | for Binary Floating-Point Arithmetic.
2788 *----------------------------------------------------------------------------*/
2789 
2790 float64 float64_sub( float64 a, float64 b )
2791 {
2792     flag aSign, bSign;
2793 
2794     aSign = extractFloat64Sign( a );
2795     bSign = extractFloat64Sign( b );
2796     if ( aSign == bSign ) {
2797         return subFloat64Sigs( a, b, aSign );
2798     }
2799     else {
2800         return addFloat64Sigs( a, b, aSign );
2801     }
2802 
2803 }
2804 
2805 /*----------------------------------------------------------------------------
2806 | Returns the result of multiplying the double-precision floating-point values
2807 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2808 | for Binary Floating-Point Arithmetic.
2809 *----------------------------------------------------------------------------*/
2810 
2811 float64 float64_mul( float64 a, float64 b )
2812 {
2813     flag aSign, bSign, zSign;
2814     int16 aExp, bExp, zExp;
2815     bits64 aSig, bSig, zSig0, zSig1;
2816 
2817     aSig = extractFloat64Frac( a );
2818     aExp = extractFloat64Exp( a );
2819     aSign = extractFloat64Sign( a );
2820     bSig = extractFloat64Frac( b );
2821     bExp = extractFloat64Exp( b );
2822     bSign = extractFloat64Sign( b );
2823     zSign = aSign ^ bSign;
2824     if ( aExp == 0x7FF ) {
2825         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2826             return propagateFloat64NaN( a, b );
2827         }
2828         if ( ( bExp | bSig ) == 0 ) {
2829             float_raise( float_flag_invalid );
2830             return float64_default_nan;
2831         }
2832         return packFloat64( zSign, 0x7FF, 0 );
2833     }
2834     if ( bExp == 0x7FF ) {
2835         if ( bSig ) return propagateFloat64NaN( a, b );
2836         if ( ( aExp | aSig ) == 0 ) {
2837             float_raise( float_flag_invalid );
2838             return float64_default_nan;
2839         }
2840         return packFloat64( zSign, 0x7FF, 0 );
2841     }
2842     if ( aExp == 0 ) {
2843         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2844         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2845     }
2846     if ( bExp == 0 ) {
2847         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2848         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2849     }
2850     zExp = aExp + bExp - 0x3FF;
2851     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2852     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2853     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2854     zSig0 |= ( zSig1 != 0 );
2855     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2856         zSig0 <<= 1;
2857         --zExp;
2858     }
2859     return roundAndPackFloat64( zSign, zExp, zSig0 );
2860 
2861 }
2862 
2863 /*----------------------------------------------------------------------------
2864 | Returns the result of dividing the double-precision floating-point value `a'
2865 | by the corresponding value `b'.  The operation is performed according to
2866 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2867 *----------------------------------------------------------------------------*/
2868 
2869 float64 float64_div( float64 a, float64 b )
2870 {
2871     flag aSign, bSign, zSign;
2872     int16 aExp, bExp, zExp;
2873     bits64 aSig, bSig, zSig;
2874     bits64 rem0, rem1;
2875     bits64 term0, term1;
2876 
2877     aSig = extractFloat64Frac( a );
2878     aExp = extractFloat64Exp( a );
2879     aSign = extractFloat64Sign( a );
2880     bSig = extractFloat64Frac( b );
2881     bExp = extractFloat64Exp( b );
2882     bSign = extractFloat64Sign( b );
2883     zSign = aSign ^ bSign;
2884     if ( aExp == 0x7FF ) {
2885         if ( aSig ) return propagateFloat64NaN( a, b );
2886         if ( bExp == 0x7FF ) {
2887             if ( bSig ) return propagateFloat64NaN( a, b );
2888             float_raise( float_flag_invalid );
2889             return float64_default_nan;
2890         }
2891         return packFloat64( zSign, 0x7FF, 0 );
2892     }
2893     if ( bExp == 0x7FF ) {
2894         if ( bSig ) return propagateFloat64NaN( a, b );
2895         return packFloat64( zSign, 0, 0 );
2896     }
2897     if ( bExp == 0 ) {
2898         if ( bSig == 0 ) {
2899             if ( ( aExp | aSig ) == 0 ) {
2900                 float_raise( float_flag_invalid );
2901                 return float64_default_nan;
2902             }
2903             float_raise( float_flag_divbyzero );
2904             return packFloat64( zSign, 0x7FF, 0 );
2905         }
2906         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2907     }
2908     if ( aExp == 0 ) {
2909         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2910         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2911     }
2912     zExp = aExp - bExp + 0x3FD;
2913     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2914     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2915     if ( bSig <= ( aSig + aSig ) ) {
2916         aSig >>= 1;
2917         ++zExp;
2918     }
2919     zSig = estimateDiv128To64( aSig, 0, bSig );
2920     if ( ( zSig & 0x1FF ) <= 2 ) {
2921         mul64To128( bSig, zSig, &term0, &term1 );
2922         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2923         while ( (sbits64) rem0 < 0 ) {
2924             --zSig;
2925             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2926         }
2927         zSig |= ( rem1 != 0 );
2928     }
2929     return roundAndPackFloat64( zSign, zExp, zSig );
2930 
2931 }
2932 
2933 #ifndef SOFTFLOAT_FOR_GCC
2934 /*----------------------------------------------------------------------------
2935 | Returns the remainder of the double-precision floating-point value `a'
2936 | with respect to the corresponding value `b'.  The operation is performed
2937 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2938 *----------------------------------------------------------------------------*/
2939 
2940 float64 float64_rem( float64 a, float64 b )
2941 {
2942     flag aSign, bSign __unused, zSign;
2943     int16 aExp, bExp, expDiff;
2944     bits64 aSig, bSig;
2945     bits64 q, alternateASig;
2946     sbits64 sigMean;
2947 
2948     aSig = extractFloat64Frac( a );
2949     aExp = extractFloat64Exp( a );
2950     aSign = extractFloat64Sign( a );
2951     bSig = extractFloat64Frac( b );
2952     bExp = extractFloat64Exp( b );
2953     bSign = extractFloat64Sign( b );
2954     if ( aExp == 0x7FF ) {
2955         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2956             return propagateFloat64NaN( a, b );
2957         }
2958         float_raise( float_flag_invalid );
2959         return float64_default_nan;
2960     }
2961     if ( bExp == 0x7FF ) {
2962         if ( bSig ) return propagateFloat64NaN( a, b );
2963         return a;
2964     }
2965     if ( bExp == 0 ) {
2966         if ( bSig == 0 ) {
2967             float_raise( float_flag_invalid );
2968             return float64_default_nan;
2969         }
2970         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2971     }
2972     if ( aExp == 0 ) {
2973         if ( aSig == 0 ) return a;
2974         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2975     }
2976     expDiff = aExp - bExp;
2977     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2978     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2979     if ( expDiff < 0 ) {
2980         if ( expDiff < -1 ) return a;
2981         aSig >>= 1;
2982     }
2983     q = ( bSig <= aSig );
2984     if ( q ) aSig -= bSig;
2985     expDiff -= 64;
2986     while ( 0 < expDiff ) {
2987         q = estimateDiv128To64( aSig, 0, bSig );
2988         q = ( 2 < q ) ? q - 2 : 0;
2989         aSig = - ( ( bSig>>2 ) * q );
2990         expDiff -= 62;
2991     }
2992     expDiff += 64;
2993     if ( 0 < expDiff ) {
2994         q = estimateDiv128To64( aSig, 0, bSig );
2995         q = ( 2 < q ) ? q - 2 : 0;
2996         q >>= 64 - expDiff;
2997         bSig >>= 2;
2998         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2999     }
3000     else {
3001         aSig >>= 2;
3002         bSig >>= 2;
3003     }
3004     do {
3005         alternateASig = aSig;
3006         ++q;
3007         aSig -= bSig;
3008     } while ( 0 <= (sbits64) aSig );
3009     sigMean = aSig + alternateASig;
3010     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3011         aSig = alternateASig;
3012     }
3013     zSign = ( (sbits64) aSig < 0 );
3014     if ( zSign ) aSig = - aSig;
3015     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3016 
3017 }
3018 
3019 /*----------------------------------------------------------------------------
3020 | Returns the square root of the double-precision floating-point value `a'.
3021 | The operation is performed according to the IEC/IEEE Standard for Binary
3022 | Floating-Point Arithmetic.
3023 *----------------------------------------------------------------------------*/
3024 
3025 float64 float64_sqrt( float64 a )
3026 {
3027     flag aSign;
3028     int16 aExp, zExp;
3029     bits64 aSig, zSig, doubleZSig;
3030     bits64 rem0, rem1, term0, term1;
3031 
3032     aSig = extractFloat64Frac( a );
3033     aExp = extractFloat64Exp( a );
3034     aSign = extractFloat64Sign( a );
3035     if ( aExp == 0x7FF ) {
3036         if ( aSig ) return propagateFloat64NaN( a, a );
3037         if ( ! aSign ) return a;
3038         float_raise( float_flag_invalid );
3039         return float64_default_nan;
3040     }
3041     if ( aSign ) {
3042         if ( ( aExp | aSig ) == 0 ) return a;
3043         float_raise( float_flag_invalid );
3044         return float64_default_nan;
3045     }
3046     if ( aExp == 0 ) {
3047         if ( aSig == 0 ) return 0;
3048         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3049     }
3050     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3051     aSig |= LIT64( 0x0010000000000000 );
3052     zSig = estimateSqrt32( aExp, aSig>>21 );
3053     aSig <<= 9 - ( aExp & 1 );
3054     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3055     if ( ( zSig & 0x1FF ) <= 5 ) {
3056         doubleZSig = zSig<<1;
3057         mul64To128( zSig, zSig, &term0, &term1 );
3058         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3059         while ( (sbits64) rem0 < 0 ) {
3060             --zSig;
3061             doubleZSig -= 2;
3062             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3063         }
3064         zSig |= ( ( rem0 | rem1 ) != 0 );
3065     }
3066     return roundAndPackFloat64( 0, zExp, zSig );
3067 
3068 }
3069 #endif
3070 
3071 /*----------------------------------------------------------------------------
3072 | Returns 1 if the double-precision floating-point value `a' is equal to the
3073 | corresponding value `b', and 0 otherwise.  The comparison is performed
3074 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3075 *----------------------------------------------------------------------------*/
3076 
3077 flag float64_eq( float64 a, float64 b )
3078 {
3079 
3080     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3081          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3082        ) {
3083         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3084             float_raise( float_flag_invalid );
3085         }
3086         return 0;
3087     }
3088     return ( a == b ) ||
3089 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3090 
3091 }
3092 
3093 /*----------------------------------------------------------------------------
3094 | Returns 1 if the double-precision floating-point value `a' is less than or
3095 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3096 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3097 | Arithmetic.
3098 *----------------------------------------------------------------------------*/
3099 
3100 flag float64_le( float64 a, float64 b )
3101 {
3102     flag aSign, bSign;
3103 
3104     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3105          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3106        ) {
3107         float_raise( float_flag_invalid );
3108         return 0;
3109     }
3110     aSign = extractFloat64Sign( a );
3111     bSign = extractFloat64Sign( b );
3112     if ( aSign != bSign )
3113 	return aSign ||
3114 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3115 	      0 );
3116     return ( a == b ) ||
3117 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3118 
3119 }
3120 
3121 /*----------------------------------------------------------------------------
3122 | Returns 1 if the double-precision floating-point value `a' is less than
3123 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3124 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3125 *----------------------------------------------------------------------------*/
3126 
3127 flag float64_lt( float64 a, float64 b )
3128 {
3129     flag aSign, bSign;
3130 
3131     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3132          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3133        ) {
3134         float_raise( float_flag_invalid );
3135         return 0;
3136     }
3137     aSign = extractFloat64Sign( a );
3138     bSign = extractFloat64Sign( b );
3139     if ( aSign != bSign )
3140 	return aSign &&
3141 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3142 	      0 );
3143     return ( a != b ) &&
3144 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3145 
3146 }
3147 
3148 #ifndef SOFTFLOAT_FOR_GCC
3149 /*----------------------------------------------------------------------------
3150 | Returns 1 if the double-precision floating-point value `a' is equal to the
3151 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3152 | if either operand is a NaN.  Otherwise, the comparison is performed
3153 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3154 *----------------------------------------------------------------------------*/
3155 
3156 flag float64_eq_signaling( float64 a, float64 b )
3157 {
3158 
3159     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3160          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3161        ) {
3162         float_raise( float_flag_invalid );
3163         return 0;
3164     }
3165     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3166 
3167 }
3168 
3169 /*----------------------------------------------------------------------------
3170 | Returns 1 if the double-precision floating-point value `a' is less than or
3171 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3172 | cause an exception.  Otherwise, the comparison is performed according to the
3173 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3174 *----------------------------------------------------------------------------*/
3175 
3176 flag float64_le_quiet( float64 a, float64 b )
3177 {
3178     flag aSign, bSign;
3179 
3180     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3181          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3182        ) {
3183         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3184             float_raise( float_flag_invalid );
3185         }
3186         return 0;
3187     }
3188     aSign = extractFloat64Sign( a );
3189     bSign = extractFloat64Sign( b );
3190     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3191     return ( a == b ) || ( aSign ^ ( a < b ) );
3192 
3193 }
3194 
3195 /*----------------------------------------------------------------------------
3196 | Returns 1 if the double-precision floating-point value `a' is less than
3197 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3198 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3199 | Standard for Binary Floating-Point Arithmetic.
3200 *----------------------------------------------------------------------------*/
3201 
3202 flag float64_lt_quiet( float64 a, float64 b )
3203 {
3204     flag aSign, bSign;
3205 
3206     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3207          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3208        ) {
3209         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3210             float_raise( float_flag_invalid );
3211         }
3212         return 0;
3213     }
3214     aSign = extractFloat64Sign( a );
3215     bSign = extractFloat64Sign( b );
3216     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3217     return ( a != b ) && ( aSign ^ ( a < b ) );
3218 
3219 }
3220 #endif
3221 
3222 #ifdef FLOATX80
3223 
3224 /*----------------------------------------------------------------------------
3225 | Returns the result of converting the extended double-precision floating-
3226 | point value `a' to the 32-bit two's complement integer format.  The
3227 | conversion is performed according to the IEC/IEEE Standard for Binary
3228 | Floating-Point Arithmetic---which means in particular that the conversion
3229 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3230 | largest positive integer is returned.  Otherwise, if the conversion
3231 | overflows, the largest integer with the same sign as `a' is returned.
3232 *----------------------------------------------------------------------------*/
3233 
3234 int32 floatx80_to_int32( floatx80 a )
3235 {
3236     flag aSign;
3237     int32 aExp, shiftCount;
3238     bits64 aSig;
3239 
3240     aSig = extractFloatx80Frac( a );
3241     aExp = extractFloatx80Exp( a );
3242     aSign = extractFloatx80Sign( a );
3243     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3244     shiftCount = 0x4037 - aExp;
3245     if ( shiftCount <= 0 ) shiftCount = 1;
3246     shift64RightJamming( aSig, shiftCount, &aSig );
3247     return roundAndPackInt32( aSign, aSig );
3248 
3249 }
3250 
3251 /*----------------------------------------------------------------------------
3252 | Returns the result of converting the extended double-precision floating-
3253 | point value `a' to the 32-bit two's complement integer format.  The
3254 | conversion is performed according to the IEC/IEEE Standard for Binary
3255 | Floating-Point Arithmetic, except that the conversion is always rounded
3256 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3257 | Otherwise, if the conversion overflows, the largest integer with the same
3258 | sign as `a' is returned.
3259 *----------------------------------------------------------------------------*/
3260 
3261 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3262 {
3263     flag aSign;
3264     int32 aExp, shiftCount;
3265     bits64 aSig, savedASig;
3266     int32 z;
3267 
3268     aSig = extractFloatx80Frac( a );
3269     aExp = extractFloatx80Exp( a );
3270     aSign = extractFloatx80Sign( a );
3271     if ( 0x401E < aExp ) {
3272         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3273         goto invalid;
3274     }
3275     else if ( aExp < 0x3FFF ) {
3276         if ( aExp || aSig ) float_set_inexact();
3277         return 0;
3278     }
3279     shiftCount = 0x403E - aExp;
3280     savedASig = aSig;
3281     aSig >>= shiftCount;
3282     z = aSig;
3283     if ( aSign ) z = - z;
3284     if ( ( z < 0 ) ^ aSign ) {
3285  invalid:
3286         float_raise( float_flag_invalid );
3287         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3288     }
3289     if ( ( aSig<<shiftCount ) != savedASig ) {
3290         float_set_inexact();
3291     }
3292     return z;
3293 
3294 }
3295 
3296 /*----------------------------------------------------------------------------
3297 | Returns the result of converting the extended double-precision floating-
3298 | point value `a' to the 64-bit two's complement integer format.  The
3299 | conversion is performed according to the IEC/IEEE Standard for Binary
3300 | Floating-Point Arithmetic---which means in particular that the conversion
3301 | is rounded according to the current rounding mode.  If `a' is a NaN,
3302 | the largest positive integer is returned.  Otherwise, if the conversion
3303 | overflows, the largest integer with the same sign as `a' is returned.
3304 *----------------------------------------------------------------------------*/
3305 
3306 int64 floatx80_to_int64( floatx80 a )
3307 {
3308     flag aSign;
3309     int32 aExp, shiftCount;
3310     bits64 aSig, aSigExtra;
3311 
3312     aSig = extractFloatx80Frac( a );
3313     aExp = extractFloatx80Exp( a );
3314     aSign = extractFloatx80Sign( a );
3315     shiftCount = 0x403E - aExp;
3316     if ( shiftCount <= 0 ) {
3317         if ( shiftCount ) {
3318             float_raise( float_flag_invalid );
3319             if (    ! aSign
3320                  || (    ( aExp == 0x7FFF )
3321                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3322                ) {
3323                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3324             }
3325             return (sbits64) LIT64( 0x8000000000000000 );
3326         }
3327         aSigExtra = 0;
3328     }
3329     else {
3330         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3331     }
3332     return roundAndPackInt64( aSign, aSig, aSigExtra );
3333 
3334 }
3335 
3336 /*----------------------------------------------------------------------------
3337 | Returns the result of converting the extended double-precision floating-
3338 | point value `a' to the 64-bit two's complement integer format.  The
3339 | conversion is performed according to the IEC/IEEE Standard for Binary
3340 | Floating-Point Arithmetic, except that the conversion is always rounded
3341 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3342 | Otherwise, if the conversion overflows, the largest integer with the same
3343 | sign as `a' is returned.
3344 *----------------------------------------------------------------------------*/
3345 
3346 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3347 {
3348     flag aSign;
3349     int32 aExp, shiftCount;
3350     bits64 aSig;
3351     int64 z;
3352 
3353     aSig = extractFloatx80Frac( a );
3354     aExp = extractFloatx80Exp( a );
3355     aSign = extractFloatx80Sign( a );
3356     shiftCount = aExp - 0x403E;
3357     if ( 0 <= shiftCount ) {
3358         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3359         if ( ( a.high != 0xC03E ) || aSig ) {
3360             float_raise( float_flag_invalid );
3361             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3362                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3363             }
3364         }
3365         return (sbits64) LIT64( 0x8000000000000000 );
3366     }
3367     else if ( aExp < 0x3FFF ) {
3368         if ( aExp | aSig ) float_set_inexact();
3369         return 0;
3370     }
3371     z = aSig>>( - shiftCount );
3372     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3373         float_set_inexact();
3374     }
3375     if ( aSign ) z = - z;
3376     return z;
3377 
3378 }
3379 
3380 /*----------------------------------------------------------------------------
3381 | Returns the result of converting the extended double-precision floating-
3382 | point value `a' to the single-precision floating-point format.  The
3383 | conversion is performed according to the IEC/IEEE Standard for Binary
3384 | Floating-Point Arithmetic.
3385 *----------------------------------------------------------------------------*/
3386 
3387 float32 floatx80_to_float32( floatx80 a )
3388 {
3389     flag aSign;
3390     int32 aExp;
3391     bits64 aSig;
3392 
3393     aSig = extractFloatx80Frac( a );
3394     aExp = extractFloatx80Exp( a );
3395     aSign = extractFloatx80Sign( a );
3396     if ( aExp == 0x7FFF ) {
3397         if ( (bits64) ( aSig<<1 ) ) {
3398             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3399         }
3400         return packFloat32( aSign, 0xFF, 0 );
3401     }
3402     shift64RightJamming( aSig, 33, &aSig );
3403     if ( aExp || aSig ) aExp -= 0x3F81;
3404     return roundAndPackFloat32( aSign, aExp, aSig );
3405 
3406 }
3407 
3408 /*----------------------------------------------------------------------------
3409 | Returns the result of converting the extended double-precision floating-
3410 | point value `a' to the double-precision floating-point format.  The
3411 | conversion is performed according to the IEC/IEEE Standard for Binary
3412 | Floating-Point Arithmetic.
3413 *----------------------------------------------------------------------------*/
3414 
3415 float64 floatx80_to_float64( floatx80 a )
3416 {
3417     flag aSign;
3418     int32 aExp;
3419     bits64 aSig, zSig;
3420 
3421     aSig = extractFloatx80Frac( a );
3422     aExp = extractFloatx80Exp( a );
3423     aSign = extractFloatx80Sign( a );
3424     if ( aExp == 0x7FFF ) {
3425         if ( (bits64) ( aSig<<1 ) ) {
3426             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3427         }
3428         return packFloat64( aSign, 0x7FF, 0 );
3429     }
3430     shift64RightJamming( aSig, 1, &zSig );
3431     if ( aExp || aSig ) aExp -= 0x3C01;
3432     return roundAndPackFloat64( aSign, aExp, zSig );
3433 
3434 }
3435 
3436 #ifdef FLOAT128
3437 
3438 /*----------------------------------------------------------------------------
3439 | Returns the result of converting the extended double-precision floating-
3440 | point value `a' to the quadruple-precision floating-point format.  The
3441 | conversion is performed according to the IEC/IEEE Standard for Binary
3442 | Floating-Point Arithmetic.
3443 *----------------------------------------------------------------------------*/
3444 
3445 float128 floatx80_to_float128( floatx80 a )
3446 {
3447     flag aSign;
3448     int16 aExp;
3449     bits64 aSig, zSig0, zSig1;
3450 
3451     aSig = extractFloatx80Frac( a );
3452     aExp = extractFloatx80Exp( a );
3453     aSign = extractFloatx80Sign( a );
3454     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3455         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3456     }
3457     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3458     return packFloat128( aSign, aExp, zSig0, zSig1 );
3459 
3460 }
3461 
3462 #endif
3463 
3464 /*----------------------------------------------------------------------------
3465 | Rounds the extended double-precision floating-point value `a' to an integer,
3466 | and returns the result as an extended quadruple-precision floating-point
3467 | value.  The operation is performed according to the IEC/IEEE Standard for
3468 | Binary Floating-Point Arithmetic.
3469 *----------------------------------------------------------------------------*/
3470 
3471 floatx80 floatx80_round_to_int( floatx80 a )
3472 {
3473     flag aSign;
3474     int32 aExp;
3475     bits64 lastBitMask, roundBitsMask;
3476     int8 roundingMode;
3477     floatx80 z;
3478 
3479     aExp = extractFloatx80Exp( a );
3480     if ( 0x403E <= aExp ) {
3481         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3482             return propagateFloatx80NaN( a, a );
3483         }
3484         return a;
3485     }
3486     if ( aExp < 0x3FFF ) {
3487         if (    ( aExp == 0 )
3488              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3489             return a;
3490         }
3491         float_set_inexact();
3492         aSign = extractFloatx80Sign( a );
3493         switch ( float_rounding_mode() ) {
3494          case float_round_nearest_even:
3495             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3496                ) {
3497                 return
3498                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3499             }
3500             break;
3501          case float_round_down:
3502             return
3503                   aSign ?
3504                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3505                 : packFloatx80( 0, 0, 0 );
3506          case float_round_up:
3507             return
3508                   aSign ? packFloatx80( 1, 0, 0 )
3509                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3510         }
3511         return packFloatx80( aSign, 0, 0 );
3512     }
3513     lastBitMask = 1;
3514     lastBitMask <<= 0x403E - aExp;
3515     roundBitsMask = lastBitMask - 1;
3516     z = a;
3517     roundingMode = float_rounding_mode();
3518     if ( roundingMode == float_round_nearest_even ) {
3519         z.low += lastBitMask>>1;
3520         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3521     }
3522     else if ( roundingMode != float_round_to_zero ) {
3523         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3524             z.low += roundBitsMask;
3525         }
3526     }
3527     z.low &= ~ roundBitsMask;
3528     if ( z.low == 0 ) {
3529         ++z.high;
3530         z.low = LIT64( 0x8000000000000000 );
3531     }
3532     if ( z.low != a.low ) float_set_inexact();
3533     return z;
3534 
3535 }
3536 
3537 /*----------------------------------------------------------------------------
3538 | Returns the result of adding the absolute values of the extended double-
3539 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3540 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3541 | The addition is performed according to the IEC/IEEE Standard for Binary
3542 | Floating-Point Arithmetic.
3543 *----------------------------------------------------------------------------*/
3544 
3545 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3546 {
3547     int32 aExp, bExp, zExp;
3548     bits64 aSig, bSig, zSig0, zSig1;
3549     int32 expDiff;
3550 
3551     aSig = extractFloatx80Frac( a );
3552     aExp = extractFloatx80Exp( a );
3553     bSig = extractFloatx80Frac( b );
3554     bExp = extractFloatx80Exp( b );
3555     expDiff = aExp - bExp;
3556     if ( 0 < expDiff ) {
3557         if ( aExp == 0x7FFF ) {
3558             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3559             return a;
3560         }
3561         if ( bExp == 0 ) --expDiff;
3562         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3563         zExp = aExp;
3564     }
3565     else if ( expDiff < 0 ) {
3566         if ( bExp == 0x7FFF ) {
3567             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3568             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3569         }
3570         if ( aExp == 0 ) ++expDiff;
3571         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3572         zExp = bExp;
3573     }
3574     else {
3575         if ( aExp == 0x7FFF ) {
3576             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3577                 return propagateFloatx80NaN( a, b );
3578             }
3579             return a;
3580         }
3581         zSig1 = 0;
3582         zSig0 = aSig + bSig;
3583         if ( aExp == 0 ) {
3584             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3585             goto roundAndPack;
3586         }
3587         zExp = aExp;
3588         goto shiftRight1;
3589     }
3590     zSig0 = aSig + bSig;
3591     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3592  shiftRight1:
3593     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3594     zSig0 |= LIT64( 0x8000000000000000 );
3595     ++zExp;
3596  roundAndPack:
3597     return
3598         roundAndPackFloatx80(
3599             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3600 
3601 }
3602 
3603 /*----------------------------------------------------------------------------
3604 | Returns the result of subtracting the absolute values of the extended
3605 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3606 | difference is negated before being returned.  `zSign' is ignored if the
3607 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3608 | Standard for Binary Floating-Point Arithmetic.
3609 *----------------------------------------------------------------------------*/
3610 
3611 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3612 {
3613     int32 aExp, bExp, zExp;
3614     bits64 aSig, bSig, zSig0, zSig1;
3615     int32 expDiff;
3616     floatx80 z;
3617 
3618     aSig = extractFloatx80Frac( a );
3619     aExp = extractFloatx80Exp( a );
3620     bSig = extractFloatx80Frac( b );
3621     bExp = extractFloatx80Exp( b );
3622     expDiff = aExp - bExp;
3623     if ( 0 < expDiff ) goto aExpBigger;
3624     if ( expDiff < 0 ) goto bExpBigger;
3625     if ( aExp == 0x7FFF ) {
3626         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3627             return propagateFloatx80NaN( a, b );
3628         }
3629         float_raise( float_flag_invalid );
3630         z.low = floatx80_default_nan_low;
3631         z.high = floatx80_default_nan_high;
3632         return z;
3633     }
3634     if ( aExp == 0 ) {
3635         aExp = 1;
3636         bExp = 1;
3637     }
3638     zSig1 = 0;
3639     if ( bSig < aSig ) goto aBigger;
3640     if ( aSig < bSig ) goto bBigger;
3641     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3642  bExpBigger:
3643     if ( bExp == 0x7FFF ) {
3644         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3645         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3646     }
3647     if ( aExp == 0 ) ++expDiff;
3648     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3649  bBigger:
3650     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3651     zExp = bExp;
3652     zSign ^= 1;
3653     goto normalizeRoundAndPack;
3654  aExpBigger:
3655     if ( aExp == 0x7FFF ) {
3656         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3657         return a;
3658     }
3659     if ( bExp == 0 ) --expDiff;
3660     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3661  aBigger:
3662     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3663     zExp = aExp;
3664  normalizeRoundAndPack:
3665     return
3666         normalizeRoundAndPackFloatx80(
3667             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3668 
3669 }
3670 
3671 /*----------------------------------------------------------------------------
3672 | Returns the result of adding the extended double-precision floating-point
3673 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3674 | Standard for Binary Floating-Point Arithmetic.
3675 *----------------------------------------------------------------------------*/
3676 
3677 floatx80 floatx80_add( floatx80 a, floatx80 b )
3678 {
3679     flag aSign, bSign;
3680 
3681     aSign = extractFloatx80Sign( a );
3682     bSign = extractFloatx80Sign( b );
3683     if ( aSign == bSign ) {
3684         return addFloatx80Sigs( a, b, aSign );
3685     }
3686     else {
3687         return subFloatx80Sigs( a, b, aSign );
3688     }
3689 
3690 }
3691 
3692 /*----------------------------------------------------------------------------
3693 | Returns the result of subtracting the extended double-precision floating-
3694 | point values `a' and `b'.  The operation is performed according to the
3695 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3696 *----------------------------------------------------------------------------*/
3697 
3698 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3699 {
3700     flag aSign, bSign;
3701 
3702     aSign = extractFloatx80Sign( a );
3703     bSign = extractFloatx80Sign( b );
3704     if ( aSign == bSign ) {
3705         return subFloatx80Sigs( a, b, aSign );
3706     }
3707     else {
3708         return addFloatx80Sigs( a, b, aSign );
3709     }
3710 
3711 }
3712 
3713 /*----------------------------------------------------------------------------
3714 | Returns the result of multiplying the extended double-precision floating-
3715 | point values `a' and `b'.  The operation is performed according to the
3716 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3717 *----------------------------------------------------------------------------*/
3718 
3719 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3720 {
3721     flag aSign, bSign, zSign;
3722     int32 aExp, bExp, zExp;
3723     bits64 aSig, bSig, zSig0, zSig1;
3724     floatx80 z;
3725 
3726     aSig = extractFloatx80Frac( a );
3727     aExp = extractFloatx80Exp( a );
3728     aSign = extractFloatx80Sign( a );
3729     bSig = extractFloatx80Frac( b );
3730     bExp = extractFloatx80Exp( b );
3731     bSign = extractFloatx80Sign( b );
3732     zSign = aSign ^ bSign;
3733     if ( aExp == 0x7FFF ) {
3734         if (    (bits64) ( aSig<<1 )
3735              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3736             return propagateFloatx80NaN( a, b );
3737         }
3738         if ( ( bExp | bSig ) == 0 ) goto invalid;
3739         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3740     }
3741     if ( bExp == 0x7FFF ) {
3742         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3743         if ( ( aExp | aSig ) == 0 ) {
3744  invalid:
3745             float_raise( float_flag_invalid );
3746             z.low = floatx80_default_nan_low;
3747             z.high = floatx80_default_nan_high;
3748             return z;
3749         }
3750         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3751     }
3752     if ( aExp == 0 ) {
3753         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3754         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3755     }
3756     if ( bExp == 0 ) {
3757         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3758         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3759     }
3760     zExp = aExp + bExp - 0x3FFE;
3761     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3762     if ( 0 < (sbits64) zSig0 ) {
3763         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3764         --zExp;
3765     }
3766     return
3767         roundAndPackFloatx80(
3768             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3769 
3770 }
3771 
3772 /*----------------------------------------------------------------------------
3773 | Returns the result of dividing the extended double-precision floating-point
3774 | value `a' by the corresponding value `b'.  The operation is performed
3775 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3776 *----------------------------------------------------------------------------*/
3777 
3778 floatx80 floatx80_div( floatx80 a, floatx80 b )
3779 {
3780     flag aSign, bSign, zSign;
3781     int32 aExp, bExp, zExp;
3782     bits64 aSig, bSig, zSig0, zSig1;
3783     bits64 rem0, rem1, rem2, term0, term1, term2;
3784     floatx80 z;
3785 
3786     aSig = extractFloatx80Frac( a );
3787     aExp = extractFloatx80Exp( a );
3788     aSign = extractFloatx80Sign( a );
3789     bSig = extractFloatx80Frac( b );
3790     bExp = extractFloatx80Exp( b );
3791     bSign = extractFloatx80Sign( b );
3792     zSign = aSign ^ bSign;
3793     if ( aExp == 0x7FFF ) {
3794         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3795         if ( bExp == 0x7FFF ) {
3796             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3797             goto invalid;
3798         }
3799         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3800     }
3801     if ( bExp == 0x7FFF ) {
3802         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3803         return packFloatx80( zSign, 0, 0 );
3804     }
3805     if ( bExp == 0 ) {
3806         if ( bSig == 0 ) {
3807             if ( ( aExp | aSig ) == 0 ) {
3808  invalid:
3809                 float_raise( float_flag_invalid );
3810                 z.low = floatx80_default_nan_low;
3811                 z.high = floatx80_default_nan_high;
3812                 return z;
3813             }
3814             float_raise( float_flag_divbyzero );
3815             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3816         }
3817         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3818     }
3819     if ( aExp == 0 ) {
3820         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3821         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3822     }
3823     zExp = aExp - bExp + 0x3FFE;
3824     rem1 = 0;
3825     if ( bSig <= aSig ) {
3826         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3827         ++zExp;
3828     }
3829     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3830     mul64To128( bSig, zSig0, &term0, &term1 );
3831     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3832     while ( (sbits64) rem0 < 0 ) {
3833         --zSig0;
3834         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3835     }
3836     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3837     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3838         mul64To128( bSig, zSig1, &term1, &term2 );
3839         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3840         while ( (sbits64) rem1 < 0 ) {
3841             --zSig1;
3842             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3843         }
3844         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3845     }
3846     return
3847         roundAndPackFloatx80(
3848             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3849 
3850 }
3851 
3852 /*----------------------------------------------------------------------------
3853 | Returns the remainder of the extended double-precision floating-point value
3854 | `a' with respect to the corresponding value `b'.  The operation is performed
3855 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3856 *----------------------------------------------------------------------------*/
3857 
3858 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3859 {
3860     flag aSign, bSign, zSign;
3861     int32 aExp, bExp, expDiff;
3862     bits64 aSig0, aSig1, bSig;
3863     bits64 q, term0, term1, alternateASig0, alternateASig1;
3864     floatx80 z;
3865 
3866     aSig0 = extractFloatx80Frac( a );
3867     aExp = extractFloatx80Exp( a );
3868     aSign = extractFloatx80Sign( a );
3869     bSig = extractFloatx80Frac( b );
3870     bExp = extractFloatx80Exp( b );
3871     bSign = extractFloatx80Sign( b );
3872     if ( aExp == 0x7FFF ) {
3873         if (    (bits64) ( aSig0<<1 )
3874              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3875             return propagateFloatx80NaN( a, b );
3876         }
3877         goto invalid;
3878     }
3879     if ( bExp == 0x7FFF ) {
3880         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3881         return a;
3882     }
3883     if ( bExp == 0 ) {
3884         if ( bSig == 0 ) {
3885  invalid:
3886             float_raise( float_flag_invalid );
3887             z.low = floatx80_default_nan_low;
3888             z.high = floatx80_default_nan_high;
3889             return z;
3890         }
3891         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3892     }
3893     if ( aExp == 0 ) {
3894         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3895         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3896     }
3897     bSig |= LIT64( 0x8000000000000000 );
3898     zSign = aSign;
3899     expDiff = aExp - bExp;
3900     aSig1 = 0;
3901     if ( expDiff < 0 ) {
3902         if ( expDiff < -1 ) return a;
3903         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3904         expDiff = 0;
3905     }
3906     q = ( bSig <= aSig0 );
3907     if ( q ) aSig0 -= bSig;
3908     expDiff -= 64;
3909     while ( 0 < expDiff ) {
3910         q = estimateDiv128To64( aSig0, aSig1, bSig );
3911         q = ( 2 < q ) ? q - 2 : 0;
3912         mul64To128( bSig, q, &term0, &term1 );
3913         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3914         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3915         expDiff -= 62;
3916     }
3917     expDiff += 64;
3918     if ( 0 < expDiff ) {
3919         q = estimateDiv128To64( aSig0, aSig1, bSig );
3920         q = ( 2 < q ) ? q - 2 : 0;
3921         q >>= 64 - expDiff;
3922         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3923         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3924         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3925         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3926             ++q;
3927             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3928         }
3929     }
3930     else {
3931         term1 = 0;
3932         term0 = bSig;
3933     }
3934     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3935     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3936          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3937               && ( q & 1 ) )
3938        ) {
3939         aSig0 = alternateASig0;
3940         aSig1 = alternateASig1;
3941         zSign = ! zSign;
3942     }
3943     return
3944         normalizeRoundAndPackFloatx80(
3945             80, zSign, bExp + expDiff, aSig0, aSig1 );
3946 
3947 }
3948 
3949 /*----------------------------------------------------------------------------
3950 | Returns the square root of the extended double-precision floating-point
3951 | value `a'.  The operation is performed according to the IEC/IEEE Standard
3952 | for Binary Floating-Point Arithmetic.
3953 *----------------------------------------------------------------------------*/
3954 
3955 floatx80 floatx80_sqrt( floatx80 a )
3956 {
3957     flag aSign;
3958     int32 aExp, zExp;
3959     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3960     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3961     floatx80 z;
3962 
3963     aSig0 = extractFloatx80Frac( a );
3964     aExp = extractFloatx80Exp( a );
3965     aSign = extractFloatx80Sign( a );
3966     if ( aExp == 0x7FFF ) {
3967         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3968         if ( ! aSign ) return a;
3969         goto invalid;
3970     }
3971     if ( aSign ) {
3972         if ( ( aExp | aSig0 ) == 0 ) return a;
3973  invalid:
3974         float_raise( float_flag_invalid );
3975         z.low = floatx80_default_nan_low;
3976         z.high = floatx80_default_nan_high;
3977         return z;
3978     }
3979     if ( aExp == 0 ) {
3980         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3981         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3982     }
3983     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3984     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3985     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3986     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3987     doubleZSig0 = zSig0<<1;
3988     mul64To128( zSig0, zSig0, &term0, &term1 );
3989     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3990     while ( (sbits64) rem0 < 0 ) {
3991         --zSig0;
3992         doubleZSig0 -= 2;
3993         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3994     }
3995     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3996     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3997         if ( zSig1 == 0 ) zSig1 = 1;
3998         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3999         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4000         mul64To128( zSig1, zSig1, &term2, &term3 );
4001         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4002         while ( (sbits64) rem1 < 0 ) {
4003             --zSig1;
4004             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4005             term3 |= 1;
4006             term2 |= doubleZSig0;
4007             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4008         }
4009         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4010     }
4011     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4012     zSig0 |= doubleZSig0;
4013     return
4014         roundAndPackFloatx80(
4015             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4016 
4017 }
4018 
4019 /*----------------------------------------------------------------------------
4020 | Returns 1 if the extended double-precision floating-point value `a' is
4021 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
4022 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4023 | Arithmetic.
4024 *----------------------------------------------------------------------------*/
4025 
4026 flag floatx80_eq( floatx80 a, floatx80 b )
4027 {
4028 
4029     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4030               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4031          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4032               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4033        ) {
4034         if (    floatx80_is_signaling_nan( a )
4035              || floatx80_is_signaling_nan( b ) ) {
4036             float_raise( float_flag_invalid );
4037         }
4038         return 0;
4039     }
4040     return
4041            ( a.low == b.low )
4042         && (    ( a.high == b.high )
4043              || (    ( a.low == 0 )
4044                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4045            );
4046 
4047 }
4048 
4049 /*----------------------------------------------------------------------------
4050 | Returns 1 if the extended double-precision floating-point value `a' is
4051 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4052 | comparison is performed according to the IEC/IEEE Standard for Binary
4053 | Floating-Point Arithmetic.
4054 *----------------------------------------------------------------------------*/
4055 
4056 flag floatx80_le( floatx80 a, floatx80 b )
4057 {
4058     flag aSign, bSign;
4059 
4060     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4061               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4062          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4063               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4064        ) {
4065         float_raise( float_flag_invalid );
4066         return 0;
4067     }
4068     aSign = extractFloatx80Sign( a );
4069     bSign = extractFloatx80Sign( b );
4070     if ( aSign != bSign ) {
4071         return
4072                aSign
4073             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4074                  == 0 );
4075     }
4076     return
4077           aSign ? le128( b.high, b.low, a.high, a.low )
4078         : le128( a.high, a.low, b.high, b.low );
4079 
4080 }
4081 
4082 /*----------------------------------------------------------------------------
4083 | Returns 1 if the extended double-precision floating-point value `a' is
4084 | less than the corresponding value `b', and 0 otherwise.  The comparison
4085 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4086 | Arithmetic.
4087 *----------------------------------------------------------------------------*/
4088 
4089 flag floatx80_lt( floatx80 a, floatx80 b )
4090 {
4091     flag aSign, bSign;
4092 
4093     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4094               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4095          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4096               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4097        ) {
4098         float_raise( float_flag_invalid );
4099         return 0;
4100     }
4101     aSign = extractFloatx80Sign( a );
4102     bSign = extractFloatx80Sign( b );
4103     if ( aSign != bSign ) {
4104         return
4105                aSign
4106             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4107                  != 0 );
4108     }
4109     return
4110           aSign ? lt128( b.high, b.low, a.high, a.low )
4111         : lt128( a.high, a.low, b.high, b.low );
4112 
4113 }
4114 
4115 /*----------------------------------------------------------------------------
4116 | Returns 1 if the extended double-precision floating-point value `a' is equal
4117 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4118 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4119 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4120 *----------------------------------------------------------------------------*/
4121 
4122 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4123 {
4124 
4125     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4126               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4127          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4128               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4129        ) {
4130         float_raise( float_flag_invalid );
4131         return 0;
4132     }
4133     return
4134            ( a.low == b.low )
4135         && (    ( a.high == b.high )
4136              || (    ( a.low == 0 )
4137                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4138            );
4139 
4140 }
4141 
4142 /*----------------------------------------------------------------------------
4143 | Returns 1 if the extended double-precision floating-point value `a' is less
4144 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4145 | do not cause an exception.  Otherwise, the comparison is performed according
4146 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4147 *----------------------------------------------------------------------------*/
4148 
4149 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4150 {
4151     flag aSign, bSign;
4152 
4153     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4154               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4155          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4156               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4157        ) {
4158         if (    floatx80_is_signaling_nan( a )
4159              || floatx80_is_signaling_nan( b ) ) {
4160             float_raise( float_flag_invalid );
4161         }
4162         return 0;
4163     }
4164     aSign = extractFloatx80Sign( a );
4165     bSign = extractFloatx80Sign( b );
4166     if ( aSign != bSign ) {
4167         return
4168                aSign
4169             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4170                  == 0 );
4171     }
4172     return
4173           aSign ? le128( b.high, b.low, a.high, a.low )
4174         : le128( a.high, a.low, b.high, b.low );
4175 
4176 }
4177 
4178 /*----------------------------------------------------------------------------
4179 | Returns 1 if the extended double-precision floating-point value `a' is less
4180 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4181 | an exception.  Otherwise, the comparison is performed according to the
4182 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4183 *----------------------------------------------------------------------------*/
4184 
4185 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4186 {
4187     flag aSign, bSign;
4188 
4189     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4190               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4191          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4192               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4193        ) {
4194         if (    floatx80_is_signaling_nan( a )
4195              || floatx80_is_signaling_nan( b ) ) {
4196             float_raise( float_flag_invalid );
4197         }
4198         return 0;
4199     }
4200     aSign = extractFloatx80Sign( a );
4201     bSign = extractFloatx80Sign( b );
4202     if ( aSign != bSign ) {
4203         return
4204                aSign
4205             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4206                  != 0 );
4207     }
4208     return
4209           aSign ? lt128( b.high, b.low, a.high, a.low )
4210         : lt128( a.high, a.low, b.high, b.low );
4211 
4212 }
4213 
4214 #endif
4215 
4216 #ifdef FLOAT128
4217 
4218 /*----------------------------------------------------------------------------
4219 | Returns the result of converting the quadruple-precision floating-point
4220 | value `a' to the 32-bit two's complement integer format.  The conversion
4221 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4222 | Arithmetic---which means in particular that the conversion is rounded
4223 | according to the current rounding mode.  If `a' is a NaN, the largest
4224 | positive integer is returned.  Otherwise, if the conversion overflows, the
4225 | largest integer with the same sign as `a' is returned.
4226 *----------------------------------------------------------------------------*/
4227 
4228 int32 float128_to_int32( float128 a )
4229 {
4230     flag aSign;
4231     int32 aExp, shiftCount;
4232     bits64 aSig0, aSig1;
4233 
4234     aSig1 = extractFloat128Frac1( a );
4235     aSig0 = extractFloat128Frac0( a );
4236     aExp = extractFloat128Exp( a );
4237     aSign = extractFloat128Sign( a );
4238     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4239     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4240     aSig0 |= ( aSig1 != 0 );
4241     shiftCount = 0x4028 - aExp;
4242     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4243     return roundAndPackInt32( aSign, aSig0 );
4244 
4245 }
4246 
4247 /*----------------------------------------------------------------------------
4248 | Returns the result of converting the quadruple-precision floating-point
4249 | value `a' to the 32-bit two's complement integer format.  The conversion
4250 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4251 | Arithmetic, except that the conversion is always rounded toward zero.  If
4252 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4253 | conversion overflows, the largest integer with the same sign as `a' is
4254 | returned.
4255 *----------------------------------------------------------------------------*/
4256 
4257 int32 float128_to_int32_round_to_zero( float128 a )
4258 {
4259     flag aSign;
4260     int32 aExp, shiftCount;
4261     bits64 aSig0, aSig1, savedASig;
4262     int32 z;
4263 
4264     aSig1 = extractFloat128Frac1( a );
4265     aSig0 = extractFloat128Frac0( a );
4266     aExp = extractFloat128Exp( a );
4267     aSign = extractFloat128Sign( a );
4268     aSig0 |= ( aSig1 != 0 );
4269     if ( 0x401E < aExp ) {
4270         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4271         goto invalid;
4272     }
4273     else if ( aExp < 0x3FFF ) {
4274         if ( aExp || aSig0 ) float_set_inexact();
4275         return 0;
4276     }
4277     aSig0 |= LIT64( 0x0001000000000000 );
4278     shiftCount = 0x402F - aExp;
4279     savedASig = aSig0;
4280     aSig0 >>= shiftCount;
4281     z = aSig0;
4282     if ( aSign ) z = - z;
4283     if ( ( z < 0 ) ^ aSign ) {
4284  invalid:
4285         float_raise( float_flag_invalid );
4286         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4287     }
4288     if ( ( aSig0<<shiftCount ) != savedASig ) {
4289         float_set_inexact();
4290     }
4291     return z;
4292 
4293 }
4294 
4295 /*----------------------------------------------------------------------------
4296 | Returns the result of converting the quadruple-precision floating-point
4297 | value `a' to the 64-bit two's complement integer format.  The conversion
4298 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4299 | Arithmetic---which means in particular that the conversion is rounded
4300 | according to the current rounding mode.  If `a' is a NaN, the largest
4301 | positive integer is returned.  Otherwise, if the conversion overflows, the
4302 | largest integer with the same sign as `a' is returned.
4303 *----------------------------------------------------------------------------*/
4304 
4305 int64 float128_to_int64( float128 a )
4306 {
4307     flag aSign;
4308     int32 aExp, shiftCount;
4309     bits64 aSig0, aSig1;
4310 
4311     aSig1 = extractFloat128Frac1( a );
4312     aSig0 = extractFloat128Frac0( a );
4313     aExp = extractFloat128Exp( a );
4314     aSign = extractFloat128Sign( a );
4315     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4316     shiftCount = 0x402F - aExp;
4317     if ( shiftCount <= 0 ) {
4318         if ( 0x403E < aExp ) {
4319             float_raise( float_flag_invalid );
4320             if (    ! aSign
4321                  || (    ( aExp == 0x7FFF )
4322                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4323                     )
4324                ) {
4325                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4326             }
4327             return (sbits64) LIT64( 0x8000000000000000 );
4328         }
4329         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4330     }
4331     else {
4332         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4333     }
4334     return roundAndPackInt64( aSign, aSig0, aSig1 );
4335 
4336 }
4337 
4338 /*----------------------------------------------------------------------------
4339 | Returns the result of converting the quadruple-precision floating-point
4340 | value `a' to the 64-bit two's complement integer format.  The conversion
4341 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4342 | Arithmetic, except that the conversion is always rounded toward zero.
4343 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4344 | the conversion overflows, the largest integer with the same sign as `a' is
4345 | returned.
4346 *----------------------------------------------------------------------------*/
4347 
4348 int64 float128_to_int64_round_to_zero( float128 a )
4349 {
4350     flag aSign;
4351     int32 aExp, shiftCount;
4352     bits64 aSig0, aSig1;
4353     int64 z;
4354 
4355     aSig1 = extractFloat128Frac1( a );
4356     aSig0 = extractFloat128Frac0( a );
4357     aExp = extractFloat128Exp( a );
4358     aSign = extractFloat128Sign( a );
4359     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4360     shiftCount = aExp - 0x402F;
4361     if ( 0 < shiftCount ) {
4362         if ( 0x403E <= aExp ) {
4363             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4364             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4365                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4366                 if ( aSig1 ) float_set_inexact();
4367             }
4368             else {
4369                 float_raise( float_flag_invalid );
4370                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4371                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4372                 }
4373             }
4374             return (sbits64) LIT64( 0x8000000000000000 );
4375         }
4376         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4377         if ( (bits64) ( aSig1<<shiftCount ) ) {
4378             float_set_inexact();
4379         }
4380     }
4381     else {
4382         if ( aExp < 0x3FFF ) {
4383             if ( aExp | aSig0 | aSig1 ) {
4384                 float_set_inexact();
4385             }
4386             return 0;
4387         }
4388         z = aSig0>>( - shiftCount );
4389         if (    aSig1
4390              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4391             float_set_inexact();
4392         }
4393     }
4394     if ( aSign ) z = - z;
4395     return z;
4396 
4397 }
4398 
4399 /*----------------------------------------------------------------------------
4400 | Returns the result of converting the quadruple-precision floating-point
4401 | value `a' to the single-precision floating-point format.  The conversion
4402 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4403 | Arithmetic.
4404 *----------------------------------------------------------------------------*/
4405 
4406 float32 float128_to_float32( float128 a )
4407 {
4408     flag aSign;
4409     int32 aExp;
4410     bits64 aSig0, aSig1;
4411     bits32 zSig;
4412 
4413     aSig1 = extractFloat128Frac1( a );
4414     aSig0 = extractFloat128Frac0( a );
4415     aExp = extractFloat128Exp( a );
4416     aSign = extractFloat128Sign( a );
4417     if ( aExp == 0x7FFF ) {
4418         if ( aSig0 | aSig1 ) {
4419             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4420         }
4421         return packFloat32( aSign, 0xFF, 0 );
4422     }
4423     aSig0 |= ( aSig1 != 0 );
4424     shift64RightJamming( aSig0, 18, &aSig0 );
4425     zSig = aSig0;
4426     if ( aExp || zSig ) {
4427         zSig |= 0x40000000;
4428         aExp -= 0x3F81;
4429     }
4430     return roundAndPackFloat32( aSign, aExp, zSig );
4431 
4432 }
4433 
4434 /*----------------------------------------------------------------------------
4435 | Returns the result of converting the quadruple-precision floating-point
4436 | value `a' to the double-precision floating-point format.  The conversion
4437 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4438 | Arithmetic.
4439 *----------------------------------------------------------------------------*/
4440 
4441 float64 float128_to_float64( float128 a )
4442 {
4443     flag aSign;
4444     int32 aExp;
4445     bits64 aSig0, aSig1;
4446 
4447     aSig1 = extractFloat128Frac1( a );
4448     aSig0 = extractFloat128Frac0( a );
4449     aExp = extractFloat128Exp( a );
4450     aSign = extractFloat128Sign( a );
4451     if ( aExp == 0x7FFF ) {
4452         if ( aSig0 | aSig1 ) {
4453             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4454         }
4455         return packFloat64( aSign, 0x7FF, 0 );
4456     }
4457     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4458     aSig0 |= ( aSig1 != 0 );
4459     if ( aExp || aSig0 ) {
4460         aSig0 |= LIT64( 0x4000000000000000 );
4461         aExp -= 0x3C01;
4462     }
4463     return roundAndPackFloat64( aSign, aExp, aSig0 );
4464 
4465 }
4466 
4467 #ifdef FLOATX80
4468 
4469 /*----------------------------------------------------------------------------
4470 | Returns the result of converting the quadruple-precision floating-point
4471 | value `a' to the extended double-precision floating-point format.  The
4472 | conversion is performed according to the IEC/IEEE Standard for Binary
4473 | Floating-Point Arithmetic.
4474 *----------------------------------------------------------------------------*/
4475 
4476 floatx80 float128_to_floatx80( float128 a )
4477 {
4478     flag aSign;
4479     int32 aExp;
4480     bits64 aSig0, aSig1;
4481 
4482     aSig1 = extractFloat128Frac1( a );
4483     aSig0 = extractFloat128Frac0( a );
4484     aExp = extractFloat128Exp( a );
4485     aSign = extractFloat128Sign( a );
4486     if ( aExp == 0x7FFF ) {
4487         if ( aSig0 | aSig1 ) {
4488             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4489         }
4490         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4491     }
4492     if ( aExp == 0 ) {
4493         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4494         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4495     }
4496     else {
4497         aSig0 |= LIT64( 0x0001000000000000 );
4498     }
4499     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4500     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4501 
4502 }
4503 
4504 #endif
4505 
4506 /*----------------------------------------------------------------------------
4507 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4508 | returns the result as a quadruple-precision floating-point value.  The
4509 | operation is performed according to the IEC/IEEE Standard for Binary
4510 | Floating-Point Arithmetic.
4511 *----------------------------------------------------------------------------*/
4512 
4513 float128 float128_round_to_int( float128 a )
4514 {
4515     flag aSign;
4516     int32 aExp;
4517     bits64 lastBitMask, roundBitsMask;
4518     int8 roundingMode;
4519     float128 z;
4520 
4521     aExp = extractFloat128Exp( a );
4522     if ( 0x402F <= aExp ) {
4523         if ( 0x406F <= aExp ) {
4524             if (    ( aExp == 0x7FFF )
4525                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4526                ) {
4527                 return propagateFloat128NaN( a, a );
4528             }
4529             return a;
4530         }
4531         lastBitMask = 1;
4532         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4533         roundBitsMask = lastBitMask - 1;
4534         z = a;
4535         roundingMode = float_rounding_mode();
4536         if ( roundingMode == float_round_nearest_even ) {
4537             if ( lastBitMask ) {
4538                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4539                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4540             }
4541             else {
4542                 if ( (sbits64) z.low < 0 ) {
4543                     ++z.high;
4544                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4545                 }
4546             }
4547         }
4548         else if ( roundingMode != float_round_to_zero ) {
4549             if (   extractFloat128Sign( z )
4550                  ^ ( roundingMode == float_round_up ) ) {
4551                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4552             }
4553         }
4554         z.low &= ~ roundBitsMask;
4555     }
4556     else {
4557         if ( aExp < 0x3FFF ) {
4558             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4559             float_set_inexact();
4560             aSign = extractFloat128Sign( a );
4561             switch ( float_rounding_mode() ) {
4562              case float_round_nearest_even:
4563                 if (    ( aExp == 0x3FFE )
4564                      && (   extractFloat128Frac0( a )
4565                           | extractFloat128Frac1( a ) )
4566                    ) {
4567                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4568                 }
4569                 break;
4570              case float_round_down:
4571                 return
4572                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4573                     : packFloat128( 0, 0, 0, 0 );
4574              case float_round_up:
4575                 return
4576                       aSign ? packFloat128( 1, 0, 0, 0 )
4577                     : packFloat128( 0, 0x3FFF, 0, 0 );
4578             }
4579             return packFloat128( aSign, 0, 0, 0 );
4580         }
4581         lastBitMask = 1;
4582         lastBitMask <<= 0x402F - aExp;
4583         roundBitsMask = lastBitMask - 1;
4584         z.low = 0;
4585         z.high = a.high;
4586         roundingMode = float_rounding_mode();
4587         if ( roundingMode == float_round_nearest_even ) {
4588             z.high += lastBitMask>>1;
4589             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4590                 z.high &= ~ lastBitMask;
4591             }
4592         }
4593         else if ( roundingMode != float_round_to_zero ) {
4594             if (   extractFloat128Sign( z )
4595                  ^ ( roundingMode == float_round_up ) ) {
4596                 z.high |= ( a.low != 0 );
4597                 z.high += roundBitsMask;
4598             }
4599         }
4600         z.high &= ~ roundBitsMask;
4601     }
4602     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4603         float_set_inexact();
4604     }
4605     return z;
4606 
4607 }
4608 
4609 /*----------------------------------------------------------------------------
4610 | Returns the result of adding the absolute values of the quadruple-precision
4611 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4612 | before being returned.  `zSign' is ignored if the result is a NaN.
4613 | The addition is performed according to the IEC/IEEE Standard for Binary
4614 | Floating-Point Arithmetic.
4615 *----------------------------------------------------------------------------*/
4616 
4617 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4618 {
4619     int32 aExp, bExp, zExp;
4620     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4621     int32 expDiff;
4622 
4623     aSig1 = extractFloat128Frac1( a );
4624     aSig0 = extractFloat128Frac0( a );
4625     aExp = extractFloat128Exp( a );
4626     bSig1 = extractFloat128Frac1( b );
4627     bSig0 = extractFloat128Frac0( b );
4628     bExp = extractFloat128Exp( b );
4629     expDiff = aExp - bExp;
4630     if ( 0 < expDiff ) {
4631         if ( aExp == 0x7FFF ) {
4632             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4633             return a;
4634         }
4635         if ( bExp == 0 ) {
4636             --expDiff;
4637         }
4638         else {
4639             bSig0 |= LIT64( 0x0001000000000000 );
4640         }
4641         shift128ExtraRightJamming(
4642             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4643         zExp = aExp;
4644     }
4645     else if ( expDiff < 0 ) {
4646         if ( bExp == 0x7FFF ) {
4647             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4648             return packFloat128( zSign, 0x7FFF, 0, 0 );
4649         }
4650         if ( aExp == 0 ) {
4651             ++expDiff;
4652         }
4653         else {
4654             aSig0 |= LIT64( 0x0001000000000000 );
4655         }
4656         shift128ExtraRightJamming(
4657             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4658         zExp = bExp;
4659     }
4660     else {
4661         if ( aExp == 0x7FFF ) {
4662             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4663                 return propagateFloat128NaN( a, b );
4664             }
4665             return a;
4666         }
4667         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4668         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4669         zSig2 = 0;
4670         zSig0 |= LIT64( 0x0002000000000000 );
4671         zExp = aExp;
4672         goto shiftRight1;
4673     }
4674     aSig0 |= LIT64( 0x0001000000000000 );
4675     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4676     --zExp;
4677     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4678     ++zExp;
4679  shiftRight1:
4680     shift128ExtraRightJamming(
4681         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4682  roundAndPack:
4683     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4684 
4685 }
4686 
4687 /*----------------------------------------------------------------------------
4688 | Returns the result of subtracting the absolute values of the quadruple-
4689 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
4690 | difference is negated before being returned.  `zSign' is ignored if the
4691 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4692 | Standard for Binary Floating-Point Arithmetic.
4693 *----------------------------------------------------------------------------*/
4694 
4695 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4696 {
4697     int32 aExp, bExp, zExp;
4698     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4699     int32 expDiff;
4700     float128 z;
4701 
4702     aSig1 = extractFloat128Frac1( a );
4703     aSig0 = extractFloat128Frac0( a );
4704     aExp = extractFloat128Exp( a );
4705     bSig1 = extractFloat128Frac1( b );
4706     bSig0 = extractFloat128Frac0( b );
4707     bExp = extractFloat128Exp( b );
4708     expDiff = aExp - bExp;
4709     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4710     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4711     if ( 0 < expDiff ) goto aExpBigger;
4712     if ( expDiff < 0 ) goto bExpBigger;
4713     if ( aExp == 0x7FFF ) {
4714         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4715             return propagateFloat128NaN( a, b );
4716         }
4717         float_raise( float_flag_invalid );
4718         z.low = float128_default_nan_low;
4719         z.high = float128_default_nan_high;
4720         return z;
4721     }
4722     if ( aExp == 0 ) {
4723         aExp = 1;
4724         bExp = 1;
4725     }
4726     if ( bSig0 < aSig0 ) goto aBigger;
4727     if ( aSig0 < bSig0 ) goto bBigger;
4728     if ( bSig1 < aSig1 ) goto aBigger;
4729     if ( aSig1 < bSig1 ) goto bBigger;
4730     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4731  bExpBigger:
4732     if ( bExp == 0x7FFF ) {
4733         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4734         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4735     }
4736     if ( aExp == 0 ) {
4737         ++expDiff;
4738     }
4739     else {
4740         aSig0 |= LIT64( 0x4000000000000000 );
4741     }
4742     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4743     bSig0 |= LIT64( 0x4000000000000000 );
4744  bBigger:
4745     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4746     zExp = bExp;
4747     zSign ^= 1;
4748     goto normalizeRoundAndPack;
4749  aExpBigger:
4750     if ( aExp == 0x7FFF ) {
4751         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4752         return a;
4753     }
4754     if ( bExp == 0 ) {
4755         --expDiff;
4756     }
4757     else {
4758         bSig0 |= LIT64( 0x4000000000000000 );
4759     }
4760     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4761     aSig0 |= LIT64( 0x4000000000000000 );
4762  aBigger:
4763     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4764     zExp = aExp;
4765  normalizeRoundAndPack:
4766     --zExp;
4767     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4768 
4769 }
4770 
4771 /*----------------------------------------------------------------------------
4772 | Returns the result of adding the quadruple-precision floating-point values
4773 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4774 | for Binary Floating-Point Arithmetic.
4775 *----------------------------------------------------------------------------*/
4776 
4777 float128 float128_add( float128 a, float128 b )
4778 {
4779     flag aSign, bSign;
4780 
4781     aSign = extractFloat128Sign( a );
4782     bSign = extractFloat128Sign( b );
4783     if ( aSign == bSign ) {
4784         return addFloat128Sigs( a, b, aSign );
4785     }
4786     else {
4787         return subFloat128Sigs( a, b, aSign );
4788     }
4789 
4790 }
4791 
4792 /*----------------------------------------------------------------------------
4793 | Returns the result of subtracting the quadruple-precision floating-point
4794 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4795 | Standard for Binary Floating-Point Arithmetic.
4796 *----------------------------------------------------------------------------*/
4797 
4798 float128 float128_sub( float128 a, float128 b )
4799 {
4800     flag aSign, bSign;
4801 
4802     aSign = extractFloat128Sign( a );
4803     bSign = extractFloat128Sign( b );
4804     if ( aSign == bSign ) {
4805         return subFloat128Sigs( a, b, aSign );
4806     }
4807     else {
4808         return addFloat128Sigs( a, b, aSign );
4809     }
4810 
4811 }
4812 
4813 /*----------------------------------------------------------------------------
4814 | Returns the result of multiplying the quadruple-precision floating-point
4815 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4816 | Standard for Binary Floating-Point Arithmetic.
4817 *----------------------------------------------------------------------------*/
4818 
4819 float128 float128_mul( float128 a, float128 b )
4820 {
4821     flag aSign, bSign, zSign;
4822     int32 aExp, bExp, zExp;
4823     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4824     float128 z;
4825 
4826     aSig1 = extractFloat128Frac1( a );
4827     aSig0 = extractFloat128Frac0( a );
4828     aExp = extractFloat128Exp( a );
4829     aSign = extractFloat128Sign( a );
4830     bSig1 = extractFloat128Frac1( b );
4831     bSig0 = extractFloat128Frac0( b );
4832     bExp = extractFloat128Exp( b );
4833     bSign = extractFloat128Sign( b );
4834     zSign = aSign ^ bSign;
4835     if ( aExp == 0x7FFF ) {
4836         if (    ( aSig0 | aSig1 )
4837              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4838             return propagateFloat128NaN( a, b );
4839         }
4840         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4841         return packFloat128( zSign, 0x7FFF, 0, 0 );
4842     }
4843     if ( bExp == 0x7FFF ) {
4844         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4845         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4846  invalid:
4847             float_raise( float_flag_invalid );
4848             z.low = float128_default_nan_low;
4849             z.high = float128_default_nan_high;
4850             return z;
4851         }
4852         return packFloat128( zSign, 0x7FFF, 0, 0 );
4853     }
4854     if ( aExp == 0 ) {
4855         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4856         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4857     }
4858     if ( bExp == 0 ) {
4859         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4860         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4861     }
4862     zExp = aExp + bExp - 0x4000;
4863     aSig0 |= LIT64( 0x0001000000000000 );
4864     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4865     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4866     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4867     zSig2 |= ( zSig3 != 0 );
4868     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4869         shift128ExtraRightJamming(
4870             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4871         ++zExp;
4872     }
4873     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4874 
4875 }
4876 
4877 /*----------------------------------------------------------------------------
4878 | Returns the result of dividing the quadruple-precision floating-point value
4879 | `a' by the corresponding value `b'.  The operation is performed according to
4880 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4881 *----------------------------------------------------------------------------*/
4882 
4883 float128 float128_div( float128 a, float128 b )
4884 {
4885     flag aSign, bSign, zSign;
4886     int32 aExp, bExp, zExp;
4887     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4888     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4889     float128 z;
4890 
4891     aSig1 = extractFloat128Frac1( a );
4892     aSig0 = extractFloat128Frac0( a );
4893     aExp = extractFloat128Exp( a );
4894     aSign = extractFloat128Sign( a );
4895     bSig1 = extractFloat128Frac1( b );
4896     bSig0 = extractFloat128Frac0( b );
4897     bExp = extractFloat128Exp( b );
4898     bSign = extractFloat128Sign( b );
4899     zSign = aSign ^ bSign;
4900     if ( aExp == 0x7FFF ) {
4901         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4902         if ( bExp == 0x7FFF ) {
4903             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4904             goto invalid;
4905         }
4906         return packFloat128( zSign, 0x7FFF, 0, 0 );
4907     }
4908     if ( bExp == 0x7FFF ) {
4909         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4910         return packFloat128( zSign, 0, 0, 0 );
4911     }
4912     if ( bExp == 0 ) {
4913         if ( ( bSig0 | bSig1 ) == 0 ) {
4914             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4915  invalid:
4916                 float_raise( float_flag_invalid );
4917                 z.low = float128_default_nan_low;
4918                 z.high = float128_default_nan_high;
4919                 return z;
4920             }
4921             float_raise( float_flag_divbyzero );
4922             return packFloat128( zSign, 0x7FFF, 0, 0 );
4923         }
4924         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4925     }
4926     if ( aExp == 0 ) {
4927         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4928         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4929     }
4930     zExp = aExp - bExp + 0x3FFD;
4931     shortShift128Left(
4932         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4933     shortShift128Left(
4934         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4935     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4936         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4937         ++zExp;
4938     }
4939     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4940     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4941     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4942     while ( (sbits64) rem0 < 0 ) {
4943         --zSig0;
4944         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4945     }
4946     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4947     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4948         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4949         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4950         while ( (sbits64) rem1 < 0 ) {
4951             --zSig1;
4952             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4953         }
4954         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4955     }
4956     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4957     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4958 
4959 }
4960 
4961 /*----------------------------------------------------------------------------
4962 | Returns the remainder of the quadruple-precision floating-point value `a'
4963 | with respect to the corresponding value `b'.  The operation is performed
4964 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4965 *----------------------------------------------------------------------------*/
4966 
4967 float128 float128_rem( float128 a, float128 b )
4968 {
4969     flag aSign, bSign, zSign;
4970     int32 aExp, bExp, expDiff;
4971     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4972     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4973     sbits64 sigMean0;
4974     float128 z;
4975 
4976     aSig1 = extractFloat128Frac1( a );
4977     aSig0 = extractFloat128Frac0( a );
4978     aExp = extractFloat128Exp( a );
4979     aSign = extractFloat128Sign( a );
4980     bSig1 = extractFloat128Frac1( b );
4981     bSig0 = extractFloat128Frac0( b );
4982     bExp = extractFloat128Exp( b );
4983     bSign = extractFloat128Sign( b );
4984     if ( aExp == 0x7FFF ) {
4985         if (    ( aSig0 | aSig1 )
4986              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4987             return propagateFloat128NaN( a, b );
4988         }
4989         goto invalid;
4990     }
4991     if ( bExp == 0x7FFF ) {
4992         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4993         return a;
4994     }
4995     if ( bExp == 0 ) {
4996         if ( ( bSig0 | bSig1 ) == 0 ) {
4997  invalid:
4998             float_raise( float_flag_invalid );
4999             z.low = float128_default_nan_low;
5000             z.high = float128_default_nan_high;
5001             return z;
5002         }
5003         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5004     }
5005     if ( aExp == 0 ) {
5006         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5007         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5008     }
5009     expDiff = aExp - bExp;
5010     if ( expDiff < -1 ) return a;
5011     shortShift128Left(
5012         aSig0 | LIT64( 0x0001000000000000 ),
5013         aSig1,
5014         15 - ( expDiff < 0 ),
5015         &aSig0,
5016         &aSig1
5017     );
5018     shortShift128Left(
5019         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5020     q = le128( bSig0, bSig1, aSig0, aSig1 );
5021     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5022     expDiff -= 64;
5023     while ( 0 < expDiff ) {
5024         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5025         q = ( 4 < q ) ? q - 4 : 0;
5026         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5027         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5028         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5029         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5030         expDiff -= 61;
5031     }
5032     if ( -64 < expDiff ) {
5033         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5034         q = ( 4 < q ) ? q - 4 : 0;
5035         q >>= - expDiff;
5036         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5037         expDiff += 52;
5038         if ( expDiff < 0 ) {
5039             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5040         }
5041         else {
5042             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5043         }
5044         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5045         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5046     }
5047     else {
5048         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5049         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5050     }
5051     do {
5052         alternateASig0 = aSig0;
5053         alternateASig1 = aSig1;
5054         ++q;
5055         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5056     } while ( 0 <= (sbits64) aSig0 );
5057     add128(
5058         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5059     if (    ( sigMean0 < 0 )
5060          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5061         aSig0 = alternateASig0;
5062         aSig1 = alternateASig1;
5063     }
5064     zSign = ( (sbits64) aSig0 < 0 );
5065     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5066     return
5067         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5068 
5069 }
5070 
5071 /*----------------------------------------------------------------------------
5072 | Returns the square root of the quadruple-precision floating-point value `a'.
5073 | The operation is performed according to the IEC/IEEE Standard for Binary
5074 | Floating-Point Arithmetic.
5075 *----------------------------------------------------------------------------*/
5076 
5077 float128 float128_sqrt( float128 a )
5078 {
5079     flag aSign;
5080     int32 aExp, zExp;
5081     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5082     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5083     float128 z;
5084 
5085     aSig1 = extractFloat128Frac1( a );
5086     aSig0 = extractFloat128Frac0( a );
5087     aExp = extractFloat128Exp( a );
5088     aSign = extractFloat128Sign( a );
5089     if ( aExp == 0x7FFF ) {
5090         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5091         if ( ! aSign ) return a;
5092         goto invalid;
5093     }
5094     if ( aSign ) {
5095         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5096  invalid:
5097         float_raise( float_flag_invalid );
5098         z.low = float128_default_nan_low;
5099         z.high = float128_default_nan_high;
5100         return z;
5101     }
5102     if ( aExp == 0 ) {
5103         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5104         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5105     }
5106     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5107     aSig0 |= LIT64( 0x0001000000000000 );
5108     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5109     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5110     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5111     doubleZSig0 = zSig0<<1;
5112     mul64To128( zSig0, zSig0, &term0, &term1 );
5113     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5114     while ( (sbits64) rem0 < 0 ) {
5115         --zSig0;
5116         doubleZSig0 -= 2;
5117         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5118     }
5119     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5120     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5121         if ( zSig1 == 0 ) zSig1 = 1;
5122         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5123         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5124         mul64To128( zSig1, zSig1, &term2, &term3 );
5125         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5126         while ( (sbits64) rem1 < 0 ) {
5127             --zSig1;
5128             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5129             term3 |= 1;
5130             term2 |= doubleZSig0;
5131             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5132         }
5133         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5134     }
5135     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5136     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5137 
5138 }
5139 
5140 /*----------------------------------------------------------------------------
5141 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5142 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5143 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5144 *----------------------------------------------------------------------------*/
5145 
5146 flag float128_eq( float128 a, float128 b )
5147 {
5148 
5149     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5150               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5151          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5152               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5153        ) {
5154         if (    float128_is_signaling_nan( a )
5155              || float128_is_signaling_nan( b ) ) {
5156             float_raise( float_flag_invalid );
5157         }
5158         return 0;
5159     }
5160     return
5161            ( a.low == b.low )
5162         && (    ( a.high == b.high )
5163              || (    ( a.low == 0 )
5164                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5165            );
5166 
5167 }
5168 
5169 /*----------------------------------------------------------------------------
5170 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5171 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5172 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5173 | Arithmetic.
5174 *----------------------------------------------------------------------------*/
5175 
5176 flag float128_le( float128 a, float128 b )
5177 {
5178     flag aSign, bSign;
5179 
5180     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5181               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5182          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5183               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5184        ) {
5185         float_raise( float_flag_invalid );
5186         return 0;
5187     }
5188     aSign = extractFloat128Sign( a );
5189     bSign = extractFloat128Sign( b );
5190     if ( aSign != bSign ) {
5191         return
5192                aSign
5193             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5194                  == 0 );
5195     }
5196     return
5197           aSign ? le128( b.high, b.low, a.high, a.low )
5198         : le128( a.high, a.low, b.high, b.low );
5199 
5200 }
5201 
5202 /*----------------------------------------------------------------------------
5203 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5204 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5206 *----------------------------------------------------------------------------*/
5207 
5208 flag float128_lt( float128 a, float128 b )
5209 {
5210     flag aSign, bSign;
5211 
5212     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5213               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5214          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5215               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5216        ) {
5217         float_raise( float_flag_invalid );
5218         return 0;
5219     }
5220     aSign = extractFloat128Sign( a );
5221     bSign = extractFloat128Sign( b );
5222     if ( aSign != bSign ) {
5223         return
5224                aSign
5225             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5226                  != 0 );
5227     }
5228     return
5229           aSign ? lt128( b.high, b.low, a.high, a.low )
5230         : lt128( a.high, a.low, b.high, b.low );
5231 
5232 }
5233 
5234 /*----------------------------------------------------------------------------
5235 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5236 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5237 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5238 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5239 *----------------------------------------------------------------------------*/
5240 
5241 flag float128_eq_signaling( float128 a, float128 b )
5242 {
5243 
5244     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5245               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5246          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5247               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5248        ) {
5249         float_raise( float_flag_invalid );
5250         return 0;
5251     }
5252     return
5253            ( a.low == b.low )
5254         && (    ( a.high == b.high )
5255              || (    ( a.low == 0 )
5256                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5257            );
5258 
5259 }
5260 
5261 /*----------------------------------------------------------------------------
5262 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5263 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5264 | cause an exception.  Otherwise, the comparison is performed according to the
5265 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5266 *----------------------------------------------------------------------------*/
5267 
5268 flag float128_le_quiet( float128 a, float128 b )
5269 {
5270     flag aSign, bSign;
5271 
5272     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5273               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5274          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5275               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5276        ) {
5277         if (    float128_is_signaling_nan( a )
5278              || float128_is_signaling_nan( b ) ) {
5279             float_raise( float_flag_invalid );
5280         }
5281         return 0;
5282     }
5283     aSign = extractFloat128Sign( a );
5284     bSign = extractFloat128Sign( b );
5285     if ( aSign != bSign ) {
5286         return
5287                aSign
5288             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5289                  == 0 );
5290     }
5291     return
5292           aSign ? le128( b.high, b.low, a.high, a.low )
5293         : le128( a.high, a.low, b.high, b.low );
5294 
5295 }
5296 
5297 /*----------------------------------------------------------------------------
5298 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5299 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5300 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5301 | Standard for Binary Floating-Point Arithmetic.
5302 *----------------------------------------------------------------------------*/
5303 
5304 flag float128_lt_quiet( float128 a, float128 b )
5305 {
5306     flag aSign, bSign;
5307 
5308     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5309               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5310          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5311               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5312        ) {
5313         if (    float128_is_signaling_nan( a )
5314              || float128_is_signaling_nan( b ) ) {
5315             float_raise( float_flag_invalid );
5316         }
5317         return 0;
5318     }
5319     aSign = extractFloat128Sign( a );
5320     bSign = extractFloat128Sign( b );
5321     if ( aSign != bSign ) {
5322         return
5323                aSign
5324             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5325                  != 0 );
5326     }
5327     return
5328           aSign ? lt128( b.high, b.low, a.high, a.low )
5329         : lt128( a.high, a.low, b.high, b.low );
5330 
5331 }
5332 
5333 #endif
5334 
5335 
5336 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5337 
5338 /*
5339  * These two routines are not part of the original softfloat distribution.
5340  *
5341  * They are based on the corresponding conversions to integer but return
5342  * unsigned numbers instead since these functions are required by GCC.
5343  *
5344  * Added by Mark Brinicombe <mark@NetBSD.org>	27/09/97
5345  *
5346  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5347  */
5348 
5349 /*----------------------------------------------------------------------------
5350 | Returns the result of converting the double-precision floating-point value
5351 | `a' to the 32-bit unsigned integer format.  The conversion is
5352 | performed according to the IEC/IEEE Standard for Binary Floating-point
5353 | Arithmetic, except that the conversion is always rounded toward zero.  If
5354 | `a' is a NaN, the largest positive integer is returned.  If the conversion
5355 | overflows, the largest integer positive is returned.
5356 *----------------------------------------------------------------------------*/
5357 
5358 uint32 float64_to_uint32_round_to_zero( float64 a )
5359 {
5360     flag aSign;
5361     int16 aExp, shiftCount;
5362     bits64 aSig, savedASig;
5363     uint32 z;
5364 
5365     aSig = extractFloat64Frac( a );
5366     aExp = extractFloat64Exp( a );
5367     aSign = extractFloat64Sign( a );
5368 
5369     if (aSign) {
5370         float_raise( float_flag_invalid );
5371     	return(0);
5372     }
5373 
5374     if ( 0x41E < aExp ) {
5375         float_raise( float_flag_invalid );
5376         return 0xffffffff;
5377     }
5378     else if ( aExp < 0x3FF ) {
5379         if ( aExp || aSig ) float_set_inexact();
5380         return 0;
5381     }
5382     aSig |= LIT64( 0x0010000000000000 );
5383     shiftCount = 0x433 - aExp;
5384     savedASig = aSig;
5385     aSig >>= shiftCount;
5386     z = aSig;
5387     if ( ( aSig<<shiftCount ) != savedASig ) {
5388         float_set_inexact();
5389     }
5390     return z;
5391 
5392 }
5393 
5394 /*----------------------------------------------------------------------------
5395 | Returns the result of converting the single-precision floating-point value
5396 | `a' to the 32-bit unsigned integer format.  The conversion is
5397 | performed according to the IEC/IEEE Standard for Binary Floating-point
5398 | Arithmetic, except that the conversion is always rounded toward zero.  If
5399 | `a' is a NaN, the largest positive integer is returned.  If the conversion
5400 | overflows, the largest positive integer is returned.
5401 *----------------------------------------------------------------------------*/
5402 
5403 uint32 float32_to_uint32_round_to_zero( float32 a )
5404 {
5405     flag aSign;
5406     int16 aExp, shiftCount;
5407     bits32 aSig;
5408     uint32 z;
5409 
5410     aSig = extractFloat32Frac( a );
5411     aExp = extractFloat32Exp( a );
5412     aSign = extractFloat32Sign( a );
5413     shiftCount = aExp - 0x9E;
5414 
5415     if (aSign) {
5416         float_raise( float_flag_invalid );
5417     	return(0);
5418     }
5419     if ( 0 < shiftCount ) {
5420         float_raise( float_flag_invalid );
5421         return 0xFFFFFFFF;
5422     }
5423     else if ( aExp <= 0x7E ) {
5424         if ( aExp | aSig ) float_set_inexact();
5425         return 0;
5426     }
5427     aSig = ( aSig | 0x800000 )<<8;
5428     z = aSig>>( - shiftCount );
5429     if ( aSig<<( shiftCount & 31 ) ) {
5430         float_set_inexact();
5431     }
5432     return z;
5433 
5434 }
5435 
5436 #endif
5437 
5438 #endif /* _STANDALONE */
5439