xref: /netbsd-src/sys/lib/libkern/softfloat.c (revision 0819cc0b79199ddad8dcc0602b08485c4bd19d7d)
1 /* $NetBSD: softfloat.c,v 1.8 2020/09/02 04:06:43 thorpej Exp $ */
2 
3 /*
4  * This version hacked for use with gcc -msoft-float by bjh21.
5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6  *  itself).
7  */
8 
9 /*
10  * Things you may want to define:
11  *
12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
14  *   properly renamed.
15  */
16 
17 /*============================================================================
18 
19 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
20 Package, Release 2b.
21 
22 Written by John R. Hauser.  This work was made possible in part by the
23 International Computer Science Institute, located at Suite 600, 1947 Center
24 Street, Berkeley, California 94704.  Funding was partially provided by the
25 National Science Foundation under grant MIP-9311980.  The original version
26 of this code was written as part of a project to build a fixed-point vector
27 processor in collaboration with the University of California at Berkeley,
28 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
29 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
30 arithmetic/SoftFloat.html'.
31 
32 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
33 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
34 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
35 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
36 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
37 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
38 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
39 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
40 
41 Derivative works are acceptable, even for commercial purposes, so long as
42 (1) the source code for the derivative work includes prominent notice that
43 the work is derivative, and (2) the source code includes prominent notice with
44 these four paragraphs for those parts of this code that are retained.
45 
46 =============================================================================*/
47 
48 /* If you need this in a boot program, you have bigger problems... */
49 #ifndef _STANDALONE
50 
51 #include <sys/cdefs.h>
52 #if defined(LIBC_SCCS) && !defined(lint)
53 __RCSID("$NetBSD: softfloat.c,v 1.8 2020/09/02 04:06:43 thorpej Exp $");
54 #endif /* LIBC_SCCS and not lint */
55 
56 #ifdef SOFTFLOAT_FOR_GCC
57 #include "softfloat-for-gcc.h"
58 #endif
59 
60 #include "milieu.h"
61 #include "softfloat.h"
62 
63 /*----------------------------------------------------------------------------
64 | Floating-point rounding mode, extended double-precision rounding precision,
65 | and exception flags.
66 *----------------------------------------------------------------------------*/
67 /*
68  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
69  * 	Right now, it does not.  I've removed all other dynamic global
70  * 	variables. [ross]
71  */
72 #ifdef FLOATX80
73 int8 floatx80_rounding_precision = 80;
74 #endif
75 
76 /*----------------------------------------------------------------------------
77 | Primitive arithmetic functions, including multi-word arithmetic, and
78 | division and square root approximations.  (Can be specialized to target if
79 | desired.)
80 *----------------------------------------------------------------------------*/
81 #include "softfloat-macros.h"
82 
83 /*----------------------------------------------------------------------------
84 | Functions and definitions to determine:  (1) whether tininess for underflow
85 | is detected before or after rounding by default, (2) what (if anything)
86 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
87 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
88 | are propagated from function inputs to output.  These details are target-
89 | specific.
90 *----------------------------------------------------------------------------*/
91 #include "softfloat-specialize.h"
92 
93 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
94 /*----------------------------------------------------------------------------
95 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
96 | and 7, and returns the properly rounded 32-bit integer corresponding to the
97 | input.  If `zSign' is 1, the input is negated before being converted to an
98 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
99 | is simply rounded to an integer, with the inexact exception raised if the
100 | input cannot be represented exactly as an integer.  However, if the fixed-
101 | point input is too large, the invalid exception is raised and the largest
102 | positive or negative integer is returned.
103 *----------------------------------------------------------------------------*/
104 
roundAndPackInt32(flag zSign,bits64 absZ)105 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
106 {
107     int8 roundingMode;
108     flag roundNearestEven;
109     int8 roundIncrement, roundBits;
110     int32 z;
111 
112     roundingMode = float_rounding_mode();
113     roundNearestEven = ( roundingMode == float_round_nearest_even );
114     roundIncrement = 0x40;
115     if ( ! roundNearestEven ) {
116         if ( roundingMode == float_round_to_zero ) {
117             roundIncrement = 0;
118         }
119         else {
120             roundIncrement = 0x7F;
121             if ( zSign ) {
122                 if ( roundingMode == float_round_up ) roundIncrement = 0;
123             }
124             else {
125                 if ( roundingMode == float_round_down ) roundIncrement = 0;
126             }
127         }
128     }
129     roundBits = absZ & 0x7F;
130     absZ = ( absZ + roundIncrement )>>7;
131     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
132     z = absZ;
133     if ( zSign ) z = - z;
134     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
135         float_raise( float_flag_invalid );
136         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
137     }
138     if ( roundBits ) float_set_inexact();
139     return z;
140 
141 }
142 
143 /*----------------------------------------------------------------------------
144 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
145 | `absZ1', with binary point between bits 63 and 64 (between the input words),
146 | and returns the properly rounded 64-bit integer corresponding to the input.
147 | If `zSign' is 1, the input is negated before being converted to an integer.
148 | Ordinarily, the fixed-point input is simply rounded to an integer, with
149 | the inexact exception raised if the input cannot be represented exactly as
150 | an integer.  However, if the fixed-point input is too large, the invalid
151 | exception is raised and the largest positive or negative integer is
152 | returned.
153 *----------------------------------------------------------------------------*/
154 
roundAndPackInt64(flag zSign,bits64 absZ0,bits64 absZ1)155 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
156 {
157     int8 roundingMode;
158     flag roundNearestEven, increment;
159     int64 z;
160 
161     roundingMode = float_rounding_mode();
162     roundNearestEven = ( roundingMode == float_round_nearest_even );
163     increment = ( (sbits64) absZ1 < 0 );
164     if ( ! roundNearestEven ) {
165         if ( roundingMode == float_round_to_zero ) {
166             increment = 0;
167         }
168         else {
169             if ( zSign ) {
170                 increment = ( roundingMode == float_round_down ) && absZ1;
171             }
172             else {
173                 increment = ( roundingMode == float_round_up ) && absZ1;
174             }
175         }
176     }
177     if ( increment ) {
178         ++absZ0;
179         if ( absZ0 == 0 ) goto overflow;
180         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
181     }
182     z = absZ0;
183     if ( zSign ) z = - z;
184     if ( z && ( ( z < 0 ) ^ zSign ) ) {
185  overflow:
186         float_raise( float_flag_invalid );
187         return
188               zSign ? (sbits64) LIT64( 0x8000000000000000 )
189             : LIT64( 0x7FFFFFFFFFFFFFFF );
190     }
191     if ( absZ1 ) float_set_inexact();
192     return z;
193 
194 }
195 
196 /* same as above, but for unsigned values */
roundAndPackUInt64(bits64 absZ0,bits64 absZ1)197 static uint64 roundAndPackUInt64( bits64 absZ0, bits64 absZ1 )
198 {
199     int8 roundingMode;
200     flag roundNearestEven, increment;
201     uint64 z;
202 
203     roundingMode = float_rounding_mode();
204     roundNearestEven = ( roundingMode == float_round_nearest_even );
205     increment = ( (sbits64) absZ1 < 0 );
206     if ( ! roundNearestEven ) {
207         if ( roundingMode == float_round_to_zero ) {
208             increment = 0;
209         }
210         else {
211             increment = ( roundingMode == float_round_up ) && absZ1;
212         }
213     }
214     if ( increment ) {
215         ++absZ0;
216         if ( absZ0 == 0 ) {
217             float_raise( float_flag_invalid );
218             return LIT64( 0x7FFFFFFFFFFFFFFF );
219 	}
220         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
221     }
222     z = absZ0;
223     if ( absZ1 ) float_set_inexact();
224     return z;
225 
226 }
227 #endif /* SOFTFLOAT_FOR_GCC */
228 
229 /*----------------------------------------------------------------------------
230 | Returns the fraction bits of the single-precision floating-point value `a'.
231 *----------------------------------------------------------------------------*/
232 
extractFloat32Frac(float32 a)233 INLINE bits32 extractFloat32Frac( float32 a )
234 {
235 
236     return a & 0x007FFFFF;
237 
238 }
239 
240 /*----------------------------------------------------------------------------
241 | Returns the exponent bits of the single-precision floating-point value `a'.
242 *----------------------------------------------------------------------------*/
243 
extractFloat32Exp(float32 a)244 INLINE int16 extractFloat32Exp( float32 a )
245 {
246 
247     return ( a>>23 ) & 0xFF;
248 
249 }
250 
251 /*----------------------------------------------------------------------------
252 | Returns the sign bit of the single-precision floating-point value `a'.
253 *----------------------------------------------------------------------------*/
254 
extractFloat32Sign(float32 a)255 INLINE flag extractFloat32Sign( float32 a )
256 {
257 
258     return a>>31;
259 
260 }
261 
262 /*----------------------------------------------------------------------------
263 | Normalizes the subnormal single-precision floating-point value represented
264 | by the denormalized significand `aSig'.  The normalized exponent and
265 | significand are stored at the locations pointed to by `zExpPtr' and
266 | `zSigPtr', respectively.
267 *----------------------------------------------------------------------------*/
268 
269 static void
normalizeFloat32Subnormal(bits32 aSig,int16 * zExpPtr,bits32 * zSigPtr)270  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
271 {
272     int8 shiftCount;
273 
274     shiftCount = countLeadingZeros32( aSig ) - 8;
275     *zSigPtr = aSig<<shiftCount;
276     *zExpPtr = 1 - shiftCount;
277 
278 }
279 
280 /*----------------------------------------------------------------------------
281 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
282 | single-precision floating-point value, returning the result.  After being
283 | shifted into the proper positions, the three fields are simply added
284 | together to form the result.  This means that any integer portion of `zSig'
285 | will be added into the exponent.  Since a properly normalized significand
286 | will have an integer portion equal to 1, the `zExp' input should be 1 less
287 | than the desired result exponent whenever `zSig' is a complete, normalized
288 | significand.
289 *----------------------------------------------------------------------------*/
290 
packFloat32(flag zSign,int16 zExp,bits32 zSig)291 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
292 {
293 
294     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
295 
296 }
297 
298 /*----------------------------------------------------------------------------
299 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
300 | and significand `zSig', and returns the proper single-precision floating-
301 | point value corresponding to the abstract input.  Ordinarily, the abstract
302 | value is simply rounded and packed into the single-precision format, with
303 | the inexact exception raised if the abstract input cannot be represented
304 | exactly.  However, if the abstract value is too large, the overflow and
305 | inexact exceptions are raised and an infinity or maximal finite value is
306 | returned.  If the abstract value is too small, the input value is rounded to
307 | a subnormal number, and the underflow and inexact exceptions are raised if
308 | the abstract input cannot be represented exactly as a subnormal single-
309 | precision floating-point number.
310 |     The input significand `zSig' has its binary point between bits 30
311 | and 29, which is 7 bits to the left of the usual location.  This shifted
312 | significand must be normalized or smaller.  If `zSig' is not normalized,
313 | `zExp' must be 0; in that case, the result returned is a subnormal number,
314 | and it must not require rounding.  In the usual case that `zSig' is
315 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
316 | The handling of underflow and overflow follows the IEC/IEEE Standard for
317 | Binary Floating-Point Arithmetic.
318 *----------------------------------------------------------------------------*/
319 
roundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)320 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
321 {
322     int8 roundingMode;
323     flag roundNearestEven;
324     int8 roundIncrement, roundBits;
325     flag isTiny;
326 
327     roundingMode = float_rounding_mode();
328     roundNearestEven = ( roundingMode == float_round_nearest_even );
329     roundIncrement = 0x40;
330     if ( ! roundNearestEven ) {
331         if ( roundingMode == float_round_to_zero ) {
332             roundIncrement = 0;
333         }
334         else {
335             roundIncrement = 0x7F;
336             if ( zSign ) {
337                 if ( roundingMode == float_round_up ) roundIncrement = 0;
338             }
339             else {
340                 if ( roundingMode == float_round_down ) roundIncrement = 0;
341             }
342         }
343     }
344     roundBits = zSig & 0x7F;
345     if ( 0xFD <= (bits16) zExp ) {
346         if (    ( 0xFD < zExp )
347              || (    ( zExp == 0xFD )
348                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
349            ) {
350             float_raise( float_flag_overflow | float_flag_inexact );
351             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
352         }
353         if ( zExp < 0 ) {
354             isTiny =
355                    ( float_detect_tininess == float_tininess_before_rounding )
356                 || ( zExp < -1 )
357                 || ( zSig + roundIncrement < 0x80000000 );
358             shift32RightJamming( zSig, - zExp, &zSig );
359             zExp = 0;
360             roundBits = zSig & 0x7F;
361             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
362         }
363     }
364     if ( roundBits ) float_set_inexact();
365     zSig = ( zSig + roundIncrement )>>7;
366     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
367     if ( zSig == 0 ) zExp = 0;
368     return packFloat32( zSign, zExp, zSig );
369 
370 }
371 
372 /*----------------------------------------------------------------------------
373 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
374 | and significand `zSig', and returns the proper single-precision floating-
375 | point value corresponding to the abstract input.  This routine is just like
376 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
377 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
378 | floating-point exponent.
379 *----------------------------------------------------------------------------*/
380 
381 static float32
normalizeRoundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)382  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
383 {
384     int8 shiftCount;
385 
386     shiftCount = countLeadingZeros32( zSig ) - 1;
387     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
388 
389 }
390 
391 /*----------------------------------------------------------------------------
392 | Returns the fraction bits of the double-precision floating-point value `a'.
393 *----------------------------------------------------------------------------*/
394 
extractFloat64Frac(float64 a)395 INLINE bits64 extractFloat64Frac( float64 a )
396 {
397 
398     return a & LIT64( 0x000FFFFFFFFFFFFF );
399 
400 }
401 
402 /*----------------------------------------------------------------------------
403 | Returns the exponent bits of the double-precision floating-point value `a'.
404 *----------------------------------------------------------------------------*/
405 
extractFloat64Exp(float64 a)406 INLINE int16 extractFloat64Exp( float64 a )
407 {
408 
409     return ( a>>52 ) & 0x7FF;
410 
411 }
412 /*----------------------------------------------------------------------------
413 | Returns the sign bit of the double-precision floating-point value `a'.
414 *----------------------------------------------------------------------------*/
415 
extractFloat64Sign(float64 a)416 INLINE flag extractFloat64Sign( float64 a )
417 {
418 
419     return a>>63;
420 
421 }
422 
423 /*----------------------------------------------------------------------------
424 | Normalizes the subnormal double-precision floating-point value represented
425 | by the denormalized significand `aSig'.  The normalized exponent and
426 | significand are stored at the locations pointed to by `zExpPtr' and
427 | `zSigPtr', respectively.
428 *----------------------------------------------------------------------------*/
429 
430 static void
normalizeFloat64Subnormal(bits64 aSig,int16 * zExpPtr,bits64 * zSigPtr)431  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
432 {
433     int8 shiftCount;
434 
435     shiftCount = countLeadingZeros64( aSig ) - 11;
436     *zSigPtr = aSig<<shiftCount;
437     *zExpPtr = 1 - shiftCount;
438 
439 }
440 
441 /*----------------------------------------------------------------------------
442 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
443 | double-precision floating-point value, returning the result.  After being
444 | shifted into the proper positions, the three fields are simply added
445 | together to form the result.  This means that any integer portion of `zSig'
446 | will be added into the exponent.  Since a properly normalized significand
447 | will have an integer portion equal to 1, the `zExp' input should be 1 less
448 | than the desired result exponent whenever `zSig' is a complete, normalized
449 | significand.
450 *----------------------------------------------------------------------------*/
451 
packFloat64(flag zSign,int16 zExp,bits64 zSig)452 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
453 {
454 
455     return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
456 
457 }
458 
459 /*----------------------------------------------------------------------------
460 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
461 | and significand `zSig', and returns the proper double-precision floating-
462 | point value corresponding to the abstract input.  Ordinarily, the abstract
463 | value is simply rounded and packed into the double-precision format, with
464 | the inexact exception raised if the abstract input cannot be represented
465 | exactly.  However, if the abstract value is too large, the overflow and
466 | inexact exceptions are raised and an infinity or maximal finite value is
467 | returned.  If the abstract value is too small, the input value is rounded
468 | to a subnormal number, and the underflow and inexact exceptions are raised
469 | if the abstract input cannot be represented exactly as a subnormal double-
470 | precision floating-point number.
471 |     The input significand `zSig' has its binary point between bits 62
472 | and 61, which is 10 bits to the left of the usual location.  This shifted
473 | significand must be normalized or smaller.  If `zSig' is not normalized,
474 | `zExp' must be 0; in that case, the result returned is a subnormal number,
475 | and it must not require rounding.  In the usual case that `zSig' is
476 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
477 | The handling of underflow and overflow follows the IEC/IEEE Standard for
478 | Binary Floating-Point Arithmetic.
479 *----------------------------------------------------------------------------*/
480 
roundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)481 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
482 {
483     int8 roundingMode;
484     flag roundNearestEven;
485     int16 roundIncrement, roundBits;
486     flag isTiny;
487 
488     roundingMode = float_rounding_mode();
489     roundNearestEven = ( roundingMode == float_round_nearest_even );
490     roundIncrement = 0x200;
491     if ( ! roundNearestEven ) {
492         if ( roundingMode == float_round_to_zero ) {
493             roundIncrement = 0;
494         }
495         else {
496             roundIncrement = 0x3FF;
497             if ( zSign ) {
498                 if ( roundingMode == float_round_up ) roundIncrement = 0;
499             }
500             else {
501                 if ( roundingMode == float_round_down ) roundIncrement = 0;
502             }
503         }
504     }
505     roundBits = zSig & 0x3FF;
506     if ( 0x7FD <= (bits16) zExp ) {
507         if (    ( 0x7FD < zExp )
508              || (    ( zExp == 0x7FD )
509                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
510            ) {
511             float_raise( float_flag_overflow | float_flag_inexact );
512             return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
513         }
514         if ( zExp < 0 ) {
515             isTiny =
516                    ( float_detect_tininess == float_tininess_before_rounding )
517                 || ( zExp < -1 )
518                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
519             shift64RightJamming( zSig, - zExp, &zSig );
520             zExp = 0;
521             roundBits = zSig & 0x3FF;
522             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
523         }
524     }
525     if ( roundBits ) float_set_inexact();
526     zSig = ( zSig + roundIncrement )>>10;
527     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
528     if ( zSig == 0 ) zExp = 0;
529     return packFloat64( zSign, zExp, zSig );
530 
531 }
532 
533 /*----------------------------------------------------------------------------
534 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
535 | and significand `zSig', and returns the proper double-precision floating-
536 | point value corresponding to the abstract input.  This routine is just like
537 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
538 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
539 | floating-point exponent.
540 *----------------------------------------------------------------------------*/
541 
542 static float64
normalizeRoundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)543  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
544 {
545     int8 shiftCount;
546 
547     shiftCount = countLeadingZeros64( zSig ) - 1;
548     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
549 
550 }
551 
552 #ifdef FLOATX80
553 
554 /*----------------------------------------------------------------------------
555 | Returns the fraction bits of the extended double-precision floating-point
556 | value `a'.
557 *----------------------------------------------------------------------------*/
558 
extractFloatx80Frac(floatx80 a)559 INLINE bits64 extractFloatx80Frac( floatx80 a )
560 {
561 
562     return a.low;
563 
564 }
565 
566 /*----------------------------------------------------------------------------
567 | Returns the exponent bits of the extended double-precision floating-point
568 | value `a'.
569 *----------------------------------------------------------------------------*/
570 
extractFloatx80Exp(floatx80 a)571 INLINE int32 extractFloatx80Exp( floatx80 a )
572 {
573 
574     return a.high & 0x7FFF;
575 
576 }
577 
578 /*----------------------------------------------------------------------------
579 | Returns the sign bit of the extended double-precision floating-point value
580 | `a'.
581 *----------------------------------------------------------------------------*/
582 
extractFloatx80Sign(floatx80 a)583 INLINE flag extractFloatx80Sign( floatx80 a )
584 {
585 
586     return a.high>>15;
587 
588 }
589 
590 /*----------------------------------------------------------------------------
591 | Normalizes the subnormal extended double-precision floating-point value
592 | represented by the denormalized significand `aSig'.  The normalized exponent
593 | and significand are stored at the locations pointed to by `zExpPtr' and
594 | `zSigPtr', respectively.
595 *----------------------------------------------------------------------------*/
596 
597 static void
normalizeFloatx80Subnormal(bits64 aSig,int32 * zExpPtr,bits64 * zSigPtr)598  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
599 {
600     int8 shiftCount;
601 
602     shiftCount = countLeadingZeros64( aSig );
603     *zSigPtr = aSig<<shiftCount;
604     *zExpPtr = 1 - shiftCount;
605 
606 }
607 
608 /*----------------------------------------------------------------------------
609 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
610 | extended double-precision floating-point value, returning the result.
611 *----------------------------------------------------------------------------*/
612 
packFloatx80(flag zSign,int32 zExp,bits64 zSig)613 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
614 {
615     floatx80 z;
616 
617     z.low = zSig;
618     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
619     return z;
620 
621 }
622 
623 /*----------------------------------------------------------------------------
624 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
625 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
626 | and returns the proper extended double-precision floating-point value
627 | corresponding to the abstract input.  Ordinarily, the abstract value is
628 | rounded and packed into the extended double-precision format, with the
629 | inexact exception raised if the abstract input cannot be represented
630 | exactly.  However, if the abstract value is too large, the overflow and
631 | inexact exceptions are raised and an infinity or maximal finite value is
632 | returned.  If the abstract value is too small, the input value is rounded to
633 | a subnormal number, and the underflow and inexact exceptions are raised if
634 | the abstract input cannot be represented exactly as a subnormal extended
635 | double-precision floating-point number.
636 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
637 | number of bits as single or double precision, respectively.  Otherwise, the
638 | result is rounded to the full precision of the extended double-precision
639 | format.
640 |     The input significand must be normalized or smaller.  If the input
641 | significand is not normalized, `zExp' must be 0; in that case, the result
642 | returned is a subnormal number, and it must not require rounding.  The
643 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
644 | Floating-Point Arithmetic.
645 *----------------------------------------------------------------------------*/
646 
647 static floatx80
roundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)648  roundAndPackFloatx80(
649      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
650  )
651 {
652     int8 roundingMode;
653     flag roundNearestEven, increment, isTiny;
654     int64 roundIncrement, roundMask, roundBits;
655 
656     roundingMode = float_rounding_mode();
657     roundNearestEven = ( roundingMode == float_round_nearest_even );
658     if ( roundingPrecision == 80 ) goto precision80;
659     if ( roundingPrecision == 64 ) {
660         roundIncrement = LIT64( 0x0000000000000400 );
661         roundMask = LIT64( 0x00000000000007FF );
662     }
663     else if ( roundingPrecision == 32 ) {
664         roundIncrement = LIT64( 0x0000008000000000 );
665         roundMask = LIT64( 0x000000FFFFFFFFFF );
666     }
667     else {
668         goto precision80;
669     }
670     zSig0 |= ( zSig1 != 0 );
671     if ( ! roundNearestEven ) {
672         if ( roundingMode == float_round_to_zero ) {
673             roundIncrement = 0;
674         }
675         else {
676             roundIncrement = roundMask;
677             if ( zSign ) {
678                 if ( roundingMode == float_round_up ) roundIncrement = 0;
679             }
680             else {
681                 if ( roundingMode == float_round_down ) roundIncrement = 0;
682             }
683         }
684     }
685     roundBits = zSig0 & roundMask;
686     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
687         if (    ( 0x7FFE < zExp )
688              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
689            ) {
690             goto overflow;
691         }
692         if ( zExp <= 0 ) {
693             isTiny =
694                    ( float_detect_tininess == float_tininess_before_rounding )
695                 || ( zExp < 0 )
696                 || ( zSig0 <= zSig0 + roundIncrement );
697             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
698             zExp = 0;
699             roundBits = zSig0 & roundMask;
700             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
701             if ( roundBits ) float_set_inexact();
702             zSig0 += roundIncrement;
703             if ( (sbits64) zSig0 < 0 ) zExp = 1;
704             roundIncrement = roundMask + 1;
705             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
706                 roundMask |= roundIncrement;
707             }
708             zSig0 &= ~ roundMask;
709             return packFloatx80( zSign, zExp, zSig0 );
710         }
711     }
712     if ( roundBits ) float_set_inexact();
713     zSig0 += roundIncrement;
714     if ( zSig0 < roundIncrement ) {
715         ++zExp;
716         zSig0 = LIT64( 0x8000000000000000 );
717     }
718     roundIncrement = roundMask + 1;
719     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
720         roundMask |= roundIncrement;
721     }
722     zSig0 &= ~ roundMask;
723     if ( zSig0 == 0 ) zExp = 0;
724     return packFloatx80( zSign, zExp, zSig0 );
725  precision80:
726     increment = ( (sbits64) zSig1 < 0 );
727     if ( ! roundNearestEven ) {
728         if ( roundingMode == float_round_to_zero ) {
729             increment = 0;
730         }
731         else {
732             if ( zSign ) {
733                 increment = ( roundingMode == float_round_down ) && zSig1;
734             }
735             else {
736                 increment = ( roundingMode == float_round_up ) && zSig1;
737             }
738         }
739     }
740     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
741         if (    ( 0x7FFE < zExp )
742              || (    ( zExp == 0x7FFE )
743                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
744                   && increment
745                 )
746            ) {
747             roundMask = 0;
748  overflow:
749             float_raise( float_flag_overflow | float_flag_inexact );
750             if (    ( roundingMode == float_round_to_zero )
751                  || ( zSign && ( roundingMode == float_round_up ) )
752                  || ( ! zSign && ( roundingMode == float_round_down ) )
753                ) {
754                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
755             }
756             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
757         }
758         if ( zExp <= 0 ) {
759             isTiny =
760                    ( float_detect_tininess == float_tininess_before_rounding )
761                 || ( zExp < 0 )
762                 || ! increment
763                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
764             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
765             zExp = 0;
766             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
767             if ( zSig1 ) float_set_inexact();
768             if ( roundNearestEven ) {
769                 increment = ( (sbits64) zSig1 < 0 );
770             }
771             else {
772                 if ( zSign ) {
773                     increment = ( roundingMode == float_round_down ) && zSig1;
774                 }
775                 else {
776                     increment = ( roundingMode == float_round_up ) && zSig1;
777                 }
778             }
779             if ( increment ) {
780                 ++zSig0;
781                 zSig0 &=
782                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
783                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
784             }
785             return packFloatx80( zSign, zExp, zSig0 );
786         }
787     }
788     if ( zSig1 ) float_set_inexact();
789     if ( increment ) {
790         ++zSig0;
791         if ( zSig0 == 0 ) {
792             ++zExp;
793             zSig0 = LIT64( 0x8000000000000000 );
794         }
795         else {
796             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
797         }
798     }
799     else {
800         if ( zSig0 == 0 ) zExp = 0;
801     }
802     return packFloatx80( zSign, zExp, zSig0 );
803 
804 }
805 
806 /*----------------------------------------------------------------------------
807 | Takes an abstract floating-point value having sign `zSign', exponent
808 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
809 | and returns the proper extended double-precision floating-point value
810 | corresponding to the abstract input.  This routine is just like
811 | `roundAndPackFloatx80' except that the input significand does not have to be
812 | normalized.
813 *----------------------------------------------------------------------------*/
814 
815 static floatx80
normalizeRoundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)816  normalizeRoundAndPackFloatx80(
817      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
818  )
819 {
820     int8 shiftCount;
821 
822     if ( zSig0 == 0 ) {
823         zSig0 = zSig1;
824         zSig1 = 0;
825         zExp -= 64;
826     }
827     shiftCount = countLeadingZeros64( zSig0 );
828     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
829     zExp -= shiftCount;
830     return
831         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
832 
833 }
834 
835 #endif
836 
837 #ifdef FLOAT128
838 
839 /*----------------------------------------------------------------------------
840 | Returns the least-significant 64 fraction bits of the quadruple-precision
841 | floating-point value `a'.
842 *----------------------------------------------------------------------------*/
843 
extractFloat128Frac1(float128 a)844 INLINE bits64 extractFloat128Frac1( float128 a )
845 {
846 
847     return a.low;
848 
849 }
850 
851 /*----------------------------------------------------------------------------
852 | Returns the most-significant 48 fraction bits of the quadruple-precision
853 | floating-point value `a'.
854 *----------------------------------------------------------------------------*/
855 
extractFloat128Frac0(float128 a)856 INLINE bits64 extractFloat128Frac0( float128 a )
857 {
858 
859     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
860 
861 }
862 
863 /*----------------------------------------------------------------------------
864 | Returns the exponent bits of the quadruple-precision floating-point value
865 | `a'.
866 *----------------------------------------------------------------------------*/
867 
extractFloat128Exp(float128 a)868 INLINE int32 extractFloat128Exp( float128 a )
869 {
870 
871     return ( a.high>>48 ) & 0x7FFF;
872 
873 }
874 
875 /*----------------------------------------------------------------------------
876 | Returns the sign bit of the quadruple-precision floating-point value `a'.
877 *----------------------------------------------------------------------------*/
878 
extractFloat128Sign(float128 a)879 INLINE flag extractFloat128Sign( float128 a )
880 {
881 
882     return a.high>>63;
883 
884 }
885 
886 /*----------------------------------------------------------------------------
887 | Normalizes the subnormal quadruple-precision floating-point value
888 | represented by the denormalized significand formed by the concatenation of
889 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
890 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
891 | significand are stored at the location pointed to by `zSig0Ptr', and the
892 | least significant 64 bits of the normalized significand are stored at the
893 | location pointed to by `zSig1Ptr'.
894 *----------------------------------------------------------------------------*/
895 
896 static void
normalizeFloat128Subnormal(bits64 aSig0,bits64 aSig1,int32 * zExpPtr,bits64 * zSig0Ptr,bits64 * zSig1Ptr)897  normalizeFloat128Subnormal(
898      bits64 aSig0,
899      bits64 aSig1,
900      int32 *zExpPtr,
901      bits64 *zSig0Ptr,
902      bits64 *zSig1Ptr
903  )
904 {
905     int8 shiftCount;
906 
907     if ( aSig0 == 0 ) {
908         shiftCount = countLeadingZeros64( aSig1 ) - 15;
909         if ( shiftCount < 0 ) {
910             *zSig0Ptr = aSig1>>( - shiftCount );
911             *zSig1Ptr = aSig1<<( shiftCount & 63 );
912         }
913         else {
914             *zSig0Ptr = aSig1<<shiftCount;
915             *zSig1Ptr = 0;
916         }
917         *zExpPtr = - shiftCount - 63;
918     }
919     else {
920         shiftCount = countLeadingZeros64( aSig0 ) - 15;
921         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
922         *zExpPtr = 1 - shiftCount;
923     }
924 
925 }
926 
927 /*----------------------------------------------------------------------------
928 | Packs the sign `zSign', the exponent `zExp', and the significand formed
929 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
930 | floating-point value, returning the result.  After being shifted into the
931 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
932 | added together to form the most significant 32 bits of the result.  This
933 | means that any integer portion of `zSig0' will be added into the exponent.
934 | Since a properly normalized significand will have an integer portion equal
935 | to 1, the `zExp' input should be 1 less than the desired result exponent
936 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
937 | significand.
938 *----------------------------------------------------------------------------*/
939 
940 INLINE float128
packFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)941  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
942 {
943     float128 z;
944 
945     z.low = zSig1;
946     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
947     return z;
948 
949 }
950 
951 /*----------------------------------------------------------------------------
952 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
953 | and extended significand formed by the concatenation of `zSig0', `zSig1',
954 | and `zSig2', and returns the proper quadruple-precision floating-point value
955 | corresponding to the abstract input.  Ordinarily, the abstract value is
956 | simply rounded and packed into the quadruple-precision format, with the
957 | inexact exception raised if the abstract input cannot be represented
958 | exactly.  However, if the abstract value is too large, the overflow and
959 | inexact exceptions are raised and an infinity or maximal finite value is
960 | returned.  If the abstract value is too small, the input value is rounded to
961 | a subnormal number, and the underflow and inexact exceptions are raised if
962 | the abstract input cannot be represented exactly as a subnormal quadruple-
963 | precision floating-point number.
964 |     The input significand must be normalized or smaller.  If the input
965 | significand is not normalized, `zExp' must be 0; in that case, the result
966 | returned is a subnormal number, and it must not require rounding.  In the
967 | usual case that the input significand is normalized, `zExp' must be 1 less
968 | than the ``true'' floating-point exponent.  The handling of underflow and
969 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
970 *----------------------------------------------------------------------------*/
971 
972 static float128
roundAndPackFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1,bits64 zSig2)973  roundAndPackFloat128(
974      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
975 {
976     int8 roundingMode;
977     flag roundNearestEven, increment, isTiny;
978 
979     roundingMode = float_rounding_mode();
980     roundNearestEven = ( roundingMode == float_round_nearest_even );
981     increment = ( (sbits64) zSig2 < 0 );
982     if ( ! roundNearestEven ) {
983         if ( roundingMode == float_round_to_zero ) {
984             increment = 0;
985         }
986         else {
987             if ( zSign ) {
988                 increment = ( roundingMode == float_round_down ) && zSig2;
989             }
990             else {
991                 increment = ( roundingMode == float_round_up ) && zSig2;
992             }
993         }
994     }
995     if ( 0x7FFD <= (bits32) zExp ) {
996         if (    ( 0x7FFD < zExp )
997              || (    ( zExp == 0x7FFD )
998                   && eq128(
999                          LIT64( 0x0001FFFFFFFFFFFF ),
1000                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1001                          zSig0,
1002                          zSig1
1003                      )
1004                   && increment
1005                 )
1006            ) {
1007             float_raise( float_flag_overflow | float_flag_inexact );
1008             if (    ( roundingMode == float_round_to_zero )
1009                  || ( zSign && ( roundingMode == float_round_up ) )
1010                  || ( ! zSign && ( roundingMode == float_round_down ) )
1011                ) {
1012                 return
1013                     packFloat128(
1014                         zSign,
1015                         0x7FFE,
1016                         LIT64( 0x0000FFFFFFFFFFFF ),
1017                         LIT64( 0xFFFFFFFFFFFFFFFF )
1018                     );
1019             }
1020             return packFloat128( zSign, 0x7FFF, 0, 0 );
1021         }
1022         if ( zExp < 0 ) {
1023             isTiny =
1024                    ( float_detect_tininess == float_tininess_before_rounding )
1025                 || ( zExp < -1 )
1026                 || ! increment
1027                 || lt128(
1028                        zSig0,
1029                        zSig1,
1030                        LIT64( 0x0001FFFFFFFFFFFF ),
1031                        LIT64( 0xFFFFFFFFFFFFFFFF )
1032                    );
1033             shift128ExtraRightJamming(
1034                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1035             zExp = 0;
1036             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1037             if ( roundNearestEven ) {
1038                 increment = ( (sbits64) zSig2 < 0 );
1039             }
1040             else {
1041                 if ( zSign ) {
1042                     increment = ( roundingMode == float_round_down ) && zSig2;
1043                 }
1044                 else {
1045                     increment = ( roundingMode == float_round_up ) && zSig2;
1046                 }
1047             }
1048         }
1049     }
1050     if ( zSig2 ) float_set_inexact();
1051     if ( increment ) {
1052         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1053         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1054     }
1055     else {
1056         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1057     }
1058     return packFloat128( zSign, zExp, zSig0, zSig1 );
1059 
1060 }
1061 
1062 /*----------------------------------------------------------------------------
1063 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1064 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1065 | returns the proper quadruple-precision floating-point value corresponding
1066 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1067 | except that the input significand has fewer bits and does not have to be
1068 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1069 | point exponent.
1070 *----------------------------------------------------------------------------*/
1071 
1072 static float128
normalizeRoundAndPackFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)1073  normalizeRoundAndPackFloat128(
1074      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1075 {
1076     int8 shiftCount;
1077     bits64 zSig2;
1078 
1079     if ( zSig0 == 0 ) {
1080         zSig0 = zSig1;
1081         zSig1 = 0;
1082         zExp -= 64;
1083     }
1084     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1085     if ( 0 <= shiftCount ) {
1086         zSig2 = 0;
1087         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1088     }
1089     else {
1090         shift128ExtraRightJamming(
1091             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1092     }
1093     zExp -= shiftCount;
1094     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1095 
1096 }
1097 
1098 #endif
1099 
1100 /*----------------------------------------------------------------------------
1101 | Returns the result of converting the 32-bit two's complement integer `a'
1102 | to the single-precision floating-point format.  The conversion is performed
1103 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1104 *----------------------------------------------------------------------------*/
1105 
int32_to_float32(int32 a)1106 float32 int32_to_float32( int32 a )
1107 {
1108     flag zSign;
1109 
1110     if ( a == 0 ) return 0;
1111     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1112     zSign = ( a < 0 );
1113     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1114 
1115 }
1116 
1117 /*----------------------------------------------------------------------------
1118 | Returns the result of converting the 32-bit two's complement integer `a'
1119 | to the double-precision floating-point format.  The conversion is performed
1120 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1121 *----------------------------------------------------------------------------*/
1122 
int32_to_float64(int32 a)1123 float64 int32_to_float64( int32 a )
1124 {
1125     flag zSign;
1126     uint32 absA;
1127     int8 shiftCount;
1128     bits64 zSig;
1129 
1130     if ( a == 0 ) return 0;
1131     zSign = ( a < 0 );
1132     absA = zSign ? - a : a;
1133     shiftCount = countLeadingZeros32( absA ) + 21;
1134     zSig = absA;
1135     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1136 
1137 }
1138 
1139 #ifdef FLOATX80
1140 
1141 /*----------------------------------------------------------------------------
1142 | Returns the result of converting the 32-bit two's complement integer `a'
1143 | to the extended double-precision floating-point format.  The conversion
1144 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1145 | Arithmetic.
1146 *----------------------------------------------------------------------------*/
1147 
int32_to_floatx80(int32 a)1148 floatx80 int32_to_floatx80( int32 a )
1149 {
1150     flag zSign;
1151     uint32 absA;
1152     int8 shiftCount;
1153     bits64 zSig;
1154 
1155     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1156     zSign = ( a < 0 );
1157     absA = zSign ? - a : a;
1158     shiftCount = countLeadingZeros32( absA ) + 32;
1159     zSig = absA;
1160     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1161 
1162 }
1163 
1164 #endif
1165 
1166 #ifdef FLOAT128
1167 
1168 /*----------------------------------------------------------------------------
1169 | Returns the result of converting the 32-bit two's complement integer `a' to
1170 | the quadruple-precision floating-point format.  The conversion is performed
1171 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1172 *----------------------------------------------------------------------------*/
1173 
int32_to_float128(int32 a)1174 float128 int32_to_float128( int32 a )
1175 {
1176     flag zSign;
1177     uint32 absA;
1178     int8 shiftCount;
1179     bits64 zSig0;
1180 
1181     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1182     zSign = ( a < 0 );
1183     absA = zSign ? - a : a;
1184     shiftCount = countLeadingZeros32( absA ) + 17;
1185     zSig0 = absA;
1186     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1187 
1188 }
1189 
1190 #endif
1191 
1192 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1193 /*----------------------------------------------------------------------------
1194 | Returns the result of converting the 64-bit two's complement integer `a'
1195 | to the single-precision floating-point format.  The conversion is performed
1196 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1197 *----------------------------------------------------------------------------*/
1198 
int64_to_float32(int64 a)1199 float32 int64_to_float32( int64 a )
1200 {
1201     flag zSign;
1202     uint64 absA;
1203     int8 shiftCount;
1204 
1205     if ( a == 0 ) return 0;
1206     zSign = ( a < 0 );
1207     absA = zSign ? - a : a;
1208     shiftCount = countLeadingZeros64( absA ) - 40;
1209     if ( 0 <= shiftCount ) {
1210         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1211     }
1212     else {
1213         shiftCount += 7;
1214         if ( shiftCount < 0 ) {
1215             shift64RightJamming( absA, - shiftCount, &absA );
1216         }
1217         else {
1218             absA <<= shiftCount;
1219         }
1220         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1221     }
1222 
1223 }
1224 
1225 /*----------------------------------------------------------------------------
1226 | Returns the result of converting the 64-bit two's complement integer `a'
1227 | to the double-precision floating-point format.  The conversion is performed
1228 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1229 *----------------------------------------------------------------------------*/
1230 
int64_to_float64(int64 a)1231 float64 int64_to_float64( int64 a )
1232 {
1233     flag zSign;
1234 
1235     if ( a == 0 ) return 0;
1236     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1237         return packFloat64( 1, 0x43E, 0 );
1238     }
1239     zSign = ( a < 0 );
1240     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1241 
1242 }
1243 
1244 #ifdef FLOATX80
1245 
1246 /*----------------------------------------------------------------------------
1247 | Returns the result of converting the 64-bit two's complement integer `a'
1248 | to the extended double-precision floating-point format.  The conversion
1249 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1250 | Arithmetic.
1251 *----------------------------------------------------------------------------*/
1252 
int64_to_floatx80(int64 a)1253 floatx80 int64_to_floatx80( int64 a )
1254 {
1255     flag zSign;
1256     uint64 absA;
1257     int8 shiftCount;
1258 
1259     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1260     zSign = ( a < 0 );
1261     absA = zSign ? - a : a;
1262     shiftCount = countLeadingZeros64( absA );
1263     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1264 
1265 }
1266 
1267 #endif
1268 
1269 #ifdef FLOAT128
1270 
1271 /*----------------------------------------------------------------------------
1272 | Returns the result of converting the 64-bit two's complement integer `a' to
1273 | the quadruple-precision floating-point format.  The conversion is performed
1274 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1275 *----------------------------------------------------------------------------*/
1276 
int64_to_float128(int64 a)1277 float128 int64_to_float128( int64 a )
1278 {
1279     flag zSign;
1280     uint64 absA;
1281     int8 shiftCount;
1282     int32 zExp;
1283     bits64 zSig0, zSig1;
1284 
1285     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1286     zSign = ( a < 0 );
1287     absA = zSign ? - a : a;
1288     shiftCount = countLeadingZeros64( absA ) + 49;
1289     zExp = 0x406E - shiftCount;
1290     if ( 64 <= shiftCount ) {
1291         zSig1 = 0;
1292         zSig0 = absA;
1293         shiftCount -= 64;
1294     }
1295     else {
1296         zSig1 = absA;
1297         zSig0 = 0;
1298     }
1299     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1300     return packFloat128( zSign, zExp, zSig0, zSig1 );
1301 
1302 }
1303 
1304 #endif
1305 #endif /* !SOFTFLOAT_FOR_GCC */
1306 
1307 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1308 /*----------------------------------------------------------------------------
1309 | Returns the result of converting the single-precision floating-point value
1310 | `a' to the 32-bit two's complement integer format.  The conversion is
1311 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1312 | Arithmetic---which means in particular that the conversion is rounded
1313 | according to the current rounding mode.  If `a' is a NaN, the largest
1314 | positive integer is returned.  Otherwise, if the conversion overflows, the
1315 | largest integer with the same sign as `a' is returned.
1316 *----------------------------------------------------------------------------*/
1317 
float32_to_int32(float32 a)1318 int32 float32_to_int32( float32 a )
1319 {
1320     flag aSign;
1321     int16 aExp, shiftCount;
1322     bits32 aSig;
1323     bits64 aSig64;
1324 
1325     aSig = extractFloat32Frac( a );
1326     aExp = extractFloat32Exp( a );
1327     aSign = extractFloat32Sign( a );
1328     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1329     if ( aExp ) aSig |= 0x00800000;
1330     shiftCount = 0xAF - aExp;
1331     aSig64 = aSig;
1332     aSig64 <<= 32;
1333     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1334     return roundAndPackInt32( aSign, aSig64 );
1335 
1336 }
1337 #endif /* !SOFTFLOAT_FOR_GCC */
1338 
1339 /*----------------------------------------------------------------------------
1340 | Returns the result of converting the single-precision floating-point value
1341 | `a' to the 32-bit two's complement integer format.  The conversion is
1342 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1343 | Arithmetic, except that the conversion is always rounded toward zero.
1344 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1345 | the conversion overflows, the largest integer with the same sign as `a' is
1346 | returned.
1347 *----------------------------------------------------------------------------*/
1348 
float32_to_int32_round_to_zero(float32 a)1349 int32 float32_to_int32_round_to_zero( float32 a )
1350 {
1351     flag aSign;
1352     int16 aExp, shiftCount;
1353     bits32 aSig;
1354     int32 z;
1355 
1356     aSig = extractFloat32Frac( a );
1357     aExp = extractFloat32Exp( a );
1358     aSign = extractFloat32Sign( a );
1359     shiftCount = aExp - 0x9E;
1360     if ( 0 <= shiftCount ) {
1361         if ( a != 0xCF000000 ) {
1362             float_raise( float_flag_invalid );
1363             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1364         }
1365         return (sbits32) 0x80000000;
1366     }
1367     else if ( aExp <= 0x7E ) {
1368         if ( aExp | aSig ) float_set_inexact();
1369         return 0;
1370     }
1371     aSig = ( aSig | 0x00800000 )<<8;
1372     z = aSig>>( - shiftCount );
1373     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1374         float_set_inexact();
1375     }
1376     if ( aSign ) z = - z;
1377     return z;
1378 
1379 }
1380 
1381 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1382 /*----------------------------------------------------------------------------
1383 | Returns the result of converting the single-precision floating-point value
1384 | `a' to the 64-bit two's complement integer format.  The conversion is
1385 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1386 | Arithmetic---which means in particular that the conversion is rounded
1387 | according to the current rounding mode.  If `a' is a NaN, the largest
1388 | positive integer is returned.  Otherwise, if the conversion overflows, the
1389 | largest integer with the same sign as `a' is returned.
1390 *----------------------------------------------------------------------------*/
1391 
float32_to_int64(float32 a)1392 int64 float32_to_int64( float32 a )
1393 {
1394     flag aSign;
1395     int16 aExp, shiftCount;
1396     bits32 aSig;
1397     bits64 aSig64, aSigExtra;
1398 
1399     aSig = extractFloat32Frac( a );
1400     aExp = extractFloat32Exp( a );
1401     aSign = extractFloat32Sign( a );
1402     shiftCount = 0xBE - aExp;
1403     if ( shiftCount < 0 ) {
1404         float_raise( float_flag_invalid );
1405         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1406             return LIT64( 0x7FFFFFFFFFFFFFFF );
1407         }
1408         return (sbits64) LIT64( 0x8000000000000000 );
1409     }
1410     if ( aExp ) aSig |= 0x00800000;
1411     aSig64 = aSig;
1412     aSig64 <<= 40;
1413     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1414     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1415 
1416 }
1417 
1418 /*----------------------------------------------------------------------------
1419 | Returns the result of converting the single-precision floating-point value
1420 | `a' to the 64-bit two's complement integer format.  The conversion is
1421 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1422 | Arithmetic, except that the conversion is always rounded toward zero.  If
1423 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1424 | conversion overflows, the largest integer with the same sign as `a' is
1425 | returned.
1426 *----------------------------------------------------------------------------*/
1427 
float32_to_int64_round_to_zero(float32 a)1428 int64 float32_to_int64_round_to_zero( float32 a )
1429 {
1430     flag aSign;
1431     int16 aExp, shiftCount;
1432     bits32 aSig;
1433     bits64 aSig64;
1434     int64 z;
1435 
1436     aSig = extractFloat32Frac( a );
1437     aExp = extractFloat32Exp( a );
1438     aSign = extractFloat32Sign( a );
1439     shiftCount = aExp - 0xBE;
1440     if ( 0 <= shiftCount ) {
1441         if ( a != 0xDF000000 ) {
1442             float_raise( float_flag_invalid );
1443             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1444                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1445             }
1446         }
1447         return (sbits64) LIT64( 0x8000000000000000 );
1448     }
1449     else if ( aExp <= 0x7E ) {
1450         if ( aExp | aSig ) float_set_inexact();
1451         return 0;
1452     }
1453     aSig64 = aSig | 0x00800000;
1454     aSig64 <<= 40;
1455     z = aSig64>>( - shiftCount );
1456     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1457         float_set_inexact();
1458     }
1459     if ( aSign ) z = - z;
1460     return z;
1461 
1462 }
1463 #endif /* !SOFTFLOAT_FOR_GCC */
1464 
1465 /*----------------------------------------------------------------------------
1466 | Returns the result of converting the single-precision floating-point value
1467 | `a' to the double-precision floating-point format.  The conversion is
1468 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1469 | Arithmetic.
1470 *----------------------------------------------------------------------------*/
1471 
float32_to_float64(float32 a)1472 float64 float32_to_float64( float32 a )
1473 {
1474     flag aSign;
1475     int16 aExp;
1476     bits32 aSig;
1477 
1478     aSig = extractFloat32Frac( a );
1479     aExp = extractFloat32Exp( a );
1480     aSign = extractFloat32Sign( a );
1481     if ( aExp == 0xFF ) {
1482         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1483         return packFloat64( aSign, 0x7FF, 0 );
1484     }
1485     if ( aExp == 0 ) {
1486         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1487         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1488         --aExp;
1489     }
1490     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1491 
1492 }
1493 
1494 #ifdef FLOATX80
1495 
1496 /*----------------------------------------------------------------------------
1497 | Returns the result of converting the single-precision floating-point value
1498 | `a' to the extended double-precision floating-point format.  The conversion
1499 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1500 | Arithmetic.
1501 *----------------------------------------------------------------------------*/
1502 
float32_to_floatx80(float32 a)1503 floatx80 float32_to_floatx80( float32 a )
1504 {
1505     flag aSign;
1506     int16 aExp;
1507     bits32 aSig;
1508 
1509     aSig = extractFloat32Frac( a );
1510     aExp = extractFloat32Exp( a );
1511     aSign = extractFloat32Sign( a );
1512     if ( aExp == 0xFF ) {
1513         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1514         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1515     }
1516     if ( aExp == 0 ) {
1517         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1518         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1519     }
1520     aSig |= 0x00800000;
1521     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1522 
1523 }
1524 
1525 #endif
1526 
1527 #ifdef FLOAT128
1528 
1529 /*----------------------------------------------------------------------------
1530 | Returns the result of converting the single-precision floating-point value
1531 | `a' to the double-precision floating-point format.  The conversion is
1532 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1533 | Arithmetic.
1534 *----------------------------------------------------------------------------*/
1535 
float32_to_float128(float32 a)1536 float128 float32_to_float128( float32 a )
1537 {
1538     flag aSign;
1539     int16 aExp;
1540     bits32 aSig;
1541 
1542     aSig = extractFloat32Frac( a );
1543     aExp = extractFloat32Exp( a );
1544     aSign = extractFloat32Sign( a );
1545     if ( aExp == 0xFF ) {
1546         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1547         return packFloat128( aSign, 0x7FFF, 0, 0 );
1548     }
1549     if ( aExp == 0 ) {
1550         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1551         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1552         --aExp;
1553     }
1554     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1555 
1556 }
1557 
1558 #endif
1559 
1560 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1561 /*----------------------------------------------------------------------------
1562 | Rounds the single-precision floating-point value `a' to an integer, and
1563 | returns the result as a single-precision floating-point value.  The
1564 | operation is performed according to the IEC/IEEE Standard for Binary
1565 | Floating-Point Arithmetic.
1566 *----------------------------------------------------------------------------*/
1567 
float32_round_to_int(float32 a)1568 float32 float32_round_to_int( float32 a )
1569 {
1570     flag aSign;
1571     int16 aExp;
1572     bits32 lastBitMask, roundBitsMask;
1573     int8 roundingMode;
1574     float32 z;
1575 
1576     aExp = extractFloat32Exp( a );
1577     if ( 0x96 <= aExp ) {
1578         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1579             return propagateFloat32NaN( a, a );
1580         }
1581         return a;
1582     }
1583     if ( aExp <= 0x7E ) {
1584         if ( (bits32) ( a<<1 ) == 0 ) return a;
1585         float_set_inexact();
1586         aSign = extractFloat32Sign( a );
1587         switch ( float_rounding_mode() ) {
1588          case float_round_nearest_even:
1589             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1590                 return packFloat32( aSign, 0x7F, 0 );
1591             }
1592             break;
1593          case float_round_down:
1594             return aSign ? 0xBF800000 : 0;
1595          case float_round_up:
1596             return aSign ? 0x80000000 : 0x3F800000;
1597         }
1598         return packFloat32( aSign, 0, 0 );
1599     }
1600     lastBitMask = 1;
1601     lastBitMask <<= 0x96 - aExp;
1602     roundBitsMask = lastBitMask - 1;
1603     z = a;
1604     roundingMode = float_rounding_mode();
1605     if ( roundingMode == float_round_nearest_even ) {
1606         z += lastBitMask>>1;
1607         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1608     }
1609     else if ( roundingMode != float_round_to_zero ) {
1610         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1611             z += roundBitsMask;
1612         }
1613     }
1614     z &= ~ roundBitsMask;
1615     if ( z != a ) float_set_inexact();
1616     return z;
1617 
1618 }
1619 #endif /* !SOFTFLOAT_FOR_GCC */
1620 
1621 /*----------------------------------------------------------------------------
1622 | Returns the result of adding the absolute values of the single-precision
1623 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1624 | before being returned.  `zSign' is ignored if the result is a NaN.
1625 | The addition is performed according to the IEC/IEEE Standard for Binary
1626 | Floating-Point Arithmetic.
1627 *----------------------------------------------------------------------------*/
1628 
addFloat32Sigs(float32 a,float32 b,flag zSign)1629 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1630 {
1631     int16 aExp, bExp, zExp;
1632     bits32 aSig, bSig, zSig;
1633     int16 expDiff;
1634 
1635     aSig = extractFloat32Frac( a );
1636     aExp = extractFloat32Exp( a );
1637     bSig = extractFloat32Frac( b );
1638     bExp = extractFloat32Exp( b );
1639     expDiff = aExp - bExp;
1640     aSig <<= 6;
1641     bSig <<= 6;
1642     if ( 0 < expDiff ) {
1643         if ( aExp == 0xFF ) {
1644             if ( aSig ) return propagateFloat32NaN( a, b );
1645             return a;
1646         }
1647         if ( bExp == 0 ) {
1648             --expDiff;
1649         }
1650         else {
1651             bSig |= 0x20000000;
1652         }
1653         shift32RightJamming( bSig, expDiff, &bSig );
1654         zExp = aExp;
1655     }
1656     else if ( expDiff < 0 ) {
1657         if ( bExp == 0xFF ) {
1658             if ( bSig ) return propagateFloat32NaN( a, b );
1659             return packFloat32( zSign, 0xFF, 0 );
1660         }
1661         if ( aExp == 0 ) {
1662             ++expDiff;
1663         }
1664         else {
1665             aSig |= 0x20000000;
1666         }
1667         shift32RightJamming( aSig, - expDiff, &aSig );
1668         zExp = bExp;
1669     }
1670     else {
1671         if ( aExp == 0xFF ) {
1672             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1673             return a;
1674         }
1675         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1676         zSig = 0x40000000 + aSig + bSig;
1677         zExp = aExp;
1678         goto roundAndPack;
1679     }
1680     aSig |= 0x20000000;
1681     zSig = ( aSig + bSig )<<1;
1682     --zExp;
1683     if ( (sbits32) zSig < 0 ) {
1684         zSig = aSig + bSig;
1685         ++zExp;
1686     }
1687  roundAndPack:
1688     return roundAndPackFloat32( zSign, zExp, zSig );
1689 
1690 }
1691 
1692 /*----------------------------------------------------------------------------
1693 | Returns the result of subtracting the absolute values of the single-
1694 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1695 | difference is negated before being returned.  `zSign' is ignored if the
1696 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1697 | Standard for Binary Floating-Point Arithmetic.
1698 *----------------------------------------------------------------------------*/
1699 
subFloat32Sigs(float32 a,float32 b,flag zSign)1700 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1701 {
1702     int16 aExp, bExp, zExp;
1703     bits32 aSig, bSig, zSig;
1704     int16 expDiff;
1705 
1706     aSig = extractFloat32Frac( a );
1707     aExp = extractFloat32Exp( a );
1708     bSig = extractFloat32Frac( b );
1709     bExp = extractFloat32Exp( b );
1710     expDiff = aExp - bExp;
1711     aSig <<= 7;
1712     bSig <<= 7;
1713     if ( 0 < expDiff ) goto aExpBigger;
1714     if ( expDiff < 0 ) goto bExpBigger;
1715     if ( aExp == 0xFF ) {
1716         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1717         float_raise( float_flag_invalid );
1718         return float32_default_nan;
1719     }
1720     if ( aExp == 0 ) {
1721         aExp = 1;
1722         bExp = 1;
1723     }
1724     if ( bSig < aSig ) goto aBigger;
1725     if ( aSig < bSig ) goto bBigger;
1726     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1727  bExpBigger:
1728     if ( bExp == 0xFF ) {
1729         if ( bSig ) return propagateFloat32NaN( a, b );
1730         return packFloat32( zSign ^ 1, 0xFF, 0 );
1731     }
1732     if ( aExp == 0 ) {
1733         ++expDiff;
1734     }
1735     else {
1736         aSig |= 0x40000000;
1737     }
1738     shift32RightJamming( aSig, - expDiff, &aSig );
1739     bSig |= 0x40000000;
1740  bBigger:
1741     zSig = bSig - aSig;
1742     zExp = bExp;
1743     zSign ^= 1;
1744     goto normalizeRoundAndPack;
1745  aExpBigger:
1746     if ( aExp == 0xFF ) {
1747         if ( aSig ) return propagateFloat32NaN( a, b );
1748         return a;
1749     }
1750     if ( bExp == 0 ) {
1751         --expDiff;
1752     }
1753     else {
1754         bSig |= 0x40000000;
1755     }
1756     shift32RightJamming( bSig, expDiff, &bSig );
1757     aSig |= 0x40000000;
1758  aBigger:
1759     zSig = aSig - bSig;
1760     zExp = aExp;
1761  normalizeRoundAndPack:
1762     --zExp;
1763     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1764 
1765 }
1766 
1767 /*----------------------------------------------------------------------------
1768 | Returns the result of adding the single-precision floating-point values `a'
1769 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1770 | Binary Floating-Point Arithmetic.
1771 *----------------------------------------------------------------------------*/
1772 
float32_add(float32 a,float32 b)1773 float32 float32_add( float32 a, float32 b )
1774 {
1775     flag aSign, bSign;
1776 
1777     aSign = extractFloat32Sign( a );
1778     bSign = extractFloat32Sign( b );
1779     if ( aSign == bSign ) {
1780         return addFloat32Sigs( a, b, aSign );
1781     }
1782     else {
1783         return subFloat32Sigs( a, b, aSign );
1784     }
1785 
1786 }
1787 
1788 /*----------------------------------------------------------------------------
1789 | Returns the result of subtracting the single-precision floating-point values
1790 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1791 | for Binary Floating-Point Arithmetic.
1792 *----------------------------------------------------------------------------*/
1793 
float32_sub(float32 a,float32 b)1794 float32 float32_sub( float32 a, float32 b )
1795 {
1796     flag aSign, bSign;
1797 
1798     aSign = extractFloat32Sign( a );
1799     bSign = extractFloat32Sign( b );
1800     if ( aSign == bSign ) {
1801         return subFloat32Sigs( a, b, aSign );
1802     }
1803     else {
1804         return addFloat32Sigs( a, b, aSign );
1805     }
1806 
1807 }
1808 
1809 /*----------------------------------------------------------------------------
1810 | Returns the result of multiplying the single-precision floating-point values
1811 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1812 | for Binary Floating-Point Arithmetic.
1813 *----------------------------------------------------------------------------*/
1814 
float32_mul(float32 a,float32 b)1815 float32 float32_mul( float32 a, float32 b )
1816 {
1817     flag aSign, bSign, zSign;
1818     int16 aExp, bExp, zExp;
1819     bits32 aSig, bSig;
1820     bits64 zSig64;
1821     bits32 zSig;
1822 
1823     aSig = extractFloat32Frac( a );
1824     aExp = extractFloat32Exp( a );
1825     aSign = extractFloat32Sign( a );
1826     bSig = extractFloat32Frac( b );
1827     bExp = extractFloat32Exp( b );
1828     bSign = extractFloat32Sign( b );
1829     zSign = aSign ^ bSign;
1830     if ( aExp == 0xFF ) {
1831         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1832             return propagateFloat32NaN( a, b );
1833         }
1834         if ( ( bExp | bSig ) == 0 ) {
1835             float_raise( float_flag_invalid );
1836             return float32_default_nan;
1837         }
1838         return packFloat32( zSign, 0xFF, 0 );
1839     }
1840     if ( bExp == 0xFF ) {
1841         if ( bSig ) return propagateFloat32NaN( a, b );
1842         if ( ( aExp | aSig ) == 0 ) {
1843             float_raise( float_flag_invalid );
1844             return float32_default_nan;
1845         }
1846         return packFloat32( zSign, 0xFF, 0 );
1847     }
1848     if ( aExp == 0 ) {
1849         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1850         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1851     }
1852     if ( bExp == 0 ) {
1853         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1854         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1855     }
1856     zExp = aExp + bExp - 0x7F;
1857     aSig = ( aSig | 0x00800000 )<<7;
1858     bSig = ( bSig | 0x00800000 )<<8;
1859     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1860     zSig = zSig64;
1861     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1862         zSig <<= 1;
1863         --zExp;
1864     }
1865     return roundAndPackFloat32( zSign, zExp, zSig );
1866 
1867 }
1868 
1869 /*----------------------------------------------------------------------------
1870 | Returns the result of dividing the single-precision floating-point value `a'
1871 | by the corresponding value `b'.  The operation is performed according to the
1872 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1873 *----------------------------------------------------------------------------*/
1874 
float32_div(float32 a,float32 b)1875 float32 float32_div( float32 a, float32 b )
1876 {
1877     flag aSign, bSign, zSign;
1878     int16 aExp, bExp, zExp;
1879     bits32 aSig, bSig, zSig;
1880 
1881     aSig = extractFloat32Frac( a );
1882     aExp = extractFloat32Exp( a );
1883     aSign = extractFloat32Sign( a );
1884     bSig = extractFloat32Frac( b );
1885     bExp = extractFloat32Exp( b );
1886     bSign = extractFloat32Sign( b );
1887     zSign = aSign ^ bSign;
1888     if ( aExp == 0xFF ) {
1889         if ( aSig ) return propagateFloat32NaN( a, b );
1890         if ( bExp == 0xFF ) {
1891             if ( bSig ) return propagateFloat32NaN( a, b );
1892             float_raise( float_flag_invalid );
1893             return float32_default_nan;
1894         }
1895         return packFloat32( zSign, 0xFF, 0 );
1896     }
1897     if ( bExp == 0xFF ) {
1898         if ( bSig ) return propagateFloat32NaN( a, b );
1899         return packFloat32( zSign, 0, 0 );
1900     }
1901     if ( bExp == 0 ) {
1902         if ( bSig == 0 ) {
1903             if ( ( aExp | aSig ) == 0 ) {
1904                 float_raise( float_flag_invalid );
1905                 return float32_default_nan;
1906             }
1907             float_raise( float_flag_divbyzero );
1908             return packFloat32( zSign, 0xFF, 0 );
1909         }
1910         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1911     }
1912     if ( aExp == 0 ) {
1913         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1914         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1915     }
1916     zExp = aExp - bExp + 0x7D;
1917     aSig = ( aSig | 0x00800000 )<<7;
1918     bSig = ( bSig | 0x00800000 )<<8;
1919     if ( bSig <= ( aSig + aSig ) ) {
1920         aSig >>= 1;
1921         ++zExp;
1922     }
1923     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1924     if ( ( zSig & 0x3F ) == 0 ) {
1925         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1926     }
1927     return roundAndPackFloat32( zSign, zExp, zSig );
1928 
1929 }
1930 
1931 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1932 /*----------------------------------------------------------------------------
1933 | Returns the remainder of the single-precision floating-point value `a'
1934 | with respect to the corresponding value `b'.  The operation is performed
1935 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1936 *----------------------------------------------------------------------------*/
1937 
float32_rem(float32 a,float32 b)1938 float32 float32_rem( float32 a, float32 b )
1939 {
1940     flag aSign, bSign __unused, zSign;
1941     int16 aExp, bExp, expDiff;
1942     bits32 aSig, bSig;
1943     bits32 q;
1944     bits64 aSig64, bSig64, q64;
1945     bits32 alternateASig;
1946     sbits32 sigMean;
1947 
1948     aSig = extractFloat32Frac( a );
1949     aExp = extractFloat32Exp( a );
1950     aSign = extractFloat32Sign( a );
1951     bSig = extractFloat32Frac( b );
1952     bExp = extractFloat32Exp( b );
1953     bSign = extractFloat32Sign( b );
1954     if ( aExp == 0xFF ) {
1955         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1956             return propagateFloat32NaN( a, b );
1957         }
1958         float_raise( float_flag_invalid );
1959         return float32_default_nan;
1960     }
1961     if ( bExp == 0xFF ) {
1962         if ( bSig ) return propagateFloat32NaN( a, b );
1963         return a;
1964     }
1965     if ( bExp == 0 ) {
1966         if ( bSig == 0 ) {
1967             float_raise( float_flag_invalid );
1968             return float32_default_nan;
1969         }
1970         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1971     }
1972     if ( aExp == 0 ) {
1973         if ( aSig == 0 ) return a;
1974         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1975     }
1976     expDiff = aExp - bExp;
1977     aSig |= 0x00800000;
1978     bSig |= 0x00800000;
1979     if ( expDiff < 32 ) {
1980         aSig <<= 8;
1981         bSig <<= 8;
1982         if ( expDiff < 0 ) {
1983             if ( expDiff < -1 ) return a;
1984             aSig >>= 1;
1985         }
1986         q = ( bSig <= aSig );
1987         if ( q ) aSig -= bSig;
1988         if ( 0 < expDiff ) {
1989             q = ( ( (bits64) aSig )<<32 ) / bSig;
1990             q >>= 32 - expDiff;
1991             bSig >>= 2;
1992             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1993         }
1994         else {
1995             aSig >>= 2;
1996             bSig >>= 2;
1997         }
1998     }
1999     else {
2000         if ( bSig <= aSig ) aSig -= bSig;
2001         aSig64 = ( (bits64) aSig )<<40;
2002         bSig64 = ( (bits64) bSig )<<40;
2003         expDiff -= 64;
2004         while ( 0 < expDiff ) {
2005             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2006             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2007             aSig64 = - ( ( bSig * q64 )<<38 );
2008             expDiff -= 62;
2009         }
2010         expDiff += 64;
2011         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2012         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2013         q = q64>>( 64 - expDiff );
2014         bSig <<= 6;
2015         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2016     }
2017     do {
2018         alternateASig = aSig;
2019         ++q;
2020         aSig -= bSig;
2021     } while ( 0 <= (sbits32) aSig );
2022     sigMean = aSig + alternateASig;
2023     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2024         aSig = alternateASig;
2025     }
2026     zSign = ( (sbits32) aSig < 0 );
2027     if ( zSign ) aSig = - aSig;
2028     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2029 
2030 }
2031 #endif /* !SOFTFLOAT_FOR_GCC */
2032 
2033 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2034 
2035 /*----------------------------------------------------------------------------
2036 | Returns the square root of the single-precision floating-point value `a'.
2037 | The operation is performed according to the IEC/IEEE Standard for Binary
2038 | Floating-Point Arithmetic.
2039 *----------------------------------------------------------------------------*/
2040 
float32_sqrt(float32 a)2041 float32 float32_sqrt( float32 a )
2042 {
2043     flag aSign;
2044     int16 aExp, zExp;
2045     bits32 aSig, zSig;
2046     bits64 rem, term;
2047 
2048     aSig = extractFloat32Frac( a );
2049     aExp = extractFloat32Exp( a );
2050     aSign = extractFloat32Sign( a );
2051     if ( aExp == 0xFF ) {
2052         if ( aSig ) return propagateFloat32NaN( a, 0 );
2053         if ( ! aSign ) return a;
2054         float_raise( float_flag_invalid );
2055         return float32_default_nan;
2056     }
2057     if ( aSign ) {
2058         if ( ( aExp | aSig ) == 0 ) return a;
2059         float_raise( float_flag_invalid );
2060         return float32_default_nan;
2061     }
2062     if ( aExp == 0 ) {
2063         if ( aSig == 0 ) return 0;
2064         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2065     }
2066     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2067     aSig = ( aSig | 0x00800000 )<<8;
2068     zSig = estimateSqrt32( aExp, aSig ) + 2;
2069     if ( ( zSig & 0x7F ) <= 5 ) {
2070         if ( zSig < 2 ) {
2071             zSig = 0x7FFFFFFF;
2072             goto roundAndPack;
2073         }
2074         aSig >>= aExp & 1;
2075         term = ( (bits64) zSig ) * zSig;
2076         rem = ( ( (bits64) aSig )<<32 ) - term;
2077         while ( (sbits64) rem < 0 ) {
2078             --zSig;
2079             rem += ( ( (bits64) zSig )<<1 ) | 1;
2080         }
2081         zSig |= ( rem != 0 );
2082     }
2083     shift32RightJamming( zSig, 1, &zSig );
2084  roundAndPack:
2085     return roundAndPackFloat32( 0, zExp, zSig );
2086 
2087 }
2088 #endif /* !SOFTFLOAT_FOR_GCC */
2089 
2090 /*----------------------------------------------------------------------------
2091 | Returns 1 if the single-precision floating-point value `a' is equal to
2092 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2093 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2094 *----------------------------------------------------------------------------*/
2095 
float32_eq(float32 a,float32 b)2096 flag float32_eq( float32 a, float32 b )
2097 {
2098 
2099     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2100          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2101        ) {
2102         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2103             float_raise( float_flag_invalid );
2104         }
2105         return 0;
2106     }
2107     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2108 
2109 }
2110 
2111 /*----------------------------------------------------------------------------
2112 | Returns 1 if the single-precision floating-point value `a' is less than
2113 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2114 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2115 | Arithmetic.
2116 *----------------------------------------------------------------------------*/
2117 
float32_le(float32 a,float32 b)2118 flag float32_le( float32 a, float32 b )
2119 {
2120     flag aSign, bSign;
2121 
2122     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2123          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2124        ) {
2125         float_raise( float_flag_invalid );
2126         return 0;
2127     }
2128     aSign = extractFloat32Sign( a );
2129     bSign = extractFloat32Sign( b );
2130     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2131     return ( a == b ) || ( aSign ^ ( a < b ) );
2132 
2133 }
2134 
2135 /*----------------------------------------------------------------------------
2136 | Returns 1 if the single-precision floating-point value `a' is less than
2137 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2139 *----------------------------------------------------------------------------*/
2140 
float32_lt(float32 a,float32 b)2141 flag float32_lt( float32 a, float32 b )
2142 {
2143     flag aSign, bSign;
2144 
2145     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2146          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2147        ) {
2148         float_raise( float_flag_invalid );
2149         return 0;
2150     }
2151     aSign = extractFloat32Sign( a );
2152     bSign = extractFloat32Sign( b );
2153     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2154     return ( a != b ) && ( aSign ^ ( a < b ) );
2155 
2156 }
2157 
2158 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2159 /*----------------------------------------------------------------------------
2160 | Returns 1 if the single-precision floating-point value `a' is equal to
2161 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2162 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2163 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2164 *----------------------------------------------------------------------------*/
2165 
float32_eq_signaling(float32 a,float32 b)2166 flag float32_eq_signaling( float32 a, float32 b )
2167 {
2168 
2169     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2170          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2171        ) {
2172         float_raise( float_flag_invalid );
2173         return 0;
2174     }
2175     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2176 
2177 }
2178 
2179 /*----------------------------------------------------------------------------
2180 | Returns 1 if the single-precision floating-point value `a' is less than or
2181 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2182 | cause an exception.  Otherwise, the comparison is performed according to the
2183 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2184 *----------------------------------------------------------------------------*/
2185 
float32_le_quiet(float32 a,float32 b)2186 flag float32_le_quiet( float32 a, float32 b )
2187 {
2188     flag aSign, bSign;
2189 
2190     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2191          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2192        ) {
2193         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2194             float_raise( float_flag_invalid );
2195         }
2196         return 0;
2197     }
2198     aSign = extractFloat32Sign( a );
2199     bSign = extractFloat32Sign( b );
2200     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2201     return ( a == b ) || ( aSign ^ ( a < b ) );
2202 
2203 }
2204 
2205 /*----------------------------------------------------------------------------
2206 | Returns 1 if the single-precision floating-point value `a' is less than
2207 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2208 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2209 | Standard for Binary Floating-Point Arithmetic.
2210 *----------------------------------------------------------------------------*/
2211 
float32_lt_quiet(float32 a,float32 b)2212 flag float32_lt_quiet( float32 a, float32 b )
2213 {
2214     flag aSign, bSign;
2215 
2216     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2217          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2218        ) {
2219         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2220             float_raise( float_flag_invalid );
2221         }
2222         return 0;
2223     }
2224     aSign = extractFloat32Sign( a );
2225     bSign = extractFloat32Sign( b );
2226     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2227     return ( a != b ) && ( aSign ^ ( a < b ) );
2228 
2229 }
2230 #endif /* !SOFTFLOAT_FOR_GCC */
2231 
2232 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2233 /*----------------------------------------------------------------------------
2234 | Returns the result of converting the double-precision floating-point value
2235 | `a' to the 32-bit two's complement integer format.  The conversion is
2236 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2237 | Arithmetic---which means in particular that the conversion is rounded
2238 | according to the current rounding mode.  If `a' is a NaN, the largest
2239 | positive integer is returned.  Otherwise, if the conversion overflows, the
2240 | largest integer with the same sign as `a' is returned.
2241 *----------------------------------------------------------------------------*/
2242 
float64_to_int32(float64 a)2243 int32 float64_to_int32( float64 a )
2244 {
2245     flag aSign;
2246     int16 aExp, shiftCount;
2247     bits64 aSig;
2248 
2249     aSig = extractFloat64Frac( a );
2250     aExp = extractFloat64Exp( a );
2251     aSign = extractFloat64Sign( a );
2252     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2253     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2254     shiftCount = 0x42C - aExp;
2255     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2256     return roundAndPackInt32( aSign, aSig );
2257 
2258 }
2259 #endif /* !SOFTFLOAT_FOR_GCC */
2260 
2261 /*----------------------------------------------------------------------------
2262 | Returns the result of converting the double-precision floating-point value
2263 | `a' to the 32-bit two's complement integer format.  The conversion is
2264 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2265 | Arithmetic, except that the conversion is always rounded toward zero.
2266 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2267 | the conversion overflows, the largest integer with the same sign as `a' is
2268 | returned.
2269 *----------------------------------------------------------------------------*/
2270 
float64_to_int32_round_to_zero(float64 a)2271 int32 float64_to_int32_round_to_zero( float64 a )
2272 {
2273     flag aSign;
2274     int16 aExp, shiftCount;
2275     bits64 aSig, savedASig;
2276     int32 z;
2277 
2278     aSig = extractFloat64Frac( a );
2279     aExp = extractFloat64Exp( a );
2280     aSign = extractFloat64Sign( a );
2281     if ( 0x41E < aExp ) {
2282         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2283         goto invalid;
2284     }
2285     else if ( aExp < 0x3FF ) {
2286         if ( aExp || aSig ) float_set_inexact();
2287         return 0;
2288     }
2289     aSig |= LIT64( 0x0010000000000000 );
2290     shiftCount = 0x433 - aExp;
2291     savedASig = aSig;
2292     aSig >>= shiftCount;
2293     z = aSig;
2294     if ( aSign ) z = - z;
2295     if ( ( z < 0 ) ^ aSign ) {
2296  invalid:
2297         float_raise( float_flag_invalid );
2298         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2299     }
2300     if ( ( aSig<<shiftCount ) != savedASig ) {
2301         float_set_inexact();
2302     }
2303     return z;
2304 
2305 }
2306 
2307 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2308 /*----------------------------------------------------------------------------
2309 | Returns the result of converting the double-precision floating-point value
2310 | `a' to the 64-bit two's complement integer format.  The conversion is
2311 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2312 | Arithmetic---which means in particular that the conversion is rounded
2313 | according to the current rounding mode.  If `a' is a NaN, the largest
2314 | positive integer is returned.  Otherwise, if the conversion overflows, the
2315 | largest integer with the same sign as `a' is returned.
2316 *----------------------------------------------------------------------------*/
2317 
float64_to_int64(float64 a)2318 int64 float64_to_int64( float64 a )
2319 {
2320     flag aSign;
2321     int16 aExp, shiftCount;
2322     bits64 aSig, aSigExtra;
2323 
2324     aSig = extractFloat64Frac( a );
2325     aExp = extractFloat64Exp( a );
2326     aSign = extractFloat64Sign( a );
2327     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2328     shiftCount = 0x433 - aExp;
2329     if ( shiftCount <= 0 ) {
2330         if ( 0x43E < aExp ) {
2331             float_raise( float_flag_invalid );
2332             if (    ! aSign
2333                  || (    ( aExp == 0x7FF )
2334                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2335                ) {
2336                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2337             }
2338             return (sbits64) LIT64( 0x8000000000000000 );
2339         }
2340         aSigExtra = 0;
2341         aSig <<= - shiftCount;
2342     }
2343     else {
2344         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2345     }
2346     return roundAndPackInt64( aSign, aSig, aSigExtra );
2347 
2348 }
2349 
2350 /* like above, but result is unsigned */
float64_to_uint64(float64 a)2351 uint64 float64_to_uint64( float64 a )
2352 {
2353     flag aSign;
2354     int16 aExp, shiftCount;
2355     bits64 aSig, aSigExtra;
2356 
2357     aSig = extractFloat64Frac( a );
2358     aExp = extractFloat64Exp( a );
2359     aSign = extractFloat64Sign( a );
2360 
2361     if (aSign) {
2362 	return float64_to_int64(a);
2363     }
2364 
2365     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2366     shiftCount = 0x433 - aExp;
2367     if ( shiftCount <= 0 ) {
2368         if ( 0x43E < aExp ) {
2369             float_raise( float_flag_invalid );
2370             if (    ! aSign
2371                  || (    ( aExp == 0x7FF )
2372                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2373                ) {
2374                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2375             }
2376             return (sbits64) LIT64( 0x8000000000000000 );
2377         }
2378         aSigExtra = 0;
2379         aSig <<= - shiftCount;
2380     }
2381     else {
2382         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2383     }
2384     return roundAndPackUInt64( aSig, aSigExtra );
2385 
2386 }
2387 
2388 /*----------------------------------------------------------------------------
2389 | Returns the result of converting the double-precision floating-point value
2390 | `a' to the 64-bit two's complement integer format.  The conversion is
2391 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2392 | Arithmetic, except that the conversion is always rounded toward zero.
2393 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2394 | the conversion overflows, the largest integer with the same sign as `a' is
2395 | returned.
2396 *----------------------------------------------------------------------------*/
2397 
float64_to_int64_round_to_zero(float64 a)2398 int64 float64_to_int64_round_to_zero( float64 a )
2399 {
2400     flag aSign;
2401     int16 aExp, shiftCount;
2402     bits64 aSig;
2403     int64 z;
2404 
2405     aSig = extractFloat64Frac( a );
2406     aExp = extractFloat64Exp( a );
2407     aSign = extractFloat64Sign( a );
2408     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2409     shiftCount = aExp - 0x433;
2410     if ( 0 <= shiftCount ) {
2411         if ( 0x43E <= aExp ) {
2412             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2413                 float_raise( float_flag_invalid );
2414                 if (    ! aSign
2415                      || (    ( aExp == 0x7FF )
2416                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2417                    ) {
2418                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2419                 }
2420             }
2421             return (sbits64) LIT64( 0x8000000000000000 );
2422         }
2423         z = aSig<<shiftCount;
2424     }
2425     else {
2426         if ( aExp < 0x3FE ) {
2427             if ( aExp | aSig ) float_set_inexact();
2428             return 0;
2429         }
2430         z = aSig>>( - shiftCount );
2431         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2432             float_set_inexact();
2433         }
2434     }
2435     if ( aSign ) z = - z;
2436     return z;
2437 
2438 }
2439 #endif /* !SOFTFLOAT_FOR_GCC */
2440 
2441 /*----------------------------------------------------------------------------
2442 | Returns the result of converting the double-precision floating-point value
2443 | `a' to the single-precision floating-point format.  The conversion is
2444 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2445 | Arithmetic.
2446 *----------------------------------------------------------------------------*/
2447 
float64_to_float32(float64 a)2448 float32 float64_to_float32( float64 a )
2449 {
2450     flag aSign;
2451     int16 aExp;
2452     bits64 aSig;
2453     bits32 zSig;
2454 
2455     aSig = extractFloat64Frac( a );
2456     aExp = extractFloat64Exp( a );
2457     aSign = extractFloat64Sign( a );
2458     if ( aExp == 0x7FF ) {
2459         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2460         return packFloat32( aSign, 0xFF, 0 );
2461     }
2462     shift64RightJamming( aSig, 22, &aSig );
2463     zSig = aSig;
2464     if ( aExp || zSig ) {
2465         zSig |= 0x40000000;
2466         aExp -= 0x381;
2467     }
2468     return roundAndPackFloat32( aSign, aExp, zSig );
2469 
2470 }
2471 
2472 #ifdef FLOATX80
2473 
2474 /*----------------------------------------------------------------------------
2475 | Returns the result of converting the double-precision floating-point value
2476 | `a' to the extended double-precision floating-point format.  The conversion
2477 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2478 | Arithmetic.
2479 *----------------------------------------------------------------------------*/
2480 
float64_to_floatx80(float64 a)2481 floatx80 float64_to_floatx80( float64 a )
2482 {
2483     flag aSign;
2484     int16 aExp;
2485     bits64 aSig;
2486 
2487     aSig = extractFloat64Frac( a );
2488     aExp = extractFloat64Exp( a );
2489     aSign = extractFloat64Sign( a );
2490     if ( aExp == 0x7FF ) {
2491         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2492         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2493     }
2494     if ( aExp == 0 ) {
2495         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2496         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2497     }
2498     return
2499         packFloatx80(
2500             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2501 
2502 }
2503 
2504 #endif
2505 
2506 #ifdef FLOAT128
2507 
2508 /*----------------------------------------------------------------------------
2509 | Returns the result of converting the double-precision floating-point value
2510 | `a' to the quadruple-precision floating-point format.  The conversion is
2511 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2512 | Arithmetic.
2513 *----------------------------------------------------------------------------*/
2514 
float64_to_float128(float64 a)2515 float128 float64_to_float128( float64 a )
2516 {
2517     flag aSign;
2518     int16 aExp;
2519     bits64 aSig, zSig0, zSig1;
2520 
2521     aSig = extractFloat64Frac( a );
2522     aExp = extractFloat64Exp( a );
2523     aSign = extractFloat64Sign( a );
2524     if ( aExp == 0x7FF ) {
2525         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2526         return packFloat128( aSign, 0x7FFF, 0, 0 );
2527     }
2528     if ( aExp == 0 ) {
2529         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2530         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2531         --aExp;
2532     }
2533     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2534     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2535 
2536 }
2537 
2538 #endif
2539 
2540 #ifndef SOFTFLOAT_FOR_GCC
2541 /*----------------------------------------------------------------------------
2542 | Rounds the double-precision floating-point value `a' to an integer, and
2543 | returns the result as a double-precision floating-point value.  The
2544 | operation is performed according to the IEC/IEEE Standard for Binary
2545 | Floating-Point Arithmetic.
2546 *----------------------------------------------------------------------------*/
2547 
float64_round_to_int(float64 a)2548 float64 float64_round_to_int( float64 a )
2549 {
2550     flag aSign;
2551     int16 aExp;
2552     bits64 lastBitMask, roundBitsMask;
2553     int8 roundingMode;
2554     float64 z;
2555 
2556     aExp = extractFloat64Exp( a );
2557     if ( 0x433 <= aExp ) {
2558         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2559             return propagateFloat64NaN( a, a );
2560         }
2561         return a;
2562     }
2563     if ( aExp < 0x3FF ) {
2564         if ( (bits64) ( a<<1 ) == 0 ) return a;
2565         float_set_inexact();
2566         aSign = extractFloat64Sign( a );
2567         switch ( float_rounding_mode() ) {
2568          case float_round_nearest_even:
2569             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2570                 return packFloat64( aSign, 0x3FF, 0 );
2571             }
2572             break;
2573          case float_round_down:
2574             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2575          case float_round_up:
2576             return
2577             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2578         }
2579         return packFloat64( aSign, 0, 0 );
2580     }
2581     lastBitMask = 1;
2582     lastBitMask <<= 0x433 - aExp;
2583     roundBitsMask = lastBitMask - 1;
2584     z = a;
2585     roundingMode = float_rounding_mode();
2586     if ( roundingMode == float_round_nearest_even ) {
2587         z += lastBitMask>>1;
2588         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2589     }
2590     else if ( roundingMode != float_round_to_zero ) {
2591         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2592             z += roundBitsMask;
2593         }
2594     }
2595     z &= ~ roundBitsMask;
2596     if ( z != a ) float_set_inexact();
2597     return z;
2598 
2599 }
2600 #endif
2601 
2602 /*----------------------------------------------------------------------------
2603 | Returns the result of adding the absolute values of the double-precision
2604 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2605 | before being returned.  `zSign' is ignored if the result is a NaN.
2606 | The addition is performed according to the IEC/IEEE Standard for Binary
2607 | Floating-Point Arithmetic.
2608 *----------------------------------------------------------------------------*/
2609 
addFloat64Sigs(float64 a,float64 b,flag zSign)2610 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2611 {
2612     int16 aExp, bExp, zExp;
2613     bits64 aSig, bSig, zSig;
2614     int16 expDiff;
2615 
2616     aSig = extractFloat64Frac( a );
2617     aExp = extractFloat64Exp( a );
2618     bSig = extractFloat64Frac( b );
2619     bExp = extractFloat64Exp( b );
2620     expDiff = aExp - bExp;
2621     aSig <<= 9;
2622     bSig <<= 9;
2623     if ( 0 < expDiff ) {
2624         if ( aExp == 0x7FF ) {
2625             if ( aSig ) return propagateFloat64NaN( a, b );
2626             return a;
2627         }
2628         if ( bExp == 0 ) {
2629             --expDiff;
2630         }
2631         else {
2632             bSig |= LIT64( 0x2000000000000000 );
2633         }
2634         shift64RightJamming( bSig, expDiff, &bSig );
2635         zExp = aExp;
2636     }
2637     else if ( expDiff < 0 ) {
2638         if ( bExp == 0x7FF ) {
2639             if ( bSig ) return propagateFloat64NaN( a, b );
2640             return packFloat64( zSign, 0x7FF, 0 );
2641         }
2642         if ( aExp == 0 ) {
2643             ++expDiff;
2644         }
2645         else {
2646             aSig |= LIT64( 0x2000000000000000 );
2647         }
2648         shift64RightJamming( aSig, - expDiff, &aSig );
2649         zExp = bExp;
2650     }
2651     else {
2652         if ( aExp == 0x7FF ) {
2653             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2654             return a;
2655         }
2656         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2657         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2658         zExp = aExp;
2659         goto roundAndPack;
2660     }
2661     aSig |= LIT64( 0x2000000000000000 );
2662     zSig = ( aSig + bSig )<<1;
2663     --zExp;
2664     if ( (sbits64) zSig < 0 ) {
2665         zSig = aSig + bSig;
2666         ++zExp;
2667     }
2668  roundAndPack:
2669     return roundAndPackFloat64( zSign, zExp, zSig );
2670 
2671 }
2672 
2673 /*----------------------------------------------------------------------------
2674 | Returns the result of subtracting the absolute values of the double-
2675 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2676 | difference is negated before being returned.  `zSign' is ignored if the
2677 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2678 | Standard for Binary Floating-Point Arithmetic.
2679 *----------------------------------------------------------------------------*/
2680 
subFloat64Sigs(float64 a,float64 b,flag zSign)2681 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2682 {
2683     int16 aExp, bExp, zExp;
2684     bits64 aSig, bSig, zSig;
2685     int16 expDiff;
2686 
2687     aSig = extractFloat64Frac( a );
2688     aExp = extractFloat64Exp( a );
2689     bSig = extractFloat64Frac( b );
2690     bExp = extractFloat64Exp( b );
2691     expDiff = aExp - bExp;
2692     aSig <<= 10;
2693     bSig <<= 10;
2694     if ( 0 < expDiff ) goto aExpBigger;
2695     if ( expDiff < 0 ) goto bExpBigger;
2696     if ( aExp == 0x7FF ) {
2697         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2698         float_raise( float_flag_invalid );
2699         return float64_default_nan;
2700     }
2701     if ( aExp == 0 ) {
2702         aExp = 1;
2703         bExp = 1;
2704     }
2705     if ( bSig < aSig ) goto aBigger;
2706     if ( aSig < bSig ) goto bBigger;
2707     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2708  bExpBigger:
2709     if ( bExp == 0x7FF ) {
2710         if ( bSig ) return propagateFloat64NaN( a, b );
2711         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2712     }
2713     if ( aExp == 0 ) {
2714         ++expDiff;
2715     }
2716     else {
2717         aSig |= LIT64( 0x4000000000000000 );
2718     }
2719     shift64RightJamming( aSig, - expDiff, &aSig );
2720     bSig |= LIT64( 0x4000000000000000 );
2721  bBigger:
2722     zSig = bSig - aSig;
2723     zExp = bExp;
2724     zSign ^= 1;
2725     goto normalizeRoundAndPack;
2726  aExpBigger:
2727     if ( aExp == 0x7FF ) {
2728         if ( aSig ) return propagateFloat64NaN( a, b );
2729         return a;
2730     }
2731     if ( bExp == 0 ) {
2732         --expDiff;
2733     }
2734     else {
2735         bSig |= LIT64( 0x4000000000000000 );
2736     }
2737     shift64RightJamming( bSig, expDiff, &bSig );
2738     aSig |= LIT64( 0x4000000000000000 );
2739  aBigger:
2740     zSig = aSig - bSig;
2741     zExp = aExp;
2742  normalizeRoundAndPack:
2743     --zExp;
2744     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2745 
2746 }
2747 
2748 /*----------------------------------------------------------------------------
2749 | Returns the result of adding the double-precision floating-point values `a'
2750 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2751 | Binary Floating-Point Arithmetic.
2752 *----------------------------------------------------------------------------*/
2753 
float64_add(float64 a,float64 b)2754 float64 float64_add( float64 a, float64 b )
2755 {
2756     flag aSign, bSign;
2757 
2758     aSign = extractFloat64Sign( a );
2759     bSign = extractFloat64Sign( b );
2760     if ( aSign == bSign ) {
2761         return addFloat64Sigs( a, b, aSign );
2762     }
2763     else {
2764         return subFloat64Sigs( a, b, aSign );
2765     }
2766 
2767 }
2768 
2769 /*----------------------------------------------------------------------------
2770 | Returns the result of subtracting the double-precision floating-point values
2771 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2772 | for Binary Floating-Point Arithmetic.
2773 *----------------------------------------------------------------------------*/
2774 
float64_sub(float64 a,float64 b)2775 float64 float64_sub( float64 a, float64 b )
2776 {
2777     flag aSign, bSign;
2778 
2779     aSign = extractFloat64Sign( a );
2780     bSign = extractFloat64Sign( b );
2781     if ( aSign == bSign ) {
2782         return subFloat64Sigs( a, b, aSign );
2783     }
2784     else {
2785         return addFloat64Sigs( a, b, aSign );
2786     }
2787 
2788 }
2789 
2790 /*----------------------------------------------------------------------------
2791 | Returns the result of multiplying the double-precision floating-point values
2792 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2793 | for Binary Floating-Point Arithmetic.
2794 *----------------------------------------------------------------------------*/
2795 
float64_mul(float64 a,float64 b)2796 float64 float64_mul( float64 a, float64 b )
2797 {
2798     flag aSign, bSign, zSign;
2799     int16 aExp, bExp, zExp;
2800     bits64 aSig, bSig, zSig0, zSig1;
2801 
2802     aSig = extractFloat64Frac( a );
2803     aExp = extractFloat64Exp( a );
2804     aSign = extractFloat64Sign( a );
2805     bSig = extractFloat64Frac( b );
2806     bExp = extractFloat64Exp( b );
2807     bSign = extractFloat64Sign( b );
2808     zSign = aSign ^ bSign;
2809     if ( aExp == 0x7FF ) {
2810         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2811             return propagateFloat64NaN( a, b );
2812         }
2813         if ( ( bExp | bSig ) == 0 ) {
2814             float_raise( float_flag_invalid );
2815             return float64_default_nan;
2816         }
2817         return packFloat64( zSign, 0x7FF, 0 );
2818     }
2819     if ( bExp == 0x7FF ) {
2820         if ( bSig ) return propagateFloat64NaN( a, b );
2821         if ( ( aExp | aSig ) == 0 ) {
2822             float_raise( float_flag_invalid );
2823             return float64_default_nan;
2824         }
2825         return packFloat64( zSign, 0x7FF, 0 );
2826     }
2827     if ( aExp == 0 ) {
2828         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2829         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2830     }
2831     if ( bExp == 0 ) {
2832         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2833         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2834     }
2835     zExp = aExp + bExp - 0x3FF;
2836     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2837     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2838     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2839     zSig0 |= ( zSig1 != 0 );
2840     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2841         zSig0 <<= 1;
2842         --zExp;
2843     }
2844     return roundAndPackFloat64( zSign, zExp, zSig0 );
2845 
2846 }
2847 
2848 /*----------------------------------------------------------------------------
2849 | Returns the result of dividing the double-precision floating-point value `a'
2850 | by the corresponding value `b'.  The operation is performed according to
2851 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2852 *----------------------------------------------------------------------------*/
2853 
float64_div(float64 a,float64 b)2854 float64 float64_div( float64 a, float64 b )
2855 {
2856     flag aSign, bSign, zSign;
2857     int16 aExp, bExp, zExp;
2858     bits64 aSig, bSig, zSig;
2859     bits64 rem0, rem1;
2860     bits64 term0, term1;
2861 
2862     aSig = extractFloat64Frac( a );
2863     aExp = extractFloat64Exp( a );
2864     aSign = extractFloat64Sign( a );
2865     bSig = extractFloat64Frac( b );
2866     bExp = extractFloat64Exp( b );
2867     bSign = extractFloat64Sign( b );
2868     zSign = aSign ^ bSign;
2869     if ( aExp == 0x7FF ) {
2870         if ( aSig ) return propagateFloat64NaN( a, b );
2871         if ( bExp == 0x7FF ) {
2872             if ( bSig ) return propagateFloat64NaN( a, b );
2873             float_raise( float_flag_invalid );
2874             return float64_default_nan;
2875         }
2876         return packFloat64( zSign, 0x7FF, 0 );
2877     }
2878     if ( bExp == 0x7FF ) {
2879         if ( bSig ) return propagateFloat64NaN( a, b );
2880         return packFloat64( zSign, 0, 0 );
2881     }
2882     if ( bExp == 0 ) {
2883         if ( bSig == 0 ) {
2884             if ( ( aExp | aSig ) == 0 ) {
2885                 float_raise( float_flag_invalid );
2886                 return float64_default_nan;
2887             }
2888             float_raise( float_flag_divbyzero );
2889             return packFloat64( zSign, 0x7FF, 0 );
2890         }
2891         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2892     }
2893     if ( aExp == 0 ) {
2894         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2895         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2896     }
2897     zExp = aExp - bExp + 0x3FD;
2898     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2899     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2900     if ( bSig <= ( aSig + aSig ) ) {
2901         aSig >>= 1;
2902         ++zExp;
2903     }
2904     zSig = estimateDiv128To64( aSig, 0, bSig );
2905     if ( ( zSig & 0x1FF ) <= 2 ) {
2906         mul64To128( bSig, zSig, &term0, &term1 );
2907         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2908         while ( (sbits64) rem0 < 0 ) {
2909             --zSig;
2910             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2911         }
2912         zSig |= ( rem1 != 0 );
2913     }
2914     return roundAndPackFloat64( zSign, zExp, zSig );
2915 
2916 }
2917 
2918 #ifndef SOFTFLOAT_FOR_GCC
2919 /*----------------------------------------------------------------------------
2920 | Returns the remainder of the double-precision floating-point value `a'
2921 | with respect to the corresponding value `b'.  The operation is performed
2922 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2923 *----------------------------------------------------------------------------*/
2924 
float64_rem(float64 a,float64 b)2925 float64 float64_rem( float64 a, float64 b )
2926 {
2927     flag aSign, bSign __unused, zSign;
2928     int16 aExp, bExp, expDiff;
2929     bits64 aSig, bSig;
2930     bits64 q, alternateASig;
2931     sbits64 sigMean;
2932 
2933     aSig = extractFloat64Frac( a );
2934     aExp = extractFloat64Exp( a );
2935     aSign = extractFloat64Sign( a );
2936     bSig = extractFloat64Frac( b );
2937     bExp = extractFloat64Exp( b );
2938     bSign = extractFloat64Sign( b );
2939     if ( aExp == 0x7FF ) {
2940         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2941             return propagateFloat64NaN( a, b );
2942         }
2943         float_raise( float_flag_invalid );
2944         return float64_default_nan;
2945     }
2946     if ( bExp == 0x7FF ) {
2947         if ( bSig ) return propagateFloat64NaN( a, b );
2948         return a;
2949     }
2950     if ( bExp == 0 ) {
2951         if ( bSig == 0 ) {
2952             float_raise( float_flag_invalid );
2953             return float64_default_nan;
2954         }
2955         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2956     }
2957     if ( aExp == 0 ) {
2958         if ( aSig == 0 ) return a;
2959         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2960     }
2961     expDiff = aExp - bExp;
2962     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2963     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2964     if ( expDiff < 0 ) {
2965         if ( expDiff < -1 ) return a;
2966         aSig >>= 1;
2967     }
2968     q = ( bSig <= aSig );
2969     if ( q ) aSig -= bSig;
2970     expDiff -= 64;
2971     while ( 0 < expDiff ) {
2972         q = estimateDiv128To64( aSig, 0, bSig );
2973         q = ( 2 < q ) ? q - 2 : 0;
2974         aSig = - ( ( bSig>>2 ) * q );
2975         expDiff -= 62;
2976     }
2977     expDiff += 64;
2978     if ( 0 < expDiff ) {
2979         q = estimateDiv128To64( aSig, 0, bSig );
2980         q = ( 2 < q ) ? q - 2 : 0;
2981         q >>= 64 - expDiff;
2982         bSig >>= 2;
2983         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2984     }
2985     else {
2986         aSig >>= 2;
2987         bSig >>= 2;
2988     }
2989     do {
2990         alternateASig = aSig;
2991         ++q;
2992         aSig -= bSig;
2993     } while ( 0 <= (sbits64) aSig );
2994     sigMean = aSig + alternateASig;
2995     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2996         aSig = alternateASig;
2997     }
2998     zSign = ( (sbits64) aSig < 0 );
2999     if ( zSign ) aSig = - aSig;
3000     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3001 
3002 }
3003 
3004 /*----------------------------------------------------------------------------
3005 | Returns the square root of the double-precision floating-point value `a'.
3006 | The operation is performed according to the IEC/IEEE Standard for Binary
3007 | Floating-Point Arithmetic.
3008 *----------------------------------------------------------------------------*/
3009 
float64_sqrt(float64 a)3010 float64 float64_sqrt( float64 a )
3011 {
3012     flag aSign;
3013     int16 aExp, zExp;
3014     bits64 aSig, zSig, doubleZSig;
3015     bits64 rem0, rem1, term0, term1;
3016 
3017     aSig = extractFloat64Frac( a );
3018     aExp = extractFloat64Exp( a );
3019     aSign = extractFloat64Sign( a );
3020     if ( aExp == 0x7FF ) {
3021         if ( aSig ) return propagateFloat64NaN( a, a );
3022         if ( ! aSign ) return a;
3023         float_raise( float_flag_invalid );
3024         return float64_default_nan;
3025     }
3026     if ( aSign ) {
3027         if ( ( aExp | aSig ) == 0 ) return a;
3028         float_raise( float_flag_invalid );
3029         return float64_default_nan;
3030     }
3031     if ( aExp == 0 ) {
3032         if ( aSig == 0 ) return 0;
3033         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3034     }
3035     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3036     aSig |= LIT64( 0x0010000000000000 );
3037     zSig = estimateSqrt32( aExp, aSig>>21 );
3038     aSig <<= 9 - ( aExp & 1 );
3039     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3040     if ( ( zSig & 0x1FF ) <= 5 ) {
3041         doubleZSig = zSig<<1;
3042         mul64To128( zSig, zSig, &term0, &term1 );
3043         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3044         while ( (sbits64) rem0 < 0 ) {
3045             --zSig;
3046             doubleZSig -= 2;
3047             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3048         }
3049         zSig |= ( ( rem0 | rem1 ) != 0 );
3050     }
3051     return roundAndPackFloat64( 0, zExp, zSig );
3052 
3053 }
3054 #endif
3055 
3056 /*----------------------------------------------------------------------------
3057 | Returns 1 if the double-precision floating-point value `a' is equal to the
3058 | corresponding value `b', and 0 otherwise.  The comparison is performed
3059 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3060 *----------------------------------------------------------------------------*/
3061 
float64_eq(float64 a,float64 b)3062 flag float64_eq( float64 a, float64 b )
3063 {
3064 
3065     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3066          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3067        ) {
3068         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3069             float_raise( float_flag_invalid );
3070         }
3071         return 0;
3072     }
3073     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3074 
3075 }
3076 
3077 /*----------------------------------------------------------------------------
3078 | Returns 1 if the double-precision floating-point value `a' is less than or
3079 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3080 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3081 | Arithmetic.
3082 *----------------------------------------------------------------------------*/
3083 
float64_le(float64 a,float64 b)3084 flag float64_le( float64 a, float64 b )
3085 {
3086     flag aSign, bSign;
3087 
3088     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3089          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3090        ) {
3091         float_raise( float_flag_invalid );
3092         return 0;
3093     }
3094     aSign = extractFloat64Sign( a );
3095     bSign = extractFloat64Sign( b );
3096     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3097     return ( a == b ) || ( aSign ^ ( a < b ) );
3098 
3099 }
3100 
3101 /*----------------------------------------------------------------------------
3102 | Returns 1 if the double-precision floating-point value `a' is less than
3103 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3104 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3105 *----------------------------------------------------------------------------*/
3106 
float64_lt(float64 a,float64 b)3107 flag float64_lt( float64 a, float64 b )
3108 {
3109     flag aSign, bSign;
3110 
3111     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3112          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3113        ) {
3114         float_raise( float_flag_invalid );
3115         return 0;
3116     }
3117     aSign = extractFloat64Sign( a );
3118     bSign = extractFloat64Sign( b );
3119     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3120     return ( a != b ) && ( aSign ^ ( a < b ) );
3121 
3122 }
3123 
3124 #ifndef SOFTFLOAT_FOR_GCC
3125 /*----------------------------------------------------------------------------
3126 | Returns 1 if the double-precision floating-point value `a' is equal to the
3127 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3128 | if either operand is a NaN.  Otherwise, the comparison is performed
3129 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3130 *----------------------------------------------------------------------------*/
3131 
float64_eq_signaling(float64 a,float64 b)3132 flag float64_eq_signaling( float64 a, float64 b )
3133 {
3134 
3135     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3136          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3137        ) {
3138         float_raise( float_flag_invalid );
3139         return 0;
3140     }
3141     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3142 
3143 }
3144 
3145 /*----------------------------------------------------------------------------
3146 | Returns 1 if the double-precision floating-point value `a' is less than or
3147 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3148 | cause an exception.  Otherwise, the comparison is performed according to the
3149 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3150 *----------------------------------------------------------------------------*/
3151 
float64_le_quiet(float64 a,float64 b)3152 flag float64_le_quiet( float64 a, float64 b )
3153 {
3154     flag aSign, bSign;
3155 
3156     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3157          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3158        ) {
3159         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3160             float_raise( float_flag_invalid );
3161         }
3162         return 0;
3163     }
3164     aSign = extractFloat64Sign( a );
3165     bSign = extractFloat64Sign( b );
3166     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3167     return ( a == b ) || ( aSign ^ ( a < b ) );
3168 
3169 }
3170 
3171 /*----------------------------------------------------------------------------
3172 | Returns 1 if the double-precision floating-point value `a' is less than
3173 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3174 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3175 | Standard for Binary Floating-Point Arithmetic.
3176 *----------------------------------------------------------------------------*/
3177 
float64_lt_quiet(float64 a,float64 b)3178 flag float64_lt_quiet( float64 a, float64 b )
3179 {
3180     flag aSign, bSign;
3181 
3182     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3183          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3184        ) {
3185         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3186             float_raise( float_flag_invalid );
3187         }
3188         return 0;
3189     }
3190     aSign = extractFloat64Sign( a );
3191     bSign = extractFloat64Sign( b );
3192     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3193     return ( a != b ) && ( aSign ^ ( a < b ) );
3194 
3195 }
3196 #endif
3197 
3198 #ifdef FLOATX80
3199 
3200 /*----------------------------------------------------------------------------
3201 | Returns the result of converting the extended double-precision floating-
3202 | point value `a' to the 32-bit two's complement integer format.  The
3203 | conversion is performed according to the IEC/IEEE Standard for Binary
3204 | Floating-Point Arithmetic---which means in particular that the conversion
3205 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3206 | largest positive integer is returned.  Otherwise, if the conversion
3207 | overflows, the largest integer with the same sign as `a' is returned.
3208 *----------------------------------------------------------------------------*/
3209 
floatx80_to_int32(floatx80 a)3210 int32 floatx80_to_int32( floatx80 a )
3211 {
3212     flag aSign;
3213     int32 aExp, shiftCount;
3214     bits64 aSig;
3215 
3216     aSig = extractFloatx80Frac( a );
3217     aExp = extractFloatx80Exp( a );
3218     aSign = extractFloatx80Sign( a );
3219     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3220     shiftCount = 0x4037 - aExp;
3221     if ( shiftCount <= 0 ) shiftCount = 1;
3222     shift64RightJamming( aSig, shiftCount, &aSig );
3223     return roundAndPackInt32( aSign, aSig );
3224 
3225 }
3226 
3227 /*----------------------------------------------------------------------------
3228 | Returns the result of converting the extended double-precision floating-
3229 | point value `a' to the 32-bit two's complement integer format.  The
3230 | conversion is performed according to the IEC/IEEE Standard for Binary
3231 | Floating-Point Arithmetic, except that the conversion is always rounded
3232 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3233 | Otherwise, if the conversion overflows, the largest integer with the same
3234 | sign as `a' is returned.
3235 *----------------------------------------------------------------------------*/
3236 
floatx80_to_int32_round_to_zero(floatx80 a)3237 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3238 {
3239     flag aSign;
3240     int32 aExp, shiftCount;
3241     bits64 aSig, savedASig;
3242     int32 z;
3243 
3244     aSig = extractFloatx80Frac( a );
3245     aExp = extractFloatx80Exp( a );
3246     aSign = extractFloatx80Sign( a );
3247     if ( 0x401E < aExp ) {
3248         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3249         goto invalid;
3250     }
3251     else if ( aExp < 0x3FFF ) {
3252         if ( aExp || aSig ) float_set_inexact();
3253         return 0;
3254     }
3255     shiftCount = 0x403E - aExp;
3256     savedASig = aSig;
3257     aSig >>= shiftCount;
3258     z = aSig;
3259     if ( aSign ) z = - z;
3260     if ( ( z < 0 ) ^ aSign ) {
3261  invalid:
3262         float_raise( float_flag_invalid );
3263         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3264     }
3265     if ( ( aSig<<shiftCount ) != savedASig ) {
3266         float_set_inexact();
3267     }
3268     return z;
3269 
3270 }
3271 
3272 /*----------------------------------------------------------------------------
3273 | Returns the result of converting the extended double-precision floating-
3274 | point value `a' to the 64-bit two's complement integer format.  The
3275 | conversion is performed according to the IEC/IEEE Standard for Binary
3276 | Floating-Point Arithmetic---which means in particular that the conversion
3277 | is rounded according to the current rounding mode.  If `a' is a NaN,
3278 | the largest positive integer is returned.  Otherwise, if the conversion
3279 | overflows, the largest integer with the same sign as `a' is returned.
3280 *----------------------------------------------------------------------------*/
3281 
floatx80_to_int64(floatx80 a)3282 int64 floatx80_to_int64( floatx80 a )
3283 {
3284     flag aSign;
3285     int32 aExp, shiftCount;
3286     bits64 aSig, aSigExtra;
3287 
3288     aSig = extractFloatx80Frac( a );
3289     aExp = extractFloatx80Exp( a );
3290     aSign = extractFloatx80Sign( a );
3291     shiftCount = 0x403E - aExp;
3292     if ( shiftCount <= 0 ) {
3293         if ( shiftCount ) {
3294             float_raise( float_flag_invalid );
3295             if (    ! aSign
3296                  || (    ( aExp == 0x7FFF )
3297                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3298                ) {
3299                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3300             }
3301             return (sbits64) LIT64( 0x8000000000000000 );
3302         }
3303         aSigExtra = 0;
3304     }
3305     else {
3306         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3307     }
3308     return roundAndPackInt64( aSign, aSig, aSigExtra );
3309 
3310 }
3311 
3312 /*----------------------------------------------------------------------------
3313 | Returns the result of converting the extended double-precision floating-
3314 | point value `a' to the 64-bit two's complement integer format.  The
3315 | conversion is performed according to the IEC/IEEE Standard for Binary
3316 | Floating-Point Arithmetic, except that the conversion is always rounded
3317 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3318 | Otherwise, if the conversion overflows, the largest integer with the same
3319 | sign as `a' is returned.
3320 *----------------------------------------------------------------------------*/
3321 
floatx80_to_int64_round_to_zero(floatx80 a)3322 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3323 {
3324     flag aSign;
3325     int32 aExp, shiftCount;
3326     bits64 aSig;
3327     int64 z;
3328 
3329     aSig = extractFloatx80Frac( a );
3330     aExp = extractFloatx80Exp( a );
3331     aSign = extractFloatx80Sign( a );
3332     shiftCount = aExp - 0x403E;
3333     if ( 0 <= shiftCount ) {
3334         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3335         if ( ( a.high != 0xC03E ) || aSig ) {
3336             float_raise( float_flag_invalid );
3337             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3338                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3339             }
3340         }
3341         return (sbits64) LIT64( 0x8000000000000000 );
3342     }
3343     else if ( aExp < 0x3FFF ) {
3344         if ( aExp | aSig ) float_set_inexact();
3345         return 0;
3346     }
3347     z = aSig>>( - shiftCount );
3348     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3349         float_set_inexact();
3350     }
3351     if ( aSign ) z = - z;
3352     return z;
3353 
3354 }
3355 
3356 /*----------------------------------------------------------------------------
3357 | Returns the result of converting the extended double-precision floating-
3358 | point value `a' to the single-precision floating-point format.  The
3359 | conversion is performed according to the IEC/IEEE Standard for Binary
3360 | Floating-Point Arithmetic.
3361 *----------------------------------------------------------------------------*/
3362 
floatx80_to_float32(floatx80 a)3363 float32 floatx80_to_float32( floatx80 a )
3364 {
3365     flag aSign;
3366     int32 aExp;
3367     bits64 aSig;
3368 
3369     aSig = extractFloatx80Frac( a );
3370     aExp = extractFloatx80Exp( a );
3371     aSign = extractFloatx80Sign( a );
3372     if ( aExp == 0x7FFF ) {
3373         if ( (bits64) ( aSig<<1 ) ) {
3374             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3375         }
3376         return packFloat32( aSign, 0xFF, 0 );
3377     }
3378     shift64RightJamming( aSig, 33, &aSig );
3379     if ( aExp || aSig ) aExp -= 0x3F81;
3380     return roundAndPackFloat32( aSign, aExp, aSig );
3381 
3382 }
3383 
3384 /*----------------------------------------------------------------------------
3385 | Returns the result of converting the extended double-precision floating-
3386 | point value `a' to the double-precision floating-point format.  The
3387 | conversion is performed according to the IEC/IEEE Standard for Binary
3388 | Floating-Point Arithmetic.
3389 *----------------------------------------------------------------------------*/
3390 
floatx80_to_float64(floatx80 a)3391 float64 floatx80_to_float64( floatx80 a )
3392 {
3393     flag aSign;
3394     int32 aExp;
3395     bits64 aSig, zSig;
3396 
3397     aSig = extractFloatx80Frac( a );
3398     aExp = extractFloatx80Exp( a );
3399     aSign = extractFloatx80Sign( a );
3400     if ( aExp == 0x7FFF ) {
3401         if ( (bits64) ( aSig<<1 ) ) {
3402             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3403         }
3404         return packFloat64( aSign, 0x7FF, 0 );
3405     }
3406     shift64RightJamming( aSig, 1, &zSig );
3407     if ( aExp || aSig ) aExp -= 0x3C01;
3408     return roundAndPackFloat64( aSign, aExp, zSig );
3409 
3410 }
3411 
3412 #ifdef FLOAT128
3413 
3414 /*----------------------------------------------------------------------------
3415 | Returns the result of converting the extended double-precision floating-
3416 | point value `a' to the quadruple-precision floating-point format.  The
3417 | conversion is performed according to the IEC/IEEE Standard for Binary
3418 | Floating-Point Arithmetic.
3419 *----------------------------------------------------------------------------*/
3420 
floatx80_to_float128(floatx80 a)3421 float128 floatx80_to_float128( floatx80 a )
3422 {
3423     flag aSign;
3424     int16 aExp;
3425     bits64 aSig, zSig0, zSig1;
3426 
3427     aSig = extractFloatx80Frac( a );
3428     aExp = extractFloatx80Exp( a );
3429     aSign = extractFloatx80Sign( a );
3430     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3431         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3432     }
3433     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3434     return packFloat128( aSign, aExp, zSig0, zSig1 );
3435 
3436 }
3437 
3438 #endif
3439 
3440 /*----------------------------------------------------------------------------
3441 | Rounds the extended double-precision floating-point value `a' to an integer,
3442 | and returns the result as an extended quadruple-precision floating-point
3443 | value.  The operation is performed according to the IEC/IEEE Standard for
3444 | Binary Floating-Point Arithmetic.
3445 *----------------------------------------------------------------------------*/
3446 
floatx80_round_to_int(floatx80 a)3447 floatx80 floatx80_round_to_int( floatx80 a )
3448 {
3449     flag aSign;
3450     int32 aExp;
3451     bits64 lastBitMask, roundBitsMask;
3452     int8 roundingMode;
3453     floatx80 z;
3454 
3455     aExp = extractFloatx80Exp( a );
3456     if ( 0x403E <= aExp ) {
3457         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3458             return propagateFloatx80NaN( a, a );
3459         }
3460         return a;
3461     }
3462     if ( aExp < 0x3FFF ) {
3463         if (    ( aExp == 0 )
3464              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3465             return a;
3466         }
3467         float_set_inexact();
3468         aSign = extractFloatx80Sign( a );
3469         switch ( float_rounding_mode() ) {
3470          case float_round_nearest_even:
3471             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3472                ) {
3473                 return
3474                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3475             }
3476             break;
3477          case float_round_down:
3478             return
3479                   aSign ?
3480                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3481                 : packFloatx80( 0, 0, 0 );
3482          case float_round_up:
3483             return
3484                   aSign ? packFloatx80( 1, 0, 0 )
3485                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3486         }
3487         return packFloatx80( aSign, 0, 0 );
3488     }
3489     lastBitMask = 1;
3490     lastBitMask <<= 0x403E - aExp;
3491     roundBitsMask = lastBitMask - 1;
3492     z = a;
3493     roundingMode = float_rounding_mode();
3494     if ( roundingMode == float_round_nearest_even ) {
3495         z.low += lastBitMask>>1;
3496         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3497     }
3498     else if ( roundingMode != float_round_to_zero ) {
3499         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3500             z.low += roundBitsMask;
3501         }
3502     }
3503     z.low &= ~ roundBitsMask;
3504     if ( z.low == 0 ) {
3505         ++z.high;
3506         z.low = LIT64( 0x8000000000000000 );
3507     }
3508     if ( z.low != a.low ) float_set_inexact();
3509     return z;
3510 
3511 }
3512 
3513 /*----------------------------------------------------------------------------
3514 | Returns the result of adding the absolute values of the extended double-
3515 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3516 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3517 | The addition is performed according to the IEC/IEEE Standard for Binary
3518 | Floating-Point Arithmetic.
3519 *----------------------------------------------------------------------------*/
3520 
addFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3521 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3522 {
3523     int32 aExp, bExp, zExp;
3524     bits64 aSig, bSig, zSig0, zSig1;
3525     int32 expDiff;
3526 
3527     aSig = extractFloatx80Frac( a );
3528     aExp = extractFloatx80Exp( a );
3529     bSig = extractFloatx80Frac( b );
3530     bExp = extractFloatx80Exp( b );
3531     expDiff = aExp - bExp;
3532     if ( 0 < expDiff ) {
3533         if ( aExp == 0x7FFF ) {
3534             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3535             return a;
3536         }
3537         if ( bExp == 0 ) --expDiff;
3538         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3539         zExp = aExp;
3540     }
3541     else if ( expDiff < 0 ) {
3542         if ( bExp == 0x7FFF ) {
3543             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3544             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3545         }
3546         if ( aExp == 0 ) ++expDiff;
3547         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3548         zExp = bExp;
3549     }
3550     else {
3551         if ( aExp == 0x7FFF ) {
3552             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3553                 return propagateFloatx80NaN( a, b );
3554             }
3555             return a;
3556         }
3557         zSig1 = 0;
3558         zSig0 = aSig + bSig;
3559         if ( aExp == 0 ) {
3560             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3561             goto roundAndPack;
3562         }
3563         zExp = aExp;
3564         goto shiftRight1;
3565     }
3566     zSig0 = aSig + bSig;
3567     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3568  shiftRight1:
3569     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3570     zSig0 |= LIT64( 0x8000000000000000 );
3571     ++zExp;
3572  roundAndPack:
3573     return
3574         roundAndPackFloatx80(
3575             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3576 
3577 }
3578 
3579 /*----------------------------------------------------------------------------
3580 | Returns the result of subtracting the absolute values of the extended
3581 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3582 | difference is negated before being returned.  `zSign' is ignored if the
3583 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3584 | Standard for Binary Floating-Point Arithmetic.
3585 *----------------------------------------------------------------------------*/
3586 
subFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3587 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3588 {
3589     int32 aExp, bExp, zExp;
3590     bits64 aSig, bSig, zSig0, zSig1;
3591     int32 expDiff;
3592     floatx80 z;
3593 
3594     aSig = extractFloatx80Frac( a );
3595     aExp = extractFloatx80Exp( a );
3596     bSig = extractFloatx80Frac( b );
3597     bExp = extractFloatx80Exp( b );
3598     expDiff = aExp - bExp;
3599     if ( 0 < expDiff ) goto aExpBigger;
3600     if ( expDiff < 0 ) goto bExpBigger;
3601     if ( aExp == 0x7FFF ) {
3602         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3603             return propagateFloatx80NaN( a, b );
3604         }
3605         float_raise( float_flag_invalid );
3606         z.low = floatx80_default_nan_low;
3607         z.high = floatx80_default_nan_high;
3608         return z;
3609     }
3610     if ( aExp == 0 ) {
3611         aExp = 1;
3612         bExp = 1;
3613     }
3614     zSig1 = 0;
3615     if ( bSig < aSig ) goto aBigger;
3616     if ( aSig < bSig ) goto bBigger;
3617     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3618  bExpBigger:
3619     if ( bExp == 0x7FFF ) {
3620         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3621         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3622     }
3623     if ( aExp == 0 ) ++expDiff;
3624     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3625  bBigger:
3626     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3627     zExp = bExp;
3628     zSign ^= 1;
3629     goto normalizeRoundAndPack;
3630  aExpBigger:
3631     if ( aExp == 0x7FFF ) {
3632         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3633         return a;
3634     }
3635     if ( bExp == 0 ) --expDiff;
3636     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3637  aBigger:
3638     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3639     zExp = aExp;
3640  normalizeRoundAndPack:
3641     return
3642         normalizeRoundAndPackFloatx80(
3643             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3644 
3645 }
3646 
3647 /*----------------------------------------------------------------------------
3648 | Returns the result of adding the extended double-precision floating-point
3649 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3650 | Standard for Binary Floating-Point Arithmetic.
3651 *----------------------------------------------------------------------------*/
3652 
floatx80_add(floatx80 a,floatx80 b)3653 floatx80 floatx80_add( floatx80 a, floatx80 b )
3654 {
3655     flag aSign, bSign;
3656 
3657     aSign = extractFloatx80Sign( a );
3658     bSign = extractFloatx80Sign( b );
3659     if ( aSign == bSign ) {
3660         return addFloatx80Sigs( a, b, aSign );
3661     }
3662     else {
3663         return subFloatx80Sigs( a, b, aSign );
3664     }
3665 
3666 }
3667 
3668 /*----------------------------------------------------------------------------
3669 | Returns the result of subtracting the extended double-precision floating-
3670 | point values `a' and `b'.  The operation is performed according to the
3671 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3672 *----------------------------------------------------------------------------*/
3673 
floatx80_sub(floatx80 a,floatx80 b)3674 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3675 {
3676     flag aSign, bSign;
3677 
3678     aSign = extractFloatx80Sign( a );
3679     bSign = extractFloatx80Sign( b );
3680     if ( aSign == bSign ) {
3681         return subFloatx80Sigs( a, b, aSign );
3682     }
3683     else {
3684         return addFloatx80Sigs( a, b, aSign );
3685     }
3686 
3687 }
3688 
3689 /*----------------------------------------------------------------------------
3690 | Returns the result of multiplying the extended double-precision floating-
3691 | point values `a' and `b'.  The operation is performed according to the
3692 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3693 *----------------------------------------------------------------------------*/
3694 
floatx80_mul(floatx80 a,floatx80 b)3695 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3696 {
3697     flag aSign, bSign, zSign;
3698     int32 aExp, bExp, zExp;
3699     bits64 aSig, bSig, zSig0, zSig1;
3700     floatx80 z;
3701 
3702     aSig = extractFloatx80Frac( a );
3703     aExp = extractFloatx80Exp( a );
3704     aSign = extractFloatx80Sign( a );
3705     bSig = extractFloatx80Frac( b );
3706     bExp = extractFloatx80Exp( b );
3707     bSign = extractFloatx80Sign( b );
3708     zSign = aSign ^ bSign;
3709     if ( aExp == 0x7FFF ) {
3710         if (    (bits64) ( aSig<<1 )
3711              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3712             return propagateFloatx80NaN( a, b );
3713         }
3714         if ( ( bExp | bSig ) == 0 ) goto invalid;
3715         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3716     }
3717     if ( bExp == 0x7FFF ) {
3718         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3719         if ( ( aExp | aSig ) == 0 ) {
3720  invalid:
3721             float_raise( float_flag_invalid );
3722             z.low = floatx80_default_nan_low;
3723             z.high = floatx80_default_nan_high;
3724             return z;
3725         }
3726         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3727     }
3728     if ( aExp == 0 ) {
3729         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3730         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3731     }
3732     if ( bExp == 0 ) {
3733         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3734         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3735     }
3736     zExp = aExp + bExp - 0x3FFE;
3737     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3738     if ( 0 < (sbits64) zSig0 ) {
3739         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3740         --zExp;
3741     }
3742     return
3743         roundAndPackFloatx80(
3744             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3745 
3746 }
3747 
3748 /*----------------------------------------------------------------------------
3749 | Returns the result of dividing the extended double-precision floating-point
3750 | value `a' by the corresponding value `b'.  The operation is performed
3751 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3752 *----------------------------------------------------------------------------*/
3753 
floatx80_div(floatx80 a,floatx80 b)3754 floatx80 floatx80_div( floatx80 a, floatx80 b )
3755 {
3756     flag aSign, bSign, zSign;
3757     int32 aExp, bExp, zExp;
3758     bits64 aSig, bSig, zSig0, zSig1;
3759     bits64 rem0, rem1, rem2, term0, term1, term2;
3760     floatx80 z;
3761 
3762     aSig = extractFloatx80Frac( a );
3763     aExp = extractFloatx80Exp( a );
3764     aSign = extractFloatx80Sign( a );
3765     bSig = extractFloatx80Frac( b );
3766     bExp = extractFloatx80Exp( b );
3767     bSign = extractFloatx80Sign( b );
3768     zSign = aSign ^ bSign;
3769     if ( aExp == 0x7FFF ) {
3770         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3771         if ( bExp == 0x7FFF ) {
3772             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3773             goto invalid;
3774         }
3775         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3776     }
3777     if ( bExp == 0x7FFF ) {
3778         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3779         return packFloatx80( zSign, 0, 0 );
3780     }
3781     if ( bExp == 0 ) {
3782         if ( bSig == 0 ) {
3783             if ( ( aExp | aSig ) == 0 ) {
3784  invalid:
3785                 float_raise( float_flag_invalid );
3786                 z.low = floatx80_default_nan_low;
3787                 z.high = floatx80_default_nan_high;
3788                 return z;
3789             }
3790             float_raise( float_flag_divbyzero );
3791             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3792         }
3793         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3794     }
3795     if ( aExp == 0 ) {
3796         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3797         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3798     }
3799     zExp = aExp - bExp + 0x3FFE;
3800     rem1 = 0;
3801     if ( bSig <= aSig ) {
3802         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3803         ++zExp;
3804     }
3805     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3806     mul64To128( bSig, zSig0, &term0, &term1 );
3807     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3808     while ( (sbits64) rem0 < 0 ) {
3809         --zSig0;
3810         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3811     }
3812     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3813     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3814         mul64To128( bSig, zSig1, &term1, &term2 );
3815         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3816         while ( (sbits64) rem1 < 0 ) {
3817             --zSig1;
3818             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3819         }
3820         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3821     }
3822     return
3823         roundAndPackFloatx80(
3824             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3825 
3826 }
3827 
3828 /*----------------------------------------------------------------------------
3829 | Returns the remainder of the extended double-precision floating-point value
3830 | `a' with respect to the corresponding value `b'.  The operation is performed
3831 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3832 *----------------------------------------------------------------------------*/
3833 
floatx80_rem(floatx80 a,floatx80 b)3834 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3835 {
3836     flag aSign, bSign, zSign;
3837     int32 aExp, bExp, expDiff;
3838     bits64 aSig0, aSig1, bSig;
3839     bits64 q, term0, term1, alternateASig0, alternateASig1;
3840     floatx80 z;
3841 
3842     aSig0 = extractFloatx80Frac( a );
3843     aExp = extractFloatx80Exp( a );
3844     aSign = extractFloatx80Sign( a );
3845     bSig = extractFloatx80Frac( b );
3846     bExp = extractFloatx80Exp( b );
3847     bSign = extractFloatx80Sign( b );
3848     if ( aExp == 0x7FFF ) {
3849         if (    (bits64) ( aSig0<<1 )
3850              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3851             return propagateFloatx80NaN( a, b );
3852         }
3853         goto invalid;
3854     }
3855     if ( bExp == 0x7FFF ) {
3856         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3857         return a;
3858     }
3859     if ( bExp == 0 ) {
3860         if ( bSig == 0 ) {
3861  invalid:
3862             float_raise( float_flag_invalid );
3863             z.low = floatx80_default_nan_low;
3864             z.high = floatx80_default_nan_high;
3865             return z;
3866         }
3867         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3868     }
3869     if ( aExp == 0 ) {
3870         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3871         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3872     }
3873     bSig |= LIT64( 0x8000000000000000 );
3874     zSign = aSign;
3875     expDiff = aExp - bExp;
3876     aSig1 = 0;
3877     if ( expDiff < 0 ) {
3878         if ( expDiff < -1 ) return a;
3879         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3880         expDiff = 0;
3881     }
3882     q = ( bSig <= aSig0 );
3883     if ( q ) aSig0 -= bSig;
3884     expDiff -= 64;
3885     while ( 0 < expDiff ) {
3886         q = estimateDiv128To64( aSig0, aSig1, bSig );
3887         q = ( 2 < q ) ? q - 2 : 0;
3888         mul64To128( bSig, q, &term0, &term1 );
3889         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3890         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3891         expDiff -= 62;
3892     }
3893     expDiff += 64;
3894     if ( 0 < expDiff ) {
3895         q = estimateDiv128To64( aSig0, aSig1, bSig );
3896         q = ( 2 < q ) ? q - 2 : 0;
3897         q >>= 64 - expDiff;
3898         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3899         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3900         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3901         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3902             ++q;
3903             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3904         }
3905     }
3906     else {
3907         term1 = 0;
3908         term0 = bSig;
3909     }
3910     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3911     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3912          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3913               && ( q & 1 ) )
3914        ) {
3915         aSig0 = alternateASig0;
3916         aSig1 = alternateASig1;
3917         zSign = ! zSign;
3918     }
3919     return
3920         normalizeRoundAndPackFloatx80(
3921             80, zSign, bExp + expDiff, aSig0, aSig1 );
3922 
3923 }
3924 
3925 /*----------------------------------------------------------------------------
3926 | Returns the square root of the extended double-precision floating-point
3927 | value `a'.  The operation is performed according to the IEC/IEEE Standard
3928 | for Binary Floating-Point Arithmetic.
3929 *----------------------------------------------------------------------------*/
3930 
floatx80_sqrt(floatx80 a)3931 floatx80 floatx80_sqrt( floatx80 a )
3932 {
3933     flag aSign;
3934     int32 aExp, zExp;
3935     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3936     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3937     floatx80 z;
3938 
3939     aSig0 = extractFloatx80Frac( a );
3940     aExp = extractFloatx80Exp( a );
3941     aSign = extractFloatx80Sign( a );
3942     if ( aExp == 0x7FFF ) {
3943         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3944         if ( ! aSign ) return a;
3945         goto invalid;
3946     }
3947     if ( aSign ) {
3948         if ( ( aExp | aSig0 ) == 0 ) return a;
3949  invalid:
3950         float_raise( float_flag_invalid );
3951         z.low = floatx80_default_nan_low;
3952         z.high = floatx80_default_nan_high;
3953         return z;
3954     }
3955     if ( aExp == 0 ) {
3956         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3957         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3958     }
3959     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3960     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3961     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3962     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3963     doubleZSig0 = zSig0<<1;
3964     mul64To128( zSig0, zSig0, &term0, &term1 );
3965     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3966     while ( (sbits64) rem0 < 0 ) {
3967         --zSig0;
3968         doubleZSig0 -= 2;
3969         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3970     }
3971     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3972     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3973         if ( zSig1 == 0 ) zSig1 = 1;
3974         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3975         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3976         mul64To128( zSig1, zSig1, &term2, &term3 );
3977         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3978         while ( (sbits64) rem1 < 0 ) {
3979             --zSig1;
3980             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3981             term3 |= 1;
3982             term2 |= doubleZSig0;
3983             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3984         }
3985         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3986     }
3987     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3988     zSig0 |= doubleZSig0;
3989     return
3990         roundAndPackFloatx80(
3991             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3992 
3993 }
3994 
3995 /*----------------------------------------------------------------------------
3996 | Returns 1 if the extended double-precision floating-point value `a' is
3997 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3998 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3999 | Arithmetic.
4000 *----------------------------------------------------------------------------*/
4001 
floatx80_eq(floatx80 a,floatx80 b)4002 flag floatx80_eq( floatx80 a, floatx80 b )
4003 {
4004 
4005     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4006               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4007          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4008               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4009        ) {
4010         if (    floatx80_is_signaling_nan( a )
4011              || floatx80_is_signaling_nan( b ) ) {
4012             float_raise( float_flag_invalid );
4013         }
4014         return 0;
4015     }
4016     return
4017            ( a.low == b.low )
4018         && (    ( a.high == b.high )
4019              || (    ( a.low == 0 )
4020                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4021            );
4022 
4023 }
4024 
4025 /*----------------------------------------------------------------------------
4026 | Returns 1 if the extended double-precision floating-point value `a' is
4027 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4028 | comparison is performed according to the IEC/IEEE Standard for Binary
4029 | Floating-Point Arithmetic.
4030 *----------------------------------------------------------------------------*/
4031 
floatx80_le(floatx80 a,floatx80 b)4032 flag floatx80_le( floatx80 a, floatx80 b )
4033 {
4034     flag aSign, bSign;
4035 
4036     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4037               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4038          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4039               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4040        ) {
4041         float_raise( float_flag_invalid );
4042         return 0;
4043     }
4044     aSign = extractFloatx80Sign( a );
4045     bSign = extractFloatx80Sign( b );
4046     if ( aSign != bSign ) {
4047         return
4048                aSign
4049             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4050                  == 0 );
4051     }
4052     return
4053           aSign ? le128( b.high, b.low, a.high, a.low )
4054         : le128( a.high, a.low, b.high, b.low );
4055 
4056 }
4057 
4058 /*----------------------------------------------------------------------------
4059 | Returns 1 if the extended double-precision floating-point value `a' is
4060 | less than the corresponding value `b', and 0 otherwise.  The comparison
4061 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4062 | Arithmetic.
4063 *----------------------------------------------------------------------------*/
4064 
floatx80_lt(floatx80 a,floatx80 b)4065 flag floatx80_lt( floatx80 a, floatx80 b )
4066 {
4067     flag aSign, bSign;
4068 
4069     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4070               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4071          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4072               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4073        ) {
4074         float_raise( float_flag_invalid );
4075         return 0;
4076     }
4077     aSign = extractFloatx80Sign( a );
4078     bSign = extractFloatx80Sign( b );
4079     if ( aSign != bSign ) {
4080         return
4081                aSign
4082             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4083                  != 0 );
4084     }
4085     return
4086           aSign ? lt128( b.high, b.low, a.high, a.low )
4087         : lt128( a.high, a.low, b.high, b.low );
4088 
4089 }
4090 
4091 /*----------------------------------------------------------------------------
4092 | Returns 1 if the extended double-precision floating-point value `a' is equal
4093 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4094 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4095 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4096 *----------------------------------------------------------------------------*/
4097 
floatx80_eq_signaling(floatx80 a,floatx80 b)4098 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4099 {
4100 
4101     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4102               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4103          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4104               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4105        ) {
4106         float_raise( float_flag_invalid );
4107         return 0;
4108     }
4109     return
4110            ( a.low == b.low )
4111         && (    ( a.high == b.high )
4112              || (    ( a.low == 0 )
4113                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4114            );
4115 
4116 }
4117 
4118 /*----------------------------------------------------------------------------
4119 | Returns 1 if the extended double-precision floating-point value `a' is less
4120 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4121 | do not cause an exception.  Otherwise, the comparison is performed according
4122 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4123 *----------------------------------------------------------------------------*/
4124 
floatx80_le_quiet(floatx80 a,floatx80 b)4125 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4126 {
4127     flag aSign, bSign;
4128 
4129     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4130               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4131          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4132               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4133        ) {
4134         if (    floatx80_is_signaling_nan( a )
4135              || floatx80_is_signaling_nan( b ) ) {
4136             float_raise( float_flag_invalid );
4137         }
4138         return 0;
4139     }
4140     aSign = extractFloatx80Sign( a );
4141     bSign = extractFloatx80Sign( b );
4142     if ( aSign != bSign ) {
4143         return
4144                aSign
4145             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4146                  == 0 );
4147     }
4148     return
4149           aSign ? le128( b.high, b.low, a.high, a.low )
4150         : le128( a.high, a.low, b.high, b.low );
4151 
4152 }
4153 
4154 /*----------------------------------------------------------------------------
4155 | Returns 1 if the extended double-precision floating-point value `a' is less
4156 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4157 | an exception.  Otherwise, the comparison is performed according to the
4158 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4159 *----------------------------------------------------------------------------*/
4160 
floatx80_lt_quiet(floatx80 a,floatx80 b)4161 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4162 {
4163     flag aSign, bSign;
4164 
4165     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4166               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4167          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4168               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4169        ) {
4170         if (    floatx80_is_signaling_nan( a )
4171              || floatx80_is_signaling_nan( b ) ) {
4172             float_raise( float_flag_invalid );
4173         }
4174         return 0;
4175     }
4176     aSign = extractFloatx80Sign( a );
4177     bSign = extractFloatx80Sign( b );
4178     if ( aSign != bSign ) {
4179         return
4180                aSign
4181             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4182                  != 0 );
4183     }
4184     return
4185           aSign ? lt128( b.high, b.low, a.high, a.low )
4186         : lt128( a.high, a.low, b.high, b.low );
4187 
4188 }
4189 
4190 #endif
4191 
4192 #ifdef FLOAT128
4193 
4194 /*----------------------------------------------------------------------------
4195 | Returns the result of converting the quadruple-precision floating-point
4196 | value `a' to the 32-bit two's complement integer format.  The conversion
4197 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4198 | Arithmetic---which means in particular that the conversion is rounded
4199 | according to the current rounding mode.  If `a' is a NaN, the largest
4200 | positive integer is returned.  Otherwise, if the conversion overflows, the
4201 | largest integer with the same sign as `a' is returned.
4202 *----------------------------------------------------------------------------*/
4203 
float128_to_int32(float128 a)4204 int32 float128_to_int32( float128 a )
4205 {
4206     flag aSign;
4207     int32 aExp, shiftCount;
4208     bits64 aSig0, aSig1;
4209 
4210     aSig1 = extractFloat128Frac1( a );
4211     aSig0 = extractFloat128Frac0( a );
4212     aExp = extractFloat128Exp( a );
4213     aSign = extractFloat128Sign( a );
4214     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4215     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4216     aSig0 |= ( aSig1 != 0 );
4217     shiftCount = 0x4028 - aExp;
4218     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4219     return roundAndPackInt32( aSign, aSig0 );
4220 
4221 }
4222 
4223 /*----------------------------------------------------------------------------
4224 | Returns the result of converting the quadruple-precision floating-point
4225 | value `a' to the 32-bit two's complement integer format.  The conversion
4226 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4227 | Arithmetic, except that the conversion is always rounded toward zero.  If
4228 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4229 | conversion overflows, the largest integer with the same sign as `a' is
4230 | returned.
4231 *----------------------------------------------------------------------------*/
4232 
float128_to_int32_round_to_zero(float128 a)4233 int32 float128_to_int32_round_to_zero( float128 a )
4234 {
4235     flag aSign;
4236     int32 aExp, shiftCount;
4237     bits64 aSig0, aSig1, savedASig;
4238     int32 z;
4239 
4240     aSig1 = extractFloat128Frac1( a );
4241     aSig0 = extractFloat128Frac0( a );
4242     aExp = extractFloat128Exp( a );
4243     aSign = extractFloat128Sign( a );
4244     aSig0 |= ( aSig1 != 0 );
4245     if ( 0x401E < aExp ) {
4246         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4247         goto invalid;
4248     }
4249     else if ( aExp < 0x3FFF ) {
4250         if ( aExp || aSig0 ) float_set_inexact();
4251         return 0;
4252     }
4253     aSig0 |= LIT64( 0x0001000000000000 );
4254     shiftCount = 0x402F - aExp;
4255     savedASig = aSig0;
4256     aSig0 >>= shiftCount;
4257     z = aSig0;
4258     if ( aSign ) z = - z;
4259     if ( ( z < 0 ) ^ aSign ) {
4260  invalid:
4261         float_raise( float_flag_invalid );
4262         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4263     }
4264     if ( ( aSig0<<shiftCount ) != savedASig ) {
4265         float_set_inexact();
4266     }
4267     return z;
4268 
4269 }
4270 
4271 /*----------------------------------------------------------------------------
4272 | Returns the result of converting the quadruple-precision floating-point
4273 | value `a' to the 64-bit two's complement integer format.  The conversion
4274 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4275 | Arithmetic---which means in particular that the conversion is rounded
4276 | according to the current rounding mode.  If `a' is a NaN, the largest
4277 | positive integer is returned.  Otherwise, if the conversion overflows, the
4278 | largest integer with the same sign as `a' is returned.
4279 *----------------------------------------------------------------------------*/
4280 
float128_to_int64(float128 a)4281 int64 float128_to_int64( float128 a )
4282 {
4283     flag aSign;
4284     int32 aExp, shiftCount;
4285     bits64 aSig0, aSig1;
4286 
4287     aSig1 = extractFloat128Frac1( a );
4288     aSig0 = extractFloat128Frac0( a );
4289     aExp = extractFloat128Exp( a );
4290     aSign = extractFloat128Sign( a );
4291     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4292     shiftCount = 0x402F - aExp;
4293     if ( shiftCount <= 0 ) {
4294         if ( 0x403E < aExp ) {
4295             float_raise( float_flag_invalid );
4296             if (    ! aSign
4297                  || (    ( aExp == 0x7FFF )
4298                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4299                     )
4300                ) {
4301                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4302             }
4303             return (sbits64) LIT64( 0x8000000000000000 );
4304         }
4305         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4306     }
4307     else {
4308         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4309     }
4310     return roundAndPackInt64( aSign, aSig0, aSig1 );
4311 
4312 }
4313 
4314 /*----------------------------------------------------------------------------
4315 | Returns the result of converting the quadruple-precision floating-point
4316 | value `a' to the 64-bit two's complement integer format.  The conversion
4317 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4318 | Arithmetic, except that the conversion is always rounded toward zero.
4319 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4320 | the conversion overflows, the largest integer with the same sign as `a' is
4321 | returned.
4322 *----------------------------------------------------------------------------*/
4323 
float128_to_int64_round_to_zero(float128 a)4324 int64 float128_to_int64_round_to_zero( float128 a )
4325 {
4326     flag aSign;
4327     int32 aExp, shiftCount;
4328     bits64 aSig0, aSig1;
4329     int64 z;
4330 
4331     aSig1 = extractFloat128Frac1( a );
4332     aSig0 = extractFloat128Frac0( a );
4333     aExp = extractFloat128Exp( a );
4334     aSign = extractFloat128Sign( a );
4335     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4336     shiftCount = aExp - 0x402F;
4337     if ( 0 < shiftCount ) {
4338         if ( 0x403E <= aExp ) {
4339             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4340             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4341                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4342                 if ( aSig1 ) float_set_inexact();
4343             }
4344             else {
4345                 float_raise( float_flag_invalid );
4346                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4347                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4348                 }
4349             }
4350             return (sbits64) LIT64( 0x8000000000000000 );
4351         }
4352         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4353         if ( (bits64) ( aSig1<<shiftCount ) ) {
4354             float_set_inexact();
4355         }
4356     }
4357     else {
4358         if ( aExp < 0x3FFF ) {
4359             if ( aExp | aSig0 | aSig1 ) {
4360                 float_set_inexact();
4361             }
4362             return 0;
4363         }
4364         z = aSig0>>( - shiftCount );
4365         if (    aSig1
4366              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4367             float_set_inexact();
4368         }
4369     }
4370     if ( aSign ) z = - z;
4371     return z;
4372 
4373 }
4374 
4375 /*----------------------------------------------------------------------------
4376 | Returns the result of converting the quadruple-precision floating-point
4377 | value `a' to the single-precision floating-point format.  The conversion
4378 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4379 | Arithmetic.
4380 *----------------------------------------------------------------------------*/
4381 
float128_to_float32(float128 a)4382 float32 float128_to_float32( float128 a )
4383 {
4384     flag aSign;
4385     int32 aExp;
4386     bits64 aSig0, aSig1;
4387     bits32 zSig;
4388 
4389     aSig1 = extractFloat128Frac1( a );
4390     aSig0 = extractFloat128Frac0( a );
4391     aExp = extractFloat128Exp( a );
4392     aSign = extractFloat128Sign( a );
4393     if ( aExp == 0x7FFF ) {
4394         if ( aSig0 | aSig1 ) {
4395             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4396         }
4397         return packFloat32( aSign, 0xFF, 0 );
4398     }
4399     aSig0 |= ( aSig1 != 0 );
4400     shift64RightJamming( aSig0, 18, &aSig0 );
4401     zSig = aSig0;
4402     if ( aExp || zSig ) {
4403         zSig |= 0x40000000;
4404         aExp -= 0x3F81;
4405     }
4406     return roundAndPackFloat32( aSign, aExp, zSig );
4407 
4408 }
4409 
4410 /*----------------------------------------------------------------------------
4411 | Returns the result of converting the quadruple-precision floating-point
4412 | value `a' to the double-precision floating-point format.  The conversion
4413 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4414 | Arithmetic.
4415 *----------------------------------------------------------------------------*/
4416 
float128_to_float64(float128 a)4417 float64 float128_to_float64( float128 a )
4418 {
4419     flag aSign;
4420     int32 aExp;
4421     bits64 aSig0, aSig1;
4422 
4423     aSig1 = extractFloat128Frac1( a );
4424     aSig0 = extractFloat128Frac0( a );
4425     aExp = extractFloat128Exp( a );
4426     aSign = extractFloat128Sign( a );
4427     if ( aExp == 0x7FFF ) {
4428         if ( aSig0 | aSig1 ) {
4429             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4430         }
4431         return packFloat64( aSign, 0x7FF, 0 );
4432     }
4433     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4434     aSig0 |= ( aSig1 != 0 );
4435     if ( aExp || aSig0 ) {
4436         aSig0 |= LIT64( 0x4000000000000000 );
4437         aExp -= 0x3C01;
4438     }
4439     return roundAndPackFloat64( aSign, aExp, aSig0 );
4440 
4441 }
4442 
4443 #ifdef FLOATX80
4444 
4445 /*----------------------------------------------------------------------------
4446 | Returns the result of converting the quadruple-precision floating-point
4447 | value `a' to the extended double-precision floating-point format.  The
4448 | conversion is performed according to the IEC/IEEE Standard for Binary
4449 | Floating-Point Arithmetic.
4450 *----------------------------------------------------------------------------*/
4451 
float128_to_floatx80(float128 a)4452 floatx80 float128_to_floatx80( float128 a )
4453 {
4454     flag aSign;
4455     int32 aExp;
4456     bits64 aSig0, aSig1;
4457 
4458     aSig1 = extractFloat128Frac1( a );
4459     aSig0 = extractFloat128Frac0( a );
4460     aExp = extractFloat128Exp( a );
4461     aSign = extractFloat128Sign( a );
4462     if ( aExp == 0x7FFF ) {
4463         if ( aSig0 | aSig1 ) {
4464             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4465         }
4466         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4467     }
4468     if ( aExp == 0 ) {
4469         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4470         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4471     }
4472     else {
4473         aSig0 |= LIT64( 0x0001000000000000 );
4474     }
4475     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4476     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4477 
4478 }
4479 
4480 #endif
4481 
4482 /*----------------------------------------------------------------------------
4483 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4484 | returns the result as a quadruple-precision floating-point value.  The
4485 | operation is performed according to the IEC/IEEE Standard for Binary
4486 | Floating-Point Arithmetic.
4487 *----------------------------------------------------------------------------*/
4488 
float128_round_to_int(float128 a)4489 float128 float128_round_to_int( float128 a )
4490 {
4491     flag aSign;
4492     int32 aExp;
4493     bits64 lastBitMask, roundBitsMask;
4494     int8 roundingMode;
4495     float128 z;
4496 
4497     aExp = extractFloat128Exp( a );
4498     if ( 0x402F <= aExp ) {
4499         if ( 0x406F <= aExp ) {
4500             if (    ( aExp == 0x7FFF )
4501                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4502                ) {
4503                 return propagateFloat128NaN( a, a );
4504             }
4505             return a;
4506         }
4507         lastBitMask = 1;
4508         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4509         roundBitsMask = lastBitMask - 1;
4510         z = a;
4511         roundingMode = float_rounding_mode();
4512         if ( roundingMode == float_round_nearest_even ) {
4513             if ( lastBitMask ) {
4514                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4515                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4516             }
4517             else {
4518                 if ( (sbits64) z.low < 0 ) {
4519                     ++z.high;
4520                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4521                 }
4522             }
4523         }
4524         else if ( roundingMode != float_round_to_zero ) {
4525             if (   extractFloat128Sign( z )
4526                  ^ ( roundingMode == float_round_up ) ) {
4527                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4528             }
4529         }
4530         z.low &= ~ roundBitsMask;
4531     }
4532     else {
4533         if ( aExp < 0x3FFF ) {
4534             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4535             float_set_inexact();
4536             aSign = extractFloat128Sign( a );
4537             switch ( float_rounding_mode() ) {
4538              case float_round_nearest_even:
4539                 if (    ( aExp == 0x3FFE )
4540                      && (   extractFloat128Frac0( a )
4541                           | extractFloat128Frac1( a ) )
4542                    ) {
4543                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4544                 }
4545                 break;
4546              case float_round_down:
4547                 return
4548                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4549                     : packFloat128( 0, 0, 0, 0 );
4550              case float_round_up:
4551                 return
4552                       aSign ? packFloat128( 1, 0, 0, 0 )
4553                     : packFloat128( 0, 0x3FFF, 0, 0 );
4554             }
4555             return packFloat128( aSign, 0, 0, 0 );
4556         }
4557         lastBitMask = 1;
4558         lastBitMask <<= 0x402F - aExp;
4559         roundBitsMask = lastBitMask - 1;
4560         z.low = 0;
4561         z.high = a.high;
4562         roundingMode = float_rounding_mode();
4563         if ( roundingMode == float_round_nearest_even ) {
4564             z.high += lastBitMask>>1;
4565             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4566                 z.high &= ~ lastBitMask;
4567             }
4568         }
4569         else if ( roundingMode != float_round_to_zero ) {
4570             if (   extractFloat128Sign( z )
4571                  ^ ( roundingMode == float_round_up ) ) {
4572                 z.high |= ( a.low != 0 );
4573                 z.high += roundBitsMask;
4574             }
4575         }
4576         z.high &= ~ roundBitsMask;
4577     }
4578     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4579         float_set_inexact();
4580     }
4581     return z;
4582 
4583 }
4584 
4585 /*----------------------------------------------------------------------------
4586 | Returns the result of adding the absolute values of the quadruple-precision
4587 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4588 | before being returned.  `zSign' is ignored if the result is a NaN.
4589 | The addition is performed according to the IEC/IEEE Standard for Binary
4590 | Floating-Point Arithmetic.
4591 *----------------------------------------------------------------------------*/
4592 
addFloat128Sigs(float128 a,float128 b,flag zSign)4593 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4594 {
4595     int32 aExp, bExp, zExp;
4596     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4597     int32 expDiff;
4598 
4599     aSig1 = extractFloat128Frac1( a );
4600     aSig0 = extractFloat128Frac0( a );
4601     aExp = extractFloat128Exp( a );
4602     bSig1 = extractFloat128Frac1( b );
4603     bSig0 = extractFloat128Frac0( b );
4604     bExp = extractFloat128Exp( b );
4605     expDiff = aExp - bExp;
4606     if ( 0 < expDiff ) {
4607         if ( aExp == 0x7FFF ) {
4608             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4609             return a;
4610         }
4611         if ( bExp == 0 ) {
4612             --expDiff;
4613         }
4614         else {
4615             bSig0 |= LIT64( 0x0001000000000000 );
4616         }
4617         shift128ExtraRightJamming(
4618             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4619         zExp = aExp;
4620     }
4621     else if ( expDiff < 0 ) {
4622         if ( bExp == 0x7FFF ) {
4623             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4624             return packFloat128( zSign, 0x7FFF, 0, 0 );
4625         }
4626         if ( aExp == 0 ) {
4627             ++expDiff;
4628         }
4629         else {
4630             aSig0 |= LIT64( 0x0001000000000000 );
4631         }
4632         shift128ExtraRightJamming(
4633             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4634         zExp = bExp;
4635     }
4636     else {
4637         if ( aExp == 0x7FFF ) {
4638             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4639                 return propagateFloat128NaN( a, b );
4640             }
4641             return a;
4642         }
4643         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4644         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4645         zSig2 = 0;
4646         zSig0 |= LIT64( 0x0002000000000000 );
4647         zExp = aExp;
4648         goto shiftRight1;
4649     }
4650     aSig0 |= LIT64( 0x0001000000000000 );
4651     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4652     --zExp;
4653     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4654     ++zExp;
4655  shiftRight1:
4656     shift128ExtraRightJamming(
4657         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4658  roundAndPack:
4659     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4660 
4661 }
4662 
4663 /*----------------------------------------------------------------------------
4664 | Returns the result of subtracting the absolute values of the quadruple-
4665 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
4666 | difference is negated before being returned.  `zSign' is ignored if the
4667 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4668 | Standard for Binary Floating-Point Arithmetic.
4669 *----------------------------------------------------------------------------*/
4670 
subFloat128Sigs(float128 a,float128 b,flag zSign)4671 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4672 {
4673     int32 aExp, bExp, zExp;
4674     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4675     int32 expDiff;
4676     float128 z;
4677 
4678     aSig1 = extractFloat128Frac1( a );
4679     aSig0 = extractFloat128Frac0( a );
4680     aExp = extractFloat128Exp( a );
4681     bSig1 = extractFloat128Frac1( b );
4682     bSig0 = extractFloat128Frac0( b );
4683     bExp = extractFloat128Exp( b );
4684     expDiff = aExp - bExp;
4685     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4686     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4687     if ( 0 < expDiff ) goto aExpBigger;
4688     if ( expDiff < 0 ) goto bExpBigger;
4689     if ( aExp == 0x7FFF ) {
4690         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4691             return propagateFloat128NaN( a, b );
4692         }
4693         float_raise( float_flag_invalid );
4694         z.low = float128_default_nan_low;
4695         z.high = float128_default_nan_high;
4696         return z;
4697     }
4698     if ( aExp == 0 ) {
4699         aExp = 1;
4700         bExp = 1;
4701     }
4702     if ( bSig0 < aSig0 ) goto aBigger;
4703     if ( aSig0 < bSig0 ) goto bBigger;
4704     if ( bSig1 < aSig1 ) goto aBigger;
4705     if ( aSig1 < bSig1 ) goto bBigger;
4706     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4707  bExpBigger:
4708     if ( bExp == 0x7FFF ) {
4709         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4710         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4711     }
4712     if ( aExp == 0 ) {
4713         ++expDiff;
4714     }
4715     else {
4716         aSig0 |= LIT64( 0x4000000000000000 );
4717     }
4718     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4719     bSig0 |= LIT64( 0x4000000000000000 );
4720  bBigger:
4721     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4722     zExp = bExp;
4723     zSign ^= 1;
4724     goto normalizeRoundAndPack;
4725  aExpBigger:
4726     if ( aExp == 0x7FFF ) {
4727         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4728         return a;
4729     }
4730     if ( bExp == 0 ) {
4731         --expDiff;
4732     }
4733     else {
4734         bSig0 |= LIT64( 0x4000000000000000 );
4735     }
4736     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4737     aSig0 |= LIT64( 0x4000000000000000 );
4738  aBigger:
4739     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4740     zExp = aExp;
4741  normalizeRoundAndPack:
4742     --zExp;
4743     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4744 
4745 }
4746 
4747 /*----------------------------------------------------------------------------
4748 | Returns the result of adding the quadruple-precision floating-point values
4749 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4750 | for Binary Floating-Point Arithmetic.
4751 *----------------------------------------------------------------------------*/
4752 
float128_add(float128 a,float128 b)4753 float128 float128_add( float128 a, float128 b )
4754 {
4755     flag aSign, bSign;
4756 
4757     aSign = extractFloat128Sign( a );
4758     bSign = extractFloat128Sign( b );
4759     if ( aSign == bSign ) {
4760         return addFloat128Sigs( a, b, aSign );
4761     }
4762     else {
4763         return subFloat128Sigs( a, b, aSign );
4764     }
4765 
4766 }
4767 
4768 /*----------------------------------------------------------------------------
4769 | Returns the result of subtracting the quadruple-precision floating-point
4770 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4771 | Standard for Binary Floating-Point Arithmetic.
4772 *----------------------------------------------------------------------------*/
4773 
float128_sub(float128 a,float128 b)4774 float128 float128_sub( float128 a, float128 b )
4775 {
4776     flag aSign, bSign;
4777 
4778     aSign = extractFloat128Sign( a );
4779     bSign = extractFloat128Sign( b );
4780     if ( aSign == bSign ) {
4781         return subFloat128Sigs( a, b, aSign );
4782     }
4783     else {
4784         return addFloat128Sigs( a, b, aSign );
4785     }
4786 
4787 }
4788 
4789 /*----------------------------------------------------------------------------
4790 | Returns the result of multiplying the quadruple-precision floating-point
4791 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4792 | Standard for Binary Floating-Point Arithmetic.
4793 *----------------------------------------------------------------------------*/
4794 
float128_mul(float128 a,float128 b)4795 float128 float128_mul( float128 a, float128 b )
4796 {
4797     flag aSign, bSign, zSign;
4798     int32 aExp, bExp, zExp;
4799     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4800     float128 z;
4801 
4802     aSig1 = extractFloat128Frac1( a );
4803     aSig0 = extractFloat128Frac0( a );
4804     aExp = extractFloat128Exp( a );
4805     aSign = extractFloat128Sign( a );
4806     bSig1 = extractFloat128Frac1( b );
4807     bSig0 = extractFloat128Frac0( b );
4808     bExp = extractFloat128Exp( b );
4809     bSign = extractFloat128Sign( b );
4810     zSign = aSign ^ bSign;
4811     if ( aExp == 0x7FFF ) {
4812         if (    ( aSig0 | aSig1 )
4813              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4814             return propagateFloat128NaN( a, b );
4815         }
4816         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4817         return packFloat128( zSign, 0x7FFF, 0, 0 );
4818     }
4819     if ( bExp == 0x7FFF ) {
4820         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4821         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4822  invalid:
4823             float_raise( float_flag_invalid );
4824             z.low = float128_default_nan_low;
4825             z.high = float128_default_nan_high;
4826             return z;
4827         }
4828         return packFloat128( zSign, 0x7FFF, 0, 0 );
4829     }
4830     if ( aExp == 0 ) {
4831         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4832         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4833     }
4834     if ( bExp == 0 ) {
4835         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4836         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4837     }
4838     zExp = aExp + bExp - 0x4000;
4839     aSig0 |= LIT64( 0x0001000000000000 );
4840     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4841     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4842     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4843     zSig2 |= ( zSig3 != 0 );
4844     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4845         shift128ExtraRightJamming(
4846             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4847         ++zExp;
4848     }
4849     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4850 
4851 }
4852 
4853 /*----------------------------------------------------------------------------
4854 | Returns the result of dividing the quadruple-precision floating-point value
4855 | `a' by the corresponding value `b'.  The operation is performed according to
4856 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4857 *----------------------------------------------------------------------------*/
4858 
float128_div(float128 a,float128 b)4859 float128 float128_div( float128 a, float128 b )
4860 {
4861     flag aSign, bSign, zSign;
4862     int32 aExp, bExp, zExp;
4863     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4864     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4865     float128 z;
4866 
4867     aSig1 = extractFloat128Frac1( a );
4868     aSig0 = extractFloat128Frac0( a );
4869     aExp = extractFloat128Exp( a );
4870     aSign = extractFloat128Sign( a );
4871     bSig1 = extractFloat128Frac1( b );
4872     bSig0 = extractFloat128Frac0( b );
4873     bExp = extractFloat128Exp( b );
4874     bSign = extractFloat128Sign( b );
4875     zSign = aSign ^ bSign;
4876     if ( aExp == 0x7FFF ) {
4877         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4878         if ( bExp == 0x7FFF ) {
4879             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4880             goto invalid;
4881         }
4882         return packFloat128( zSign, 0x7FFF, 0, 0 );
4883     }
4884     if ( bExp == 0x7FFF ) {
4885         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4886         return packFloat128( zSign, 0, 0, 0 );
4887     }
4888     if ( bExp == 0 ) {
4889         if ( ( bSig0 | bSig1 ) == 0 ) {
4890             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4891  invalid:
4892                 float_raise( float_flag_invalid );
4893                 z.low = float128_default_nan_low;
4894                 z.high = float128_default_nan_high;
4895                 return z;
4896             }
4897             float_raise( float_flag_divbyzero );
4898             return packFloat128( zSign, 0x7FFF, 0, 0 );
4899         }
4900         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4901     }
4902     if ( aExp == 0 ) {
4903         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4904         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4905     }
4906     zExp = aExp - bExp + 0x3FFD;
4907     shortShift128Left(
4908         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4909     shortShift128Left(
4910         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4911     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4912         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4913         ++zExp;
4914     }
4915     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4916     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4917     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4918     while ( (sbits64) rem0 < 0 ) {
4919         --zSig0;
4920         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4921     }
4922     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4923     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4924         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4925         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4926         while ( (sbits64) rem1 < 0 ) {
4927             --zSig1;
4928             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4929         }
4930         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4931     }
4932     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4933     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4934 
4935 }
4936 
4937 /*----------------------------------------------------------------------------
4938 | Returns the remainder of the quadruple-precision floating-point value `a'
4939 | with respect to the corresponding value `b'.  The operation is performed
4940 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4941 *----------------------------------------------------------------------------*/
4942 
float128_rem(float128 a,float128 b)4943 float128 float128_rem( float128 a, float128 b )
4944 {
4945     flag aSign, bSign, zSign;
4946     int32 aExp, bExp, expDiff;
4947     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4948     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4949     sbits64 sigMean0;
4950     float128 z;
4951 
4952     aSig1 = extractFloat128Frac1( a );
4953     aSig0 = extractFloat128Frac0( a );
4954     aExp = extractFloat128Exp( a );
4955     aSign = extractFloat128Sign( a );
4956     bSig1 = extractFloat128Frac1( b );
4957     bSig0 = extractFloat128Frac0( b );
4958     bExp = extractFloat128Exp( b );
4959     bSign = extractFloat128Sign( b );
4960     if ( aExp == 0x7FFF ) {
4961         if (    ( aSig0 | aSig1 )
4962              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4963             return propagateFloat128NaN( a, b );
4964         }
4965         goto invalid;
4966     }
4967     if ( bExp == 0x7FFF ) {
4968         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4969         return a;
4970     }
4971     if ( bExp == 0 ) {
4972         if ( ( bSig0 | bSig1 ) == 0 ) {
4973  invalid:
4974             float_raise( float_flag_invalid );
4975             z.low = float128_default_nan_low;
4976             z.high = float128_default_nan_high;
4977             return z;
4978         }
4979         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4980     }
4981     if ( aExp == 0 ) {
4982         if ( ( aSig0 | aSig1 ) == 0 ) return a;
4983         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4984     }
4985     expDiff = aExp - bExp;
4986     if ( expDiff < -1 ) return a;
4987     shortShift128Left(
4988         aSig0 | LIT64( 0x0001000000000000 ),
4989         aSig1,
4990         15 - ( expDiff < 0 ),
4991         &aSig0,
4992         &aSig1
4993     );
4994     shortShift128Left(
4995         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4996     q = le128( bSig0, bSig1, aSig0, aSig1 );
4997     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4998     expDiff -= 64;
4999     while ( 0 < expDiff ) {
5000         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5001         q = ( 4 < q ) ? q - 4 : 0;
5002         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5003         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5004         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5005         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5006         expDiff -= 61;
5007     }
5008     if ( -64 < expDiff ) {
5009         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5010         q = ( 4 < q ) ? q - 4 : 0;
5011         q >>= - expDiff;
5012         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5013         expDiff += 52;
5014         if ( expDiff < 0 ) {
5015             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5016         }
5017         else {
5018             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5019         }
5020         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5021         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5022     }
5023     else {
5024         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5025         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5026     }
5027     do {
5028         alternateASig0 = aSig0;
5029         alternateASig1 = aSig1;
5030         ++q;
5031         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5032     } while ( 0 <= (sbits64) aSig0 );
5033     add128(
5034         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5035     if (    ( sigMean0 < 0 )
5036          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5037         aSig0 = alternateASig0;
5038         aSig1 = alternateASig1;
5039     }
5040     zSign = ( (sbits64) aSig0 < 0 );
5041     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5042     return
5043         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5044 
5045 }
5046 
5047 /*----------------------------------------------------------------------------
5048 | Returns the square root of the quadruple-precision floating-point value `a'.
5049 | The operation is performed according to the IEC/IEEE Standard for Binary
5050 | Floating-Point Arithmetic.
5051 *----------------------------------------------------------------------------*/
5052 
float128_sqrt(float128 a)5053 float128 float128_sqrt( float128 a )
5054 {
5055     flag aSign;
5056     int32 aExp, zExp;
5057     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5058     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5059     float128 z;
5060 
5061     aSig1 = extractFloat128Frac1( a );
5062     aSig0 = extractFloat128Frac0( a );
5063     aExp = extractFloat128Exp( a );
5064     aSign = extractFloat128Sign( a );
5065     if ( aExp == 0x7FFF ) {
5066         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5067         if ( ! aSign ) return a;
5068         goto invalid;
5069     }
5070     if ( aSign ) {
5071         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5072  invalid:
5073         float_raise( float_flag_invalid );
5074         z.low = float128_default_nan_low;
5075         z.high = float128_default_nan_high;
5076         return z;
5077     }
5078     if ( aExp == 0 ) {
5079         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5080         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5081     }
5082     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5083     aSig0 |= LIT64( 0x0001000000000000 );
5084     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5085     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5086     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5087     doubleZSig0 = zSig0<<1;
5088     mul64To128( zSig0, zSig0, &term0, &term1 );
5089     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5090     while ( (sbits64) rem0 < 0 ) {
5091         --zSig0;
5092         doubleZSig0 -= 2;
5093         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5094     }
5095     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5096     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5097         if ( zSig1 == 0 ) zSig1 = 1;
5098         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5099         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5100         mul64To128( zSig1, zSig1, &term2, &term3 );
5101         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5102         while ( (sbits64) rem1 < 0 ) {
5103             --zSig1;
5104             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5105             term3 |= 1;
5106             term2 |= doubleZSig0;
5107             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5108         }
5109         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5110     }
5111     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5112     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5113 
5114 }
5115 
5116 /*----------------------------------------------------------------------------
5117 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5118 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5119 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5120 *----------------------------------------------------------------------------*/
5121 
float128_eq(float128 a,float128 b)5122 flag float128_eq( float128 a, float128 b )
5123 {
5124 
5125     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5126               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5127          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5128               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5129        ) {
5130         if (    float128_is_signaling_nan( a )
5131              || float128_is_signaling_nan( b ) ) {
5132             float_raise( float_flag_invalid );
5133         }
5134         return 0;
5135     }
5136     return
5137            ( a.low == b.low )
5138         && (    ( a.high == b.high )
5139              || (    ( a.low == 0 )
5140                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5141            );
5142 
5143 }
5144 
5145 /*----------------------------------------------------------------------------
5146 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5147 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5148 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5149 | Arithmetic.
5150 *----------------------------------------------------------------------------*/
5151 
float128_le(float128 a,float128 b)5152 flag float128_le( float128 a, float128 b )
5153 {
5154     flag aSign, bSign;
5155 
5156     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5157               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5158          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5159               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5160        ) {
5161         float_raise( float_flag_invalid );
5162         return 0;
5163     }
5164     aSign = extractFloat128Sign( a );
5165     bSign = extractFloat128Sign( b );
5166     if ( aSign != bSign ) {
5167         return
5168                aSign
5169             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5170                  == 0 );
5171     }
5172     return
5173           aSign ? le128( b.high, b.low, a.high, a.low )
5174         : le128( a.high, a.low, b.high, b.low );
5175 
5176 }
5177 
5178 /*----------------------------------------------------------------------------
5179 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5180 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5181 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5182 *----------------------------------------------------------------------------*/
5183 
float128_lt(float128 a,float128 b)5184 flag float128_lt( float128 a, float128 b )
5185 {
5186     flag aSign, bSign;
5187 
5188     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5189               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5190          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5191               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5192        ) {
5193         float_raise( float_flag_invalid );
5194         return 0;
5195     }
5196     aSign = extractFloat128Sign( a );
5197     bSign = extractFloat128Sign( b );
5198     if ( aSign != bSign ) {
5199         return
5200                aSign
5201             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5202                  != 0 );
5203     }
5204     return
5205           aSign ? lt128( b.high, b.low, a.high, a.low )
5206         : lt128( a.high, a.low, b.high, b.low );
5207 
5208 }
5209 
5210 /*----------------------------------------------------------------------------
5211 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5212 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5213 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5214 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5215 *----------------------------------------------------------------------------*/
5216 
float128_eq_signaling(float128 a,float128 b)5217 flag float128_eq_signaling( float128 a, float128 b )
5218 {
5219 
5220     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5221               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5222          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5223               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5224        ) {
5225         float_raise( float_flag_invalid );
5226         return 0;
5227     }
5228     return
5229            ( a.low == b.low )
5230         && (    ( a.high == b.high )
5231              || (    ( a.low == 0 )
5232                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5233            );
5234 
5235 }
5236 
5237 /*----------------------------------------------------------------------------
5238 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5239 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5240 | cause an exception.  Otherwise, the comparison is performed according to the
5241 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5242 *----------------------------------------------------------------------------*/
5243 
float128_le_quiet(float128 a,float128 b)5244 flag float128_le_quiet( float128 a, float128 b )
5245 {
5246     flag aSign, bSign;
5247 
5248     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5249               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5250          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5251               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5252        ) {
5253         if (    float128_is_signaling_nan( a )
5254              || float128_is_signaling_nan( b ) ) {
5255             float_raise( float_flag_invalid );
5256         }
5257         return 0;
5258     }
5259     aSign = extractFloat128Sign( a );
5260     bSign = extractFloat128Sign( b );
5261     if ( aSign != bSign ) {
5262         return
5263                aSign
5264             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5265                  == 0 );
5266     }
5267     return
5268           aSign ? le128( b.high, b.low, a.high, a.low )
5269         : le128( a.high, a.low, b.high, b.low );
5270 
5271 }
5272 
5273 /*----------------------------------------------------------------------------
5274 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5275 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5276 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5277 | Standard for Binary Floating-Point Arithmetic.
5278 *----------------------------------------------------------------------------*/
5279 
float128_lt_quiet(float128 a,float128 b)5280 flag float128_lt_quiet( float128 a, float128 b )
5281 {
5282     flag aSign, bSign;
5283 
5284     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5285               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5286          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5287               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5288        ) {
5289         if (    float128_is_signaling_nan( a )
5290              || float128_is_signaling_nan( b ) ) {
5291             float_raise( float_flag_invalid );
5292         }
5293         return 0;
5294     }
5295     aSign = extractFloat128Sign( a );
5296     bSign = extractFloat128Sign( b );
5297     if ( aSign != bSign ) {
5298         return
5299                aSign
5300             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5301                  != 0 );
5302     }
5303     return
5304           aSign ? lt128( b.high, b.low, a.high, a.low )
5305         : lt128( a.high, a.low, b.high, b.low );
5306 
5307 }
5308 
5309 #endif
5310 
5311 
5312 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5313 
5314 /*
5315  * These two routines are not part of the original softfloat distribution.
5316  *
5317  * They are based on the corresponding conversions to integer but return
5318  * unsigned numbers instead since these functions are required by GCC.
5319  *
5320  * Added by Mark Brinicombe <mark@NetBSD.org>	27/09/97
5321  *
5322  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5323  */
5324 
5325 /*----------------------------------------------------------------------------
5326 | Returns the result of converting the double-precision floating-point value
5327 | `a' to the 32-bit unsigned integer format.  The conversion is
5328 | performed according to the IEC/IEEE Standard for Binary Floating-point
5329 | Arithmetic, except that the conversion is always rounded toward zero.  If
5330 | `a' is a NaN, the largest positive integer is returned.  If the conversion
5331 | overflows, the largest integer positive is returned.
5332 *----------------------------------------------------------------------------*/
5333 
float64_to_uint32_round_to_zero(float64 a)5334 uint32 float64_to_uint32_round_to_zero( float64 a )
5335 {
5336     flag aSign;
5337     int16 aExp, shiftCount;
5338     bits64 aSig, savedASig;
5339     uint32 z;
5340 
5341     aSig = extractFloat64Frac( a );
5342     aExp = extractFloat64Exp( a );
5343     aSign = extractFloat64Sign( a );
5344 
5345     if (aSign) {
5346         float_raise( float_flag_invalid );
5347     	return(0);
5348     }
5349 
5350     if ( 0x41E < aExp ) {
5351         float_raise( float_flag_invalid );
5352         return 0xffffffff;
5353     }
5354     else if ( aExp < 0x3FF ) {
5355         if ( aExp || aSig ) float_set_inexact();
5356         return 0;
5357     }
5358     aSig |= LIT64( 0x0010000000000000 );
5359     shiftCount = 0x433 - aExp;
5360     savedASig = aSig;
5361     aSig >>= shiftCount;
5362     z = aSig;
5363     if ( ( aSig<<shiftCount ) != savedASig ) {
5364         float_set_inexact();
5365     }
5366     return z;
5367 
5368 }
5369 
5370 /*----------------------------------------------------------------------------
5371 | Returns the result of converting the single-precision floating-point value
5372 | `a' to the 32-bit unsigned integer format.  The conversion is
5373 | performed according to the IEC/IEEE Standard for Binary Floating-point
5374 | Arithmetic, except that the conversion is always rounded toward zero.  If
5375 | `a' is a NaN, the largest positive integer is returned.  If the conversion
5376 | overflows, the largest positive integer is returned.
5377 *----------------------------------------------------------------------------*/
5378 
float32_to_uint32_round_to_zero(float32 a)5379 uint32 float32_to_uint32_round_to_zero( float32 a )
5380 {
5381     flag aSign;
5382     int16 aExp, shiftCount;
5383     bits32 aSig;
5384     uint32 z;
5385 
5386     aSig = extractFloat32Frac( a );
5387     aExp = extractFloat32Exp( a );
5388     aSign = extractFloat32Sign( a );
5389     shiftCount = aExp - 0x9E;
5390 
5391     if (aSign) {
5392         float_raise( float_flag_invalid );
5393     	return(0);
5394     }
5395     if ( 0 < shiftCount ) {
5396         float_raise( float_flag_invalid );
5397         return 0xFFFFFFFF;
5398     }
5399     else if ( aExp <= 0x7E ) {
5400         if ( aExp | aSig ) float_set_inexact();
5401         return 0;
5402     }
5403     aSig = ( aSig | 0x800000 )<<8;
5404     z = aSig>>( - shiftCount );
5405     if ( aSig<<( shiftCount & 31 ) ) {
5406         float_set_inexact();
5407     }
5408     return z;
5409 
5410 }
5411 
5412 #endif
5413 
5414 #endif /* _STANDALONE */
5415