1 /* $NetBSD: softfloat.c,v 1.8 2020/09/02 04:06:43 thorpej Exp $ */
2
3 /*
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 * itself).
7 */
8
9 /*
10 * Things you may want to define:
11 *
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 * properly renamed.
15 */
16
17 /*============================================================================
18
19 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
20 Package, Release 2b.
21
22 Written by John R. Hauser. This work was made possible in part by the
23 International Computer Science Institute, located at Suite 600, 1947 Center
24 Street, Berkeley, California 94704. Funding was partially provided by the
25 National Science Foundation under grant MIP-9311980. The original version
26 of this code was written as part of a project to build a fixed-point vector
27 processor in collaboration with the University of California at Berkeley,
28 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
29 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
30 arithmetic/SoftFloat.html'.
31
32 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
33 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
34 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
35 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
36 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
37 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
38 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
39 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
40
41 Derivative works are acceptable, even for commercial purposes, so long as
42 (1) the source code for the derivative work includes prominent notice that
43 the work is derivative, and (2) the source code includes prominent notice with
44 these four paragraphs for those parts of this code that are retained.
45
46 =============================================================================*/
47
48 /* If you need this in a boot program, you have bigger problems... */
49 #ifndef _STANDALONE
50
51 #include <sys/cdefs.h>
52 #if defined(LIBC_SCCS) && !defined(lint)
53 __RCSID("$NetBSD: softfloat.c,v 1.8 2020/09/02 04:06:43 thorpej Exp $");
54 #endif /* LIBC_SCCS and not lint */
55
56 #ifdef SOFTFLOAT_FOR_GCC
57 #include "softfloat-for-gcc.h"
58 #endif
59
60 #include "milieu.h"
61 #include "softfloat.h"
62
63 /*----------------------------------------------------------------------------
64 | Floating-point rounding mode, extended double-precision rounding precision,
65 | and exception flags.
66 *----------------------------------------------------------------------------*/
67 /*
68 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
69 * Right now, it does not. I've removed all other dynamic global
70 * variables. [ross]
71 */
72 #ifdef FLOATX80
73 int8 floatx80_rounding_precision = 80;
74 #endif
75
76 /*----------------------------------------------------------------------------
77 | Primitive arithmetic functions, including multi-word arithmetic, and
78 | division and square root approximations. (Can be specialized to target if
79 | desired.)
80 *----------------------------------------------------------------------------*/
81 #include "softfloat-macros.h"
82
83 /*----------------------------------------------------------------------------
84 | Functions and definitions to determine: (1) whether tininess for underflow
85 | is detected before or after rounding by default, (2) what (if anything)
86 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
87 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
88 | are propagated from function inputs to output. These details are target-
89 | specific.
90 *----------------------------------------------------------------------------*/
91 #include "softfloat-specialize.h"
92
93 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
94 /*----------------------------------------------------------------------------
95 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
96 | and 7, and returns the properly rounded 32-bit integer corresponding to the
97 | input. If `zSign' is 1, the input is negated before being converted to an
98 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
99 | is simply rounded to an integer, with the inexact exception raised if the
100 | input cannot be represented exactly as an integer. However, if the fixed-
101 | point input is too large, the invalid exception is raised and the largest
102 | positive or negative integer is returned.
103 *----------------------------------------------------------------------------*/
104
roundAndPackInt32(flag zSign,bits64 absZ)105 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
106 {
107 int8 roundingMode;
108 flag roundNearestEven;
109 int8 roundIncrement, roundBits;
110 int32 z;
111
112 roundingMode = float_rounding_mode();
113 roundNearestEven = ( roundingMode == float_round_nearest_even );
114 roundIncrement = 0x40;
115 if ( ! roundNearestEven ) {
116 if ( roundingMode == float_round_to_zero ) {
117 roundIncrement = 0;
118 }
119 else {
120 roundIncrement = 0x7F;
121 if ( zSign ) {
122 if ( roundingMode == float_round_up ) roundIncrement = 0;
123 }
124 else {
125 if ( roundingMode == float_round_down ) roundIncrement = 0;
126 }
127 }
128 }
129 roundBits = absZ & 0x7F;
130 absZ = ( absZ + roundIncrement )>>7;
131 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
132 z = absZ;
133 if ( zSign ) z = - z;
134 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
135 float_raise( float_flag_invalid );
136 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
137 }
138 if ( roundBits ) float_set_inexact();
139 return z;
140
141 }
142
143 /*----------------------------------------------------------------------------
144 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
145 | `absZ1', with binary point between bits 63 and 64 (between the input words),
146 | and returns the properly rounded 64-bit integer corresponding to the input.
147 | If `zSign' is 1, the input is negated before being converted to an integer.
148 | Ordinarily, the fixed-point input is simply rounded to an integer, with
149 | the inexact exception raised if the input cannot be represented exactly as
150 | an integer. However, if the fixed-point input is too large, the invalid
151 | exception is raised and the largest positive or negative integer is
152 | returned.
153 *----------------------------------------------------------------------------*/
154
roundAndPackInt64(flag zSign,bits64 absZ0,bits64 absZ1)155 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
156 {
157 int8 roundingMode;
158 flag roundNearestEven, increment;
159 int64 z;
160
161 roundingMode = float_rounding_mode();
162 roundNearestEven = ( roundingMode == float_round_nearest_even );
163 increment = ( (sbits64) absZ1 < 0 );
164 if ( ! roundNearestEven ) {
165 if ( roundingMode == float_round_to_zero ) {
166 increment = 0;
167 }
168 else {
169 if ( zSign ) {
170 increment = ( roundingMode == float_round_down ) && absZ1;
171 }
172 else {
173 increment = ( roundingMode == float_round_up ) && absZ1;
174 }
175 }
176 }
177 if ( increment ) {
178 ++absZ0;
179 if ( absZ0 == 0 ) goto overflow;
180 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
181 }
182 z = absZ0;
183 if ( zSign ) z = - z;
184 if ( z && ( ( z < 0 ) ^ zSign ) ) {
185 overflow:
186 float_raise( float_flag_invalid );
187 return
188 zSign ? (sbits64) LIT64( 0x8000000000000000 )
189 : LIT64( 0x7FFFFFFFFFFFFFFF );
190 }
191 if ( absZ1 ) float_set_inexact();
192 return z;
193
194 }
195
196 /* same as above, but for unsigned values */
roundAndPackUInt64(bits64 absZ0,bits64 absZ1)197 static uint64 roundAndPackUInt64( bits64 absZ0, bits64 absZ1 )
198 {
199 int8 roundingMode;
200 flag roundNearestEven, increment;
201 uint64 z;
202
203 roundingMode = float_rounding_mode();
204 roundNearestEven = ( roundingMode == float_round_nearest_even );
205 increment = ( (sbits64) absZ1 < 0 );
206 if ( ! roundNearestEven ) {
207 if ( roundingMode == float_round_to_zero ) {
208 increment = 0;
209 }
210 else {
211 increment = ( roundingMode == float_round_up ) && absZ1;
212 }
213 }
214 if ( increment ) {
215 ++absZ0;
216 if ( absZ0 == 0 ) {
217 float_raise( float_flag_invalid );
218 return LIT64( 0x7FFFFFFFFFFFFFFF );
219 }
220 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
221 }
222 z = absZ0;
223 if ( absZ1 ) float_set_inexact();
224 return z;
225
226 }
227 #endif /* SOFTFLOAT_FOR_GCC */
228
229 /*----------------------------------------------------------------------------
230 | Returns the fraction bits of the single-precision floating-point value `a'.
231 *----------------------------------------------------------------------------*/
232
extractFloat32Frac(float32 a)233 INLINE bits32 extractFloat32Frac( float32 a )
234 {
235
236 return a & 0x007FFFFF;
237
238 }
239
240 /*----------------------------------------------------------------------------
241 | Returns the exponent bits of the single-precision floating-point value `a'.
242 *----------------------------------------------------------------------------*/
243
extractFloat32Exp(float32 a)244 INLINE int16 extractFloat32Exp( float32 a )
245 {
246
247 return ( a>>23 ) & 0xFF;
248
249 }
250
251 /*----------------------------------------------------------------------------
252 | Returns the sign bit of the single-precision floating-point value `a'.
253 *----------------------------------------------------------------------------*/
254
extractFloat32Sign(float32 a)255 INLINE flag extractFloat32Sign( float32 a )
256 {
257
258 return a>>31;
259
260 }
261
262 /*----------------------------------------------------------------------------
263 | Normalizes the subnormal single-precision floating-point value represented
264 | by the denormalized significand `aSig'. The normalized exponent and
265 | significand are stored at the locations pointed to by `zExpPtr' and
266 | `zSigPtr', respectively.
267 *----------------------------------------------------------------------------*/
268
269 static void
normalizeFloat32Subnormal(bits32 aSig,int16 * zExpPtr,bits32 * zSigPtr)270 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
271 {
272 int8 shiftCount;
273
274 shiftCount = countLeadingZeros32( aSig ) - 8;
275 *zSigPtr = aSig<<shiftCount;
276 *zExpPtr = 1 - shiftCount;
277
278 }
279
280 /*----------------------------------------------------------------------------
281 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
282 | single-precision floating-point value, returning the result. After being
283 | shifted into the proper positions, the three fields are simply added
284 | together to form the result. This means that any integer portion of `zSig'
285 | will be added into the exponent. Since a properly normalized significand
286 | will have an integer portion equal to 1, the `zExp' input should be 1 less
287 | than the desired result exponent whenever `zSig' is a complete, normalized
288 | significand.
289 *----------------------------------------------------------------------------*/
290
packFloat32(flag zSign,int16 zExp,bits32 zSig)291 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
292 {
293
294 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
295
296 }
297
298 /*----------------------------------------------------------------------------
299 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
300 | and significand `zSig', and returns the proper single-precision floating-
301 | point value corresponding to the abstract input. Ordinarily, the abstract
302 | value is simply rounded and packed into the single-precision format, with
303 | the inexact exception raised if the abstract input cannot be represented
304 | exactly. However, if the abstract value is too large, the overflow and
305 | inexact exceptions are raised and an infinity or maximal finite value is
306 | returned. If the abstract value is too small, the input value is rounded to
307 | a subnormal number, and the underflow and inexact exceptions are raised if
308 | the abstract input cannot be represented exactly as a subnormal single-
309 | precision floating-point number.
310 | The input significand `zSig' has its binary point between bits 30
311 | and 29, which is 7 bits to the left of the usual location. This shifted
312 | significand must be normalized or smaller. If `zSig' is not normalized,
313 | `zExp' must be 0; in that case, the result returned is a subnormal number,
314 | and it must not require rounding. In the usual case that `zSig' is
315 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
316 | The handling of underflow and overflow follows the IEC/IEEE Standard for
317 | Binary Floating-Point Arithmetic.
318 *----------------------------------------------------------------------------*/
319
roundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)320 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
321 {
322 int8 roundingMode;
323 flag roundNearestEven;
324 int8 roundIncrement, roundBits;
325 flag isTiny;
326
327 roundingMode = float_rounding_mode();
328 roundNearestEven = ( roundingMode == float_round_nearest_even );
329 roundIncrement = 0x40;
330 if ( ! roundNearestEven ) {
331 if ( roundingMode == float_round_to_zero ) {
332 roundIncrement = 0;
333 }
334 else {
335 roundIncrement = 0x7F;
336 if ( zSign ) {
337 if ( roundingMode == float_round_up ) roundIncrement = 0;
338 }
339 else {
340 if ( roundingMode == float_round_down ) roundIncrement = 0;
341 }
342 }
343 }
344 roundBits = zSig & 0x7F;
345 if ( 0xFD <= (bits16) zExp ) {
346 if ( ( 0xFD < zExp )
347 || ( ( zExp == 0xFD )
348 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
349 ) {
350 float_raise( float_flag_overflow | float_flag_inexact );
351 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
352 }
353 if ( zExp < 0 ) {
354 isTiny =
355 ( float_detect_tininess == float_tininess_before_rounding )
356 || ( zExp < -1 )
357 || ( zSig + roundIncrement < 0x80000000 );
358 shift32RightJamming( zSig, - zExp, &zSig );
359 zExp = 0;
360 roundBits = zSig & 0x7F;
361 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
362 }
363 }
364 if ( roundBits ) float_set_inexact();
365 zSig = ( zSig + roundIncrement )>>7;
366 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
367 if ( zSig == 0 ) zExp = 0;
368 return packFloat32( zSign, zExp, zSig );
369
370 }
371
372 /*----------------------------------------------------------------------------
373 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
374 | and significand `zSig', and returns the proper single-precision floating-
375 | point value corresponding to the abstract input. This routine is just like
376 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
377 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
378 | floating-point exponent.
379 *----------------------------------------------------------------------------*/
380
381 static float32
normalizeRoundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)382 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
383 {
384 int8 shiftCount;
385
386 shiftCount = countLeadingZeros32( zSig ) - 1;
387 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
388
389 }
390
391 /*----------------------------------------------------------------------------
392 | Returns the fraction bits of the double-precision floating-point value `a'.
393 *----------------------------------------------------------------------------*/
394
extractFloat64Frac(float64 a)395 INLINE bits64 extractFloat64Frac( float64 a )
396 {
397
398 return a & LIT64( 0x000FFFFFFFFFFFFF );
399
400 }
401
402 /*----------------------------------------------------------------------------
403 | Returns the exponent bits of the double-precision floating-point value `a'.
404 *----------------------------------------------------------------------------*/
405
extractFloat64Exp(float64 a)406 INLINE int16 extractFloat64Exp( float64 a )
407 {
408
409 return ( a>>52 ) & 0x7FF;
410
411 }
412 /*----------------------------------------------------------------------------
413 | Returns the sign bit of the double-precision floating-point value `a'.
414 *----------------------------------------------------------------------------*/
415
extractFloat64Sign(float64 a)416 INLINE flag extractFloat64Sign( float64 a )
417 {
418
419 return a>>63;
420
421 }
422
423 /*----------------------------------------------------------------------------
424 | Normalizes the subnormal double-precision floating-point value represented
425 | by the denormalized significand `aSig'. The normalized exponent and
426 | significand are stored at the locations pointed to by `zExpPtr' and
427 | `zSigPtr', respectively.
428 *----------------------------------------------------------------------------*/
429
430 static void
normalizeFloat64Subnormal(bits64 aSig,int16 * zExpPtr,bits64 * zSigPtr)431 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
432 {
433 int8 shiftCount;
434
435 shiftCount = countLeadingZeros64( aSig ) - 11;
436 *zSigPtr = aSig<<shiftCount;
437 *zExpPtr = 1 - shiftCount;
438
439 }
440
441 /*----------------------------------------------------------------------------
442 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
443 | double-precision floating-point value, returning the result. After being
444 | shifted into the proper positions, the three fields are simply added
445 | together to form the result. This means that any integer portion of `zSig'
446 | will be added into the exponent. Since a properly normalized significand
447 | will have an integer portion equal to 1, the `zExp' input should be 1 less
448 | than the desired result exponent whenever `zSig' is a complete, normalized
449 | significand.
450 *----------------------------------------------------------------------------*/
451
packFloat64(flag zSign,int16 zExp,bits64 zSig)452 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
453 {
454
455 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
456
457 }
458
459 /*----------------------------------------------------------------------------
460 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
461 | and significand `zSig', and returns the proper double-precision floating-
462 | point value corresponding to the abstract input. Ordinarily, the abstract
463 | value is simply rounded and packed into the double-precision format, with
464 | the inexact exception raised if the abstract input cannot be represented
465 | exactly. However, if the abstract value is too large, the overflow and
466 | inexact exceptions are raised and an infinity or maximal finite value is
467 | returned. If the abstract value is too small, the input value is rounded
468 | to a subnormal number, and the underflow and inexact exceptions are raised
469 | if the abstract input cannot be represented exactly as a subnormal double-
470 | precision floating-point number.
471 | The input significand `zSig' has its binary point between bits 62
472 | and 61, which is 10 bits to the left of the usual location. This shifted
473 | significand must be normalized or smaller. If `zSig' is not normalized,
474 | `zExp' must be 0; in that case, the result returned is a subnormal number,
475 | and it must not require rounding. In the usual case that `zSig' is
476 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
477 | The handling of underflow and overflow follows the IEC/IEEE Standard for
478 | Binary Floating-Point Arithmetic.
479 *----------------------------------------------------------------------------*/
480
roundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)481 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
482 {
483 int8 roundingMode;
484 flag roundNearestEven;
485 int16 roundIncrement, roundBits;
486 flag isTiny;
487
488 roundingMode = float_rounding_mode();
489 roundNearestEven = ( roundingMode == float_round_nearest_even );
490 roundIncrement = 0x200;
491 if ( ! roundNearestEven ) {
492 if ( roundingMode == float_round_to_zero ) {
493 roundIncrement = 0;
494 }
495 else {
496 roundIncrement = 0x3FF;
497 if ( zSign ) {
498 if ( roundingMode == float_round_up ) roundIncrement = 0;
499 }
500 else {
501 if ( roundingMode == float_round_down ) roundIncrement = 0;
502 }
503 }
504 }
505 roundBits = zSig & 0x3FF;
506 if ( 0x7FD <= (bits16) zExp ) {
507 if ( ( 0x7FD < zExp )
508 || ( ( zExp == 0x7FD )
509 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
510 ) {
511 float_raise( float_flag_overflow | float_flag_inexact );
512 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
513 }
514 if ( zExp < 0 ) {
515 isTiny =
516 ( float_detect_tininess == float_tininess_before_rounding )
517 || ( zExp < -1 )
518 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
519 shift64RightJamming( zSig, - zExp, &zSig );
520 zExp = 0;
521 roundBits = zSig & 0x3FF;
522 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
523 }
524 }
525 if ( roundBits ) float_set_inexact();
526 zSig = ( zSig + roundIncrement )>>10;
527 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
528 if ( zSig == 0 ) zExp = 0;
529 return packFloat64( zSign, zExp, zSig );
530
531 }
532
533 /*----------------------------------------------------------------------------
534 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
535 | and significand `zSig', and returns the proper double-precision floating-
536 | point value corresponding to the abstract input. This routine is just like
537 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
538 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
539 | floating-point exponent.
540 *----------------------------------------------------------------------------*/
541
542 static float64
normalizeRoundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)543 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
544 {
545 int8 shiftCount;
546
547 shiftCount = countLeadingZeros64( zSig ) - 1;
548 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
549
550 }
551
552 #ifdef FLOATX80
553
554 /*----------------------------------------------------------------------------
555 | Returns the fraction bits of the extended double-precision floating-point
556 | value `a'.
557 *----------------------------------------------------------------------------*/
558
extractFloatx80Frac(floatx80 a)559 INLINE bits64 extractFloatx80Frac( floatx80 a )
560 {
561
562 return a.low;
563
564 }
565
566 /*----------------------------------------------------------------------------
567 | Returns the exponent bits of the extended double-precision floating-point
568 | value `a'.
569 *----------------------------------------------------------------------------*/
570
extractFloatx80Exp(floatx80 a)571 INLINE int32 extractFloatx80Exp( floatx80 a )
572 {
573
574 return a.high & 0x7FFF;
575
576 }
577
578 /*----------------------------------------------------------------------------
579 | Returns the sign bit of the extended double-precision floating-point value
580 | `a'.
581 *----------------------------------------------------------------------------*/
582
extractFloatx80Sign(floatx80 a)583 INLINE flag extractFloatx80Sign( floatx80 a )
584 {
585
586 return a.high>>15;
587
588 }
589
590 /*----------------------------------------------------------------------------
591 | Normalizes the subnormal extended double-precision floating-point value
592 | represented by the denormalized significand `aSig'. The normalized exponent
593 | and significand are stored at the locations pointed to by `zExpPtr' and
594 | `zSigPtr', respectively.
595 *----------------------------------------------------------------------------*/
596
597 static void
normalizeFloatx80Subnormal(bits64 aSig,int32 * zExpPtr,bits64 * zSigPtr)598 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
599 {
600 int8 shiftCount;
601
602 shiftCount = countLeadingZeros64( aSig );
603 *zSigPtr = aSig<<shiftCount;
604 *zExpPtr = 1 - shiftCount;
605
606 }
607
608 /*----------------------------------------------------------------------------
609 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
610 | extended double-precision floating-point value, returning the result.
611 *----------------------------------------------------------------------------*/
612
packFloatx80(flag zSign,int32 zExp,bits64 zSig)613 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
614 {
615 floatx80 z;
616
617 z.low = zSig;
618 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
619 return z;
620
621 }
622
623 /*----------------------------------------------------------------------------
624 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
625 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
626 | and returns the proper extended double-precision floating-point value
627 | corresponding to the abstract input. Ordinarily, the abstract value is
628 | rounded and packed into the extended double-precision format, with the
629 | inexact exception raised if the abstract input cannot be represented
630 | exactly. However, if the abstract value is too large, the overflow and
631 | inexact exceptions are raised and an infinity or maximal finite value is
632 | returned. If the abstract value is too small, the input value is rounded to
633 | a subnormal number, and the underflow and inexact exceptions are raised if
634 | the abstract input cannot be represented exactly as a subnormal extended
635 | double-precision floating-point number.
636 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
637 | number of bits as single or double precision, respectively. Otherwise, the
638 | result is rounded to the full precision of the extended double-precision
639 | format.
640 | The input significand must be normalized or smaller. If the input
641 | significand is not normalized, `zExp' must be 0; in that case, the result
642 | returned is a subnormal number, and it must not require rounding. The
643 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
644 | Floating-Point Arithmetic.
645 *----------------------------------------------------------------------------*/
646
647 static floatx80
roundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)648 roundAndPackFloatx80(
649 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
650 )
651 {
652 int8 roundingMode;
653 flag roundNearestEven, increment, isTiny;
654 int64 roundIncrement, roundMask, roundBits;
655
656 roundingMode = float_rounding_mode();
657 roundNearestEven = ( roundingMode == float_round_nearest_even );
658 if ( roundingPrecision == 80 ) goto precision80;
659 if ( roundingPrecision == 64 ) {
660 roundIncrement = LIT64( 0x0000000000000400 );
661 roundMask = LIT64( 0x00000000000007FF );
662 }
663 else if ( roundingPrecision == 32 ) {
664 roundIncrement = LIT64( 0x0000008000000000 );
665 roundMask = LIT64( 0x000000FFFFFFFFFF );
666 }
667 else {
668 goto precision80;
669 }
670 zSig0 |= ( zSig1 != 0 );
671 if ( ! roundNearestEven ) {
672 if ( roundingMode == float_round_to_zero ) {
673 roundIncrement = 0;
674 }
675 else {
676 roundIncrement = roundMask;
677 if ( zSign ) {
678 if ( roundingMode == float_round_up ) roundIncrement = 0;
679 }
680 else {
681 if ( roundingMode == float_round_down ) roundIncrement = 0;
682 }
683 }
684 }
685 roundBits = zSig0 & roundMask;
686 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
687 if ( ( 0x7FFE < zExp )
688 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
689 ) {
690 goto overflow;
691 }
692 if ( zExp <= 0 ) {
693 isTiny =
694 ( float_detect_tininess == float_tininess_before_rounding )
695 || ( zExp < 0 )
696 || ( zSig0 <= zSig0 + roundIncrement );
697 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
698 zExp = 0;
699 roundBits = zSig0 & roundMask;
700 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
701 if ( roundBits ) float_set_inexact();
702 zSig0 += roundIncrement;
703 if ( (sbits64) zSig0 < 0 ) zExp = 1;
704 roundIncrement = roundMask + 1;
705 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
706 roundMask |= roundIncrement;
707 }
708 zSig0 &= ~ roundMask;
709 return packFloatx80( zSign, zExp, zSig0 );
710 }
711 }
712 if ( roundBits ) float_set_inexact();
713 zSig0 += roundIncrement;
714 if ( zSig0 < roundIncrement ) {
715 ++zExp;
716 zSig0 = LIT64( 0x8000000000000000 );
717 }
718 roundIncrement = roundMask + 1;
719 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
720 roundMask |= roundIncrement;
721 }
722 zSig0 &= ~ roundMask;
723 if ( zSig0 == 0 ) zExp = 0;
724 return packFloatx80( zSign, zExp, zSig0 );
725 precision80:
726 increment = ( (sbits64) zSig1 < 0 );
727 if ( ! roundNearestEven ) {
728 if ( roundingMode == float_round_to_zero ) {
729 increment = 0;
730 }
731 else {
732 if ( zSign ) {
733 increment = ( roundingMode == float_round_down ) && zSig1;
734 }
735 else {
736 increment = ( roundingMode == float_round_up ) && zSig1;
737 }
738 }
739 }
740 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
741 if ( ( 0x7FFE < zExp )
742 || ( ( zExp == 0x7FFE )
743 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
744 && increment
745 )
746 ) {
747 roundMask = 0;
748 overflow:
749 float_raise( float_flag_overflow | float_flag_inexact );
750 if ( ( roundingMode == float_round_to_zero )
751 || ( zSign && ( roundingMode == float_round_up ) )
752 || ( ! zSign && ( roundingMode == float_round_down ) )
753 ) {
754 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
755 }
756 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
757 }
758 if ( zExp <= 0 ) {
759 isTiny =
760 ( float_detect_tininess == float_tininess_before_rounding )
761 || ( zExp < 0 )
762 || ! increment
763 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
764 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
765 zExp = 0;
766 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
767 if ( zSig1 ) float_set_inexact();
768 if ( roundNearestEven ) {
769 increment = ( (sbits64) zSig1 < 0 );
770 }
771 else {
772 if ( zSign ) {
773 increment = ( roundingMode == float_round_down ) && zSig1;
774 }
775 else {
776 increment = ( roundingMode == float_round_up ) && zSig1;
777 }
778 }
779 if ( increment ) {
780 ++zSig0;
781 zSig0 &=
782 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
783 if ( (sbits64) zSig0 < 0 ) zExp = 1;
784 }
785 return packFloatx80( zSign, zExp, zSig0 );
786 }
787 }
788 if ( zSig1 ) float_set_inexact();
789 if ( increment ) {
790 ++zSig0;
791 if ( zSig0 == 0 ) {
792 ++zExp;
793 zSig0 = LIT64( 0x8000000000000000 );
794 }
795 else {
796 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
797 }
798 }
799 else {
800 if ( zSig0 == 0 ) zExp = 0;
801 }
802 return packFloatx80( zSign, zExp, zSig0 );
803
804 }
805
806 /*----------------------------------------------------------------------------
807 | Takes an abstract floating-point value having sign `zSign', exponent
808 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
809 | and returns the proper extended double-precision floating-point value
810 | corresponding to the abstract input. This routine is just like
811 | `roundAndPackFloatx80' except that the input significand does not have to be
812 | normalized.
813 *----------------------------------------------------------------------------*/
814
815 static floatx80
normalizeRoundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)816 normalizeRoundAndPackFloatx80(
817 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
818 )
819 {
820 int8 shiftCount;
821
822 if ( zSig0 == 0 ) {
823 zSig0 = zSig1;
824 zSig1 = 0;
825 zExp -= 64;
826 }
827 shiftCount = countLeadingZeros64( zSig0 );
828 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
829 zExp -= shiftCount;
830 return
831 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
832
833 }
834
835 #endif
836
837 #ifdef FLOAT128
838
839 /*----------------------------------------------------------------------------
840 | Returns the least-significant 64 fraction bits of the quadruple-precision
841 | floating-point value `a'.
842 *----------------------------------------------------------------------------*/
843
extractFloat128Frac1(float128 a)844 INLINE bits64 extractFloat128Frac1( float128 a )
845 {
846
847 return a.low;
848
849 }
850
851 /*----------------------------------------------------------------------------
852 | Returns the most-significant 48 fraction bits of the quadruple-precision
853 | floating-point value `a'.
854 *----------------------------------------------------------------------------*/
855
extractFloat128Frac0(float128 a)856 INLINE bits64 extractFloat128Frac0( float128 a )
857 {
858
859 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
860
861 }
862
863 /*----------------------------------------------------------------------------
864 | Returns the exponent bits of the quadruple-precision floating-point value
865 | `a'.
866 *----------------------------------------------------------------------------*/
867
extractFloat128Exp(float128 a)868 INLINE int32 extractFloat128Exp( float128 a )
869 {
870
871 return ( a.high>>48 ) & 0x7FFF;
872
873 }
874
875 /*----------------------------------------------------------------------------
876 | Returns the sign bit of the quadruple-precision floating-point value `a'.
877 *----------------------------------------------------------------------------*/
878
extractFloat128Sign(float128 a)879 INLINE flag extractFloat128Sign( float128 a )
880 {
881
882 return a.high>>63;
883
884 }
885
886 /*----------------------------------------------------------------------------
887 | Normalizes the subnormal quadruple-precision floating-point value
888 | represented by the denormalized significand formed by the concatenation of
889 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
890 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
891 | significand are stored at the location pointed to by `zSig0Ptr', and the
892 | least significant 64 bits of the normalized significand are stored at the
893 | location pointed to by `zSig1Ptr'.
894 *----------------------------------------------------------------------------*/
895
896 static void
normalizeFloat128Subnormal(bits64 aSig0,bits64 aSig1,int32 * zExpPtr,bits64 * zSig0Ptr,bits64 * zSig1Ptr)897 normalizeFloat128Subnormal(
898 bits64 aSig0,
899 bits64 aSig1,
900 int32 *zExpPtr,
901 bits64 *zSig0Ptr,
902 bits64 *zSig1Ptr
903 )
904 {
905 int8 shiftCount;
906
907 if ( aSig0 == 0 ) {
908 shiftCount = countLeadingZeros64( aSig1 ) - 15;
909 if ( shiftCount < 0 ) {
910 *zSig0Ptr = aSig1>>( - shiftCount );
911 *zSig1Ptr = aSig1<<( shiftCount & 63 );
912 }
913 else {
914 *zSig0Ptr = aSig1<<shiftCount;
915 *zSig1Ptr = 0;
916 }
917 *zExpPtr = - shiftCount - 63;
918 }
919 else {
920 shiftCount = countLeadingZeros64( aSig0 ) - 15;
921 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
922 *zExpPtr = 1 - shiftCount;
923 }
924
925 }
926
927 /*----------------------------------------------------------------------------
928 | Packs the sign `zSign', the exponent `zExp', and the significand formed
929 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
930 | floating-point value, returning the result. After being shifted into the
931 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
932 | added together to form the most significant 32 bits of the result. This
933 | means that any integer portion of `zSig0' will be added into the exponent.
934 | Since a properly normalized significand will have an integer portion equal
935 | to 1, the `zExp' input should be 1 less than the desired result exponent
936 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
937 | significand.
938 *----------------------------------------------------------------------------*/
939
940 INLINE float128
packFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)941 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
942 {
943 float128 z;
944
945 z.low = zSig1;
946 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
947 return z;
948
949 }
950
951 /*----------------------------------------------------------------------------
952 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
953 | and extended significand formed by the concatenation of `zSig0', `zSig1',
954 | and `zSig2', and returns the proper quadruple-precision floating-point value
955 | corresponding to the abstract input. Ordinarily, the abstract value is
956 | simply rounded and packed into the quadruple-precision format, with the
957 | inexact exception raised if the abstract input cannot be represented
958 | exactly. However, if the abstract value is too large, the overflow and
959 | inexact exceptions are raised and an infinity or maximal finite value is
960 | returned. If the abstract value is too small, the input value is rounded to
961 | a subnormal number, and the underflow and inexact exceptions are raised if
962 | the abstract input cannot be represented exactly as a subnormal quadruple-
963 | precision floating-point number.
964 | The input significand must be normalized or smaller. If the input
965 | significand is not normalized, `zExp' must be 0; in that case, the result
966 | returned is a subnormal number, and it must not require rounding. In the
967 | usual case that the input significand is normalized, `zExp' must be 1 less
968 | than the ``true'' floating-point exponent. The handling of underflow and
969 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
970 *----------------------------------------------------------------------------*/
971
972 static float128
roundAndPackFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1,bits64 zSig2)973 roundAndPackFloat128(
974 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
975 {
976 int8 roundingMode;
977 flag roundNearestEven, increment, isTiny;
978
979 roundingMode = float_rounding_mode();
980 roundNearestEven = ( roundingMode == float_round_nearest_even );
981 increment = ( (sbits64) zSig2 < 0 );
982 if ( ! roundNearestEven ) {
983 if ( roundingMode == float_round_to_zero ) {
984 increment = 0;
985 }
986 else {
987 if ( zSign ) {
988 increment = ( roundingMode == float_round_down ) && zSig2;
989 }
990 else {
991 increment = ( roundingMode == float_round_up ) && zSig2;
992 }
993 }
994 }
995 if ( 0x7FFD <= (bits32) zExp ) {
996 if ( ( 0x7FFD < zExp )
997 || ( ( zExp == 0x7FFD )
998 && eq128(
999 LIT64( 0x0001FFFFFFFFFFFF ),
1000 LIT64( 0xFFFFFFFFFFFFFFFF ),
1001 zSig0,
1002 zSig1
1003 )
1004 && increment
1005 )
1006 ) {
1007 float_raise( float_flag_overflow | float_flag_inexact );
1008 if ( ( roundingMode == float_round_to_zero )
1009 || ( zSign && ( roundingMode == float_round_up ) )
1010 || ( ! zSign && ( roundingMode == float_round_down ) )
1011 ) {
1012 return
1013 packFloat128(
1014 zSign,
1015 0x7FFE,
1016 LIT64( 0x0000FFFFFFFFFFFF ),
1017 LIT64( 0xFFFFFFFFFFFFFFFF )
1018 );
1019 }
1020 return packFloat128( zSign, 0x7FFF, 0, 0 );
1021 }
1022 if ( zExp < 0 ) {
1023 isTiny =
1024 ( float_detect_tininess == float_tininess_before_rounding )
1025 || ( zExp < -1 )
1026 || ! increment
1027 || lt128(
1028 zSig0,
1029 zSig1,
1030 LIT64( 0x0001FFFFFFFFFFFF ),
1031 LIT64( 0xFFFFFFFFFFFFFFFF )
1032 );
1033 shift128ExtraRightJamming(
1034 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1035 zExp = 0;
1036 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1037 if ( roundNearestEven ) {
1038 increment = ( (sbits64) zSig2 < 0 );
1039 }
1040 else {
1041 if ( zSign ) {
1042 increment = ( roundingMode == float_round_down ) && zSig2;
1043 }
1044 else {
1045 increment = ( roundingMode == float_round_up ) && zSig2;
1046 }
1047 }
1048 }
1049 }
1050 if ( zSig2 ) float_set_inexact();
1051 if ( increment ) {
1052 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1053 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1054 }
1055 else {
1056 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1057 }
1058 return packFloat128( zSign, zExp, zSig0, zSig1 );
1059
1060 }
1061
1062 /*----------------------------------------------------------------------------
1063 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1064 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1065 | returns the proper quadruple-precision floating-point value corresponding
1066 | to the abstract input. This routine is just like `roundAndPackFloat128'
1067 | except that the input significand has fewer bits and does not have to be
1068 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1069 | point exponent.
1070 *----------------------------------------------------------------------------*/
1071
1072 static float128
normalizeRoundAndPackFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)1073 normalizeRoundAndPackFloat128(
1074 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1075 {
1076 int8 shiftCount;
1077 bits64 zSig2;
1078
1079 if ( zSig0 == 0 ) {
1080 zSig0 = zSig1;
1081 zSig1 = 0;
1082 zExp -= 64;
1083 }
1084 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1085 if ( 0 <= shiftCount ) {
1086 zSig2 = 0;
1087 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1088 }
1089 else {
1090 shift128ExtraRightJamming(
1091 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1092 }
1093 zExp -= shiftCount;
1094 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1095
1096 }
1097
1098 #endif
1099
1100 /*----------------------------------------------------------------------------
1101 | Returns the result of converting the 32-bit two's complement integer `a'
1102 | to the single-precision floating-point format. The conversion is performed
1103 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1104 *----------------------------------------------------------------------------*/
1105
int32_to_float32(int32 a)1106 float32 int32_to_float32( int32 a )
1107 {
1108 flag zSign;
1109
1110 if ( a == 0 ) return 0;
1111 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1112 zSign = ( a < 0 );
1113 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1114
1115 }
1116
1117 /*----------------------------------------------------------------------------
1118 | Returns the result of converting the 32-bit two's complement integer `a'
1119 | to the double-precision floating-point format. The conversion is performed
1120 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1121 *----------------------------------------------------------------------------*/
1122
int32_to_float64(int32 a)1123 float64 int32_to_float64( int32 a )
1124 {
1125 flag zSign;
1126 uint32 absA;
1127 int8 shiftCount;
1128 bits64 zSig;
1129
1130 if ( a == 0 ) return 0;
1131 zSign = ( a < 0 );
1132 absA = zSign ? - a : a;
1133 shiftCount = countLeadingZeros32( absA ) + 21;
1134 zSig = absA;
1135 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1136
1137 }
1138
1139 #ifdef FLOATX80
1140
1141 /*----------------------------------------------------------------------------
1142 | Returns the result of converting the 32-bit two's complement integer `a'
1143 | to the extended double-precision floating-point format. The conversion
1144 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1145 | Arithmetic.
1146 *----------------------------------------------------------------------------*/
1147
int32_to_floatx80(int32 a)1148 floatx80 int32_to_floatx80( int32 a )
1149 {
1150 flag zSign;
1151 uint32 absA;
1152 int8 shiftCount;
1153 bits64 zSig;
1154
1155 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1156 zSign = ( a < 0 );
1157 absA = zSign ? - a : a;
1158 shiftCount = countLeadingZeros32( absA ) + 32;
1159 zSig = absA;
1160 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1161
1162 }
1163
1164 #endif
1165
1166 #ifdef FLOAT128
1167
1168 /*----------------------------------------------------------------------------
1169 | Returns the result of converting the 32-bit two's complement integer `a' to
1170 | the quadruple-precision floating-point format. The conversion is performed
1171 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1172 *----------------------------------------------------------------------------*/
1173
int32_to_float128(int32 a)1174 float128 int32_to_float128( int32 a )
1175 {
1176 flag zSign;
1177 uint32 absA;
1178 int8 shiftCount;
1179 bits64 zSig0;
1180
1181 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1182 zSign = ( a < 0 );
1183 absA = zSign ? - a : a;
1184 shiftCount = countLeadingZeros32( absA ) + 17;
1185 zSig0 = absA;
1186 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1187
1188 }
1189
1190 #endif
1191
1192 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1193 /*----------------------------------------------------------------------------
1194 | Returns the result of converting the 64-bit two's complement integer `a'
1195 | to the single-precision floating-point format. The conversion is performed
1196 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1197 *----------------------------------------------------------------------------*/
1198
int64_to_float32(int64 a)1199 float32 int64_to_float32( int64 a )
1200 {
1201 flag zSign;
1202 uint64 absA;
1203 int8 shiftCount;
1204
1205 if ( a == 0 ) return 0;
1206 zSign = ( a < 0 );
1207 absA = zSign ? - a : a;
1208 shiftCount = countLeadingZeros64( absA ) - 40;
1209 if ( 0 <= shiftCount ) {
1210 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1211 }
1212 else {
1213 shiftCount += 7;
1214 if ( shiftCount < 0 ) {
1215 shift64RightJamming( absA, - shiftCount, &absA );
1216 }
1217 else {
1218 absA <<= shiftCount;
1219 }
1220 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1221 }
1222
1223 }
1224
1225 /*----------------------------------------------------------------------------
1226 | Returns the result of converting the 64-bit two's complement integer `a'
1227 | to the double-precision floating-point format. The conversion is performed
1228 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1229 *----------------------------------------------------------------------------*/
1230
int64_to_float64(int64 a)1231 float64 int64_to_float64( int64 a )
1232 {
1233 flag zSign;
1234
1235 if ( a == 0 ) return 0;
1236 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1237 return packFloat64( 1, 0x43E, 0 );
1238 }
1239 zSign = ( a < 0 );
1240 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1241
1242 }
1243
1244 #ifdef FLOATX80
1245
1246 /*----------------------------------------------------------------------------
1247 | Returns the result of converting the 64-bit two's complement integer `a'
1248 | to the extended double-precision floating-point format. The conversion
1249 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1250 | Arithmetic.
1251 *----------------------------------------------------------------------------*/
1252
int64_to_floatx80(int64 a)1253 floatx80 int64_to_floatx80( int64 a )
1254 {
1255 flag zSign;
1256 uint64 absA;
1257 int8 shiftCount;
1258
1259 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1260 zSign = ( a < 0 );
1261 absA = zSign ? - a : a;
1262 shiftCount = countLeadingZeros64( absA );
1263 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1264
1265 }
1266
1267 #endif
1268
1269 #ifdef FLOAT128
1270
1271 /*----------------------------------------------------------------------------
1272 | Returns the result of converting the 64-bit two's complement integer `a' to
1273 | the quadruple-precision floating-point format. The conversion is performed
1274 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1275 *----------------------------------------------------------------------------*/
1276
int64_to_float128(int64 a)1277 float128 int64_to_float128( int64 a )
1278 {
1279 flag zSign;
1280 uint64 absA;
1281 int8 shiftCount;
1282 int32 zExp;
1283 bits64 zSig0, zSig1;
1284
1285 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1286 zSign = ( a < 0 );
1287 absA = zSign ? - a : a;
1288 shiftCount = countLeadingZeros64( absA ) + 49;
1289 zExp = 0x406E - shiftCount;
1290 if ( 64 <= shiftCount ) {
1291 zSig1 = 0;
1292 zSig0 = absA;
1293 shiftCount -= 64;
1294 }
1295 else {
1296 zSig1 = absA;
1297 zSig0 = 0;
1298 }
1299 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1300 return packFloat128( zSign, zExp, zSig0, zSig1 );
1301
1302 }
1303
1304 #endif
1305 #endif /* !SOFTFLOAT_FOR_GCC */
1306
1307 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1308 /*----------------------------------------------------------------------------
1309 | Returns the result of converting the single-precision floating-point value
1310 | `a' to the 32-bit two's complement integer format. The conversion is
1311 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1312 | Arithmetic---which means in particular that the conversion is rounded
1313 | according to the current rounding mode. If `a' is a NaN, the largest
1314 | positive integer is returned. Otherwise, if the conversion overflows, the
1315 | largest integer with the same sign as `a' is returned.
1316 *----------------------------------------------------------------------------*/
1317
float32_to_int32(float32 a)1318 int32 float32_to_int32( float32 a )
1319 {
1320 flag aSign;
1321 int16 aExp, shiftCount;
1322 bits32 aSig;
1323 bits64 aSig64;
1324
1325 aSig = extractFloat32Frac( a );
1326 aExp = extractFloat32Exp( a );
1327 aSign = extractFloat32Sign( a );
1328 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1329 if ( aExp ) aSig |= 0x00800000;
1330 shiftCount = 0xAF - aExp;
1331 aSig64 = aSig;
1332 aSig64 <<= 32;
1333 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1334 return roundAndPackInt32( aSign, aSig64 );
1335
1336 }
1337 #endif /* !SOFTFLOAT_FOR_GCC */
1338
1339 /*----------------------------------------------------------------------------
1340 | Returns the result of converting the single-precision floating-point value
1341 | `a' to the 32-bit two's complement integer format. The conversion is
1342 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1343 | Arithmetic, except that the conversion is always rounded toward zero.
1344 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1345 | the conversion overflows, the largest integer with the same sign as `a' is
1346 | returned.
1347 *----------------------------------------------------------------------------*/
1348
float32_to_int32_round_to_zero(float32 a)1349 int32 float32_to_int32_round_to_zero( float32 a )
1350 {
1351 flag aSign;
1352 int16 aExp, shiftCount;
1353 bits32 aSig;
1354 int32 z;
1355
1356 aSig = extractFloat32Frac( a );
1357 aExp = extractFloat32Exp( a );
1358 aSign = extractFloat32Sign( a );
1359 shiftCount = aExp - 0x9E;
1360 if ( 0 <= shiftCount ) {
1361 if ( a != 0xCF000000 ) {
1362 float_raise( float_flag_invalid );
1363 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1364 }
1365 return (sbits32) 0x80000000;
1366 }
1367 else if ( aExp <= 0x7E ) {
1368 if ( aExp | aSig ) float_set_inexact();
1369 return 0;
1370 }
1371 aSig = ( aSig | 0x00800000 )<<8;
1372 z = aSig>>( - shiftCount );
1373 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1374 float_set_inexact();
1375 }
1376 if ( aSign ) z = - z;
1377 return z;
1378
1379 }
1380
1381 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1382 /*----------------------------------------------------------------------------
1383 | Returns the result of converting the single-precision floating-point value
1384 | `a' to the 64-bit two's complement integer format. The conversion is
1385 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1386 | Arithmetic---which means in particular that the conversion is rounded
1387 | according to the current rounding mode. If `a' is a NaN, the largest
1388 | positive integer is returned. Otherwise, if the conversion overflows, the
1389 | largest integer with the same sign as `a' is returned.
1390 *----------------------------------------------------------------------------*/
1391
float32_to_int64(float32 a)1392 int64 float32_to_int64( float32 a )
1393 {
1394 flag aSign;
1395 int16 aExp, shiftCount;
1396 bits32 aSig;
1397 bits64 aSig64, aSigExtra;
1398
1399 aSig = extractFloat32Frac( a );
1400 aExp = extractFloat32Exp( a );
1401 aSign = extractFloat32Sign( a );
1402 shiftCount = 0xBE - aExp;
1403 if ( shiftCount < 0 ) {
1404 float_raise( float_flag_invalid );
1405 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1406 return LIT64( 0x7FFFFFFFFFFFFFFF );
1407 }
1408 return (sbits64) LIT64( 0x8000000000000000 );
1409 }
1410 if ( aExp ) aSig |= 0x00800000;
1411 aSig64 = aSig;
1412 aSig64 <<= 40;
1413 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1414 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1415
1416 }
1417
1418 /*----------------------------------------------------------------------------
1419 | Returns the result of converting the single-precision floating-point value
1420 | `a' to the 64-bit two's complement integer format. The conversion is
1421 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1422 | Arithmetic, except that the conversion is always rounded toward zero. If
1423 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1424 | conversion overflows, the largest integer with the same sign as `a' is
1425 | returned.
1426 *----------------------------------------------------------------------------*/
1427
float32_to_int64_round_to_zero(float32 a)1428 int64 float32_to_int64_round_to_zero( float32 a )
1429 {
1430 flag aSign;
1431 int16 aExp, shiftCount;
1432 bits32 aSig;
1433 bits64 aSig64;
1434 int64 z;
1435
1436 aSig = extractFloat32Frac( a );
1437 aExp = extractFloat32Exp( a );
1438 aSign = extractFloat32Sign( a );
1439 shiftCount = aExp - 0xBE;
1440 if ( 0 <= shiftCount ) {
1441 if ( a != 0xDF000000 ) {
1442 float_raise( float_flag_invalid );
1443 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1444 return LIT64( 0x7FFFFFFFFFFFFFFF );
1445 }
1446 }
1447 return (sbits64) LIT64( 0x8000000000000000 );
1448 }
1449 else if ( aExp <= 0x7E ) {
1450 if ( aExp | aSig ) float_set_inexact();
1451 return 0;
1452 }
1453 aSig64 = aSig | 0x00800000;
1454 aSig64 <<= 40;
1455 z = aSig64>>( - shiftCount );
1456 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1457 float_set_inexact();
1458 }
1459 if ( aSign ) z = - z;
1460 return z;
1461
1462 }
1463 #endif /* !SOFTFLOAT_FOR_GCC */
1464
1465 /*----------------------------------------------------------------------------
1466 | Returns the result of converting the single-precision floating-point value
1467 | `a' to the double-precision floating-point format. The conversion is
1468 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1469 | Arithmetic.
1470 *----------------------------------------------------------------------------*/
1471
float32_to_float64(float32 a)1472 float64 float32_to_float64( float32 a )
1473 {
1474 flag aSign;
1475 int16 aExp;
1476 bits32 aSig;
1477
1478 aSig = extractFloat32Frac( a );
1479 aExp = extractFloat32Exp( a );
1480 aSign = extractFloat32Sign( a );
1481 if ( aExp == 0xFF ) {
1482 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1483 return packFloat64( aSign, 0x7FF, 0 );
1484 }
1485 if ( aExp == 0 ) {
1486 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1487 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1488 --aExp;
1489 }
1490 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1491
1492 }
1493
1494 #ifdef FLOATX80
1495
1496 /*----------------------------------------------------------------------------
1497 | Returns the result of converting the single-precision floating-point value
1498 | `a' to the extended double-precision floating-point format. The conversion
1499 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1500 | Arithmetic.
1501 *----------------------------------------------------------------------------*/
1502
float32_to_floatx80(float32 a)1503 floatx80 float32_to_floatx80( float32 a )
1504 {
1505 flag aSign;
1506 int16 aExp;
1507 bits32 aSig;
1508
1509 aSig = extractFloat32Frac( a );
1510 aExp = extractFloat32Exp( a );
1511 aSign = extractFloat32Sign( a );
1512 if ( aExp == 0xFF ) {
1513 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1514 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1515 }
1516 if ( aExp == 0 ) {
1517 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1518 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1519 }
1520 aSig |= 0x00800000;
1521 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1522
1523 }
1524
1525 #endif
1526
1527 #ifdef FLOAT128
1528
1529 /*----------------------------------------------------------------------------
1530 | Returns the result of converting the single-precision floating-point value
1531 | `a' to the double-precision floating-point format. The conversion is
1532 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1533 | Arithmetic.
1534 *----------------------------------------------------------------------------*/
1535
float32_to_float128(float32 a)1536 float128 float32_to_float128( float32 a )
1537 {
1538 flag aSign;
1539 int16 aExp;
1540 bits32 aSig;
1541
1542 aSig = extractFloat32Frac( a );
1543 aExp = extractFloat32Exp( a );
1544 aSign = extractFloat32Sign( a );
1545 if ( aExp == 0xFF ) {
1546 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1547 return packFloat128( aSign, 0x7FFF, 0, 0 );
1548 }
1549 if ( aExp == 0 ) {
1550 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1551 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1552 --aExp;
1553 }
1554 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1555
1556 }
1557
1558 #endif
1559
1560 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1561 /*----------------------------------------------------------------------------
1562 | Rounds the single-precision floating-point value `a' to an integer, and
1563 | returns the result as a single-precision floating-point value. The
1564 | operation is performed according to the IEC/IEEE Standard for Binary
1565 | Floating-Point Arithmetic.
1566 *----------------------------------------------------------------------------*/
1567
float32_round_to_int(float32 a)1568 float32 float32_round_to_int( float32 a )
1569 {
1570 flag aSign;
1571 int16 aExp;
1572 bits32 lastBitMask, roundBitsMask;
1573 int8 roundingMode;
1574 float32 z;
1575
1576 aExp = extractFloat32Exp( a );
1577 if ( 0x96 <= aExp ) {
1578 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1579 return propagateFloat32NaN( a, a );
1580 }
1581 return a;
1582 }
1583 if ( aExp <= 0x7E ) {
1584 if ( (bits32) ( a<<1 ) == 0 ) return a;
1585 float_set_inexact();
1586 aSign = extractFloat32Sign( a );
1587 switch ( float_rounding_mode() ) {
1588 case float_round_nearest_even:
1589 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1590 return packFloat32( aSign, 0x7F, 0 );
1591 }
1592 break;
1593 case float_round_down:
1594 return aSign ? 0xBF800000 : 0;
1595 case float_round_up:
1596 return aSign ? 0x80000000 : 0x3F800000;
1597 }
1598 return packFloat32( aSign, 0, 0 );
1599 }
1600 lastBitMask = 1;
1601 lastBitMask <<= 0x96 - aExp;
1602 roundBitsMask = lastBitMask - 1;
1603 z = a;
1604 roundingMode = float_rounding_mode();
1605 if ( roundingMode == float_round_nearest_even ) {
1606 z += lastBitMask>>1;
1607 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1608 }
1609 else if ( roundingMode != float_round_to_zero ) {
1610 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1611 z += roundBitsMask;
1612 }
1613 }
1614 z &= ~ roundBitsMask;
1615 if ( z != a ) float_set_inexact();
1616 return z;
1617
1618 }
1619 #endif /* !SOFTFLOAT_FOR_GCC */
1620
1621 /*----------------------------------------------------------------------------
1622 | Returns the result of adding the absolute values of the single-precision
1623 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1624 | before being returned. `zSign' is ignored if the result is a NaN.
1625 | The addition is performed according to the IEC/IEEE Standard for Binary
1626 | Floating-Point Arithmetic.
1627 *----------------------------------------------------------------------------*/
1628
addFloat32Sigs(float32 a,float32 b,flag zSign)1629 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1630 {
1631 int16 aExp, bExp, zExp;
1632 bits32 aSig, bSig, zSig;
1633 int16 expDiff;
1634
1635 aSig = extractFloat32Frac( a );
1636 aExp = extractFloat32Exp( a );
1637 bSig = extractFloat32Frac( b );
1638 bExp = extractFloat32Exp( b );
1639 expDiff = aExp - bExp;
1640 aSig <<= 6;
1641 bSig <<= 6;
1642 if ( 0 < expDiff ) {
1643 if ( aExp == 0xFF ) {
1644 if ( aSig ) return propagateFloat32NaN( a, b );
1645 return a;
1646 }
1647 if ( bExp == 0 ) {
1648 --expDiff;
1649 }
1650 else {
1651 bSig |= 0x20000000;
1652 }
1653 shift32RightJamming( bSig, expDiff, &bSig );
1654 zExp = aExp;
1655 }
1656 else if ( expDiff < 0 ) {
1657 if ( bExp == 0xFF ) {
1658 if ( bSig ) return propagateFloat32NaN( a, b );
1659 return packFloat32( zSign, 0xFF, 0 );
1660 }
1661 if ( aExp == 0 ) {
1662 ++expDiff;
1663 }
1664 else {
1665 aSig |= 0x20000000;
1666 }
1667 shift32RightJamming( aSig, - expDiff, &aSig );
1668 zExp = bExp;
1669 }
1670 else {
1671 if ( aExp == 0xFF ) {
1672 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1673 return a;
1674 }
1675 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1676 zSig = 0x40000000 + aSig + bSig;
1677 zExp = aExp;
1678 goto roundAndPack;
1679 }
1680 aSig |= 0x20000000;
1681 zSig = ( aSig + bSig )<<1;
1682 --zExp;
1683 if ( (sbits32) zSig < 0 ) {
1684 zSig = aSig + bSig;
1685 ++zExp;
1686 }
1687 roundAndPack:
1688 return roundAndPackFloat32( zSign, zExp, zSig );
1689
1690 }
1691
1692 /*----------------------------------------------------------------------------
1693 | Returns the result of subtracting the absolute values of the single-
1694 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1695 | difference is negated before being returned. `zSign' is ignored if the
1696 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1697 | Standard for Binary Floating-Point Arithmetic.
1698 *----------------------------------------------------------------------------*/
1699
subFloat32Sigs(float32 a,float32 b,flag zSign)1700 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1701 {
1702 int16 aExp, bExp, zExp;
1703 bits32 aSig, bSig, zSig;
1704 int16 expDiff;
1705
1706 aSig = extractFloat32Frac( a );
1707 aExp = extractFloat32Exp( a );
1708 bSig = extractFloat32Frac( b );
1709 bExp = extractFloat32Exp( b );
1710 expDiff = aExp - bExp;
1711 aSig <<= 7;
1712 bSig <<= 7;
1713 if ( 0 < expDiff ) goto aExpBigger;
1714 if ( expDiff < 0 ) goto bExpBigger;
1715 if ( aExp == 0xFF ) {
1716 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1717 float_raise( float_flag_invalid );
1718 return float32_default_nan;
1719 }
1720 if ( aExp == 0 ) {
1721 aExp = 1;
1722 bExp = 1;
1723 }
1724 if ( bSig < aSig ) goto aBigger;
1725 if ( aSig < bSig ) goto bBigger;
1726 return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1727 bExpBigger:
1728 if ( bExp == 0xFF ) {
1729 if ( bSig ) return propagateFloat32NaN( a, b );
1730 return packFloat32( zSign ^ 1, 0xFF, 0 );
1731 }
1732 if ( aExp == 0 ) {
1733 ++expDiff;
1734 }
1735 else {
1736 aSig |= 0x40000000;
1737 }
1738 shift32RightJamming( aSig, - expDiff, &aSig );
1739 bSig |= 0x40000000;
1740 bBigger:
1741 zSig = bSig - aSig;
1742 zExp = bExp;
1743 zSign ^= 1;
1744 goto normalizeRoundAndPack;
1745 aExpBigger:
1746 if ( aExp == 0xFF ) {
1747 if ( aSig ) return propagateFloat32NaN( a, b );
1748 return a;
1749 }
1750 if ( bExp == 0 ) {
1751 --expDiff;
1752 }
1753 else {
1754 bSig |= 0x40000000;
1755 }
1756 shift32RightJamming( bSig, expDiff, &bSig );
1757 aSig |= 0x40000000;
1758 aBigger:
1759 zSig = aSig - bSig;
1760 zExp = aExp;
1761 normalizeRoundAndPack:
1762 --zExp;
1763 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1764
1765 }
1766
1767 /*----------------------------------------------------------------------------
1768 | Returns the result of adding the single-precision floating-point values `a'
1769 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1770 | Binary Floating-Point Arithmetic.
1771 *----------------------------------------------------------------------------*/
1772
float32_add(float32 a,float32 b)1773 float32 float32_add( float32 a, float32 b )
1774 {
1775 flag aSign, bSign;
1776
1777 aSign = extractFloat32Sign( a );
1778 bSign = extractFloat32Sign( b );
1779 if ( aSign == bSign ) {
1780 return addFloat32Sigs( a, b, aSign );
1781 }
1782 else {
1783 return subFloat32Sigs( a, b, aSign );
1784 }
1785
1786 }
1787
1788 /*----------------------------------------------------------------------------
1789 | Returns the result of subtracting the single-precision floating-point values
1790 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1791 | for Binary Floating-Point Arithmetic.
1792 *----------------------------------------------------------------------------*/
1793
float32_sub(float32 a,float32 b)1794 float32 float32_sub( float32 a, float32 b )
1795 {
1796 flag aSign, bSign;
1797
1798 aSign = extractFloat32Sign( a );
1799 bSign = extractFloat32Sign( b );
1800 if ( aSign == bSign ) {
1801 return subFloat32Sigs( a, b, aSign );
1802 }
1803 else {
1804 return addFloat32Sigs( a, b, aSign );
1805 }
1806
1807 }
1808
1809 /*----------------------------------------------------------------------------
1810 | Returns the result of multiplying the single-precision floating-point values
1811 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1812 | for Binary Floating-Point Arithmetic.
1813 *----------------------------------------------------------------------------*/
1814
float32_mul(float32 a,float32 b)1815 float32 float32_mul( float32 a, float32 b )
1816 {
1817 flag aSign, bSign, zSign;
1818 int16 aExp, bExp, zExp;
1819 bits32 aSig, bSig;
1820 bits64 zSig64;
1821 bits32 zSig;
1822
1823 aSig = extractFloat32Frac( a );
1824 aExp = extractFloat32Exp( a );
1825 aSign = extractFloat32Sign( a );
1826 bSig = extractFloat32Frac( b );
1827 bExp = extractFloat32Exp( b );
1828 bSign = extractFloat32Sign( b );
1829 zSign = aSign ^ bSign;
1830 if ( aExp == 0xFF ) {
1831 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1832 return propagateFloat32NaN( a, b );
1833 }
1834 if ( ( bExp | bSig ) == 0 ) {
1835 float_raise( float_flag_invalid );
1836 return float32_default_nan;
1837 }
1838 return packFloat32( zSign, 0xFF, 0 );
1839 }
1840 if ( bExp == 0xFF ) {
1841 if ( bSig ) return propagateFloat32NaN( a, b );
1842 if ( ( aExp | aSig ) == 0 ) {
1843 float_raise( float_flag_invalid );
1844 return float32_default_nan;
1845 }
1846 return packFloat32( zSign, 0xFF, 0 );
1847 }
1848 if ( aExp == 0 ) {
1849 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1850 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1851 }
1852 if ( bExp == 0 ) {
1853 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1854 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1855 }
1856 zExp = aExp + bExp - 0x7F;
1857 aSig = ( aSig | 0x00800000 )<<7;
1858 bSig = ( bSig | 0x00800000 )<<8;
1859 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1860 zSig = zSig64;
1861 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1862 zSig <<= 1;
1863 --zExp;
1864 }
1865 return roundAndPackFloat32( zSign, zExp, zSig );
1866
1867 }
1868
1869 /*----------------------------------------------------------------------------
1870 | Returns the result of dividing the single-precision floating-point value `a'
1871 | by the corresponding value `b'. The operation is performed according to the
1872 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1873 *----------------------------------------------------------------------------*/
1874
float32_div(float32 a,float32 b)1875 float32 float32_div( float32 a, float32 b )
1876 {
1877 flag aSign, bSign, zSign;
1878 int16 aExp, bExp, zExp;
1879 bits32 aSig, bSig, zSig;
1880
1881 aSig = extractFloat32Frac( a );
1882 aExp = extractFloat32Exp( a );
1883 aSign = extractFloat32Sign( a );
1884 bSig = extractFloat32Frac( b );
1885 bExp = extractFloat32Exp( b );
1886 bSign = extractFloat32Sign( b );
1887 zSign = aSign ^ bSign;
1888 if ( aExp == 0xFF ) {
1889 if ( aSig ) return propagateFloat32NaN( a, b );
1890 if ( bExp == 0xFF ) {
1891 if ( bSig ) return propagateFloat32NaN( a, b );
1892 float_raise( float_flag_invalid );
1893 return float32_default_nan;
1894 }
1895 return packFloat32( zSign, 0xFF, 0 );
1896 }
1897 if ( bExp == 0xFF ) {
1898 if ( bSig ) return propagateFloat32NaN( a, b );
1899 return packFloat32( zSign, 0, 0 );
1900 }
1901 if ( bExp == 0 ) {
1902 if ( bSig == 0 ) {
1903 if ( ( aExp | aSig ) == 0 ) {
1904 float_raise( float_flag_invalid );
1905 return float32_default_nan;
1906 }
1907 float_raise( float_flag_divbyzero );
1908 return packFloat32( zSign, 0xFF, 0 );
1909 }
1910 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1911 }
1912 if ( aExp == 0 ) {
1913 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1914 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1915 }
1916 zExp = aExp - bExp + 0x7D;
1917 aSig = ( aSig | 0x00800000 )<<7;
1918 bSig = ( bSig | 0x00800000 )<<8;
1919 if ( bSig <= ( aSig + aSig ) ) {
1920 aSig >>= 1;
1921 ++zExp;
1922 }
1923 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1924 if ( ( zSig & 0x3F ) == 0 ) {
1925 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1926 }
1927 return roundAndPackFloat32( zSign, zExp, zSig );
1928
1929 }
1930
1931 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1932 /*----------------------------------------------------------------------------
1933 | Returns the remainder of the single-precision floating-point value `a'
1934 | with respect to the corresponding value `b'. The operation is performed
1935 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1936 *----------------------------------------------------------------------------*/
1937
float32_rem(float32 a,float32 b)1938 float32 float32_rem( float32 a, float32 b )
1939 {
1940 flag aSign, bSign __unused, zSign;
1941 int16 aExp, bExp, expDiff;
1942 bits32 aSig, bSig;
1943 bits32 q;
1944 bits64 aSig64, bSig64, q64;
1945 bits32 alternateASig;
1946 sbits32 sigMean;
1947
1948 aSig = extractFloat32Frac( a );
1949 aExp = extractFloat32Exp( a );
1950 aSign = extractFloat32Sign( a );
1951 bSig = extractFloat32Frac( b );
1952 bExp = extractFloat32Exp( b );
1953 bSign = extractFloat32Sign( b );
1954 if ( aExp == 0xFF ) {
1955 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1956 return propagateFloat32NaN( a, b );
1957 }
1958 float_raise( float_flag_invalid );
1959 return float32_default_nan;
1960 }
1961 if ( bExp == 0xFF ) {
1962 if ( bSig ) return propagateFloat32NaN( a, b );
1963 return a;
1964 }
1965 if ( bExp == 0 ) {
1966 if ( bSig == 0 ) {
1967 float_raise( float_flag_invalid );
1968 return float32_default_nan;
1969 }
1970 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1971 }
1972 if ( aExp == 0 ) {
1973 if ( aSig == 0 ) return a;
1974 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1975 }
1976 expDiff = aExp - bExp;
1977 aSig |= 0x00800000;
1978 bSig |= 0x00800000;
1979 if ( expDiff < 32 ) {
1980 aSig <<= 8;
1981 bSig <<= 8;
1982 if ( expDiff < 0 ) {
1983 if ( expDiff < -1 ) return a;
1984 aSig >>= 1;
1985 }
1986 q = ( bSig <= aSig );
1987 if ( q ) aSig -= bSig;
1988 if ( 0 < expDiff ) {
1989 q = ( ( (bits64) aSig )<<32 ) / bSig;
1990 q >>= 32 - expDiff;
1991 bSig >>= 2;
1992 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1993 }
1994 else {
1995 aSig >>= 2;
1996 bSig >>= 2;
1997 }
1998 }
1999 else {
2000 if ( bSig <= aSig ) aSig -= bSig;
2001 aSig64 = ( (bits64) aSig )<<40;
2002 bSig64 = ( (bits64) bSig )<<40;
2003 expDiff -= 64;
2004 while ( 0 < expDiff ) {
2005 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2006 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2007 aSig64 = - ( ( bSig * q64 )<<38 );
2008 expDiff -= 62;
2009 }
2010 expDiff += 64;
2011 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2012 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2013 q = q64>>( 64 - expDiff );
2014 bSig <<= 6;
2015 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2016 }
2017 do {
2018 alternateASig = aSig;
2019 ++q;
2020 aSig -= bSig;
2021 } while ( 0 <= (sbits32) aSig );
2022 sigMean = aSig + alternateASig;
2023 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2024 aSig = alternateASig;
2025 }
2026 zSign = ( (sbits32) aSig < 0 );
2027 if ( zSign ) aSig = - aSig;
2028 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2029
2030 }
2031 #endif /* !SOFTFLOAT_FOR_GCC */
2032
2033 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2034
2035 /*----------------------------------------------------------------------------
2036 | Returns the square root of the single-precision floating-point value `a'.
2037 | The operation is performed according to the IEC/IEEE Standard for Binary
2038 | Floating-Point Arithmetic.
2039 *----------------------------------------------------------------------------*/
2040
float32_sqrt(float32 a)2041 float32 float32_sqrt( float32 a )
2042 {
2043 flag aSign;
2044 int16 aExp, zExp;
2045 bits32 aSig, zSig;
2046 bits64 rem, term;
2047
2048 aSig = extractFloat32Frac( a );
2049 aExp = extractFloat32Exp( a );
2050 aSign = extractFloat32Sign( a );
2051 if ( aExp == 0xFF ) {
2052 if ( aSig ) return propagateFloat32NaN( a, 0 );
2053 if ( ! aSign ) return a;
2054 float_raise( float_flag_invalid );
2055 return float32_default_nan;
2056 }
2057 if ( aSign ) {
2058 if ( ( aExp | aSig ) == 0 ) return a;
2059 float_raise( float_flag_invalid );
2060 return float32_default_nan;
2061 }
2062 if ( aExp == 0 ) {
2063 if ( aSig == 0 ) return 0;
2064 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2065 }
2066 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2067 aSig = ( aSig | 0x00800000 )<<8;
2068 zSig = estimateSqrt32( aExp, aSig ) + 2;
2069 if ( ( zSig & 0x7F ) <= 5 ) {
2070 if ( zSig < 2 ) {
2071 zSig = 0x7FFFFFFF;
2072 goto roundAndPack;
2073 }
2074 aSig >>= aExp & 1;
2075 term = ( (bits64) zSig ) * zSig;
2076 rem = ( ( (bits64) aSig )<<32 ) - term;
2077 while ( (sbits64) rem < 0 ) {
2078 --zSig;
2079 rem += ( ( (bits64) zSig )<<1 ) | 1;
2080 }
2081 zSig |= ( rem != 0 );
2082 }
2083 shift32RightJamming( zSig, 1, &zSig );
2084 roundAndPack:
2085 return roundAndPackFloat32( 0, zExp, zSig );
2086
2087 }
2088 #endif /* !SOFTFLOAT_FOR_GCC */
2089
2090 /*----------------------------------------------------------------------------
2091 | Returns 1 if the single-precision floating-point value `a' is equal to
2092 | the corresponding value `b', and 0 otherwise. The comparison is performed
2093 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2094 *----------------------------------------------------------------------------*/
2095
float32_eq(float32 a,float32 b)2096 flag float32_eq( float32 a, float32 b )
2097 {
2098
2099 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2100 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2101 ) {
2102 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2103 float_raise( float_flag_invalid );
2104 }
2105 return 0;
2106 }
2107 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2108
2109 }
2110
2111 /*----------------------------------------------------------------------------
2112 | Returns 1 if the single-precision floating-point value `a' is less than
2113 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2114 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2115 | Arithmetic.
2116 *----------------------------------------------------------------------------*/
2117
float32_le(float32 a,float32 b)2118 flag float32_le( float32 a, float32 b )
2119 {
2120 flag aSign, bSign;
2121
2122 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2123 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2124 ) {
2125 float_raise( float_flag_invalid );
2126 return 0;
2127 }
2128 aSign = extractFloat32Sign( a );
2129 bSign = extractFloat32Sign( b );
2130 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2131 return ( a == b ) || ( aSign ^ ( a < b ) );
2132
2133 }
2134
2135 /*----------------------------------------------------------------------------
2136 | Returns 1 if the single-precision floating-point value `a' is less than
2137 | the corresponding value `b', and 0 otherwise. The comparison is performed
2138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2139 *----------------------------------------------------------------------------*/
2140
float32_lt(float32 a,float32 b)2141 flag float32_lt( float32 a, float32 b )
2142 {
2143 flag aSign, bSign;
2144
2145 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2146 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2147 ) {
2148 float_raise( float_flag_invalid );
2149 return 0;
2150 }
2151 aSign = extractFloat32Sign( a );
2152 bSign = extractFloat32Sign( b );
2153 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2154 return ( a != b ) && ( aSign ^ ( a < b ) );
2155
2156 }
2157
2158 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2159 /*----------------------------------------------------------------------------
2160 | Returns 1 if the single-precision floating-point value `a' is equal to
2161 | the corresponding value `b', and 0 otherwise. The invalid exception is
2162 | raised if either operand is a NaN. Otherwise, the comparison is performed
2163 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2164 *----------------------------------------------------------------------------*/
2165
float32_eq_signaling(float32 a,float32 b)2166 flag float32_eq_signaling( float32 a, float32 b )
2167 {
2168
2169 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2170 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2171 ) {
2172 float_raise( float_flag_invalid );
2173 return 0;
2174 }
2175 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2176
2177 }
2178
2179 /*----------------------------------------------------------------------------
2180 | Returns 1 if the single-precision floating-point value `a' is less than or
2181 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2182 | cause an exception. Otherwise, the comparison is performed according to the
2183 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2184 *----------------------------------------------------------------------------*/
2185
float32_le_quiet(float32 a,float32 b)2186 flag float32_le_quiet( float32 a, float32 b )
2187 {
2188 flag aSign, bSign;
2189
2190 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2191 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2192 ) {
2193 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2194 float_raise( float_flag_invalid );
2195 }
2196 return 0;
2197 }
2198 aSign = extractFloat32Sign( a );
2199 bSign = extractFloat32Sign( b );
2200 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2201 return ( a == b ) || ( aSign ^ ( a < b ) );
2202
2203 }
2204
2205 /*----------------------------------------------------------------------------
2206 | Returns 1 if the single-precision floating-point value `a' is less than
2207 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2208 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2209 | Standard for Binary Floating-Point Arithmetic.
2210 *----------------------------------------------------------------------------*/
2211
float32_lt_quiet(float32 a,float32 b)2212 flag float32_lt_quiet( float32 a, float32 b )
2213 {
2214 flag aSign, bSign;
2215
2216 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2217 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2218 ) {
2219 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2220 float_raise( float_flag_invalid );
2221 }
2222 return 0;
2223 }
2224 aSign = extractFloat32Sign( a );
2225 bSign = extractFloat32Sign( b );
2226 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2227 return ( a != b ) && ( aSign ^ ( a < b ) );
2228
2229 }
2230 #endif /* !SOFTFLOAT_FOR_GCC */
2231
2232 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2233 /*----------------------------------------------------------------------------
2234 | Returns the result of converting the double-precision floating-point value
2235 | `a' to the 32-bit two's complement integer format. The conversion is
2236 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2237 | Arithmetic---which means in particular that the conversion is rounded
2238 | according to the current rounding mode. If `a' is a NaN, the largest
2239 | positive integer is returned. Otherwise, if the conversion overflows, the
2240 | largest integer with the same sign as `a' is returned.
2241 *----------------------------------------------------------------------------*/
2242
float64_to_int32(float64 a)2243 int32 float64_to_int32( float64 a )
2244 {
2245 flag aSign;
2246 int16 aExp, shiftCount;
2247 bits64 aSig;
2248
2249 aSig = extractFloat64Frac( a );
2250 aExp = extractFloat64Exp( a );
2251 aSign = extractFloat64Sign( a );
2252 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2253 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2254 shiftCount = 0x42C - aExp;
2255 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2256 return roundAndPackInt32( aSign, aSig );
2257
2258 }
2259 #endif /* !SOFTFLOAT_FOR_GCC */
2260
2261 /*----------------------------------------------------------------------------
2262 | Returns the result of converting the double-precision floating-point value
2263 | `a' to the 32-bit two's complement integer format. The conversion is
2264 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2265 | Arithmetic, except that the conversion is always rounded toward zero.
2266 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2267 | the conversion overflows, the largest integer with the same sign as `a' is
2268 | returned.
2269 *----------------------------------------------------------------------------*/
2270
float64_to_int32_round_to_zero(float64 a)2271 int32 float64_to_int32_round_to_zero( float64 a )
2272 {
2273 flag aSign;
2274 int16 aExp, shiftCount;
2275 bits64 aSig, savedASig;
2276 int32 z;
2277
2278 aSig = extractFloat64Frac( a );
2279 aExp = extractFloat64Exp( a );
2280 aSign = extractFloat64Sign( a );
2281 if ( 0x41E < aExp ) {
2282 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2283 goto invalid;
2284 }
2285 else if ( aExp < 0x3FF ) {
2286 if ( aExp || aSig ) float_set_inexact();
2287 return 0;
2288 }
2289 aSig |= LIT64( 0x0010000000000000 );
2290 shiftCount = 0x433 - aExp;
2291 savedASig = aSig;
2292 aSig >>= shiftCount;
2293 z = aSig;
2294 if ( aSign ) z = - z;
2295 if ( ( z < 0 ) ^ aSign ) {
2296 invalid:
2297 float_raise( float_flag_invalid );
2298 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2299 }
2300 if ( ( aSig<<shiftCount ) != savedASig ) {
2301 float_set_inexact();
2302 }
2303 return z;
2304
2305 }
2306
2307 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2308 /*----------------------------------------------------------------------------
2309 | Returns the result of converting the double-precision floating-point value
2310 | `a' to the 64-bit two's complement integer format. The conversion is
2311 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2312 | Arithmetic---which means in particular that the conversion is rounded
2313 | according to the current rounding mode. If `a' is a NaN, the largest
2314 | positive integer is returned. Otherwise, if the conversion overflows, the
2315 | largest integer with the same sign as `a' is returned.
2316 *----------------------------------------------------------------------------*/
2317
float64_to_int64(float64 a)2318 int64 float64_to_int64( float64 a )
2319 {
2320 flag aSign;
2321 int16 aExp, shiftCount;
2322 bits64 aSig, aSigExtra;
2323
2324 aSig = extractFloat64Frac( a );
2325 aExp = extractFloat64Exp( a );
2326 aSign = extractFloat64Sign( a );
2327 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2328 shiftCount = 0x433 - aExp;
2329 if ( shiftCount <= 0 ) {
2330 if ( 0x43E < aExp ) {
2331 float_raise( float_flag_invalid );
2332 if ( ! aSign
2333 || ( ( aExp == 0x7FF )
2334 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2335 ) {
2336 return LIT64( 0x7FFFFFFFFFFFFFFF );
2337 }
2338 return (sbits64) LIT64( 0x8000000000000000 );
2339 }
2340 aSigExtra = 0;
2341 aSig <<= - shiftCount;
2342 }
2343 else {
2344 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2345 }
2346 return roundAndPackInt64( aSign, aSig, aSigExtra );
2347
2348 }
2349
2350 /* like above, but result is unsigned */
float64_to_uint64(float64 a)2351 uint64 float64_to_uint64( float64 a )
2352 {
2353 flag aSign;
2354 int16 aExp, shiftCount;
2355 bits64 aSig, aSigExtra;
2356
2357 aSig = extractFloat64Frac( a );
2358 aExp = extractFloat64Exp( a );
2359 aSign = extractFloat64Sign( a );
2360
2361 if (aSign) {
2362 return float64_to_int64(a);
2363 }
2364
2365 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2366 shiftCount = 0x433 - aExp;
2367 if ( shiftCount <= 0 ) {
2368 if ( 0x43E < aExp ) {
2369 float_raise( float_flag_invalid );
2370 if ( ! aSign
2371 || ( ( aExp == 0x7FF )
2372 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2373 ) {
2374 return LIT64( 0x7FFFFFFFFFFFFFFF );
2375 }
2376 return (sbits64) LIT64( 0x8000000000000000 );
2377 }
2378 aSigExtra = 0;
2379 aSig <<= - shiftCount;
2380 }
2381 else {
2382 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2383 }
2384 return roundAndPackUInt64( aSig, aSigExtra );
2385
2386 }
2387
2388 /*----------------------------------------------------------------------------
2389 | Returns the result of converting the double-precision floating-point value
2390 | `a' to the 64-bit two's complement integer format. The conversion is
2391 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2392 | Arithmetic, except that the conversion is always rounded toward zero.
2393 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2394 | the conversion overflows, the largest integer with the same sign as `a' is
2395 | returned.
2396 *----------------------------------------------------------------------------*/
2397
float64_to_int64_round_to_zero(float64 a)2398 int64 float64_to_int64_round_to_zero( float64 a )
2399 {
2400 flag aSign;
2401 int16 aExp, shiftCount;
2402 bits64 aSig;
2403 int64 z;
2404
2405 aSig = extractFloat64Frac( a );
2406 aExp = extractFloat64Exp( a );
2407 aSign = extractFloat64Sign( a );
2408 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2409 shiftCount = aExp - 0x433;
2410 if ( 0 <= shiftCount ) {
2411 if ( 0x43E <= aExp ) {
2412 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2413 float_raise( float_flag_invalid );
2414 if ( ! aSign
2415 || ( ( aExp == 0x7FF )
2416 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2417 ) {
2418 return LIT64( 0x7FFFFFFFFFFFFFFF );
2419 }
2420 }
2421 return (sbits64) LIT64( 0x8000000000000000 );
2422 }
2423 z = aSig<<shiftCount;
2424 }
2425 else {
2426 if ( aExp < 0x3FE ) {
2427 if ( aExp | aSig ) float_set_inexact();
2428 return 0;
2429 }
2430 z = aSig>>( - shiftCount );
2431 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2432 float_set_inexact();
2433 }
2434 }
2435 if ( aSign ) z = - z;
2436 return z;
2437
2438 }
2439 #endif /* !SOFTFLOAT_FOR_GCC */
2440
2441 /*----------------------------------------------------------------------------
2442 | Returns the result of converting the double-precision floating-point value
2443 | `a' to the single-precision floating-point format. The conversion is
2444 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2445 | Arithmetic.
2446 *----------------------------------------------------------------------------*/
2447
float64_to_float32(float64 a)2448 float32 float64_to_float32( float64 a )
2449 {
2450 flag aSign;
2451 int16 aExp;
2452 bits64 aSig;
2453 bits32 zSig;
2454
2455 aSig = extractFloat64Frac( a );
2456 aExp = extractFloat64Exp( a );
2457 aSign = extractFloat64Sign( a );
2458 if ( aExp == 0x7FF ) {
2459 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2460 return packFloat32( aSign, 0xFF, 0 );
2461 }
2462 shift64RightJamming( aSig, 22, &aSig );
2463 zSig = aSig;
2464 if ( aExp || zSig ) {
2465 zSig |= 0x40000000;
2466 aExp -= 0x381;
2467 }
2468 return roundAndPackFloat32( aSign, aExp, zSig );
2469
2470 }
2471
2472 #ifdef FLOATX80
2473
2474 /*----------------------------------------------------------------------------
2475 | Returns the result of converting the double-precision floating-point value
2476 | `a' to the extended double-precision floating-point format. The conversion
2477 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2478 | Arithmetic.
2479 *----------------------------------------------------------------------------*/
2480
float64_to_floatx80(float64 a)2481 floatx80 float64_to_floatx80( float64 a )
2482 {
2483 flag aSign;
2484 int16 aExp;
2485 bits64 aSig;
2486
2487 aSig = extractFloat64Frac( a );
2488 aExp = extractFloat64Exp( a );
2489 aSign = extractFloat64Sign( a );
2490 if ( aExp == 0x7FF ) {
2491 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2492 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2493 }
2494 if ( aExp == 0 ) {
2495 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2496 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2497 }
2498 return
2499 packFloatx80(
2500 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2501
2502 }
2503
2504 #endif
2505
2506 #ifdef FLOAT128
2507
2508 /*----------------------------------------------------------------------------
2509 | Returns the result of converting the double-precision floating-point value
2510 | `a' to the quadruple-precision floating-point format. The conversion is
2511 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2512 | Arithmetic.
2513 *----------------------------------------------------------------------------*/
2514
float64_to_float128(float64 a)2515 float128 float64_to_float128( float64 a )
2516 {
2517 flag aSign;
2518 int16 aExp;
2519 bits64 aSig, zSig0, zSig1;
2520
2521 aSig = extractFloat64Frac( a );
2522 aExp = extractFloat64Exp( a );
2523 aSign = extractFloat64Sign( a );
2524 if ( aExp == 0x7FF ) {
2525 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2526 return packFloat128( aSign, 0x7FFF, 0, 0 );
2527 }
2528 if ( aExp == 0 ) {
2529 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2530 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2531 --aExp;
2532 }
2533 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2534 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2535
2536 }
2537
2538 #endif
2539
2540 #ifndef SOFTFLOAT_FOR_GCC
2541 /*----------------------------------------------------------------------------
2542 | Rounds the double-precision floating-point value `a' to an integer, and
2543 | returns the result as a double-precision floating-point value. The
2544 | operation is performed according to the IEC/IEEE Standard for Binary
2545 | Floating-Point Arithmetic.
2546 *----------------------------------------------------------------------------*/
2547
float64_round_to_int(float64 a)2548 float64 float64_round_to_int( float64 a )
2549 {
2550 flag aSign;
2551 int16 aExp;
2552 bits64 lastBitMask, roundBitsMask;
2553 int8 roundingMode;
2554 float64 z;
2555
2556 aExp = extractFloat64Exp( a );
2557 if ( 0x433 <= aExp ) {
2558 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2559 return propagateFloat64NaN( a, a );
2560 }
2561 return a;
2562 }
2563 if ( aExp < 0x3FF ) {
2564 if ( (bits64) ( a<<1 ) == 0 ) return a;
2565 float_set_inexact();
2566 aSign = extractFloat64Sign( a );
2567 switch ( float_rounding_mode() ) {
2568 case float_round_nearest_even:
2569 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2570 return packFloat64( aSign, 0x3FF, 0 );
2571 }
2572 break;
2573 case float_round_down:
2574 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2575 case float_round_up:
2576 return
2577 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2578 }
2579 return packFloat64( aSign, 0, 0 );
2580 }
2581 lastBitMask = 1;
2582 lastBitMask <<= 0x433 - aExp;
2583 roundBitsMask = lastBitMask - 1;
2584 z = a;
2585 roundingMode = float_rounding_mode();
2586 if ( roundingMode == float_round_nearest_even ) {
2587 z += lastBitMask>>1;
2588 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2589 }
2590 else if ( roundingMode != float_round_to_zero ) {
2591 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2592 z += roundBitsMask;
2593 }
2594 }
2595 z &= ~ roundBitsMask;
2596 if ( z != a ) float_set_inexact();
2597 return z;
2598
2599 }
2600 #endif
2601
2602 /*----------------------------------------------------------------------------
2603 | Returns the result of adding the absolute values of the double-precision
2604 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2605 | before being returned. `zSign' is ignored if the result is a NaN.
2606 | The addition is performed according to the IEC/IEEE Standard for Binary
2607 | Floating-Point Arithmetic.
2608 *----------------------------------------------------------------------------*/
2609
addFloat64Sigs(float64 a,float64 b,flag zSign)2610 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2611 {
2612 int16 aExp, bExp, zExp;
2613 bits64 aSig, bSig, zSig;
2614 int16 expDiff;
2615
2616 aSig = extractFloat64Frac( a );
2617 aExp = extractFloat64Exp( a );
2618 bSig = extractFloat64Frac( b );
2619 bExp = extractFloat64Exp( b );
2620 expDiff = aExp - bExp;
2621 aSig <<= 9;
2622 bSig <<= 9;
2623 if ( 0 < expDiff ) {
2624 if ( aExp == 0x7FF ) {
2625 if ( aSig ) return propagateFloat64NaN( a, b );
2626 return a;
2627 }
2628 if ( bExp == 0 ) {
2629 --expDiff;
2630 }
2631 else {
2632 bSig |= LIT64( 0x2000000000000000 );
2633 }
2634 shift64RightJamming( bSig, expDiff, &bSig );
2635 zExp = aExp;
2636 }
2637 else if ( expDiff < 0 ) {
2638 if ( bExp == 0x7FF ) {
2639 if ( bSig ) return propagateFloat64NaN( a, b );
2640 return packFloat64( zSign, 0x7FF, 0 );
2641 }
2642 if ( aExp == 0 ) {
2643 ++expDiff;
2644 }
2645 else {
2646 aSig |= LIT64( 0x2000000000000000 );
2647 }
2648 shift64RightJamming( aSig, - expDiff, &aSig );
2649 zExp = bExp;
2650 }
2651 else {
2652 if ( aExp == 0x7FF ) {
2653 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2654 return a;
2655 }
2656 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2657 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2658 zExp = aExp;
2659 goto roundAndPack;
2660 }
2661 aSig |= LIT64( 0x2000000000000000 );
2662 zSig = ( aSig + bSig )<<1;
2663 --zExp;
2664 if ( (sbits64) zSig < 0 ) {
2665 zSig = aSig + bSig;
2666 ++zExp;
2667 }
2668 roundAndPack:
2669 return roundAndPackFloat64( zSign, zExp, zSig );
2670
2671 }
2672
2673 /*----------------------------------------------------------------------------
2674 | Returns the result of subtracting the absolute values of the double-
2675 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2676 | difference is negated before being returned. `zSign' is ignored if the
2677 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2678 | Standard for Binary Floating-Point Arithmetic.
2679 *----------------------------------------------------------------------------*/
2680
subFloat64Sigs(float64 a,float64 b,flag zSign)2681 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2682 {
2683 int16 aExp, bExp, zExp;
2684 bits64 aSig, bSig, zSig;
2685 int16 expDiff;
2686
2687 aSig = extractFloat64Frac( a );
2688 aExp = extractFloat64Exp( a );
2689 bSig = extractFloat64Frac( b );
2690 bExp = extractFloat64Exp( b );
2691 expDiff = aExp - bExp;
2692 aSig <<= 10;
2693 bSig <<= 10;
2694 if ( 0 < expDiff ) goto aExpBigger;
2695 if ( expDiff < 0 ) goto bExpBigger;
2696 if ( aExp == 0x7FF ) {
2697 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2698 float_raise( float_flag_invalid );
2699 return float64_default_nan;
2700 }
2701 if ( aExp == 0 ) {
2702 aExp = 1;
2703 bExp = 1;
2704 }
2705 if ( bSig < aSig ) goto aBigger;
2706 if ( aSig < bSig ) goto bBigger;
2707 return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2708 bExpBigger:
2709 if ( bExp == 0x7FF ) {
2710 if ( bSig ) return propagateFloat64NaN( a, b );
2711 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2712 }
2713 if ( aExp == 0 ) {
2714 ++expDiff;
2715 }
2716 else {
2717 aSig |= LIT64( 0x4000000000000000 );
2718 }
2719 shift64RightJamming( aSig, - expDiff, &aSig );
2720 bSig |= LIT64( 0x4000000000000000 );
2721 bBigger:
2722 zSig = bSig - aSig;
2723 zExp = bExp;
2724 zSign ^= 1;
2725 goto normalizeRoundAndPack;
2726 aExpBigger:
2727 if ( aExp == 0x7FF ) {
2728 if ( aSig ) return propagateFloat64NaN( a, b );
2729 return a;
2730 }
2731 if ( bExp == 0 ) {
2732 --expDiff;
2733 }
2734 else {
2735 bSig |= LIT64( 0x4000000000000000 );
2736 }
2737 shift64RightJamming( bSig, expDiff, &bSig );
2738 aSig |= LIT64( 0x4000000000000000 );
2739 aBigger:
2740 zSig = aSig - bSig;
2741 zExp = aExp;
2742 normalizeRoundAndPack:
2743 --zExp;
2744 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2745
2746 }
2747
2748 /*----------------------------------------------------------------------------
2749 | Returns the result of adding the double-precision floating-point values `a'
2750 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2751 | Binary Floating-Point Arithmetic.
2752 *----------------------------------------------------------------------------*/
2753
float64_add(float64 a,float64 b)2754 float64 float64_add( float64 a, float64 b )
2755 {
2756 flag aSign, bSign;
2757
2758 aSign = extractFloat64Sign( a );
2759 bSign = extractFloat64Sign( b );
2760 if ( aSign == bSign ) {
2761 return addFloat64Sigs( a, b, aSign );
2762 }
2763 else {
2764 return subFloat64Sigs( a, b, aSign );
2765 }
2766
2767 }
2768
2769 /*----------------------------------------------------------------------------
2770 | Returns the result of subtracting the double-precision floating-point values
2771 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2772 | for Binary Floating-Point Arithmetic.
2773 *----------------------------------------------------------------------------*/
2774
float64_sub(float64 a,float64 b)2775 float64 float64_sub( float64 a, float64 b )
2776 {
2777 flag aSign, bSign;
2778
2779 aSign = extractFloat64Sign( a );
2780 bSign = extractFloat64Sign( b );
2781 if ( aSign == bSign ) {
2782 return subFloat64Sigs( a, b, aSign );
2783 }
2784 else {
2785 return addFloat64Sigs( a, b, aSign );
2786 }
2787
2788 }
2789
2790 /*----------------------------------------------------------------------------
2791 | Returns the result of multiplying the double-precision floating-point values
2792 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2793 | for Binary Floating-Point Arithmetic.
2794 *----------------------------------------------------------------------------*/
2795
float64_mul(float64 a,float64 b)2796 float64 float64_mul( float64 a, float64 b )
2797 {
2798 flag aSign, bSign, zSign;
2799 int16 aExp, bExp, zExp;
2800 bits64 aSig, bSig, zSig0, zSig1;
2801
2802 aSig = extractFloat64Frac( a );
2803 aExp = extractFloat64Exp( a );
2804 aSign = extractFloat64Sign( a );
2805 bSig = extractFloat64Frac( b );
2806 bExp = extractFloat64Exp( b );
2807 bSign = extractFloat64Sign( b );
2808 zSign = aSign ^ bSign;
2809 if ( aExp == 0x7FF ) {
2810 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2811 return propagateFloat64NaN( a, b );
2812 }
2813 if ( ( bExp | bSig ) == 0 ) {
2814 float_raise( float_flag_invalid );
2815 return float64_default_nan;
2816 }
2817 return packFloat64( zSign, 0x7FF, 0 );
2818 }
2819 if ( bExp == 0x7FF ) {
2820 if ( bSig ) return propagateFloat64NaN( a, b );
2821 if ( ( aExp | aSig ) == 0 ) {
2822 float_raise( float_flag_invalid );
2823 return float64_default_nan;
2824 }
2825 return packFloat64( zSign, 0x7FF, 0 );
2826 }
2827 if ( aExp == 0 ) {
2828 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2829 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2830 }
2831 if ( bExp == 0 ) {
2832 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2833 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2834 }
2835 zExp = aExp + bExp - 0x3FF;
2836 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2837 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2838 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2839 zSig0 |= ( zSig1 != 0 );
2840 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2841 zSig0 <<= 1;
2842 --zExp;
2843 }
2844 return roundAndPackFloat64( zSign, zExp, zSig0 );
2845
2846 }
2847
2848 /*----------------------------------------------------------------------------
2849 | Returns the result of dividing the double-precision floating-point value `a'
2850 | by the corresponding value `b'. The operation is performed according to
2851 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2852 *----------------------------------------------------------------------------*/
2853
float64_div(float64 a,float64 b)2854 float64 float64_div( float64 a, float64 b )
2855 {
2856 flag aSign, bSign, zSign;
2857 int16 aExp, bExp, zExp;
2858 bits64 aSig, bSig, zSig;
2859 bits64 rem0, rem1;
2860 bits64 term0, term1;
2861
2862 aSig = extractFloat64Frac( a );
2863 aExp = extractFloat64Exp( a );
2864 aSign = extractFloat64Sign( a );
2865 bSig = extractFloat64Frac( b );
2866 bExp = extractFloat64Exp( b );
2867 bSign = extractFloat64Sign( b );
2868 zSign = aSign ^ bSign;
2869 if ( aExp == 0x7FF ) {
2870 if ( aSig ) return propagateFloat64NaN( a, b );
2871 if ( bExp == 0x7FF ) {
2872 if ( bSig ) return propagateFloat64NaN( a, b );
2873 float_raise( float_flag_invalid );
2874 return float64_default_nan;
2875 }
2876 return packFloat64( zSign, 0x7FF, 0 );
2877 }
2878 if ( bExp == 0x7FF ) {
2879 if ( bSig ) return propagateFloat64NaN( a, b );
2880 return packFloat64( zSign, 0, 0 );
2881 }
2882 if ( bExp == 0 ) {
2883 if ( bSig == 0 ) {
2884 if ( ( aExp | aSig ) == 0 ) {
2885 float_raise( float_flag_invalid );
2886 return float64_default_nan;
2887 }
2888 float_raise( float_flag_divbyzero );
2889 return packFloat64( zSign, 0x7FF, 0 );
2890 }
2891 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2892 }
2893 if ( aExp == 0 ) {
2894 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2895 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2896 }
2897 zExp = aExp - bExp + 0x3FD;
2898 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2899 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2900 if ( bSig <= ( aSig + aSig ) ) {
2901 aSig >>= 1;
2902 ++zExp;
2903 }
2904 zSig = estimateDiv128To64( aSig, 0, bSig );
2905 if ( ( zSig & 0x1FF ) <= 2 ) {
2906 mul64To128( bSig, zSig, &term0, &term1 );
2907 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2908 while ( (sbits64) rem0 < 0 ) {
2909 --zSig;
2910 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2911 }
2912 zSig |= ( rem1 != 0 );
2913 }
2914 return roundAndPackFloat64( zSign, zExp, zSig );
2915
2916 }
2917
2918 #ifndef SOFTFLOAT_FOR_GCC
2919 /*----------------------------------------------------------------------------
2920 | Returns the remainder of the double-precision floating-point value `a'
2921 | with respect to the corresponding value `b'. The operation is performed
2922 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2923 *----------------------------------------------------------------------------*/
2924
float64_rem(float64 a,float64 b)2925 float64 float64_rem( float64 a, float64 b )
2926 {
2927 flag aSign, bSign __unused, zSign;
2928 int16 aExp, bExp, expDiff;
2929 bits64 aSig, bSig;
2930 bits64 q, alternateASig;
2931 sbits64 sigMean;
2932
2933 aSig = extractFloat64Frac( a );
2934 aExp = extractFloat64Exp( a );
2935 aSign = extractFloat64Sign( a );
2936 bSig = extractFloat64Frac( b );
2937 bExp = extractFloat64Exp( b );
2938 bSign = extractFloat64Sign( b );
2939 if ( aExp == 0x7FF ) {
2940 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2941 return propagateFloat64NaN( a, b );
2942 }
2943 float_raise( float_flag_invalid );
2944 return float64_default_nan;
2945 }
2946 if ( bExp == 0x7FF ) {
2947 if ( bSig ) return propagateFloat64NaN( a, b );
2948 return a;
2949 }
2950 if ( bExp == 0 ) {
2951 if ( bSig == 0 ) {
2952 float_raise( float_flag_invalid );
2953 return float64_default_nan;
2954 }
2955 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2956 }
2957 if ( aExp == 0 ) {
2958 if ( aSig == 0 ) return a;
2959 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2960 }
2961 expDiff = aExp - bExp;
2962 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2963 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2964 if ( expDiff < 0 ) {
2965 if ( expDiff < -1 ) return a;
2966 aSig >>= 1;
2967 }
2968 q = ( bSig <= aSig );
2969 if ( q ) aSig -= bSig;
2970 expDiff -= 64;
2971 while ( 0 < expDiff ) {
2972 q = estimateDiv128To64( aSig, 0, bSig );
2973 q = ( 2 < q ) ? q - 2 : 0;
2974 aSig = - ( ( bSig>>2 ) * q );
2975 expDiff -= 62;
2976 }
2977 expDiff += 64;
2978 if ( 0 < expDiff ) {
2979 q = estimateDiv128To64( aSig, 0, bSig );
2980 q = ( 2 < q ) ? q - 2 : 0;
2981 q >>= 64 - expDiff;
2982 bSig >>= 2;
2983 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2984 }
2985 else {
2986 aSig >>= 2;
2987 bSig >>= 2;
2988 }
2989 do {
2990 alternateASig = aSig;
2991 ++q;
2992 aSig -= bSig;
2993 } while ( 0 <= (sbits64) aSig );
2994 sigMean = aSig + alternateASig;
2995 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2996 aSig = alternateASig;
2997 }
2998 zSign = ( (sbits64) aSig < 0 );
2999 if ( zSign ) aSig = - aSig;
3000 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3001
3002 }
3003
3004 /*----------------------------------------------------------------------------
3005 | Returns the square root of the double-precision floating-point value `a'.
3006 | The operation is performed according to the IEC/IEEE Standard for Binary
3007 | Floating-Point Arithmetic.
3008 *----------------------------------------------------------------------------*/
3009
float64_sqrt(float64 a)3010 float64 float64_sqrt( float64 a )
3011 {
3012 flag aSign;
3013 int16 aExp, zExp;
3014 bits64 aSig, zSig, doubleZSig;
3015 bits64 rem0, rem1, term0, term1;
3016
3017 aSig = extractFloat64Frac( a );
3018 aExp = extractFloat64Exp( a );
3019 aSign = extractFloat64Sign( a );
3020 if ( aExp == 0x7FF ) {
3021 if ( aSig ) return propagateFloat64NaN( a, a );
3022 if ( ! aSign ) return a;
3023 float_raise( float_flag_invalid );
3024 return float64_default_nan;
3025 }
3026 if ( aSign ) {
3027 if ( ( aExp | aSig ) == 0 ) return a;
3028 float_raise( float_flag_invalid );
3029 return float64_default_nan;
3030 }
3031 if ( aExp == 0 ) {
3032 if ( aSig == 0 ) return 0;
3033 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3034 }
3035 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3036 aSig |= LIT64( 0x0010000000000000 );
3037 zSig = estimateSqrt32( aExp, aSig>>21 );
3038 aSig <<= 9 - ( aExp & 1 );
3039 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3040 if ( ( zSig & 0x1FF ) <= 5 ) {
3041 doubleZSig = zSig<<1;
3042 mul64To128( zSig, zSig, &term0, &term1 );
3043 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3044 while ( (sbits64) rem0 < 0 ) {
3045 --zSig;
3046 doubleZSig -= 2;
3047 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3048 }
3049 zSig |= ( ( rem0 | rem1 ) != 0 );
3050 }
3051 return roundAndPackFloat64( 0, zExp, zSig );
3052
3053 }
3054 #endif
3055
3056 /*----------------------------------------------------------------------------
3057 | Returns 1 if the double-precision floating-point value `a' is equal to the
3058 | corresponding value `b', and 0 otherwise. The comparison is performed
3059 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3060 *----------------------------------------------------------------------------*/
3061
float64_eq(float64 a,float64 b)3062 flag float64_eq( float64 a, float64 b )
3063 {
3064
3065 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3066 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3067 ) {
3068 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3069 float_raise( float_flag_invalid );
3070 }
3071 return 0;
3072 }
3073 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3074
3075 }
3076
3077 /*----------------------------------------------------------------------------
3078 | Returns 1 if the double-precision floating-point value `a' is less than or
3079 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3080 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3081 | Arithmetic.
3082 *----------------------------------------------------------------------------*/
3083
float64_le(float64 a,float64 b)3084 flag float64_le( float64 a, float64 b )
3085 {
3086 flag aSign, bSign;
3087
3088 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3089 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3090 ) {
3091 float_raise( float_flag_invalid );
3092 return 0;
3093 }
3094 aSign = extractFloat64Sign( a );
3095 bSign = extractFloat64Sign( b );
3096 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3097 return ( a == b ) || ( aSign ^ ( a < b ) );
3098
3099 }
3100
3101 /*----------------------------------------------------------------------------
3102 | Returns 1 if the double-precision floating-point value `a' is less than
3103 | the corresponding value `b', and 0 otherwise. The comparison is performed
3104 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3105 *----------------------------------------------------------------------------*/
3106
float64_lt(float64 a,float64 b)3107 flag float64_lt( float64 a, float64 b )
3108 {
3109 flag aSign, bSign;
3110
3111 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3112 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3113 ) {
3114 float_raise( float_flag_invalid );
3115 return 0;
3116 }
3117 aSign = extractFloat64Sign( a );
3118 bSign = extractFloat64Sign( b );
3119 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3120 return ( a != b ) && ( aSign ^ ( a < b ) );
3121
3122 }
3123
3124 #ifndef SOFTFLOAT_FOR_GCC
3125 /*----------------------------------------------------------------------------
3126 | Returns 1 if the double-precision floating-point value `a' is equal to the
3127 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3128 | if either operand is a NaN. Otherwise, the comparison is performed
3129 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3130 *----------------------------------------------------------------------------*/
3131
float64_eq_signaling(float64 a,float64 b)3132 flag float64_eq_signaling( float64 a, float64 b )
3133 {
3134
3135 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3136 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3137 ) {
3138 float_raise( float_flag_invalid );
3139 return 0;
3140 }
3141 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3142
3143 }
3144
3145 /*----------------------------------------------------------------------------
3146 | Returns 1 if the double-precision floating-point value `a' is less than or
3147 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3148 | cause an exception. Otherwise, the comparison is performed according to the
3149 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3150 *----------------------------------------------------------------------------*/
3151
float64_le_quiet(float64 a,float64 b)3152 flag float64_le_quiet( float64 a, float64 b )
3153 {
3154 flag aSign, bSign;
3155
3156 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3157 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3158 ) {
3159 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3160 float_raise( float_flag_invalid );
3161 }
3162 return 0;
3163 }
3164 aSign = extractFloat64Sign( a );
3165 bSign = extractFloat64Sign( b );
3166 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3167 return ( a == b ) || ( aSign ^ ( a < b ) );
3168
3169 }
3170
3171 /*----------------------------------------------------------------------------
3172 | Returns 1 if the double-precision floating-point value `a' is less than
3173 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3174 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3175 | Standard for Binary Floating-Point Arithmetic.
3176 *----------------------------------------------------------------------------*/
3177
float64_lt_quiet(float64 a,float64 b)3178 flag float64_lt_quiet( float64 a, float64 b )
3179 {
3180 flag aSign, bSign;
3181
3182 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3183 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3184 ) {
3185 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3186 float_raise( float_flag_invalid );
3187 }
3188 return 0;
3189 }
3190 aSign = extractFloat64Sign( a );
3191 bSign = extractFloat64Sign( b );
3192 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3193 return ( a != b ) && ( aSign ^ ( a < b ) );
3194
3195 }
3196 #endif
3197
3198 #ifdef FLOATX80
3199
3200 /*----------------------------------------------------------------------------
3201 | Returns the result of converting the extended double-precision floating-
3202 | point value `a' to the 32-bit two's complement integer format. The
3203 | conversion is performed according to the IEC/IEEE Standard for Binary
3204 | Floating-Point Arithmetic---which means in particular that the conversion
3205 | is rounded according to the current rounding mode. If `a' is a NaN, the
3206 | largest positive integer is returned. Otherwise, if the conversion
3207 | overflows, the largest integer with the same sign as `a' is returned.
3208 *----------------------------------------------------------------------------*/
3209
floatx80_to_int32(floatx80 a)3210 int32 floatx80_to_int32( floatx80 a )
3211 {
3212 flag aSign;
3213 int32 aExp, shiftCount;
3214 bits64 aSig;
3215
3216 aSig = extractFloatx80Frac( a );
3217 aExp = extractFloatx80Exp( a );
3218 aSign = extractFloatx80Sign( a );
3219 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3220 shiftCount = 0x4037 - aExp;
3221 if ( shiftCount <= 0 ) shiftCount = 1;
3222 shift64RightJamming( aSig, shiftCount, &aSig );
3223 return roundAndPackInt32( aSign, aSig );
3224
3225 }
3226
3227 /*----------------------------------------------------------------------------
3228 | Returns the result of converting the extended double-precision floating-
3229 | point value `a' to the 32-bit two's complement integer format. The
3230 | conversion is performed according to the IEC/IEEE Standard for Binary
3231 | Floating-Point Arithmetic, except that the conversion is always rounded
3232 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3233 | Otherwise, if the conversion overflows, the largest integer with the same
3234 | sign as `a' is returned.
3235 *----------------------------------------------------------------------------*/
3236
floatx80_to_int32_round_to_zero(floatx80 a)3237 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3238 {
3239 flag aSign;
3240 int32 aExp, shiftCount;
3241 bits64 aSig, savedASig;
3242 int32 z;
3243
3244 aSig = extractFloatx80Frac( a );
3245 aExp = extractFloatx80Exp( a );
3246 aSign = extractFloatx80Sign( a );
3247 if ( 0x401E < aExp ) {
3248 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3249 goto invalid;
3250 }
3251 else if ( aExp < 0x3FFF ) {
3252 if ( aExp || aSig ) float_set_inexact();
3253 return 0;
3254 }
3255 shiftCount = 0x403E - aExp;
3256 savedASig = aSig;
3257 aSig >>= shiftCount;
3258 z = aSig;
3259 if ( aSign ) z = - z;
3260 if ( ( z < 0 ) ^ aSign ) {
3261 invalid:
3262 float_raise( float_flag_invalid );
3263 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3264 }
3265 if ( ( aSig<<shiftCount ) != savedASig ) {
3266 float_set_inexact();
3267 }
3268 return z;
3269
3270 }
3271
3272 /*----------------------------------------------------------------------------
3273 | Returns the result of converting the extended double-precision floating-
3274 | point value `a' to the 64-bit two's complement integer format. The
3275 | conversion is performed according to the IEC/IEEE Standard for Binary
3276 | Floating-Point Arithmetic---which means in particular that the conversion
3277 | is rounded according to the current rounding mode. If `a' is a NaN,
3278 | the largest positive integer is returned. Otherwise, if the conversion
3279 | overflows, the largest integer with the same sign as `a' is returned.
3280 *----------------------------------------------------------------------------*/
3281
floatx80_to_int64(floatx80 a)3282 int64 floatx80_to_int64( floatx80 a )
3283 {
3284 flag aSign;
3285 int32 aExp, shiftCount;
3286 bits64 aSig, aSigExtra;
3287
3288 aSig = extractFloatx80Frac( a );
3289 aExp = extractFloatx80Exp( a );
3290 aSign = extractFloatx80Sign( a );
3291 shiftCount = 0x403E - aExp;
3292 if ( shiftCount <= 0 ) {
3293 if ( shiftCount ) {
3294 float_raise( float_flag_invalid );
3295 if ( ! aSign
3296 || ( ( aExp == 0x7FFF )
3297 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3298 ) {
3299 return LIT64( 0x7FFFFFFFFFFFFFFF );
3300 }
3301 return (sbits64) LIT64( 0x8000000000000000 );
3302 }
3303 aSigExtra = 0;
3304 }
3305 else {
3306 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3307 }
3308 return roundAndPackInt64( aSign, aSig, aSigExtra );
3309
3310 }
3311
3312 /*----------------------------------------------------------------------------
3313 | Returns the result of converting the extended double-precision floating-
3314 | point value `a' to the 64-bit two's complement integer format. The
3315 | conversion is performed according to the IEC/IEEE Standard for Binary
3316 | Floating-Point Arithmetic, except that the conversion is always rounded
3317 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3318 | Otherwise, if the conversion overflows, the largest integer with the same
3319 | sign as `a' is returned.
3320 *----------------------------------------------------------------------------*/
3321
floatx80_to_int64_round_to_zero(floatx80 a)3322 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3323 {
3324 flag aSign;
3325 int32 aExp, shiftCount;
3326 bits64 aSig;
3327 int64 z;
3328
3329 aSig = extractFloatx80Frac( a );
3330 aExp = extractFloatx80Exp( a );
3331 aSign = extractFloatx80Sign( a );
3332 shiftCount = aExp - 0x403E;
3333 if ( 0 <= shiftCount ) {
3334 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3335 if ( ( a.high != 0xC03E ) || aSig ) {
3336 float_raise( float_flag_invalid );
3337 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3338 return LIT64( 0x7FFFFFFFFFFFFFFF );
3339 }
3340 }
3341 return (sbits64) LIT64( 0x8000000000000000 );
3342 }
3343 else if ( aExp < 0x3FFF ) {
3344 if ( aExp | aSig ) float_set_inexact();
3345 return 0;
3346 }
3347 z = aSig>>( - shiftCount );
3348 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3349 float_set_inexact();
3350 }
3351 if ( aSign ) z = - z;
3352 return z;
3353
3354 }
3355
3356 /*----------------------------------------------------------------------------
3357 | Returns the result of converting the extended double-precision floating-
3358 | point value `a' to the single-precision floating-point format. The
3359 | conversion is performed according to the IEC/IEEE Standard for Binary
3360 | Floating-Point Arithmetic.
3361 *----------------------------------------------------------------------------*/
3362
floatx80_to_float32(floatx80 a)3363 float32 floatx80_to_float32( floatx80 a )
3364 {
3365 flag aSign;
3366 int32 aExp;
3367 bits64 aSig;
3368
3369 aSig = extractFloatx80Frac( a );
3370 aExp = extractFloatx80Exp( a );
3371 aSign = extractFloatx80Sign( a );
3372 if ( aExp == 0x7FFF ) {
3373 if ( (bits64) ( aSig<<1 ) ) {
3374 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3375 }
3376 return packFloat32( aSign, 0xFF, 0 );
3377 }
3378 shift64RightJamming( aSig, 33, &aSig );
3379 if ( aExp || aSig ) aExp -= 0x3F81;
3380 return roundAndPackFloat32( aSign, aExp, aSig );
3381
3382 }
3383
3384 /*----------------------------------------------------------------------------
3385 | Returns the result of converting the extended double-precision floating-
3386 | point value `a' to the double-precision floating-point format. The
3387 | conversion is performed according to the IEC/IEEE Standard for Binary
3388 | Floating-Point Arithmetic.
3389 *----------------------------------------------------------------------------*/
3390
floatx80_to_float64(floatx80 a)3391 float64 floatx80_to_float64( floatx80 a )
3392 {
3393 flag aSign;
3394 int32 aExp;
3395 bits64 aSig, zSig;
3396
3397 aSig = extractFloatx80Frac( a );
3398 aExp = extractFloatx80Exp( a );
3399 aSign = extractFloatx80Sign( a );
3400 if ( aExp == 0x7FFF ) {
3401 if ( (bits64) ( aSig<<1 ) ) {
3402 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3403 }
3404 return packFloat64( aSign, 0x7FF, 0 );
3405 }
3406 shift64RightJamming( aSig, 1, &zSig );
3407 if ( aExp || aSig ) aExp -= 0x3C01;
3408 return roundAndPackFloat64( aSign, aExp, zSig );
3409
3410 }
3411
3412 #ifdef FLOAT128
3413
3414 /*----------------------------------------------------------------------------
3415 | Returns the result of converting the extended double-precision floating-
3416 | point value `a' to the quadruple-precision floating-point format. The
3417 | conversion is performed according to the IEC/IEEE Standard for Binary
3418 | Floating-Point Arithmetic.
3419 *----------------------------------------------------------------------------*/
3420
floatx80_to_float128(floatx80 a)3421 float128 floatx80_to_float128( floatx80 a )
3422 {
3423 flag aSign;
3424 int16 aExp;
3425 bits64 aSig, zSig0, zSig1;
3426
3427 aSig = extractFloatx80Frac( a );
3428 aExp = extractFloatx80Exp( a );
3429 aSign = extractFloatx80Sign( a );
3430 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3431 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3432 }
3433 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3434 return packFloat128( aSign, aExp, zSig0, zSig1 );
3435
3436 }
3437
3438 #endif
3439
3440 /*----------------------------------------------------------------------------
3441 | Rounds the extended double-precision floating-point value `a' to an integer,
3442 | and returns the result as an extended quadruple-precision floating-point
3443 | value. The operation is performed according to the IEC/IEEE Standard for
3444 | Binary Floating-Point Arithmetic.
3445 *----------------------------------------------------------------------------*/
3446
floatx80_round_to_int(floatx80 a)3447 floatx80 floatx80_round_to_int( floatx80 a )
3448 {
3449 flag aSign;
3450 int32 aExp;
3451 bits64 lastBitMask, roundBitsMask;
3452 int8 roundingMode;
3453 floatx80 z;
3454
3455 aExp = extractFloatx80Exp( a );
3456 if ( 0x403E <= aExp ) {
3457 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3458 return propagateFloatx80NaN( a, a );
3459 }
3460 return a;
3461 }
3462 if ( aExp < 0x3FFF ) {
3463 if ( ( aExp == 0 )
3464 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3465 return a;
3466 }
3467 float_set_inexact();
3468 aSign = extractFloatx80Sign( a );
3469 switch ( float_rounding_mode() ) {
3470 case float_round_nearest_even:
3471 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3472 ) {
3473 return
3474 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3475 }
3476 break;
3477 case float_round_down:
3478 return
3479 aSign ?
3480 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3481 : packFloatx80( 0, 0, 0 );
3482 case float_round_up:
3483 return
3484 aSign ? packFloatx80( 1, 0, 0 )
3485 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3486 }
3487 return packFloatx80( aSign, 0, 0 );
3488 }
3489 lastBitMask = 1;
3490 lastBitMask <<= 0x403E - aExp;
3491 roundBitsMask = lastBitMask - 1;
3492 z = a;
3493 roundingMode = float_rounding_mode();
3494 if ( roundingMode == float_round_nearest_even ) {
3495 z.low += lastBitMask>>1;
3496 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3497 }
3498 else if ( roundingMode != float_round_to_zero ) {
3499 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3500 z.low += roundBitsMask;
3501 }
3502 }
3503 z.low &= ~ roundBitsMask;
3504 if ( z.low == 0 ) {
3505 ++z.high;
3506 z.low = LIT64( 0x8000000000000000 );
3507 }
3508 if ( z.low != a.low ) float_set_inexact();
3509 return z;
3510
3511 }
3512
3513 /*----------------------------------------------------------------------------
3514 | Returns the result of adding the absolute values of the extended double-
3515 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3516 | negated before being returned. `zSign' is ignored if the result is a NaN.
3517 | The addition is performed according to the IEC/IEEE Standard for Binary
3518 | Floating-Point Arithmetic.
3519 *----------------------------------------------------------------------------*/
3520
addFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3521 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3522 {
3523 int32 aExp, bExp, zExp;
3524 bits64 aSig, bSig, zSig0, zSig1;
3525 int32 expDiff;
3526
3527 aSig = extractFloatx80Frac( a );
3528 aExp = extractFloatx80Exp( a );
3529 bSig = extractFloatx80Frac( b );
3530 bExp = extractFloatx80Exp( b );
3531 expDiff = aExp - bExp;
3532 if ( 0 < expDiff ) {
3533 if ( aExp == 0x7FFF ) {
3534 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3535 return a;
3536 }
3537 if ( bExp == 0 ) --expDiff;
3538 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3539 zExp = aExp;
3540 }
3541 else if ( expDiff < 0 ) {
3542 if ( bExp == 0x7FFF ) {
3543 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3544 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3545 }
3546 if ( aExp == 0 ) ++expDiff;
3547 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3548 zExp = bExp;
3549 }
3550 else {
3551 if ( aExp == 0x7FFF ) {
3552 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3553 return propagateFloatx80NaN( a, b );
3554 }
3555 return a;
3556 }
3557 zSig1 = 0;
3558 zSig0 = aSig + bSig;
3559 if ( aExp == 0 ) {
3560 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3561 goto roundAndPack;
3562 }
3563 zExp = aExp;
3564 goto shiftRight1;
3565 }
3566 zSig0 = aSig + bSig;
3567 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3568 shiftRight1:
3569 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3570 zSig0 |= LIT64( 0x8000000000000000 );
3571 ++zExp;
3572 roundAndPack:
3573 return
3574 roundAndPackFloatx80(
3575 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3576
3577 }
3578
3579 /*----------------------------------------------------------------------------
3580 | Returns the result of subtracting the absolute values of the extended
3581 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3582 | difference is negated before being returned. `zSign' is ignored if the
3583 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3584 | Standard for Binary Floating-Point Arithmetic.
3585 *----------------------------------------------------------------------------*/
3586
subFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3587 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3588 {
3589 int32 aExp, bExp, zExp;
3590 bits64 aSig, bSig, zSig0, zSig1;
3591 int32 expDiff;
3592 floatx80 z;
3593
3594 aSig = extractFloatx80Frac( a );
3595 aExp = extractFloatx80Exp( a );
3596 bSig = extractFloatx80Frac( b );
3597 bExp = extractFloatx80Exp( b );
3598 expDiff = aExp - bExp;
3599 if ( 0 < expDiff ) goto aExpBigger;
3600 if ( expDiff < 0 ) goto bExpBigger;
3601 if ( aExp == 0x7FFF ) {
3602 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3603 return propagateFloatx80NaN( a, b );
3604 }
3605 float_raise( float_flag_invalid );
3606 z.low = floatx80_default_nan_low;
3607 z.high = floatx80_default_nan_high;
3608 return z;
3609 }
3610 if ( aExp == 0 ) {
3611 aExp = 1;
3612 bExp = 1;
3613 }
3614 zSig1 = 0;
3615 if ( bSig < aSig ) goto aBigger;
3616 if ( aSig < bSig ) goto bBigger;
3617 return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3618 bExpBigger:
3619 if ( bExp == 0x7FFF ) {
3620 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3621 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3622 }
3623 if ( aExp == 0 ) ++expDiff;
3624 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3625 bBigger:
3626 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3627 zExp = bExp;
3628 zSign ^= 1;
3629 goto normalizeRoundAndPack;
3630 aExpBigger:
3631 if ( aExp == 0x7FFF ) {
3632 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3633 return a;
3634 }
3635 if ( bExp == 0 ) --expDiff;
3636 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3637 aBigger:
3638 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3639 zExp = aExp;
3640 normalizeRoundAndPack:
3641 return
3642 normalizeRoundAndPackFloatx80(
3643 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3644
3645 }
3646
3647 /*----------------------------------------------------------------------------
3648 | Returns the result of adding the extended double-precision floating-point
3649 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3650 | Standard for Binary Floating-Point Arithmetic.
3651 *----------------------------------------------------------------------------*/
3652
floatx80_add(floatx80 a,floatx80 b)3653 floatx80 floatx80_add( floatx80 a, floatx80 b )
3654 {
3655 flag aSign, bSign;
3656
3657 aSign = extractFloatx80Sign( a );
3658 bSign = extractFloatx80Sign( b );
3659 if ( aSign == bSign ) {
3660 return addFloatx80Sigs( a, b, aSign );
3661 }
3662 else {
3663 return subFloatx80Sigs( a, b, aSign );
3664 }
3665
3666 }
3667
3668 /*----------------------------------------------------------------------------
3669 | Returns the result of subtracting the extended double-precision floating-
3670 | point values `a' and `b'. The operation is performed according to the
3671 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3672 *----------------------------------------------------------------------------*/
3673
floatx80_sub(floatx80 a,floatx80 b)3674 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3675 {
3676 flag aSign, bSign;
3677
3678 aSign = extractFloatx80Sign( a );
3679 bSign = extractFloatx80Sign( b );
3680 if ( aSign == bSign ) {
3681 return subFloatx80Sigs( a, b, aSign );
3682 }
3683 else {
3684 return addFloatx80Sigs( a, b, aSign );
3685 }
3686
3687 }
3688
3689 /*----------------------------------------------------------------------------
3690 | Returns the result of multiplying the extended double-precision floating-
3691 | point values `a' and `b'. The operation is performed according to the
3692 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3693 *----------------------------------------------------------------------------*/
3694
floatx80_mul(floatx80 a,floatx80 b)3695 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3696 {
3697 flag aSign, bSign, zSign;
3698 int32 aExp, bExp, zExp;
3699 bits64 aSig, bSig, zSig0, zSig1;
3700 floatx80 z;
3701
3702 aSig = extractFloatx80Frac( a );
3703 aExp = extractFloatx80Exp( a );
3704 aSign = extractFloatx80Sign( a );
3705 bSig = extractFloatx80Frac( b );
3706 bExp = extractFloatx80Exp( b );
3707 bSign = extractFloatx80Sign( b );
3708 zSign = aSign ^ bSign;
3709 if ( aExp == 0x7FFF ) {
3710 if ( (bits64) ( aSig<<1 )
3711 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3712 return propagateFloatx80NaN( a, b );
3713 }
3714 if ( ( bExp | bSig ) == 0 ) goto invalid;
3715 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3716 }
3717 if ( bExp == 0x7FFF ) {
3718 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3719 if ( ( aExp | aSig ) == 0 ) {
3720 invalid:
3721 float_raise( float_flag_invalid );
3722 z.low = floatx80_default_nan_low;
3723 z.high = floatx80_default_nan_high;
3724 return z;
3725 }
3726 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3727 }
3728 if ( aExp == 0 ) {
3729 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3730 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3731 }
3732 if ( bExp == 0 ) {
3733 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3734 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3735 }
3736 zExp = aExp + bExp - 0x3FFE;
3737 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3738 if ( 0 < (sbits64) zSig0 ) {
3739 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3740 --zExp;
3741 }
3742 return
3743 roundAndPackFloatx80(
3744 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3745
3746 }
3747
3748 /*----------------------------------------------------------------------------
3749 | Returns the result of dividing the extended double-precision floating-point
3750 | value `a' by the corresponding value `b'. The operation is performed
3751 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3752 *----------------------------------------------------------------------------*/
3753
floatx80_div(floatx80 a,floatx80 b)3754 floatx80 floatx80_div( floatx80 a, floatx80 b )
3755 {
3756 flag aSign, bSign, zSign;
3757 int32 aExp, bExp, zExp;
3758 bits64 aSig, bSig, zSig0, zSig1;
3759 bits64 rem0, rem1, rem2, term0, term1, term2;
3760 floatx80 z;
3761
3762 aSig = extractFloatx80Frac( a );
3763 aExp = extractFloatx80Exp( a );
3764 aSign = extractFloatx80Sign( a );
3765 bSig = extractFloatx80Frac( b );
3766 bExp = extractFloatx80Exp( b );
3767 bSign = extractFloatx80Sign( b );
3768 zSign = aSign ^ bSign;
3769 if ( aExp == 0x7FFF ) {
3770 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3771 if ( bExp == 0x7FFF ) {
3772 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3773 goto invalid;
3774 }
3775 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3776 }
3777 if ( bExp == 0x7FFF ) {
3778 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3779 return packFloatx80( zSign, 0, 0 );
3780 }
3781 if ( bExp == 0 ) {
3782 if ( bSig == 0 ) {
3783 if ( ( aExp | aSig ) == 0 ) {
3784 invalid:
3785 float_raise( float_flag_invalid );
3786 z.low = floatx80_default_nan_low;
3787 z.high = floatx80_default_nan_high;
3788 return z;
3789 }
3790 float_raise( float_flag_divbyzero );
3791 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3792 }
3793 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3794 }
3795 if ( aExp == 0 ) {
3796 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3797 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3798 }
3799 zExp = aExp - bExp + 0x3FFE;
3800 rem1 = 0;
3801 if ( bSig <= aSig ) {
3802 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3803 ++zExp;
3804 }
3805 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3806 mul64To128( bSig, zSig0, &term0, &term1 );
3807 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3808 while ( (sbits64) rem0 < 0 ) {
3809 --zSig0;
3810 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3811 }
3812 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3813 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3814 mul64To128( bSig, zSig1, &term1, &term2 );
3815 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3816 while ( (sbits64) rem1 < 0 ) {
3817 --zSig1;
3818 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3819 }
3820 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3821 }
3822 return
3823 roundAndPackFloatx80(
3824 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3825
3826 }
3827
3828 /*----------------------------------------------------------------------------
3829 | Returns the remainder of the extended double-precision floating-point value
3830 | `a' with respect to the corresponding value `b'. The operation is performed
3831 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3832 *----------------------------------------------------------------------------*/
3833
floatx80_rem(floatx80 a,floatx80 b)3834 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3835 {
3836 flag aSign, bSign, zSign;
3837 int32 aExp, bExp, expDiff;
3838 bits64 aSig0, aSig1, bSig;
3839 bits64 q, term0, term1, alternateASig0, alternateASig1;
3840 floatx80 z;
3841
3842 aSig0 = extractFloatx80Frac( a );
3843 aExp = extractFloatx80Exp( a );
3844 aSign = extractFloatx80Sign( a );
3845 bSig = extractFloatx80Frac( b );
3846 bExp = extractFloatx80Exp( b );
3847 bSign = extractFloatx80Sign( b );
3848 if ( aExp == 0x7FFF ) {
3849 if ( (bits64) ( aSig0<<1 )
3850 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3851 return propagateFloatx80NaN( a, b );
3852 }
3853 goto invalid;
3854 }
3855 if ( bExp == 0x7FFF ) {
3856 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3857 return a;
3858 }
3859 if ( bExp == 0 ) {
3860 if ( bSig == 0 ) {
3861 invalid:
3862 float_raise( float_flag_invalid );
3863 z.low = floatx80_default_nan_low;
3864 z.high = floatx80_default_nan_high;
3865 return z;
3866 }
3867 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3868 }
3869 if ( aExp == 0 ) {
3870 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3871 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3872 }
3873 bSig |= LIT64( 0x8000000000000000 );
3874 zSign = aSign;
3875 expDiff = aExp - bExp;
3876 aSig1 = 0;
3877 if ( expDiff < 0 ) {
3878 if ( expDiff < -1 ) return a;
3879 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3880 expDiff = 0;
3881 }
3882 q = ( bSig <= aSig0 );
3883 if ( q ) aSig0 -= bSig;
3884 expDiff -= 64;
3885 while ( 0 < expDiff ) {
3886 q = estimateDiv128To64( aSig0, aSig1, bSig );
3887 q = ( 2 < q ) ? q - 2 : 0;
3888 mul64To128( bSig, q, &term0, &term1 );
3889 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3890 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3891 expDiff -= 62;
3892 }
3893 expDiff += 64;
3894 if ( 0 < expDiff ) {
3895 q = estimateDiv128To64( aSig0, aSig1, bSig );
3896 q = ( 2 < q ) ? q - 2 : 0;
3897 q >>= 64 - expDiff;
3898 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3899 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3900 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3901 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3902 ++q;
3903 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3904 }
3905 }
3906 else {
3907 term1 = 0;
3908 term0 = bSig;
3909 }
3910 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3911 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3912 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3913 && ( q & 1 ) )
3914 ) {
3915 aSig0 = alternateASig0;
3916 aSig1 = alternateASig1;
3917 zSign = ! zSign;
3918 }
3919 return
3920 normalizeRoundAndPackFloatx80(
3921 80, zSign, bExp + expDiff, aSig0, aSig1 );
3922
3923 }
3924
3925 /*----------------------------------------------------------------------------
3926 | Returns the square root of the extended double-precision floating-point
3927 | value `a'. The operation is performed according to the IEC/IEEE Standard
3928 | for Binary Floating-Point Arithmetic.
3929 *----------------------------------------------------------------------------*/
3930
floatx80_sqrt(floatx80 a)3931 floatx80 floatx80_sqrt( floatx80 a )
3932 {
3933 flag aSign;
3934 int32 aExp, zExp;
3935 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3936 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3937 floatx80 z;
3938
3939 aSig0 = extractFloatx80Frac( a );
3940 aExp = extractFloatx80Exp( a );
3941 aSign = extractFloatx80Sign( a );
3942 if ( aExp == 0x7FFF ) {
3943 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3944 if ( ! aSign ) return a;
3945 goto invalid;
3946 }
3947 if ( aSign ) {
3948 if ( ( aExp | aSig0 ) == 0 ) return a;
3949 invalid:
3950 float_raise( float_flag_invalid );
3951 z.low = floatx80_default_nan_low;
3952 z.high = floatx80_default_nan_high;
3953 return z;
3954 }
3955 if ( aExp == 0 ) {
3956 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3957 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3958 }
3959 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3960 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3961 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3962 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3963 doubleZSig0 = zSig0<<1;
3964 mul64To128( zSig0, zSig0, &term0, &term1 );
3965 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3966 while ( (sbits64) rem0 < 0 ) {
3967 --zSig0;
3968 doubleZSig0 -= 2;
3969 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3970 }
3971 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3972 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3973 if ( zSig1 == 0 ) zSig1 = 1;
3974 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3975 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3976 mul64To128( zSig1, zSig1, &term2, &term3 );
3977 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3978 while ( (sbits64) rem1 < 0 ) {
3979 --zSig1;
3980 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3981 term3 |= 1;
3982 term2 |= doubleZSig0;
3983 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3984 }
3985 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3986 }
3987 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3988 zSig0 |= doubleZSig0;
3989 return
3990 roundAndPackFloatx80(
3991 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3992
3993 }
3994
3995 /*----------------------------------------------------------------------------
3996 | Returns 1 if the extended double-precision floating-point value `a' is
3997 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3998 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3999 | Arithmetic.
4000 *----------------------------------------------------------------------------*/
4001
floatx80_eq(floatx80 a,floatx80 b)4002 flag floatx80_eq( floatx80 a, floatx80 b )
4003 {
4004
4005 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4006 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4007 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4008 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4009 ) {
4010 if ( floatx80_is_signaling_nan( a )
4011 || floatx80_is_signaling_nan( b ) ) {
4012 float_raise( float_flag_invalid );
4013 }
4014 return 0;
4015 }
4016 return
4017 ( a.low == b.low )
4018 && ( ( a.high == b.high )
4019 || ( ( a.low == 0 )
4020 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4021 );
4022
4023 }
4024
4025 /*----------------------------------------------------------------------------
4026 | Returns 1 if the extended double-precision floating-point value `a' is
4027 | less than or equal to the corresponding value `b', and 0 otherwise. The
4028 | comparison is performed according to the IEC/IEEE Standard for Binary
4029 | Floating-Point Arithmetic.
4030 *----------------------------------------------------------------------------*/
4031
floatx80_le(floatx80 a,floatx80 b)4032 flag floatx80_le( floatx80 a, floatx80 b )
4033 {
4034 flag aSign, bSign;
4035
4036 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4037 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4038 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4039 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4040 ) {
4041 float_raise( float_flag_invalid );
4042 return 0;
4043 }
4044 aSign = extractFloatx80Sign( a );
4045 bSign = extractFloatx80Sign( b );
4046 if ( aSign != bSign ) {
4047 return
4048 aSign
4049 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4050 == 0 );
4051 }
4052 return
4053 aSign ? le128( b.high, b.low, a.high, a.low )
4054 : le128( a.high, a.low, b.high, b.low );
4055
4056 }
4057
4058 /*----------------------------------------------------------------------------
4059 | Returns 1 if the extended double-precision floating-point value `a' is
4060 | less than the corresponding value `b', and 0 otherwise. The comparison
4061 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4062 | Arithmetic.
4063 *----------------------------------------------------------------------------*/
4064
floatx80_lt(floatx80 a,floatx80 b)4065 flag floatx80_lt( floatx80 a, floatx80 b )
4066 {
4067 flag aSign, bSign;
4068
4069 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4070 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4071 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4072 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4073 ) {
4074 float_raise( float_flag_invalid );
4075 return 0;
4076 }
4077 aSign = extractFloatx80Sign( a );
4078 bSign = extractFloatx80Sign( b );
4079 if ( aSign != bSign ) {
4080 return
4081 aSign
4082 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4083 != 0 );
4084 }
4085 return
4086 aSign ? lt128( b.high, b.low, a.high, a.low )
4087 : lt128( a.high, a.low, b.high, b.low );
4088
4089 }
4090
4091 /*----------------------------------------------------------------------------
4092 | Returns 1 if the extended double-precision floating-point value `a' is equal
4093 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4094 | raised if either operand is a NaN. Otherwise, the comparison is performed
4095 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4096 *----------------------------------------------------------------------------*/
4097
floatx80_eq_signaling(floatx80 a,floatx80 b)4098 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4099 {
4100
4101 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4102 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4103 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4104 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4105 ) {
4106 float_raise( float_flag_invalid );
4107 return 0;
4108 }
4109 return
4110 ( a.low == b.low )
4111 && ( ( a.high == b.high )
4112 || ( ( a.low == 0 )
4113 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4114 );
4115
4116 }
4117
4118 /*----------------------------------------------------------------------------
4119 | Returns 1 if the extended double-precision floating-point value `a' is less
4120 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4121 | do not cause an exception. Otherwise, the comparison is performed according
4122 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4123 *----------------------------------------------------------------------------*/
4124
floatx80_le_quiet(floatx80 a,floatx80 b)4125 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4126 {
4127 flag aSign, bSign;
4128
4129 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4130 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4131 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4132 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4133 ) {
4134 if ( floatx80_is_signaling_nan( a )
4135 || floatx80_is_signaling_nan( b ) ) {
4136 float_raise( float_flag_invalid );
4137 }
4138 return 0;
4139 }
4140 aSign = extractFloatx80Sign( a );
4141 bSign = extractFloatx80Sign( b );
4142 if ( aSign != bSign ) {
4143 return
4144 aSign
4145 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4146 == 0 );
4147 }
4148 return
4149 aSign ? le128( b.high, b.low, a.high, a.low )
4150 : le128( a.high, a.low, b.high, b.low );
4151
4152 }
4153
4154 /*----------------------------------------------------------------------------
4155 | Returns 1 if the extended double-precision floating-point value `a' is less
4156 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4157 | an exception. Otherwise, the comparison is performed according to the
4158 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4159 *----------------------------------------------------------------------------*/
4160
floatx80_lt_quiet(floatx80 a,floatx80 b)4161 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4162 {
4163 flag aSign, bSign;
4164
4165 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4166 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4167 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4168 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4169 ) {
4170 if ( floatx80_is_signaling_nan( a )
4171 || floatx80_is_signaling_nan( b ) ) {
4172 float_raise( float_flag_invalid );
4173 }
4174 return 0;
4175 }
4176 aSign = extractFloatx80Sign( a );
4177 bSign = extractFloatx80Sign( b );
4178 if ( aSign != bSign ) {
4179 return
4180 aSign
4181 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4182 != 0 );
4183 }
4184 return
4185 aSign ? lt128( b.high, b.low, a.high, a.low )
4186 : lt128( a.high, a.low, b.high, b.low );
4187
4188 }
4189
4190 #endif
4191
4192 #ifdef FLOAT128
4193
4194 /*----------------------------------------------------------------------------
4195 | Returns the result of converting the quadruple-precision floating-point
4196 | value `a' to the 32-bit two's complement integer format. The conversion
4197 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4198 | Arithmetic---which means in particular that the conversion is rounded
4199 | according to the current rounding mode. If `a' is a NaN, the largest
4200 | positive integer is returned. Otherwise, if the conversion overflows, the
4201 | largest integer with the same sign as `a' is returned.
4202 *----------------------------------------------------------------------------*/
4203
float128_to_int32(float128 a)4204 int32 float128_to_int32( float128 a )
4205 {
4206 flag aSign;
4207 int32 aExp, shiftCount;
4208 bits64 aSig0, aSig1;
4209
4210 aSig1 = extractFloat128Frac1( a );
4211 aSig0 = extractFloat128Frac0( a );
4212 aExp = extractFloat128Exp( a );
4213 aSign = extractFloat128Sign( a );
4214 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4215 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4216 aSig0 |= ( aSig1 != 0 );
4217 shiftCount = 0x4028 - aExp;
4218 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4219 return roundAndPackInt32( aSign, aSig0 );
4220
4221 }
4222
4223 /*----------------------------------------------------------------------------
4224 | Returns the result of converting the quadruple-precision floating-point
4225 | value `a' to the 32-bit two's complement integer format. The conversion
4226 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4227 | Arithmetic, except that the conversion is always rounded toward zero. If
4228 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4229 | conversion overflows, the largest integer with the same sign as `a' is
4230 | returned.
4231 *----------------------------------------------------------------------------*/
4232
float128_to_int32_round_to_zero(float128 a)4233 int32 float128_to_int32_round_to_zero( float128 a )
4234 {
4235 flag aSign;
4236 int32 aExp, shiftCount;
4237 bits64 aSig0, aSig1, savedASig;
4238 int32 z;
4239
4240 aSig1 = extractFloat128Frac1( a );
4241 aSig0 = extractFloat128Frac0( a );
4242 aExp = extractFloat128Exp( a );
4243 aSign = extractFloat128Sign( a );
4244 aSig0 |= ( aSig1 != 0 );
4245 if ( 0x401E < aExp ) {
4246 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4247 goto invalid;
4248 }
4249 else if ( aExp < 0x3FFF ) {
4250 if ( aExp || aSig0 ) float_set_inexact();
4251 return 0;
4252 }
4253 aSig0 |= LIT64( 0x0001000000000000 );
4254 shiftCount = 0x402F - aExp;
4255 savedASig = aSig0;
4256 aSig0 >>= shiftCount;
4257 z = aSig0;
4258 if ( aSign ) z = - z;
4259 if ( ( z < 0 ) ^ aSign ) {
4260 invalid:
4261 float_raise( float_flag_invalid );
4262 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4263 }
4264 if ( ( aSig0<<shiftCount ) != savedASig ) {
4265 float_set_inexact();
4266 }
4267 return z;
4268
4269 }
4270
4271 /*----------------------------------------------------------------------------
4272 | Returns the result of converting the quadruple-precision floating-point
4273 | value `a' to the 64-bit two's complement integer format. The conversion
4274 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4275 | Arithmetic---which means in particular that the conversion is rounded
4276 | according to the current rounding mode. If `a' is a NaN, the largest
4277 | positive integer is returned. Otherwise, if the conversion overflows, the
4278 | largest integer with the same sign as `a' is returned.
4279 *----------------------------------------------------------------------------*/
4280
float128_to_int64(float128 a)4281 int64 float128_to_int64( float128 a )
4282 {
4283 flag aSign;
4284 int32 aExp, shiftCount;
4285 bits64 aSig0, aSig1;
4286
4287 aSig1 = extractFloat128Frac1( a );
4288 aSig0 = extractFloat128Frac0( a );
4289 aExp = extractFloat128Exp( a );
4290 aSign = extractFloat128Sign( a );
4291 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4292 shiftCount = 0x402F - aExp;
4293 if ( shiftCount <= 0 ) {
4294 if ( 0x403E < aExp ) {
4295 float_raise( float_flag_invalid );
4296 if ( ! aSign
4297 || ( ( aExp == 0x7FFF )
4298 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4299 )
4300 ) {
4301 return LIT64( 0x7FFFFFFFFFFFFFFF );
4302 }
4303 return (sbits64) LIT64( 0x8000000000000000 );
4304 }
4305 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4306 }
4307 else {
4308 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4309 }
4310 return roundAndPackInt64( aSign, aSig0, aSig1 );
4311
4312 }
4313
4314 /*----------------------------------------------------------------------------
4315 | Returns the result of converting the quadruple-precision floating-point
4316 | value `a' to the 64-bit two's complement integer format. The conversion
4317 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4318 | Arithmetic, except that the conversion is always rounded toward zero.
4319 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4320 | the conversion overflows, the largest integer with the same sign as `a' is
4321 | returned.
4322 *----------------------------------------------------------------------------*/
4323
float128_to_int64_round_to_zero(float128 a)4324 int64 float128_to_int64_round_to_zero( float128 a )
4325 {
4326 flag aSign;
4327 int32 aExp, shiftCount;
4328 bits64 aSig0, aSig1;
4329 int64 z;
4330
4331 aSig1 = extractFloat128Frac1( a );
4332 aSig0 = extractFloat128Frac0( a );
4333 aExp = extractFloat128Exp( a );
4334 aSign = extractFloat128Sign( a );
4335 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4336 shiftCount = aExp - 0x402F;
4337 if ( 0 < shiftCount ) {
4338 if ( 0x403E <= aExp ) {
4339 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4340 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4341 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4342 if ( aSig1 ) float_set_inexact();
4343 }
4344 else {
4345 float_raise( float_flag_invalid );
4346 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4347 return LIT64( 0x7FFFFFFFFFFFFFFF );
4348 }
4349 }
4350 return (sbits64) LIT64( 0x8000000000000000 );
4351 }
4352 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4353 if ( (bits64) ( aSig1<<shiftCount ) ) {
4354 float_set_inexact();
4355 }
4356 }
4357 else {
4358 if ( aExp < 0x3FFF ) {
4359 if ( aExp | aSig0 | aSig1 ) {
4360 float_set_inexact();
4361 }
4362 return 0;
4363 }
4364 z = aSig0>>( - shiftCount );
4365 if ( aSig1
4366 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4367 float_set_inexact();
4368 }
4369 }
4370 if ( aSign ) z = - z;
4371 return z;
4372
4373 }
4374
4375 /*----------------------------------------------------------------------------
4376 | Returns the result of converting the quadruple-precision floating-point
4377 | value `a' to the single-precision floating-point format. The conversion
4378 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4379 | Arithmetic.
4380 *----------------------------------------------------------------------------*/
4381
float128_to_float32(float128 a)4382 float32 float128_to_float32( float128 a )
4383 {
4384 flag aSign;
4385 int32 aExp;
4386 bits64 aSig0, aSig1;
4387 bits32 zSig;
4388
4389 aSig1 = extractFloat128Frac1( a );
4390 aSig0 = extractFloat128Frac0( a );
4391 aExp = extractFloat128Exp( a );
4392 aSign = extractFloat128Sign( a );
4393 if ( aExp == 0x7FFF ) {
4394 if ( aSig0 | aSig1 ) {
4395 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4396 }
4397 return packFloat32( aSign, 0xFF, 0 );
4398 }
4399 aSig0 |= ( aSig1 != 0 );
4400 shift64RightJamming( aSig0, 18, &aSig0 );
4401 zSig = aSig0;
4402 if ( aExp || zSig ) {
4403 zSig |= 0x40000000;
4404 aExp -= 0x3F81;
4405 }
4406 return roundAndPackFloat32( aSign, aExp, zSig );
4407
4408 }
4409
4410 /*----------------------------------------------------------------------------
4411 | Returns the result of converting the quadruple-precision floating-point
4412 | value `a' to the double-precision floating-point format. The conversion
4413 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4414 | Arithmetic.
4415 *----------------------------------------------------------------------------*/
4416
float128_to_float64(float128 a)4417 float64 float128_to_float64( float128 a )
4418 {
4419 flag aSign;
4420 int32 aExp;
4421 bits64 aSig0, aSig1;
4422
4423 aSig1 = extractFloat128Frac1( a );
4424 aSig0 = extractFloat128Frac0( a );
4425 aExp = extractFloat128Exp( a );
4426 aSign = extractFloat128Sign( a );
4427 if ( aExp == 0x7FFF ) {
4428 if ( aSig0 | aSig1 ) {
4429 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4430 }
4431 return packFloat64( aSign, 0x7FF, 0 );
4432 }
4433 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4434 aSig0 |= ( aSig1 != 0 );
4435 if ( aExp || aSig0 ) {
4436 aSig0 |= LIT64( 0x4000000000000000 );
4437 aExp -= 0x3C01;
4438 }
4439 return roundAndPackFloat64( aSign, aExp, aSig0 );
4440
4441 }
4442
4443 #ifdef FLOATX80
4444
4445 /*----------------------------------------------------------------------------
4446 | Returns the result of converting the quadruple-precision floating-point
4447 | value `a' to the extended double-precision floating-point format. The
4448 | conversion is performed according to the IEC/IEEE Standard for Binary
4449 | Floating-Point Arithmetic.
4450 *----------------------------------------------------------------------------*/
4451
float128_to_floatx80(float128 a)4452 floatx80 float128_to_floatx80( float128 a )
4453 {
4454 flag aSign;
4455 int32 aExp;
4456 bits64 aSig0, aSig1;
4457
4458 aSig1 = extractFloat128Frac1( a );
4459 aSig0 = extractFloat128Frac0( a );
4460 aExp = extractFloat128Exp( a );
4461 aSign = extractFloat128Sign( a );
4462 if ( aExp == 0x7FFF ) {
4463 if ( aSig0 | aSig1 ) {
4464 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4465 }
4466 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4467 }
4468 if ( aExp == 0 ) {
4469 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4470 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4471 }
4472 else {
4473 aSig0 |= LIT64( 0x0001000000000000 );
4474 }
4475 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4476 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4477
4478 }
4479
4480 #endif
4481
4482 /*----------------------------------------------------------------------------
4483 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4484 | returns the result as a quadruple-precision floating-point value. The
4485 | operation is performed according to the IEC/IEEE Standard for Binary
4486 | Floating-Point Arithmetic.
4487 *----------------------------------------------------------------------------*/
4488
float128_round_to_int(float128 a)4489 float128 float128_round_to_int( float128 a )
4490 {
4491 flag aSign;
4492 int32 aExp;
4493 bits64 lastBitMask, roundBitsMask;
4494 int8 roundingMode;
4495 float128 z;
4496
4497 aExp = extractFloat128Exp( a );
4498 if ( 0x402F <= aExp ) {
4499 if ( 0x406F <= aExp ) {
4500 if ( ( aExp == 0x7FFF )
4501 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4502 ) {
4503 return propagateFloat128NaN( a, a );
4504 }
4505 return a;
4506 }
4507 lastBitMask = 1;
4508 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4509 roundBitsMask = lastBitMask - 1;
4510 z = a;
4511 roundingMode = float_rounding_mode();
4512 if ( roundingMode == float_round_nearest_even ) {
4513 if ( lastBitMask ) {
4514 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4515 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4516 }
4517 else {
4518 if ( (sbits64) z.low < 0 ) {
4519 ++z.high;
4520 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4521 }
4522 }
4523 }
4524 else if ( roundingMode != float_round_to_zero ) {
4525 if ( extractFloat128Sign( z )
4526 ^ ( roundingMode == float_round_up ) ) {
4527 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4528 }
4529 }
4530 z.low &= ~ roundBitsMask;
4531 }
4532 else {
4533 if ( aExp < 0x3FFF ) {
4534 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4535 float_set_inexact();
4536 aSign = extractFloat128Sign( a );
4537 switch ( float_rounding_mode() ) {
4538 case float_round_nearest_even:
4539 if ( ( aExp == 0x3FFE )
4540 && ( extractFloat128Frac0( a )
4541 | extractFloat128Frac1( a ) )
4542 ) {
4543 return packFloat128( aSign, 0x3FFF, 0, 0 );
4544 }
4545 break;
4546 case float_round_down:
4547 return
4548 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4549 : packFloat128( 0, 0, 0, 0 );
4550 case float_round_up:
4551 return
4552 aSign ? packFloat128( 1, 0, 0, 0 )
4553 : packFloat128( 0, 0x3FFF, 0, 0 );
4554 }
4555 return packFloat128( aSign, 0, 0, 0 );
4556 }
4557 lastBitMask = 1;
4558 lastBitMask <<= 0x402F - aExp;
4559 roundBitsMask = lastBitMask - 1;
4560 z.low = 0;
4561 z.high = a.high;
4562 roundingMode = float_rounding_mode();
4563 if ( roundingMode == float_round_nearest_even ) {
4564 z.high += lastBitMask>>1;
4565 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4566 z.high &= ~ lastBitMask;
4567 }
4568 }
4569 else if ( roundingMode != float_round_to_zero ) {
4570 if ( extractFloat128Sign( z )
4571 ^ ( roundingMode == float_round_up ) ) {
4572 z.high |= ( a.low != 0 );
4573 z.high += roundBitsMask;
4574 }
4575 }
4576 z.high &= ~ roundBitsMask;
4577 }
4578 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4579 float_set_inexact();
4580 }
4581 return z;
4582
4583 }
4584
4585 /*----------------------------------------------------------------------------
4586 | Returns the result of adding the absolute values of the quadruple-precision
4587 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4588 | before being returned. `zSign' is ignored if the result is a NaN.
4589 | The addition is performed according to the IEC/IEEE Standard for Binary
4590 | Floating-Point Arithmetic.
4591 *----------------------------------------------------------------------------*/
4592
addFloat128Sigs(float128 a,float128 b,flag zSign)4593 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4594 {
4595 int32 aExp, bExp, zExp;
4596 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4597 int32 expDiff;
4598
4599 aSig1 = extractFloat128Frac1( a );
4600 aSig0 = extractFloat128Frac0( a );
4601 aExp = extractFloat128Exp( a );
4602 bSig1 = extractFloat128Frac1( b );
4603 bSig0 = extractFloat128Frac0( b );
4604 bExp = extractFloat128Exp( b );
4605 expDiff = aExp - bExp;
4606 if ( 0 < expDiff ) {
4607 if ( aExp == 0x7FFF ) {
4608 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4609 return a;
4610 }
4611 if ( bExp == 0 ) {
4612 --expDiff;
4613 }
4614 else {
4615 bSig0 |= LIT64( 0x0001000000000000 );
4616 }
4617 shift128ExtraRightJamming(
4618 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4619 zExp = aExp;
4620 }
4621 else if ( expDiff < 0 ) {
4622 if ( bExp == 0x7FFF ) {
4623 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4624 return packFloat128( zSign, 0x7FFF, 0, 0 );
4625 }
4626 if ( aExp == 0 ) {
4627 ++expDiff;
4628 }
4629 else {
4630 aSig0 |= LIT64( 0x0001000000000000 );
4631 }
4632 shift128ExtraRightJamming(
4633 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4634 zExp = bExp;
4635 }
4636 else {
4637 if ( aExp == 0x7FFF ) {
4638 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4639 return propagateFloat128NaN( a, b );
4640 }
4641 return a;
4642 }
4643 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4644 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4645 zSig2 = 0;
4646 zSig0 |= LIT64( 0x0002000000000000 );
4647 zExp = aExp;
4648 goto shiftRight1;
4649 }
4650 aSig0 |= LIT64( 0x0001000000000000 );
4651 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4652 --zExp;
4653 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4654 ++zExp;
4655 shiftRight1:
4656 shift128ExtraRightJamming(
4657 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4658 roundAndPack:
4659 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4660
4661 }
4662
4663 /*----------------------------------------------------------------------------
4664 | Returns the result of subtracting the absolute values of the quadruple-
4665 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4666 | difference is negated before being returned. `zSign' is ignored if the
4667 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4668 | Standard for Binary Floating-Point Arithmetic.
4669 *----------------------------------------------------------------------------*/
4670
subFloat128Sigs(float128 a,float128 b,flag zSign)4671 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4672 {
4673 int32 aExp, bExp, zExp;
4674 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4675 int32 expDiff;
4676 float128 z;
4677
4678 aSig1 = extractFloat128Frac1( a );
4679 aSig0 = extractFloat128Frac0( a );
4680 aExp = extractFloat128Exp( a );
4681 bSig1 = extractFloat128Frac1( b );
4682 bSig0 = extractFloat128Frac0( b );
4683 bExp = extractFloat128Exp( b );
4684 expDiff = aExp - bExp;
4685 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4686 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4687 if ( 0 < expDiff ) goto aExpBigger;
4688 if ( expDiff < 0 ) goto bExpBigger;
4689 if ( aExp == 0x7FFF ) {
4690 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4691 return propagateFloat128NaN( a, b );
4692 }
4693 float_raise( float_flag_invalid );
4694 z.low = float128_default_nan_low;
4695 z.high = float128_default_nan_high;
4696 return z;
4697 }
4698 if ( aExp == 0 ) {
4699 aExp = 1;
4700 bExp = 1;
4701 }
4702 if ( bSig0 < aSig0 ) goto aBigger;
4703 if ( aSig0 < bSig0 ) goto bBigger;
4704 if ( bSig1 < aSig1 ) goto aBigger;
4705 if ( aSig1 < bSig1 ) goto bBigger;
4706 return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4707 bExpBigger:
4708 if ( bExp == 0x7FFF ) {
4709 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4710 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4711 }
4712 if ( aExp == 0 ) {
4713 ++expDiff;
4714 }
4715 else {
4716 aSig0 |= LIT64( 0x4000000000000000 );
4717 }
4718 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4719 bSig0 |= LIT64( 0x4000000000000000 );
4720 bBigger:
4721 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4722 zExp = bExp;
4723 zSign ^= 1;
4724 goto normalizeRoundAndPack;
4725 aExpBigger:
4726 if ( aExp == 0x7FFF ) {
4727 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4728 return a;
4729 }
4730 if ( bExp == 0 ) {
4731 --expDiff;
4732 }
4733 else {
4734 bSig0 |= LIT64( 0x4000000000000000 );
4735 }
4736 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4737 aSig0 |= LIT64( 0x4000000000000000 );
4738 aBigger:
4739 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4740 zExp = aExp;
4741 normalizeRoundAndPack:
4742 --zExp;
4743 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4744
4745 }
4746
4747 /*----------------------------------------------------------------------------
4748 | Returns the result of adding the quadruple-precision floating-point values
4749 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4750 | for Binary Floating-Point Arithmetic.
4751 *----------------------------------------------------------------------------*/
4752
float128_add(float128 a,float128 b)4753 float128 float128_add( float128 a, float128 b )
4754 {
4755 flag aSign, bSign;
4756
4757 aSign = extractFloat128Sign( a );
4758 bSign = extractFloat128Sign( b );
4759 if ( aSign == bSign ) {
4760 return addFloat128Sigs( a, b, aSign );
4761 }
4762 else {
4763 return subFloat128Sigs( a, b, aSign );
4764 }
4765
4766 }
4767
4768 /*----------------------------------------------------------------------------
4769 | Returns the result of subtracting the quadruple-precision floating-point
4770 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4771 | Standard for Binary Floating-Point Arithmetic.
4772 *----------------------------------------------------------------------------*/
4773
float128_sub(float128 a,float128 b)4774 float128 float128_sub( float128 a, float128 b )
4775 {
4776 flag aSign, bSign;
4777
4778 aSign = extractFloat128Sign( a );
4779 bSign = extractFloat128Sign( b );
4780 if ( aSign == bSign ) {
4781 return subFloat128Sigs( a, b, aSign );
4782 }
4783 else {
4784 return addFloat128Sigs( a, b, aSign );
4785 }
4786
4787 }
4788
4789 /*----------------------------------------------------------------------------
4790 | Returns the result of multiplying the quadruple-precision floating-point
4791 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4792 | Standard for Binary Floating-Point Arithmetic.
4793 *----------------------------------------------------------------------------*/
4794
float128_mul(float128 a,float128 b)4795 float128 float128_mul( float128 a, float128 b )
4796 {
4797 flag aSign, bSign, zSign;
4798 int32 aExp, bExp, zExp;
4799 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4800 float128 z;
4801
4802 aSig1 = extractFloat128Frac1( a );
4803 aSig0 = extractFloat128Frac0( a );
4804 aExp = extractFloat128Exp( a );
4805 aSign = extractFloat128Sign( a );
4806 bSig1 = extractFloat128Frac1( b );
4807 bSig0 = extractFloat128Frac0( b );
4808 bExp = extractFloat128Exp( b );
4809 bSign = extractFloat128Sign( b );
4810 zSign = aSign ^ bSign;
4811 if ( aExp == 0x7FFF ) {
4812 if ( ( aSig0 | aSig1 )
4813 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4814 return propagateFloat128NaN( a, b );
4815 }
4816 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4817 return packFloat128( zSign, 0x7FFF, 0, 0 );
4818 }
4819 if ( bExp == 0x7FFF ) {
4820 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4821 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4822 invalid:
4823 float_raise( float_flag_invalid );
4824 z.low = float128_default_nan_low;
4825 z.high = float128_default_nan_high;
4826 return z;
4827 }
4828 return packFloat128( zSign, 0x7FFF, 0, 0 );
4829 }
4830 if ( aExp == 0 ) {
4831 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4832 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4833 }
4834 if ( bExp == 0 ) {
4835 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4836 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4837 }
4838 zExp = aExp + bExp - 0x4000;
4839 aSig0 |= LIT64( 0x0001000000000000 );
4840 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4841 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4842 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4843 zSig2 |= ( zSig3 != 0 );
4844 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4845 shift128ExtraRightJamming(
4846 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4847 ++zExp;
4848 }
4849 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4850
4851 }
4852
4853 /*----------------------------------------------------------------------------
4854 | Returns the result of dividing the quadruple-precision floating-point value
4855 | `a' by the corresponding value `b'. The operation is performed according to
4856 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4857 *----------------------------------------------------------------------------*/
4858
float128_div(float128 a,float128 b)4859 float128 float128_div( float128 a, float128 b )
4860 {
4861 flag aSign, bSign, zSign;
4862 int32 aExp, bExp, zExp;
4863 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4864 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4865 float128 z;
4866
4867 aSig1 = extractFloat128Frac1( a );
4868 aSig0 = extractFloat128Frac0( a );
4869 aExp = extractFloat128Exp( a );
4870 aSign = extractFloat128Sign( a );
4871 bSig1 = extractFloat128Frac1( b );
4872 bSig0 = extractFloat128Frac0( b );
4873 bExp = extractFloat128Exp( b );
4874 bSign = extractFloat128Sign( b );
4875 zSign = aSign ^ bSign;
4876 if ( aExp == 0x7FFF ) {
4877 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4878 if ( bExp == 0x7FFF ) {
4879 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4880 goto invalid;
4881 }
4882 return packFloat128( zSign, 0x7FFF, 0, 0 );
4883 }
4884 if ( bExp == 0x7FFF ) {
4885 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4886 return packFloat128( zSign, 0, 0, 0 );
4887 }
4888 if ( bExp == 0 ) {
4889 if ( ( bSig0 | bSig1 ) == 0 ) {
4890 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4891 invalid:
4892 float_raise( float_flag_invalid );
4893 z.low = float128_default_nan_low;
4894 z.high = float128_default_nan_high;
4895 return z;
4896 }
4897 float_raise( float_flag_divbyzero );
4898 return packFloat128( zSign, 0x7FFF, 0, 0 );
4899 }
4900 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4901 }
4902 if ( aExp == 0 ) {
4903 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4904 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4905 }
4906 zExp = aExp - bExp + 0x3FFD;
4907 shortShift128Left(
4908 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4909 shortShift128Left(
4910 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4911 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4912 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4913 ++zExp;
4914 }
4915 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4916 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4917 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4918 while ( (sbits64) rem0 < 0 ) {
4919 --zSig0;
4920 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4921 }
4922 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4923 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4924 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4925 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4926 while ( (sbits64) rem1 < 0 ) {
4927 --zSig1;
4928 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4929 }
4930 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4931 }
4932 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4933 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4934
4935 }
4936
4937 /*----------------------------------------------------------------------------
4938 | Returns the remainder of the quadruple-precision floating-point value `a'
4939 | with respect to the corresponding value `b'. The operation is performed
4940 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4941 *----------------------------------------------------------------------------*/
4942
float128_rem(float128 a,float128 b)4943 float128 float128_rem( float128 a, float128 b )
4944 {
4945 flag aSign, bSign, zSign;
4946 int32 aExp, bExp, expDiff;
4947 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4948 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4949 sbits64 sigMean0;
4950 float128 z;
4951
4952 aSig1 = extractFloat128Frac1( a );
4953 aSig0 = extractFloat128Frac0( a );
4954 aExp = extractFloat128Exp( a );
4955 aSign = extractFloat128Sign( a );
4956 bSig1 = extractFloat128Frac1( b );
4957 bSig0 = extractFloat128Frac0( b );
4958 bExp = extractFloat128Exp( b );
4959 bSign = extractFloat128Sign( b );
4960 if ( aExp == 0x7FFF ) {
4961 if ( ( aSig0 | aSig1 )
4962 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4963 return propagateFloat128NaN( a, b );
4964 }
4965 goto invalid;
4966 }
4967 if ( bExp == 0x7FFF ) {
4968 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4969 return a;
4970 }
4971 if ( bExp == 0 ) {
4972 if ( ( bSig0 | bSig1 ) == 0 ) {
4973 invalid:
4974 float_raise( float_flag_invalid );
4975 z.low = float128_default_nan_low;
4976 z.high = float128_default_nan_high;
4977 return z;
4978 }
4979 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4980 }
4981 if ( aExp == 0 ) {
4982 if ( ( aSig0 | aSig1 ) == 0 ) return a;
4983 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4984 }
4985 expDiff = aExp - bExp;
4986 if ( expDiff < -1 ) return a;
4987 shortShift128Left(
4988 aSig0 | LIT64( 0x0001000000000000 ),
4989 aSig1,
4990 15 - ( expDiff < 0 ),
4991 &aSig0,
4992 &aSig1
4993 );
4994 shortShift128Left(
4995 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4996 q = le128( bSig0, bSig1, aSig0, aSig1 );
4997 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4998 expDiff -= 64;
4999 while ( 0 < expDiff ) {
5000 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5001 q = ( 4 < q ) ? q - 4 : 0;
5002 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5003 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5004 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5005 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5006 expDiff -= 61;
5007 }
5008 if ( -64 < expDiff ) {
5009 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5010 q = ( 4 < q ) ? q - 4 : 0;
5011 q >>= - expDiff;
5012 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5013 expDiff += 52;
5014 if ( expDiff < 0 ) {
5015 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5016 }
5017 else {
5018 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5019 }
5020 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5021 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5022 }
5023 else {
5024 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5025 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5026 }
5027 do {
5028 alternateASig0 = aSig0;
5029 alternateASig1 = aSig1;
5030 ++q;
5031 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5032 } while ( 0 <= (sbits64) aSig0 );
5033 add128(
5034 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5035 if ( ( sigMean0 < 0 )
5036 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5037 aSig0 = alternateASig0;
5038 aSig1 = alternateASig1;
5039 }
5040 zSign = ( (sbits64) aSig0 < 0 );
5041 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5042 return
5043 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5044
5045 }
5046
5047 /*----------------------------------------------------------------------------
5048 | Returns the square root of the quadruple-precision floating-point value `a'.
5049 | The operation is performed according to the IEC/IEEE Standard for Binary
5050 | Floating-Point Arithmetic.
5051 *----------------------------------------------------------------------------*/
5052
float128_sqrt(float128 a)5053 float128 float128_sqrt( float128 a )
5054 {
5055 flag aSign;
5056 int32 aExp, zExp;
5057 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5058 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5059 float128 z;
5060
5061 aSig1 = extractFloat128Frac1( a );
5062 aSig0 = extractFloat128Frac0( a );
5063 aExp = extractFloat128Exp( a );
5064 aSign = extractFloat128Sign( a );
5065 if ( aExp == 0x7FFF ) {
5066 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5067 if ( ! aSign ) return a;
5068 goto invalid;
5069 }
5070 if ( aSign ) {
5071 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5072 invalid:
5073 float_raise( float_flag_invalid );
5074 z.low = float128_default_nan_low;
5075 z.high = float128_default_nan_high;
5076 return z;
5077 }
5078 if ( aExp == 0 ) {
5079 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5080 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5081 }
5082 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5083 aSig0 |= LIT64( 0x0001000000000000 );
5084 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5085 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5086 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5087 doubleZSig0 = zSig0<<1;
5088 mul64To128( zSig0, zSig0, &term0, &term1 );
5089 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5090 while ( (sbits64) rem0 < 0 ) {
5091 --zSig0;
5092 doubleZSig0 -= 2;
5093 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5094 }
5095 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5096 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5097 if ( zSig1 == 0 ) zSig1 = 1;
5098 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5099 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5100 mul64To128( zSig1, zSig1, &term2, &term3 );
5101 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5102 while ( (sbits64) rem1 < 0 ) {
5103 --zSig1;
5104 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5105 term3 |= 1;
5106 term2 |= doubleZSig0;
5107 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5108 }
5109 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5110 }
5111 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5112 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5113
5114 }
5115
5116 /*----------------------------------------------------------------------------
5117 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5118 | the corresponding value `b', and 0 otherwise. The comparison is performed
5119 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5120 *----------------------------------------------------------------------------*/
5121
float128_eq(float128 a,float128 b)5122 flag float128_eq( float128 a, float128 b )
5123 {
5124
5125 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5126 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5127 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5128 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5129 ) {
5130 if ( float128_is_signaling_nan( a )
5131 || float128_is_signaling_nan( b ) ) {
5132 float_raise( float_flag_invalid );
5133 }
5134 return 0;
5135 }
5136 return
5137 ( a.low == b.low )
5138 && ( ( a.high == b.high )
5139 || ( ( a.low == 0 )
5140 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5141 );
5142
5143 }
5144
5145 /*----------------------------------------------------------------------------
5146 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5147 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5148 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5149 | Arithmetic.
5150 *----------------------------------------------------------------------------*/
5151
float128_le(float128 a,float128 b)5152 flag float128_le( float128 a, float128 b )
5153 {
5154 flag aSign, bSign;
5155
5156 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5157 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5158 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5159 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5160 ) {
5161 float_raise( float_flag_invalid );
5162 return 0;
5163 }
5164 aSign = extractFloat128Sign( a );
5165 bSign = extractFloat128Sign( b );
5166 if ( aSign != bSign ) {
5167 return
5168 aSign
5169 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5170 == 0 );
5171 }
5172 return
5173 aSign ? le128( b.high, b.low, a.high, a.low )
5174 : le128( a.high, a.low, b.high, b.low );
5175
5176 }
5177
5178 /*----------------------------------------------------------------------------
5179 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5180 | the corresponding value `b', and 0 otherwise. The comparison is performed
5181 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5182 *----------------------------------------------------------------------------*/
5183
float128_lt(float128 a,float128 b)5184 flag float128_lt( float128 a, float128 b )
5185 {
5186 flag aSign, bSign;
5187
5188 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5189 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5190 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5191 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5192 ) {
5193 float_raise( float_flag_invalid );
5194 return 0;
5195 }
5196 aSign = extractFloat128Sign( a );
5197 bSign = extractFloat128Sign( b );
5198 if ( aSign != bSign ) {
5199 return
5200 aSign
5201 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5202 != 0 );
5203 }
5204 return
5205 aSign ? lt128( b.high, b.low, a.high, a.low )
5206 : lt128( a.high, a.low, b.high, b.low );
5207
5208 }
5209
5210 /*----------------------------------------------------------------------------
5211 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5212 | the corresponding value `b', and 0 otherwise. The invalid exception is
5213 | raised if either operand is a NaN. Otherwise, the comparison is performed
5214 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5215 *----------------------------------------------------------------------------*/
5216
float128_eq_signaling(float128 a,float128 b)5217 flag float128_eq_signaling( float128 a, float128 b )
5218 {
5219
5220 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5221 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5222 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5223 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5224 ) {
5225 float_raise( float_flag_invalid );
5226 return 0;
5227 }
5228 return
5229 ( a.low == b.low )
5230 && ( ( a.high == b.high )
5231 || ( ( a.low == 0 )
5232 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5233 );
5234
5235 }
5236
5237 /*----------------------------------------------------------------------------
5238 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5239 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5240 | cause an exception. Otherwise, the comparison is performed according to the
5241 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5242 *----------------------------------------------------------------------------*/
5243
float128_le_quiet(float128 a,float128 b)5244 flag float128_le_quiet( float128 a, float128 b )
5245 {
5246 flag aSign, bSign;
5247
5248 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5249 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5250 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5251 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5252 ) {
5253 if ( float128_is_signaling_nan( a )
5254 || float128_is_signaling_nan( b ) ) {
5255 float_raise( float_flag_invalid );
5256 }
5257 return 0;
5258 }
5259 aSign = extractFloat128Sign( a );
5260 bSign = extractFloat128Sign( b );
5261 if ( aSign != bSign ) {
5262 return
5263 aSign
5264 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5265 == 0 );
5266 }
5267 return
5268 aSign ? le128( b.high, b.low, a.high, a.low )
5269 : le128( a.high, a.low, b.high, b.low );
5270
5271 }
5272
5273 /*----------------------------------------------------------------------------
5274 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5275 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5276 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5277 | Standard for Binary Floating-Point Arithmetic.
5278 *----------------------------------------------------------------------------*/
5279
float128_lt_quiet(float128 a,float128 b)5280 flag float128_lt_quiet( float128 a, float128 b )
5281 {
5282 flag aSign, bSign;
5283
5284 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5285 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5286 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5287 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5288 ) {
5289 if ( float128_is_signaling_nan( a )
5290 || float128_is_signaling_nan( b ) ) {
5291 float_raise( float_flag_invalid );
5292 }
5293 return 0;
5294 }
5295 aSign = extractFloat128Sign( a );
5296 bSign = extractFloat128Sign( b );
5297 if ( aSign != bSign ) {
5298 return
5299 aSign
5300 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5301 != 0 );
5302 }
5303 return
5304 aSign ? lt128( b.high, b.low, a.high, a.low )
5305 : lt128( a.high, a.low, b.high, b.low );
5306
5307 }
5308
5309 #endif
5310
5311
5312 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5313
5314 /*
5315 * These two routines are not part of the original softfloat distribution.
5316 *
5317 * They are based on the corresponding conversions to integer but return
5318 * unsigned numbers instead since these functions are required by GCC.
5319 *
5320 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97
5321 *
5322 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5323 */
5324
5325 /*----------------------------------------------------------------------------
5326 | Returns the result of converting the double-precision floating-point value
5327 | `a' to the 32-bit unsigned integer format. The conversion is
5328 | performed according to the IEC/IEEE Standard for Binary Floating-point
5329 | Arithmetic, except that the conversion is always rounded toward zero. If
5330 | `a' is a NaN, the largest positive integer is returned. If the conversion
5331 | overflows, the largest integer positive is returned.
5332 *----------------------------------------------------------------------------*/
5333
float64_to_uint32_round_to_zero(float64 a)5334 uint32 float64_to_uint32_round_to_zero( float64 a )
5335 {
5336 flag aSign;
5337 int16 aExp, shiftCount;
5338 bits64 aSig, savedASig;
5339 uint32 z;
5340
5341 aSig = extractFloat64Frac( a );
5342 aExp = extractFloat64Exp( a );
5343 aSign = extractFloat64Sign( a );
5344
5345 if (aSign) {
5346 float_raise( float_flag_invalid );
5347 return(0);
5348 }
5349
5350 if ( 0x41E < aExp ) {
5351 float_raise( float_flag_invalid );
5352 return 0xffffffff;
5353 }
5354 else if ( aExp < 0x3FF ) {
5355 if ( aExp || aSig ) float_set_inexact();
5356 return 0;
5357 }
5358 aSig |= LIT64( 0x0010000000000000 );
5359 shiftCount = 0x433 - aExp;
5360 savedASig = aSig;
5361 aSig >>= shiftCount;
5362 z = aSig;
5363 if ( ( aSig<<shiftCount ) != savedASig ) {
5364 float_set_inexact();
5365 }
5366 return z;
5367
5368 }
5369
5370 /*----------------------------------------------------------------------------
5371 | Returns the result of converting the single-precision floating-point value
5372 | `a' to the 32-bit unsigned integer format. The conversion is
5373 | performed according to the IEC/IEEE Standard for Binary Floating-point
5374 | Arithmetic, except that the conversion is always rounded toward zero. If
5375 | `a' is a NaN, the largest positive integer is returned. If the conversion
5376 | overflows, the largest positive integer is returned.
5377 *----------------------------------------------------------------------------*/
5378
float32_to_uint32_round_to_zero(float32 a)5379 uint32 float32_to_uint32_round_to_zero( float32 a )
5380 {
5381 flag aSign;
5382 int16 aExp, shiftCount;
5383 bits32 aSig;
5384 uint32 z;
5385
5386 aSig = extractFloat32Frac( a );
5387 aExp = extractFloat32Exp( a );
5388 aSign = extractFloat32Sign( a );
5389 shiftCount = aExp - 0x9E;
5390
5391 if (aSign) {
5392 float_raise( float_flag_invalid );
5393 return(0);
5394 }
5395 if ( 0 < shiftCount ) {
5396 float_raise( float_flag_invalid );
5397 return 0xFFFFFFFF;
5398 }
5399 else if ( aExp <= 0x7E ) {
5400 if ( aExp | aSig ) float_set_inexact();
5401 return 0;
5402 }
5403 aSig = ( aSig | 0x800000 )<<8;
5404 z = aSig>>( - shiftCount );
5405 if ( aSig<<( shiftCount & 31 ) ) {
5406 float_set_inexact();
5407 }
5408 return z;
5409
5410 }
5411
5412 #endif
5413
5414 #endif /* _STANDALONE */
5415