1/*
2===============================================================================
3
4This C source file is part of the SoftFloat IEC/IEEE Floating-point
5Arithmetic Package, Release 2.
6
7Written by John R. Hauser.  This work was made possible in part by the
8International Computer Science Institute, located at Suite 600, 1947 Center
9Street, Berkeley, California 94704.  Funding was partially provided by the
10National Science Foundation under grant MIP-9311980.  The original version
11of this code was written as part of a project to build a fixed-point vector
12processor in collaboration with the University of California at Berkeley,
13overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15arithmetic/softfloat.html'.
16
17THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
18has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
20PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
22
23Derivative works are acceptable, even for commercial purposes, so long as
24(1) they include prominent notice that the work is derivative, and (2) they
25include prominent notice akin to these three paragraphs for those parts of
26this code that are retained.
27
28===============================================================================
29*/
30
31#include "fpa11.h"
32#include "milieu.h"
33#include "softfloat.h"
34
35/*
36-------------------------------------------------------------------------------
37Floating-point rounding mode, extended double-precision rounding precision,
38and exception flags.
39-------------------------------------------------------------------------------
40*/
41int8 float_rounding_mode = float_round_nearest_even;
42int8 floatx80_rounding_precision = 80;
43int8 float_exception_flags;
44
45/*
46-------------------------------------------------------------------------------
47Primitive arithmetic functions, including multi-word arithmetic, and
48division and square root approximations.  (Can be specialized to target if
49desired.)
50-------------------------------------------------------------------------------
51*/
52#include "softfloat-macros"
53
54/*
55-------------------------------------------------------------------------------
56Functions and definitions to determine:  (1) whether tininess for underflow
57is detected before or after rounding by default, (2) what (if anything)
58happens when exceptions are raised, (3) how signaling NaNs are distinguished
59from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60are propagated from function inputs to output.  These details are target-
61specific.
62-------------------------------------------------------------------------------
63*/
64#include "softfloat-specialize"
65
66/*
67-------------------------------------------------------------------------------
68Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
69and 7, and returns the properly rounded 32-bit integer corresponding to the
70input.  If `zSign' is nonzero, the input is negated before being converted
71to an integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point
72input is simply rounded to an integer, with the inexact exception raised if
73the input cannot be represented exactly as an integer.  If the fixed-point
74input is too large, however, the invalid exception is raised and the largest
75positive or negative integer is returned.
76-------------------------------------------------------------------------------
77*/
78static int32 roundAndPackInt32( flag zSign, bits64 absZ )
79{
80    int8 roundingMode;
81    flag roundNearestEven;
82    int8 roundIncrement, roundBits;
83    int32 z;
84
85    roundingMode = float_rounding_mode;
86    roundNearestEven = ( roundingMode == float_round_nearest_even );
87    roundIncrement = 0x40;
88    if ( ! roundNearestEven ) {
89        if ( roundingMode == float_round_to_zero ) {
90            roundIncrement = 0;
91        }
92        else {
93            roundIncrement = 0x7F;
94            if ( zSign ) {
95                if ( roundingMode == float_round_up ) roundIncrement = 0;
96            }
97            else {
98                if ( roundingMode == float_round_down ) roundIncrement = 0;
99            }
100        }
101    }
102    roundBits = absZ & 0x7F;
103    absZ = ( absZ + roundIncrement )>>7;
104    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
105    z = absZ;
106    if ( zSign ) z = - z;
107    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
108        float_exception_flags |= float_flag_invalid;
109        return zSign ? 0x80000000 : 0x7FFFFFFF;
110    }
111    if ( roundBits ) float_exception_flags |= float_flag_inexact;
112    return z;
113
114}
115
116/*
117-------------------------------------------------------------------------------
118Returns the fraction bits of the single-precision floating-point value `a'.
119-------------------------------------------------------------------------------
120*/
121INLINE bits32 extractFloat32Frac( float32 a )
122{
123
124    return a & 0x007FFFFF;
125
126}
127
128/*
129-------------------------------------------------------------------------------
130Returns the exponent bits of the single-precision floating-point value `a'.
131-------------------------------------------------------------------------------
132*/
133INLINE int16 extractFloat32Exp( float32 a )
134{
135
136    return ( a>>23 ) & 0xFF;
137
138}
139
140/*
141-------------------------------------------------------------------------------
142Returns the sign bit of the single-precision floating-point value `a'.
143-------------------------------------------------------------------------------
144*/
145INLINE flag extractFloat32Sign( float32 a )
146{
147
148    return a>>31;
149
150}
151
152/*
153-------------------------------------------------------------------------------
154Normalizes the subnormal single-precision floating-point value represented
155by the denormalized significand `aSig'.  The normalized exponent and
156significand are stored at the locations pointed to by `zExpPtr' and
157`zSigPtr', respectively.
158-------------------------------------------------------------------------------
159*/
160static void
161 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
162{
163    int8 shiftCount;
164
165    shiftCount = countLeadingZeros32( aSig ) - 8;
166    *zSigPtr = aSig<<shiftCount;
167    *zExpPtr = 1 - shiftCount;
168
169}
170
171/*
172-------------------------------------------------------------------------------
173Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
174single-precision floating-point value, returning the result.  After being
175shifted into the proper positions, the three fields are simply added
176together to form the result.  This means that any integer portion of `zSig'
177will be added into the exponent.  Since a properly normalized significand
178will have an integer portion equal to 1, the `zExp' input should be 1 less
179than the desired result exponent whenever `zSig' is a complete, normalized
180significand.
181-------------------------------------------------------------------------------
182*/
183INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
184{
185    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
186}
187
188/*
189-------------------------------------------------------------------------------
190Takes an abstract floating-point value having sign `zSign', exponent `zExp',
191and significand `zSig', and returns the proper single-precision floating-
192point value corresponding to the abstract input.  Ordinarily, the abstract
193value is simply rounded and packed into the single-precision format, with
194the inexact exception raised if the abstract input cannot be represented
195exactly.  If the abstract value is too large, however, the overflow and
196inexact exceptions are raised and an infinity or maximal finite value is
197returned.  If the abstract value is too small, the input value is rounded to
198a subnormal number, and the underflow and inexact exceptions are raised if
199the abstract input cannot be represented exactly as a subnormal single-
200precision floating-point number.
201    The input significand `zSig' has its binary point between bits 30
202and 29, which is 7 bits to the left of the usual location.  This shifted
203significand must be normalized or smaller.  If `zSig' is not normalized,
204`zExp' must be 0; in that case, the result returned is a subnormal number,
205and it must not require rounding.  In the usual case that `zSig' is
206normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
207The handling of underflow and overflow follows the IEC/IEEE Standard for
208Binary Floating-point Arithmetic.
209-------------------------------------------------------------------------------
210*/
211static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
212{
213    int8 roundingMode;
214    flag roundNearestEven;
215    int8 roundIncrement, roundBits;
216    flag isTiny;
217
218    roundingMode = float_rounding_mode;
219    roundNearestEven = ( roundingMode == float_round_nearest_even );
220    roundIncrement = 0x40;
221    if ( ! roundNearestEven ) {
222        if ( roundingMode == float_round_to_zero ) {
223            roundIncrement = 0;
224        }
225        else {
226            roundIncrement = 0x7F;
227            if ( zSign ) {
228                if ( roundingMode == float_round_up ) roundIncrement = 0;
229            }
230            else {
231                if ( roundingMode == float_round_down ) roundIncrement = 0;
232            }
233        }
234    }
235    roundBits = zSig & 0x7F;
236    if ( 0xFD <= (bits16) zExp ) {
237        if (    ( 0xFD < zExp )
238             || (    ( zExp == 0xFD )
239                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
240           ) {
241            float_raise( float_flag_overflow | float_flag_inexact );
242            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
243        }
244        if ( zExp < 0 ) {
245            isTiny =
246                   ( float_detect_tininess == float_tininess_before_rounding )
247                || ( zExp < -1 )
248                || ( zSig + roundIncrement < 0x80000000 );
249            shift32RightJamming( zSig, - zExp, &zSig );
250            zExp = 0;
251            roundBits = zSig & 0x7F;
252            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
253        }
254    }
255    if ( roundBits ) float_exception_flags |= float_flag_inexact;
256    zSig = ( zSig + roundIncrement )>>7;
257    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
258    if ( zSig == 0 ) zExp = 0;
259    return packFloat32( zSign, zExp, zSig );
260
261}
262
263/*
264-------------------------------------------------------------------------------
265Takes an abstract floating-point value having sign `zSign', exponent `zExp',
266and significand `zSig', and returns the proper single-precision floating-
267point value corresponding to the abstract input.  This routine is just like
268`roundAndPackFloat32' except that `zSig' does not have to be normalized in
269any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
270point exponent.
271-------------------------------------------------------------------------------
272*/
273static float32
274 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
275{
276    int8 shiftCount;
277
278    shiftCount = countLeadingZeros32( zSig ) - 1;
279    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
280
281}
282
283/*
284-------------------------------------------------------------------------------
285Returns the fraction bits of the double-precision floating-point value `a'.
286-------------------------------------------------------------------------------
287*/
288INLINE bits64 extractFloat64Frac( float64 a )
289{
290
291    return a & LIT64( 0x000FFFFFFFFFFFFF );
292
293}
294
295/*
296-------------------------------------------------------------------------------
297Returns the exponent bits of the double-precision floating-point value `a'.
298-------------------------------------------------------------------------------
299*/
300INLINE int16 extractFloat64Exp( float64 a )
301{
302
303    return ( a>>52 ) & 0x7FF;
304
305}
306
307/*
308-------------------------------------------------------------------------------
309Returns the sign bit of the double-precision floating-point value `a'.
310-------------------------------------------------------------------------------
311*/
312INLINE flag extractFloat64Sign( float64 a )
313{
314
315    return a>>63;
316
317}
318
319/*
320-------------------------------------------------------------------------------
321Normalizes the subnormal double-precision floating-point value represented
322by the denormalized significand `aSig'.  The normalized exponent and
323significand are stored at the locations pointed to by `zExpPtr' and
324`zSigPtr', respectively.
325-------------------------------------------------------------------------------
326*/
327static void
328 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
329{
330    int8 shiftCount;
331
332    shiftCount = countLeadingZeros64( aSig ) - 11;
333    *zSigPtr = aSig<<shiftCount;
334    *zExpPtr = 1 - shiftCount;
335
336}
337
338/*
339-------------------------------------------------------------------------------
340Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
341double-precision floating-point value, returning the result.  After being
342shifted into the proper positions, the three fields are simply added
343together to form the result.  This means that any integer portion of `zSig'
344will be added into the exponent.  Since a properly normalized significand
345will have an integer portion equal to 1, the `zExp' input should be 1 less
346than the desired result exponent whenever `zSig' is a complete, normalized
347significand.
348-------------------------------------------------------------------------------
349*/
350INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
351{
352
353    return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
354
355}
356
357/*
358-------------------------------------------------------------------------------
359Takes an abstract floating-point value having sign `zSign', exponent `zExp',
360and significand `zSig', and returns the proper double-precision floating-
361point value corresponding to the abstract input.  Ordinarily, the abstract
362value is simply rounded and packed into the double-precision format, with
363the inexact exception raised if the abstract input cannot be represented
364exactly.  If the abstract value is too large, however, the overflow and
365inexact exceptions are raised and an infinity or maximal finite value is
366returned.  If the abstract value is too small, the input value is rounded to
367a subnormal number, and the underflow and inexact exceptions are raised if
368the abstract input cannot be represented exactly as a subnormal double-
369precision floating-point number.
370    The input significand `zSig' has its binary point between bits 62
371and 61, which is 10 bits to the left of the usual location.  This shifted
372significand must be normalized or smaller.  If `zSig' is not normalized,
373`zExp' must be 0; in that case, the result returned is a subnormal number,
374and it must not require rounding.  In the usual case that `zSig' is
375normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
376The handling of underflow and overflow follows the IEC/IEEE Standard for
377Binary Floating-point Arithmetic.
378-------------------------------------------------------------------------------
379*/
380static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
381{
382    int8 roundingMode;
383    flag roundNearestEven;
384    int16 roundIncrement, roundBits;
385    flag isTiny;
386
387    roundingMode = float_rounding_mode;
388    roundNearestEven = ( roundingMode == float_round_nearest_even );
389    roundIncrement = 0x200;
390    if ( ! roundNearestEven ) {
391        if ( roundingMode == float_round_to_zero ) {
392            roundIncrement = 0;
393        }
394        else {
395            roundIncrement = 0x3FF;
396            if ( zSign ) {
397                if ( roundingMode == float_round_up ) roundIncrement = 0;
398            }
399            else {
400                if ( roundingMode == float_round_down ) roundIncrement = 0;
401            }
402        }
403    }
404    roundBits = zSig & 0x3FF;
405    if ( 0x7FD <= (bits16) zExp ) {
406        if (    ( 0x7FD < zExp )
407             || (    ( zExp == 0x7FD )
408                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
409           ) {
410            //register int lr = __builtin_return_address(0);
411            //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
412            float_raise( float_flag_overflow | float_flag_inexact );
413            return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
414        }
415        if ( zExp < 0 ) {
416            isTiny =
417                   ( float_detect_tininess == float_tininess_before_rounding )
418                || ( zExp < -1 )
419                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
420            shift64RightJamming( zSig, - zExp, &zSig );
421            zExp = 0;
422            roundBits = zSig & 0x3FF;
423            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
424        }
425    }
426    if ( roundBits ) float_exception_flags |= float_flag_inexact;
427    zSig = ( zSig + roundIncrement )>>10;
428    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
429    if ( zSig == 0 ) zExp = 0;
430    return packFloat64( zSign, zExp, zSig );
431
432}
433
434/*
435-------------------------------------------------------------------------------
436Takes an abstract floating-point value having sign `zSign', exponent `zExp',
437and significand `zSig', and returns the proper double-precision floating-
438point value corresponding to the abstract input.  This routine is just like
439`roundAndPackFloat64' except that `zSig' does not have to be normalized in
440any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
441point exponent.
442-------------------------------------------------------------------------------
443*/
444static float64
445 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
446{
447    int8 shiftCount;
448
449    shiftCount = countLeadingZeros64( zSig ) - 1;
450    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
451
452}
453
454#ifdef FLOATX80
455
456/*
457-------------------------------------------------------------------------------
458Returns the fraction bits of the extended double-precision floating-point
459value `a'.
460-------------------------------------------------------------------------------
461*/
462INLINE bits64 extractFloatx80Frac( floatx80 a )
463{
464
465    return a.low;
466
467}
468
469/*
470-------------------------------------------------------------------------------
471Returns the exponent bits of the extended double-precision floating-point
472value `a'.
473-------------------------------------------------------------------------------
474*/
475INLINE int32 extractFloatx80Exp( floatx80 a )
476{
477
478    return a.high & 0x7FFF;
479
480}
481
482/*
483-------------------------------------------------------------------------------
484Returns the sign bit of the extended double-precision floating-point value
485`a'.
486-------------------------------------------------------------------------------
487*/
488INLINE flag extractFloatx80Sign( floatx80 a )
489{
490
491    return a.high>>15;
492
493}
494
495/*
496-------------------------------------------------------------------------------
497Normalizes the subnormal extended double-precision floating-point value
498represented by the denormalized significand `aSig'.  The normalized exponent
499and significand are stored at the locations pointed to by `zExpPtr' and
500`zSigPtr', respectively.
501-------------------------------------------------------------------------------
502*/
503static void
504 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
505{
506    int8 shiftCount;
507
508    shiftCount = countLeadingZeros64( aSig );
509    *zSigPtr = aSig<<shiftCount;
510    *zExpPtr = 1 - shiftCount;
511
512}
513
514/*
515-------------------------------------------------------------------------------
516Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
517extended double-precision floating-point value, returning the result.
518-------------------------------------------------------------------------------
519*/
520INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
521{
522    floatx80 z;
523
524    z.low = zSig;
525    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
526    return z;
527
528}
529
530/*
531-------------------------------------------------------------------------------
532Takes an abstract floating-point value having sign `zSign', exponent `zExp',
533and extended significand formed by the concatenation of `zSig0' and `zSig1',
534and returns the proper extended double-precision floating-point value
535corresponding to the abstract input.  Ordinarily, the abstract value is
536rounded and packed into the extended double-precision format, with the
537inexact exception raised if the abstract input cannot be represented
538exactly.  If the abstract value is too large, however, the overflow and
539inexact exceptions are raised and an infinity or maximal finite value is
540returned.  If the abstract value is too small, the input value is rounded to
541a subnormal number, and the underflow and inexact exceptions are raised if
542the abstract input cannot be represented exactly as a subnormal extended
543double-precision floating-point number.
544    If `roundingPrecision' is 32 or 64, the result is rounded to the same
545number of bits as single or double precision, respectively.  Otherwise, the
546result is rounded to the full precision of the extended double-precision
547format.
548    The input significand must be normalized or smaller.  If the input
549significand is not normalized, `zExp' must be 0; in that case, the result
550returned is a subnormal number, and it must not require rounding.  The
551handling of underflow and overflow follows the IEC/IEEE Standard for Binary
552Floating-point Arithmetic.
553-------------------------------------------------------------------------------
554*/
555static floatx80
556 roundAndPackFloatx80(
557     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
558 )
559{
560    int8 roundingMode;
561    flag roundNearestEven, increment, isTiny;
562    int64 roundIncrement, roundMask, roundBits;
563
564    roundingMode = float_rounding_mode;
565    roundNearestEven = ( roundingMode == float_round_nearest_even );
566    if ( roundingPrecision == 80 ) goto precision80;
567    if ( roundingPrecision == 64 ) {
568        roundIncrement = LIT64( 0x0000000000000400 );
569        roundMask = LIT64( 0x00000000000007FF );
570    }
571    else if ( roundingPrecision == 32 ) {
572        roundIncrement = LIT64( 0x0000008000000000 );
573        roundMask = LIT64( 0x000000FFFFFFFFFF );
574    }
575    else {
576        goto precision80;
577    }
578    zSig0 |= ( zSig1 != 0 );
579    if ( ! roundNearestEven ) {
580        if ( roundingMode == float_round_to_zero ) {
581            roundIncrement = 0;
582        }
583        else {
584            roundIncrement = roundMask;
585            if ( zSign ) {
586                if ( roundingMode == float_round_up ) roundIncrement = 0;
587            }
588            else {
589                if ( roundingMode == float_round_down ) roundIncrement = 0;
590            }
591        }
592    }
593    roundBits = zSig0 & roundMask;
594    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
595        if (    ( 0x7FFE < zExp )
596             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
597           ) {
598            goto overflow;
599        }
600        if ( zExp <= 0 ) {
601            isTiny =
602                   ( float_detect_tininess == float_tininess_before_rounding )
603                || ( zExp < 0 )
604                || ( zSig0 <= zSig0 + roundIncrement );
605            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
606            zExp = 0;
607            roundBits = zSig0 & roundMask;
608            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
609            if ( roundBits ) float_exception_flags |= float_flag_inexact;
610            zSig0 += roundIncrement;
611            if ( (sbits64) zSig0 < 0 ) zExp = 1;
612            roundIncrement = roundMask + 1;
613            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
614                roundMask |= roundIncrement;
615            }
616            zSig0 &= ~ roundMask;
617            return packFloatx80( zSign, zExp, zSig0 );
618        }
619    }
620    if ( roundBits ) float_exception_flags |= float_flag_inexact;
621    zSig0 += roundIncrement;
622    if ( zSig0 < roundIncrement ) {
623        ++zExp;
624        zSig0 = LIT64( 0x8000000000000000 );
625    }
626    roundIncrement = roundMask + 1;
627    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
628        roundMask |= roundIncrement;
629    }
630    zSig0 &= ~ roundMask;
631    if ( zSig0 == 0 ) zExp = 0;
632    return packFloatx80( zSign, zExp, zSig0 );
633 precision80:
634    increment = ( (sbits64) zSig1 < 0 );
635    if ( ! roundNearestEven ) {
636        if ( roundingMode == float_round_to_zero ) {
637            increment = 0;
638        }
639        else {
640            if ( zSign ) {
641                increment = ( roundingMode == float_round_down ) && zSig1;
642            }
643            else {
644                increment = ( roundingMode == float_round_up ) && zSig1;
645            }
646        }
647    }
648    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
649        if (    ( 0x7FFE < zExp )
650             || (    ( zExp == 0x7FFE )
651                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
652                  && increment
653                )
654           ) {
655            roundMask = 0;
656 overflow:
657            float_raise( float_flag_overflow | float_flag_inexact );
658            if (    ( roundingMode == float_round_to_zero )
659                 || ( zSign && ( roundingMode == float_round_up ) )
660                 || ( ! zSign && ( roundingMode == float_round_down ) )
661               ) {
662                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
663            }
664            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
665        }
666        if ( zExp <= 0 ) {
667            isTiny =
668                   ( float_detect_tininess == float_tininess_before_rounding )
669                || ( zExp < 0 )
670                || ! increment
671                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
672            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
673            zExp = 0;
674            if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
675            if ( zSig1 ) float_exception_flags |= float_flag_inexact;
676            if ( roundNearestEven ) {
677                increment = ( (sbits64) zSig1 < 0 );
678            }
679            else {
680                if ( zSign ) {
681                    increment = ( roundingMode == float_round_down ) && zSig1;
682                }
683                else {
684                    increment = ( roundingMode == float_round_up ) && zSig1;
685                }
686            }
687            if ( increment ) {
688                ++zSig0;
689                zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
690                if ( (sbits64) zSig0 < 0 ) zExp = 1;
691            }
692            return packFloatx80( zSign, zExp, zSig0 );
693        }
694    }
695    if ( zSig1 ) float_exception_flags |= float_flag_inexact;
696    if ( increment ) {
697        ++zSig0;
698        if ( zSig0 == 0 ) {
699            ++zExp;
700            zSig0 = LIT64( 0x8000000000000000 );
701        }
702        else {
703            zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
704        }
705    }
706    else {
707        if ( zSig0 == 0 ) zExp = 0;
708    }
709
710    return packFloatx80( zSign, zExp, zSig0 );
711}
712
713/*
714-------------------------------------------------------------------------------
715Takes an abstract floating-point value having sign `zSign', exponent
716`zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
717and returns the proper extended double-precision floating-point value
718corresponding to the abstract input.  This routine is just like
719`roundAndPackFloatx80' except that the input significand does not have to be
720normalized.
721-------------------------------------------------------------------------------
722*/
723static floatx80
724 normalizeRoundAndPackFloatx80(
725     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
726 )
727{
728    int8 shiftCount;
729
730    if ( zSig0 == 0 ) {
731        zSig0 = zSig1;
732        zSig1 = 0;
733        zExp -= 64;
734    }
735    shiftCount = countLeadingZeros64( zSig0 );
736    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
737    zExp -= shiftCount;
738    return
739        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
740
741}
742
743#endif
744
745/*
746-------------------------------------------------------------------------------
747Returns the result of converting the 32-bit two's complement integer `a' to
748the single-precision floating-point format.  The conversion is performed
749according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
750-------------------------------------------------------------------------------
751*/
752float32 int32_to_float32( int32 a )
753{
754    flag zSign;
755
756    if ( a == 0 ) return 0;
757    if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
758    zSign = ( a < 0 );
759    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
760
761}
762
763/*
764-------------------------------------------------------------------------------
765Returns the result of converting the 32-bit two's complement integer `a' to
766the double-precision floating-point format.  The conversion is performed
767according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
768-------------------------------------------------------------------------------
769*/
770float64 int32_to_float64( int32 a )
771{
772    flag aSign;
773    uint32 absA;
774    int8 shiftCount;
775    bits64 zSig;
776
777    if ( a == 0 ) return 0;
778    aSign = ( a < 0 );
779    absA = aSign ? - a : a;
780    shiftCount = countLeadingZeros32( absA ) + 21;
781    zSig = absA;
782    return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
783
784}
785
786#ifdef FLOATX80
787
788/*
789-------------------------------------------------------------------------------
790Returns the result of converting the 32-bit two's complement integer `a'
791to the extended double-precision floating-point format.  The conversion
792is performed according to the IEC/IEEE Standard for Binary Floating-point
793Arithmetic.
794-------------------------------------------------------------------------------
795*/
796floatx80 int32_to_floatx80( int32 a )
797{
798    flag zSign;
799    uint32 absA;
800    int8 shiftCount;
801    bits64 zSig;
802
803    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
804    zSign = ( a < 0 );
805    absA = zSign ? - a : a;
806    shiftCount = countLeadingZeros32( absA ) + 32;
807    zSig = absA;
808    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
809
810}
811
812#endif
813
814/*
815-------------------------------------------------------------------------------
816Returns the result of converting the single-precision floating-point value
817`a' to the 32-bit two's complement integer format.  The conversion is
818performed according to the IEC/IEEE Standard for Binary Floating-point
819Arithmetic---which means in particular that the conversion is rounded
820according to the current rounding mode.  If `a' is a NaN, the largest
821positive integer is returned.  Otherwise, if the conversion overflows, the
822largest integer with the same sign as `a' is returned.
823-------------------------------------------------------------------------------
824*/
825int32 float32_to_int32( float32 a )
826{
827    flag aSign;
828    int16 aExp, shiftCount;
829    bits32 aSig;
830    bits64 zSig;
831
832    aSig = extractFloat32Frac( a );
833    aExp = extractFloat32Exp( a );
834    aSign = extractFloat32Sign( a );
835    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
836    if ( aExp ) aSig |= 0x00800000;
837    shiftCount = 0xAF - aExp;
838    zSig = aSig;
839    zSig <<= 32;
840    if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
841    return roundAndPackInt32( aSign, zSig );
842
843}
844
845/*
846-------------------------------------------------------------------------------
847Returns the result of converting the single-precision floating-point value
848`a' to the 32-bit two's complement integer format.  The conversion is
849performed according to the IEC/IEEE Standard for Binary Floating-point
850Arithmetic, except that the conversion is always rounded toward zero.  If
851`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
852conversion overflows, the largest integer with the same sign as `a' is
853returned.
854-------------------------------------------------------------------------------
855*/
856int32 float32_to_int32_round_to_zero( float32 a )
857{
858    flag aSign;
859    int16 aExp, shiftCount;
860    bits32 aSig;
861    int32 z;
862
863    aSig = extractFloat32Frac( a );
864    aExp = extractFloat32Exp( a );
865    aSign = extractFloat32Sign( a );
866    shiftCount = aExp - 0x9E;
867    if ( 0 <= shiftCount ) {
868        if ( a == 0xCF000000 ) return 0x80000000;
869        float_raise( float_flag_invalid );
870        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
871        return 0x80000000;
872    }
873    else if ( aExp <= 0x7E ) {
874        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
875        return 0;
876    }
877    aSig = ( aSig | 0x00800000 )<<8;
878    z = aSig>>( - shiftCount );
879    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
880        float_exception_flags |= float_flag_inexact;
881    }
882    return aSign ? - z : z;
883
884}
885
886/*
887-------------------------------------------------------------------------------
888Returns the result of converting the single-precision floating-point value
889`a' to the double-precision floating-point format.  The conversion is
890performed according to the IEC/IEEE Standard for Binary Floating-point
891Arithmetic.
892-------------------------------------------------------------------------------
893*/
894float64 float32_to_float64( float32 a )
895{
896    flag aSign;
897    int16 aExp;
898    bits32 aSig;
899
900    aSig = extractFloat32Frac( a );
901    aExp = extractFloat32Exp( a );
902    aSign = extractFloat32Sign( a );
903    if ( aExp == 0xFF ) {
904        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
905        return packFloat64( aSign, 0x7FF, 0 );
906    }
907    if ( aExp == 0 ) {
908        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
909        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
910        --aExp;
911    }
912    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
913
914}
915
916#ifdef FLOATX80
917
918/*
919-------------------------------------------------------------------------------
920Returns the result of converting the single-precision floating-point value
921`a' to the extended double-precision floating-point format.  The conversion
922is performed according to the IEC/IEEE Standard for Binary Floating-point
923Arithmetic.
924-------------------------------------------------------------------------------
925*/
926floatx80 float32_to_floatx80( float32 a )
927{
928    flag aSign;
929    int16 aExp;
930    bits32 aSig;
931
932    aSig = extractFloat32Frac( a );
933    aExp = extractFloat32Exp( a );
934    aSign = extractFloat32Sign( a );
935    if ( aExp == 0xFF ) {
936        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
937        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
938    }
939    if ( aExp == 0 ) {
940        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
941        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
942    }
943    aSig |= 0x00800000;
944    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
945
946}
947
948#endif
949
950/*
951-------------------------------------------------------------------------------
952Rounds the single-precision floating-point value `a' to an integer, and
953returns the result as a single-precision floating-point value.  The
954operation is performed according to the IEC/IEEE Standard for Binary
955Floating-point Arithmetic.
956-------------------------------------------------------------------------------
957*/
958float32 float32_round_to_int( float32 a )
959{
960    flag aSign;
961    int16 aExp;
962    bits32 lastBitMask, roundBitsMask;
963    int8 roundingMode;
964    float32 z;
965
966    aExp = extractFloat32Exp( a );
967    if ( 0x96 <= aExp ) {
968        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
969            return propagateFloat32NaN( a, a );
970        }
971        return a;
972    }
973    if ( aExp <= 0x7E ) {
974        if ( (bits32) ( a<<1 ) == 0 ) return a;
975        float_exception_flags |= float_flag_inexact;
976        aSign = extractFloat32Sign( a );
977        switch ( float_rounding_mode ) {
978         case float_round_nearest_even:
979            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
980                return packFloat32( aSign, 0x7F, 0 );
981            }
982            break;
983         case float_round_down:
984            return aSign ? 0xBF800000 : 0;
985         case float_round_up:
986            return aSign ? 0x80000000 : 0x3F800000;
987        }
988        return packFloat32( aSign, 0, 0 );
989    }
990    lastBitMask = 1;
991    lastBitMask <<= 0x96 - aExp;
992    roundBitsMask = lastBitMask - 1;
993    z = a;
994    roundingMode = float_rounding_mode;
995    if ( roundingMode == float_round_nearest_even ) {
996        z += lastBitMask>>1;
997        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
998    }
999    else if ( roundingMode != float_round_to_zero ) {
1000        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1001            z += roundBitsMask;
1002        }
1003    }
1004    z &= ~ roundBitsMask;
1005    if ( z != a ) float_exception_flags |= float_flag_inexact;
1006    return z;
1007
1008}
1009
1010/*
1011-------------------------------------------------------------------------------
1012Returns the result of adding the absolute values of the single-precision
1013floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1014before being returned.  `zSign' is ignored if the result is a NaN.  The
1015addition is performed according to the IEC/IEEE Standard for Binary
1016Floating-point Arithmetic.
1017-------------------------------------------------------------------------------
1018*/
1019static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1020{
1021    int16 aExp, bExp, zExp;
1022    bits32 aSig, bSig, zSig;
1023    int16 expDiff;
1024
1025    aSig = extractFloat32Frac( a );
1026    aExp = extractFloat32Exp( a );
1027    bSig = extractFloat32Frac( b );
1028    bExp = extractFloat32Exp( b );
1029    expDiff = aExp - bExp;
1030    aSig <<= 6;
1031    bSig <<= 6;
1032    if ( 0 < expDiff ) {
1033        if ( aExp == 0xFF ) {
1034            if ( aSig ) return propagateFloat32NaN( a, b );
1035            return a;
1036        }
1037        if ( bExp == 0 ) {
1038            --expDiff;
1039        }
1040        else {
1041            bSig |= 0x20000000;
1042        }
1043        shift32RightJamming( bSig, expDiff, &bSig );
1044        zExp = aExp;
1045    }
1046    else if ( expDiff < 0 ) {
1047        if ( bExp == 0xFF ) {
1048            if ( bSig ) return propagateFloat32NaN( a, b );
1049            return packFloat32( zSign, 0xFF, 0 );
1050        }
1051        if ( aExp == 0 ) {
1052            ++expDiff;
1053        }
1054        else {
1055            aSig |= 0x20000000;
1056        }
1057        shift32RightJamming( aSig, - expDiff, &aSig );
1058        zExp = bExp;
1059    }
1060    else {
1061        if ( aExp == 0xFF ) {
1062            if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1063            return a;
1064        }
1065        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1066        zSig = 0x40000000 + aSig + bSig;
1067        zExp = aExp;
1068        goto roundAndPack;
1069    }
1070    aSig |= 0x20000000;
1071    zSig = ( aSig + bSig )<<1;
1072    --zExp;
1073    if ( (sbits32) zSig < 0 ) {
1074        zSig = aSig + bSig;
1075        ++zExp;
1076    }
1077 roundAndPack:
1078    return roundAndPackFloat32( zSign, zExp, zSig );
1079
1080}
1081
1082/*
1083-------------------------------------------------------------------------------
1084Returns the result of subtracting the absolute values of the single-
1085precision floating-point values `a' and `b'.  If `zSign' is true, the
1086difference is negated before being returned.  `zSign' is ignored if the
1087result is a NaN.  The subtraction is performed according to the IEC/IEEE
1088Standard for Binary Floating-point Arithmetic.
1089-------------------------------------------------------------------------------
1090*/
1091static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1092{
1093    int16 aExp, bExp, zExp;
1094    bits32 aSig, bSig, zSig;
1095    int16 expDiff;
1096
1097    aSig = extractFloat32Frac( a );
1098    aExp = extractFloat32Exp( a );
1099    bSig = extractFloat32Frac( b );
1100    bExp = extractFloat32Exp( b );
1101    expDiff = aExp - bExp;
1102    aSig <<= 7;
1103    bSig <<= 7;
1104    if ( 0 < expDiff ) goto aExpBigger;
1105    if ( expDiff < 0 ) goto bExpBigger;
1106    if ( aExp == 0xFF ) {
1107        if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1108        float_raise( float_flag_invalid );
1109        return float32_default_nan;
1110    }
1111    if ( aExp == 0 ) {
1112        aExp = 1;
1113        bExp = 1;
1114    }
1115    if ( bSig < aSig ) goto aBigger;
1116    if ( aSig < bSig ) goto bBigger;
1117    return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1118 bExpBigger:
1119    if ( bExp == 0xFF ) {
1120        if ( bSig ) return propagateFloat32NaN( a, b );
1121        return packFloat32( zSign ^ 1, 0xFF, 0 );
1122    }
1123    if ( aExp == 0 ) {
1124        ++expDiff;
1125    }
1126    else {
1127        aSig |= 0x40000000;
1128    }
1129    shift32RightJamming( aSig, - expDiff, &aSig );
1130    bSig |= 0x40000000;
1131 bBigger:
1132    zSig = bSig - aSig;
1133    zExp = bExp;
1134    zSign ^= 1;
1135    goto normalizeRoundAndPack;
1136 aExpBigger:
1137    if ( aExp == 0xFF ) {
1138        if ( aSig ) return propagateFloat32NaN( a, b );
1139        return a;
1140    }
1141    if ( bExp == 0 ) {
1142        --expDiff;
1143    }
1144    else {
1145        bSig |= 0x40000000;
1146    }
1147    shift32RightJamming( bSig, expDiff, &bSig );
1148    aSig |= 0x40000000;
1149 aBigger:
1150    zSig = aSig - bSig;
1151    zExp = aExp;
1152 normalizeRoundAndPack:
1153    --zExp;
1154    return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1155
1156}
1157
1158/*
1159-------------------------------------------------------------------------------
1160Returns the result of adding the single-precision floating-point values `a'
1161and `b'.  The operation is performed according to the IEC/IEEE Standard for
1162Binary Floating-point Arithmetic.
1163-------------------------------------------------------------------------------
1164*/
1165float32 float32_add( float32 a, float32 b )
1166{
1167    flag aSign, bSign;
1168
1169    aSign = extractFloat32Sign( a );
1170    bSign = extractFloat32Sign( b );
1171    if ( aSign == bSign ) {
1172        return addFloat32Sigs( a, b, aSign );
1173    }
1174    else {
1175        return subFloat32Sigs( a, b, aSign );
1176    }
1177
1178}
1179
1180/*
1181-------------------------------------------------------------------------------
1182Returns the result of subtracting the single-precision floating-point values
1183`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1184for Binary Floating-point Arithmetic.
1185-------------------------------------------------------------------------------
1186*/
1187float32 float32_sub( float32 a, float32 b )
1188{
1189    flag aSign, bSign;
1190
1191    aSign = extractFloat32Sign( a );
1192    bSign = extractFloat32Sign( b );
1193    if ( aSign == bSign ) {
1194        return subFloat32Sigs( a, b, aSign );
1195    }
1196    else {
1197        return addFloat32Sigs( a, b, aSign );
1198    }
1199
1200}
1201
1202/*
1203-------------------------------------------------------------------------------
1204Returns the result of multiplying the single-precision floating-point values
1205`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1206for Binary Floating-point Arithmetic.
1207-------------------------------------------------------------------------------
1208*/
1209float32 float32_mul( float32 a, float32 b )
1210{
1211    flag aSign, bSign, zSign;
1212    int16 aExp, bExp, zExp;
1213    bits32 aSig, bSig;
1214    bits64 zSig64;
1215    bits32 zSig;
1216
1217    aSig = extractFloat32Frac( a );
1218    aExp = extractFloat32Exp( a );
1219    aSign = extractFloat32Sign( a );
1220    bSig = extractFloat32Frac( b );
1221    bExp = extractFloat32Exp( b );
1222    bSign = extractFloat32Sign( b );
1223    zSign = aSign ^ bSign;
1224    if ( aExp == 0xFF ) {
1225        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1226            return propagateFloat32NaN( a, b );
1227        }
1228        if ( ( bExp | bSig ) == 0 ) {
1229            float_raise( float_flag_invalid );
1230            return float32_default_nan;
1231        }
1232        return packFloat32( zSign, 0xFF, 0 );
1233    }
1234    if ( bExp == 0xFF ) {
1235        if ( bSig ) return propagateFloat32NaN( a, b );
1236        if ( ( aExp | aSig ) == 0 ) {
1237            float_raise( float_flag_invalid );
1238            return float32_default_nan;
1239        }
1240        return packFloat32( zSign, 0xFF, 0 );
1241    }
1242    if ( aExp == 0 ) {
1243        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1244        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1245    }
1246    if ( bExp == 0 ) {
1247        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1248        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1249    }
1250    zExp = aExp + bExp - 0x7F;
1251    aSig = ( aSig | 0x00800000 )<<7;
1252    bSig = ( bSig | 0x00800000 )<<8;
1253    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1254    zSig = zSig64;
1255    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1256        zSig <<= 1;
1257        --zExp;
1258    }
1259    return roundAndPackFloat32( zSign, zExp, zSig );
1260
1261}
1262
1263/*
1264-------------------------------------------------------------------------------
1265Returns the result of dividing the single-precision floating-point value `a'
1266by the corresponding value `b'.  The operation is performed according to the
1267IEC/IEEE Standard for Binary Floating-point Arithmetic.
1268-------------------------------------------------------------------------------
1269*/
1270float32 float32_div( float32 a, float32 b )
1271{
1272    flag aSign, bSign, zSign;
1273    int16 aExp, bExp, zExp;
1274    bits32 aSig, bSig, zSig;
1275
1276    aSig = extractFloat32Frac( a );
1277    aExp = extractFloat32Exp( a );
1278    aSign = extractFloat32Sign( a );
1279    bSig = extractFloat32Frac( b );
1280    bExp = extractFloat32Exp( b );
1281    bSign = extractFloat32Sign( b );
1282    zSign = aSign ^ bSign;
1283    if ( aExp == 0xFF ) {
1284        if ( aSig ) return propagateFloat32NaN( a, b );
1285        if ( bExp == 0xFF ) {
1286            if ( bSig ) return propagateFloat32NaN( a, b );
1287            float_raise( float_flag_invalid );
1288            return float32_default_nan;
1289        }
1290        return packFloat32( zSign, 0xFF, 0 );
1291    }
1292    if ( bExp == 0xFF ) {
1293        if ( bSig ) return propagateFloat32NaN( a, b );
1294        return packFloat32( zSign, 0, 0 );
1295    }
1296    if ( bExp == 0 ) {
1297        if ( bSig == 0 ) {
1298            if ( ( aExp | aSig ) == 0 ) {
1299                float_raise( float_flag_invalid );
1300                return float32_default_nan;
1301            }
1302            float_raise( float_flag_divbyzero );
1303            return packFloat32( zSign, 0xFF, 0 );
1304        }
1305        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1306    }
1307    if ( aExp == 0 ) {
1308        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1309        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1310    }
1311    zExp = aExp - bExp + 0x7D;
1312    aSig = ( aSig | 0x00800000 )<<7;
1313    bSig = ( bSig | 0x00800000 )<<8;
1314    if ( bSig <= ( aSig + aSig ) ) {
1315        aSig >>= 1;
1316        ++zExp;
1317    }
1318    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1319    if ( ( zSig & 0x3F ) == 0 ) {
1320        zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1321    }
1322    return roundAndPackFloat32( zSign, zExp, zSig );
1323
1324}
1325
1326/*
1327-------------------------------------------------------------------------------
1328Returns the remainder of the single-precision floating-point value `a'
1329with respect to the corresponding value `b'.  The operation is performed
1330according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1331-------------------------------------------------------------------------------
1332*/
1333float32 float32_rem( float32 a, float32 b )
1334{
1335    flag aSign, bSign, zSign;
1336    int16 aExp, bExp, expDiff;
1337    bits32 aSig, bSig;
1338    bits32 q;
1339    bits64 aSig64, bSig64, q64;
1340    bits32 alternateASig;
1341    sbits32 sigMean;
1342
1343    aSig = extractFloat32Frac( a );
1344    aExp = extractFloat32Exp( a );
1345    aSign = extractFloat32Sign( a );
1346    bSig = extractFloat32Frac( b );
1347    bExp = extractFloat32Exp( b );
1348    bSign = extractFloat32Sign( b );
1349    if ( aExp == 0xFF ) {
1350        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1351            return propagateFloat32NaN( a, b );
1352        }
1353        float_raise( float_flag_invalid );
1354        return float32_default_nan;
1355    }
1356    if ( bExp == 0xFF ) {
1357        if ( bSig ) return propagateFloat32NaN( a, b );
1358        return a;
1359    }
1360    if ( bExp == 0 ) {
1361        if ( bSig == 0 ) {
1362            float_raise( float_flag_invalid );
1363            return float32_default_nan;
1364        }
1365        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1366    }
1367    if ( aExp == 0 ) {
1368        if ( aSig == 0 ) return a;
1369        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1370    }
1371    expDiff = aExp - bExp;
1372    aSig |= 0x00800000;
1373    bSig |= 0x00800000;
1374    if ( expDiff < 32 ) {
1375        aSig <<= 8;
1376        bSig <<= 8;
1377        if ( expDiff < 0 ) {
1378            if ( expDiff < -1 ) return a;
1379            aSig >>= 1;
1380        }
1381        q = ( bSig <= aSig );
1382        if ( q ) aSig -= bSig;
1383        if ( 0 < expDiff ) {
1384            q = ( ( (bits64) aSig )<<32 ) / bSig;
1385            q >>= 32 - expDiff;
1386            bSig >>= 2;
1387            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1388        }
1389        else {
1390            aSig >>= 2;
1391            bSig >>= 2;
1392        }
1393    }
1394    else {
1395        if ( bSig <= aSig ) aSig -= bSig;
1396        aSig64 = ( (bits64) aSig )<<40;
1397        bSig64 = ( (bits64) bSig )<<40;
1398        expDiff -= 64;
1399        while ( 0 < expDiff ) {
1400            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1401            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1402            aSig64 = - ( ( bSig * q64 )<<38 );
1403            expDiff -= 62;
1404        }
1405        expDiff += 64;
1406        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1407        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1408        q = q64>>( 64 - expDiff );
1409        bSig <<= 6;
1410        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1411    }
1412    do {
1413        alternateASig = aSig;
1414        ++q;
1415        aSig -= bSig;
1416    } while ( 0 <= (sbits32) aSig );
1417    sigMean = aSig + alternateASig;
1418    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1419        aSig = alternateASig;
1420    }
1421    zSign = ( (sbits32) aSig < 0 );
1422    if ( zSign ) aSig = - aSig;
1423    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1424
1425}
1426
1427/*
1428-------------------------------------------------------------------------------
1429Returns the square root of the single-precision floating-point value `a'.
1430The operation is performed according to the IEC/IEEE Standard for Binary
1431Floating-point Arithmetic.
1432-------------------------------------------------------------------------------
1433*/
1434float32 float32_sqrt( float32 a )
1435{
1436    flag aSign;
1437    int16 aExp, zExp;
1438    bits32 aSig, zSig;
1439    bits64 rem, term;
1440
1441    aSig = extractFloat32Frac( a );
1442    aExp = extractFloat32Exp( a );
1443    aSign = extractFloat32Sign( a );
1444    if ( aExp == 0xFF ) {
1445        if ( aSig ) return propagateFloat32NaN( a, 0 );
1446        if ( ! aSign ) return a;
1447        float_raise( float_flag_invalid );
1448        return float32_default_nan;
1449    }
1450    if ( aSign ) {
1451        if ( ( aExp | aSig ) == 0 ) return a;
1452        float_raise( float_flag_invalid );
1453        return float32_default_nan;
1454    }
1455    if ( aExp == 0 ) {
1456        if ( aSig == 0 ) return 0;
1457        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1458    }
1459    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1460    aSig = ( aSig | 0x00800000 )<<8;
1461    zSig = estimateSqrt32( aExp, aSig ) + 2;
1462    if ( ( zSig & 0x7F ) <= 5 ) {
1463        if ( zSig < 2 ) {
1464            zSig = 0xFFFFFFFF;
1465        }
1466        else {
1467            aSig >>= aExp & 1;
1468            term = ( (bits64) zSig ) * zSig;
1469            rem = ( ( (bits64) aSig )<<32 ) - term;
1470            while ( (sbits64) rem < 0 ) {
1471                --zSig;
1472                rem += ( ( (bits64) zSig )<<1 ) | 1;
1473            }
1474            zSig |= ( rem != 0 );
1475        }
1476    }
1477    shift32RightJamming( zSig, 1, &zSig );
1478    return roundAndPackFloat32( 0, zExp, zSig );
1479
1480}
1481
1482/*
1483-------------------------------------------------------------------------------
1484Returns 1 if the single-precision floating-point value `a' is equal to the
1485corresponding value `b', and 0 otherwise.  The comparison is performed
1486according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1487-------------------------------------------------------------------------------
1488*/
1489flag float32_eq( float32 a, float32 b )
1490{
1491
1492    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1493         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1494       ) {
1495        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1496            float_raise( float_flag_invalid );
1497        }
1498        return 0;
1499    }
1500    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1501
1502}
1503
1504/*
1505-------------------------------------------------------------------------------
1506Returns 1 if the single-precision floating-point value `a' is less than or
1507equal to the corresponding value `b', and 0 otherwise.  The comparison is
1508performed according to the IEC/IEEE Standard for Binary Floating-point
1509Arithmetic.
1510-------------------------------------------------------------------------------
1511*/
1512flag float32_le( float32 a, float32 b )
1513{
1514    flag aSign, bSign;
1515
1516    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1517         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1518       ) {
1519        float_raise( float_flag_invalid );
1520        return 0;
1521    }
1522    aSign = extractFloat32Sign( a );
1523    bSign = extractFloat32Sign( b );
1524    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1525    return ( a == b ) || ( aSign ^ ( a < b ) );
1526
1527}
1528
1529/*
1530-------------------------------------------------------------------------------
1531Returns 1 if the single-precision floating-point value `a' is less than
1532the corresponding value `b', and 0 otherwise.  The comparison is performed
1533according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1534-------------------------------------------------------------------------------
1535*/
1536flag float32_lt( float32 a, float32 b )
1537{
1538    flag aSign, bSign;
1539
1540    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1541         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1542       ) {
1543        float_raise( float_flag_invalid );
1544        return 0;
1545    }
1546    aSign = extractFloat32Sign( a );
1547    bSign = extractFloat32Sign( b );
1548    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1549    return ( a != b ) && ( aSign ^ ( a < b ) );
1550
1551}
1552
1553/*
1554-------------------------------------------------------------------------------
1555Returns 1 if the single-precision floating-point value `a' is equal to the
1556corresponding value `b', and 0 otherwise.  The invalid exception is raised
1557if either operand is a NaN.  Otherwise, the comparison is performed
1558according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1559-------------------------------------------------------------------------------
1560*/
1561flag float32_eq_signaling( float32 a, float32 b )
1562{
1563
1564    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1565         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1566       ) {
1567        float_raise( float_flag_invalid );
1568        return 0;
1569    }
1570    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1571
1572}
1573
1574/*
1575-------------------------------------------------------------------------------
1576Returns 1 if the single-precision floating-point value `a' is less than or
1577equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1578cause an exception.  Otherwise, the comparison is performed according to the
1579IEC/IEEE Standard for Binary Floating-point Arithmetic.
1580-------------------------------------------------------------------------------
1581*/
1582flag float32_le_quiet( float32 a, float32 b )
1583{
1584    flag aSign, bSign;
1585    //int16 aExp, bExp;
1586
1587    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1588         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1589       ) {
1590        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1591            float_raise( float_flag_invalid );
1592        }
1593        return 0;
1594    }
1595    aSign = extractFloat32Sign( a );
1596    bSign = extractFloat32Sign( b );
1597    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1598    return ( a == b ) || ( aSign ^ ( a < b ) );
1599
1600}
1601
1602/*
1603-------------------------------------------------------------------------------
1604Returns 1 if the single-precision floating-point value `a' is less than
1605the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1606exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1607Standard for Binary Floating-point Arithmetic.
1608-------------------------------------------------------------------------------
1609*/
1610flag float32_lt_quiet( float32 a, float32 b )
1611{
1612    flag aSign, bSign;
1613
1614    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1615         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1616       ) {
1617        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1618            float_raise( float_flag_invalid );
1619        }
1620        return 0;
1621    }
1622    aSign = extractFloat32Sign( a );
1623    bSign = extractFloat32Sign( b );
1624    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1625    return ( a != b ) && ( aSign ^ ( a < b ) );
1626
1627}
1628
1629/*
1630-------------------------------------------------------------------------------
1631Returns the result of converting the double-precision floating-point value
1632`a' to the 32-bit two's complement integer format.  The conversion is
1633performed according to the IEC/IEEE Standard for Binary Floating-point
1634Arithmetic---which means in particular that the conversion is rounded
1635according to the current rounding mode.  If `a' is a NaN, the largest
1636positive integer is returned.  Otherwise, if the conversion overflows, the
1637largest integer with the same sign as `a' is returned.
1638-------------------------------------------------------------------------------
1639*/
1640int32 float64_to_int32( float64 a )
1641{
1642    flag aSign;
1643    int16 aExp, shiftCount;
1644    bits64 aSig;
1645
1646    aSig = extractFloat64Frac( a );
1647    aExp = extractFloat64Exp( a );
1648    aSign = extractFloat64Sign( a );
1649    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1650    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1651    shiftCount = 0x42C - aExp;
1652    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1653    return roundAndPackInt32( aSign, aSig );
1654
1655}
1656
1657/*
1658-------------------------------------------------------------------------------
1659Returns the result of converting the double-precision floating-point value
1660`a' to the 32-bit two's complement integer format.  The conversion is
1661performed according to the IEC/IEEE Standard for Binary Floating-point
1662Arithmetic, except that the conversion is always rounded toward zero.  If
1663`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1664conversion overflows, the largest integer with the same sign as `a' is
1665returned.
1666-------------------------------------------------------------------------------
1667*/
1668int32 float64_to_int32_round_to_zero( float64 a )
1669{
1670    flag aSign;
1671    int16 aExp, shiftCount;
1672    bits64 aSig, savedASig;
1673    int32 z;
1674
1675    aSig = extractFloat64Frac( a );
1676    aExp = extractFloat64Exp( a );
1677    aSign = extractFloat64Sign( a );
1678    shiftCount = 0x433 - aExp;
1679    if ( shiftCount < 21 ) {
1680        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1681        goto invalid;
1682    }
1683    else if ( 52 < shiftCount ) {
1684        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1685        return 0;
1686    }
1687    aSig |= LIT64( 0x0010000000000000 );
1688    savedASig = aSig;
1689    aSig >>= shiftCount;
1690    z = aSig;
1691    if ( aSign ) z = - z;
1692    if ( ( z < 0 ) ^ aSign ) {
1693 invalid:
1694        float_exception_flags |= float_flag_invalid;
1695        return aSign ? 0x80000000 : 0x7FFFFFFF;
1696    }
1697    if ( ( aSig<<shiftCount ) != savedASig ) {
1698        float_exception_flags |= float_flag_inexact;
1699    }
1700    return z;
1701
1702}
1703
1704/*
1705-------------------------------------------------------------------------------
1706Returns the result of converting the double-precision floating-point value
1707`a' to the 32-bit two's complement unsigned integer format.  The conversion
1708is performed according to the IEC/IEEE Standard for Binary Floating-point
1709Arithmetic---which means in particular that the conversion is rounded
1710according to the current rounding mode.  If `a' is a NaN, the largest
1711positive integer is returned.  Otherwise, if the conversion overflows, the
1712largest positive integer is returned.
1713-------------------------------------------------------------------------------
1714*/
1715int32 float64_to_uint32( float64 a )
1716{
1717    flag aSign;
1718    int16 aExp, shiftCount;
1719    bits64 aSig;
1720
1721    aSig = extractFloat64Frac( a );
1722    aExp = extractFloat64Exp( a );
1723    aSign = 0; //extractFloat64Sign( a );
1724    //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1725    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1726    shiftCount = 0x42C - aExp;
1727    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1728    return roundAndPackInt32( aSign, aSig );
1729}
1730
1731/*
1732-------------------------------------------------------------------------------
1733Returns the result of converting the double-precision floating-point value
1734`a' to the 32-bit two's complement integer format.  The conversion is
1735performed according to the IEC/IEEE Standard for Binary Floating-point
1736Arithmetic, except that the conversion is always rounded toward zero.  If
1737`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1738conversion overflows, the largest positive integer is returned.
1739-------------------------------------------------------------------------------
1740*/
1741int32 float64_to_uint32_round_to_zero( float64 a )
1742{
1743    flag aSign;
1744    int16 aExp, shiftCount;
1745    bits64 aSig, savedASig;
1746    int32 z;
1747
1748    aSig = extractFloat64Frac( a );
1749    aExp = extractFloat64Exp( a );
1750    aSign = extractFloat64Sign( a );
1751    shiftCount = 0x433 - aExp;
1752    if ( shiftCount < 21 ) {
1753        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1754        goto invalid;
1755    }
1756    else if ( 52 < shiftCount ) {
1757        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1758        return 0;
1759    }
1760    aSig |= LIT64( 0x0010000000000000 );
1761    savedASig = aSig;
1762    aSig >>= shiftCount;
1763    z = aSig;
1764    if ( aSign ) z = - z;
1765    if ( ( z < 0 ) ^ aSign ) {
1766 invalid:
1767        float_exception_flags |= float_flag_invalid;
1768        return aSign ? 0x80000000 : 0x7FFFFFFF;
1769    }
1770    if ( ( aSig<<shiftCount ) != savedASig ) {
1771        float_exception_flags |= float_flag_inexact;
1772    }
1773    return z;
1774}
1775
1776/*
1777-------------------------------------------------------------------------------
1778Returns the result of converting the double-precision floating-point value
1779`a' to the single-precision floating-point format.  The conversion is
1780performed according to the IEC/IEEE Standard for Binary Floating-point
1781Arithmetic.
1782-------------------------------------------------------------------------------
1783*/
1784float32 float64_to_float32( float64 a )
1785{
1786    flag aSign;
1787    int16 aExp;
1788    bits64 aSig;
1789    bits32 zSig;
1790
1791    aSig = extractFloat64Frac( a );
1792    aExp = extractFloat64Exp( a );
1793    aSign = extractFloat64Sign( a );
1794    if ( aExp == 0x7FF ) {
1795        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1796        return packFloat32( aSign, 0xFF, 0 );
1797    }
1798    shift64RightJamming( aSig, 22, &aSig );
1799    zSig = aSig;
1800    if ( aExp || zSig ) {
1801        zSig |= 0x40000000;
1802        aExp -= 0x381;
1803    }
1804    return roundAndPackFloat32( aSign, aExp, zSig );
1805
1806}
1807
1808#ifdef FLOATX80
1809
1810/*
1811-------------------------------------------------------------------------------
1812Returns the result of converting the double-precision floating-point value
1813`a' to the extended double-precision floating-point format.  The conversion
1814is performed according to the IEC/IEEE Standard for Binary Floating-point
1815Arithmetic.
1816-------------------------------------------------------------------------------
1817*/
1818floatx80 float64_to_floatx80( float64 a )
1819{
1820    flag aSign;
1821    int16 aExp;
1822    bits64 aSig;
1823
1824    aSig = extractFloat64Frac( a );
1825    aExp = extractFloat64Exp( a );
1826    aSign = extractFloat64Sign( a );
1827    if ( aExp == 0x7FF ) {
1828        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1829        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1830    }
1831    if ( aExp == 0 ) {
1832        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1833        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1834    }
1835    return
1836        packFloatx80(
1837            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1838
1839}
1840
1841#endif
1842
1843/*
1844-------------------------------------------------------------------------------
1845Rounds the double-precision floating-point value `a' to an integer, and
1846returns the result as a double-precision floating-point value.  The
1847operation is performed according to the IEC/IEEE Standard for Binary
1848Floating-point Arithmetic.
1849-------------------------------------------------------------------------------
1850*/
1851float64 float64_round_to_int( float64 a )
1852{
1853    flag aSign;
1854    int16 aExp;
1855    bits64 lastBitMask, roundBitsMask;
1856    int8 roundingMode;
1857    float64 z;
1858
1859    aExp = extractFloat64Exp( a );
1860    if ( 0x433 <= aExp ) {
1861        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1862            return propagateFloat64NaN( a, a );
1863        }
1864        return a;
1865    }
1866    if ( aExp <= 0x3FE ) {
1867        if ( (bits64) ( a<<1 ) == 0 ) return a;
1868        float_exception_flags |= float_flag_inexact;
1869        aSign = extractFloat64Sign( a );
1870        switch ( float_rounding_mode ) {
1871         case float_round_nearest_even:
1872            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1873                return packFloat64( aSign, 0x3FF, 0 );
1874            }
1875            break;
1876         case float_round_down:
1877            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1878         case float_round_up:
1879            return
1880            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1881        }
1882        return packFloat64( aSign, 0, 0 );
1883    }
1884    lastBitMask = 1;
1885    lastBitMask <<= 0x433 - aExp;
1886    roundBitsMask = lastBitMask - 1;
1887    z = a;
1888    roundingMode = float_rounding_mode;
1889    if ( roundingMode == float_round_nearest_even ) {
1890        z += lastBitMask>>1;
1891        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1892    }
1893    else if ( roundingMode != float_round_to_zero ) {
1894        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1895            z += roundBitsMask;
1896        }
1897    }
1898    z &= ~ roundBitsMask;
1899    if ( z != a ) float_exception_flags |= float_flag_inexact;
1900    return z;
1901
1902}
1903
1904/*
1905-------------------------------------------------------------------------------
1906Returns the result of adding the absolute values of the double-precision
1907floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1908before being returned.  `zSign' is ignored if the result is a NaN.  The
1909addition is performed according to the IEC/IEEE Standard for Binary
1910Floating-point Arithmetic.
1911-------------------------------------------------------------------------------
1912*/
1913static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1914{
1915    int16 aExp, bExp, zExp;
1916    bits64 aSig, bSig, zSig;
1917    int16 expDiff;
1918
1919    aSig = extractFloat64Frac( a );
1920    aExp = extractFloat64Exp( a );
1921    bSig = extractFloat64Frac( b );
1922    bExp = extractFloat64Exp( b );
1923    expDiff = aExp - bExp;
1924    aSig <<= 9;
1925    bSig <<= 9;
1926    if ( 0 < expDiff ) {
1927        if ( aExp == 0x7FF ) {
1928            if ( aSig ) return propagateFloat64NaN( a, b );
1929            return a;
1930        }
1931        if ( bExp == 0 ) {
1932            --expDiff;
1933        }
1934        else {
1935            bSig |= LIT64( 0x2000000000000000 );
1936        }
1937        shift64RightJamming( bSig, expDiff, &bSig );
1938        zExp = aExp;
1939    }
1940    else if ( expDiff < 0 ) {
1941        if ( bExp == 0x7FF ) {
1942            if ( bSig ) return propagateFloat64NaN( a, b );
1943            return packFloat64( zSign, 0x7FF, 0 );
1944        }
1945        if ( aExp == 0 ) {
1946            ++expDiff;
1947        }
1948        else {
1949            aSig |= LIT64( 0x2000000000000000 );
1950        }
1951        shift64RightJamming( aSig, - expDiff, &aSig );
1952        zExp = bExp;
1953    }
1954    else {
1955        if ( aExp == 0x7FF ) {
1956            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1957            return a;
1958        }
1959        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1960        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1961        zExp = aExp;
1962        goto roundAndPack;
1963    }
1964    aSig |= LIT64( 0x2000000000000000 );
1965    zSig = ( aSig + bSig )<<1;
1966    --zExp;
1967    if ( (sbits64) zSig < 0 ) {
1968        zSig = aSig + bSig;
1969        ++zExp;
1970    }
1971 roundAndPack:
1972    return roundAndPackFloat64( zSign, zExp, zSig );
1973
1974}
1975
1976/*
1977-------------------------------------------------------------------------------
1978Returns the result of subtracting the absolute values of the double-
1979precision floating-point values `a' and `b'.  If `zSign' is true, the
1980difference is negated before being returned.  `zSign' is ignored if the
1981result is a NaN.  The subtraction is performed according to the IEC/IEEE
1982Standard for Binary Floating-point Arithmetic.
1983-------------------------------------------------------------------------------
1984*/
1985static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
1986{
1987    int16 aExp, bExp, zExp;
1988    bits64 aSig, bSig, zSig;
1989    int16 expDiff;
1990
1991    aSig = extractFloat64Frac( a );
1992    aExp = extractFloat64Exp( a );
1993    bSig = extractFloat64Frac( b );
1994    bExp = extractFloat64Exp( b );
1995    expDiff = aExp - bExp;
1996    aSig <<= 10;
1997    bSig <<= 10;
1998    if ( 0 < expDiff ) goto aExpBigger;
1999    if ( expDiff < 0 ) goto bExpBigger;
2000    if ( aExp == 0x7FF ) {
2001        if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2002        float_raise( float_flag_invalid );
2003        return float64_default_nan;
2004    }
2005    if ( aExp == 0 ) {
2006        aExp = 1;
2007        bExp = 1;
2008    }
2009    if ( bSig < aSig ) goto aBigger;
2010    if ( aSig < bSig ) goto bBigger;
2011    return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2012 bExpBigger:
2013    if ( bExp == 0x7FF ) {
2014        if ( bSig ) return propagateFloat64NaN( a, b );
2015        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2016    }
2017    if ( aExp == 0 ) {
2018        ++expDiff;
2019    }
2020    else {
2021        aSig |= LIT64( 0x4000000000000000 );
2022    }
2023    shift64RightJamming( aSig, - expDiff, &aSig );
2024    bSig |= LIT64( 0x4000000000000000 );
2025 bBigger:
2026    zSig = bSig - aSig;
2027    zExp = bExp;
2028    zSign ^= 1;
2029    goto normalizeRoundAndPack;
2030 aExpBigger:
2031    if ( aExp == 0x7FF ) {
2032        if ( aSig ) return propagateFloat64NaN( a, b );
2033        return a;
2034    }
2035    if ( bExp == 0 ) {
2036        --expDiff;
2037    }
2038    else {
2039        bSig |= LIT64( 0x4000000000000000 );
2040    }
2041    shift64RightJamming( bSig, expDiff, &bSig );
2042    aSig |= LIT64( 0x4000000000000000 );
2043 aBigger:
2044    zSig = aSig - bSig;
2045    zExp = aExp;
2046 normalizeRoundAndPack:
2047    --zExp;
2048    return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2049
2050}
2051
2052/*
2053-------------------------------------------------------------------------------
2054Returns the result of adding the double-precision floating-point values `a'
2055and `b'.  The operation is performed according to the IEC/IEEE Standard for
2056Binary Floating-point Arithmetic.
2057-------------------------------------------------------------------------------
2058*/
2059float64 float64_add( float64 a, float64 b )
2060{
2061    flag aSign, bSign;
2062
2063    aSign = extractFloat64Sign( a );
2064    bSign = extractFloat64Sign( b );
2065    if ( aSign == bSign ) {
2066        return addFloat64Sigs( a, b, aSign );
2067    }
2068    else {
2069        return subFloat64Sigs( a, b, aSign );
2070    }
2071
2072}
2073
2074/*
2075-------------------------------------------------------------------------------
2076Returns the result of subtracting the double-precision floating-point values
2077`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2078for Binary Floating-point Arithmetic.
2079-------------------------------------------------------------------------------
2080*/
2081float64 float64_sub( float64 a, float64 b )
2082{
2083    flag aSign, bSign;
2084
2085    aSign = extractFloat64Sign( a );
2086    bSign = extractFloat64Sign( b );
2087    if ( aSign == bSign ) {
2088        return subFloat64Sigs( a, b, aSign );
2089    }
2090    else {
2091        return addFloat64Sigs( a, b, aSign );
2092    }
2093
2094}
2095
2096/*
2097-------------------------------------------------------------------------------
2098Returns the result of multiplying the double-precision floating-point values
2099`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2100for Binary Floating-point Arithmetic.
2101-------------------------------------------------------------------------------
2102*/
2103float64 float64_mul( float64 a, float64 b )
2104{
2105    flag aSign, bSign, zSign;
2106    int16 aExp, bExp, zExp;
2107    bits64 aSig, bSig, zSig0, zSig1;
2108
2109    aSig = extractFloat64Frac( a );
2110    aExp = extractFloat64Exp( a );
2111    aSign = extractFloat64Sign( a );
2112    bSig = extractFloat64Frac( b );
2113    bExp = extractFloat64Exp( b );
2114    bSign = extractFloat64Sign( b );
2115    zSign = aSign ^ bSign;
2116    if ( aExp == 0x7FF ) {
2117        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2118            return propagateFloat64NaN( a, b );
2119        }
2120        if ( ( bExp | bSig ) == 0 ) {
2121            float_raise( float_flag_invalid );
2122            return float64_default_nan;
2123        }
2124        return packFloat64( zSign, 0x7FF, 0 );
2125    }
2126    if ( bExp == 0x7FF ) {
2127        if ( bSig ) return propagateFloat64NaN( a, b );
2128        if ( ( aExp | aSig ) == 0 ) {
2129            float_raise( float_flag_invalid );
2130            return float64_default_nan;
2131        }
2132        return packFloat64( zSign, 0x7FF, 0 );
2133    }
2134    if ( aExp == 0 ) {
2135        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2136        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2137    }
2138    if ( bExp == 0 ) {
2139        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2140        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2141    }
2142    zExp = aExp + bExp - 0x3FF;
2143    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2144    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2145    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2146    zSig0 |= ( zSig1 != 0 );
2147    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2148        zSig0 <<= 1;
2149        --zExp;
2150    }
2151    return roundAndPackFloat64( zSign, zExp, zSig0 );
2152
2153}
2154
2155/*
2156-------------------------------------------------------------------------------
2157Returns the result of dividing the double-precision floating-point value `a'
2158by the corresponding value `b'.  The operation is performed according to
2159the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2160-------------------------------------------------------------------------------
2161*/
2162float64 float64_div( float64 a, float64 b )
2163{
2164    flag aSign, bSign, zSign;
2165    int16 aExp, bExp, zExp;
2166    bits64 aSig, bSig, zSig;
2167    bits64 rem0, rem1;
2168    bits64 term0, term1;
2169
2170    aSig = extractFloat64Frac( a );
2171    aExp = extractFloat64Exp( a );
2172    aSign = extractFloat64Sign( a );
2173    bSig = extractFloat64Frac( b );
2174    bExp = extractFloat64Exp( b );
2175    bSign = extractFloat64Sign( b );
2176    zSign = aSign ^ bSign;
2177    if ( aExp == 0x7FF ) {
2178        if ( aSig ) return propagateFloat64NaN( a, b );
2179        if ( bExp == 0x7FF ) {
2180            if ( bSig ) return propagateFloat64NaN( a, b );
2181            float_raise( float_flag_invalid );
2182            return float64_default_nan;
2183        }
2184        return packFloat64( zSign, 0x7FF, 0 );
2185    }
2186    if ( bExp == 0x7FF ) {
2187        if ( bSig ) return propagateFloat64NaN( a, b );
2188        return packFloat64( zSign, 0, 0 );
2189    }
2190    if ( bExp == 0 ) {
2191        if ( bSig == 0 ) {
2192            if ( ( aExp | aSig ) == 0 ) {
2193                float_raise( float_flag_invalid );
2194                return float64_default_nan;
2195            }
2196            float_raise( float_flag_divbyzero );
2197            return packFloat64( zSign, 0x7FF, 0 );
2198        }
2199        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2200    }
2201    if ( aExp == 0 ) {
2202        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2203        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2204    }
2205    zExp = aExp - bExp + 0x3FD;
2206    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2207    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2208    if ( bSig <= ( aSig + aSig ) ) {
2209        aSig >>= 1;
2210        ++zExp;
2211    }
2212    zSig = estimateDiv128To64( aSig, 0, bSig );
2213    if ( ( zSig & 0x1FF ) <= 2 ) {
2214        mul64To128( bSig, zSig, &term0, &term1 );
2215        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2216        while ( (sbits64) rem0 < 0 ) {
2217            --zSig;
2218            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2219        }
2220        zSig |= ( rem1 != 0 );
2221    }
2222    return roundAndPackFloat64( zSign, zExp, zSig );
2223
2224}
2225
2226/*
2227-------------------------------------------------------------------------------
2228Returns the remainder of the double-precision floating-point value `a'
2229with respect to the corresponding value `b'.  The operation is performed
2230according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2231-------------------------------------------------------------------------------
2232*/
2233float64 float64_rem( float64 a, float64 b )
2234{
2235    flag aSign, bSign, zSign;
2236    int16 aExp, bExp, expDiff;
2237    bits64 aSig, bSig;
2238    bits64 q, alternateASig;
2239    sbits64 sigMean;
2240
2241    aSig = extractFloat64Frac( a );
2242    aExp = extractFloat64Exp( a );
2243    aSign = extractFloat64Sign( a );
2244    bSig = extractFloat64Frac( b );
2245    bExp = extractFloat64Exp( b );
2246    bSign = extractFloat64Sign( b );
2247    if ( aExp == 0x7FF ) {
2248        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2249            return propagateFloat64NaN( a, b );
2250        }
2251        float_raise( float_flag_invalid );
2252        return float64_default_nan;
2253    }
2254    if ( bExp == 0x7FF ) {
2255        if ( bSig ) return propagateFloat64NaN( a, b );
2256        return a;
2257    }
2258    if ( bExp == 0 ) {
2259        if ( bSig == 0 ) {
2260            float_raise( float_flag_invalid );
2261            return float64_default_nan;
2262        }
2263        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2264    }
2265    if ( aExp == 0 ) {
2266        if ( aSig == 0 ) return a;
2267        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2268    }
2269    expDiff = aExp - bExp;
2270    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2271    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2272    if ( expDiff < 0 ) {
2273        if ( expDiff < -1 ) return a;
2274        aSig >>= 1;
2275    }
2276    q = ( bSig <= aSig );
2277    if ( q ) aSig -= bSig;
2278    expDiff -= 64;
2279    while ( 0 < expDiff ) {
2280        q = estimateDiv128To64( aSig, 0, bSig );
2281        q = ( 2 < q ) ? q - 2 : 0;
2282        aSig = - ( ( bSig>>2 ) * q );
2283        expDiff -= 62;
2284    }
2285    expDiff += 64;
2286    if ( 0 < expDiff ) {
2287        q = estimateDiv128To64( aSig, 0, bSig );
2288        q = ( 2 < q ) ? q - 2 : 0;
2289        q >>= 64 - expDiff;
2290        bSig >>= 2;
2291        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2292    }
2293    else {
2294        aSig >>= 2;
2295        bSig >>= 2;
2296    }
2297    do {
2298        alternateASig = aSig;
2299        ++q;
2300        aSig -= bSig;
2301    } while ( 0 <= (sbits64) aSig );
2302    sigMean = aSig + alternateASig;
2303    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2304        aSig = alternateASig;
2305    }
2306    zSign = ( (sbits64) aSig < 0 );
2307    if ( zSign ) aSig = - aSig;
2308    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2309
2310}
2311
2312/*
2313-------------------------------------------------------------------------------
2314Returns the square root of the double-precision floating-point value `a'.
2315The operation is performed according to the IEC/IEEE Standard for Binary
2316Floating-point Arithmetic.
2317-------------------------------------------------------------------------------
2318*/
2319float64 float64_sqrt( float64 a )
2320{
2321    flag aSign;
2322    int16 aExp, zExp;
2323    bits64 aSig, zSig;
2324    bits64 rem0, rem1, term0, term1; //, shiftedRem;
2325    //float64 z;
2326
2327    aSig = extractFloat64Frac( a );
2328    aExp = extractFloat64Exp( a );
2329    aSign = extractFloat64Sign( a );
2330    if ( aExp == 0x7FF ) {
2331        if ( aSig ) return propagateFloat64NaN( a, a );
2332        if ( ! aSign ) return a;
2333        float_raise( float_flag_invalid );
2334        return float64_default_nan;
2335    }
2336    if ( aSign ) {
2337        if ( ( aExp | aSig ) == 0 ) return a;
2338        float_raise( float_flag_invalid );
2339        return float64_default_nan;
2340    }
2341    if ( aExp == 0 ) {
2342        if ( aSig == 0 ) return 0;
2343        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2344    }
2345    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2346    aSig |= LIT64( 0x0010000000000000 );
2347    zSig = estimateSqrt32( aExp, aSig>>21 );
2348    zSig <<= 31;
2349    aSig <<= 9 - ( aExp & 1 );
2350    zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2351    if ( ( zSig & 0x3FF ) <= 5 ) {
2352        if ( zSig < 2 ) {
2353            zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2354        }
2355        else {
2356            aSig <<= 2;
2357            mul64To128( zSig, zSig, &term0, &term1 );
2358            sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2359            while ( (sbits64) rem0 < 0 ) {
2360                --zSig;
2361                shortShift128Left( 0, zSig, 1, &term0, &term1 );
2362                term1 |= 1;
2363                add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2364            }
2365            zSig |= ( ( rem0 | rem1 ) != 0 );
2366        }
2367    }
2368    shift64RightJamming( zSig, 1, &zSig );
2369    return roundAndPackFloat64( 0, zExp, zSig );
2370
2371}
2372
2373/*
2374-------------------------------------------------------------------------------
2375Returns 1 if the double-precision floating-point value `a' is equal to the
2376corresponding value `b', and 0 otherwise.  The comparison is performed
2377according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2378-------------------------------------------------------------------------------
2379*/
2380flag float64_eq( float64 a, float64 b )
2381{
2382
2383    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2384         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2385       ) {
2386        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2387            float_raise( float_flag_invalid );
2388        }
2389        return 0;
2390    }
2391    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2392
2393}
2394
2395/*
2396-------------------------------------------------------------------------------
2397Returns 1 if the double-precision floating-point value `a' is less than or
2398equal to the corresponding value `b', and 0 otherwise.  The comparison is
2399performed according to the IEC/IEEE Standard for Binary Floating-point
2400Arithmetic.
2401-------------------------------------------------------------------------------
2402*/
2403flag float64_le( float64 a, float64 b )
2404{
2405    flag aSign, bSign;
2406
2407    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2408         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2409       ) {
2410        float_raise( float_flag_invalid );
2411        return 0;
2412    }
2413    aSign = extractFloat64Sign( a );
2414    bSign = extractFloat64Sign( b );
2415    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2416    return ( a == b ) || ( aSign ^ ( a < b ) );
2417
2418}
2419
2420/*
2421-------------------------------------------------------------------------------
2422Returns 1 if the double-precision floating-point value `a' is less than
2423the corresponding value `b', and 0 otherwise.  The comparison is performed
2424according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2425-------------------------------------------------------------------------------
2426*/
2427flag float64_lt( float64 a, float64 b )
2428{
2429    flag aSign, bSign;
2430
2431    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2432         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2433       ) {
2434        float_raise( float_flag_invalid );
2435        return 0;
2436    }
2437    aSign = extractFloat64Sign( a );
2438    bSign = extractFloat64Sign( b );
2439    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2440    return ( a != b ) && ( aSign ^ ( a < b ) );
2441
2442}
2443
2444/*
2445-------------------------------------------------------------------------------
2446Returns 1 if the double-precision floating-point value `a' is equal to the
2447corresponding value `b', and 0 otherwise.  The invalid exception is raised
2448if either operand is a NaN.  Otherwise, the comparison is performed
2449according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2450-------------------------------------------------------------------------------
2451*/
2452flag float64_eq_signaling( float64 a, float64 b )
2453{
2454
2455    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2456         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2457       ) {
2458        float_raise( float_flag_invalid );
2459        return 0;
2460    }
2461    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2462
2463}
2464
2465/*
2466-------------------------------------------------------------------------------
2467Returns 1 if the double-precision floating-point value `a' is less than or
2468equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2469cause an exception.  Otherwise, the comparison is performed according to the
2470IEC/IEEE Standard for Binary Floating-point Arithmetic.
2471-------------------------------------------------------------------------------
2472*/
2473flag float64_le_quiet( float64 a, float64 b )
2474{
2475    flag aSign, bSign;
2476    //int16 aExp, bExp;
2477
2478    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2479         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2480       ) {
2481        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2482            float_raise( float_flag_invalid );
2483        }
2484        return 0;
2485    }
2486    aSign = extractFloat64Sign( a );
2487    bSign = extractFloat64Sign( b );
2488    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2489    return ( a == b ) || ( aSign ^ ( a < b ) );
2490
2491}
2492
2493/*
2494-------------------------------------------------------------------------------
2495Returns 1 if the double-precision floating-point value `a' is less than
2496the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2497exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2498Standard for Binary Floating-point Arithmetic.
2499-------------------------------------------------------------------------------
2500*/
2501flag float64_lt_quiet( float64 a, float64 b )
2502{
2503    flag aSign, bSign;
2504
2505    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2506         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2507       ) {
2508        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2509            float_raise( float_flag_invalid );
2510        }
2511        return 0;
2512    }
2513    aSign = extractFloat64Sign( a );
2514    bSign = extractFloat64Sign( b );
2515    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2516    return ( a != b ) && ( aSign ^ ( a < b ) );
2517
2518}
2519
2520#ifdef FLOATX80
2521
2522/*
2523-------------------------------------------------------------------------------
2524Returns the result of converting the extended double-precision floating-
2525point value `a' to the 32-bit two's complement integer format.  The
2526conversion is performed according to the IEC/IEEE Standard for Binary
2527Floating-point Arithmetic---which means in particular that the conversion
2528is rounded according to the current rounding mode.  If `a' is a NaN, the
2529largest positive integer is returned.  Otherwise, if the conversion
2530overflows, the largest integer with the same sign as `a' is returned.
2531-------------------------------------------------------------------------------
2532*/
2533int32 floatx80_to_int32( floatx80 a )
2534{
2535    flag aSign;
2536    int32 aExp, shiftCount;
2537    bits64 aSig;
2538
2539    aSig = extractFloatx80Frac( a );
2540    aExp = extractFloatx80Exp( a );
2541    aSign = extractFloatx80Sign( a );
2542    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2543    shiftCount = 0x4037 - aExp;
2544    if ( shiftCount <= 0 ) shiftCount = 1;
2545    shift64RightJamming( aSig, shiftCount, &aSig );
2546    return roundAndPackInt32( aSign, aSig );
2547
2548}
2549
2550/*
2551-------------------------------------------------------------------------------
2552Returns the result of converting the extended double-precision floating-
2553point value `a' to the 32-bit two's complement integer format.  The
2554conversion is performed according to the IEC/IEEE Standard for Binary
2555Floating-point Arithmetic, except that the conversion is always rounded
2556toward zero.  If `a' is a NaN, the largest positive integer is returned.
2557Otherwise, if the conversion overflows, the largest integer with the same
2558sign as `a' is returned.
2559-------------------------------------------------------------------------------
2560*/
2561int32 floatx80_to_int32_round_to_zero( floatx80 a )
2562{
2563    flag aSign;
2564    int32 aExp, shiftCount;
2565    bits64 aSig, savedASig;
2566    int32 z;
2567
2568    aSig = extractFloatx80Frac( a );
2569    aExp = extractFloatx80Exp( a );
2570    aSign = extractFloatx80Sign( a );
2571    shiftCount = 0x403E - aExp;
2572    if ( shiftCount < 32 ) {
2573        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2574        goto invalid;
2575    }
2576    else if ( 63 < shiftCount ) {
2577        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2578        return 0;
2579    }
2580    savedASig = aSig;
2581    aSig >>= shiftCount;
2582    z = aSig;
2583    if ( aSign ) z = - z;
2584    if ( ( z < 0 ) ^ aSign ) {
2585 invalid:
2586        float_exception_flags |= float_flag_invalid;
2587        return aSign ? 0x80000000 : 0x7FFFFFFF;
2588    }
2589    if ( ( aSig<<shiftCount ) != savedASig ) {
2590        float_exception_flags |= float_flag_inexact;
2591    }
2592    return z;
2593
2594}
2595
2596/*
2597-------------------------------------------------------------------------------
2598Returns the result of converting the extended double-precision floating-
2599point value `a' to the single-precision floating-point format.  The
2600conversion is performed according to the IEC/IEEE Standard for Binary
2601Floating-point Arithmetic.
2602-------------------------------------------------------------------------------
2603*/
2604float32 floatx80_to_float32( floatx80 a )
2605{
2606    flag aSign;
2607    int32 aExp;
2608    bits64 aSig;
2609
2610    aSig = extractFloatx80Frac( a );
2611    aExp = extractFloatx80Exp( a );
2612    aSign = extractFloatx80Sign( a );
2613    if ( aExp == 0x7FFF ) {
2614        if ( (bits64) ( aSig<<1 ) ) {
2615            return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2616        }
2617        return packFloat32( aSign, 0xFF, 0 );
2618    }
2619    shift64RightJamming( aSig, 33, &aSig );
2620    if ( aExp || aSig ) aExp -= 0x3F81;
2621    return roundAndPackFloat32( aSign, aExp, aSig );
2622
2623}
2624
2625/*
2626-------------------------------------------------------------------------------
2627Returns the result of converting the extended double-precision floating-
2628point value `a' to the double-precision floating-point format.  The
2629conversion is performed according to the IEC/IEEE Standard for Binary
2630Floating-point Arithmetic.
2631-------------------------------------------------------------------------------
2632*/
2633float64 floatx80_to_float64( floatx80 a )
2634{
2635    flag aSign;
2636    int32 aExp;
2637    bits64 aSig, zSig;
2638
2639    aSig = extractFloatx80Frac( a );
2640    aExp = extractFloatx80Exp( a );
2641    aSign = extractFloatx80Sign( a );
2642    if ( aExp == 0x7FFF ) {
2643        if ( (bits64) ( aSig<<1 ) ) {
2644            return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2645        }
2646        return packFloat64( aSign, 0x7FF, 0 );
2647    }
2648    shift64RightJamming( aSig, 1, &zSig );
2649    if ( aExp || aSig ) aExp -= 0x3C01;
2650    return roundAndPackFloat64( aSign, aExp, zSig );
2651
2652}
2653
2654/*
2655-------------------------------------------------------------------------------
2656Rounds the extended double-precision floating-point value `a' to an integer,
2657and returns the result as an extended quadruple-precision floating-point
2658value.  The operation is performed according to the IEC/IEEE Standard for
2659Binary Floating-point Arithmetic.
2660-------------------------------------------------------------------------------
2661*/
2662floatx80 floatx80_round_to_int( floatx80 a )
2663{
2664    flag aSign;
2665    int32 aExp;
2666    bits64 lastBitMask, roundBitsMask;
2667    int8 roundingMode;
2668    floatx80 z;
2669
2670    aExp = extractFloatx80Exp( a );
2671    if ( 0x403E <= aExp ) {
2672        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2673            return propagateFloatx80NaN( a, a );
2674        }
2675        return a;
2676    }
2677    if ( aExp <= 0x3FFE ) {
2678        if (    ( aExp == 0 )
2679             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2680            return a;
2681        }
2682        float_exception_flags |= float_flag_inexact;
2683        aSign = extractFloatx80Sign( a );
2684        switch ( float_rounding_mode ) {
2685         case float_round_nearest_even:
2686            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2687               ) {
2688                return
2689                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2690            }
2691            break;
2692         case float_round_down:
2693            return
2694                  aSign ?
2695                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2696                : packFloatx80( 0, 0, 0 );
2697         case float_round_up:
2698            return
2699                  aSign ? packFloatx80( 1, 0, 0 )
2700                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2701        }
2702        return packFloatx80( aSign, 0, 0 );
2703    }
2704    lastBitMask = 1;
2705    lastBitMask <<= 0x403E - aExp;
2706    roundBitsMask = lastBitMask - 1;
2707    z = a;
2708    roundingMode = float_rounding_mode;
2709    if ( roundingMode == float_round_nearest_even ) {
2710        z.low += lastBitMask>>1;
2711        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2712    }
2713    else if ( roundingMode != float_round_to_zero ) {
2714        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2715            z.low += roundBitsMask;
2716        }
2717    }
2718    z.low &= ~ roundBitsMask;
2719    if ( z.low == 0 ) {
2720        ++z.high;
2721        z.low = LIT64( 0x8000000000000000 );
2722    }
2723    if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
2724    return z;
2725
2726}
2727
2728/*
2729-------------------------------------------------------------------------------
2730Returns the result of adding the absolute values of the extended double-
2731precision floating-point values `a' and `b'.  If `zSign' is true, the sum is
2732negated before being returned.  `zSign' is ignored if the result is a NaN.
2733The addition is performed according to the IEC/IEEE Standard for Binary
2734Floating-point Arithmetic.
2735-------------------------------------------------------------------------------
2736*/
2737static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2738{
2739    int32 aExp, bExp, zExp;
2740    bits64 aSig, bSig, zSig0, zSig1;
2741    int32 expDiff;
2742
2743    aSig = extractFloatx80Frac( a );
2744    aExp = extractFloatx80Exp( a );
2745    bSig = extractFloatx80Frac( b );
2746    bExp = extractFloatx80Exp( b );
2747    expDiff = aExp - bExp;
2748    if ( 0 < expDiff ) {
2749        if ( aExp == 0x7FFF ) {
2750            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2751            return a;
2752        }
2753        if ( bExp == 0 ) --expDiff;
2754        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2755        zExp = aExp;
2756    }
2757    else if ( expDiff < 0 ) {
2758        if ( bExp == 0x7FFF ) {
2759            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2760            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2761        }
2762        if ( aExp == 0 ) ++expDiff;
2763        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2764        zExp = bExp;
2765    }
2766    else {
2767        if ( aExp == 0x7FFF ) {
2768            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2769                return propagateFloatx80NaN( a, b );
2770            }
2771            return a;
2772        }
2773        zSig1 = 0;
2774        zSig0 = aSig + bSig;
2775        if ( aExp == 0 ) {
2776            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2777            goto roundAndPack;
2778        }
2779        zExp = aExp;
2780        goto shiftRight1;
2781    }
2782
2783    zSig0 = aSig + bSig;
2784
2785    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2786 shiftRight1:
2787    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2788    zSig0 |= LIT64( 0x8000000000000000 );
2789    ++zExp;
2790 roundAndPack:
2791    return
2792        roundAndPackFloatx80(
2793            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2794
2795}
2796
2797/*
2798-------------------------------------------------------------------------------
2799Returns the result of subtracting the absolute values of the extended
2800double-precision floating-point values `a' and `b'.  If `zSign' is true,
2801the difference is negated before being returned.  `zSign' is ignored if the
2802result is a NaN.  The subtraction is performed according to the IEC/IEEE
2803Standard for Binary Floating-point Arithmetic.
2804-------------------------------------------------------------------------------
2805*/
2806static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2807{
2808    int32 aExp, bExp, zExp;
2809    bits64 aSig, bSig, zSig0, zSig1;
2810    int32 expDiff;
2811    floatx80 z;
2812
2813    aSig = extractFloatx80Frac( a );
2814    aExp = extractFloatx80Exp( a );
2815    bSig = extractFloatx80Frac( b );
2816    bExp = extractFloatx80Exp( b );
2817    expDiff = aExp - bExp;
2818    if ( 0 < expDiff ) goto aExpBigger;
2819    if ( expDiff < 0 ) goto bExpBigger;
2820    if ( aExp == 0x7FFF ) {
2821        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2822            return propagateFloatx80NaN( a, b );
2823        }
2824        float_raise( float_flag_invalid );
2825        z.low = floatx80_default_nan_low;
2826        z.high = floatx80_default_nan_high;
2827        return z;
2828    }
2829    if ( aExp == 0 ) {
2830        aExp = 1;
2831        bExp = 1;
2832    }
2833    zSig1 = 0;
2834    if ( bSig < aSig ) goto aBigger;
2835    if ( aSig < bSig ) goto bBigger;
2836    return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
2837 bExpBigger:
2838    if ( bExp == 0x7FFF ) {
2839        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2840        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2841    }
2842    if ( aExp == 0 ) ++expDiff;
2843    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2844 bBigger:
2845    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2846    zExp = bExp;
2847    zSign ^= 1;
2848    goto normalizeRoundAndPack;
2849 aExpBigger:
2850    if ( aExp == 0x7FFF ) {
2851        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2852        return a;
2853    }
2854    if ( bExp == 0 ) --expDiff;
2855    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2856 aBigger:
2857    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2858    zExp = aExp;
2859 normalizeRoundAndPack:
2860    return
2861        normalizeRoundAndPackFloatx80(
2862            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2863
2864}
2865
2866/*
2867-------------------------------------------------------------------------------
2868Returns the result of adding the extended double-precision floating-point
2869values `a' and `b'.  The operation is performed according to the IEC/IEEE
2870Standard for Binary Floating-point Arithmetic.
2871-------------------------------------------------------------------------------
2872*/
2873floatx80 floatx80_add( floatx80 a, floatx80 b )
2874{
2875    flag aSign, bSign;
2876
2877    aSign = extractFloatx80Sign( a );
2878    bSign = extractFloatx80Sign( b );
2879    if ( aSign == bSign ) {
2880        return addFloatx80Sigs( a, b, aSign );
2881    }
2882    else {
2883        return subFloatx80Sigs( a, b, aSign );
2884    }
2885
2886}
2887
2888/*
2889-------------------------------------------------------------------------------
2890Returns the result of subtracting the extended double-precision floating-
2891point values `a' and `b'.  The operation is performed according to the
2892IEC/IEEE Standard for Binary Floating-point Arithmetic.
2893-------------------------------------------------------------------------------
2894*/
2895floatx80 floatx80_sub( floatx80 a, floatx80 b )
2896{
2897    flag aSign, bSign;
2898
2899    aSign = extractFloatx80Sign( a );
2900    bSign = extractFloatx80Sign( b );
2901    if ( aSign == bSign ) {
2902        return subFloatx80Sigs( a, b, aSign );
2903    }
2904    else {
2905        return addFloatx80Sigs( a, b, aSign );
2906    }
2907
2908}
2909
2910/*
2911-------------------------------------------------------------------------------
2912Returns the result of multiplying the extended double-precision floating-
2913point values `a' and `b'.  The operation is performed according to the
2914IEC/IEEE Standard for Binary Floating-point Arithmetic.
2915-------------------------------------------------------------------------------
2916*/
2917floatx80 floatx80_mul( floatx80 a, floatx80 b )
2918{
2919    flag aSign, bSign, zSign;
2920    int32 aExp, bExp, zExp;
2921    bits64 aSig, bSig, zSig0, zSig1;
2922    floatx80 z;
2923
2924    aSig = extractFloatx80Frac( a );
2925    aExp = extractFloatx80Exp( a );
2926    aSign = extractFloatx80Sign( a );
2927    bSig = extractFloatx80Frac( b );
2928    bExp = extractFloatx80Exp( b );
2929    bSign = extractFloatx80Sign( b );
2930    zSign = aSign ^ bSign;
2931    if ( aExp == 0x7FFF ) {
2932        if (    (bits64) ( aSig<<1 )
2933             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2934            return propagateFloatx80NaN( a, b );
2935        }
2936        if ( ( bExp | bSig ) == 0 ) goto invalid;
2937        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2938    }
2939    if ( bExp == 0x7FFF ) {
2940        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2941        if ( ( aExp | aSig ) == 0 ) {
2942 invalid:
2943            float_raise( float_flag_invalid );
2944            z.low = floatx80_default_nan_low;
2945            z.high = floatx80_default_nan_high;
2946            return z;
2947        }
2948        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2949    }
2950    if ( aExp == 0 ) {
2951        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2952        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2953    }
2954    if ( bExp == 0 ) {
2955        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2956        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2957    }
2958    zExp = aExp + bExp - 0x3FFE;
2959    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2960    if ( 0 < (sbits64) zSig0 ) {
2961        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2962        --zExp;
2963    }
2964    return
2965        roundAndPackFloatx80(
2966            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2967
2968}
2969
2970/*
2971-------------------------------------------------------------------------------
2972Returns the result of dividing the extended double-precision floating-point
2973value `a' by the corresponding value `b'.  The operation is performed
2974according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2975-------------------------------------------------------------------------------
2976*/
2977floatx80 floatx80_div( floatx80 a, floatx80 b )
2978{
2979    flag aSign, bSign, zSign;
2980    int32 aExp, bExp, zExp;
2981    bits64 aSig, bSig, zSig0, zSig1;
2982    bits64 rem0, rem1, rem2, term0, term1, term2;
2983    floatx80 z;
2984
2985    aSig = extractFloatx80Frac( a );
2986    aExp = extractFloatx80Exp( a );
2987    aSign = extractFloatx80Sign( a );
2988    bSig = extractFloatx80Frac( b );
2989    bExp = extractFloatx80Exp( b );
2990    bSign = extractFloatx80Sign( b );
2991    zSign = aSign ^ bSign;
2992    if ( aExp == 0x7FFF ) {
2993        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2994        if ( bExp == 0x7FFF ) {
2995            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2996            goto invalid;
2997        }
2998        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2999    }
3000    if ( bExp == 0x7FFF ) {
3001        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3002        return packFloatx80( zSign, 0, 0 );
3003    }
3004    if ( bExp == 0 ) {
3005        if ( bSig == 0 ) {
3006            if ( ( aExp | aSig ) == 0 ) {
3007 invalid:
3008                float_raise( float_flag_invalid );
3009                z.low = floatx80_default_nan_low;
3010                z.high = floatx80_default_nan_high;
3011                return z;
3012            }
3013            float_raise( float_flag_divbyzero );
3014            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3015        }
3016        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3017    }
3018    if ( aExp == 0 ) {
3019        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3020        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3021    }
3022    zExp = aExp - bExp + 0x3FFE;
3023    rem1 = 0;
3024    if ( bSig <= aSig ) {
3025        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3026        ++zExp;
3027    }
3028    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3029    mul64To128( bSig, zSig0, &term0, &term1 );
3030    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3031    while ( (sbits64) rem0 < 0 ) {
3032        --zSig0;
3033        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3034    }
3035    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3036    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3037        mul64To128( bSig, zSig1, &term1, &term2 );
3038        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3039        while ( (sbits64) rem1 < 0 ) {
3040            --zSig1;
3041            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3042        }
3043        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3044    }
3045    return
3046        roundAndPackFloatx80(
3047            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3048
3049}
3050
3051/*
3052-------------------------------------------------------------------------------
3053Returns the remainder of the extended double-precision floating-point value
3054`a' with respect to the corresponding value `b'.  The operation is performed
3055according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3056-------------------------------------------------------------------------------
3057*/
3058floatx80 floatx80_rem( floatx80 a, floatx80 b )
3059{
3060    flag aSign, bSign, zSign;
3061    int32 aExp, bExp, expDiff;
3062    bits64 aSig0, aSig1, bSig;
3063    bits64 q, term0, term1, alternateASig0, alternateASig1;
3064    floatx80 z;
3065
3066    aSig0 = extractFloatx80Frac( a );
3067    aExp = extractFloatx80Exp( a );
3068    aSign = extractFloatx80Sign( a );
3069    bSig = extractFloatx80Frac( b );
3070    bExp = extractFloatx80Exp( b );
3071    bSign = extractFloatx80Sign( b );
3072    if ( aExp == 0x7FFF ) {
3073        if (    (bits64) ( aSig0<<1 )
3074             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3075            return propagateFloatx80NaN( a, b );
3076        }
3077        goto invalid;
3078    }
3079    if ( bExp == 0x7FFF ) {
3080        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3081        return a;
3082    }
3083    if ( bExp == 0 ) {
3084        if ( bSig == 0 ) {
3085 invalid:
3086            float_raise( float_flag_invalid );
3087            z.low = floatx80_default_nan_low;
3088            z.high = floatx80_default_nan_high;
3089            return z;
3090        }
3091        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3092    }
3093    if ( aExp == 0 ) {
3094        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3095        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3096    }
3097    bSig |= LIT64( 0x8000000000000000 );
3098    zSign = aSign;
3099    expDiff = aExp - bExp;
3100    aSig1 = 0;
3101    if ( expDiff < 0 ) {
3102        if ( expDiff < -1 ) return a;
3103        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3104        expDiff = 0;
3105    }
3106    q = ( bSig <= aSig0 );
3107    if ( q ) aSig0 -= bSig;
3108    expDiff -= 64;
3109    while ( 0 < expDiff ) {
3110        q = estimateDiv128To64( aSig0, aSig1, bSig );
3111        q = ( 2 < q ) ? q - 2 : 0;
3112        mul64To128( bSig, q, &term0, &term1 );
3113        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3114        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3115        expDiff -= 62;
3116    }
3117    expDiff += 64;
3118    if ( 0 < expDiff ) {
3119        q = estimateDiv128To64( aSig0, aSig1, bSig );
3120        q = ( 2 < q ) ? q - 2 : 0;
3121        q >>= 64 - expDiff;
3122        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3123        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3124        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3125        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3126            ++q;
3127            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3128        }
3129    }
3130    else {
3131        term1 = 0;
3132        term0 = bSig;
3133    }
3134    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3135    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3136         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3137              && ( q & 1 ) )
3138       ) {
3139        aSig0 = alternateASig0;
3140        aSig1 = alternateASig1;
3141        zSign = ! zSign;
3142    }
3143    return
3144        normalizeRoundAndPackFloatx80(
3145            80, zSign, bExp + expDiff, aSig0, aSig1 );
3146
3147}
3148
3149/*
3150-------------------------------------------------------------------------------
3151Returns the square root of the extended double-precision floating-point
3152value `a'.  The operation is performed according to the IEC/IEEE Standard
3153for Binary Floating-point Arithmetic.
3154-------------------------------------------------------------------------------
3155*/
3156floatx80 floatx80_sqrt( floatx80 a )
3157{
3158    flag aSign;
3159    int32 aExp, zExp;
3160    bits64 aSig0, aSig1, zSig0, zSig1;
3161    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3162    bits64 shiftedRem0, shiftedRem1;
3163    floatx80 z;
3164
3165    aSig0 = extractFloatx80Frac( a );
3166    aExp = extractFloatx80Exp( a );
3167    aSign = extractFloatx80Sign( a );
3168    if ( aExp == 0x7FFF ) {
3169        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3170        if ( ! aSign ) return a;
3171        goto invalid;
3172    }
3173    if ( aSign ) {
3174        if ( ( aExp | aSig0 ) == 0 ) return a;
3175 invalid:
3176        float_raise( float_flag_invalid );
3177        z.low = floatx80_default_nan_low;
3178        z.high = floatx80_default_nan_high;
3179        return z;
3180    }
3181    if ( aExp == 0 ) {
3182        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3183        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3184    }
3185    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3186    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3187    zSig0 <<= 31;
3188    aSig1 = 0;
3189    shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3190    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3191    if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3192    shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3193    mul64To128( zSig0, zSig0, &term0, &term1 );
3194    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3195    while ( (sbits64) rem0 < 0 ) {
3196        --zSig0;
3197        shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3198        term1 |= 1;
3199        add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3200    }
3201    shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3202    zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3203    if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3204        if ( zSig1 == 0 ) zSig1 = 1;
3205        mul64To128( zSig0, zSig1, &term1, &term2 );
3206        shortShift128Left( term1, term2, 1, &term1, &term2 );
3207        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3208        mul64To128( zSig1, zSig1, &term2, &term3 );
3209        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3210        while ( (sbits64) rem1 < 0 ) {
3211            --zSig1;
3212            shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3213            term3 |= 1;
3214            add192(
3215                rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3216        }
3217        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3218    }
3219    return
3220        roundAndPackFloatx80(
3221            floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3222
3223}
3224
3225/*
3226-------------------------------------------------------------------------------
3227Returns 1 if the extended double-precision floating-point value `a' is
3228equal to the corresponding value `b', and 0 otherwise.  The comparison is
3229performed according to the IEC/IEEE Standard for Binary Floating-point
3230Arithmetic.
3231-------------------------------------------------------------------------------
3232*/
3233flag floatx80_eq( floatx80 a, floatx80 b )
3234{
3235
3236    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3237              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3238         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3239              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3240       ) {
3241        if (    floatx80_is_signaling_nan( a )
3242             || floatx80_is_signaling_nan( b ) ) {
3243            float_raise( float_flag_invalid );
3244        }
3245        return 0;
3246    }
3247    return
3248           ( a.low == b.low )
3249        && (    ( a.high == b.high )
3250             || (    ( a.low == 0 )
3251                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3252           );
3253
3254}
3255
3256/*
3257-------------------------------------------------------------------------------
3258Returns 1 if the extended double-precision floating-point value `a' is
3259less than or equal to the corresponding value `b', and 0 otherwise.  The
3260comparison is performed according to the IEC/IEEE Standard for Binary
3261Floating-point Arithmetic.
3262-------------------------------------------------------------------------------
3263*/
3264flag floatx80_le( floatx80 a, floatx80 b )
3265{
3266    flag aSign, bSign;
3267
3268    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3269              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3270         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3271              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3272       ) {
3273        float_raise( float_flag_invalid );
3274        return 0;
3275    }
3276    aSign = extractFloatx80Sign( a );
3277    bSign = extractFloatx80Sign( b );
3278    if ( aSign != bSign ) {
3279        return
3280               aSign
3281            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3282                 == 0 );
3283    }
3284    return
3285          aSign ? le128( b.high, b.low, a.high, a.low )
3286        : le128( a.high, a.low, b.high, b.low );
3287
3288}
3289
3290/*
3291-------------------------------------------------------------------------------
3292Returns 1 if the extended double-precision floating-point value `a' is
3293less than the corresponding value `b', and 0 otherwise.  The comparison
3294is performed according to the IEC/IEEE Standard for Binary Floating-point
3295Arithmetic.
3296-------------------------------------------------------------------------------
3297*/
3298flag floatx80_lt( floatx80 a, floatx80 b )
3299{
3300    flag aSign, bSign;
3301
3302    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3303              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3304         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3305              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3306       ) {
3307        float_raise( float_flag_invalid );
3308        return 0;
3309    }
3310    aSign = extractFloatx80Sign( a );
3311    bSign = extractFloatx80Sign( b );
3312    if ( aSign != bSign ) {
3313        return
3314               aSign
3315            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3316                 != 0 );
3317    }
3318    return
3319          aSign ? lt128( b.high, b.low, a.high, a.low )
3320        : lt128( a.high, a.low, b.high, b.low );
3321
3322}
3323
3324/*
3325-------------------------------------------------------------------------------
3326Returns 1 if the extended double-precision floating-point value `a' is equal
3327to the corresponding value `b', and 0 otherwise.  The invalid exception is
3328raised if either operand is a NaN.  Otherwise, the comparison is performed
3329according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3330-------------------------------------------------------------------------------
3331*/
3332flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3333{
3334
3335    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3336              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3337         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3338              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3339       ) {
3340        float_raise( float_flag_invalid );
3341        return 0;
3342    }
3343    return
3344           ( a.low == b.low )
3345        && (    ( a.high == b.high )
3346             || (    ( a.low == 0 )
3347                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3348           );
3349
3350}
3351
3352/*
3353-------------------------------------------------------------------------------
3354Returns 1 if the extended double-precision floating-point value `a' is less
3355than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3356do not cause an exception.  Otherwise, the comparison is performed according
3357to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3358-------------------------------------------------------------------------------
3359*/
3360flag floatx80_le_quiet( floatx80 a, floatx80 b )
3361{
3362    flag aSign, bSign;
3363
3364    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3365              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3366         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3367              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3368       ) {
3369        if (    floatx80_is_signaling_nan( a )
3370             || floatx80_is_signaling_nan( b ) ) {
3371            float_raise( float_flag_invalid );
3372        }
3373        return 0;
3374    }
3375    aSign = extractFloatx80Sign( a );
3376    bSign = extractFloatx80Sign( b );
3377    if ( aSign != bSign ) {
3378        return
3379               aSign
3380            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3381                 == 0 );
3382    }
3383    return
3384          aSign ? le128( b.high, b.low, a.high, a.low )
3385        : le128( a.high, a.low, b.high, b.low );
3386
3387}
3388
3389/*
3390-------------------------------------------------------------------------------
3391Returns 1 if the extended double-precision floating-point value `a' is less
3392than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
3393an exception.  Otherwise, the comparison is performed according to the
3394IEC/IEEE Standard for Binary Floating-point Arithmetic.
3395-------------------------------------------------------------------------------
3396*/
3397flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3398{
3399    flag aSign, bSign;
3400
3401    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3402              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3403         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3404              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3405       ) {
3406        if (    floatx80_is_signaling_nan( a )
3407             || floatx80_is_signaling_nan( b ) ) {
3408            float_raise( float_flag_invalid );
3409        }
3410        return 0;
3411    }
3412    aSign = extractFloatx80Sign( a );
3413    bSign = extractFloatx80Sign( b );
3414    if ( aSign != bSign ) {
3415        return
3416               aSign
3417            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3418                 != 0 );
3419    }
3420    return
3421          aSign ? lt128( b.high, b.low, a.high, a.low )
3422        : lt128( a.high, a.low, b.high, b.low );
3423
3424}
3425
3426#endif
3427