1/*
2===============================================================================
3
4This C source file is part of the SoftFloat IEC/IEEE Floating-point
5Arithmetic Package, Release 2.
6
7Written by John R. Hauser.  This work was made possible in part by the
8International Computer Science Institute, located at Suite 600, 1947 Center
9Street, Berkeley, California 94704.  Funding was partially provided by the
10National Science Foundation under grant MIP-9311980.  The original version
11of this code was written as part of a project to build a fixed-point vector
12processor in collaboration with the University of California at Berkeley,
13overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15arithmetic/softfloat.html'.
16
17THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
18has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
20PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
22
23Derivative works are acceptable, even for commercial purposes, so long as
24(1) they include prominent notice that the work is derivative, and (2) they
25include prominent notice akin to these three paragraphs for those parts of
26this code that are retained.
27
28===============================================================================
29*/
30
31#include <asm/div64.h>
32
33#include "fpa11.h"
34//#include "milieu.h"
35//#include "softfloat.h"
36
37/*
38-------------------------------------------------------------------------------
39Primitive arithmetic functions, including multi-word arithmetic, and
40division and square root approximations.  (Can be specialized to target if
41desired.)
42-------------------------------------------------------------------------------
43*/
44#include "softfloat-macros"
45
46/*
47-------------------------------------------------------------------------------
48Functions and definitions to determine:  (1) whether tininess for underflow
49is detected before or after rounding by default, (2) what (if anything)
50happens when exceptions are raised, (3) how signaling NaNs are distinguished
51from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52are propagated from function inputs to output.  These details are target-
53specific.
54-------------------------------------------------------------------------------
55*/
56#include "softfloat-specialize"
57
58/*
59-------------------------------------------------------------------------------
60Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
61and 7, and returns the properly rounded 32-bit integer corresponding to the
62input.  If `zSign' is nonzero, the input is negated before being converted
63to an integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point
64input is simply rounded to an integer, with the inexact exception raised if
65the input cannot be represented exactly as an integer.  If the fixed-point
66input is too large, however, the invalid exception is raised and the largest
67positive or negative integer is returned.
68-------------------------------------------------------------------------------
69*/
70static int32 roundAndPackInt32( struct roundingData *roundData, flag zSign, bits64 absZ )
71{
72    int8 roundingMode;
73    flag roundNearestEven;
74    int8 roundIncrement, roundBits;
75    int32 z;
76
77    roundingMode = roundData->mode;
78    roundNearestEven = ( roundingMode == float_round_nearest_even );
79    roundIncrement = 0x40;
80    if ( ! roundNearestEven ) {
81        if ( roundingMode == float_round_to_zero ) {
82            roundIncrement = 0;
83        }
84        else {
85            roundIncrement = 0x7F;
86            if ( zSign ) {
87                if ( roundingMode == float_round_up ) roundIncrement = 0;
88            }
89            else {
90                if ( roundingMode == float_round_down ) roundIncrement = 0;
91            }
92        }
93    }
94    roundBits = absZ & 0x7F;
95    absZ = ( absZ + roundIncrement )>>7;
96    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
97    z = absZ;
98    if ( zSign ) z = - z;
99    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
100        roundData->exception |= float_flag_invalid;
101        return zSign ? 0x80000000 : 0x7FFFFFFF;
102    }
103    if ( roundBits ) roundData->exception |= float_flag_inexact;
104    return z;
105
106}
107
108/*
109-------------------------------------------------------------------------------
110Returns the fraction bits of the single-precision floating-point value `a'.
111-------------------------------------------------------------------------------
112*/
113INLINE bits32 extractFloat32Frac( float32 a )
114{
115
116    return a & 0x007FFFFF;
117
118}
119
120/*
121-------------------------------------------------------------------------------
122Returns the exponent bits of the single-precision floating-point value `a'.
123-------------------------------------------------------------------------------
124*/
125INLINE int16 extractFloat32Exp( float32 a )
126{
127
128    return ( a>>23 ) & 0xFF;
129
130}
131
132/*
133-------------------------------------------------------------------------------
134Returns the sign bit of the single-precision floating-point value `a'.
135-------------------------------------------------------------------------------
136*/
137
138/*
139-------------------------------------------------------------------------------
140Normalizes the subnormal single-precision floating-point value represented
141by the denormalized significand `aSig'.  The normalized exponent and
142significand are stored at the locations pointed to by `zExpPtr' and
143`zSigPtr', respectively.
144-------------------------------------------------------------------------------
145*/
146static void
147 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
148{
149    int8 shiftCount;
150
151    shiftCount = countLeadingZeros32( aSig ) - 8;
152    *zSigPtr = aSig<<shiftCount;
153    *zExpPtr = 1 - shiftCount;
154
155}
156
157/*
158-------------------------------------------------------------------------------
159Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
160single-precision floating-point value, returning the result.  After being
161shifted into the proper positions, the three fields are simply added
162together to form the result.  This means that any integer portion of `zSig'
163will be added into the exponent.  Since a properly normalized significand
164will have an integer portion equal to 1, the `zExp' input should be 1 less
165than the desired result exponent whenever `zSig' is a complete, normalized
166significand.
167-------------------------------------------------------------------------------
168*/
169INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
170{
171    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
172}
173
174/*
175-------------------------------------------------------------------------------
176Takes an abstract floating-point value having sign `zSign', exponent `zExp',
177and significand `zSig', and returns the proper single-precision floating-
178point value corresponding to the abstract input.  Ordinarily, the abstract
179value is simply rounded and packed into the single-precision format, with
180the inexact exception raised if the abstract input cannot be represented
181exactly.  If the abstract value is too large, however, the overflow and
182inexact exceptions are raised and an infinity or maximal finite value is
183returned.  If the abstract value is too small, the input value is rounded to
184a subnormal number, and the underflow and inexact exceptions are raised if
185the abstract input cannot be represented exactly as a subnormal single-
186precision floating-point number.
187    The input significand `zSig' has its binary point between bits 30
188and 29, which is 7 bits to the left of the usual location.  This shifted
189significand must be normalized or smaller.  If `zSig' is not normalized,
190`zExp' must be 0; in that case, the result returned is a subnormal number,
191and it must not require rounding.  In the usual case that `zSig' is
192normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
193The handling of underflow and overflow follows the IEC/IEEE Standard for
194Binary Floating-point Arithmetic.
195-------------------------------------------------------------------------------
196*/
197static float32 roundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
198{
199    int8 roundingMode;
200    flag roundNearestEven;
201    int8 roundIncrement, roundBits;
202    flag isTiny;
203
204    roundingMode = roundData->mode;
205    roundNearestEven = ( roundingMode == float_round_nearest_even );
206    roundIncrement = 0x40;
207    if ( ! roundNearestEven ) {
208        if ( roundingMode == float_round_to_zero ) {
209            roundIncrement = 0;
210        }
211        else {
212            roundIncrement = 0x7F;
213            if ( zSign ) {
214                if ( roundingMode == float_round_up ) roundIncrement = 0;
215            }
216            else {
217                if ( roundingMode == float_round_down ) roundIncrement = 0;
218            }
219        }
220    }
221    roundBits = zSig & 0x7F;
222    if ( 0xFD <= (bits16) zExp ) {
223        if (    ( 0xFD < zExp )
224             || (    ( zExp == 0xFD )
225                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
226           ) {
227            roundData->exception |= float_flag_overflow | float_flag_inexact;
228            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
229        }
230        if ( zExp < 0 ) {
231            isTiny =
232                   ( float_detect_tininess == float_tininess_before_rounding )
233                || ( zExp < -1 )
234                || ( zSig + roundIncrement < 0x80000000 );
235            shift32RightJamming( zSig, - zExp, &zSig );
236            zExp = 0;
237            roundBits = zSig & 0x7F;
238            if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
239        }
240    }
241    if ( roundBits ) roundData->exception |= float_flag_inexact;
242    zSig = ( zSig + roundIncrement )>>7;
243    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
244    if ( zSig == 0 ) zExp = 0;
245    return packFloat32( zSign, zExp, zSig );
246
247}
248
249/*
250-------------------------------------------------------------------------------
251Takes an abstract floating-point value having sign `zSign', exponent `zExp',
252and significand `zSig', and returns the proper single-precision floating-
253point value corresponding to the abstract input.  This routine is just like
254`roundAndPackFloat32' except that `zSig' does not have to be normalized in
255any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
256point exponent.
257-------------------------------------------------------------------------------
258*/
259static float32
260 normalizeRoundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
261{
262    int8 shiftCount;
263
264    shiftCount = countLeadingZeros32( zSig ) - 1;
265    return roundAndPackFloat32( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
266
267}
268
269/*
270-------------------------------------------------------------------------------
271Returns the fraction bits of the double-precision floating-point value `a'.
272-------------------------------------------------------------------------------
273*/
274INLINE bits64 extractFloat64Frac( float64 a )
275{
276
277    return a & LIT64( 0x000FFFFFFFFFFFFF );
278
279}
280
281/*
282-------------------------------------------------------------------------------
283Returns the exponent bits of the double-precision floating-point value `a'.
284-------------------------------------------------------------------------------
285*/
286INLINE int16 extractFloat64Exp( float64 a )
287{
288
289    return ( a>>52 ) & 0x7FF;
290
291}
292
293/*
294-------------------------------------------------------------------------------
295Returns the sign bit of the double-precision floating-point value `a'.
296-------------------------------------------------------------------------------
297*/
298
299/*
300-------------------------------------------------------------------------------
301Normalizes the subnormal double-precision floating-point value represented
302by the denormalized significand `aSig'.  The normalized exponent and
303significand are stored at the locations pointed to by `zExpPtr' and
304`zSigPtr', respectively.
305-------------------------------------------------------------------------------
306*/
307static void
308 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
309{
310    int8 shiftCount;
311
312    shiftCount = countLeadingZeros64( aSig ) - 11;
313    *zSigPtr = aSig<<shiftCount;
314    *zExpPtr = 1 - shiftCount;
315
316}
317
318/*
319-------------------------------------------------------------------------------
320Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
321double-precision floating-point value, returning the result.  After being
322shifted into the proper positions, the three fields are simply added
323together to form the result.  This means that any integer portion of `zSig'
324will be added into the exponent.  Since a properly normalized significand
325will have an integer portion equal to 1, the `zExp' input should be 1 less
326than the desired result exponent whenever `zSig' is a complete, normalized
327significand.
328-------------------------------------------------------------------------------
329*/
330INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
331{
332
333    return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
334
335}
336
337/*
338-------------------------------------------------------------------------------
339Takes an abstract floating-point value having sign `zSign', exponent `zExp',
340and significand `zSig', and returns the proper double-precision floating-
341point value corresponding to the abstract input.  Ordinarily, the abstract
342value is simply rounded and packed into the double-precision format, with
343the inexact exception raised if the abstract input cannot be represented
344exactly.  If the abstract value is too large, however, the overflow and
345inexact exceptions are raised and an infinity or maximal finite value is
346returned.  If the abstract value is too small, the input value is rounded to
347a subnormal number, and the underflow and inexact exceptions are raised if
348the abstract input cannot be represented exactly as a subnormal double-
349precision floating-point number.
350    The input significand `zSig' has its binary point between bits 62
351and 61, which is 10 bits to the left of the usual location.  This shifted
352significand must be normalized or smaller.  If `zSig' is not normalized,
353`zExp' must be 0; in that case, the result returned is a subnormal number,
354and it must not require rounding.  In the usual case that `zSig' is
355normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
356The handling of underflow and overflow follows the IEC/IEEE Standard for
357Binary Floating-point Arithmetic.
358-------------------------------------------------------------------------------
359*/
360static float64 roundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
361{
362    int8 roundingMode;
363    flag roundNearestEven;
364    int16 roundIncrement, roundBits;
365    flag isTiny;
366
367    roundingMode = roundData->mode;
368    roundNearestEven = ( roundingMode == float_round_nearest_even );
369    roundIncrement = 0x200;
370    if ( ! roundNearestEven ) {
371        if ( roundingMode == float_round_to_zero ) {
372            roundIncrement = 0;
373        }
374        else {
375            roundIncrement = 0x3FF;
376            if ( zSign ) {
377                if ( roundingMode == float_round_up ) roundIncrement = 0;
378            }
379            else {
380                if ( roundingMode == float_round_down ) roundIncrement = 0;
381            }
382        }
383    }
384    roundBits = zSig & 0x3FF;
385    if ( 0x7FD <= (bits16) zExp ) {
386        if (    ( 0x7FD < zExp )
387             || (    ( zExp == 0x7FD )
388                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
389           ) {
390            //register int lr = __builtin_return_address(0);
391            //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
392            roundData->exception |= float_flag_overflow | float_flag_inexact;
393            return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
394        }
395        if ( zExp < 0 ) {
396            isTiny =
397                   ( float_detect_tininess == float_tininess_before_rounding )
398                || ( zExp < -1 )
399                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
400            shift64RightJamming( zSig, - zExp, &zSig );
401            zExp = 0;
402            roundBits = zSig & 0x3FF;
403            if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
404        }
405    }
406    if ( roundBits ) roundData->exception |= float_flag_inexact;
407    zSig = ( zSig + roundIncrement )>>10;
408    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
409    if ( zSig == 0 ) zExp = 0;
410    return packFloat64( zSign, zExp, zSig );
411
412}
413
414/*
415-------------------------------------------------------------------------------
416Takes an abstract floating-point value having sign `zSign', exponent `zExp',
417and significand `zSig', and returns the proper double-precision floating-
418point value corresponding to the abstract input.  This routine is just like
419`roundAndPackFloat64' except that `zSig' does not have to be normalized in
420any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
421point exponent.
422-------------------------------------------------------------------------------
423*/
424static float64
425 normalizeRoundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
426{
427    int8 shiftCount;
428
429    shiftCount = countLeadingZeros64( zSig ) - 1;
430    return roundAndPackFloat64( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
431
432}
433
434#ifdef FLOATX80
435
436/*
437-------------------------------------------------------------------------------
438Returns the fraction bits of the extended double-precision floating-point
439value `a'.
440-------------------------------------------------------------------------------
441*/
442INLINE bits64 extractFloatx80Frac( floatx80 a )
443{
444
445    return a.low;
446
447}
448
449/*
450-------------------------------------------------------------------------------
451Returns the exponent bits of the extended double-precision floating-point
452value `a'.
453-------------------------------------------------------------------------------
454*/
455INLINE int32 extractFloatx80Exp( floatx80 a )
456{
457
458    return a.high & 0x7FFF;
459
460}
461
462/*
463-------------------------------------------------------------------------------
464Returns the sign bit of the extended double-precision floating-point value
465`a'.
466-------------------------------------------------------------------------------
467*/
468INLINE flag extractFloatx80Sign( floatx80 a )
469{
470
471    return a.high>>15;
472
473}
474
475/*
476-------------------------------------------------------------------------------
477Normalizes the subnormal extended double-precision floating-point value
478represented by the denormalized significand `aSig'.  The normalized exponent
479and significand are stored at the locations pointed to by `zExpPtr' and
480`zSigPtr', respectively.
481-------------------------------------------------------------------------------
482*/
483static void
484 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
485{
486    int8 shiftCount;
487
488    shiftCount = countLeadingZeros64( aSig );
489    *zSigPtr = aSig<<shiftCount;
490    *zExpPtr = 1 - shiftCount;
491
492}
493
494/*
495-------------------------------------------------------------------------------
496Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
497extended double-precision floating-point value, returning the result.
498-------------------------------------------------------------------------------
499*/
500INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
501{
502    floatx80 z;
503
504    z.low = zSig;
505    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
506    z.__padding = 0;
507    return z;
508
509}
510
511/*
512-------------------------------------------------------------------------------
513Takes an abstract floating-point value having sign `zSign', exponent `zExp',
514and extended significand formed by the concatenation of `zSig0' and `zSig1',
515and returns the proper extended double-precision floating-point value
516corresponding to the abstract input.  Ordinarily, the abstract value is
517rounded and packed into the extended double-precision format, with the
518inexact exception raised if the abstract input cannot be represented
519exactly.  If the abstract value is too large, however, the overflow and
520inexact exceptions are raised and an infinity or maximal finite value is
521returned.  If the abstract value is too small, the input value is rounded to
522a subnormal number, and the underflow and inexact exceptions are raised if
523the abstract input cannot be represented exactly as a subnormal extended
524double-precision floating-point number.
525    If `roundingPrecision' is 32 or 64, the result is rounded to the same
526number of bits as single or double precision, respectively.  Otherwise, the
527result is rounded to the full precision of the extended double-precision
528format.
529    The input significand must be normalized or smaller.  If the input
530significand is not normalized, `zExp' must be 0; in that case, the result
531returned is a subnormal number, and it must not require rounding.  The
532handling of underflow and overflow follows the IEC/IEEE Standard for Binary
533Floating-point Arithmetic.
534-------------------------------------------------------------------------------
535*/
536static floatx80
537 roundAndPackFloatx80(
538     struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
539 )
540{
541    int8 roundingMode, roundingPrecision;
542    flag roundNearestEven, increment, isTiny;
543    int64 roundIncrement, roundMask, roundBits;
544
545    roundingMode = roundData->mode;
546    roundingPrecision = roundData->precision;
547    roundNearestEven = ( roundingMode == float_round_nearest_even );
548    if ( roundingPrecision == 80 ) goto precision80;
549    if ( roundingPrecision == 64 ) {
550        roundIncrement = LIT64( 0x0000000000000400 );
551        roundMask = LIT64( 0x00000000000007FF );
552    }
553    else if ( roundingPrecision == 32 ) {
554        roundIncrement = LIT64( 0x0000008000000000 );
555        roundMask = LIT64( 0x000000FFFFFFFFFF );
556    }
557    else {
558        goto precision80;
559    }
560    zSig0 |= ( zSig1 != 0 );
561    if ( ! roundNearestEven ) {
562        if ( roundingMode == float_round_to_zero ) {
563            roundIncrement = 0;
564        }
565        else {
566            roundIncrement = roundMask;
567            if ( zSign ) {
568                if ( roundingMode == float_round_up ) roundIncrement = 0;
569            }
570            else {
571                if ( roundingMode == float_round_down ) roundIncrement = 0;
572            }
573        }
574    }
575    roundBits = zSig0 & roundMask;
576    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
577        if (    ( 0x7FFE < zExp )
578             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
579           ) {
580            goto overflow;
581        }
582        if ( zExp <= 0 ) {
583            isTiny =
584                   ( float_detect_tininess == float_tininess_before_rounding )
585                || ( zExp < 0 )
586                || ( zSig0 <= zSig0 + roundIncrement );
587            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
588            zExp = 0;
589            roundBits = zSig0 & roundMask;
590            if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
591            if ( roundBits ) roundData->exception |= float_flag_inexact;
592            zSig0 += roundIncrement;
593            if ( (sbits64) zSig0 < 0 ) zExp = 1;
594            roundIncrement = roundMask + 1;
595            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
596                roundMask |= roundIncrement;
597            }
598            zSig0 &= ~ roundMask;
599            return packFloatx80( zSign, zExp, zSig0 );
600        }
601    }
602    if ( roundBits ) roundData->exception |= float_flag_inexact;
603    zSig0 += roundIncrement;
604    if ( zSig0 < roundIncrement ) {
605        ++zExp;
606        zSig0 = LIT64( 0x8000000000000000 );
607    }
608    roundIncrement = roundMask + 1;
609    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
610        roundMask |= roundIncrement;
611    }
612    zSig0 &= ~ roundMask;
613    if ( zSig0 == 0 ) zExp = 0;
614    return packFloatx80( zSign, zExp, zSig0 );
615 precision80:
616    increment = ( (sbits64) zSig1 < 0 );
617    if ( ! roundNearestEven ) {
618        if ( roundingMode == float_round_to_zero ) {
619            increment = 0;
620        }
621        else {
622            if ( zSign ) {
623                increment = ( roundingMode == float_round_down ) && zSig1;
624            }
625            else {
626                increment = ( roundingMode == float_round_up ) && zSig1;
627            }
628        }
629    }
630    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
631        if (    ( 0x7FFE < zExp )
632             || (    ( zExp == 0x7FFE )
633                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
634                  && increment
635                )
636           ) {
637            roundMask = 0;
638 overflow:
639            roundData->exception |= float_flag_overflow | float_flag_inexact;
640            if (    ( roundingMode == float_round_to_zero )
641                 || ( zSign && ( roundingMode == float_round_up ) )
642                 || ( ! zSign && ( roundingMode == float_round_down ) )
643               ) {
644                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
645            }
646            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
647        }
648        if ( zExp <= 0 ) {
649            isTiny =
650                   ( float_detect_tininess == float_tininess_before_rounding )
651                || ( zExp < 0 )
652                || ! increment
653                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
654            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
655            zExp = 0;
656            if ( isTiny && zSig1 ) roundData->exception |= float_flag_underflow;
657            if ( zSig1 ) roundData->exception |= float_flag_inexact;
658            if ( roundNearestEven ) {
659                increment = ( (sbits64) zSig1 < 0 );
660            }
661            else {
662                if ( zSign ) {
663                    increment = ( roundingMode == float_round_down ) && zSig1;
664                }
665                else {
666                    increment = ( roundingMode == float_round_up ) && zSig1;
667                }
668            }
669            if ( increment ) {
670                ++zSig0;
671                zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
672                if ( (sbits64) zSig0 < 0 ) zExp = 1;
673            }
674            return packFloatx80( zSign, zExp, zSig0 );
675        }
676    }
677    if ( zSig1 ) roundData->exception |= float_flag_inexact;
678    if ( increment ) {
679        ++zSig0;
680        if ( zSig0 == 0 ) {
681            ++zExp;
682            zSig0 = LIT64( 0x8000000000000000 );
683        }
684        else {
685            zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
686        }
687    }
688    else {
689        if ( zSig0 == 0 ) zExp = 0;
690    }
691
692    return packFloatx80( zSign, zExp, zSig0 );
693}
694
695/*
696-------------------------------------------------------------------------------
697Takes an abstract floating-point value having sign `zSign', exponent
698`zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
699and returns the proper extended double-precision floating-point value
700corresponding to the abstract input.  This routine is just like
701`roundAndPackFloatx80' except that the input significand does not have to be
702normalized.
703-------------------------------------------------------------------------------
704*/
705static floatx80
706 normalizeRoundAndPackFloatx80(
707     struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
708 )
709{
710    int8 shiftCount;
711
712    if ( zSig0 == 0 ) {
713        zSig0 = zSig1;
714        zSig1 = 0;
715        zExp -= 64;
716    }
717    shiftCount = countLeadingZeros64( zSig0 );
718    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
719    zExp -= shiftCount;
720    return
721        roundAndPackFloatx80( roundData, zSign, zExp, zSig0, zSig1 );
722
723}
724
725#endif
726
727/*
728-------------------------------------------------------------------------------
729Returns the result of converting the 32-bit two's complement integer `a' to
730the single-precision floating-point format.  The conversion is performed
731according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
732-------------------------------------------------------------------------------
733*/
734float32 int32_to_float32(struct roundingData *roundData, int32 a)
735{
736    flag zSign;
737
738    if ( a == 0 ) return 0;
739    if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
740    zSign = ( a < 0 );
741    return normalizeRoundAndPackFloat32( roundData, zSign, 0x9C, zSign ? - a : a );
742
743}
744
745/*
746-------------------------------------------------------------------------------
747Returns the result of converting the 32-bit two's complement integer `a' to
748the double-precision floating-point format.  The conversion is performed
749according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
750-------------------------------------------------------------------------------
751*/
752float64 int32_to_float64( int32 a )
753{
754    flag aSign;
755    uint32 absA;
756    int8 shiftCount;
757    bits64 zSig;
758
759    if ( a == 0 ) return 0;
760    aSign = ( a < 0 );
761    absA = aSign ? - a : a;
762    shiftCount = countLeadingZeros32( absA ) + 21;
763    zSig = absA;
764    return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
765
766}
767
768#ifdef FLOATX80
769
770/*
771-------------------------------------------------------------------------------
772Returns the result of converting the 32-bit two's complement integer `a'
773to the extended double-precision floating-point format.  The conversion
774is performed according to the IEC/IEEE Standard for Binary Floating-point
775Arithmetic.
776-------------------------------------------------------------------------------
777*/
778floatx80 int32_to_floatx80( int32 a )
779{
780    flag zSign;
781    uint32 absA;
782    int8 shiftCount;
783    bits64 zSig;
784
785    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
786    zSign = ( a < 0 );
787    absA = zSign ? - a : a;
788    shiftCount = countLeadingZeros32( absA ) + 32;
789    zSig = absA;
790    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
791
792}
793
794#endif
795
796/*
797-------------------------------------------------------------------------------
798Returns the result of converting the single-precision floating-point value
799`a' to the 32-bit two's complement integer format.  The conversion is
800performed according to the IEC/IEEE Standard for Binary Floating-point
801Arithmetic---which means in particular that the conversion is rounded
802according to the current rounding mode.  If `a' is a NaN, the largest
803positive integer is returned.  Otherwise, if the conversion overflows, the
804largest integer with the same sign as `a' is returned.
805-------------------------------------------------------------------------------
806*/
807int32 float32_to_int32( struct roundingData *roundData, float32 a )
808{
809    flag aSign;
810    int16 aExp, shiftCount;
811    bits32 aSig;
812    bits64 zSig;
813
814    aSig = extractFloat32Frac( a );
815    aExp = extractFloat32Exp( a );
816    aSign = extractFloat32Sign( a );
817    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
818    if ( aExp ) aSig |= 0x00800000;
819    shiftCount = 0xAF - aExp;
820    zSig = aSig;
821    zSig <<= 32;
822    if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
823    return roundAndPackInt32( roundData, aSign, zSig );
824
825}
826
827/*
828-------------------------------------------------------------------------------
829Returns the result of converting the single-precision floating-point value
830`a' to the 32-bit two's complement integer format.  The conversion is
831performed according to the IEC/IEEE Standard for Binary Floating-point
832Arithmetic, except that the conversion is always rounded toward zero.  If
833`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
834conversion overflows, the largest integer with the same sign as `a' is
835returned.
836-------------------------------------------------------------------------------
837*/
838int32 float32_to_int32_round_to_zero( float32 a )
839{
840    flag aSign;
841    int16 aExp, shiftCount;
842    bits32 aSig;
843    int32 z;
844
845    aSig = extractFloat32Frac( a );
846    aExp = extractFloat32Exp( a );
847    aSign = extractFloat32Sign( a );
848    shiftCount = aExp - 0x9E;
849    if ( 0 <= shiftCount ) {
850        if ( a == 0xCF000000 ) return 0x80000000;
851        float_raise( float_flag_invalid );
852        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
853        return 0x80000000;
854    }
855    else if ( aExp <= 0x7E ) {
856        if ( aExp | aSig ) float_raise( float_flag_inexact );
857        return 0;
858    }
859    aSig = ( aSig | 0x00800000 )<<8;
860    z = aSig>>( - shiftCount );
861    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
862        float_raise( float_flag_inexact );
863    }
864    return aSign ? - z : z;
865
866}
867
868/*
869-------------------------------------------------------------------------------
870Returns the result of converting the single-precision floating-point value
871`a' to the double-precision floating-point format.  The conversion is
872performed according to the IEC/IEEE Standard for Binary Floating-point
873Arithmetic.
874-------------------------------------------------------------------------------
875*/
876float64 float32_to_float64( float32 a )
877{
878    flag aSign;
879    int16 aExp;
880    bits32 aSig;
881
882    aSig = extractFloat32Frac( a );
883    aExp = extractFloat32Exp( a );
884    aSign = extractFloat32Sign( a );
885    if ( aExp == 0xFF ) {
886        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
887        return packFloat64( aSign, 0x7FF, 0 );
888    }
889    if ( aExp == 0 ) {
890        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
891        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
892        --aExp;
893    }
894    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
895
896}
897
898#ifdef FLOATX80
899
900/*
901-------------------------------------------------------------------------------
902Returns the result of converting the single-precision floating-point value
903`a' to the extended double-precision floating-point format.  The conversion
904is performed according to the IEC/IEEE Standard for Binary Floating-point
905Arithmetic.
906-------------------------------------------------------------------------------
907*/
908floatx80 float32_to_floatx80( float32 a )
909{
910    flag aSign;
911    int16 aExp;
912    bits32 aSig;
913
914    aSig = extractFloat32Frac( a );
915    aExp = extractFloat32Exp( a );
916    aSign = extractFloat32Sign( a );
917    if ( aExp == 0xFF ) {
918        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
919        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
920    }
921    if ( aExp == 0 ) {
922        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
923        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
924    }
925    aSig |= 0x00800000;
926    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
927
928}
929
930#endif
931
932/*
933-------------------------------------------------------------------------------
934Rounds the single-precision floating-point value `a' to an integer, and
935returns the result as a single-precision floating-point value.  The
936operation is performed according to the IEC/IEEE Standard for Binary
937Floating-point Arithmetic.
938-------------------------------------------------------------------------------
939*/
940float32 float32_round_to_int( struct roundingData *roundData, float32 a )
941{
942    flag aSign;
943    int16 aExp;
944    bits32 lastBitMask, roundBitsMask;
945    int8 roundingMode;
946    float32 z;
947
948    aExp = extractFloat32Exp( a );
949    if ( 0x96 <= aExp ) {
950        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
951            return propagateFloat32NaN( a, a );
952        }
953        return a;
954    }
955    roundingMode = roundData->mode;
956    if ( aExp <= 0x7E ) {
957        if ( (bits32) ( a<<1 ) == 0 ) return a;
958        roundData->exception |= float_flag_inexact;
959        aSign = extractFloat32Sign( a );
960        switch ( roundingMode ) {
961         case float_round_nearest_even:
962            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
963                return packFloat32( aSign, 0x7F, 0 );
964            }
965            break;
966         case float_round_down:
967            return aSign ? 0xBF800000 : 0;
968         case float_round_up:
969            return aSign ? 0x80000000 : 0x3F800000;
970        }
971        return packFloat32( aSign, 0, 0 );
972    }
973    lastBitMask = 1;
974    lastBitMask <<= 0x96 - aExp;
975    roundBitsMask = lastBitMask - 1;
976    z = a;
977    if ( roundingMode == float_round_nearest_even ) {
978        z += lastBitMask>>1;
979        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
980    }
981    else if ( roundingMode != float_round_to_zero ) {
982        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
983            z += roundBitsMask;
984        }
985    }
986    z &= ~ roundBitsMask;
987    if ( z != a ) roundData->exception |= float_flag_inexact;
988    return z;
989
990}
991
992/*
993-------------------------------------------------------------------------------
994Returns the result of adding the absolute values of the single-precision
995floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
996before being returned.  `zSign' is ignored if the result is a NaN.  The
997addition is performed according to the IEC/IEEE Standard for Binary
998Floating-point Arithmetic.
999-------------------------------------------------------------------------------
1000*/
1001static float32 addFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1002{
1003    int16 aExp, bExp, zExp;
1004    bits32 aSig, bSig, zSig;
1005    int16 expDiff;
1006
1007    aSig = extractFloat32Frac( a );
1008    aExp = extractFloat32Exp( a );
1009    bSig = extractFloat32Frac( b );
1010    bExp = extractFloat32Exp( b );
1011    expDiff = aExp - bExp;
1012    aSig <<= 6;
1013    bSig <<= 6;
1014    if ( 0 < expDiff ) {
1015        if ( aExp == 0xFF ) {
1016            if ( aSig ) return propagateFloat32NaN( a, b );
1017            return a;
1018        }
1019        if ( bExp == 0 ) {
1020            --expDiff;
1021        }
1022        else {
1023            bSig |= 0x20000000;
1024        }
1025        shift32RightJamming( bSig, expDiff, &bSig );
1026        zExp = aExp;
1027    }
1028    else if ( expDiff < 0 ) {
1029        if ( bExp == 0xFF ) {
1030            if ( bSig ) return propagateFloat32NaN( a, b );
1031            return packFloat32( zSign, 0xFF, 0 );
1032        }
1033        if ( aExp == 0 ) {
1034            ++expDiff;
1035        }
1036        else {
1037            aSig |= 0x20000000;
1038        }
1039        shift32RightJamming( aSig, - expDiff, &aSig );
1040        zExp = bExp;
1041    }
1042    else {
1043        if ( aExp == 0xFF ) {
1044            if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1045            return a;
1046        }
1047        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1048        zSig = 0x40000000 + aSig + bSig;
1049        zExp = aExp;
1050        goto roundAndPack;
1051    }
1052    aSig |= 0x20000000;
1053    zSig = ( aSig + bSig )<<1;
1054    --zExp;
1055    if ( (sbits32) zSig < 0 ) {
1056        zSig = aSig + bSig;
1057        ++zExp;
1058    }
1059 roundAndPack:
1060    return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1061
1062}
1063
1064/*
1065-------------------------------------------------------------------------------
1066Returns the result of subtracting the absolute values of the single-
1067precision floating-point values `a' and `b'.  If `zSign' is true, the
1068difference is negated before being returned.  `zSign' is ignored if the
1069result is a NaN.  The subtraction is performed according to the IEC/IEEE
1070Standard for Binary Floating-point Arithmetic.
1071-------------------------------------------------------------------------------
1072*/
1073static float32 subFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1074{
1075    int16 aExp, bExp, zExp;
1076    bits32 aSig, bSig, zSig;
1077    int16 expDiff;
1078
1079    aSig = extractFloat32Frac( a );
1080    aExp = extractFloat32Exp( a );
1081    bSig = extractFloat32Frac( b );
1082    bExp = extractFloat32Exp( b );
1083    expDiff = aExp - bExp;
1084    aSig <<= 7;
1085    bSig <<= 7;
1086    if ( 0 < expDiff ) goto aExpBigger;
1087    if ( expDiff < 0 ) goto bExpBigger;
1088    if ( aExp == 0xFF ) {
1089        if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1090        roundData->exception |= float_flag_invalid;
1091        return float32_default_nan;
1092    }
1093    if ( aExp == 0 ) {
1094        aExp = 1;
1095        bExp = 1;
1096    }
1097    if ( bSig < aSig ) goto aBigger;
1098    if ( aSig < bSig ) goto bBigger;
1099    return packFloat32( roundData->mode == float_round_down, 0, 0 );
1100 bExpBigger:
1101    if ( bExp == 0xFF ) {
1102        if ( bSig ) return propagateFloat32NaN( a, b );
1103        return packFloat32( zSign ^ 1, 0xFF, 0 );
1104    }
1105    if ( aExp == 0 ) {
1106        ++expDiff;
1107    }
1108    else {
1109        aSig |= 0x40000000;
1110    }
1111    shift32RightJamming( aSig, - expDiff, &aSig );
1112    bSig |= 0x40000000;
1113 bBigger:
1114    zSig = bSig - aSig;
1115    zExp = bExp;
1116    zSign ^= 1;
1117    goto normalizeRoundAndPack;
1118 aExpBigger:
1119    if ( aExp == 0xFF ) {
1120        if ( aSig ) return propagateFloat32NaN( a, b );
1121        return a;
1122    }
1123    if ( bExp == 0 ) {
1124        --expDiff;
1125    }
1126    else {
1127        bSig |= 0x40000000;
1128    }
1129    shift32RightJamming( bSig, expDiff, &bSig );
1130    aSig |= 0x40000000;
1131 aBigger:
1132    zSig = aSig - bSig;
1133    zExp = aExp;
1134 normalizeRoundAndPack:
1135    --zExp;
1136    return normalizeRoundAndPackFloat32( roundData, zSign, zExp, zSig );
1137
1138}
1139
1140/*
1141-------------------------------------------------------------------------------
1142Returns the result of adding the single-precision floating-point values `a'
1143and `b'.  The operation is performed according to the IEC/IEEE Standard for
1144Binary Floating-point Arithmetic.
1145-------------------------------------------------------------------------------
1146*/
1147float32 float32_add( struct roundingData *roundData, float32 a, float32 b )
1148{
1149    flag aSign, bSign;
1150
1151    aSign = extractFloat32Sign( a );
1152    bSign = extractFloat32Sign( b );
1153    if ( aSign == bSign ) {
1154        return addFloat32Sigs( roundData, a, b, aSign );
1155    }
1156    else {
1157        return subFloat32Sigs( roundData, a, b, aSign );
1158    }
1159
1160}
1161
1162/*
1163-------------------------------------------------------------------------------
1164Returns the result of subtracting the single-precision floating-point values
1165`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1166for Binary Floating-point Arithmetic.
1167-------------------------------------------------------------------------------
1168*/
1169float32 float32_sub( struct roundingData *roundData, float32 a, float32 b )
1170{
1171    flag aSign, bSign;
1172
1173    aSign = extractFloat32Sign( a );
1174    bSign = extractFloat32Sign( b );
1175    if ( aSign == bSign ) {
1176        return subFloat32Sigs( roundData, a, b, aSign );
1177    }
1178    else {
1179        return addFloat32Sigs( roundData, a, b, aSign );
1180    }
1181
1182}
1183
1184/*
1185-------------------------------------------------------------------------------
1186Returns the result of multiplying the single-precision floating-point values
1187`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1188for Binary Floating-point Arithmetic.
1189-------------------------------------------------------------------------------
1190*/
1191float32 float32_mul( struct roundingData *roundData, float32 a, float32 b )
1192{
1193    flag aSign, bSign, zSign;
1194    int16 aExp, bExp, zExp;
1195    bits32 aSig, bSig;
1196    bits64 zSig64;
1197    bits32 zSig;
1198
1199    aSig = extractFloat32Frac( a );
1200    aExp = extractFloat32Exp( a );
1201    aSign = extractFloat32Sign( a );
1202    bSig = extractFloat32Frac( b );
1203    bExp = extractFloat32Exp( b );
1204    bSign = extractFloat32Sign( b );
1205    zSign = aSign ^ bSign;
1206    if ( aExp == 0xFF ) {
1207        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1208            return propagateFloat32NaN( a, b );
1209        }
1210        if ( ( bExp | bSig ) == 0 ) {
1211            roundData->exception |= float_flag_invalid;
1212            return float32_default_nan;
1213        }
1214        return packFloat32( zSign, 0xFF, 0 );
1215    }
1216    if ( bExp == 0xFF ) {
1217        if ( bSig ) return propagateFloat32NaN( a, b );
1218        if ( ( aExp | aSig ) == 0 ) {
1219            roundData->exception |= float_flag_invalid;
1220            return float32_default_nan;
1221        }
1222        return packFloat32( zSign, 0xFF, 0 );
1223    }
1224    if ( aExp == 0 ) {
1225        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1226        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1227    }
1228    if ( bExp == 0 ) {
1229        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1230        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1231    }
1232    zExp = aExp + bExp - 0x7F;
1233    aSig = ( aSig | 0x00800000 )<<7;
1234    bSig = ( bSig | 0x00800000 )<<8;
1235    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1236    zSig = zSig64;
1237    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1238        zSig <<= 1;
1239        --zExp;
1240    }
1241    return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1242
1243}
1244
1245/*
1246-------------------------------------------------------------------------------
1247Returns the result of dividing the single-precision floating-point value `a'
1248by the corresponding value `b'.  The operation is performed according to the
1249IEC/IEEE Standard for Binary Floating-point Arithmetic.
1250-------------------------------------------------------------------------------
1251*/
1252float32 float32_div( struct roundingData *roundData, float32 a, float32 b )
1253{
1254    flag aSign, bSign, zSign;
1255    int16 aExp, bExp, zExp;
1256    bits32 aSig, bSig, zSig;
1257
1258    aSig = extractFloat32Frac( a );
1259    aExp = extractFloat32Exp( a );
1260    aSign = extractFloat32Sign( a );
1261    bSig = extractFloat32Frac( b );
1262    bExp = extractFloat32Exp( b );
1263    bSign = extractFloat32Sign( b );
1264    zSign = aSign ^ bSign;
1265    if ( aExp == 0xFF ) {
1266        if ( aSig ) return propagateFloat32NaN( a, b );
1267        if ( bExp == 0xFF ) {
1268            if ( bSig ) return propagateFloat32NaN( a, b );
1269            roundData->exception |= float_flag_invalid;
1270            return float32_default_nan;
1271        }
1272        return packFloat32( zSign, 0xFF, 0 );
1273    }
1274    if ( bExp == 0xFF ) {
1275        if ( bSig ) return propagateFloat32NaN( a, b );
1276        return packFloat32( zSign, 0, 0 );
1277    }
1278    if ( bExp == 0 ) {
1279        if ( bSig == 0 ) {
1280            if ( ( aExp | aSig ) == 0 ) {
1281                roundData->exception |= float_flag_invalid;
1282                return float32_default_nan;
1283            }
1284            roundData->exception |= float_flag_divbyzero;
1285            return packFloat32( zSign, 0xFF, 0 );
1286        }
1287        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1288    }
1289    if ( aExp == 0 ) {
1290        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1291        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1292    }
1293    zExp = aExp - bExp + 0x7D;
1294    aSig = ( aSig | 0x00800000 )<<7;
1295    bSig = ( bSig | 0x00800000 )<<8;
1296    if ( bSig <= ( aSig + aSig ) ) {
1297        aSig >>= 1;
1298        ++zExp;
1299    }
1300    {
1301        bits64 tmp = ( (bits64) aSig )<<32;
1302        do_div( tmp, bSig );
1303        zSig = tmp;
1304    }
1305    if ( ( zSig & 0x3F ) == 0 ) {
1306        zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1307    }
1308    return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1309
1310}
1311
1312/*
1313-------------------------------------------------------------------------------
1314Returns the remainder of the single-precision floating-point value `a'
1315with respect to the corresponding value `b'.  The operation is performed
1316according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1317-------------------------------------------------------------------------------
1318*/
1319float32 float32_rem( struct roundingData *roundData, float32 a, float32 b )
1320{
1321    flag aSign, bSign, zSign;
1322    int16 aExp, bExp, expDiff;
1323    bits32 aSig, bSig;
1324    bits32 q;
1325    bits64 aSig64, bSig64, q64;
1326    bits32 alternateASig;
1327    sbits32 sigMean;
1328
1329    aSig = extractFloat32Frac( a );
1330    aExp = extractFloat32Exp( a );
1331    aSign = extractFloat32Sign( a );
1332    bSig = extractFloat32Frac( b );
1333    bExp = extractFloat32Exp( b );
1334    bSign = extractFloat32Sign( b );
1335    if ( aExp == 0xFF ) {
1336        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1337            return propagateFloat32NaN( a, b );
1338        }
1339        roundData->exception |= float_flag_invalid;
1340        return float32_default_nan;
1341    }
1342    if ( bExp == 0xFF ) {
1343        if ( bSig ) return propagateFloat32NaN( a, b );
1344        return a;
1345    }
1346    if ( bExp == 0 ) {
1347        if ( bSig == 0 ) {
1348            roundData->exception |= float_flag_invalid;
1349            return float32_default_nan;
1350        }
1351        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1352    }
1353    if ( aExp == 0 ) {
1354        if ( aSig == 0 ) return a;
1355        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1356    }
1357    expDiff = aExp - bExp;
1358    aSig |= 0x00800000;
1359    bSig |= 0x00800000;
1360    if ( expDiff < 32 ) {
1361        aSig <<= 8;
1362        bSig <<= 8;
1363        if ( expDiff < 0 ) {
1364            if ( expDiff < -1 ) return a;
1365            aSig >>= 1;
1366        }
1367        q = ( bSig <= aSig );
1368        if ( q ) aSig -= bSig;
1369        if ( 0 < expDiff ) {
1370            bits64 tmp = ( (bits64) aSig )<<32;
1371            do_div( tmp, bSig );
1372            q = tmp;
1373            q >>= 32 - expDiff;
1374            bSig >>= 2;
1375            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1376        }
1377        else {
1378            aSig >>= 2;
1379            bSig >>= 2;
1380        }
1381    }
1382    else {
1383        if ( bSig <= aSig ) aSig -= bSig;
1384        aSig64 = ( (bits64) aSig )<<40;
1385        bSig64 = ( (bits64) bSig )<<40;
1386        expDiff -= 64;
1387        while ( 0 < expDiff ) {
1388            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1389            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1390            aSig64 = - ( ( bSig * q64 )<<38 );
1391            expDiff -= 62;
1392        }
1393        expDiff += 64;
1394        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1395        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1396        q = q64>>( 64 - expDiff );
1397        bSig <<= 6;
1398        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1399    }
1400    do {
1401        alternateASig = aSig;
1402        ++q;
1403        aSig -= bSig;
1404    } while ( 0 <= (sbits32) aSig );
1405    sigMean = aSig + alternateASig;
1406    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1407        aSig = alternateASig;
1408    }
1409    zSign = ( (sbits32) aSig < 0 );
1410    if ( zSign ) aSig = - aSig;
1411    return normalizeRoundAndPackFloat32( roundData, aSign ^ zSign, bExp, aSig );
1412
1413}
1414
1415/*
1416-------------------------------------------------------------------------------
1417Returns the square root of the single-precision floating-point value `a'.
1418The operation is performed according to the IEC/IEEE Standard for Binary
1419Floating-point Arithmetic.
1420-------------------------------------------------------------------------------
1421*/
1422float32 float32_sqrt( struct roundingData *roundData, float32 a )
1423{
1424    flag aSign;
1425    int16 aExp, zExp;
1426    bits32 aSig, zSig;
1427    bits64 rem, term;
1428
1429    aSig = extractFloat32Frac( a );
1430    aExp = extractFloat32Exp( a );
1431    aSign = extractFloat32Sign( a );
1432    if ( aExp == 0xFF ) {
1433        if ( aSig ) return propagateFloat32NaN( a, 0 );
1434        if ( ! aSign ) return a;
1435        roundData->exception |= float_flag_invalid;
1436        return float32_default_nan;
1437    }
1438    if ( aSign ) {
1439        if ( ( aExp | aSig ) == 0 ) return a;
1440        roundData->exception |= float_flag_invalid;
1441        return float32_default_nan;
1442    }
1443    if ( aExp == 0 ) {
1444        if ( aSig == 0 ) return 0;
1445        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1446    }
1447    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1448    aSig = ( aSig | 0x00800000 )<<8;
1449    zSig = estimateSqrt32( aExp, aSig ) + 2;
1450    if ( ( zSig & 0x7F ) <= 5 ) {
1451        if ( zSig < 2 ) {
1452            zSig = 0xFFFFFFFF;
1453        }
1454        else {
1455            aSig >>= aExp & 1;
1456            term = ( (bits64) zSig ) * zSig;
1457            rem = ( ( (bits64) aSig )<<32 ) - term;
1458            while ( (sbits64) rem < 0 ) {
1459                --zSig;
1460                rem += ( ( (bits64) zSig )<<1 ) | 1;
1461            }
1462            zSig |= ( rem != 0 );
1463        }
1464    }
1465    shift32RightJamming( zSig, 1, &zSig );
1466    return roundAndPackFloat32( roundData, 0, zExp, zSig );
1467
1468}
1469
1470/*
1471-------------------------------------------------------------------------------
1472Returns 1 if the single-precision floating-point value `a' is equal to the
1473corresponding value `b', and 0 otherwise.  The comparison is performed
1474according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1475-------------------------------------------------------------------------------
1476*/
1477flag float32_eq( float32 a, float32 b )
1478{
1479
1480    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1481         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1482       ) {
1483        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1484            float_raise( float_flag_invalid );
1485        }
1486        return 0;
1487    }
1488    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1489
1490}
1491
1492/*
1493-------------------------------------------------------------------------------
1494Returns 1 if the single-precision floating-point value `a' is less than or
1495equal to the corresponding value `b', and 0 otherwise.  The comparison is
1496performed according to the IEC/IEEE Standard for Binary Floating-point
1497Arithmetic.
1498-------------------------------------------------------------------------------
1499*/
1500flag float32_le( float32 a, float32 b )
1501{
1502    flag aSign, bSign;
1503
1504    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1505         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1506       ) {
1507        float_raise( float_flag_invalid );
1508        return 0;
1509    }
1510    aSign = extractFloat32Sign( a );
1511    bSign = extractFloat32Sign( b );
1512    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1513    return ( a == b ) || ( aSign ^ ( a < b ) );
1514
1515}
1516
1517/*
1518-------------------------------------------------------------------------------
1519Returns 1 if the single-precision floating-point value `a' is less than
1520the corresponding value `b', and 0 otherwise.  The comparison is performed
1521according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1522-------------------------------------------------------------------------------
1523*/
1524flag float32_lt( float32 a, float32 b )
1525{
1526    flag aSign, bSign;
1527
1528    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1529         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1530       ) {
1531        float_raise( float_flag_invalid );
1532        return 0;
1533    }
1534    aSign = extractFloat32Sign( a );
1535    bSign = extractFloat32Sign( b );
1536    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1537    return ( a != b ) && ( aSign ^ ( a < b ) );
1538
1539}
1540
1541/*
1542-------------------------------------------------------------------------------
1543Returns 1 if the single-precision floating-point value `a' is equal to the
1544corresponding value `b', and 0 otherwise.  The invalid exception is raised
1545if either operand is a NaN.  Otherwise, the comparison is performed
1546according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1547-------------------------------------------------------------------------------
1548*/
1549flag float32_eq_signaling( float32 a, float32 b )
1550{
1551
1552    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1553         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1554       ) {
1555        float_raise( float_flag_invalid );
1556        return 0;
1557    }
1558    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1559
1560}
1561
1562/*
1563-------------------------------------------------------------------------------
1564Returns 1 if the single-precision floating-point value `a' is less than or
1565equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1566cause an exception.  Otherwise, the comparison is performed according to the
1567IEC/IEEE Standard for Binary Floating-point Arithmetic.
1568-------------------------------------------------------------------------------
1569*/
1570flag float32_le_quiet( float32 a, float32 b )
1571{
1572    flag aSign, bSign;
1573    //int16 aExp, bExp;
1574
1575    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1576         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1577       ) {
1578        /* Do nothing, even if NaN as we're quiet */
1579        return 0;
1580    }
1581    aSign = extractFloat32Sign( a );
1582    bSign = extractFloat32Sign( b );
1583    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1584    return ( a == b ) || ( aSign ^ ( a < b ) );
1585
1586}
1587
1588/*
1589-------------------------------------------------------------------------------
1590Returns 1 if the single-precision floating-point value `a' is less than
1591the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1592exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1593Standard for Binary Floating-point Arithmetic.
1594-------------------------------------------------------------------------------
1595*/
1596flag float32_lt_quiet( float32 a, float32 b )
1597{
1598    flag aSign, bSign;
1599
1600    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1601         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1602       ) {
1603        /* Do nothing, even if NaN as we're quiet */
1604        return 0;
1605    }
1606    aSign = extractFloat32Sign( a );
1607    bSign = extractFloat32Sign( b );
1608    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1609    return ( a != b ) && ( aSign ^ ( a < b ) );
1610
1611}
1612
1613/*
1614-------------------------------------------------------------------------------
1615Returns the result of converting the double-precision floating-point value
1616`a' to the 32-bit two's complement integer format.  The conversion is
1617performed according to the IEC/IEEE Standard for Binary Floating-point
1618Arithmetic---which means in particular that the conversion is rounded
1619according to the current rounding mode.  If `a' is a NaN, the largest
1620positive integer is returned.  Otherwise, if the conversion overflows, the
1621largest integer with the same sign as `a' is returned.
1622-------------------------------------------------------------------------------
1623*/
1624int32 float64_to_int32( struct roundingData *roundData, float64 a )
1625{
1626    flag aSign;
1627    int16 aExp, shiftCount;
1628    bits64 aSig;
1629
1630    aSig = extractFloat64Frac( a );
1631    aExp = extractFloat64Exp( a );
1632    aSign = extractFloat64Sign( a );
1633    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1634    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1635    shiftCount = 0x42C - aExp;
1636    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1637    return roundAndPackInt32( roundData, aSign, aSig );
1638
1639}
1640
1641/*
1642-------------------------------------------------------------------------------
1643Returns the result of converting the double-precision floating-point value
1644`a' to the 32-bit two's complement integer format.  The conversion is
1645performed according to the IEC/IEEE Standard for Binary Floating-point
1646Arithmetic, except that the conversion is always rounded toward zero.  If
1647`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1648conversion overflows, the largest integer with the same sign as `a' is
1649returned.
1650-------------------------------------------------------------------------------
1651*/
1652int32 float64_to_int32_round_to_zero( float64 a )
1653{
1654    flag aSign;
1655    int16 aExp, shiftCount;
1656    bits64 aSig, savedASig;
1657    int32 z;
1658
1659    aSig = extractFloat64Frac( a );
1660    aExp = extractFloat64Exp( a );
1661    aSign = extractFloat64Sign( a );
1662    shiftCount = 0x433 - aExp;
1663    if ( shiftCount < 21 ) {
1664        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1665        goto invalid;
1666    }
1667    else if ( 52 < shiftCount ) {
1668        if ( aExp || aSig ) float_raise( float_flag_inexact );
1669        return 0;
1670    }
1671    aSig |= LIT64( 0x0010000000000000 );
1672    savedASig = aSig;
1673    aSig >>= shiftCount;
1674    z = aSig;
1675    if ( aSign ) z = - z;
1676    if ( ( z < 0 ) ^ aSign ) {
1677 invalid:
1678        float_raise( float_flag_invalid );
1679        return aSign ? 0x80000000 : 0x7FFFFFFF;
1680    }
1681    if ( ( aSig<<shiftCount ) != savedASig ) {
1682        float_raise( float_flag_inexact );
1683    }
1684    return z;
1685
1686}
1687
1688/*
1689-------------------------------------------------------------------------------
1690Returns the result of converting the double-precision floating-point value
1691`a' to the 32-bit two's complement unsigned integer format.  The conversion
1692is performed according to the IEC/IEEE Standard for Binary Floating-point
1693Arithmetic---which means in particular that the conversion is rounded
1694according to the current rounding mode.  If `a' is a NaN, the largest
1695positive integer is returned.  Otherwise, if the conversion overflows, the
1696largest positive integer is returned.
1697-------------------------------------------------------------------------------
1698*/
1699int32 float64_to_uint32( struct roundingData *roundData, float64 a )
1700{
1701    flag aSign;
1702    int16 aExp, shiftCount;
1703    bits64 aSig;
1704
1705    aSig = extractFloat64Frac( a );
1706    aExp = extractFloat64Exp( a );
1707    aSign = 0; //extractFloat64Sign( a );
1708    //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1709    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1710    shiftCount = 0x42C - aExp;
1711    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1712    return roundAndPackInt32( roundData, aSign, aSig );
1713}
1714
1715/*
1716-------------------------------------------------------------------------------
1717Returns the result of converting the double-precision floating-point value
1718`a' to the 32-bit two's complement integer format.  The conversion is
1719performed according to the IEC/IEEE Standard for Binary Floating-point
1720Arithmetic, except that the conversion is always rounded toward zero.  If
1721`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1722conversion overflows, the largest positive integer is returned.
1723-------------------------------------------------------------------------------
1724*/
1725int32 float64_to_uint32_round_to_zero( float64 a )
1726{
1727    flag aSign;
1728    int16 aExp, shiftCount;
1729    bits64 aSig, savedASig;
1730    int32 z;
1731
1732    aSig = extractFloat64Frac( a );
1733    aExp = extractFloat64Exp( a );
1734    aSign = extractFloat64Sign( a );
1735    shiftCount = 0x433 - aExp;
1736    if ( shiftCount < 21 ) {
1737        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1738        goto invalid;
1739    }
1740    else if ( 52 < shiftCount ) {
1741        if ( aExp || aSig ) float_raise( float_flag_inexact );
1742        return 0;
1743    }
1744    aSig |= LIT64( 0x0010000000000000 );
1745    savedASig = aSig;
1746    aSig >>= shiftCount;
1747    z = aSig;
1748    if ( aSign ) z = - z;
1749    if ( ( z < 0 ) ^ aSign ) {
1750 invalid:
1751        float_raise( float_flag_invalid );
1752        return aSign ? 0x80000000 : 0x7FFFFFFF;
1753    }
1754    if ( ( aSig<<shiftCount ) != savedASig ) {
1755        float_raise( float_flag_inexact );
1756    }
1757    return z;
1758}
1759
1760/*
1761-------------------------------------------------------------------------------
1762Returns the result of converting the double-precision floating-point value
1763`a' to the single-precision floating-point format.  The conversion is
1764performed according to the IEC/IEEE Standard for Binary Floating-point
1765Arithmetic.
1766-------------------------------------------------------------------------------
1767*/
1768float32 float64_to_float32( struct roundingData *roundData, float64 a )
1769{
1770    flag aSign;
1771    int16 aExp;
1772    bits64 aSig;
1773    bits32 zSig;
1774
1775    aSig = extractFloat64Frac( a );
1776    aExp = extractFloat64Exp( a );
1777    aSign = extractFloat64Sign( a );
1778    if ( aExp == 0x7FF ) {
1779        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1780        return packFloat32( aSign, 0xFF, 0 );
1781    }
1782    shift64RightJamming( aSig, 22, &aSig );
1783    zSig = aSig;
1784    if ( aExp || zSig ) {
1785        zSig |= 0x40000000;
1786        aExp -= 0x381;
1787    }
1788    return roundAndPackFloat32( roundData, aSign, aExp, zSig );
1789
1790}
1791
1792#ifdef FLOATX80
1793
1794/*
1795-------------------------------------------------------------------------------
1796Returns the result of converting the double-precision floating-point value
1797`a' to the extended double-precision floating-point format.  The conversion
1798is performed according to the IEC/IEEE Standard for Binary Floating-point
1799Arithmetic.
1800-------------------------------------------------------------------------------
1801*/
1802floatx80 float64_to_floatx80( float64 a )
1803{
1804    flag aSign;
1805    int16 aExp;
1806    bits64 aSig;
1807
1808    aSig = extractFloat64Frac( a );
1809    aExp = extractFloat64Exp( a );
1810    aSign = extractFloat64Sign( a );
1811    if ( aExp == 0x7FF ) {
1812        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1813        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1814    }
1815    if ( aExp == 0 ) {
1816        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1817        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1818    }
1819    return
1820        packFloatx80(
1821            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1822
1823}
1824
1825#endif
1826
1827/*
1828-------------------------------------------------------------------------------
1829Rounds the double-precision floating-point value `a' to an integer, and
1830returns the result as a double-precision floating-point value.  The
1831operation is performed according to the IEC/IEEE Standard for Binary
1832Floating-point Arithmetic.
1833-------------------------------------------------------------------------------
1834*/
1835float64 float64_round_to_int( struct roundingData *roundData, float64 a )
1836{
1837    flag aSign;
1838    int16 aExp;
1839    bits64 lastBitMask, roundBitsMask;
1840    int8 roundingMode;
1841    float64 z;
1842
1843    aExp = extractFloat64Exp( a );
1844    if ( 0x433 <= aExp ) {
1845        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1846            return propagateFloat64NaN( a, a );
1847        }
1848        return a;
1849    }
1850    if ( aExp <= 0x3FE ) {
1851        if ( (bits64) ( a<<1 ) == 0 ) return a;
1852        roundData->exception |= float_flag_inexact;
1853        aSign = extractFloat64Sign( a );
1854        switch ( roundData->mode ) {
1855         case float_round_nearest_even:
1856            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1857                return packFloat64( aSign, 0x3FF, 0 );
1858            }
1859            break;
1860         case float_round_down:
1861            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1862         case float_round_up:
1863            return
1864            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1865        }
1866        return packFloat64( aSign, 0, 0 );
1867    }
1868    lastBitMask = 1;
1869    lastBitMask <<= 0x433 - aExp;
1870    roundBitsMask = lastBitMask - 1;
1871    z = a;
1872    roundingMode = roundData->mode;
1873    if ( roundingMode == float_round_nearest_even ) {
1874        z += lastBitMask>>1;
1875        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1876    }
1877    else if ( roundingMode != float_round_to_zero ) {
1878        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1879            z += roundBitsMask;
1880        }
1881    }
1882    z &= ~ roundBitsMask;
1883    if ( z != a ) roundData->exception |= float_flag_inexact;
1884    return z;
1885
1886}
1887
1888/*
1889-------------------------------------------------------------------------------
1890Returns the result of adding the absolute values of the double-precision
1891floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1892before being returned.  `zSign' is ignored if the result is a NaN.  The
1893addition is performed according to the IEC/IEEE Standard for Binary
1894Floating-point Arithmetic.
1895-------------------------------------------------------------------------------
1896*/
1897static float64 addFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1898{
1899    int16 aExp, bExp, zExp;
1900    bits64 aSig, bSig, zSig;
1901    int16 expDiff;
1902
1903    aSig = extractFloat64Frac( a );
1904    aExp = extractFloat64Exp( a );
1905    bSig = extractFloat64Frac( b );
1906    bExp = extractFloat64Exp( b );
1907    expDiff = aExp - bExp;
1908    aSig <<= 9;
1909    bSig <<= 9;
1910    if ( 0 < expDiff ) {
1911        if ( aExp == 0x7FF ) {
1912            if ( aSig ) return propagateFloat64NaN( a, b );
1913            return a;
1914        }
1915        if ( bExp == 0 ) {
1916            --expDiff;
1917        }
1918        else {
1919            bSig |= LIT64( 0x2000000000000000 );
1920        }
1921        shift64RightJamming( bSig, expDiff, &bSig );
1922        zExp = aExp;
1923    }
1924    else if ( expDiff < 0 ) {
1925        if ( bExp == 0x7FF ) {
1926            if ( bSig ) return propagateFloat64NaN( a, b );
1927            return packFloat64( zSign, 0x7FF, 0 );
1928        }
1929        if ( aExp == 0 ) {
1930            ++expDiff;
1931        }
1932        else {
1933            aSig |= LIT64( 0x2000000000000000 );
1934        }
1935        shift64RightJamming( aSig, - expDiff, &aSig );
1936        zExp = bExp;
1937    }
1938    else {
1939        if ( aExp == 0x7FF ) {
1940            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1941            return a;
1942        }
1943        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1944        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1945        zExp = aExp;
1946        goto roundAndPack;
1947    }
1948    aSig |= LIT64( 0x2000000000000000 );
1949    zSig = ( aSig + bSig )<<1;
1950    --zExp;
1951    if ( (sbits64) zSig < 0 ) {
1952        zSig = aSig + bSig;
1953        ++zExp;
1954    }
1955 roundAndPack:
1956    return roundAndPackFloat64( roundData, zSign, zExp, zSig );
1957
1958}
1959
1960/*
1961-------------------------------------------------------------------------------
1962Returns the result of subtracting the absolute values of the double-
1963precision floating-point values `a' and `b'.  If `zSign' is true, the
1964difference is negated before being returned.  `zSign' is ignored if the
1965result is a NaN.  The subtraction is performed according to the IEC/IEEE
1966Standard for Binary Floating-point Arithmetic.
1967-------------------------------------------------------------------------------
1968*/
1969static float64 subFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1970{
1971    int16 aExp, bExp, zExp;
1972    bits64 aSig, bSig, zSig;
1973    int16 expDiff;
1974
1975    aSig = extractFloat64Frac( a );
1976    aExp = extractFloat64Exp( a );
1977    bSig = extractFloat64Frac( b );
1978    bExp = extractFloat64Exp( b );
1979    expDiff = aExp - bExp;
1980    aSig <<= 10;
1981    bSig <<= 10;
1982    if ( 0 < expDiff ) goto aExpBigger;
1983    if ( expDiff < 0 ) goto bExpBigger;
1984    if ( aExp == 0x7FF ) {
1985        if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1986        roundData->exception |= float_flag_invalid;
1987        return float64_default_nan;
1988    }
1989    if ( aExp == 0 ) {
1990        aExp = 1;
1991        bExp = 1;
1992    }
1993    if ( bSig < aSig ) goto aBigger;
1994    if ( aSig < bSig ) goto bBigger;
1995    return packFloat64( roundData->mode == float_round_down, 0, 0 );
1996 bExpBigger:
1997    if ( bExp == 0x7FF ) {
1998        if ( bSig ) return propagateFloat64NaN( a, b );
1999        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2000    }
2001    if ( aExp == 0 ) {
2002        ++expDiff;
2003    }
2004    else {
2005        aSig |= LIT64( 0x4000000000000000 );
2006    }
2007    shift64RightJamming( aSig, - expDiff, &aSig );
2008    bSig |= LIT64( 0x4000000000000000 );
2009 bBigger:
2010    zSig = bSig - aSig;
2011    zExp = bExp;
2012    zSign ^= 1;
2013    goto normalizeRoundAndPack;
2014 aExpBigger:
2015    if ( aExp == 0x7FF ) {
2016        if ( aSig ) return propagateFloat64NaN( a, b );
2017        return a;
2018    }
2019    if ( bExp == 0 ) {
2020        --expDiff;
2021    }
2022    else {
2023        bSig |= LIT64( 0x4000000000000000 );
2024    }
2025    shift64RightJamming( bSig, expDiff, &bSig );
2026    aSig |= LIT64( 0x4000000000000000 );
2027 aBigger:
2028    zSig = aSig - bSig;
2029    zExp = aExp;
2030 normalizeRoundAndPack:
2031    --zExp;
2032    return normalizeRoundAndPackFloat64( roundData, zSign, zExp, zSig );
2033
2034}
2035
2036/*
2037-------------------------------------------------------------------------------
2038Returns the result of adding the double-precision floating-point values `a'
2039and `b'.  The operation is performed according to the IEC/IEEE Standard for
2040Binary Floating-point Arithmetic.
2041-------------------------------------------------------------------------------
2042*/
2043float64 float64_add( struct roundingData *roundData, float64 a, float64 b )
2044{
2045    flag aSign, bSign;
2046
2047    aSign = extractFloat64Sign( a );
2048    bSign = extractFloat64Sign( b );
2049    if ( aSign == bSign ) {
2050        return addFloat64Sigs( roundData, a, b, aSign );
2051    }
2052    else {
2053        return subFloat64Sigs( roundData, a, b, aSign );
2054    }
2055
2056}
2057
2058/*
2059-------------------------------------------------------------------------------
2060Returns the result of subtracting the double-precision floating-point values
2061`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2062for Binary Floating-point Arithmetic.
2063-------------------------------------------------------------------------------
2064*/
2065float64 float64_sub( struct roundingData *roundData, float64 a, float64 b )
2066{
2067    flag aSign, bSign;
2068
2069    aSign = extractFloat64Sign( a );
2070    bSign = extractFloat64Sign( b );
2071    if ( aSign == bSign ) {
2072        return subFloat64Sigs( roundData, a, b, aSign );
2073    }
2074    else {
2075        return addFloat64Sigs( roundData, a, b, aSign );
2076    }
2077
2078}
2079
2080/*
2081-------------------------------------------------------------------------------
2082Returns the result of multiplying the double-precision floating-point values
2083`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2084for Binary Floating-point Arithmetic.
2085-------------------------------------------------------------------------------
2086*/
2087float64 float64_mul( struct roundingData *roundData, float64 a, float64 b )
2088{
2089    flag aSign, bSign, zSign;
2090    int16 aExp, bExp, zExp;
2091    bits64 aSig, bSig, zSig0, zSig1;
2092
2093    aSig = extractFloat64Frac( a );
2094    aExp = extractFloat64Exp( a );
2095    aSign = extractFloat64Sign( a );
2096    bSig = extractFloat64Frac( b );
2097    bExp = extractFloat64Exp( b );
2098    bSign = extractFloat64Sign( b );
2099    zSign = aSign ^ bSign;
2100    if ( aExp == 0x7FF ) {
2101        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2102            return propagateFloat64NaN( a, b );
2103        }
2104        if ( ( bExp | bSig ) == 0 ) {
2105            roundData->exception |= float_flag_invalid;
2106            return float64_default_nan;
2107        }
2108        return packFloat64( zSign, 0x7FF, 0 );
2109    }
2110    if ( bExp == 0x7FF ) {
2111        if ( bSig ) return propagateFloat64NaN( a, b );
2112        if ( ( aExp | aSig ) == 0 ) {
2113            roundData->exception |= float_flag_invalid;
2114            return float64_default_nan;
2115        }
2116        return packFloat64( zSign, 0x7FF, 0 );
2117    }
2118    if ( aExp == 0 ) {
2119        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2120        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2121    }
2122    if ( bExp == 0 ) {
2123        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2124        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2125    }
2126    zExp = aExp + bExp - 0x3FF;
2127    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2128    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2129    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2130    zSig0 |= ( zSig1 != 0 );
2131    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2132        zSig0 <<= 1;
2133        --zExp;
2134    }
2135    return roundAndPackFloat64( roundData, zSign, zExp, zSig0 );
2136
2137}
2138
2139/*
2140-------------------------------------------------------------------------------
2141Returns the result of dividing the double-precision floating-point value `a'
2142by the corresponding value `b'.  The operation is performed according to
2143the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2144-------------------------------------------------------------------------------
2145*/
2146float64 float64_div( struct roundingData *roundData, float64 a, float64 b )
2147{
2148    flag aSign, bSign, zSign;
2149    int16 aExp, bExp, zExp;
2150    bits64 aSig, bSig, zSig;
2151    bits64 rem0, rem1;
2152    bits64 term0, term1;
2153
2154    aSig = extractFloat64Frac( a );
2155    aExp = extractFloat64Exp( a );
2156    aSign = extractFloat64Sign( a );
2157    bSig = extractFloat64Frac( b );
2158    bExp = extractFloat64Exp( b );
2159    bSign = extractFloat64Sign( b );
2160    zSign = aSign ^ bSign;
2161    if ( aExp == 0x7FF ) {
2162        if ( aSig ) return propagateFloat64NaN( a, b );
2163        if ( bExp == 0x7FF ) {
2164            if ( bSig ) return propagateFloat64NaN( a, b );
2165            roundData->exception |= float_flag_invalid;
2166            return float64_default_nan;
2167        }
2168        return packFloat64( zSign, 0x7FF, 0 );
2169    }
2170    if ( bExp == 0x7FF ) {
2171        if ( bSig ) return propagateFloat64NaN( a, b );
2172        return packFloat64( zSign, 0, 0 );
2173    }
2174    if ( bExp == 0 ) {
2175        if ( bSig == 0 ) {
2176            if ( ( aExp | aSig ) == 0 ) {
2177                roundData->exception |= float_flag_invalid;
2178                return float64_default_nan;
2179            }
2180            roundData->exception |= float_flag_divbyzero;
2181            return packFloat64( zSign, 0x7FF, 0 );
2182        }
2183        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2184    }
2185    if ( aExp == 0 ) {
2186        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2187        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2188    }
2189    zExp = aExp - bExp + 0x3FD;
2190    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2191    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2192    if ( bSig <= ( aSig + aSig ) ) {
2193        aSig >>= 1;
2194        ++zExp;
2195    }
2196    zSig = estimateDiv128To64( aSig, 0, bSig );
2197    if ( ( zSig & 0x1FF ) <= 2 ) {
2198        mul64To128( bSig, zSig, &term0, &term1 );
2199        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2200        while ( (sbits64) rem0 < 0 ) {
2201            --zSig;
2202            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2203        }
2204        zSig |= ( rem1 != 0 );
2205    }
2206    return roundAndPackFloat64( roundData, zSign, zExp, zSig );
2207
2208}
2209
2210/*
2211-------------------------------------------------------------------------------
2212Returns the remainder of the double-precision floating-point value `a'
2213with respect to the corresponding value `b'.  The operation is performed
2214according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2215-------------------------------------------------------------------------------
2216*/
2217float64 float64_rem( struct roundingData *roundData, float64 a, float64 b )
2218{
2219    flag aSign, bSign, zSign;
2220    int16 aExp, bExp, expDiff;
2221    bits64 aSig, bSig;
2222    bits64 q, alternateASig;
2223    sbits64 sigMean;
2224
2225    aSig = extractFloat64Frac( a );
2226    aExp = extractFloat64Exp( a );
2227    aSign = extractFloat64Sign( a );
2228    bSig = extractFloat64Frac( b );
2229    bExp = extractFloat64Exp( b );
2230    bSign = extractFloat64Sign( b );
2231    if ( aExp == 0x7FF ) {
2232        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2233            return propagateFloat64NaN( a, b );
2234        }
2235        roundData->exception |= float_flag_invalid;
2236        return float64_default_nan;
2237    }
2238    if ( bExp == 0x7FF ) {
2239        if ( bSig ) return propagateFloat64NaN( a, b );
2240        return a;
2241    }
2242    if ( bExp == 0 ) {
2243        if ( bSig == 0 ) {
2244            roundData->exception |= float_flag_invalid;
2245            return float64_default_nan;
2246        }
2247        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2248    }
2249    if ( aExp == 0 ) {
2250        if ( aSig == 0 ) return a;
2251        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2252    }
2253    expDiff = aExp - bExp;
2254    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2255    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2256    if ( expDiff < 0 ) {
2257        if ( expDiff < -1 ) return a;
2258        aSig >>= 1;
2259    }
2260    q = ( bSig <= aSig );
2261    if ( q ) aSig -= bSig;
2262    expDiff -= 64;
2263    while ( 0 < expDiff ) {
2264        q = estimateDiv128To64( aSig, 0, bSig );
2265        q = ( 2 < q ) ? q - 2 : 0;
2266        aSig = - ( ( bSig>>2 ) * q );
2267        expDiff -= 62;
2268    }
2269    expDiff += 64;
2270    if ( 0 < expDiff ) {
2271        q = estimateDiv128To64( aSig, 0, bSig );
2272        q = ( 2 < q ) ? q - 2 : 0;
2273        q >>= 64 - expDiff;
2274        bSig >>= 2;
2275        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2276    }
2277    else {
2278        aSig >>= 2;
2279        bSig >>= 2;
2280    }
2281    do {
2282        alternateASig = aSig;
2283        ++q;
2284        aSig -= bSig;
2285    } while ( 0 <= (sbits64) aSig );
2286    sigMean = aSig + alternateASig;
2287    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2288        aSig = alternateASig;
2289    }
2290    zSign = ( (sbits64) aSig < 0 );
2291    if ( zSign ) aSig = - aSig;
2292    return normalizeRoundAndPackFloat64( roundData, aSign ^ zSign, bExp, aSig );
2293
2294}
2295
2296/*
2297-------------------------------------------------------------------------------
2298Returns the square root of the double-precision floating-point value `a'.
2299The operation is performed according to the IEC/IEEE Standard for Binary
2300Floating-point Arithmetic.
2301-------------------------------------------------------------------------------
2302*/
2303float64 float64_sqrt( struct roundingData *roundData, float64 a )
2304{
2305    flag aSign;
2306    int16 aExp, zExp;
2307    bits64 aSig, zSig;
2308    bits64 rem0, rem1, term0, term1; //, shiftedRem;
2309    //float64 z;
2310
2311    aSig = extractFloat64Frac( a );
2312    aExp = extractFloat64Exp( a );
2313    aSign = extractFloat64Sign( a );
2314    if ( aExp == 0x7FF ) {
2315        if ( aSig ) return propagateFloat64NaN( a, a );
2316        if ( ! aSign ) return a;
2317        roundData->exception |= float_flag_invalid;
2318        return float64_default_nan;
2319    }
2320    if ( aSign ) {
2321        if ( ( aExp | aSig ) == 0 ) return a;
2322        roundData->exception |= float_flag_invalid;
2323        return float64_default_nan;
2324    }
2325    if ( aExp == 0 ) {
2326        if ( aSig == 0 ) return 0;
2327        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2328    }
2329    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2330    aSig |= LIT64( 0x0010000000000000 );
2331    zSig = estimateSqrt32( aExp, aSig>>21 );
2332    zSig <<= 31;
2333    aSig <<= 9 - ( aExp & 1 );
2334    zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2335    if ( ( zSig & 0x3FF ) <= 5 ) {
2336        if ( zSig < 2 ) {
2337            zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2338        }
2339        else {
2340            aSig <<= 2;
2341            mul64To128( zSig, zSig, &term0, &term1 );
2342            sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2343            while ( (sbits64) rem0 < 0 ) {
2344                --zSig;
2345                shortShift128Left( 0, zSig, 1, &term0, &term1 );
2346                term1 |= 1;
2347                add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2348            }
2349            zSig |= ( ( rem0 | rem1 ) != 0 );
2350        }
2351    }
2352    shift64RightJamming( zSig, 1, &zSig );
2353    return roundAndPackFloat64( roundData, 0, zExp, zSig );
2354
2355}
2356
2357/*
2358-------------------------------------------------------------------------------
2359Returns 1 if the double-precision floating-point value `a' is equal to the
2360corresponding value `b', and 0 otherwise.  The comparison is performed
2361according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2362-------------------------------------------------------------------------------
2363*/
2364flag float64_eq( float64 a, float64 b )
2365{
2366
2367    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2368         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2369       ) {
2370        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2371            float_raise( float_flag_invalid );
2372        }
2373        return 0;
2374    }
2375    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2376
2377}
2378
2379/*
2380-------------------------------------------------------------------------------
2381Returns 1 if the double-precision floating-point value `a' is less than or
2382equal to the corresponding value `b', and 0 otherwise.  The comparison is
2383performed according to the IEC/IEEE Standard for Binary Floating-point
2384Arithmetic.
2385-------------------------------------------------------------------------------
2386*/
2387flag float64_le( float64 a, float64 b )
2388{
2389    flag aSign, bSign;
2390
2391    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2392         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2393       ) {
2394        float_raise( float_flag_invalid );
2395        return 0;
2396    }
2397    aSign = extractFloat64Sign( a );
2398    bSign = extractFloat64Sign( b );
2399    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2400    return ( a == b ) || ( aSign ^ ( a < b ) );
2401
2402}
2403
2404/*
2405-------------------------------------------------------------------------------
2406Returns 1 if the double-precision floating-point value `a' is less than
2407the corresponding value `b', and 0 otherwise.  The comparison is performed
2408according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2409-------------------------------------------------------------------------------
2410*/
2411flag float64_lt( float64 a, float64 b )
2412{
2413    flag aSign, bSign;
2414
2415    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2416         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2417       ) {
2418        float_raise( float_flag_invalid );
2419        return 0;
2420    }
2421    aSign = extractFloat64Sign( a );
2422    bSign = extractFloat64Sign( b );
2423    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2424    return ( a != b ) && ( aSign ^ ( a < b ) );
2425
2426}
2427
2428/*
2429-------------------------------------------------------------------------------
2430Returns 1 if the double-precision floating-point value `a' is equal to the
2431corresponding value `b', and 0 otherwise.  The invalid exception is raised
2432if either operand is a NaN.  Otherwise, the comparison is performed
2433according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2434-------------------------------------------------------------------------------
2435*/
2436flag float64_eq_signaling( float64 a, float64 b )
2437{
2438
2439    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2440         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2441       ) {
2442        float_raise( float_flag_invalid );
2443        return 0;
2444    }
2445    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2446
2447}
2448
2449/*
2450-------------------------------------------------------------------------------
2451Returns 1 if the double-precision floating-point value `a' is less than or
2452equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2453cause an exception.  Otherwise, the comparison is performed according to the
2454IEC/IEEE Standard for Binary Floating-point Arithmetic.
2455-------------------------------------------------------------------------------
2456*/
2457flag float64_le_quiet( float64 a, float64 b )
2458{
2459    flag aSign, bSign;
2460    //int16 aExp, bExp;
2461
2462    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2463         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2464       ) {
2465        /* Do nothing, even if NaN as we're quiet */
2466        return 0;
2467    }
2468    aSign = extractFloat64Sign( a );
2469    bSign = extractFloat64Sign( b );
2470    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2471    return ( a == b ) || ( aSign ^ ( a < b ) );
2472
2473}
2474
2475/*
2476-------------------------------------------------------------------------------
2477Returns 1 if the double-precision floating-point value `a' is less than
2478the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2479exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2480Standard for Binary Floating-point Arithmetic.
2481-------------------------------------------------------------------------------
2482*/
2483flag float64_lt_quiet( float64 a, float64 b )
2484{
2485    flag aSign, bSign;
2486
2487    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2488         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2489       ) {
2490        /* Do nothing, even if NaN as we're quiet */
2491        return 0;
2492    }
2493    aSign = extractFloat64Sign( a );
2494    bSign = extractFloat64Sign( b );
2495    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2496    return ( a != b ) && ( aSign ^ ( a < b ) );
2497
2498}
2499
2500#ifdef FLOATX80
2501
2502/*
2503-------------------------------------------------------------------------------
2504Returns the result of converting the extended double-precision floating-
2505point value `a' to the 32-bit two's complement integer format.  The
2506conversion is performed according to the IEC/IEEE Standard for Binary
2507Floating-point Arithmetic---which means in particular that the conversion
2508is rounded according to the current rounding mode.  If `a' is a NaN, the
2509largest positive integer is returned.  Otherwise, if the conversion
2510overflows, the largest integer with the same sign as `a' is returned.
2511-------------------------------------------------------------------------------
2512*/
2513int32 floatx80_to_int32( struct roundingData *roundData, floatx80 a )
2514{
2515    flag aSign;
2516    int32 aExp, shiftCount;
2517    bits64 aSig;
2518
2519    aSig = extractFloatx80Frac( a );
2520    aExp = extractFloatx80Exp( a );
2521    aSign = extractFloatx80Sign( a );
2522    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2523    shiftCount = 0x4037 - aExp;
2524    if ( shiftCount <= 0 ) shiftCount = 1;
2525    shift64RightJamming( aSig, shiftCount, &aSig );
2526    return roundAndPackInt32( roundData, aSign, aSig );
2527
2528}
2529
2530/*
2531-------------------------------------------------------------------------------
2532Returns the result of converting the extended double-precision floating-
2533point value `a' to the 32-bit two's complement integer format.  The
2534conversion is performed according to the IEC/IEEE Standard for Binary
2535Floating-point Arithmetic, except that the conversion is always rounded
2536toward zero.  If `a' is a NaN, the largest positive integer is returned.
2537Otherwise, if the conversion overflows, the largest integer with the same
2538sign as `a' is returned.
2539-------------------------------------------------------------------------------
2540*/
2541int32 floatx80_to_int32_round_to_zero( floatx80 a )
2542{
2543    flag aSign;
2544    int32 aExp, shiftCount;
2545    bits64 aSig, savedASig;
2546    int32 z;
2547
2548    aSig = extractFloatx80Frac( a );
2549    aExp = extractFloatx80Exp( a );
2550    aSign = extractFloatx80Sign( a );
2551    shiftCount = 0x403E - aExp;
2552    if ( shiftCount < 32 ) {
2553        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2554        goto invalid;
2555    }
2556    else if ( 63 < shiftCount ) {
2557        if ( aExp || aSig ) float_raise( float_flag_inexact );
2558        return 0;
2559    }
2560    savedASig = aSig;
2561    aSig >>= shiftCount;
2562    z = aSig;
2563    if ( aSign ) z = - z;
2564    if ( ( z < 0 ) ^ aSign ) {
2565 invalid:
2566        float_raise( float_flag_invalid );
2567        return aSign ? 0x80000000 : 0x7FFFFFFF;
2568    }
2569    if ( ( aSig<<shiftCount ) != savedASig ) {
2570        float_raise( float_flag_inexact );
2571    }
2572    return z;
2573
2574}
2575
2576/*
2577-------------------------------------------------------------------------------
2578Returns the result of converting the extended double-precision floating-
2579point value `a' to the single-precision floating-point format.  The
2580conversion is performed according to the IEC/IEEE Standard for Binary
2581Floating-point Arithmetic.
2582-------------------------------------------------------------------------------
2583*/
2584float32 floatx80_to_float32( struct roundingData *roundData, floatx80 a )
2585{
2586    flag aSign;
2587    int32 aExp;
2588    bits64 aSig;
2589
2590    aSig = extractFloatx80Frac( a );
2591    aExp = extractFloatx80Exp( a );
2592    aSign = extractFloatx80Sign( a );
2593    if ( aExp == 0x7FFF ) {
2594        if ( (bits64) ( aSig<<1 ) ) {
2595            return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2596        }
2597        return packFloat32( aSign, 0xFF, 0 );
2598    }
2599    shift64RightJamming( aSig, 33, &aSig );
2600    if ( aExp || aSig ) aExp -= 0x3F81;
2601    return roundAndPackFloat32( roundData, aSign, aExp, aSig );
2602
2603}
2604
2605/*
2606-------------------------------------------------------------------------------
2607Returns the result of converting the extended double-precision floating-
2608point value `a' to the double-precision floating-point format.  The
2609conversion is performed according to the IEC/IEEE Standard for Binary
2610Floating-point Arithmetic.
2611-------------------------------------------------------------------------------
2612*/
2613float64 floatx80_to_float64( struct roundingData *roundData, floatx80 a )
2614{
2615    flag aSign;
2616    int32 aExp;
2617    bits64 aSig, zSig;
2618
2619    aSig = extractFloatx80Frac( a );
2620    aExp = extractFloatx80Exp( a );
2621    aSign = extractFloatx80Sign( a );
2622    if ( aExp == 0x7FFF ) {
2623        if ( (bits64) ( aSig<<1 ) ) {
2624            return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2625        }
2626        return packFloat64( aSign, 0x7FF, 0 );
2627    }
2628    shift64RightJamming( aSig, 1, &zSig );
2629    if ( aExp || aSig ) aExp -= 0x3C01;
2630    return roundAndPackFloat64( roundData, aSign, aExp, zSig );
2631
2632}
2633
2634/*
2635-------------------------------------------------------------------------------
2636Rounds the extended double-precision floating-point value `a' to an integer,
2637and returns the result as an extended quadruple-precision floating-point
2638value.  The operation is performed according to the IEC/IEEE Standard for
2639Binary Floating-point Arithmetic.
2640-------------------------------------------------------------------------------
2641*/
2642floatx80 floatx80_round_to_int( struct roundingData *roundData, floatx80 a )
2643{
2644    flag aSign;
2645    int32 aExp;
2646    bits64 lastBitMask, roundBitsMask;
2647    int8 roundingMode;
2648    floatx80 z;
2649
2650    aExp = extractFloatx80Exp( a );
2651    if ( 0x403E <= aExp ) {
2652        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2653            return propagateFloatx80NaN( a, a );
2654        }
2655        return a;
2656    }
2657    if ( aExp <= 0x3FFE ) {
2658        if (    ( aExp == 0 )
2659             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2660            return a;
2661        }
2662        roundData->exception |= float_flag_inexact;
2663        aSign = extractFloatx80Sign( a );
2664        switch ( roundData->mode ) {
2665         case float_round_nearest_even:
2666            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2667               ) {
2668                return
2669                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2670            }
2671            break;
2672         case float_round_down:
2673            return
2674                  aSign ?
2675                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2676                : packFloatx80( 0, 0, 0 );
2677         case float_round_up:
2678            return
2679                  aSign ? packFloatx80( 1, 0, 0 )
2680                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2681        }
2682        return packFloatx80( aSign, 0, 0 );
2683    }
2684    lastBitMask = 1;
2685    lastBitMask <<= 0x403E - aExp;
2686    roundBitsMask = lastBitMask - 1;
2687    z = a;
2688    roundingMode = roundData->mode;
2689    if ( roundingMode == float_round_nearest_even ) {
2690        z.low += lastBitMask>>1;
2691        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2692    }
2693    else if ( roundingMode != float_round_to_zero ) {
2694        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2695            z.low += roundBitsMask;
2696        }
2697    }
2698    z.low &= ~ roundBitsMask;
2699    if ( z.low == 0 ) {
2700        ++z.high;
2701        z.low = LIT64( 0x8000000000000000 );
2702    }
2703    if ( z.low != a.low ) roundData->exception |= float_flag_inexact;
2704    return z;
2705
2706}
2707
2708/*
2709-------------------------------------------------------------------------------
2710Returns the result of adding the absolute values of the extended double-
2711precision floating-point values `a' and `b'.  If `zSign' is true, the sum is
2712negated before being returned.  `zSign' is ignored if the result is a NaN.
2713The addition is performed according to the IEC/IEEE Standard for Binary
2714Floating-point Arithmetic.
2715-------------------------------------------------------------------------------
2716*/
2717static floatx80 addFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2718{
2719    int32 aExp, bExp, zExp;
2720    bits64 aSig, bSig, zSig0, zSig1;
2721    int32 expDiff;
2722
2723    aSig = extractFloatx80Frac( a );
2724    aExp = extractFloatx80Exp( a );
2725    bSig = extractFloatx80Frac( b );
2726    bExp = extractFloatx80Exp( b );
2727    expDiff = aExp - bExp;
2728    if ( 0 < expDiff ) {
2729        if ( aExp == 0x7FFF ) {
2730            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2731            return a;
2732        }
2733        if ( bExp == 0 ) --expDiff;
2734        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2735        zExp = aExp;
2736    }
2737    else if ( expDiff < 0 ) {
2738        if ( bExp == 0x7FFF ) {
2739            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2740            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2741        }
2742        if ( aExp == 0 ) ++expDiff;
2743        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2744        zExp = bExp;
2745    }
2746    else {
2747        if ( aExp == 0x7FFF ) {
2748            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2749                return propagateFloatx80NaN( a, b );
2750            }
2751            return a;
2752        }
2753        zSig1 = 0;
2754        zSig0 = aSig + bSig;
2755        if ( aExp == 0 ) {
2756            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2757            goto roundAndPack;
2758        }
2759        zExp = aExp;
2760        goto shiftRight1;
2761    }
2762
2763    zSig0 = aSig + bSig;
2764
2765    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2766 shiftRight1:
2767    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2768    zSig0 |= LIT64( 0x8000000000000000 );
2769    ++zExp;
2770 roundAndPack:
2771    return
2772        roundAndPackFloatx80(
2773            roundData, zSign, zExp, zSig0, zSig1 );
2774
2775}
2776
2777/*
2778-------------------------------------------------------------------------------
2779Returns the result of subtracting the absolute values of the extended
2780double-precision floating-point values `a' and `b'.  If `zSign' is true,
2781the difference is negated before being returned.  `zSign' is ignored if the
2782result is a NaN.  The subtraction is performed according to the IEC/IEEE
2783Standard for Binary Floating-point Arithmetic.
2784-------------------------------------------------------------------------------
2785*/
2786static floatx80 subFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2787{
2788    int32 aExp, bExp, zExp;
2789    bits64 aSig, bSig, zSig0, zSig1;
2790    int32 expDiff;
2791    floatx80 z;
2792
2793    aSig = extractFloatx80Frac( a );
2794    aExp = extractFloatx80Exp( a );
2795    bSig = extractFloatx80Frac( b );
2796    bExp = extractFloatx80Exp( b );
2797    expDiff = aExp - bExp;
2798    if ( 0 < expDiff ) goto aExpBigger;
2799    if ( expDiff < 0 ) goto bExpBigger;
2800    if ( aExp == 0x7FFF ) {
2801        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2802            return propagateFloatx80NaN( a, b );
2803        }
2804        roundData->exception |= float_flag_invalid;
2805        z.low = floatx80_default_nan_low;
2806        z.high = floatx80_default_nan_high;
2807        z.__padding = 0;
2808        return z;
2809    }
2810    if ( aExp == 0 ) {
2811        aExp = 1;
2812        bExp = 1;
2813    }
2814    zSig1 = 0;
2815    if ( bSig < aSig ) goto aBigger;
2816    if ( aSig < bSig ) goto bBigger;
2817    return packFloatx80( roundData->mode == float_round_down, 0, 0 );
2818 bExpBigger:
2819    if ( bExp == 0x7FFF ) {
2820        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2821        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2822    }
2823    if ( aExp == 0 ) ++expDiff;
2824    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2825 bBigger:
2826    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2827    zExp = bExp;
2828    zSign ^= 1;
2829    goto normalizeRoundAndPack;
2830 aExpBigger:
2831    if ( aExp == 0x7FFF ) {
2832        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2833        return a;
2834    }
2835    if ( bExp == 0 ) --expDiff;
2836    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2837 aBigger:
2838    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2839    zExp = aExp;
2840 normalizeRoundAndPack:
2841    return
2842        normalizeRoundAndPackFloatx80(
2843            roundData, zSign, zExp, zSig0, zSig1 );
2844
2845}
2846
2847/*
2848-------------------------------------------------------------------------------
2849Returns the result of adding the extended double-precision floating-point
2850values `a' and `b'.  The operation is performed according to the IEC/IEEE
2851Standard for Binary Floating-point Arithmetic.
2852-------------------------------------------------------------------------------
2853*/
2854floatx80 floatx80_add( struct roundingData *roundData, floatx80 a, floatx80 b )
2855{
2856    flag aSign, bSign;
2857
2858    aSign = extractFloatx80Sign( a );
2859    bSign = extractFloatx80Sign( b );
2860    if ( aSign == bSign ) {
2861        return addFloatx80Sigs( roundData, a, b, aSign );
2862    }
2863    else {
2864        return subFloatx80Sigs( roundData, a, b, aSign );
2865    }
2866
2867}
2868
2869/*
2870-------------------------------------------------------------------------------
2871Returns the result of subtracting the extended double-precision floating-
2872point values `a' and `b'.  The operation is performed according to the
2873IEC/IEEE Standard for Binary Floating-point Arithmetic.
2874-------------------------------------------------------------------------------
2875*/
2876floatx80 floatx80_sub( struct roundingData *roundData, floatx80 a, floatx80 b )
2877{
2878    flag aSign, bSign;
2879
2880    aSign = extractFloatx80Sign( a );
2881    bSign = extractFloatx80Sign( b );
2882    if ( aSign == bSign ) {
2883        return subFloatx80Sigs( roundData, a, b, aSign );
2884    }
2885    else {
2886        return addFloatx80Sigs( roundData, a, b, aSign );
2887    }
2888
2889}
2890
2891/*
2892-------------------------------------------------------------------------------
2893Returns the result of multiplying the extended double-precision floating-
2894point values `a' and `b'.  The operation is performed according to the
2895IEC/IEEE Standard for Binary Floating-point Arithmetic.
2896-------------------------------------------------------------------------------
2897*/
2898floatx80 floatx80_mul( struct roundingData *roundData, floatx80 a, floatx80 b )
2899{
2900    flag aSign, bSign, zSign;
2901    int32 aExp, bExp, zExp;
2902    bits64 aSig, bSig, zSig0, zSig1;
2903    floatx80 z;
2904
2905    aSig = extractFloatx80Frac( a );
2906    aExp = extractFloatx80Exp( a );
2907    aSign = extractFloatx80Sign( a );
2908    bSig = extractFloatx80Frac( b );
2909    bExp = extractFloatx80Exp( b );
2910    bSign = extractFloatx80Sign( b );
2911    zSign = aSign ^ bSign;
2912    if ( aExp == 0x7FFF ) {
2913        if (    (bits64) ( aSig<<1 )
2914             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2915            return propagateFloatx80NaN( a, b );
2916        }
2917        if ( ( bExp | bSig ) == 0 ) goto invalid;
2918        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2919    }
2920    if ( bExp == 0x7FFF ) {
2921        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2922        if ( ( aExp | aSig ) == 0 ) {
2923 invalid:
2924            roundData->exception |= float_flag_invalid;
2925            z.low = floatx80_default_nan_low;
2926            z.high = floatx80_default_nan_high;
2927            z.__padding = 0;
2928            return z;
2929        }
2930        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2931    }
2932    if ( aExp == 0 ) {
2933        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2934        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2935    }
2936    if ( bExp == 0 ) {
2937        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2938        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2939    }
2940    zExp = aExp + bExp - 0x3FFE;
2941    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2942    if ( 0 < (sbits64) zSig0 ) {
2943        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2944        --zExp;
2945    }
2946    return
2947        roundAndPackFloatx80(
2948            roundData, zSign, zExp, zSig0, zSig1 );
2949
2950}
2951
2952/*
2953-------------------------------------------------------------------------------
2954Returns the result of dividing the extended double-precision floating-point
2955value `a' by the corresponding value `b'.  The operation is performed
2956according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2957-------------------------------------------------------------------------------
2958*/
2959floatx80 floatx80_div( struct roundingData *roundData, floatx80 a, floatx80 b )
2960{
2961    flag aSign, bSign, zSign;
2962    int32 aExp, bExp, zExp;
2963    bits64 aSig, bSig, zSig0, zSig1;
2964    bits64 rem0, rem1, rem2, term0, term1, term2;
2965    floatx80 z;
2966
2967    aSig = extractFloatx80Frac( a );
2968    aExp = extractFloatx80Exp( a );
2969    aSign = extractFloatx80Sign( a );
2970    bSig = extractFloatx80Frac( b );
2971    bExp = extractFloatx80Exp( b );
2972    bSign = extractFloatx80Sign( b );
2973    zSign = aSign ^ bSign;
2974    if ( aExp == 0x7FFF ) {
2975        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2976        if ( bExp == 0x7FFF ) {
2977            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2978            goto invalid;
2979        }
2980        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2981    }
2982    if ( bExp == 0x7FFF ) {
2983        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2984        return packFloatx80( zSign, 0, 0 );
2985    }
2986    if ( bExp == 0 ) {
2987        if ( bSig == 0 ) {
2988            if ( ( aExp | aSig ) == 0 ) {
2989 invalid:
2990                roundData->exception |= float_flag_invalid;
2991                z.low = floatx80_default_nan_low;
2992                z.high = floatx80_default_nan_high;
2993                z.__padding = 0;
2994                return z;
2995            }
2996            roundData->exception |= float_flag_divbyzero;
2997            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2998        }
2999        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3000    }
3001    if ( aExp == 0 ) {
3002        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3003        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3004    }
3005    zExp = aExp - bExp + 0x3FFE;
3006    rem1 = 0;
3007    if ( bSig <= aSig ) {
3008        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3009        ++zExp;
3010    }
3011    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3012    mul64To128( bSig, zSig0, &term0, &term1 );
3013    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3014    while ( (sbits64) rem0 < 0 ) {
3015        --zSig0;
3016        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3017    }
3018    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3019    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3020        mul64To128( bSig, zSig1, &term1, &term2 );
3021        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3022        while ( (sbits64) rem1 < 0 ) {
3023            --zSig1;
3024            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3025        }
3026        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3027    }
3028    return
3029        roundAndPackFloatx80(
3030            roundData, zSign, zExp, zSig0, zSig1 );
3031
3032}
3033
3034/*
3035-------------------------------------------------------------------------------
3036Returns the remainder of the extended double-precision floating-point value
3037`a' with respect to the corresponding value `b'.  The operation is performed
3038according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3039-------------------------------------------------------------------------------
3040*/
3041floatx80 floatx80_rem( struct roundingData *roundData, floatx80 a, floatx80 b )
3042{
3043    flag aSign, bSign, zSign;
3044    int32 aExp, bExp, expDiff;
3045    bits64 aSig0, aSig1, bSig;
3046    bits64 q, term0, term1, alternateASig0, alternateASig1;
3047    floatx80 z;
3048
3049    aSig0 = extractFloatx80Frac( a );
3050    aExp = extractFloatx80Exp( a );
3051    aSign = extractFloatx80Sign( a );
3052    bSig = extractFloatx80Frac( b );
3053    bExp = extractFloatx80Exp( b );
3054    bSign = extractFloatx80Sign( b );
3055    if ( aExp == 0x7FFF ) {
3056        if (    (bits64) ( aSig0<<1 )
3057             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3058            return propagateFloatx80NaN( a, b );
3059        }
3060        goto invalid;
3061    }
3062    if ( bExp == 0x7FFF ) {
3063        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3064        return a;
3065    }
3066    if ( bExp == 0 ) {
3067        if ( bSig == 0 ) {
3068 invalid:
3069            roundData->exception |= float_flag_invalid;
3070            z.low = floatx80_default_nan_low;
3071            z.high = floatx80_default_nan_high;
3072            z.__padding = 0;
3073            return z;
3074        }
3075        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3076    }
3077    if ( aExp == 0 ) {
3078        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3079        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3080    }
3081    bSig |= LIT64( 0x8000000000000000 );
3082    zSign = aSign;
3083    expDiff = aExp - bExp;
3084    aSig1 = 0;
3085    if ( expDiff < 0 ) {
3086        if ( expDiff < -1 ) return a;
3087        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3088        expDiff = 0;
3089    }
3090    q = ( bSig <= aSig0 );
3091    if ( q ) aSig0 -= bSig;
3092    expDiff -= 64;
3093    while ( 0 < expDiff ) {
3094        q = estimateDiv128To64( aSig0, aSig1, bSig );
3095        q = ( 2 < q ) ? q - 2 : 0;
3096        mul64To128( bSig, q, &term0, &term1 );
3097        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3098        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3099        expDiff -= 62;
3100    }
3101    expDiff += 64;
3102    if ( 0 < expDiff ) {
3103        q = estimateDiv128To64( aSig0, aSig1, bSig );
3104        q = ( 2 < q ) ? q - 2 : 0;
3105        q >>= 64 - expDiff;
3106        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3107        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3108        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3109        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3110            ++q;
3111            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3112        }
3113    }
3114    else {
3115        term1 = 0;
3116        term0 = bSig;
3117    }
3118    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3119    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3120         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3121              && ( q & 1 ) )
3122       ) {
3123        aSig0 = alternateASig0;
3124        aSig1 = alternateASig1;
3125        zSign = ! zSign;
3126    }
3127
3128    return
3129        normalizeRoundAndPackFloatx80(
3130            roundData, zSign, bExp + expDiff, aSig0, aSig1 );
3131
3132}
3133
3134/*
3135-------------------------------------------------------------------------------
3136Returns the square root of the extended double-precision floating-point
3137value `a'.  The operation is performed according to the IEC/IEEE Standard
3138for Binary Floating-point Arithmetic.
3139-------------------------------------------------------------------------------
3140*/
3141floatx80 floatx80_sqrt( struct roundingData *roundData, floatx80 a )
3142{
3143    flag aSign;
3144    int32 aExp, zExp;
3145    bits64 aSig0, aSig1, zSig0, zSig1;
3146    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3147    bits64 shiftedRem0, shiftedRem1;
3148    floatx80 z;
3149
3150    aSig0 = extractFloatx80Frac( a );
3151    aExp = extractFloatx80Exp( a );
3152    aSign = extractFloatx80Sign( a );
3153    if ( aExp == 0x7FFF ) {
3154        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3155        if ( ! aSign ) return a;
3156        goto invalid;
3157    }
3158    if ( aSign ) {
3159        if ( ( aExp | aSig0 ) == 0 ) return a;
3160 invalid:
3161        roundData->exception |= float_flag_invalid;
3162        z.low = floatx80_default_nan_low;
3163        z.high = floatx80_default_nan_high;
3164        z.__padding = 0;
3165        return z;
3166    }
3167    if ( aExp == 0 ) {
3168        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3169        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3170    }
3171    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3172    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3173    zSig0 <<= 31;
3174    aSig1 = 0;
3175    shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3176    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3177    if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3178    shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3179    mul64To128( zSig0, zSig0, &term0, &term1 );
3180    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3181    while ( (sbits64) rem0 < 0 ) {
3182        --zSig0;
3183        shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3184        term1 |= 1;
3185        add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3186    }
3187    shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3188    zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3189    if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3190        if ( zSig1 == 0 ) zSig1 = 1;
3191        mul64To128( zSig0, zSig1, &term1, &term2 );
3192        shortShift128Left( term1, term2, 1, &term1, &term2 );
3193        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3194        mul64To128( zSig1, zSig1, &term2, &term3 );
3195        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3196        while ( (sbits64) rem1 < 0 ) {
3197            --zSig1;
3198            shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3199            term3 |= 1;
3200            add192(
3201                rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3202        }
3203        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3204    }
3205    return
3206        roundAndPackFloatx80(
3207            roundData, 0, zExp, zSig0, zSig1 );
3208
3209}
3210
3211/*
3212-------------------------------------------------------------------------------
3213Returns 1 if the extended double-precision floating-point value `a' is
3214equal to the corresponding value `b', and 0 otherwise.  The comparison is
3215performed according to the IEC/IEEE Standard for Binary Floating-point
3216Arithmetic.
3217-------------------------------------------------------------------------------
3218*/
3219flag floatx80_eq( floatx80 a, floatx80 b )
3220{
3221
3222    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3223              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3224         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3225              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3226       ) {
3227        if (    floatx80_is_signaling_nan( a )
3228             || floatx80_is_signaling_nan( b ) ) {
3229            float_raise( float_flag_invalid );
3230        }
3231        return 0;
3232    }
3233    return
3234           ( a.low == b.low )
3235        && (    ( a.high == b.high )
3236             || (    ( a.low == 0 )
3237                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3238           );
3239
3240}
3241
3242/*
3243-------------------------------------------------------------------------------
3244Returns 1 if the extended double-precision floating-point value `a' is
3245less than or equal to the corresponding value `b', and 0 otherwise.  The
3246comparison is performed according to the IEC/IEEE Standard for Binary
3247Floating-point Arithmetic.
3248-------------------------------------------------------------------------------
3249*/
3250flag floatx80_le( floatx80 a, floatx80 b )
3251{
3252    flag aSign, bSign;
3253
3254    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3255              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3256         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3257              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3258       ) {
3259        float_raise( float_flag_invalid );
3260        return 0;
3261    }
3262    aSign = extractFloatx80Sign( a );
3263    bSign = extractFloatx80Sign( b );
3264    if ( aSign != bSign ) {
3265        return
3266               aSign
3267            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3268                 == 0 );
3269    }
3270    return
3271          aSign ? le128( b.high, b.low, a.high, a.low )
3272        : le128( a.high, a.low, b.high, b.low );
3273
3274}
3275
3276/*
3277-------------------------------------------------------------------------------
3278Returns 1 if the extended double-precision floating-point value `a' is
3279less than the corresponding value `b', and 0 otherwise.  The comparison
3280is performed according to the IEC/IEEE Standard for Binary Floating-point
3281Arithmetic.
3282-------------------------------------------------------------------------------
3283*/
3284flag floatx80_lt( floatx80 a, floatx80 b )
3285{
3286    flag aSign, bSign;
3287
3288    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3289              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3290         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3291              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3292       ) {
3293        float_raise( float_flag_invalid );
3294        return 0;
3295    }
3296    aSign = extractFloatx80Sign( a );
3297    bSign = extractFloatx80Sign( b );
3298    if ( aSign != bSign ) {
3299        return
3300               aSign
3301            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3302                 != 0 );
3303    }
3304    return
3305          aSign ? lt128( b.high, b.low, a.high, a.low )
3306        : lt128( a.high, a.low, b.high, b.low );
3307
3308}
3309
3310/*
3311-------------------------------------------------------------------------------
3312Returns 1 if the extended double-precision floating-point value `a' is equal
3313to the corresponding value `b', and 0 otherwise.  The invalid exception is
3314raised if either operand is a NaN.  Otherwise, the comparison is performed
3315according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3316-------------------------------------------------------------------------------
3317*/
3318flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3319{
3320
3321    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3322              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3323         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3324              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3325       ) {
3326        float_raise( float_flag_invalid );
3327        return 0;
3328    }
3329    return
3330           ( a.low == b.low )
3331        && (    ( a.high == b.high )
3332             || (    ( a.low == 0 )
3333                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3334           );
3335
3336}
3337
3338/*
3339-------------------------------------------------------------------------------
3340Returns 1 if the extended double-precision floating-point value `a' is less
3341than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3342do not cause an exception.  Otherwise, the comparison is performed according
3343to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3344-------------------------------------------------------------------------------
3345*/
3346flag floatx80_le_quiet( floatx80 a, floatx80 b )
3347{
3348    flag aSign, bSign;
3349
3350    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3351              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3352         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3353              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3354       ) {
3355        /* Do nothing, even if NaN as we're quiet */
3356        return 0;
3357    }
3358    aSign = extractFloatx80Sign( a );
3359    bSign = extractFloatx80Sign( b );
3360    if ( aSign != bSign ) {
3361        return
3362               aSign
3363            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3364                 == 0 );
3365    }
3366    return
3367          aSign ? le128( b.high, b.low, a.high, a.low )
3368        : le128( a.high, a.low, b.high, b.low );
3369
3370}
3371
3372/*
3373-------------------------------------------------------------------------------
3374Returns 1 if the extended double-precision floating-point value `a' is less
3375than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
3376an exception.  Otherwise, the comparison is performed according to the
3377IEC/IEEE Standard for Binary Floating-point Arithmetic.
3378-------------------------------------------------------------------------------
3379*/
3380flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3381{
3382    flag aSign, bSign;
3383
3384    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3385              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3386         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3387              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3388       ) {
3389        /* Do nothing, even if NaN as we're quiet */
3390        return 0;
3391    }
3392    aSign = extractFloatx80Sign( a );
3393    bSign = extractFloatx80Sign( b );
3394    if ( aSign != bSign ) {
3395        return
3396               aSign
3397            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3398                 != 0 );
3399    }
3400    return
3401          aSign ? lt128( b.high, b.low, a.high, a.low )
3402        : lt128( a.high, a.low, b.high, b.low );
3403
3404}
3405
3406#endif
3407