1/* $NetBSD: softfloat.c,v 1.2 2003/07/26 19:24:52 salo Exp $ */
2
3/*
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 *  itself).
7 */
8
9/*
10 * Things you may want to define:
11 *
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
14 *   properly renamed.
15 */
16
17/*
18===============================================================================
19
20This C source file is part of the SoftFloat IEC/IEEE Floating-point
21Arithmetic Package, Release 2a.
22
23Written by John R. Hauser.  This work was made possible in part by the
24International Computer Science Institute, located at Suite 600, 1947 Center
25Street, Berkeley, California 94704.  Funding was partially provided by the
26National Science Foundation under grant MIP-9311980.  The original version
27of this code was written as part of a project to build a fixed-point vector
28processor in collaboration with the University of California at Berkeley,
29overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31arithmetic/SoftFloat.html'.
32
33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39Derivative works are acceptable, even for commercial purposes, so long as
40(1) they include prominent notice that the work is derivative, and (2) they
41include prominent notice akin to these four paragraphs for those parts of
42this code that are retained.
43
44===============================================================================
45*/
46
47#include <sys/cdefs.h>
48__FBSDID("$FreeBSD$");
49
50#ifdef SOFTFLOAT_FOR_GCC
51#include "softfloat-for-gcc.h"
52#endif
53
54#include "milieu.h"
55#include "softfloat.h"
56
57/*
58 * Conversions between floats as stored in memory and floats as
59 * SoftFloat uses them
60 */
61#ifndef FLOAT64_DEMANGLE
62#define FLOAT64_DEMANGLE(a)	(a)
63#endif
64#ifndef FLOAT64_MANGLE
65#define FLOAT64_MANGLE(a)	(a)
66#endif
67
68/*
69-------------------------------------------------------------------------------
70Floating-point rounding mode, extended double-precision rounding precision,
71and exception flags.
72-------------------------------------------------------------------------------
73*/
74fp_rnd_t float_rounding_mode = float_round_nearest_even;
75fp_except float_exception_flags = 0;
76#ifdef FLOATX80
77int8 floatx80_rounding_precision = 80;
78#endif
79
80/*
81-------------------------------------------------------------------------------
82Primitive arithmetic functions, including multi-word arithmetic, and
83division and square root approximations.  (Can be specialized to target if
84desired.)
85-------------------------------------------------------------------------------
86*/
87#include "softfloat-macros"
88
89/*
90-------------------------------------------------------------------------------
91Functions and definitions to determine:  (1) whether tininess for underflow
92is detected before or after rounding by default, (2) what (if anything)
93happens when exceptions are raised, (3) how signaling NaNs are distinguished
94from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
95are propagated from function inputs to output.  These details are target-
96specific.
97-------------------------------------------------------------------------------
98*/
99#include "softfloat-specialize"
100
101#if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
102/*
103-------------------------------------------------------------------------------
104Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
105and 7, and returns the properly rounded 32-bit integer corresponding to the
106input.  If `zSign' is 1, the input is negated before being converted to an
107integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
108is simply rounded to an integer, with the inexact exception raised if the
109input cannot be represented exactly as an integer.  However, if the fixed-
110point input is too large, the invalid exception is raised and the largest
111positive or negative integer is returned.
112-------------------------------------------------------------------------------
113*/
114static int32 roundAndPackInt32( flag zSign, bits64 absZ )
115{
116    int8 roundingMode;
117    flag roundNearestEven;
118    int8 roundIncrement, roundBits;
119    int32 z;
120
121    roundingMode = float_rounding_mode;
122    roundNearestEven = ( roundingMode == float_round_nearest_even );
123    roundIncrement = 0x40;
124    if ( ! roundNearestEven ) {
125        if ( roundingMode == float_round_to_zero ) {
126            roundIncrement = 0;
127        }
128        else {
129            roundIncrement = 0x7F;
130            if ( zSign ) {
131                if ( roundingMode == float_round_up ) roundIncrement = 0;
132            }
133            else {
134                if ( roundingMode == float_round_down ) roundIncrement = 0;
135            }
136        }
137    }
138    roundBits = absZ & 0x7F;
139    absZ = ( absZ + roundIncrement )>>7;
140    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
141    z = absZ;
142    if ( zSign ) z = - z;
143    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
144        float_raise( float_flag_invalid );
145        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
146    }
147    if ( roundBits ) float_exception_flags |= float_flag_inexact;
148    return z;
149
150}
151
152/*
153-------------------------------------------------------------------------------
154Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155`absZ1', with binary point between bits 63 and 64 (between the input words),
156and returns the properly rounded 64-bit integer corresponding to the input.
157If `zSign' is 1, the input is negated before being converted to an integer.
158Ordinarily, the fixed-point input is simply rounded to an integer, with
159the inexact exception raised if the input cannot be represented exactly as
160an integer.  However, if the fixed-point input is too large, the invalid
161exception is raised and the largest positive or negative integer is
162returned.
163-------------------------------------------------------------------------------
164*/
165static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
166{
167    int8 roundingMode;
168    flag roundNearestEven, increment;
169    int64 z;
170
171    roundingMode = float_rounding_mode;
172    roundNearestEven = ( roundingMode == float_round_nearest_even );
173    increment = ( (sbits64) absZ1 < 0 );
174    if ( ! roundNearestEven ) {
175        if ( roundingMode == float_round_to_zero ) {
176            increment = 0;
177        }
178        else {
179            if ( zSign ) {
180                increment = ( roundingMode == float_round_down ) && absZ1;
181            }
182            else {
183                increment = ( roundingMode == float_round_up ) && absZ1;
184            }
185        }
186    }
187    if ( increment ) {
188        ++absZ0;
189        if ( absZ0 == 0 ) goto overflow;
190        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
191    }
192    z = absZ0;
193    if ( zSign ) z = - z;
194    if ( z && ( ( z < 0 ) ^ zSign ) ) {
195 overflow:
196        float_raise( float_flag_invalid );
197        return
198              zSign ? (sbits64) LIT64( 0x8000000000000000 )
199            : LIT64( 0x7FFFFFFFFFFFFFFF );
200    }
201    if ( absZ1 ) float_exception_flags |= float_flag_inexact;
202    return z;
203
204}
205#endif
206
207/*
208-------------------------------------------------------------------------------
209Returns the fraction bits of the single-precision floating-point value `a'.
210-------------------------------------------------------------------------------
211*/
212INLINE bits32 extractFloat32Frac( float32 a )
213{
214
215    return a & 0x007FFFFF;
216
217}
218
219/*
220-------------------------------------------------------------------------------
221Returns the exponent bits of the single-precision floating-point value `a'.
222-------------------------------------------------------------------------------
223*/
224INLINE int16 extractFloat32Exp( float32 a )
225{
226
227    return ( a>>23 ) & 0xFF;
228
229}
230
231/*
232-------------------------------------------------------------------------------
233Returns the sign bit of the single-precision floating-point value `a'.
234-------------------------------------------------------------------------------
235*/
236INLINE flag extractFloat32Sign( float32 a )
237{
238
239    return a>>31;
240
241}
242
243/*
244-------------------------------------------------------------------------------
245Normalizes the subnormal single-precision floating-point value represented
246by the denormalized significand `aSig'.  The normalized exponent and
247significand are stored at the locations pointed to by `zExpPtr' and
248`zSigPtr', respectively.
249-------------------------------------------------------------------------------
250*/
251static void
252 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
253{
254    int8 shiftCount;
255
256    shiftCount = countLeadingZeros32( aSig ) - 8;
257    *zSigPtr = aSig<<shiftCount;
258    *zExpPtr = 1 - shiftCount;
259
260}
261
262/*
263-------------------------------------------------------------------------------
264Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
265single-precision floating-point value, returning the result.  After being
266shifted into the proper positions, the three fields are simply added
267together to form the result.  This means that any integer portion of `zSig'
268will be added into the exponent.  Since a properly normalized significand
269will have an integer portion equal to 1, the `zExp' input should be 1 less
270than the desired result exponent whenever `zSig' is a complete, normalized
271significand.
272-------------------------------------------------------------------------------
273*/
274INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
275{
276
277    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
278
279}
280
281/*
282-------------------------------------------------------------------------------
283Takes an abstract floating-point value having sign `zSign', exponent `zExp',
284and significand `zSig', and returns the proper single-precision floating-
285point value corresponding to the abstract input.  Ordinarily, the abstract
286value is simply rounded and packed into the single-precision format, with
287the inexact exception raised if the abstract input cannot be represented
288exactly.  However, if the abstract value is too large, the overflow and
289inexact exceptions are raised and an infinity or maximal finite value is
290returned.  If the abstract value is too small, the input value is rounded to
291a subnormal number, and the underflow and inexact exceptions are raised if
292the abstract input cannot be represented exactly as a subnormal single-
293precision floating-point number.
294    The input significand `zSig' has its binary point between bits 30
295and 29, which is 7 bits to the left of the usual location.  This shifted
296significand must be normalized or smaller.  If `zSig' is not normalized,
297`zExp' must be 0; in that case, the result returned is a subnormal number,
298and it must not require rounding.  In the usual case that `zSig' is
299normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
300The handling of underflow and overflow follows the IEC/IEEE Standard for
301Binary Floating-Point Arithmetic.
302-------------------------------------------------------------------------------
303*/
304static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
305{
306    int8 roundingMode;
307    flag roundNearestEven;
308    int8 roundIncrement, roundBits;
309    flag isTiny;
310
311    roundingMode = float_rounding_mode;
312    roundNearestEven = ( roundingMode == float_round_nearest_even );
313    roundIncrement = 0x40;
314    if ( ! roundNearestEven ) {
315        if ( roundingMode == float_round_to_zero ) {
316            roundIncrement = 0;
317        }
318        else {
319            roundIncrement = 0x7F;
320            if ( zSign ) {
321                if ( roundingMode == float_round_up ) roundIncrement = 0;
322            }
323            else {
324                if ( roundingMode == float_round_down ) roundIncrement = 0;
325            }
326        }
327    }
328    roundBits = zSig & 0x7F;
329    if ( 0xFD <= (bits16) zExp ) {
330        if (    ( 0xFD < zExp )
331             || (    ( zExp == 0xFD )
332                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
333           ) {
334            float_raise( float_flag_overflow | float_flag_inexact );
335            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
336        }
337        if ( zExp < 0 ) {
338            isTiny =
339                   ( float_detect_tininess == float_tininess_before_rounding )
340                || ( zExp < -1 )
341                || ( zSig + roundIncrement < 0x80000000 );
342            shift32RightJamming( zSig, - zExp, &zSig );
343            zExp = 0;
344            roundBits = zSig & 0x7F;
345            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
346        }
347    }
348    if ( roundBits ) float_exception_flags |= float_flag_inexact;
349    zSig = ( zSig + roundIncrement )>>7;
350    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
351    if ( zSig == 0 ) zExp = 0;
352    return packFloat32( zSign, zExp, zSig );
353
354}
355
356/*
357-------------------------------------------------------------------------------
358Takes an abstract floating-point value having sign `zSign', exponent `zExp',
359and significand `zSig', and returns the proper single-precision floating-
360point value corresponding to the abstract input.  This routine is just like
361`roundAndPackFloat32' except that `zSig' does not have to be normalized.
362Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
363floating-point exponent.
364-------------------------------------------------------------------------------
365*/
366static float32
367 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
368{
369    int8 shiftCount;
370
371    shiftCount = countLeadingZeros32( zSig ) - 1;
372    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
373
374}
375
376/*
377-------------------------------------------------------------------------------
378Returns the fraction bits of the double-precision floating-point value `a'.
379-------------------------------------------------------------------------------
380*/
381INLINE bits64 extractFloat64Frac( float64 a )
382{
383
384    return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
385
386}
387
388/*
389-------------------------------------------------------------------------------
390Returns the exponent bits of the double-precision floating-point value `a'.
391-------------------------------------------------------------------------------
392*/
393INLINE int16 extractFloat64Exp( float64 a )
394{
395
396    return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
397
398}
399
400/*
401-------------------------------------------------------------------------------
402Returns the sign bit of the double-precision floating-point value `a'.
403-------------------------------------------------------------------------------
404*/
405INLINE flag extractFloat64Sign( float64 a )
406{
407
408    return FLOAT64_DEMANGLE(a)>>63;
409
410}
411
412/*
413-------------------------------------------------------------------------------
414Normalizes the subnormal double-precision floating-point value represented
415by the denormalized significand `aSig'.  The normalized exponent and
416significand are stored at the locations pointed to by `zExpPtr' and
417`zSigPtr', respectively.
418-------------------------------------------------------------------------------
419*/
420static void
421 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
422{
423    int8 shiftCount;
424
425    shiftCount = countLeadingZeros64( aSig ) - 11;
426    *zSigPtr = aSig<<shiftCount;
427    *zExpPtr = 1 - shiftCount;
428
429}
430
431/*
432-------------------------------------------------------------------------------
433Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
434double-precision floating-point value, returning the result.  After being
435shifted into the proper positions, the three fields are simply added
436together to form the result.  This means that any integer portion of `zSig'
437will be added into the exponent.  Since a properly normalized significand
438will have an integer portion equal to 1, the `zExp' input should be 1 less
439than the desired result exponent whenever `zSig' is a complete, normalized
440significand.
441-------------------------------------------------------------------------------
442*/
443INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
444{
445
446    return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
447			   ( ( (bits64) zExp )<<52 ) + zSig );
448
449}
450
451/*
452-------------------------------------------------------------------------------
453Takes an abstract floating-point value having sign `zSign', exponent `zExp',
454and significand `zSig', and returns the proper double-precision floating-
455point value corresponding to the abstract input.  Ordinarily, the abstract
456value is simply rounded and packed into the double-precision format, with
457the inexact exception raised if the abstract input cannot be represented
458exactly.  However, if the abstract value is too large, the overflow and
459inexact exceptions are raised and an infinity or maximal finite value is
460returned.  If the abstract value is too small, the input value is rounded to
461a subnormal number, and the underflow and inexact exceptions are raised if
462the abstract input cannot be represented exactly as a subnormal double-
463precision floating-point number.
464    The input significand `zSig' has its binary point between bits 62
465and 61, which is 10 bits to the left of the usual location.  This shifted
466significand must be normalized or smaller.  If `zSig' is not normalized,
467`zExp' must be 0; in that case, the result returned is a subnormal number,
468and it must not require rounding.  In the usual case that `zSig' is
469normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
470The handling of underflow and overflow follows the IEC/IEEE Standard for
471Binary Floating-Point Arithmetic.
472-------------------------------------------------------------------------------
473*/
474static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
475{
476    int8 roundingMode;
477    flag roundNearestEven;
478    int16 roundIncrement, roundBits;
479    flag isTiny;
480
481    roundingMode = float_rounding_mode;
482    roundNearestEven = ( roundingMode == float_round_nearest_even );
483    roundIncrement = 0x200;
484    if ( ! roundNearestEven ) {
485        if ( roundingMode == float_round_to_zero ) {
486            roundIncrement = 0;
487        }
488        else {
489            roundIncrement = 0x3FF;
490            if ( zSign ) {
491                if ( roundingMode == float_round_up ) roundIncrement = 0;
492            }
493            else {
494                if ( roundingMode == float_round_down ) roundIncrement = 0;
495            }
496        }
497    }
498    roundBits = zSig & 0x3FF;
499    if ( 0x7FD <= (bits16) zExp ) {
500        if (    ( 0x7FD < zExp )
501             || (    ( zExp == 0x7FD )
502                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
503           ) {
504            float_raise( float_flag_overflow | float_flag_inexact );
505            return FLOAT64_MANGLE(
506		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
507		( roundIncrement == 0 ));
508        }
509        if ( zExp < 0 ) {
510            isTiny =
511                   ( float_detect_tininess == float_tininess_before_rounding )
512                || ( zExp < -1 )
513                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
514            shift64RightJamming( zSig, - zExp, &zSig );
515            zExp = 0;
516            roundBits = zSig & 0x3FF;
517            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
518        }
519    }
520    if ( roundBits ) float_exception_flags |= float_flag_inexact;
521    zSig = ( zSig + roundIncrement )>>10;
522    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
523    if ( zSig == 0 ) zExp = 0;
524    return packFloat64( zSign, zExp, zSig );
525
526}
527
528/*
529-------------------------------------------------------------------------------
530Takes an abstract floating-point value having sign `zSign', exponent `zExp',
531and significand `zSig', and returns the proper double-precision floating-
532point value corresponding to the abstract input.  This routine is just like
533`roundAndPackFloat64' except that `zSig' does not have to be normalized.
534Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
535floating-point exponent.
536-------------------------------------------------------------------------------
537*/
538static float64
539 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
540{
541    int8 shiftCount;
542
543    shiftCount = countLeadingZeros64( zSig ) - 1;
544    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
545
546}
547
548#ifdef FLOATX80
549
550/*
551-------------------------------------------------------------------------------
552Returns the fraction bits of the extended double-precision floating-point
553value `a'.
554-------------------------------------------------------------------------------
555*/
556INLINE bits64 extractFloatx80Frac( floatx80 a )
557{
558
559    return a.low;
560
561}
562
563/*
564-------------------------------------------------------------------------------
565Returns the exponent bits of the extended double-precision floating-point
566value `a'.
567-------------------------------------------------------------------------------
568*/
569INLINE int32 extractFloatx80Exp( floatx80 a )
570{
571
572    return a.high & 0x7FFF;
573
574}
575
576/*
577-------------------------------------------------------------------------------
578Returns the sign bit of the extended double-precision floating-point value
579`a'.
580-------------------------------------------------------------------------------
581*/
582INLINE flag extractFloatx80Sign( floatx80 a )
583{
584
585    return a.high>>15;
586
587}
588
589/*
590-------------------------------------------------------------------------------
591Normalizes the subnormal extended double-precision floating-point value
592represented by the denormalized significand `aSig'.  The normalized exponent
593and significand are stored at the locations pointed to by `zExpPtr' and
594`zSigPtr', respectively.
595-------------------------------------------------------------------------------
596*/
597static void
598 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
599{
600    int8 shiftCount;
601
602    shiftCount = countLeadingZeros64( aSig );
603    *zSigPtr = aSig<<shiftCount;
604    *zExpPtr = 1 - shiftCount;
605
606}
607
608/*
609-------------------------------------------------------------------------------
610Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
611extended double-precision floating-point value, returning the result.
612-------------------------------------------------------------------------------
613*/
614INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
615{
616    floatx80 z;
617
618    z.low = zSig;
619    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
620    return z;
621
622}
623
624/*
625-------------------------------------------------------------------------------
626Takes an abstract floating-point value having sign `zSign', exponent `zExp',
627and extended significand formed by the concatenation of `zSig0' and `zSig1',
628and returns the proper extended double-precision floating-point value
629corresponding to the abstract input.  Ordinarily, the abstract value is
630rounded and packed into the extended double-precision format, with the
631inexact exception raised if the abstract input cannot be represented
632exactly.  However, if the abstract value is too large, the overflow and
633inexact exceptions are raised and an infinity or maximal finite value is
634returned.  If the abstract value is too small, the input value is rounded to
635a subnormal number, and the underflow and inexact exceptions are raised if
636the abstract input cannot be represented exactly as a subnormal extended
637double-precision floating-point number.
638    If `roundingPrecision' is 32 or 64, the result is rounded to the same
639number of bits as single or double precision, respectively.  Otherwise, the
640result is rounded to the full precision of the extended double-precision
641format.
642    The input significand must be normalized or smaller.  If the input
643significand is not normalized, `zExp' must be 0; in that case, the result
644returned is a subnormal number, and it must not require rounding.  The
645handling of underflow and overflow follows the IEC/IEEE Standard for Binary
646Floating-Point Arithmetic.
647-------------------------------------------------------------------------------
648*/
649static floatx80
650 roundAndPackFloatx80(
651     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
652 )
653{
654    int8 roundingMode;
655    flag roundNearestEven, increment, isTiny;
656    int64 roundIncrement, roundMask, roundBits;
657
658    roundingMode = float_rounding_mode;
659    roundNearestEven = ( roundingMode == float_round_nearest_even );
660    if ( roundingPrecision == 80 ) goto precision80;
661    if ( roundingPrecision == 64 ) {
662        roundIncrement = LIT64( 0x0000000000000400 );
663        roundMask = LIT64( 0x00000000000007FF );
664    }
665    else if ( roundingPrecision == 32 ) {
666        roundIncrement = LIT64( 0x0000008000000000 );
667        roundMask = LIT64( 0x000000FFFFFFFFFF );
668    }
669    else {
670        goto precision80;
671    }
672    zSig0 |= ( zSig1 != 0 );
673    if ( ! roundNearestEven ) {
674        if ( roundingMode == float_round_to_zero ) {
675            roundIncrement = 0;
676        }
677        else {
678            roundIncrement = roundMask;
679            if ( zSign ) {
680                if ( roundingMode == float_round_up ) roundIncrement = 0;
681            }
682            else {
683                if ( roundingMode == float_round_down ) roundIncrement = 0;
684            }
685        }
686    }
687    roundBits = zSig0 & roundMask;
688    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
689        if (    ( 0x7FFE < zExp )
690             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
691           ) {
692            goto overflow;
693        }
694        if ( zExp <= 0 ) {
695            isTiny =
696                   ( float_detect_tininess == float_tininess_before_rounding )
697                || ( zExp < 0 )
698                || ( zSig0 <= zSig0 + roundIncrement );
699            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
700            zExp = 0;
701            roundBits = zSig0 & roundMask;
702            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
703            if ( roundBits ) float_exception_flags |= float_flag_inexact;
704            zSig0 += roundIncrement;
705            if ( (sbits64) zSig0 < 0 ) zExp = 1;
706            roundIncrement = roundMask + 1;
707            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
708                roundMask |= roundIncrement;
709            }
710            zSig0 &= ~ roundMask;
711            return packFloatx80( zSign, zExp, zSig0 );
712        }
713    }
714    if ( roundBits ) float_exception_flags |= float_flag_inexact;
715    zSig0 += roundIncrement;
716    if ( zSig0 < roundIncrement ) {
717        ++zExp;
718        zSig0 = LIT64( 0x8000000000000000 );
719    }
720    roundIncrement = roundMask + 1;
721    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
722        roundMask |= roundIncrement;
723    }
724    zSig0 &= ~ roundMask;
725    if ( zSig0 == 0 ) zExp = 0;
726    return packFloatx80( zSign, zExp, zSig0 );
727 precision80:
728    increment = ( (sbits64) zSig1 < 0 );
729    if ( ! roundNearestEven ) {
730        if ( roundingMode == float_round_to_zero ) {
731            increment = 0;
732        }
733        else {
734            if ( zSign ) {
735                increment = ( roundingMode == float_round_down ) && zSig1;
736            }
737            else {
738                increment = ( roundingMode == float_round_up ) && zSig1;
739            }
740        }
741    }
742    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
743        if (    ( 0x7FFE < zExp )
744             || (    ( zExp == 0x7FFE )
745                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
746                  && increment
747                )
748           ) {
749            roundMask = 0;
750 overflow:
751            float_raise( float_flag_overflow | float_flag_inexact );
752            if (    ( roundingMode == float_round_to_zero )
753                 || ( zSign && ( roundingMode == float_round_up ) )
754                 || ( ! zSign && ( roundingMode == float_round_down ) )
755               ) {
756                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
757            }
758            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
759        }
760        if ( zExp <= 0 ) {
761            isTiny =
762                   ( float_detect_tininess == float_tininess_before_rounding )
763                || ( zExp < 0 )
764                || ! increment
765                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
766            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
767            zExp = 0;
768            if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
769            if ( zSig1 ) float_exception_flags |= float_flag_inexact;
770            if ( roundNearestEven ) {
771                increment = ( (sbits64) zSig1 < 0 );
772            }
773            else {
774                if ( zSign ) {
775                    increment = ( roundingMode == float_round_down ) && zSig1;
776                }
777                else {
778                    increment = ( roundingMode == float_round_up ) && zSig1;
779                }
780            }
781            if ( increment ) {
782                ++zSig0;
783                zSig0 &=
784                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
785                if ( (sbits64) zSig0 < 0 ) zExp = 1;
786            }
787            return packFloatx80( zSign, zExp, zSig0 );
788        }
789    }
790    if ( zSig1 ) float_exception_flags |= float_flag_inexact;
791    if ( increment ) {
792        ++zSig0;
793        if ( zSig0 == 0 ) {
794            ++zExp;
795            zSig0 = LIT64( 0x8000000000000000 );
796        }
797        else {
798            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
799        }
800    }
801    else {
802        if ( zSig0 == 0 ) zExp = 0;
803    }
804    return packFloatx80( zSign, zExp, zSig0 );
805
806}
807
808/*
809-------------------------------------------------------------------------------
810Takes an abstract floating-point value having sign `zSign', exponent
811`zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
812and returns the proper extended double-precision floating-point value
813corresponding to the abstract input.  This routine is just like
814`roundAndPackFloatx80' except that the input significand does not have to be
815normalized.
816-------------------------------------------------------------------------------
817*/
818static floatx80
819 normalizeRoundAndPackFloatx80(
820     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
821 )
822{
823    int8 shiftCount;
824
825    if ( zSig0 == 0 ) {
826        zSig0 = zSig1;
827        zSig1 = 0;
828        zExp -= 64;
829    }
830    shiftCount = countLeadingZeros64( zSig0 );
831    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
832    zExp -= shiftCount;
833    return
834        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
835
836}
837
838#endif
839
840#ifdef FLOAT128
841
842/*
843-------------------------------------------------------------------------------
844Returns the least-significant 64 fraction bits of the quadruple-precision
845floating-point value `a'.
846-------------------------------------------------------------------------------
847*/
848INLINE bits64 extractFloat128Frac1( float128 a )
849{
850
851    return a.low;
852
853}
854
855/*
856-------------------------------------------------------------------------------
857Returns the most-significant 48 fraction bits of the quadruple-precision
858floating-point value `a'.
859-------------------------------------------------------------------------------
860*/
861INLINE bits64 extractFloat128Frac0( float128 a )
862{
863
864    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
865
866}
867
868/*
869-------------------------------------------------------------------------------
870Returns the exponent bits of the quadruple-precision floating-point value
871`a'.
872-------------------------------------------------------------------------------
873*/
874INLINE int32 extractFloat128Exp( float128 a )
875{
876
877    return ( a.high>>48 ) & 0x7FFF;
878
879}
880
881/*
882-------------------------------------------------------------------------------
883Returns the sign bit of the quadruple-precision floating-point value `a'.
884-------------------------------------------------------------------------------
885*/
886INLINE flag extractFloat128Sign( float128 a )
887{
888
889    return a.high>>63;
890
891}
892
893/*
894-------------------------------------------------------------------------------
895Normalizes the subnormal quadruple-precision floating-point value
896represented by the denormalized significand formed by the concatenation of
897`aSig0' and `aSig1'.  The normalized exponent is stored at the location
898pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
899significand are stored at the location pointed to by `zSig0Ptr', and the
900least significant 64 bits of the normalized significand are stored at the
901location pointed to by `zSig1Ptr'.
902-------------------------------------------------------------------------------
903*/
904static void
905 normalizeFloat128Subnormal(
906     bits64 aSig0,
907     bits64 aSig1,
908     int32 *zExpPtr,
909     bits64 *zSig0Ptr,
910     bits64 *zSig1Ptr
911 )
912{
913    int8 shiftCount;
914
915    if ( aSig0 == 0 ) {
916        shiftCount = countLeadingZeros64( aSig1 ) - 15;
917        if ( shiftCount < 0 ) {
918            *zSig0Ptr = aSig1>>( - shiftCount );
919            *zSig1Ptr = aSig1<<( shiftCount & 63 );
920        }
921        else {
922            *zSig0Ptr = aSig1<<shiftCount;
923            *zSig1Ptr = 0;
924        }
925        *zExpPtr = - shiftCount - 63;
926    }
927    else {
928        shiftCount = countLeadingZeros64( aSig0 ) - 15;
929        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
930        *zExpPtr = 1 - shiftCount;
931    }
932
933}
934
935/*
936-------------------------------------------------------------------------------
937Packs the sign `zSign', the exponent `zExp', and the significand formed
938by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
939floating-point value, returning the result.  After being shifted into the
940proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
941added together to form the most significant 32 bits of the result.  This
942means that any integer portion of `zSig0' will be added into the exponent.
943Since a properly normalized significand will have an integer portion equal
944to 1, the `zExp' input should be 1 less than the desired result exponent
945whenever `zSig0' and `zSig1' concatenated form a complete, normalized
946significand.
947-------------------------------------------------------------------------------
948*/
949INLINE float128
950 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
951{
952    float128 z;
953
954    z.low = zSig1;
955    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
956    return z;
957
958}
959
960/*
961-------------------------------------------------------------------------------
962Takes an abstract floating-point value having sign `zSign', exponent `zExp',
963and extended significand formed by the concatenation of `zSig0', `zSig1',
964and `zSig2', and returns the proper quadruple-precision floating-point value
965corresponding to the abstract input.  Ordinarily, the abstract value is
966simply rounded and packed into the quadruple-precision format, with the
967inexact exception raised if the abstract input cannot be represented
968exactly.  However, if the abstract value is too large, the overflow and
969inexact exceptions are raised and an infinity or maximal finite value is
970returned.  If the abstract value is too small, the input value is rounded to
971a subnormal number, and the underflow and inexact exceptions are raised if
972the abstract input cannot be represented exactly as a subnormal quadruple-
973precision floating-point number.
974    The input significand must be normalized or smaller.  If the input
975significand is not normalized, `zExp' must be 0; in that case, the result
976returned is a subnormal number, and it must not require rounding.  In the
977usual case that the input significand is normalized, `zExp' must be 1 less
978than the ``true'' floating-point exponent.  The handling of underflow and
979overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
980-------------------------------------------------------------------------------
981*/
982static float128
983 roundAndPackFloat128(
984     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
985{
986    int8 roundingMode;
987    flag roundNearestEven, increment, isTiny;
988
989    roundingMode = float_rounding_mode;
990    roundNearestEven = ( roundingMode == float_round_nearest_even );
991    increment = ( (sbits64) zSig2 < 0 );
992    if ( ! roundNearestEven ) {
993        if ( roundingMode == float_round_to_zero ) {
994            increment = 0;
995        }
996        else {
997            if ( zSign ) {
998                increment = ( roundingMode == float_round_down ) && zSig2;
999            }
1000            else {
1001                increment = ( roundingMode == float_round_up ) && zSig2;
1002            }
1003        }
1004    }
1005    if ( 0x7FFD <= (bits32) zExp ) {
1006        if (    ( 0x7FFD < zExp )
1007             || (    ( zExp == 0x7FFD )
1008                  && eq128(
1009                         LIT64( 0x0001FFFFFFFFFFFF ),
1010                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1011                         zSig0,
1012                         zSig1
1013                     )
1014                  && increment
1015                )
1016           ) {
1017            float_raise( float_flag_overflow | float_flag_inexact );
1018            if (    ( roundingMode == float_round_to_zero )
1019                 || ( zSign && ( roundingMode == float_round_up ) )
1020                 || ( ! zSign && ( roundingMode == float_round_down ) )
1021               ) {
1022                return
1023                    packFloat128(
1024                        zSign,
1025                        0x7FFE,
1026                        LIT64( 0x0000FFFFFFFFFFFF ),
1027                        LIT64( 0xFFFFFFFFFFFFFFFF )
1028                    );
1029            }
1030            return packFloat128( zSign, 0x7FFF, 0, 0 );
1031        }
1032        if ( zExp < 0 ) {
1033            isTiny =
1034                   ( float_detect_tininess == float_tininess_before_rounding )
1035                || ( zExp < -1 )
1036                || ! increment
1037                || lt128(
1038                       zSig0,
1039                       zSig1,
1040                       LIT64( 0x0001FFFFFFFFFFFF ),
1041                       LIT64( 0xFFFFFFFFFFFFFFFF )
1042                   );
1043            shift128ExtraRightJamming(
1044                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1045            zExp = 0;
1046            if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1047            if ( roundNearestEven ) {
1048                increment = ( (sbits64) zSig2 < 0 );
1049            }
1050            else {
1051                if ( zSign ) {
1052                    increment = ( roundingMode == float_round_down ) && zSig2;
1053                }
1054                else {
1055                    increment = ( roundingMode == float_round_up ) && zSig2;
1056                }
1057            }
1058        }
1059    }
1060    if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1061    if ( increment ) {
1062        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1063        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1064    }
1065    else {
1066        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1067    }
1068    return packFloat128( zSign, zExp, zSig0, zSig1 );
1069
1070}
1071
1072/*
1073-------------------------------------------------------------------------------
1074Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075and significand formed by the concatenation of `zSig0' and `zSig1', and
1076returns the proper quadruple-precision floating-point value corresponding
1077to the abstract input.  This routine is just like `roundAndPackFloat128'
1078except that the input significand has fewer bits and does not have to be
1079normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1080point exponent.
1081-------------------------------------------------------------------------------
1082*/
1083static float128
1084 normalizeRoundAndPackFloat128(
1085     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1086{
1087    int8 shiftCount;
1088    bits64 zSig2;
1089
1090    if ( zSig0 == 0 ) {
1091        zSig0 = zSig1;
1092        zSig1 = 0;
1093        zExp -= 64;
1094    }
1095    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096    if ( 0 <= shiftCount ) {
1097        zSig2 = 0;
1098        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1099    }
1100    else {
1101        shift128ExtraRightJamming(
1102            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1103    }
1104    zExp -= shiftCount;
1105    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1106
1107}
1108
1109#endif
1110
1111/*
1112-------------------------------------------------------------------------------
1113Returns the result of converting the 32-bit two's complement integer `a'
1114to the single-precision floating-point format.  The conversion is performed
1115according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1116-------------------------------------------------------------------------------
1117*/
1118float32 int32_to_float32( int32 a )
1119{
1120    flag zSign;
1121
1122    if ( a == 0 ) return 0;
1123    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1124    zSign = ( a < 0 );
1125    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1126
1127}
1128
1129/*
1130-------------------------------------------------------------------------------
1131Returns the result of converting the 32-bit two's complement integer `a'
1132to the double-precision floating-point format.  The conversion is performed
1133according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1134-------------------------------------------------------------------------------
1135*/
1136float64 int32_to_float64( int32 a )
1137{
1138    flag zSign;
1139    uint32 absA;
1140    int8 shiftCount;
1141    bits64 zSig;
1142
1143    if ( a == 0 ) return 0;
1144    zSign = ( a < 0 );
1145    absA = zSign ? - a : a;
1146    shiftCount = countLeadingZeros32( absA ) + 21;
1147    zSig = absA;
1148    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1149
1150}
1151
1152#ifdef FLOATX80
1153
1154/*
1155-------------------------------------------------------------------------------
1156Returns the result of converting the 32-bit two's complement integer `a'
1157to the extended double-precision floating-point format.  The conversion
1158is performed according to the IEC/IEEE Standard for Binary Floating-Point
1159Arithmetic.
1160-------------------------------------------------------------------------------
1161*/
1162floatx80 int32_to_floatx80( int32 a )
1163{
1164    flag zSign;
1165    uint32 absA;
1166    int8 shiftCount;
1167    bits64 zSig;
1168
1169    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1170    zSign = ( a < 0 );
1171    absA = zSign ? - a : a;
1172    shiftCount = countLeadingZeros32( absA ) + 32;
1173    zSig = absA;
1174    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1175
1176}
1177
1178#endif
1179
1180#ifdef FLOAT128
1181
1182/*
1183-------------------------------------------------------------------------------
1184Returns the result of converting the 32-bit two's complement integer `a' to
1185the quadruple-precision floating-point format.  The conversion is performed
1186according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1187-------------------------------------------------------------------------------
1188*/
1189float128 int32_to_float128( int32 a )
1190{
1191    flag zSign;
1192    uint32 absA;
1193    int8 shiftCount;
1194    bits64 zSig0;
1195
1196    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1197    zSign = ( a < 0 );
1198    absA = zSign ? - a : a;
1199    shiftCount = countLeadingZeros32( absA ) + 17;
1200    zSig0 = absA;
1201    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1202
1203}
1204
1205#endif
1206
1207#ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1208/*
1209-------------------------------------------------------------------------------
1210Returns the result of converting the 64-bit two's complement integer `a'
1211to the single-precision floating-point format.  The conversion is performed
1212according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1213-------------------------------------------------------------------------------
1214*/
1215float32 int64_to_float32( int64 a )
1216{
1217    flag zSign;
1218    uint64 absA;
1219    int8 shiftCount;
1220
1221    if ( a == 0 ) return 0;
1222    zSign = ( a < 0 );
1223    absA = zSign ? - a : a;
1224    shiftCount = countLeadingZeros64( absA ) - 40;
1225    if ( 0 <= shiftCount ) {
1226        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1227    }
1228    else {
1229        shiftCount += 7;
1230        if ( shiftCount < 0 ) {
1231            shift64RightJamming( absA, - shiftCount, &absA );
1232        }
1233        else {
1234            absA <<= shiftCount;
1235        }
1236        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1237    }
1238
1239}
1240
1241/*
1242-------------------------------------------------------------------------------
1243Returns the result of converting the 64-bit two's complement integer `a'
1244to the double-precision floating-point format.  The conversion is performed
1245according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1246-------------------------------------------------------------------------------
1247*/
1248float64 int64_to_float64( int64 a )
1249{
1250    flag zSign;
1251
1252    if ( a == 0 ) return 0;
1253    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1254        return packFloat64( 1, 0x43E, 0 );
1255    }
1256    zSign = ( a < 0 );
1257    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1258
1259}
1260
1261#ifdef FLOATX80
1262
1263/*
1264-------------------------------------------------------------------------------
1265Returns the result of converting the 64-bit two's complement integer `a'
1266to the extended double-precision floating-point format.  The conversion
1267is performed according to the IEC/IEEE Standard for Binary Floating-Point
1268Arithmetic.
1269-------------------------------------------------------------------------------
1270*/
1271floatx80 int64_to_floatx80( int64 a )
1272{
1273    flag zSign;
1274    uint64 absA;
1275    int8 shiftCount;
1276
1277    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1278    zSign = ( a < 0 );
1279    absA = zSign ? - a : a;
1280    shiftCount = countLeadingZeros64( absA );
1281    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1282
1283}
1284
1285#endif
1286
1287#endif /* !SOFTFLOAT_FOR_GCC */
1288
1289#ifdef FLOAT128
1290
1291/*
1292-------------------------------------------------------------------------------
1293Returns the result of converting the 64-bit two's complement integer `a' to
1294the quadruple-precision floating-point format.  The conversion is performed
1295according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296-------------------------------------------------------------------------------
1297*/
1298float128 int64_to_float128( int64 a )
1299{
1300    flag zSign;
1301    uint64 absA;
1302    int8 shiftCount;
1303    int32 zExp;
1304    bits64 zSig0, zSig1;
1305
1306    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1307    zSign = ( a < 0 );
1308    absA = zSign ? - a : a;
1309    shiftCount = countLeadingZeros64( absA ) + 49;
1310    zExp = 0x406E - shiftCount;
1311    if ( 64 <= shiftCount ) {
1312        zSig1 = 0;
1313        zSig0 = absA;
1314        shiftCount -= 64;
1315    }
1316    else {
1317        zSig1 = absA;
1318        zSig0 = 0;
1319    }
1320    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1321    return packFloat128( zSign, zExp, zSig0, zSig1 );
1322
1323}
1324
1325#endif
1326
1327#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1328/*
1329-------------------------------------------------------------------------------
1330Returns the result of converting the single-precision floating-point value
1331`a' to the 32-bit two's complement integer format.  The conversion is
1332performed according to the IEC/IEEE Standard for Binary Floating-Point
1333Arithmetic---which means in particular that the conversion is rounded
1334according to the current rounding mode.  If `a' is a NaN, the largest
1335positive integer is returned.  Otherwise, if the conversion overflows, the
1336largest integer with the same sign as `a' is returned.
1337-------------------------------------------------------------------------------
1338*/
1339int32 float32_to_int32( float32 a )
1340{
1341    flag aSign;
1342    int16 aExp, shiftCount;
1343    bits32 aSig;
1344    bits64 aSig64;
1345
1346    aSig = extractFloat32Frac( a );
1347    aExp = extractFloat32Exp( a );
1348    aSign = extractFloat32Sign( a );
1349    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1350    if ( aExp ) aSig |= 0x00800000;
1351    shiftCount = 0xAF - aExp;
1352    aSig64 = aSig;
1353    aSig64 <<= 32;
1354    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1355    return roundAndPackInt32( aSign, aSig64 );
1356
1357}
1358#endif /* !SOFTFLOAT_FOR_GCC */
1359
1360/*
1361-------------------------------------------------------------------------------
1362Returns the result of converting the single-precision floating-point value
1363`a' to the 32-bit two's complement integer format.  The conversion is
1364performed according to the IEC/IEEE Standard for Binary Floating-Point
1365Arithmetic, except that the conversion is always rounded toward zero.
1366If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1367the conversion overflows, the largest integer with the same sign as `a' is
1368returned.
1369-------------------------------------------------------------------------------
1370*/
1371int32 float32_to_int32_round_to_zero( float32 a )
1372{
1373    flag aSign;
1374    int16 aExp, shiftCount;
1375    bits32 aSig;
1376    int32 z;
1377
1378    aSig = extractFloat32Frac( a );
1379    aExp = extractFloat32Exp( a );
1380    aSign = extractFloat32Sign( a );
1381    shiftCount = aExp - 0x9E;
1382    if ( 0 <= shiftCount ) {
1383        if ( a != 0xCF000000 ) {
1384            float_raise( float_flag_invalid );
1385            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1386        }
1387        return (sbits32) 0x80000000;
1388    }
1389    else if ( aExp <= 0x7E ) {
1390        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1391        return 0;
1392    }
1393    aSig = ( aSig | 0x00800000 )<<8;
1394    z = aSig>>( - shiftCount );
1395    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1396        float_exception_flags |= float_flag_inexact;
1397    }
1398    if ( aSign ) z = - z;
1399    return z;
1400
1401}
1402
1403#ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1404/*
1405-------------------------------------------------------------------------------
1406Returns the result of converting the single-precision floating-point value
1407`a' to the 64-bit two's complement integer format.  The conversion is
1408performed according to the IEC/IEEE Standard for Binary Floating-Point
1409Arithmetic---which means in particular that the conversion is rounded
1410according to the current rounding mode.  If `a' is a NaN, the largest
1411positive integer is returned.  Otherwise, if the conversion overflows, the
1412largest integer with the same sign as `a' is returned.
1413-------------------------------------------------------------------------------
1414*/
1415int64 float32_to_int64( float32 a )
1416{
1417    flag aSign;
1418    int16 aExp, shiftCount;
1419    bits32 aSig;
1420    bits64 aSig64, aSigExtra;
1421
1422    aSig = extractFloat32Frac( a );
1423    aExp = extractFloat32Exp( a );
1424    aSign = extractFloat32Sign( a );
1425    shiftCount = 0xBE - aExp;
1426    if ( shiftCount < 0 ) {
1427        float_raise( float_flag_invalid );
1428        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1429            return LIT64( 0x7FFFFFFFFFFFFFFF );
1430        }
1431        return (sbits64) LIT64( 0x8000000000000000 );
1432    }
1433    if ( aExp ) aSig |= 0x00800000;
1434    aSig64 = aSig;
1435    aSig64 <<= 40;
1436    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1437    return roundAndPackInt64( aSign, aSig64, aSigExtra );
1438
1439}
1440
1441/*
1442-------------------------------------------------------------------------------
1443Returns the result of converting the single-precision floating-point value
1444`a' to the 64-bit two's complement integer format.  The conversion is
1445performed according to the IEC/IEEE Standard for Binary Floating-Point
1446Arithmetic, except that the conversion is always rounded toward zero.  If
1447`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1448conversion overflows, the largest integer with the same sign as `a' is
1449returned.
1450-------------------------------------------------------------------------------
1451*/
1452int64 float32_to_int64_round_to_zero( float32 a )
1453{
1454    flag aSign;
1455    int16 aExp, shiftCount;
1456    bits32 aSig;
1457    bits64 aSig64;
1458    int64 z;
1459
1460    aSig = extractFloat32Frac( a );
1461    aExp = extractFloat32Exp( a );
1462    aSign = extractFloat32Sign( a );
1463    shiftCount = aExp - 0xBE;
1464    if ( 0 <= shiftCount ) {
1465        if ( a != 0xDF000000 ) {
1466            float_raise( float_flag_invalid );
1467            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1468                return LIT64( 0x7FFFFFFFFFFFFFFF );
1469            }
1470        }
1471        return (sbits64) LIT64( 0x8000000000000000 );
1472    }
1473    else if ( aExp <= 0x7E ) {
1474        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1475        return 0;
1476    }
1477    aSig64 = aSig | 0x00800000;
1478    aSig64 <<= 40;
1479    z = aSig64>>( - shiftCount );
1480    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1481        float_exception_flags |= float_flag_inexact;
1482    }
1483    if ( aSign ) z = - z;
1484    return z;
1485
1486}
1487#endif /* !SOFTFLOAT_FOR_GCC */
1488
1489/*
1490-------------------------------------------------------------------------------
1491Returns the result of converting the single-precision floating-point value
1492`a' to the double-precision floating-point format.  The conversion is
1493performed according to the IEC/IEEE Standard for Binary Floating-Point
1494Arithmetic.
1495-------------------------------------------------------------------------------
1496*/
1497float64 float32_to_float64( float32 a )
1498{
1499    flag aSign;
1500    int16 aExp;
1501    bits32 aSig;
1502
1503    aSig = extractFloat32Frac( a );
1504    aExp = extractFloat32Exp( a );
1505    aSign = extractFloat32Sign( a );
1506    if ( aExp == 0xFF ) {
1507        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1508        return packFloat64( aSign, 0x7FF, 0 );
1509    }
1510    if ( aExp == 0 ) {
1511        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1512        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1513        --aExp;
1514    }
1515    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1516
1517}
1518
1519#ifdef FLOATX80
1520
1521/*
1522-------------------------------------------------------------------------------
1523Returns the result of converting the single-precision floating-point value
1524`a' to the extended double-precision floating-point format.  The conversion
1525is performed according to the IEC/IEEE Standard for Binary Floating-Point
1526Arithmetic.
1527-------------------------------------------------------------------------------
1528*/
1529floatx80 float32_to_floatx80( float32 a )
1530{
1531    flag aSign;
1532    int16 aExp;
1533    bits32 aSig;
1534
1535    aSig = extractFloat32Frac( a );
1536    aExp = extractFloat32Exp( a );
1537    aSign = extractFloat32Sign( a );
1538    if ( aExp == 0xFF ) {
1539        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1540        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1541    }
1542    if ( aExp == 0 ) {
1543        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1544        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1545    }
1546    aSig |= 0x00800000;
1547    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1548
1549}
1550
1551#endif
1552
1553#ifdef FLOAT128
1554
1555/*
1556-------------------------------------------------------------------------------
1557Returns the result of converting the single-precision floating-point value
1558`a' to the double-precision floating-point format.  The conversion is
1559performed according to the IEC/IEEE Standard for Binary Floating-Point
1560Arithmetic.
1561-------------------------------------------------------------------------------
1562*/
1563float128 float32_to_float128( float32 a )
1564{
1565    flag aSign;
1566    int16 aExp;
1567    bits32 aSig;
1568
1569    aSig = extractFloat32Frac( a );
1570    aExp = extractFloat32Exp( a );
1571    aSign = extractFloat32Sign( a );
1572    if ( aExp == 0xFF ) {
1573        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1574        return packFloat128( aSign, 0x7FFF, 0, 0 );
1575    }
1576    if ( aExp == 0 ) {
1577        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1578        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1579        --aExp;
1580    }
1581    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1582
1583}
1584
1585#endif
1586
1587#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1588/*
1589-------------------------------------------------------------------------------
1590Rounds the single-precision floating-point value `a' to an integer, and
1591returns the result as a single-precision floating-point value.  The
1592operation is performed according to the IEC/IEEE Standard for Binary
1593Floating-Point Arithmetic.
1594-------------------------------------------------------------------------------
1595*/
1596float32 float32_round_to_int( float32 a )
1597{
1598    flag aSign;
1599    int16 aExp;
1600    bits32 lastBitMask, roundBitsMask;
1601    int8 roundingMode;
1602    float32 z;
1603
1604    aExp = extractFloat32Exp( a );
1605    if ( 0x96 <= aExp ) {
1606        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1607            return propagateFloat32NaN( a, a );
1608        }
1609        return a;
1610    }
1611    if ( aExp <= 0x7E ) {
1612        if ( (bits32) ( a<<1 ) == 0 ) return a;
1613        float_exception_flags |= float_flag_inexact;
1614        aSign = extractFloat32Sign( a );
1615        switch ( float_rounding_mode ) {
1616         case float_round_nearest_even:
1617            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1618                return packFloat32( aSign, 0x7F, 0 );
1619            }
1620            break;
1621	 case float_round_to_zero:
1622	    break;
1623         case float_round_down:
1624            return aSign ? 0xBF800000 : 0;
1625         case float_round_up:
1626            return aSign ? 0x80000000 : 0x3F800000;
1627        }
1628        return packFloat32( aSign, 0, 0 );
1629    }
1630    lastBitMask = 1;
1631    lastBitMask <<= 0x96 - aExp;
1632    roundBitsMask = lastBitMask - 1;
1633    z = a;
1634    roundingMode = float_rounding_mode;
1635    if ( roundingMode == float_round_nearest_even ) {
1636        z += lastBitMask>>1;
1637        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1638    }
1639    else if ( roundingMode != float_round_to_zero ) {
1640        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1641            z += roundBitsMask;
1642        }
1643    }
1644    z &= ~ roundBitsMask;
1645    if ( z != a ) float_exception_flags |= float_flag_inexact;
1646    return z;
1647
1648}
1649#endif /* !SOFTFLOAT_FOR_GCC */
1650
1651/*
1652-------------------------------------------------------------------------------
1653Returns the result of adding the absolute values of the single-precision
1654floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1655before being returned.  `zSign' is ignored if the result is a NaN.
1656The addition is performed according to the IEC/IEEE Standard for Binary
1657Floating-Point Arithmetic.
1658-------------------------------------------------------------------------------
1659*/
1660static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1661{
1662    int16 aExp, bExp, zExp;
1663    bits32 aSig, bSig, zSig;
1664    int16 expDiff;
1665
1666    aSig = extractFloat32Frac( a );
1667    aExp = extractFloat32Exp( a );
1668    bSig = extractFloat32Frac( b );
1669    bExp = extractFloat32Exp( b );
1670    expDiff = aExp - bExp;
1671    aSig <<= 6;
1672    bSig <<= 6;
1673    if ( 0 < expDiff ) {
1674        if ( aExp == 0xFF ) {
1675            if ( aSig ) return propagateFloat32NaN( a, b );
1676            return a;
1677        }
1678        if ( bExp == 0 ) {
1679            --expDiff;
1680        }
1681        else {
1682            bSig |= 0x20000000;
1683        }
1684        shift32RightJamming( bSig, expDiff, &bSig );
1685        zExp = aExp;
1686    }
1687    else if ( expDiff < 0 ) {
1688        if ( bExp == 0xFF ) {
1689            if ( bSig ) return propagateFloat32NaN( a, b );
1690            return packFloat32( zSign, 0xFF, 0 );
1691        }
1692        if ( aExp == 0 ) {
1693            ++expDiff;
1694        }
1695        else {
1696            aSig |= 0x20000000;
1697        }
1698        shift32RightJamming( aSig, - expDiff, &aSig );
1699        zExp = bExp;
1700    }
1701    else {
1702        if ( aExp == 0xFF ) {
1703            if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1704            return a;
1705        }
1706        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1707        zSig = 0x40000000 + aSig + bSig;
1708        zExp = aExp;
1709        goto roundAndPack;
1710    }
1711    aSig |= 0x20000000;
1712    zSig = ( aSig + bSig )<<1;
1713    --zExp;
1714    if ( (sbits32) zSig < 0 ) {
1715        zSig = aSig + bSig;
1716        ++zExp;
1717    }
1718 roundAndPack:
1719    return roundAndPackFloat32( zSign, zExp, zSig );
1720
1721}
1722
1723/*
1724-------------------------------------------------------------------------------
1725Returns the result of subtracting the absolute values of the single-
1726precision floating-point values `a' and `b'.  If `zSign' is 1, the
1727difference is negated before being returned.  `zSign' is ignored if the
1728result is a NaN.  The subtraction is performed according to the IEC/IEEE
1729Standard for Binary Floating-Point Arithmetic.
1730-------------------------------------------------------------------------------
1731*/
1732static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1733{
1734    int16 aExp, bExp, zExp;
1735    bits32 aSig, bSig, zSig;
1736    int16 expDiff;
1737
1738    aSig = extractFloat32Frac( a );
1739    aExp = extractFloat32Exp( a );
1740    bSig = extractFloat32Frac( b );
1741    bExp = extractFloat32Exp( b );
1742    expDiff = aExp - bExp;
1743    aSig <<= 7;
1744    bSig <<= 7;
1745    if ( 0 < expDiff ) goto aExpBigger;
1746    if ( expDiff < 0 ) goto bExpBigger;
1747    if ( aExp == 0xFF ) {
1748        if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1749        float_raise( float_flag_invalid );
1750        return float32_default_nan;
1751    }
1752    if ( aExp == 0 ) {
1753        aExp = 1;
1754        bExp = 1;
1755    }
1756    if ( bSig < aSig ) goto aBigger;
1757    if ( aSig < bSig ) goto bBigger;
1758    return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1759 bExpBigger:
1760    if ( bExp == 0xFF ) {
1761        if ( bSig ) return propagateFloat32NaN( a, b );
1762        return packFloat32( zSign ^ 1, 0xFF, 0 );
1763    }
1764    if ( aExp == 0 ) {
1765        ++expDiff;
1766    }
1767    else {
1768        aSig |= 0x40000000;
1769    }
1770    shift32RightJamming( aSig, - expDiff, &aSig );
1771    bSig |= 0x40000000;
1772 bBigger:
1773    zSig = bSig - aSig;
1774    zExp = bExp;
1775    zSign ^= 1;
1776    goto normalizeRoundAndPack;
1777 aExpBigger:
1778    if ( aExp == 0xFF ) {
1779        if ( aSig ) return propagateFloat32NaN( a, b );
1780        return a;
1781    }
1782    if ( bExp == 0 ) {
1783        --expDiff;
1784    }
1785    else {
1786        bSig |= 0x40000000;
1787    }
1788    shift32RightJamming( bSig, expDiff, &bSig );
1789    aSig |= 0x40000000;
1790 aBigger:
1791    zSig = aSig - bSig;
1792    zExp = aExp;
1793 normalizeRoundAndPack:
1794    --zExp;
1795    return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1796
1797}
1798
1799/*
1800-------------------------------------------------------------------------------
1801Returns the result of adding the single-precision floating-point values `a'
1802and `b'.  The operation is performed according to the IEC/IEEE Standard for
1803Binary Floating-Point Arithmetic.
1804-------------------------------------------------------------------------------
1805*/
1806float32 float32_add( float32 a, float32 b )
1807{
1808    flag aSign, bSign;
1809
1810    aSign = extractFloat32Sign( a );
1811    bSign = extractFloat32Sign( b );
1812    if ( aSign == bSign ) {
1813        return addFloat32Sigs( a, b, aSign );
1814    }
1815    else {
1816        return subFloat32Sigs( a, b, aSign );
1817    }
1818
1819}
1820
1821/*
1822-------------------------------------------------------------------------------
1823Returns the result of subtracting the single-precision floating-point values
1824`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1825for Binary Floating-Point Arithmetic.
1826-------------------------------------------------------------------------------
1827*/
1828float32 float32_sub( float32 a, float32 b )
1829{
1830    flag aSign, bSign;
1831
1832    aSign = extractFloat32Sign( a );
1833    bSign = extractFloat32Sign( b );
1834    if ( aSign == bSign ) {
1835        return subFloat32Sigs( a, b, aSign );
1836    }
1837    else {
1838        return addFloat32Sigs( a, b, aSign );
1839    }
1840
1841}
1842
1843/*
1844-------------------------------------------------------------------------------
1845Returns the result of multiplying the single-precision floating-point values
1846`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1847for Binary Floating-Point Arithmetic.
1848-------------------------------------------------------------------------------
1849*/
1850float32 float32_mul( float32 a, float32 b )
1851{
1852    flag aSign, bSign, zSign;
1853    int16 aExp, bExp, zExp;
1854    bits32 aSig, bSig;
1855    bits64 zSig64;
1856    bits32 zSig;
1857
1858    aSig = extractFloat32Frac( a );
1859    aExp = extractFloat32Exp( a );
1860    aSign = extractFloat32Sign( a );
1861    bSig = extractFloat32Frac( b );
1862    bExp = extractFloat32Exp( b );
1863    bSign = extractFloat32Sign( b );
1864    zSign = aSign ^ bSign;
1865    if ( aExp == 0xFF ) {
1866        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1867            return propagateFloat32NaN( a, b );
1868        }
1869        if ( ( bExp | bSig ) == 0 ) {
1870            float_raise( float_flag_invalid );
1871            return float32_default_nan;
1872        }
1873        return packFloat32( zSign, 0xFF, 0 );
1874    }
1875    if ( bExp == 0xFF ) {
1876        if ( bSig ) return propagateFloat32NaN( a, b );
1877        if ( ( aExp | aSig ) == 0 ) {
1878            float_raise( float_flag_invalid );
1879            return float32_default_nan;
1880        }
1881        return packFloat32( zSign, 0xFF, 0 );
1882    }
1883    if ( aExp == 0 ) {
1884        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1885        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1886    }
1887    if ( bExp == 0 ) {
1888        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1889        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1890    }
1891    zExp = aExp + bExp - 0x7F;
1892    aSig = ( aSig | 0x00800000 )<<7;
1893    bSig = ( bSig | 0x00800000 )<<8;
1894    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1895    zSig = zSig64;
1896    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1897        zSig <<= 1;
1898        --zExp;
1899    }
1900    return roundAndPackFloat32( zSign, zExp, zSig );
1901
1902}
1903
1904/*
1905-------------------------------------------------------------------------------
1906Returns the result of dividing the single-precision floating-point value `a'
1907by the corresponding value `b'.  The operation is performed according to the
1908IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909-------------------------------------------------------------------------------
1910*/
1911float32 float32_div( float32 a, float32 b )
1912{
1913    flag aSign, bSign, zSign;
1914    int16 aExp, bExp, zExp;
1915    bits32 aSig, bSig, zSig;
1916
1917    aSig = extractFloat32Frac( a );
1918    aExp = extractFloat32Exp( a );
1919    aSign = extractFloat32Sign( a );
1920    bSig = extractFloat32Frac( b );
1921    bExp = extractFloat32Exp( b );
1922    bSign = extractFloat32Sign( b );
1923    zSign = aSign ^ bSign;
1924    if ( aExp == 0xFF ) {
1925        if ( aSig ) return propagateFloat32NaN( a, b );
1926        if ( bExp == 0xFF ) {
1927            if ( bSig ) return propagateFloat32NaN( a, b );
1928            float_raise( float_flag_invalid );
1929            return float32_default_nan;
1930        }
1931        return packFloat32( zSign, 0xFF, 0 );
1932    }
1933    if ( bExp == 0xFF ) {
1934        if ( bSig ) return propagateFloat32NaN( a, b );
1935        return packFloat32( zSign, 0, 0 );
1936    }
1937    if ( bExp == 0 ) {
1938        if ( bSig == 0 ) {
1939            if ( ( aExp | aSig ) == 0 ) {
1940                float_raise( float_flag_invalid );
1941                return float32_default_nan;
1942            }
1943            float_raise( float_flag_divbyzero );
1944            return packFloat32( zSign, 0xFF, 0 );
1945        }
1946        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1947    }
1948    if ( aExp == 0 ) {
1949        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1950        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1951    }
1952    zExp = aExp - bExp + 0x7D;
1953    aSig = ( aSig | 0x00800000 )<<7;
1954    bSig = ( bSig | 0x00800000 )<<8;
1955    if ( bSig <= ( aSig + aSig ) ) {
1956        aSig >>= 1;
1957        ++zExp;
1958    }
1959    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1960    if ( ( zSig & 0x3F ) == 0 ) {
1961        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1962    }
1963    return roundAndPackFloat32( zSign, zExp, zSig );
1964
1965}
1966
1967#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1968/*
1969-------------------------------------------------------------------------------
1970Returns the remainder of the single-precision floating-point value `a'
1971with respect to the corresponding value `b'.  The operation is performed
1972according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1973-------------------------------------------------------------------------------
1974*/
1975float32 float32_rem( float32 a, float32 b )
1976{
1977    flag aSign, bSign, zSign;
1978    int16 aExp, bExp, expDiff;
1979    bits32 aSig, bSig;
1980    bits32 q;
1981    bits64 aSig64, bSig64, q64;
1982    bits32 alternateASig;
1983    sbits32 sigMean;
1984
1985    aSig = extractFloat32Frac( a );
1986    aExp = extractFloat32Exp( a );
1987    aSign = extractFloat32Sign( a );
1988    bSig = extractFloat32Frac( b );
1989    bExp = extractFloat32Exp( b );
1990    bSign = extractFloat32Sign( b );
1991    if ( aExp == 0xFF ) {
1992        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1993            return propagateFloat32NaN( a, b );
1994        }
1995        float_raise( float_flag_invalid );
1996        return float32_default_nan;
1997    }
1998    if ( bExp == 0xFF ) {
1999        if ( bSig ) return propagateFloat32NaN( a, b );
2000        return a;
2001    }
2002    if ( bExp == 0 ) {
2003        if ( bSig == 0 ) {
2004            float_raise( float_flag_invalid );
2005            return float32_default_nan;
2006        }
2007        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2008    }
2009    if ( aExp == 0 ) {
2010        if ( aSig == 0 ) return a;
2011        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2012    }
2013    expDiff = aExp - bExp;
2014    aSig |= 0x00800000;
2015    bSig |= 0x00800000;
2016    if ( expDiff < 32 ) {
2017        aSig <<= 8;
2018        bSig <<= 8;
2019        if ( expDiff < 0 ) {
2020            if ( expDiff < -1 ) return a;
2021            aSig >>= 1;
2022        }
2023        q = ( bSig <= aSig );
2024        if ( q ) aSig -= bSig;
2025        if ( 0 < expDiff ) {
2026            q = ( ( (bits64) aSig )<<32 ) / bSig;
2027            q >>= 32 - expDiff;
2028            bSig >>= 2;
2029            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2030        }
2031        else {
2032            aSig >>= 2;
2033            bSig >>= 2;
2034        }
2035    }
2036    else {
2037        if ( bSig <= aSig ) aSig -= bSig;
2038        aSig64 = ( (bits64) aSig )<<40;
2039        bSig64 = ( (bits64) bSig )<<40;
2040        expDiff -= 64;
2041        while ( 0 < expDiff ) {
2042            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2043            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2044            aSig64 = - ( ( bSig * q64 )<<38 );
2045            expDiff -= 62;
2046        }
2047        expDiff += 64;
2048        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2049        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2050        q = q64>>( 64 - expDiff );
2051        bSig <<= 6;
2052        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2053    }
2054    do {
2055        alternateASig = aSig;
2056        ++q;
2057        aSig -= bSig;
2058    } while ( 0 <= (sbits32) aSig );
2059    sigMean = aSig + alternateASig;
2060    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2061        aSig = alternateASig;
2062    }
2063    zSign = ( (sbits32) aSig < 0 );
2064    if ( zSign ) aSig = - aSig;
2065    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2066
2067}
2068#endif /* !SOFTFLOAT_FOR_GCC */
2069
2070#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2071/*
2072-------------------------------------------------------------------------------
2073Returns the square root of the single-precision floating-point value `a'.
2074The operation is performed according to the IEC/IEEE Standard for Binary
2075Floating-Point Arithmetic.
2076-------------------------------------------------------------------------------
2077*/
2078float32 float32_sqrt( float32 a )
2079{
2080    flag aSign;
2081    int16 aExp, zExp;
2082    bits32 aSig, zSig;
2083    bits64 rem, term;
2084
2085    aSig = extractFloat32Frac( a );
2086    aExp = extractFloat32Exp( a );
2087    aSign = extractFloat32Sign( a );
2088    if ( aExp == 0xFF ) {
2089        if ( aSig ) return propagateFloat32NaN( a, 0 );
2090        if ( ! aSign ) return a;
2091        float_raise( float_flag_invalid );
2092        return float32_default_nan;
2093    }
2094    if ( aSign ) {
2095        if ( ( aExp | aSig ) == 0 ) return a;
2096        float_raise( float_flag_invalid );
2097        return float32_default_nan;
2098    }
2099    if ( aExp == 0 ) {
2100        if ( aSig == 0 ) return 0;
2101        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2102    }
2103    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2104    aSig = ( aSig | 0x00800000 )<<8;
2105    zSig = estimateSqrt32( aExp, aSig ) + 2;
2106    if ( ( zSig & 0x7F ) <= 5 ) {
2107        if ( zSig < 2 ) {
2108            zSig = 0x7FFFFFFF;
2109            goto roundAndPack;
2110        }
2111        aSig >>= aExp & 1;
2112        term = ( (bits64) zSig ) * zSig;
2113        rem = ( ( (bits64) aSig )<<32 ) - term;
2114        while ( (sbits64) rem < 0 ) {
2115            --zSig;
2116            rem += ( ( (bits64) zSig )<<1 ) | 1;
2117        }
2118        zSig |= ( rem != 0 );
2119    }
2120    shift32RightJamming( zSig, 1, &zSig );
2121 roundAndPack:
2122    return roundAndPackFloat32( 0, zExp, zSig );
2123
2124}
2125#endif /* !SOFTFLOAT_FOR_GCC */
2126
2127/*
2128-------------------------------------------------------------------------------
2129Returns 1 if the single-precision floating-point value `a' is equal to
2130the corresponding value `b', and 0 otherwise.  The comparison is performed
2131according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2132-------------------------------------------------------------------------------
2133*/
2134flag float32_eq( float32 a, float32 b )
2135{
2136
2137    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2138         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2139       ) {
2140        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2141            float_raise( float_flag_invalid );
2142        }
2143        return 0;
2144    }
2145    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2146
2147}
2148
2149/*
2150-------------------------------------------------------------------------------
2151Returns 1 if the single-precision floating-point value `a' is less than
2152or equal to the corresponding value `b', and 0 otherwise.  The comparison
2153is performed according to the IEC/IEEE Standard for Binary Floating-Point
2154Arithmetic.
2155-------------------------------------------------------------------------------
2156*/
2157flag float32_le( float32 a, float32 b )
2158{
2159    flag aSign, bSign;
2160
2161    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2162         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2163       ) {
2164        float_raise( float_flag_invalid );
2165        return 0;
2166    }
2167    aSign = extractFloat32Sign( a );
2168    bSign = extractFloat32Sign( b );
2169    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2170    return ( a == b ) || ( aSign ^ ( a < b ) );
2171
2172}
2173
2174/*
2175-------------------------------------------------------------------------------
2176Returns 1 if the single-precision floating-point value `a' is less than
2177the corresponding value `b', and 0 otherwise.  The comparison is performed
2178according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2179-------------------------------------------------------------------------------
2180*/
2181flag float32_lt( float32 a, float32 b )
2182{
2183    flag aSign, bSign;
2184
2185    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2186         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2187       ) {
2188        float_raise( float_flag_invalid );
2189        return 0;
2190    }
2191    aSign = extractFloat32Sign( a );
2192    bSign = extractFloat32Sign( b );
2193    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2194    return ( a != b ) && ( aSign ^ ( a < b ) );
2195
2196}
2197
2198#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2199/*
2200-------------------------------------------------------------------------------
2201Returns 1 if the single-precision floating-point value `a' is equal to
2202the corresponding value `b', and 0 otherwise.  The invalid exception is
2203raised if either operand is a NaN.  Otherwise, the comparison is performed
2204according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2205-------------------------------------------------------------------------------
2206*/
2207flag float32_eq_signaling( float32 a, float32 b )
2208{
2209
2210    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2211         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2212       ) {
2213        float_raise( float_flag_invalid );
2214        return 0;
2215    }
2216    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2217
2218}
2219
2220/*
2221-------------------------------------------------------------------------------
2222Returns 1 if the single-precision floating-point value `a' is less than or
2223equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2224cause an exception.  Otherwise, the comparison is performed according to the
2225IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2226-------------------------------------------------------------------------------
2227*/
2228flag float32_le_quiet( float32 a, float32 b )
2229{
2230    flag aSign, bSign;
2231
2232    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2233         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2234       ) {
2235        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2236            float_raise( float_flag_invalid );
2237        }
2238        return 0;
2239    }
2240    aSign = extractFloat32Sign( a );
2241    bSign = extractFloat32Sign( b );
2242    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2243    return ( a == b ) || ( aSign ^ ( a < b ) );
2244
2245}
2246
2247/*
2248-------------------------------------------------------------------------------
2249Returns 1 if the single-precision floating-point value `a' is less than
2250the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2251exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2252Standard for Binary Floating-Point Arithmetic.
2253-------------------------------------------------------------------------------
2254*/
2255flag float32_lt_quiet( float32 a, float32 b )
2256{
2257    flag aSign, bSign;
2258
2259    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2260         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2261       ) {
2262        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2263            float_raise( float_flag_invalid );
2264        }
2265        return 0;
2266    }
2267    aSign = extractFloat32Sign( a );
2268    bSign = extractFloat32Sign( b );
2269    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2270    return ( a != b ) && ( aSign ^ ( a < b ) );
2271
2272}
2273#endif /* !SOFTFLOAT_FOR_GCC */
2274
2275#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2276/*
2277-------------------------------------------------------------------------------
2278Returns the result of converting the double-precision floating-point value
2279`a' to the 32-bit two's complement integer format.  The conversion is
2280performed according to the IEC/IEEE Standard for Binary Floating-Point
2281Arithmetic---which means in particular that the conversion is rounded
2282according to the current rounding mode.  If `a' is a NaN, the largest
2283positive integer is returned.  Otherwise, if the conversion overflows, the
2284largest integer with the same sign as `a' is returned.
2285-------------------------------------------------------------------------------
2286*/
2287int32 float64_to_int32( float64 a )
2288{
2289    flag aSign;
2290    int16 aExp, shiftCount;
2291    bits64 aSig;
2292
2293    aSig = extractFloat64Frac( a );
2294    aExp = extractFloat64Exp( a );
2295    aSign = extractFloat64Sign( a );
2296    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2297    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2298    shiftCount = 0x42C - aExp;
2299    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2300    return roundAndPackInt32( aSign, aSig );
2301
2302}
2303#endif /* !SOFTFLOAT_FOR_GCC */
2304
2305/*
2306-------------------------------------------------------------------------------
2307Returns the result of converting the double-precision floating-point value
2308`a' to the 32-bit two's complement integer format.  The conversion is
2309performed according to the IEC/IEEE Standard for Binary Floating-Point
2310Arithmetic, except that the conversion is always rounded toward zero.
2311If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2312the conversion overflows, the largest integer with the same sign as `a' is
2313returned.
2314-------------------------------------------------------------------------------
2315*/
2316int32 float64_to_int32_round_to_zero( float64 a )
2317{
2318    flag aSign;
2319    int16 aExp, shiftCount;
2320    bits64 aSig, savedASig;
2321    int32 z;
2322
2323    aSig = extractFloat64Frac( a );
2324    aExp = extractFloat64Exp( a );
2325    aSign = extractFloat64Sign( a );
2326    if ( 0x41E < aExp ) {
2327        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2328        goto invalid;
2329    }
2330    else if ( aExp < 0x3FF ) {
2331        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2332        return 0;
2333    }
2334    aSig |= LIT64( 0x0010000000000000 );
2335    shiftCount = 0x433 - aExp;
2336    savedASig = aSig;
2337    aSig >>= shiftCount;
2338    z = aSig;
2339    if ( aSign ) z = - z;
2340    if ( ( z < 0 ) ^ aSign ) {
2341 invalid:
2342        float_raise( float_flag_invalid );
2343        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2344    }
2345    if ( ( aSig<<shiftCount ) != savedASig ) {
2346        float_exception_flags |= float_flag_inexact;
2347    }
2348    return z;
2349
2350}
2351
2352#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2353/*
2354-------------------------------------------------------------------------------
2355Returns the result of converting the double-precision floating-point value
2356`a' to the 64-bit two's complement integer format.  The conversion is
2357performed according to the IEC/IEEE Standard for Binary Floating-Point
2358Arithmetic---which means in particular that the conversion is rounded
2359according to the current rounding mode.  If `a' is a NaN, the largest
2360positive integer is returned.  Otherwise, if the conversion overflows, the
2361largest integer with the same sign as `a' is returned.
2362-------------------------------------------------------------------------------
2363*/
2364int64 float64_to_int64( float64 a )
2365{
2366    flag aSign;
2367    int16 aExp, shiftCount;
2368    bits64 aSig, aSigExtra;
2369
2370    aSig = extractFloat64Frac( a );
2371    aExp = extractFloat64Exp( a );
2372    aSign = extractFloat64Sign( a );
2373    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2374    shiftCount = 0x433 - aExp;
2375    if ( shiftCount <= 0 ) {
2376        if ( 0x43E < aExp ) {
2377            float_raise( float_flag_invalid );
2378            if (    ! aSign
2379                 || (    ( aExp == 0x7FF )
2380                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2381               ) {
2382                return LIT64( 0x7FFFFFFFFFFFFFFF );
2383            }
2384            return (sbits64) LIT64( 0x8000000000000000 );
2385        }
2386        aSigExtra = 0;
2387        aSig <<= - shiftCount;
2388    }
2389    else {
2390        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2391    }
2392    return roundAndPackInt64( aSign, aSig, aSigExtra );
2393
2394}
2395
2396/*
2397-------------------------------------------------------------------------------
2398Returns the result of converting the double-precision floating-point value
2399`a' to the 64-bit two's complement integer format.  The conversion is
2400performed according to the IEC/IEEE Standard for Binary Floating-Point
2401Arithmetic, except that the conversion is always rounded toward zero.
2402If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2403the conversion overflows, the largest integer with the same sign as `a' is
2404returned.
2405-------------------------------------------------------------------------------
2406*/
2407int64 float64_to_int64_round_to_zero( float64 a )
2408{
2409    flag aSign;
2410    int16 aExp, shiftCount;
2411    bits64 aSig;
2412    int64 z;
2413
2414    aSig = extractFloat64Frac( a );
2415    aExp = extractFloat64Exp( a );
2416    aSign = extractFloat64Sign( a );
2417    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2418    shiftCount = aExp - 0x433;
2419    if ( 0 <= shiftCount ) {
2420        if ( 0x43E <= aExp ) {
2421            if ( a != LIT64( 0xC3E0000000000000 ) ) {
2422                float_raise( float_flag_invalid );
2423                if (    ! aSign
2424                     || (    ( aExp == 0x7FF )
2425                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2426                   ) {
2427                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2428                }
2429            }
2430            return (sbits64) LIT64( 0x8000000000000000 );
2431        }
2432        z = aSig<<shiftCount;
2433    }
2434    else {
2435        if ( aExp < 0x3FE ) {
2436            if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2437            return 0;
2438        }
2439        z = aSig>>( - shiftCount );
2440        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2441            float_exception_flags |= float_flag_inexact;
2442        }
2443    }
2444    if ( aSign ) z = - z;
2445    return z;
2446
2447}
2448#endif /* !SOFTFLOAT_FOR_GCC */
2449
2450/*
2451-------------------------------------------------------------------------------
2452Returns the result of converting the double-precision floating-point value
2453`a' to the single-precision floating-point format.  The conversion is
2454performed according to the IEC/IEEE Standard for Binary Floating-Point
2455Arithmetic.
2456-------------------------------------------------------------------------------
2457*/
2458float32 float64_to_float32( float64 a )
2459{
2460    flag aSign;
2461    int16 aExp;
2462    bits64 aSig;
2463    bits32 zSig;
2464
2465    aSig = extractFloat64Frac( a );
2466    aExp = extractFloat64Exp( a );
2467    aSign = extractFloat64Sign( a );
2468    if ( aExp == 0x7FF ) {
2469        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2470        return packFloat32( aSign, 0xFF, 0 );
2471    }
2472    shift64RightJamming( aSig, 22, &aSig );
2473    zSig = aSig;
2474    if ( aExp || zSig ) {
2475        zSig |= 0x40000000;
2476        aExp -= 0x381;
2477    }
2478    return roundAndPackFloat32( aSign, aExp, zSig );
2479
2480}
2481
2482#ifdef FLOATX80
2483
2484/*
2485-------------------------------------------------------------------------------
2486Returns the result of converting the double-precision floating-point value
2487`a' to the extended double-precision floating-point format.  The conversion
2488is performed according to the IEC/IEEE Standard for Binary Floating-Point
2489Arithmetic.
2490-------------------------------------------------------------------------------
2491*/
2492floatx80 float64_to_floatx80( float64 a )
2493{
2494    flag aSign;
2495    int16 aExp;
2496    bits64 aSig;
2497
2498    aSig = extractFloat64Frac( a );
2499    aExp = extractFloat64Exp( a );
2500    aSign = extractFloat64Sign( a );
2501    if ( aExp == 0x7FF ) {
2502        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2503        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2504    }
2505    if ( aExp == 0 ) {
2506        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2507        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2508    }
2509    return
2510        packFloatx80(
2511            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2512
2513}
2514
2515#endif
2516
2517#ifdef FLOAT128
2518
2519/*
2520-------------------------------------------------------------------------------
2521Returns the result of converting the double-precision floating-point value
2522`a' to the quadruple-precision floating-point format.  The conversion is
2523performed according to the IEC/IEEE Standard for Binary Floating-Point
2524Arithmetic.
2525-------------------------------------------------------------------------------
2526*/
2527float128 float64_to_float128( float64 a )
2528{
2529    flag aSign;
2530    int16 aExp;
2531    bits64 aSig, zSig0, zSig1;
2532
2533    aSig = extractFloat64Frac( a );
2534    aExp = extractFloat64Exp( a );
2535    aSign = extractFloat64Sign( a );
2536    if ( aExp == 0x7FF ) {
2537        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2538        return packFloat128( aSign, 0x7FFF, 0, 0 );
2539    }
2540    if ( aExp == 0 ) {
2541        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2542        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2543        --aExp;
2544    }
2545    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2546    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2547
2548}
2549
2550#endif
2551
2552#ifndef SOFTFLOAT_FOR_GCC
2553/*
2554-------------------------------------------------------------------------------
2555Rounds the double-precision floating-point value `a' to an integer, and
2556returns the result as a double-precision floating-point value.  The
2557operation is performed according to the IEC/IEEE Standard for Binary
2558Floating-Point Arithmetic.
2559-------------------------------------------------------------------------------
2560*/
2561float64 float64_round_to_int( float64 a )
2562{
2563    flag aSign;
2564    int16 aExp;
2565    bits64 lastBitMask, roundBitsMask;
2566    int8 roundingMode;
2567    float64 z;
2568
2569    aExp = extractFloat64Exp( a );
2570    if ( 0x433 <= aExp ) {
2571        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2572            return propagateFloat64NaN( a, a );
2573        }
2574        return a;
2575    }
2576    if ( aExp < 0x3FF ) {
2577        if ( (bits64) ( a<<1 ) == 0 ) return a;
2578        float_exception_flags |= float_flag_inexact;
2579        aSign = extractFloat64Sign( a );
2580        switch ( float_rounding_mode ) {
2581         case float_round_nearest_even:
2582            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2583                return packFloat64( aSign, 0x3FF, 0 );
2584            }
2585            break;
2586	 case float_round_to_zero:
2587	    break;
2588         case float_round_down:
2589            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2590         case float_round_up:
2591            return
2592            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2593        }
2594        return packFloat64( aSign, 0, 0 );
2595    }
2596    lastBitMask = 1;
2597    lastBitMask <<= 0x433 - aExp;
2598    roundBitsMask = lastBitMask - 1;
2599    z = a;
2600    roundingMode = float_rounding_mode;
2601    if ( roundingMode == float_round_nearest_even ) {
2602        z += lastBitMask>>1;
2603        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2604    }
2605    else if ( roundingMode != float_round_to_zero ) {
2606        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2607            z += roundBitsMask;
2608        }
2609    }
2610    z &= ~ roundBitsMask;
2611    if ( z != a ) float_exception_flags |= float_flag_inexact;
2612    return z;
2613
2614}
2615#endif
2616
2617/*
2618-------------------------------------------------------------------------------
2619Returns the result of adding the absolute values of the double-precision
2620floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2621before being returned.  `zSign' is ignored if the result is a NaN.
2622The addition is performed according to the IEC/IEEE Standard for Binary
2623Floating-Point Arithmetic.
2624-------------------------------------------------------------------------------
2625*/
2626static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2627{
2628    int16 aExp, bExp, zExp;
2629    bits64 aSig, bSig, zSig;
2630    int16 expDiff;
2631
2632    aSig = extractFloat64Frac( a );
2633    aExp = extractFloat64Exp( a );
2634    bSig = extractFloat64Frac( b );
2635    bExp = extractFloat64Exp( b );
2636    expDiff = aExp - bExp;
2637    aSig <<= 9;
2638    bSig <<= 9;
2639    if ( 0 < expDiff ) {
2640        if ( aExp == 0x7FF ) {
2641            if ( aSig ) return propagateFloat64NaN( a, b );
2642            return a;
2643        }
2644        if ( bExp == 0 ) {
2645            --expDiff;
2646        }
2647        else {
2648            bSig |= LIT64( 0x2000000000000000 );
2649        }
2650        shift64RightJamming( bSig, expDiff, &bSig );
2651        zExp = aExp;
2652    }
2653    else if ( expDiff < 0 ) {
2654        if ( bExp == 0x7FF ) {
2655            if ( bSig ) return propagateFloat64NaN( a, b );
2656            return packFloat64( zSign, 0x7FF, 0 );
2657        }
2658        if ( aExp == 0 ) {
2659            ++expDiff;
2660        }
2661        else {
2662            aSig |= LIT64( 0x2000000000000000 );
2663        }
2664        shift64RightJamming( aSig, - expDiff, &aSig );
2665        zExp = bExp;
2666    }
2667    else {
2668        if ( aExp == 0x7FF ) {
2669            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2670            return a;
2671        }
2672        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2673        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2674        zExp = aExp;
2675        goto roundAndPack;
2676    }
2677    aSig |= LIT64( 0x2000000000000000 );
2678    zSig = ( aSig + bSig )<<1;
2679    --zExp;
2680    if ( (sbits64) zSig < 0 ) {
2681        zSig = aSig + bSig;
2682        ++zExp;
2683    }
2684 roundAndPack:
2685    return roundAndPackFloat64( zSign, zExp, zSig );
2686
2687}
2688
2689/*
2690-------------------------------------------------------------------------------
2691Returns the result of subtracting the absolute values of the double-
2692precision floating-point values `a' and `b'.  If `zSign' is 1, the
2693difference is negated before being returned.  `zSign' is ignored if the
2694result is a NaN.  The subtraction is performed according to the IEC/IEEE
2695Standard for Binary Floating-Point Arithmetic.
2696-------------------------------------------------------------------------------
2697*/
2698static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2699{
2700    int16 aExp, bExp, zExp;
2701    bits64 aSig, bSig, zSig;
2702    int16 expDiff;
2703
2704    aSig = extractFloat64Frac( a );
2705    aExp = extractFloat64Exp( a );
2706    bSig = extractFloat64Frac( b );
2707    bExp = extractFloat64Exp( b );
2708    expDiff = aExp - bExp;
2709    aSig <<= 10;
2710    bSig <<= 10;
2711    if ( 0 < expDiff ) goto aExpBigger;
2712    if ( expDiff < 0 ) goto bExpBigger;
2713    if ( aExp == 0x7FF ) {
2714        if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2715        float_raise( float_flag_invalid );
2716        return float64_default_nan;
2717    }
2718    if ( aExp == 0 ) {
2719        aExp = 1;
2720        bExp = 1;
2721    }
2722    if ( bSig < aSig ) goto aBigger;
2723    if ( aSig < bSig ) goto bBigger;
2724    return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2725 bExpBigger:
2726    if ( bExp == 0x7FF ) {
2727        if ( bSig ) return propagateFloat64NaN( a, b );
2728        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2729    }
2730    if ( aExp == 0 ) {
2731        ++expDiff;
2732    }
2733    else {
2734        aSig |= LIT64( 0x4000000000000000 );
2735    }
2736    shift64RightJamming( aSig, - expDiff, &aSig );
2737    bSig |= LIT64( 0x4000000000000000 );
2738 bBigger:
2739    zSig = bSig - aSig;
2740    zExp = bExp;
2741    zSign ^= 1;
2742    goto normalizeRoundAndPack;
2743 aExpBigger:
2744    if ( aExp == 0x7FF ) {
2745        if ( aSig ) return propagateFloat64NaN( a, b );
2746        return a;
2747    }
2748    if ( bExp == 0 ) {
2749        --expDiff;
2750    }
2751    else {
2752        bSig |= LIT64( 0x4000000000000000 );
2753    }
2754    shift64RightJamming( bSig, expDiff, &bSig );
2755    aSig |= LIT64( 0x4000000000000000 );
2756 aBigger:
2757    zSig = aSig - bSig;
2758    zExp = aExp;
2759 normalizeRoundAndPack:
2760    --zExp;
2761    return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2762
2763}
2764
2765/*
2766-------------------------------------------------------------------------------
2767Returns the result of adding the double-precision floating-point values `a'
2768and `b'.  The operation is performed according to the IEC/IEEE Standard for
2769Binary Floating-Point Arithmetic.
2770-------------------------------------------------------------------------------
2771*/
2772float64 float64_add( float64 a, float64 b )
2773{
2774    flag aSign, bSign;
2775
2776    aSign = extractFloat64Sign( a );
2777    bSign = extractFloat64Sign( b );
2778    if ( aSign == bSign ) {
2779        return addFloat64Sigs( a, b, aSign );
2780    }
2781    else {
2782        return subFloat64Sigs( a, b, aSign );
2783    }
2784
2785}
2786
2787/*
2788-------------------------------------------------------------------------------
2789Returns the result of subtracting the double-precision floating-point values
2790`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2791for Binary Floating-Point Arithmetic.
2792-------------------------------------------------------------------------------
2793*/
2794float64 float64_sub( float64 a, float64 b )
2795{
2796    flag aSign, bSign;
2797
2798    aSign = extractFloat64Sign( a );
2799    bSign = extractFloat64Sign( b );
2800    if ( aSign == bSign ) {
2801        return subFloat64Sigs( a, b, aSign );
2802    }
2803    else {
2804        return addFloat64Sigs( a, b, aSign );
2805    }
2806
2807}
2808
2809/*
2810-------------------------------------------------------------------------------
2811Returns the result of multiplying the double-precision floating-point values
2812`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2813for Binary Floating-Point Arithmetic.
2814-------------------------------------------------------------------------------
2815*/
2816float64 float64_mul( float64 a, float64 b )
2817{
2818    flag aSign, bSign, zSign;
2819    int16 aExp, bExp, zExp;
2820    bits64 aSig, bSig, zSig0, zSig1;
2821
2822    aSig = extractFloat64Frac( a );
2823    aExp = extractFloat64Exp( a );
2824    aSign = extractFloat64Sign( a );
2825    bSig = extractFloat64Frac( b );
2826    bExp = extractFloat64Exp( b );
2827    bSign = extractFloat64Sign( b );
2828    zSign = aSign ^ bSign;
2829    if ( aExp == 0x7FF ) {
2830        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2831            return propagateFloat64NaN( a, b );
2832        }
2833        if ( ( bExp | bSig ) == 0 ) {
2834            float_raise( float_flag_invalid );
2835            return float64_default_nan;
2836        }
2837        return packFloat64( zSign, 0x7FF, 0 );
2838    }
2839    if ( bExp == 0x7FF ) {
2840        if ( bSig ) return propagateFloat64NaN( a, b );
2841        if ( ( aExp | aSig ) == 0 ) {
2842            float_raise( float_flag_invalid );
2843            return float64_default_nan;
2844        }
2845        return packFloat64( zSign, 0x7FF, 0 );
2846    }
2847    if ( aExp == 0 ) {
2848        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2849        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2850    }
2851    if ( bExp == 0 ) {
2852        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2853        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2854    }
2855    zExp = aExp + bExp - 0x3FF;
2856    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2857    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2858    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2859    zSig0 |= ( zSig1 != 0 );
2860    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2861        zSig0 <<= 1;
2862        --zExp;
2863    }
2864    return roundAndPackFloat64( zSign, zExp, zSig0 );
2865
2866}
2867
2868/*
2869-------------------------------------------------------------------------------
2870Returns the result of dividing the double-precision floating-point value `a'
2871by the corresponding value `b'.  The operation is performed according to
2872the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2873-------------------------------------------------------------------------------
2874*/
2875float64 float64_div( float64 a, float64 b )
2876{
2877    flag aSign, bSign, zSign;
2878    int16 aExp, bExp, zExp;
2879    bits64 aSig, bSig, zSig;
2880    bits64 rem0, rem1;
2881    bits64 term0, term1;
2882
2883    aSig = extractFloat64Frac( a );
2884    aExp = extractFloat64Exp( a );
2885    aSign = extractFloat64Sign( a );
2886    bSig = extractFloat64Frac( b );
2887    bExp = extractFloat64Exp( b );
2888    bSign = extractFloat64Sign( b );
2889    zSign = aSign ^ bSign;
2890    if ( aExp == 0x7FF ) {
2891        if ( aSig ) return propagateFloat64NaN( a, b );
2892        if ( bExp == 0x7FF ) {
2893            if ( bSig ) return propagateFloat64NaN( a, b );
2894            float_raise( float_flag_invalid );
2895            return float64_default_nan;
2896        }
2897        return packFloat64( zSign, 0x7FF, 0 );
2898    }
2899    if ( bExp == 0x7FF ) {
2900        if ( bSig ) return propagateFloat64NaN( a, b );
2901        return packFloat64( zSign, 0, 0 );
2902    }
2903    if ( bExp == 0 ) {
2904        if ( bSig == 0 ) {
2905            if ( ( aExp | aSig ) == 0 ) {
2906                float_raise( float_flag_invalid );
2907                return float64_default_nan;
2908            }
2909            float_raise( float_flag_divbyzero );
2910            return packFloat64( zSign, 0x7FF, 0 );
2911        }
2912        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2913    }
2914    if ( aExp == 0 ) {
2915        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2916        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2917    }
2918    zExp = aExp - bExp + 0x3FD;
2919    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2920    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2921    if ( bSig <= ( aSig + aSig ) ) {
2922        aSig >>= 1;
2923        ++zExp;
2924    }
2925    zSig = estimateDiv128To64( aSig, 0, bSig );
2926    if ( ( zSig & 0x1FF ) <= 2 ) {
2927        mul64To128( bSig, zSig, &term0, &term1 );
2928        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2929        while ( (sbits64) rem0 < 0 ) {
2930            --zSig;
2931            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2932        }
2933        zSig |= ( rem1 != 0 );
2934    }
2935    return roundAndPackFloat64( zSign, zExp, zSig );
2936
2937}
2938
2939#ifndef SOFTFLOAT_FOR_GCC
2940/*
2941-------------------------------------------------------------------------------
2942Returns the remainder of the double-precision floating-point value `a'
2943with respect to the corresponding value `b'.  The operation is performed
2944according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2945-------------------------------------------------------------------------------
2946*/
2947float64 float64_rem( float64 a, float64 b )
2948{
2949    flag aSign, bSign, zSign;
2950    int16 aExp, bExp, expDiff;
2951    bits64 aSig, bSig;
2952    bits64 q, alternateASig;
2953    sbits64 sigMean;
2954
2955    aSig = extractFloat64Frac( a );
2956    aExp = extractFloat64Exp( a );
2957    aSign = extractFloat64Sign( a );
2958    bSig = extractFloat64Frac( b );
2959    bExp = extractFloat64Exp( b );
2960    bSign = extractFloat64Sign( b );
2961    if ( aExp == 0x7FF ) {
2962        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2963            return propagateFloat64NaN( a, b );
2964        }
2965        float_raise( float_flag_invalid );
2966        return float64_default_nan;
2967    }
2968    if ( bExp == 0x7FF ) {
2969        if ( bSig ) return propagateFloat64NaN( a, b );
2970        return a;
2971    }
2972    if ( bExp == 0 ) {
2973        if ( bSig == 0 ) {
2974            float_raise( float_flag_invalid );
2975            return float64_default_nan;
2976        }
2977        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2978    }
2979    if ( aExp == 0 ) {
2980        if ( aSig == 0 ) return a;
2981        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2982    }
2983    expDiff = aExp - bExp;
2984    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2985    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2986    if ( expDiff < 0 ) {
2987        if ( expDiff < -1 ) return a;
2988        aSig >>= 1;
2989    }
2990    q = ( bSig <= aSig );
2991    if ( q ) aSig -= bSig;
2992    expDiff -= 64;
2993    while ( 0 < expDiff ) {
2994        q = estimateDiv128To64( aSig, 0, bSig );
2995        q = ( 2 < q ) ? q - 2 : 0;
2996        aSig = - ( ( bSig>>2 ) * q );
2997        expDiff -= 62;
2998    }
2999    expDiff += 64;
3000    if ( 0 < expDiff ) {
3001        q = estimateDiv128To64( aSig, 0, bSig );
3002        q = ( 2 < q ) ? q - 2 : 0;
3003        q >>= 64 - expDiff;
3004        bSig >>= 2;
3005        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3006    }
3007    else {
3008        aSig >>= 2;
3009        bSig >>= 2;
3010    }
3011    do {
3012        alternateASig = aSig;
3013        ++q;
3014        aSig -= bSig;
3015    } while ( 0 <= (sbits64) aSig );
3016    sigMean = aSig + alternateASig;
3017    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3018        aSig = alternateASig;
3019    }
3020    zSign = ( (sbits64) aSig < 0 );
3021    if ( zSign ) aSig = - aSig;
3022    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3023
3024}
3025
3026/*
3027-------------------------------------------------------------------------------
3028Returns the square root of the double-precision floating-point value `a'.
3029The operation is performed according to the IEC/IEEE Standard for Binary
3030Floating-Point Arithmetic.
3031-------------------------------------------------------------------------------
3032*/
3033float64 float64_sqrt( float64 a )
3034{
3035    flag aSign;
3036    int16 aExp, zExp;
3037    bits64 aSig, zSig, doubleZSig;
3038    bits64 rem0, rem1, term0, term1;
3039
3040    aSig = extractFloat64Frac( a );
3041    aExp = extractFloat64Exp( a );
3042    aSign = extractFloat64Sign( a );
3043    if ( aExp == 0x7FF ) {
3044        if ( aSig ) return propagateFloat64NaN( a, a );
3045        if ( ! aSign ) return a;
3046        float_raise( float_flag_invalid );
3047        return float64_default_nan;
3048    }
3049    if ( aSign ) {
3050        if ( ( aExp | aSig ) == 0 ) return a;
3051        float_raise( float_flag_invalid );
3052        return float64_default_nan;
3053    }
3054    if ( aExp == 0 ) {
3055        if ( aSig == 0 ) return 0;
3056        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3057    }
3058    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3059    aSig |= LIT64( 0x0010000000000000 );
3060    zSig = estimateSqrt32( aExp, aSig>>21 );
3061    aSig <<= 9 - ( aExp & 1 );
3062    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3063    if ( ( zSig & 0x1FF ) <= 5 ) {
3064        doubleZSig = zSig<<1;
3065        mul64To128( zSig, zSig, &term0, &term1 );
3066        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3067        while ( (sbits64) rem0 < 0 ) {
3068            --zSig;
3069            doubleZSig -= 2;
3070            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3071        }
3072        zSig |= ( ( rem0 | rem1 ) != 0 );
3073    }
3074    return roundAndPackFloat64( 0, zExp, zSig );
3075
3076}
3077#endif
3078
3079/*
3080-------------------------------------------------------------------------------
3081Returns 1 if the double-precision floating-point value `a' is equal to the
3082corresponding value `b', and 0 otherwise.  The comparison is performed
3083according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3084-------------------------------------------------------------------------------
3085*/
3086flag float64_eq( float64 a, float64 b )
3087{
3088
3089    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3090         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3091       ) {
3092        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3093            float_raise( float_flag_invalid );
3094        }
3095        return 0;
3096    }
3097    return ( a == b ) ||
3098	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3099
3100}
3101
3102/*
3103-------------------------------------------------------------------------------
3104Returns 1 if the double-precision floating-point value `a' is less than or
3105equal to the corresponding value `b', and 0 otherwise.  The comparison is
3106performed according to the IEC/IEEE Standard for Binary Floating-Point
3107Arithmetic.
3108-------------------------------------------------------------------------------
3109*/
3110flag float64_le( float64 a, float64 b )
3111{
3112    flag aSign, bSign;
3113
3114    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3115         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3116       ) {
3117        float_raise( float_flag_invalid );
3118        return 0;
3119    }
3120    aSign = extractFloat64Sign( a );
3121    bSign = extractFloat64Sign( b );
3122    if ( aSign != bSign )
3123	return aSign ||
3124	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3125	      0 );
3126    return ( a == b ) ||
3127	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3128
3129}
3130
3131/*
3132-------------------------------------------------------------------------------
3133Returns 1 if the double-precision floating-point value `a' is less than
3134the corresponding value `b', and 0 otherwise.  The comparison is performed
3135according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3136-------------------------------------------------------------------------------
3137*/
3138flag float64_lt( float64 a, float64 b )
3139{
3140    flag aSign, bSign;
3141
3142    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3143         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3144       ) {
3145        float_raise( float_flag_invalid );
3146        return 0;
3147    }
3148    aSign = extractFloat64Sign( a );
3149    bSign = extractFloat64Sign( b );
3150    if ( aSign != bSign )
3151	return aSign &&
3152	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3153	      0 );
3154    return ( a != b ) &&
3155	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3156
3157}
3158
3159#ifndef SOFTFLOAT_FOR_GCC
3160/*
3161-------------------------------------------------------------------------------
3162Returns 1 if the double-precision floating-point value `a' is equal to the
3163corresponding value `b', and 0 otherwise.  The invalid exception is raised
3164if either operand is a NaN.  Otherwise, the comparison is performed
3165according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3166-------------------------------------------------------------------------------
3167*/
3168flag float64_eq_signaling( float64 a, float64 b )
3169{
3170
3171    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3172         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3173       ) {
3174        float_raise( float_flag_invalid );
3175        return 0;
3176    }
3177    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3178
3179}
3180
3181/*
3182-------------------------------------------------------------------------------
3183Returns 1 if the double-precision floating-point value `a' is less than or
3184equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3185cause an exception.  Otherwise, the comparison is performed according to the
3186IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3187-------------------------------------------------------------------------------
3188*/
3189flag float64_le_quiet( float64 a, float64 b )
3190{
3191    flag aSign, bSign;
3192
3193    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3194         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3195       ) {
3196        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3197            float_raise( float_flag_invalid );
3198        }
3199        return 0;
3200    }
3201    aSign = extractFloat64Sign( a );
3202    bSign = extractFloat64Sign( b );
3203    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3204    return ( a == b ) || ( aSign ^ ( a < b ) );
3205
3206}
3207
3208/*
3209-------------------------------------------------------------------------------
3210Returns 1 if the double-precision floating-point value `a' is less than
3211the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3212exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3213Standard for Binary Floating-Point Arithmetic.
3214-------------------------------------------------------------------------------
3215*/
3216flag float64_lt_quiet( float64 a, float64 b )
3217{
3218    flag aSign, bSign;
3219
3220    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3221         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3222       ) {
3223        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3224            float_raise( float_flag_invalid );
3225        }
3226        return 0;
3227    }
3228    aSign = extractFloat64Sign( a );
3229    bSign = extractFloat64Sign( b );
3230    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3231    return ( a != b ) && ( aSign ^ ( a < b ) );
3232
3233}
3234#endif
3235
3236#ifdef FLOATX80
3237
3238/*
3239-------------------------------------------------------------------------------
3240Returns the result of converting the extended double-precision floating-
3241point value `a' to the 32-bit two's complement integer format.  The
3242conversion is performed according to the IEC/IEEE Standard for Binary
3243Floating-Point Arithmetic---which means in particular that the conversion
3244is rounded according to the current rounding mode.  If `a' is a NaN, the
3245largest positive integer is returned.  Otherwise, if the conversion
3246overflows, the largest integer with the same sign as `a' is returned.
3247-------------------------------------------------------------------------------
3248*/
3249int32 floatx80_to_int32( floatx80 a )
3250{
3251    flag aSign;
3252    int32 aExp, shiftCount;
3253    bits64 aSig;
3254
3255    aSig = extractFloatx80Frac( a );
3256    aExp = extractFloatx80Exp( a );
3257    aSign = extractFloatx80Sign( a );
3258    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3259    shiftCount = 0x4037 - aExp;
3260    if ( shiftCount <= 0 ) shiftCount = 1;
3261    shift64RightJamming( aSig, shiftCount, &aSig );
3262    return roundAndPackInt32( aSign, aSig );
3263
3264}
3265
3266/*
3267-------------------------------------------------------------------------------
3268Returns the result of converting the extended double-precision floating-
3269point value `a' to the 32-bit two's complement integer format.  The
3270conversion is performed according to the IEC/IEEE Standard for Binary
3271Floating-Point Arithmetic, except that the conversion is always rounded
3272toward zero.  If `a' is a NaN, the largest positive integer is returned.
3273Otherwise, if the conversion overflows, the largest integer with the same
3274sign as `a' is returned.
3275-------------------------------------------------------------------------------
3276*/
3277int32 floatx80_to_int32_round_to_zero( floatx80 a )
3278{
3279    flag aSign;
3280    int32 aExp, shiftCount;
3281    bits64 aSig, savedASig;
3282    int32 z;
3283
3284    aSig = extractFloatx80Frac( a );
3285    aExp = extractFloatx80Exp( a );
3286    aSign = extractFloatx80Sign( a );
3287    if ( 0x401E < aExp ) {
3288        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3289        goto invalid;
3290    }
3291    else if ( aExp < 0x3FFF ) {
3292        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3293        return 0;
3294    }
3295    shiftCount = 0x403E - aExp;
3296    savedASig = aSig;
3297    aSig >>= shiftCount;
3298    z = aSig;
3299    if ( aSign ) z = - z;
3300    if ( ( z < 0 ) ^ aSign ) {
3301 invalid:
3302        float_raise( float_flag_invalid );
3303        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3304    }
3305    if ( ( aSig<<shiftCount ) != savedASig ) {
3306        float_exception_flags |= float_flag_inexact;
3307    }
3308    return z;
3309
3310}
3311
3312/*
3313-------------------------------------------------------------------------------
3314Returns the result of converting the extended double-precision floating-
3315point value `a' to the 64-bit two's complement integer format.  The
3316conversion is performed according to the IEC/IEEE Standard for Binary
3317Floating-Point Arithmetic---which means in particular that the conversion
3318is rounded according to the current rounding mode.  If `a' is a NaN,
3319the largest positive integer is returned.  Otherwise, if the conversion
3320overflows, the largest integer with the same sign as `a' is returned.
3321-------------------------------------------------------------------------------
3322*/
3323int64 floatx80_to_int64( floatx80 a )
3324{
3325    flag aSign;
3326    int32 aExp, shiftCount;
3327    bits64 aSig, aSigExtra;
3328
3329    aSig = extractFloatx80Frac( a );
3330    aExp = extractFloatx80Exp( a );
3331    aSign = extractFloatx80Sign( a );
3332    shiftCount = 0x403E - aExp;
3333    if ( shiftCount <= 0 ) {
3334        if ( shiftCount ) {
3335            float_raise( float_flag_invalid );
3336            if (    ! aSign
3337                 || (    ( aExp == 0x7FFF )
3338                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3339               ) {
3340                return LIT64( 0x7FFFFFFFFFFFFFFF );
3341            }
3342            return (sbits64) LIT64( 0x8000000000000000 );
3343        }
3344        aSigExtra = 0;
3345    }
3346    else {
3347        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3348    }
3349    return roundAndPackInt64( aSign, aSig, aSigExtra );
3350
3351}
3352
3353/*
3354-------------------------------------------------------------------------------
3355Returns the result of converting the extended double-precision floating-
3356point value `a' to the 64-bit two's complement integer format.  The
3357conversion is performed according to the IEC/IEEE Standard for Binary
3358Floating-Point Arithmetic, except that the conversion is always rounded
3359toward zero.  If `a' is a NaN, the largest positive integer is returned.
3360Otherwise, if the conversion overflows, the largest integer with the same
3361sign as `a' is returned.
3362-------------------------------------------------------------------------------
3363*/
3364int64 floatx80_to_int64_round_to_zero( floatx80 a )
3365{
3366    flag aSign;
3367    int32 aExp, shiftCount;
3368    bits64 aSig;
3369    int64 z;
3370
3371    aSig = extractFloatx80Frac( a );
3372    aExp = extractFloatx80Exp( a );
3373    aSign = extractFloatx80Sign( a );
3374    shiftCount = aExp - 0x403E;
3375    if ( 0 <= shiftCount ) {
3376        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3377        if ( ( a.high != 0xC03E ) || aSig ) {
3378            float_raise( float_flag_invalid );
3379            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3380                return LIT64( 0x7FFFFFFFFFFFFFFF );
3381            }
3382        }
3383        return (sbits64) LIT64( 0x8000000000000000 );
3384    }
3385    else if ( aExp < 0x3FFF ) {
3386        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3387        return 0;
3388    }
3389    z = aSig>>( - shiftCount );
3390    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3391        float_exception_flags |= float_flag_inexact;
3392    }
3393    if ( aSign ) z = - z;
3394    return z;
3395
3396}
3397
3398/*
3399-------------------------------------------------------------------------------
3400Returns the result of converting the extended double-precision floating-
3401point value `a' to the single-precision floating-point format.  The
3402conversion is performed according to the IEC/IEEE Standard for Binary
3403Floating-Point Arithmetic.
3404-------------------------------------------------------------------------------
3405*/
3406float32 floatx80_to_float32( floatx80 a )
3407{
3408    flag aSign;
3409    int32 aExp;
3410    bits64 aSig;
3411
3412    aSig = extractFloatx80Frac( a );
3413    aExp = extractFloatx80Exp( a );
3414    aSign = extractFloatx80Sign( a );
3415    if ( aExp == 0x7FFF ) {
3416        if ( (bits64) ( aSig<<1 ) ) {
3417            return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3418        }
3419        return packFloat32( aSign, 0xFF, 0 );
3420    }
3421    shift64RightJamming( aSig, 33, &aSig );
3422    if ( aExp || aSig ) aExp -= 0x3F81;
3423    return roundAndPackFloat32( aSign, aExp, aSig );
3424
3425}
3426
3427/*
3428-------------------------------------------------------------------------------
3429Returns the result of converting the extended double-precision floating-
3430point value `a' to the double-precision floating-point format.  The
3431conversion is performed according to the IEC/IEEE Standard for Binary
3432Floating-Point Arithmetic.
3433-------------------------------------------------------------------------------
3434*/
3435float64 floatx80_to_float64( floatx80 a )
3436{
3437    flag aSign;
3438    int32 aExp;
3439    bits64 aSig, zSig;
3440
3441    aSig = extractFloatx80Frac( a );
3442    aExp = extractFloatx80Exp( a );
3443    aSign = extractFloatx80Sign( a );
3444    if ( aExp == 0x7FFF ) {
3445        if ( (bits64) ( aSig<<1 ) ) {
3446            return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3447        }
3448        return packFloat64( aSign, 0x7FF, 0 );
3449    }
3450    shift64RightJamming( aSig, 1, &zSig );
3451    if ( aExp || aSig ) aExp -= 0x3C01;
3452    return roundAndPackFloat64( aSign, aExp, zSig );
3453
3454}
3455
3456#ifdef FLOAT128
3457
3458/*
3459-------------------------------------------------------------------------------
3460Returns the result of converting the extended double-precision floating-
3461point value `a' to the quadruple-precision floating-point format.  The
3462conversion is performed according to the IEC/IEEE Standard for Binary
3463Floating-Point Arithmetic.
3464-------------------------------------------------------------------------------
3465*/
3466float128 floatx80_to_float128( floatx80 a )
3467{
3468    flag aSign;
3469    int16 aExp;
3470    bits64 aSig, zSig0, zSig1;
3471
3472    aSig = extractFloatx80Frac( a );
3473    aExp = extractFloatx80Exp( a );
3474    aSign = extractFloatx80Sign( a );
3475    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3476        return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3477    }
3478    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3479    return packFloat128( aSign, aExp, zSig0, zSig1 );
3480
3481}
3482
3483#endif
3484
3485/*
3486-------------------------------------------------------------------------------
3487Rounds the extended double-precision floating-point value `a' to an integer,
3488and returns the result as an extended quadruple-precision floating-point
3489value.  The operation is performed according to the IEC/IEEE Standard for
3490Binary Floating-Point Arithmetic.
3491-------------------------------------------------------------------------------
3492*/
3493floatx80 floatx80_round_to_int( floatx80 a )
3494{
3495    flag aSign;
3496    int32 aExp;
3497    bits64 lastBitMask, roundBitsMask;
3498    int8 roundingMode;
3499    floatx80 z;
3500
3501    aExp = extractFloatx80Exp( a );
3502    if ( 0x403E <= aExp ) {
3503        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3504            return propagateFloatx80NaN( a, a );
3505        }
3506        return a;
3507    }
3508    if ( aExp < 0x3FFF ) {
3509        if (    ( aExp == 0 )
3510             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3511            return a;
3512        }
3513        float_exception_flags |= float_flag_inexact;
3514        aSign = extractFloatx80Sign( a );
3515        switch ( float_rounding_mode ) {
3516         case float_round_nearest_even:
3517            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3518               ) {
3519                return
3520                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3521            }
3522            break;
3523	 case float_round_to_zero:
3524	    break;
3525         case float_round_down:
3526            return
3527                  aSign ?
3528                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3529                : packFloatx80( 0, 0, 0 );
3530         case float_round_up:
3531            return
3532                  aSign ? packFloatx80( 1, 0, 0 )
3533                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3534        }
3535        return packFloatx80( aSign, 0, 0 );
3536    }
3537    lastBitMask = 1;
3538    lastBitMask <<= 0x403E - aExp;
3539    roundBitsMask = lastBitMask - 1;
3540    z = a;
3541    roundingMode = float_rounding_mode;
3542    if ( roundingMode == float_round_nearest_even ) {
3543        z.low += lastBitMask>>1;
3544        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3545    }
3546    else if ( roundingMode != float_round_to_zero ) {
3547        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3548            z.low += roundBitsMask;
3549        }
3550    }
3551    z.low &= ~ roundBitsMask;
3552    if ( z.low == 0 ) {
3553        ++z.high;
3554        z.low = LIT64( 0x8000000000000000 );
3555    }
3556    if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3557    return z;
3558
3559}
3560
3561/*
3562-------------------------------------------------------------------------------
3563Returns the result of adding the absolute values of the extended double-
3564precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3565negated before being returned.  `zSign' is ignored if the result is a NaN.
3566The addition is performed according to the IEC/IEEE Standard for Binary
3567Floating-Point Arithmetic.
3568-------------------------------------------------------------------------------
3569*/
3570static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3571{
3572    int32 aExp, bExp, zExp;
3573    bits64 aSig, bSig, zSig0, zSig1;
3574    int32 expDiff;
3575
3576    aSig = extractFloatx80Frac( a );
3577    aExp = extractFloatx80Exp( a );
3578    bSig = extractFloatx80Frac( b );
3579    bExp = extractFloatx80Exp( b );
3580    expDiff = aExp - bExp;
3581    if ( 0 < expDiff ) {
3582        if ( aExp == 0x7FFF ) {
3583            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3584            return a;
3585        }
3586        if ( bExp == 0 ) --expDiff;
3587        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3588        zExp = aExp;
3589    }
3590    else if ( expDiff < 0 ) {
3591        if ( bExp == 0x7FFF ) {
3592            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3593            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3594        }
3595        if ( aExp == 0 ) ++expDiff;
3596        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3597        zExp = bExp;
3598    }
3599    else {
3600        if ( aExp == 0x7FFF ) {
3601            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3602                return propagateFloatx80NaN( a, b );
3603            }
3604            return a;
3605        }
3606        zSig1 = 0;
3607        zSig0 = aSig + bSig;
3608        if ( aExp == 0 ) {
3609            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3610            goto roundAndPack;
3611        }
3612        zExp = aExp;
3613        goto shiftRight1;
3614    }
3615    zSig0 = aSig + bSig;
3616    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3617 shiftRight1:
3618    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3619    zSig0 |= LIT64( 0x8000000000000000 );
3620    ++zExp;
3621 roundAndPack:
3622    return
3623        roundAndPackFloatx80(
3624            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3625
3626}
3627
3628/*
3629-------------------------------------------------------------------------------
3630Returns the result of subtracting the absolute values of the extended
3631double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3632difference is negated before being returned.  `zSign' is ignored if the
3633result is a NaN.  The subtraction is performed according to the IEC/IEEE
3634Standard for Binary Floating-Point Arithmetic.
3635-------------------------------------------------------------------------------
3636*/
3637static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3638{
3639    int32 aExp, bExp, zExp;
3640    bits64 aSig, bSig, zSig0, zSig1;
3641    int32 expDiff;
3642    floatx80 z;
3643
3644    aSig = extractFloatx80Frac( a );
3645    aExp = extractFloatx80Exp( a );
3646    bSig = extractFloatx80Frac( b );
3647    bExp = extractFloatx80Exp( b );
3648    expDiff = aExp - bExp;
3649    if ( 0 < expDiff ) goto aExpBigger;
3650    if ( expDiff < 0 ) goto bExpBigger;
3651    if ( aExp == 0x7FFF ) {
3652        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3653            return propagateFloatx80NaN( a, b );
3654        }
3655        float_raise( float_flag_invalid );
3656        z.low = floatx80_default_nan_low;
3657        z.high = floatx80_default_nan_high;
3658        return z;
3659    }
3660    if ( aExp == 0 ) {
3661        aExp = 1;
3662        bExp = 1;
3663    }
3664    zSig1 = 0;
3665    if ( bSig < aSig ) goto aBigger;
3666    if ( aSig < bSig ) goto bBigger;
3667    return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3668 bExpBigger:
3669    if ( bExp == 0x7FFF ) {
3670        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3671        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3672    }
3673    if ( aExp == 0 ) ++expDiff;
3674    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3675 bBigger:
3676    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3677    zExp = bExp;
3678    zSign ^= 1;
3679    goto normalizeRoundAndPack;
3680 aExpBigger:
3681    if ( aExp == 0x7FFF ) {
3682        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3683        return a;
3684    }
3685    if ( bExp == 0 ) --expDiff;
3686    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3687 aBigger:
3688    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3689    zExp = aExp;
3690 normalizeRoundAndPack:
3691    return
3692        normalizeRoundAndPackFloatx80(
3693            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3694
3695}
3696
3697/*
3698-------------------------------------------------------------------------------
3699Returns the result of adding the extended double-precision floating-point
3700values `a' and `b'.  The operation is performed according to the IEC/IEEE
3701Standard for Binary Floating-Point Arithmetic.
3702-------------------------------------------------------------------------------
3703*/
3704floatx80 floatx80_add( floatx80 a, floatx80 b )
3705{
3706    flag aSign, bSign;
3707
3708    aSign = extractFloatx80Sign( a );
3709    bSign = extractFloatx80Sign( b );
3710    if ( aSign == bSign ) {
3711        return addFloatx80Sigs( a, b, aSign );
3712    }
3713    else {
3714        return subFloatx80Sigs( a, b, aSign );
3715    }
3716
3717}
3718
3719/*
3720-------------------------------------------------------------------------------
3721Returns the result of subtracting the extended double-precision floating-
3722point values `a' and `b'.  The operation is performed according to the
3723IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3724-------------------------------------------------------------------------------
3725*/
3726floatx80 floatx80_sub( floatx80 a, floatx80 b )
3727{
3728    flag aSign, bSign;
3729
3730    aSign = extractFloatx80Sign( a );
3731    bSign = extractFloatx80Sign( b );
3732    if ( aSign == bSign ) {
3733        return subFloatx80Sigs( a, b, aSign );
3734    }
3735    else {
3736        return addFloatx80Sigs( a, b, aSign );
3737    }
3738
3739}
3740
3741/*
3742-------------------------------------------------------------------------------
3743Returns the result of multiplying the extended double-precision floating-
3744point values `a' and `b'.  The operation is performed according to the
3745IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3746-------------------------------------------------------------------------------
3747*/
3748floatx80 floatx80_mul( floatx80 a, floatx80 b )
3749{
3750    flag aSign, bSign, zSign;
3751    int32 aExp, bExp, zExp;
3752    bits64 aSig, bSig, zSig0, zSig1;
3753    floatx80 z;
3754
3755    aSig = extractFloatx80Frac( a );
3756    aExp = extractFloatx80Exp( a );
3757    aSign = extractFloatx80Sign( a );
3758    bSig = extractFloatx80Frac( b );
3759    bExp = extractFloatx80Exp( b );
3760    bSign = extractFloatx80Sign( b );
3761    zSign = aSign ^ bSign;
3762    if ( aExp == 0x7FFF ) {
3763        if (    (bits64) ( aSig<<1 )
3764             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3765            return propagateFloatx80NaN( a, b );
3766        }
3767        if ( ( bExp | bSig ) == 0 ) goto invalid;
3768        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3769    }
3770    if ( bExp == 0x7FFF ) {
3771        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3772        if ( ( aExp | aSig ) == 0 ) {
3773 invalid:
3774            float_raise( float_flag_invalid );
3775            z.low = floatx80_default_nan_low;
3776            z.high = floatx80_default_nan_high;
3777            return z;
3778        }
3779        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3780    }
3781    if ( aExp == 0 ) {
3782        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3783        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3784    }
3785    if ( bExp == 0 ) {
3786        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3787        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3788    }
3789    zExp = aExp + bExp - 0x3FFE;
3790    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3791    if ( 0 < (sbits64) zSig0 ) {
3792        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3793        --zExp;
3794    }
3795    return
3796        roundAndPackFloatx80(
3797            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3798
3799}
3800
3801/*
3802-------------------------------------------------------------------------------
3803Returns the result of dividing the extended double-precision floating-point
3804value `a' by the corresponding value `b'.  The operation is performed
3805according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3806-------------------------------------------------------------------------------
3807*/
3808floatx80 floatx80_div( floatx80 a, floatx80 b )
3809{
3810    flag aSign, bSign, zSign;
3811    int32 aExp, bExp, zExp;
3812    bits64 aSig, bSig, zSig0, zSig1;
3813    bits64 rem0, rem1, rem2, term0, term1, term2;
3814    floatx80 z;
3815
3816    aSig = extractFloatx80Frac( a );
3817    aExp = extractFloatx80Exp( a );
3818    aSign = extractFloatx80Sign( a );
3819    bSig = extractFloatx80Frac( b );
3820    bExp = extractFloatx80Exp( b );
3821    bSign = extractFloatx80Sign( b );
3822    zSign = aSign ^ bSign;
3823    if ( aExp == 0x7FFF ) {
3824        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3825        if ( bExp == 0x7FFF ) {
3826            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3827            goto invalid;
3828        }
3829        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3830    }
3831    if ( bExp == 0x7FFF ) {
3832        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3833        return packFloatx80( zSign, 0, 0 );
3834    }
3835    if ( bExp == 0 ) {
3836        if ( bSig == 0 ) {
3837            if ( ( aExp | aSig ) == 0 ) {
3838 invalid:
3839                float_raise( float_flag_invalid );
3840                z.low = floatx80_default_nan_low;
3841                z.high = floatx80_default_nan_high;
3842                return z;
3843            }
3844            float_raise( float_flag_divbyzero );
3845            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3846        }
3847        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3848    }
3849    if ( aExp == 0 ) {
3850        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3851        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3852    }
3853    zExp = aExp - bExp + 0x3FFE;
3854    rem1 = 0;
3855    if ( bSig <= aSig ) {
3856        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3857        ++zExp;
3858    }
3859    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3860    mul64To128( bSig, zSig0, &term0, &term1 );
3861    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3862    while ( (sbits64) rem0 < 0 ) {
3863        --zSig0;
3864        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3865    }
3866    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3867    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3868        mul64To128( bSig, zSig1, &term1, &term2 );
3869        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3870        while ( (sbits64) rem1 < 0 ) {
3871            --zSig1;
3872            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3873        }
3874        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3875    }
3876    return
3877        roundAndPackFloatx80(
3878            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3879
3880}
3881
3882/*
3883-------------------------------------------------------------------------------
3884Returns the remainder of the extended double-precision floating-point value
3885`a' with respect to the corresponding value `b'.  The operation is performed
3886according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3887-------------------------------------------------------------------------------
3888*/
3889floatx80 floatx80_rem( floatx80 a, floatx80 b )
3890{
3891    flag aSign, bSign, zSign;
3892    int32 aExp, bExp, expDiff;
3893    bits64 aSig0, aSig1, bSig;
3894    bits64 q, term0, term1, alternateASig0, alternateASig1;
3895    floatx80 z;
3896
3897    aSig0 = extractFloatx80Frac( a );
3898    aExp = extractFloatx80Exp( a );
3899    aSign = extractFloatx80Sign( a );
3900    bSig = extractFloatx80Frac( b );
3901    bExp = extractFloatx80Exp( b );
3902    bSign = extractFloatx80Sign( b );
3903    if ( aExp == 0x7FFF ) {
3904        if (    (bits64) ( aSig0<<1 )
3905             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3906            return propagateFloatx80NaN( a, b );
3907        }
3908        goto invalid;
3909    }
3910    if ( bExp == 0x7FFF ) {
3911        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3912        return a;
3913    }
3914    if ( bExp == 0 ) {
3915        if ( bSig == 0 ) {
3916 invalid:
3917            float_raise( float_flag_invalid );
3918            z.low = floatx80_default_nan_low;
3919            z.high = floatx80_default_nan_high;
3920            return z;
3921        }
3922        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3923    }
3924    if ( aExp == 0 ) {
3925        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3926        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3927    }
3928    bSig |= LIT64( 0x8000000000000000 );
3929    zSign = aSign;
3930    expDiff = aExp - bExp;
3931    aSig1 = 0;
3932    if ( expDiff < 0 ) {
3933        if ( expDiff < -1 ) return a;
3934        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3935        expDiff = 0;
3936    }
3937    q = ( bSig <= aSig0 );
3938    if ( q ) aSig0 -= bSig;
3939    expDiff -= 64;
3940    while ( 0 < expDiff ) {
3941        q = estimateDiv128To64( aSig0, aSig1, bSig );
3942        q = ( 2 < q ) ? q - 2 : 0;
3943        mul64To128( bSig, q, &term0, &term1 );
3944        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3945        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3946        expDiff -= 62;
3947    }
3948    expDiff += 64;
3949    if ( 0 < expDiff ) {
3950        q = estimateDiv128To64( aSig0, aSig1, bSig );
3951        q = ( 2 < q ) ? q - 2 : 0;
3952        q >>= 64 - expDiff;
3953        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3954        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3955        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3956        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3957            ++q;
3958            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3959        }
3960    }
3961    else {
3962        term1 = 0;
3963        term0 = bSig;
3964    }
3965    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3966    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3967         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3968              && ( q & 1 ) )
3969       ) {
3970        aSig0 = alternateASig0;
3971        aSig1 = alternateASig1;
3972        zSign = ! zSign;
3973    }
3974    return
3975        normalizeRoundAndPackFloatx80(
3976            80, zSign, bExp + expDiff, aSig0, aSig1 );
3977
3978}
3979
3980/*
3981-------------------------------------------------------------------------------
3982Returns the square root of the extended double-precision floating-point
3983value `a'.  The operation is performed according to the IEC/IEEE Standard
3984for Binary Floating-Point Arithmetic.
3985-------------------------------------------------------------------------------
3986*/
3987floatx80 floatx80_sqrt( floatx80 a )
3988{
3989    flag aSign;
3990    int32 aExp, zExp;
3991    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3992    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3993    floatx80 z;
3994
3995    aSig0 = extractFloatx80Frac( a );
3996    aExp = extractFloatx80Exp( a );
3997    aSign = extractFloatx80Sign( a );
3998    if ( aExp == 0x7FFF ) {
3999        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4000        if ( ! aSign ) return a;
4001        goto invalid;
4002    }
4003    if ( aSign ) {
4004        if ( ( aExp | aSig0 ) == 0 ) return a;
4005 invalid:
4006        float_raise( float_flag_invalid );
4007        z.low = floatx80_default_nan_low;
4008        z.high = floatx80_default_nan_high;
4009        return z;
4010    }
4011    if ( aExp == 0 ) {
4012        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4013        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4014    }
4015    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4016    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4017    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4018    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4019    doubleZSig0 = zSig0<<1;
4020    mul64To128( zSig0, zSig0, &term0, &term1 );
4021    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4022    while ( (sbits64) rem0 < 0 ) {
4023        --zSig0;
4024        doubleZSig0 -= 2;
4025        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4026    }
4027    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4028    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4029        if ( zSig1 == 0 ) zSig1 = 1;
4030        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4031        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4032        mul64To128( zSig1, zSig1, &term2, &term3 );
4033        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4034        while ( (sbits64) rem1 < 0 ) {
4035            --zSig1;
4036            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4037            term3 |= 1;
4038            term2 |= doubleZSig0;
4039            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4040        }
4041        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4042    }
4043    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4044    zSig0 |= doubleZSig0;
4045    return
4046        roundAndPackFloatx80(
4047            floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4048
4049}
4050
4051/*
4052-------------------------------------------------------------------------------
4053Returns 1 if the extended double-precision floating-point value `a' is
4054equal to the corresponding value `b', and 0 otherwise.  The comparison is
4055performed according to the IEC/IEEE Standard for Binary Floating-Point
4056Arithmetic.
4057-------------------------------------------------------------------------------
4058*/
4059flag floatx80_eq( floatx80 a, floatx80 b )
4060{
4061
4062    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4063              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4064         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4065              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4066       ) {
4067        if (    floatx80_is_signaling_nan( a )
4068             || floatx80_is_signaling_nan( b ) ) {
4069            float_raise( float_flag_invalid );
4070        }
4071        return 0;
4072    }
4073    return
4074           ( a.low == b.low )
4075        && (    ( a.high == b.high )
4076             || (    ( a.low == 0 )
4077                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4078           );
4079
4080}
4081
4082/*
4083-------------------------------------------------------------------------------
4084Returns 1 if the extended double-precision floating-point value `a' is
4085less than or equal to the corresponding value `b', and 0 otherwise.  The
4086comparison is performed according to the IEC/IEEE Standard for Binary
4087Floating-Point Arithmetic.
4088-------------------------------------------------------------------------------
4089*/
4090flag floatx80_le( floatx80 a, floatx80 b )
4091{
4092    flag aSign, bSign;
4093
4094    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4095              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4096         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4097              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4098       ) {
4099        float_raise( float_flag_invalid );
4100        return 0;
4101    }
4102    aSign = extractFloatx80Sign( a );
4103    bSign = extractFloatx80Sign( b );
4104    if ( aSign != bSign ) {
4105        return
4106               aSign
4107            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4108                 == 0 );
4109    }
4110    return
4111          aSign ? le128( b.high, b.low, a.high, a.low )
4112        : le128( a.high, a.low, b.high, b.low );
4113
4114}
4115
4116/*
4117-------------------------------------------------------------------------------
4118Returns 1 if the extended double-precision floating-point value `a' is
4119less than the corresponding value `b', and 0 otherwise.  The comparison
4120is performed according to the IEC/IEEE Standard for Binary Floating-Point
4121Arithmetic.
4122-------------------------------------------------------------------------------
4123*/
4124flag floatx80_lt( floatx80 a, floatx80 b )
4125{
4126    flag aSign, bSign;
4127
4128    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4129              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4130         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4131              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4132       ) {
4133        float_raise( float_flag_invalid );
4134        return 0;
4135    }
4136    aSign = extractFloatx80Sign( a );
4137    bSign = extractFloatx80Sign( b );
4138    if ( aSign != bSign ) {
4139        return
4140               aSign
4141            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4142                 != 0 );
4143    }
4144    return
4145          aSign ? lt128( b.high, b.low, a.high, a.low )
4146        : lt128( a.high, a.low, b.high, b.low );
4147
4148}
4149
4150/*
4151-------------------------------------------------------------------------------
4152Returns 1 if the extended double-precision floating-point value `a' is equal
4153to the corresponding value `b', and 0 otherwise.  The invalid exception is
4154raised if either operand is a NaN.  Otherwise, the comparison is performed
4155according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4156-------------------------------------------------------------------------------
4157*/
4158flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4159{
4160
4161    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4162              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4163         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4164              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4165       ) {
4166        float_raise( float_flag_invalid );
4167        return 0;
4168    }
4169    return
4170           ( a.low == b.low )
4171        && (    ( a.high == b.high )
4172             || (    ( a.low == 0 )
4173                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4174           );
4175
4176}
4177
4178/*
4179-------------------------------------------------------------------------------
4180Returns 1 if the extended double-precision floating-point value `a' is less
4181than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4182do not cause an exception.  Otherwise, the comparison is performed according
4183to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4184-------------------------------------------------------------------------------
4185*/
4186flag floatx80_le_quiet( floatx80 a, floatx80 b )
4187{
4188    flag aSign, bSign;
4189
4190    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4191              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4192         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4193              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4194       ) {
4195        if (    floatx80_is_signaling_nan( a )
4196             || floatx80_is_signaling_nan( b ) ) {
4197            float_raise( float_flag_invalid );
4198        }
4199        return 0;
4200    }
4201    aSign = extractFloatx80Sign( a );
4202    bSign = extractFloatx80Sign( b );
4203    if ( aSign != bSign ) {
4204        return
4205               aSign
4206            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4207                 == 0 );
4208    }
4209    return
4210          aSign ? le128( b.high, b.low, a.high, a.low )
4211        : le128( a.high, a.low, b.high, b.low );
4212
4213}
4214
4215/*
4216-------------------------------------------------------------------------------
4217Returns 1 if the extended double-precision floating-point value `a' is less
4218than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4219an exception.  Otherwise, the comparison is performed according to the
4220IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4221-------------------------------------------------------------------------------
4222*/
4223flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4224{
4225    flag aSign, bSign;
4226
4227    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4228              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4229         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4230              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4231       ) {
4232        if (    floatx80_is_signaling_nan( a )
4233             || floatx80_is_signaling_nan( b ) ) {
4234            float_raise( float_flag_invalid );
4235        }
4236        return 0;
4237    }
4238    aSign = extractFloatx80Sign( a );
4239    bSign = extractFloatx80Sign( b );
4240    if ( aSign != bSign ) {
4241        return
4242               aSign
4243            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4244                 != 0 );
4245    }
4246    return
4247          aSign ? lt128( b.high, b.low, a.high, a.low )
4248        : lt128( a.high, a.low, b.high, b.low );
4249
4250}
4251
4252#endif
4253
4254#ifdef FLOAT128
4255
4256/*
4257-------------------------------------------------------------------------------
4258Returns the result of converting the quadruple-precision floating-point
4259value `a' to the 32-bit two's complement integer format.  The conversion
4260is performed according to the IEC/IEEE Standard for Binary Floating-Point
4261Arithmetic---which means in particular that the conversion is rounded
4262according to the current rounding mode.  If `a' is a NaN, the largest
4263positive integer is returned.  Otherwise, if the conversion overflows, the
4264largest integer with the same sign as `a' is returned.
4265-------------------------------------------------------------------------------
4266*/
4267int32 float128_to_int32( float128 a )
4268{
4269    flag aSign;
4270    int32 aExp, shiftCount;
4271    bits64 aSig0, aSig1;
4272
4273    aSig1 = extractFloat128Frac1( a );
4274    aSig0 = extractFloat128Frac0( a );
4275    aExp = extractFloat128Exp( a );
4276    aSign = extractFloat128Sign( a );
4277    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4278    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4279    aSig0 |= ( aSig1 != 0 );
4280    shiftCount = 0x4028 - aExp;
4281    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4282    return roundAndPackInt32( aSign, aSig0 );
4283
4284}
4285
4286/*
4287-------------------------------------------------------------------------------
4288Returns the result of converting the quadruple-precision floating-point
4289value `a' to the 32-bit two's complement integer format.  The conversion
4290is performed according to the IEC/IEEE Standard for Binary Floating-Point
4291Arithmetic, except that the conversion is always rounded toward zero.  If
4292`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4293conversion overflows, the largest integer with the same sign as `a' is
4294returned.
4295-------------------------------------------------------------------------------
4296*/
4297int32 float128_to_int32_round_to_zero( float128 a )
4298{
4299    flag aSign;
4300    int32 aExp, shiftCount;
4301    bits64 aSig0, aSig1, savedASig;
4302    int32 z;
4303
4304    aSig1 = extractFloat128Frac1( a );
4305    aSig0 = extractFloat128Frac0( a );
4306    aExp = extractFloat128Exp( a );
4307    aSign = extractFloat128Sign( a );
4308    aSig0 |= ( aSig1 != 0 );
4309    if ( 0x401E < aExp ) {
4310        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4311        goto invalid;
4312    }
4313    else if ( aExp < 0x3FFF ) {
4314        if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4315        return 0;
4316    }
4317    aSig0 |= LIT64( 0x0001000000000000 );
4318    shiftCount = 0x402F - aExp;
4319    savedASig = aSig0;
4320    aSig0 >>= shiftCount;
4321    z = aSig0;
4322    if ( aSign ) z = - z;
4323    if ( ( z < 0 ) ^ aSign ) {
4324 invalid:
4325        float_raise( float_flag_invalid );
4326        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4327    }
4328    if ( ( aSig0<<shiftCount ) != savedASig ) {
4329        float_exception_flags |= float_flag_inexact;
4330    }
4331    return z;
4332
4333}
4334
4335/*
4336-------------------------------------------------------------------------------
4337Returns the result of converting the quadruple-precision floating-point
4338value `a' to the 64-bit two's complement integer format.  The conversion
4339is performed according to the IEC/IEEE Standard for Binary Floating-Point
4340Arithmetic---which means in particular that the conversion is rounded
4341according to the current rounding mode.  If `a' is a NaN, the largest
4342positive integer is returned.  Otherwise, if the conversion overflows, the
4343largest integer with the same sign as `a' is returned.
4344-------------------------------------------------------------------------------
4345*/
4346int64 float128_to_int64( float128 a )
4347{
4348    flag aSign;
4349    int32 aExp, shiftCount;
4350    bits64 aSig0, aSig1;
4351
4352    aSig1 = extractFloat128Frac1( a );
4353    aSig0 = extractFloat128Frac0( a );
4354    aExp = extractFloat128Exp( a );
4355    aSign = extractFloat128Sign( a );
4356    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4357    shiftCount = 0x402F - aExp;
4358    if ( shiftCount <= 0 ) {
4359        if ( 0x403E < aExp ) {
4360            float_raise( float_flag_invalid );
4361            if (    ! aSign
4362                 || (    ( aExp == 0x7FFF )
4363                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4364                    )
4365               ) {
4366                return LIT64( 0x7FFFFFFFFFFFFFFF );
4367            }
4368            return (sbits64) LIT64( 0x8000000000000000 );
4369        }
4370        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4371    }
4372    else {
4373        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4374    }
4375    return roundAndPackInt64( aSign, aSig0, aSig1 );
4376
4377}
4378
4379/*
4380-------------------------------------------------------------------------------
4381Returns the result of converting the quadruple-precision floating-point
4382value `a' to the 64-bit two's complement integer format.  The conversion
4383is performed according to the IEC/IEEE Standard for Binary Floating-Point
4384Arithmetic, except that the conversion is always rounded toward zero.
4385If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4386the conversion overflows, the largest integer with the same sign as `a' is
4387returned.
4388-------------------------------------------------------------------------------
4389*/
4390int64 float128_to_int64_round_to_zero( float128 a )
4391{
4392    flag aSign;
4393    int32 aExp, shiftCount;
4394    bits64 aSig0, aSig1;
4395    int64 z;
4396
4397    aSig1 = extractFloat128Frac1( a );
4398    aSig0 = extractFloat128Frac0( a );
4399    aExp = extractFloat128Exp( a );
4400    aSign = extractFloat128Sign( a );
4401    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4402    shiftCount = aExp - 0x402F;
4403    if ( 0 < shiftCount ) {
4404        if ( 0x403E <= aExp ) {
4405            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4406            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4407                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4408                if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4409            }
4410            else {
4411                float_raise( float_flag_invalid );
4412                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4413                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4414                }
4415            }
4416            return (sbits64) LIT64( 0x8000000000000000 );
4417        }
4418        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4419        if ( (bits64) ( aSig1<<shiftCount ) ) {
4420            float_exception_flags |= float_flag_inexact;
4421        }
4422    }
4423    else {
4424        if ( aExp < 0x3FFF ) {
4425            if ( aExp | aSig0 | aSig1 ) {
4426                float_exception_flags |= float_flag_inexact;
4427            }
4428            return 0;
4429        }
4430        z = aSig0>>( - shiftCount );
4431        if (    aSig1
4432             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4433            float_exception_flags |= float_flag_inexact;
4434        }
4435    }
4436    if ( aSign ) z = - z;
4437    return z;
4438
4439}
4440
4441/*
4442-------------------------------------------------------------------------------
4443Returns the result of converting the quadruple-precision floating-point
4444value `a' to the single-precision floating-point format.  The conversion
4445is performed according to the IEC/IEEE Standard for Binary Floating-Point
4446Arithmetic.
4447-------------------------------------------------------------------------------
4448*/
4449float32 float128_to_float32( float128 a )
4450{
4451    flag aSign;
4452    int32 aExp;
4453    bits64 aSig0, aSig1;
4454    bits32 zSig;
4455
4456    aSig1 = extractFloat128Frac1( a );
4457    aSig0 = extractFloat128Frac0( a );
4458    aExp = extractFloat128Exp( a );
4459    aSign = extractFloat128Sign( a );
4460    if ( aExp == 0x7FFF ) {
4461        if ( aSig0 | aSig1 ) {
4462            return commonNaNToFloat32( float128ToCommonNaN( a ) );
4463        }
4464        return packFloat32( aSign, 0xFF, 0 );
4465    }
4466    aSig0 |= ( aSig1 != 0 );
4467    shift64RightJamming( aSig0, 18, &aSig0 );
4468    zSig = aSig0;
4469    if ( aExp || zSig ) {
4470        zSig |= 0x40000000;
4471        aExp -= 0x3F81;
4472    }
4473    return roundAndPackFloat32( aSign, aExp, zSig );
4474
4475}
4476
4477/*
4478-------------------------------------------------------------------------------
4479Returns the result of converting the quadruple-precision floating-point
4480value `a' to the double-precision floating-point format.  The conversion
4481is performed according to the IEC/IEEE Standard for Binary Floating-Point
4482Arithmetic.
4483-------------------------------------------------------------------------------
4484*/
4485float64 float128_to_float64( float128 a )
4486{
4487    flag aSign;
4488    int32 aExp;
4489    bits64 aSig0, aSig1;
4490
4491    aSig1 = extractFloat128Frac1( a );
4492    aSig0 = extractFloat128Frac0( a );
4493    aExp = extractFloat128Exp( a );
4494    aSign = extractFloat128Sign( a );
4495    if ( aExp == 0x7FFF ) {
4496        if ( aSig0 | aSig1 ) {
4497            return commonNaNToFloat64( float128ToCommonNaN( a ) );
4498        }
4499        return packFloat64( aSign, 0x7FF, 0 );
4500    }
4501    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4502    aSig0 |= ( aSig1 != 0 );
4503    if ( aExp || aSig0 ) {
4504        aSig0 |= LIT64( 0x4000000000000000 );
4505        aExp -= 0x3C01;
4506    }
4507    return roundAndPackFloat64( aSign, aExp, aSig0 );
4508
4509}
4510
4511#ifdef FLOATX80
4512
4513/*
4514-------------------------------------------------------------------------------
4515Returns the result of converting the quadruple-precision floating-point
4516value `a' to the extended double-precision floating-point format.  The
4517conversion is performed according to the IEC/IEEE Standard for Binary
4518Floating-Point Arithmetic.
4519-------------------------------------------------------------------------------
4520*/
4521floatx80 float128_to_floatx80( float128 a )
4522{
4523    flag aSign;
4524    int32 aExp;
4525    bits64 aSig0, aSig1;
4526
4527    aSig1 = extractFloat128Frac1( a );
4528    aSig0 = extractFloat128Frac0( a );
4529    aExp = extractFloat128Exp( a );
4530    aSign = extractFloat128Sign( a );
4531    if ( aExp == 0x7FFF ) {
4532        if ( aSig0 | aSig1 ) {
4533            return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4534        }
4535        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4536    }
4537    if ( aExp == 0 ) {
4538        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4539        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4540    }
4541    else {
4542        aSig0 |= LIT64( 0x0001000000000000 );
4543    }
4544    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4545    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4546
4547}
4548
4549#endif
4550
4551/*
4552-------------------------------------------------------------------------------
4553Rounds the quadruple-precision floating-point value `a' to an integer, and
4554returns the result as a quadruple-precision floating-point value.  The
4555operation is performed according to the IEC/IEEE Standard for Binary
4556Floating-Point Arithmetic.
4557-------------------------------------------------------------------------------
4558*/
4559float128 float128_round_to_int( float128 a )
4560{
4561    flag aSign;
4562    int32 aExp;
4563    bits64 lastBitMask, roundBitsMask;
4564    int8 roundingMode;
4565    float128 z;
4566
4567    aExp = extractFloat128Exp( a );
4568    if ( 0x402F <= aExp ) {
4569        if ( 0x406F <= aExp ) {
4570            if (    ( aExp == 0x7FFF )
4571                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4572               ) {
4573                return propagateFloat128NaN( a, a );
4574            }
4575            return a;
4576        }
4577        lastBitMask = 1;
4578        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4579        roundBitsMask = lastBitMask - 1;
4580        z = a;
4581        roundingMode = float_rounding_mode;
4582        if ( roundingMode == float_round_nearest_even ) {
4583            if ( lastBitMask ) {
4584                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4585                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4586            }
4587            else {
4588                if ( (sbits64) z.low < 0 ) {
4589                    ++z.high;
4590                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4591                }
4592            }
4593        }
4594        else if ( roundingMode != float_round_to_zero ) {
4595            if (   extractFloat128Sign( z )
4596                 ^ ( roundingMode == float_round_up ) ) {
4597                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4598            }
4599        }
4600        z.low &= ~ roundBitsMask;
4601    }
4602    else {
4603        if ( aExp < 0x3FFF ) {
4604            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4605            float_exception_flags |= float_flag_inexact;
4606            aSign = extractFloat128Sign( a );
4607            switch ( float_rounding_mode ) {
4608             case float_round_nearest_even:
4609                if (    ( aExp == 0x3FFE )
4610                     && (   extractFloat128Frac0( a )
4611                          | extractFloat128Frac1( a ) )
4612                   ) {
4613                    return packFloat128( aSign, 0x3FFF, 0, 0 );
4614                }
4615                break;
4616	     case float_round_to_zero:
4617		break;
4618             case float_round_down:
4619                return
4620                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4621                    : packFloat128( 0, 0, 0, 0 );
4622             case float_round_up:
4623                return
4624                      aSign ? packFloat128( 1, 0, 0, 0 )
4625                    : packFloat128( 0, 0x3FFF, 0, 0 );
4626            }
4627            return packFloat128( aSign, 0, 0, 0 );
4628        }
4629        lastBitMask = 1;
4630        lastBitMask <<= 0x402F - aExp;
4631        roundBitsMask = lastBitMask - 1;
4632        z.low = 0;
4633        z.high = a.high;
4634        roundingMode = float_rounding_mode;
4635        if ( roundingMode == float_round_nearest_even ) {
4636            z.high += lastBitMask>>1;
4637            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4638                z.high &= ~ lastBitMask;
4639            }
4640        }
4641        else if ( roundingMode != float_round_to_zero ) {
4642            if (   extractFloat128Sign( z )
4643                 ^ ( roundingMode == float_round_up ) ) {
4644                z.high |= ( a.low != 0 );
4645                z.high += roundBitsMask;
4646            }
4647        }
4648        z.high &= ~ roundBitsMask;
4649    }
4650    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4651        float_exception_flags |= float_flag_inexact;
4652    }
4653    return z;
4654
4655}
4656
4657/*
4658-------------------------------------------------------------------------------
4659Returns the result of adding the absolute values of the quadruple-precision
4660floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4661before being returned.  `zSign' is ignored if the result is a NaN.
4662The addition is performed according to the IEC/IEEE Standard for Binary
4663Floating-Point Arithmetic.
4664-------------------------------------------------------------------------------
4665*/
4666static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4667{
4668    int32 aExp, bExp, zExp;
4669    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4670    int32 expDiff;
4671
4672    aSig1 = extractFloat128Frac1( a );
4673    aSig0 = extractFloat128Frac0( a );
4674    aExp = extractFloat128Exp( a );
4675    bSig1 = extractFloat128Frac1( b );
4676    bSig0 = extractFloat128Frac0( b );
4677    bExp = extractFloat128Exp( b );
4678    expDiff = aExp - bExp;
4679    if ( 0 < expDiff ) {
4680        if ( aExp == 0x7FFF ) {
4681            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4682            return a;
4683        }
4684        if ( bExp == 0 ) {
4685            --expDiff;
4686        }
4687        else {
4688            bSig0 |= LIT64( 0x0001000000000000 );
4689        }
4690        shift128ExtraRightJamming(
4691            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4692        zExp = aExp;
4693    }
4694    else if ( expDiff < 0 ) {
4695        if ( bExp == 0x7FFF ) {
4696            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4697            return packFloat128( zSign, 0x7FFF, 0, 0 );
4698        }
4699        if ( aExp == 0 ) {
4700            ++expDiff;
4701        }
4702        else {
4703            aSig0 |= LIT64( 0x0001000000000000 );
4704        }
4705        shift128ExtraRightJamming(
4706            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4707        zExp = bExp;
4708    }
4709    else {
4710        if ( aExp == 0x7FFF ) {
4711            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4712                return propagateFloat128NaN( a, b );
4713            }
4714            return a;
4715        }
4716        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4717        if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4718        zSig2 = 0;
4719        zSig0 |= LIT64( 0x0002000000000000 );
4720        zExp = aExp;
4721        goto shiftRight1;
4722    }
4723    aSig0 |= LIT64( 0x0001000000000000 );
4724    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4725    --zExp;
4726    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4727    ++zExp;
4728 shiftRight1:
4729    shift128ExtraRightJamming(
4730        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4731 roundAndPack:
4732    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4733
4734}
4735
4736/*
4737-------------------------------------------------------------------------------
4738Returns the result of subtracting the absolute values of the quadruple-
4739precision floating-point values `a' and `b'.  If `zSign' is 1, the
4740difference is negated before being returned.  `zSign' is ignored if the
4741result is a NaN.  The subtraction is performed according to the IEC/IEEE
4742Standard for Binary Floating-Point Arithmetic.
4743-------------------------------------------------------------------------------
4744*/
4745static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4746{
4747    int32 aExp, bExp, zExp;
4748    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4749    int32 expDiff;
4750    float128 z;
4751
4752    aSig1 = extractFloat128Frac1( a );
4753    aSig0 = extractFloat128Frac0( a );
4754    aExp = extractFloat128Exp( a );
4755    bSig1 = extractFloat128Frac1( b );
4756    bSig0 = extractFloat128Frac0( b );
4757    bExp = extractFloat128Exp( b );
4758    expDiff = aExp - bExp;
4759    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4760    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4761    if ( 0 < expDiff ) goto aExpBigger;
4762    if ( expDiff < 0 ) goto bExpBigger;
4763    if ( aExp == 0x7FFF ) {
4764        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4765            return propagateFloat128NaN( a, b );
4766        }
4767        float_raise( float_flag_invalid );
4768        z.low = float128_default_nan_low;
4769        z.high = float128_default_nan_high;
4770        return z;
4771    }
4772    if ( aExp == 0 ) {
4773        aExp = 1;
4774        bExp = 1;
4775    }
4776    if ( bSig0 < aSig0 ) goto aBigger;
4777    if ( aSig0 < bSig0 ) goto bBigger;
4778    if ( bSig1 < aSig1 ) goto aBigger;
4779    if ( aSig1 < bSig1 ) goto bBigger;
4780    return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4781 bExpBigger:
4782    if ( bExp == 0x7FFF ) {
4783        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4784        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4785    }
4786    if ( aExp == 0 ) {
4787        ++expDiff;
4788    }
4789    else {
4790        aSig0 |= LIT64( 0x4000000000000000 );
4791    }
4792    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4793    bSig0 |= LIT64( 0x4000000000000000 );
4794 bBigger:
4795    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4796    zExp = bExp;
4797    zSign ^= 1;
4798    goto normalizeRoundAndPack;
4799 aExpBigger:
4800    if ( aExp == 0x7FFF ) {
4801        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4802        return a;
4803    }
4804    if ( bExp == 0 ) {
4805        --expDiff;
4806    }
4807    else {
4808        bSig0 |= LIT64( 0x4000000000000000 );
4809    }
4810    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4811    aSig0 |= LIT64( 0x4000000000000000 );
4812 aBigger:
4813    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4814    zExp = aExp;
4815 normalizeRoundAndPack:
4816    --zExp;
4817    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4818
4819}
4820
4821/*
4822-------------------------------------------------------------------------------
4823Returns the result of adding the quadruple-precision floating-point values
4824`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4825for Binary Floating-Point Arithmetic.
4826-------------------------------------------------------------------------------
4827*/
4828float128 float128_add( float128 a, float128 b )
4829{
4830    flag aSign, bSign;
4831
4832    aSign = extractFloat128Sign( a );
4833    bSign = extractFloat128Sign( b );
4834    if ( aSign == bSign ) {
4835        return addFloat128Sigs( a, b, aSign );
4836    }
4837    else {
4838        return subFloat128Sigs( a, b, aSign );
4839    }
4840
4841}
4842
4843/*
4844-------------------------------------------------------------------------------
4845Returns the result of subtracting the quadruple-precision floating-point
4846values `a' and `b'.  The operation is performed according to the IEC/IEEE
4847Standard for Binary Floating-Point Arithmetic.
4848-------------------------------------------------------------------------------
4849*/
4850float128 float128_sub( float128 a, float128 b )
4851{
4852    flag aSign, bSign;
4853
4854    aSign = extractFloat128Sign( a );
4855    bSign = extractFloat128Sign( b );
4856    if ( aSign == bSign ) {
4857        return subFloat128Sigs( a, b, aSign );
4858    }
4859    else {
4860        return addFloat128Sigs( a, b, aSign );
4861    }
4862
4863}
4864
4865/*
4866-------------------------------------------------------------------------------
4867Returns the result of multiplying the quadruple-precision floating-point
4868values `a' and `b'.  The operation is performed according to the IEC/IEEE
4869Standard for Binary Floating-Point Arithmetic.
4870-------------------------------------------------------------------------------
4871*/
4872float128 float128_mul( float128 a, float128 b )
4873{
4874    flag aSign, bSign, zSign;
4875    int32 aExp, bExp, zExp;
4876    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4877    float128 z;
4878
4879    aSig1 = extractFloat128Frac1( a );
4880    aSig0 = extractFloat128Frac0( a );
4881    aExp = extractFloat128Exp( a );
4882    aSign = extractFloat128Sign( a );
4883    bSig1 = extractFloat128Frac1( b );
4884    bSig0 = extractFloat128Frac0( b );
4885    bExp = extractFloat128Exp( b );
4886    bSign = extractFloat128Sign( b );
4887    zSign = aSign ^ bSign;
4888    if ( aExp == 0x7FFF ) {
4889        if (    ( aSig0 | aSig1 )
4890             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4891            return propagateFloat128NaN( a, b );
4892        }
4893        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4894        return packFloat128( zSign, 0x7FFF, 0, 0 );
4895    }
4896    if ( bExp == 0x7FFF ) {
4897        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4898        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4899 invalid:
4900            float_raise( float_flag_invalid );
4901            z.low = float128_default_nan_low;
4902            z.high = float128_default_nan_high;
4903            return z;
4904        }
4905        return packFloat128( zSign, 0x7FFF, 0, 0 );
4906    }
4907    if ( aExp == 0 ) {
4908        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4909        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4910    }
4911    if ( bExp == 0 ) {
4912        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4913        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4914    }
4915    zExp = aExp + bExp - 0x4000;
4916    aSig0 |= LIT64( 0x0001000000000000 );
4917    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4918    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4919    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4920    zSig2 |= ( zSig3 != 0 );
4921    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4922        shift128ExtraRightJamming(
4923            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4924        ++zExp;
4925    }
4926    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4927
4928}
4929
4930/*
4931-------------------------------------------------------------------------------
4932Returns the result of dividing the quadruple-precision floating-point value
4933`a' by the corresponding value `b'.  The operation is performed according to
4934the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4935-------------------------------------------------------------------------------
4936*/
4937float128 float128_div( float128 a, float128 b )
4938{
4939    flag aSign, bSign, zSign;
4940    int32 aExp, bExp, zExp;
4941    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4942    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4943    float128 z;
4944
4945    aSig1 = extractFloat128Frac1( a );
4946    aSig0 = extractFloat128Frac0( a );
4947    aExp = extractFloat128Exp( a );
4948    aSign = extractFloat128Sign( a );
4949    bSig1 = extractFloat128Frac1( b );
4950    bSig0 = extractFloat128Frac0( b );
4951    bExp = extractFloat128Exp( b );
4952    bSign = extractFloat128Sign( b );
4953    zSign = aSign ^ bSign;
4954    if ( aExp == 0x7FFF ) {
4955        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4956        if ( bExp == 0x7FFF ) {
4957            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4958            goto invalid;
4959        }
4960        return packFloat128( zSign, 0x7FFF, 0, 0 );
4961    }
4962    if ( bExp == 0x7FFF ) {
4963        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4964        return packFloat128( zSign, 0, 0, 0 );
4965    }
4966    if ( bExp == 0 ) {
4967        if ( ( bSig0 | bSig1 ) == 0 ) {
4968            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4969 invalid:
4970                float_raise( float_flag_invalid );
4971                z.low = float128_default_nan_low;
4972                z.high = float128_default_nan_high;
4973                return z;
4974            }
4975            float_raise( float_flag_divbyzero );
4976            return packFloat128( zSign, 0x7FFF, 0, 0 );
4977        }
4978        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4979    }
4980    if ( aExp == 0 ) {
4981        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4982        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4983    }
4984    zExp = aExp - bExp + 0x3FFD;
4985    shortShift128Left(
4986        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4987    shortShift128Left(
4988        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4989    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4990        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4991        ++zExp;
4992    }
4993    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4994    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4995    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4996    while ( (sbits64) rem0 < 0 ) {
4997        --zSig0;
4998        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4999    }
5000    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5001    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5002        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5003        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5004        while ( (sbits64) rem1 < 0 ) {
5005            --zSig1;
5006            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5007        }
5008        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5009    }
5010    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5011    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5012
5013}
5014
5015/*
5016-------------------------------------------------------------------------------
5017Returns the remainder of the quadruple-precision floating-point value `a'
5018with respect to the corresponding value `b'.  The operation is performed
5019according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5020-------------------------------------------------------------------------------
5021*/
5022float128 float128_rem( float128 a, float128 b )
5023{
5024    flag aSign, bSign, zSign;
5025    int32 aExp, bExp, expDiff;
5026    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5027    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5028    sbits64 sigMean0;
5029    float128 z;
5030
5031    aSig1 = extractFloat128Frac1( a );
5032    aSig0 = extractFloat128Frac0( a );
5033    aExp = extractFloat128Exp( a );
5034    aSign = extractFloat128Sign( a );
5035    bSig1 = extractFloat128Frac1( b );
5036    bSig0 = extractFloat128Frac0( b );
5037    bExp = extractFloat128Exp( b );
5038    bSign = extractFloat128Sign( b );
5039    if ( aExp == 0x7FFF ) {
5040        if (    ( aSig0 | aSig1 )
5041             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5042            return propagateFloat128NaN( a, b );
5043        }
5044        goto invalid;
5045    }
5046    if ( bExp == 0x7FFF ) {
5047        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5048        return a;
5049    }
5050    if ( bExp == 0 ) {
5051        if ( ( bSig0 | bSig1 ) == 0 ) {
5052 invalid:
5053            float_raise( float_flag_invalid );
5054            z.low = float128_default_nan_low;
5055            z.high = float128_default_nan_high;
5056            return z;
5057        }
5058        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5059    }
5060    if ( aExp == 0 ) {
5061        if ( ( aSig0 | aSig1 ) == 0 ) return a;
5062        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5063    }
5064    expDiff = aExp - bExp;
5065    if ( expDiff < -1 ) return a;
5066    shortShift128Left(
5067        aSig0 | LIT64( 0x0001000000000000 ),
5068        aSig1,
5069        15 - ( expDiff < 0 ),
5070        &aSig0,
5071        &aSig1
5072    );
5073    shortShift128Left(
5074        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5075    q = le128( bSig0, bSig1, aSig0, aSig1 );
5076    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5077    expDiff -= 64;
5078    while ( 0 < expDiff ) {
5079        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5080        q = ( 4 < q ) ? q - 4 : 0;
5081        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5082        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5083        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5084        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5085        expDiff -= 61;
5086    }
5087    if ( -64 < expDiff ) {
5088        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5089        q = ( 4 < q ) ? q - 4 : 0;
5090        q >>= - expDiff;
5091        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5092        expDiff += 52;
5093        if ( expDiff < 0 ) {
5094            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5095        }
5096        else {
5097            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5098        }
5099        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5100        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5101    }
5102    else {
5103        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5104        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5105    }
5106    do {
5107        alternateASig0 = aSig0;
5108        alternateASig1 = aSig1;
5109        ++q;
5110        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5111    } while ( 0 <= (sbits64) aSig0 );
5112    add128(
5113        aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5114    if (    ( sigMean0 < 0 )
5115         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5116        aSig0 = alternateASig0;
5117        aSig1 = alternateASig1;
5118    }
5119    zSign = ( (sbits64) aSig0 < 0 );
5120    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5121    return
5122        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5123
5124}
5125
5126/*
5127-------------------------------------------------------------------------------
5128Returns the square root of the quadruple-precision floating-point value `a'.
5129The operation is performed according to the IEC/IEEE Standard for Binary
5130Floating-Point Arithmetic.
5131-------------------------------------------------------------------------------
5132*/
5133float128 float128_sqrt( float128 a )
5134{
5135    flag aSign;
5136    int32 aExp, zExp;
5137    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5138    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5139    float128 z;
5140
5141    aSig1 = extractFloat128Frac1( a );
5142    aSig0 = extractFloat128Frac0( a );
5143    aExp = extractFloat128Exp( a );
5144    aSign = extractFloat128Sign( a );
5145    if ( aExp == 0x7FFF ) {
5146        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5147        if ( ! aSign ) return a;
5148        goto invalid;
5149    }
5150    if ( aSign ) {
5151        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5152 invalid:
5153        float_raise( float_flag_invalid );
5154        z.low = float128_default_nan_low;
5155        z.high = float128_default_nan_high;
5156        return z;
5157    }
5158    if ( aExp == 0 ) {
5159        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5160        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5161    }
5162    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5163    aSig0 |= LIT64( 0x0001000000000000 );
5164    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5165    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5166    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5167    doubleZSig0 = zSig0<<1;
5168    mul64To128( zSig0, zSig0, &term0, &term1 );
5169    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5170    while ( (sbits64) rem0 < 0 ) {
5171        --zSig0;
5172        doubleZSig0 -= 2;
5173        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5174    }
5175    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5176    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5177        if ( zSig1 == 0 ) zSig1 = 1;
5178        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5179        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5180        mul64To128( zSig1, zSig1, &term2, &term3 );
5181        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5182        while ( (sbits64) rem1 < 0 ) {
5183            --zSig1;
5184            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5185            term3 |= 1;
5186            term2 |= doubleZSig0;
5187            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5188        }
5189        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5190    }
5191    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5192    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5193
5194}
5195
5196/*
5197-------------------------------------------------------------------------------
5198Returns 1 if the quadruple-precision floating-point value `a' is equal to
5199the corresponding value `b', and 0 otherwise.  The comparison is performed
5200according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5201-------------------------------------------------------------------------------
5202*/
5203flag float128_eq( float128 a, float128 b )
5204{
5205
5206    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5207              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5208         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5209              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5210       ) {
5211        if (    float128_is_signaling_nan( a )
5212             || float128_is_signaling_nan( b ) ) {
5213            float_raise( float_flag_invalid );
5214        }
5215        return 0;
5216    }
5217    return
5218           ( a.low == b.low )
5219        && (    ( a.high == b.high )
5220             || (    ( a.low == 0 )
5221                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5222           );
5223
5224}
5225
5226/*
5227-------------------------------------------------------------------------------
5228Returns 1 if the quadruple-precision floating-point value `a' is less than
5229or equal to the corresponding value `b', and 0 otherwise.  The comparison
5230is performed according to the IEC/IEEE Standard for Binary Floating-Point
5231Arithmetic.
5232-------------------------------------------------------------------------------
5233*/
5234flag float128_le( float128 a, float128 b )
5235{
5236    flag aSign, bSign;
5237
5238    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5239              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5240         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5241              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5242       ) {
5243        float_raise( float_flag_invalid );
5244        return 0;
5245    }
5246    aSign = extractFloat128Sign( a );
5247    bSign = extractFloat128Sign( b );
5248    if ( aSign != bSign ) {
5249        return
5250               aSign
5251            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5252                 == 0 );
5253    }
5254    return
5255          aSign ? le128( b.high, b.low, a.high, a.low )
5256        : le128( a.high, a.low, b.high, b.low );
5257
5258}
5259
5260/*
5261-------------------------------------------------------------------------------
5262Returns 1 if the quadruple-precision floating-point value `a' is less than
5263the corresponding value `b', and 0 otherwise.  The comparison is performed
5264according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5265-------------------------------------------------------------------------------
5266*/
5267flag float128_lt( float128 a, float128 b )
5268{
5269    flag aSign, bSign;
5270
5271    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5272              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5273         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5274              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5275       ) {
5276        float_raise( float_flag_invalid );
5277        return 0;
5278    }
5279    aSign = extractFloat128Sign( a );
5280    bSign = extractFloat128Sign( b );
5281    if ( aSign != bSign ) {
5282        return
5283               aSign
5284            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5285                 != 0 );
5286    }
5287    return
5288          aSign ? lt128( b.high, b.low, a.high, a.low )
5289        : lt128( a.high, a.low, b.high, b.low );
5290
5291}
5292
5293/*
5294-------------------------------------------------------------------------------
5295Returns 1 if the quadruple-precision floating-point value `a' is equal to
5296the corresponding value `b', and 0 otherwise.  The invalid exception is
5297raised if either operand is a NaN.  Otherwise, the comparison is performed
5298according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5299-------------------------------------------------------------------------------
5300*/
5301flag float128_eq_signaling( float128 a, float128 b )
5302{
5303
5304    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5305              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5306         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5307              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5308       ) {
5309        float_raise( float_flag_invalid );
5310        return 0;
5311    }
5312    return
5313           ( a.low == b.low )
5314        && (    ( a.high == b.high )
5315             || (    ( a.low == 0 )
5316                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5317           );
5318
5319}
5320
5321/*
5322-------------------------------------------------------------------------------
5323Returns 1 if the quadruple-precision floating-point value `a' is less than
5324or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5325cause an exception.  Otherwise, the comparison is performed according to the
5326IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5327-------------------------------------------------------------------------------
5328*/
5329flag float128_le_quiet( float128 a, float128 b )
5330{
5331    flag aSign, bSign;
5332
5333    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5334              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5335         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5336              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5337       ) {
5338        if (    float128_is_signaling_nan( a )
5339             || float128_is_signaling_nan( b ) ) {
5340            float_raise( float_flag_invalid );
5341        }
5342        return 0;
5343    }
5344    aSign = extractFloat128Sign( a );
5345    bSign = extractFloat128Sign( b );
5346    if ( aSign != bSign ) {
5347        return
5348               aSign
5349            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5350                 == 0 );
5351    }
5352    return
5353          aSign ? le128( b.high, b.low, a.high, a.low )
5354        : le128( a.high, a.low, b.high, b.low );
5355
5356}
5357
5358/*
5359-------------------------------------------------------------------------------
5360Returns 1 if the quadruple-precision floating-point value `a' is less than
5361the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5362exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5363Standard for Binary Floating-Point Arithmetic.
5364-------------------------------------------------------------------------------
5365*/
5366flag float128_lt_quiet( float128 a, float128 b )
5367{
5368    flag aSign, bSign;
5369
5370    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5371              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5372         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5373              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5374       ) {
5375        if (    float128_is_signaling_nan( a )
5376             || float128_is_signaling_nan( b ) ) {
5377            float_raise( float_flag_invalid );
5378        }
5379        return 0;
5380    }
5381    aSign = extractFloat128Sign( a );
5382    bSign = extractFloat128Sign( b );
5383    if ( aSign != bSign ) {
5384        return
5385               aSign
5386            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5387                 != 0 );
5388    }
5389    return
5390          aSign ? lt128( b.high, b.low, a.high, a.low )
5391        : lt128( a.high, a.low, b.high, b.low );
5392
5393}
5394
5395#endif
5396
5397
5398#if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5399
5400/*
5401 * These two routines are not part of the original softfloat distribution.
5402 *
5403 * They are based on the corresponding conversions to integer but return
5404 * unsigned numbers instead since these functions are required by GCC.
5405 *
5406 * Added by Mark Brinicombe <mark@NetBSD.org>	27/09/97
5407 *
5408 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5409 */
5410
5411/*
5412-------------------------------------------------------------------------------
5413Returns the result of converting the double-precision floating-point value
5414`a' to the 32-bit unsigned integer format.  The conversion is
5415performed according to the IEC/IEEE Standard for Binary Floating-point
5416Arithmetic, except that the conversion is always rounded toward zero.  If
5417`a' is a NaN, the largest positive integer is returned.  If the conversion
5418overflows, the largest integer positive is returned.
5419-------------------------------------------------------------------------------
5420*/
5421uint32 float64_to_uint32_round_to_zero( float64 a )
5422{
5423    flag aSign;
5424    int16 aExp, shiftCount;
5425    bits64 aSig, savedASig;
5426    uint32 z;
5427
5428    aSig = extractFloat64Frac( a );
5429    aExp = extractFloat64Exp( a );
5430    aSign = extractFloat64Sign( a );
5431
5432    if (aSign) {
5433        float_raise( float_flag_invalid );
5434    	return(0);
5435    }
5436
5437    if ( 0x41E < aExp ) {
5438        float_raise( float_flag_invalid );
5439        return 0xffffffff;
5440    }
5441    else if ( aExp < 0x3FF ) {
5442        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5443        return 0;
5444    }
5445    aSig |= LIT64( 0x0010000000000000 );
5446    shiftCount = 0x433 - aExp;
5447    savedASig = aSig;
5448    aSig >>= shiftCount;
5449    z = aSig;
5450    if ( ( aSig<<shiftCount ) != savedASig ) {
5451        float_exception_flags |= float_flag_inexact;
5452    }
5453    return z;
5454
5455}
5456
5457/*
5458-------------------------------------------------------------------------------
5459Returns the result of converting the single-precision floating-point value
5460`a' to the 32-bit unsigned integer format.  The conversion is
5461performed according to the IEC/IEEE Standard for Binary Floating-point
5462Arithmetic, except that the conversion is always rounded toward zero.  If
5463`a' is a NaN, the largest positive integer is returned.  If the conversion
5464overflows, the largest positive integer is returned.
5465-------------------------------------------------------------------------------
5466*/
5467uint32 float32_to_uint32_round_to_zero( float32 a )
5468{
5469    flag aSign;
5470    int16 aExp, shiftCount;
5471    bits32 aSig;
5472    uint32 z;
5473
5474    aSig = extractFloat32Frac( a );
5475    aExp = extractFloat32Exp( a );
5476    aSign = extractFloat32Sign( a );
5477    shiftCount = aExp - 0x9E;
5478
5479    if (aSign) {
5480        float_raise( float_flag_invalid );
5481    	return(0);
5482    }
5483    if ( 0 < shiftCount ) {
5484        float_raise( float_flag_invalid );
5485        return 0xFFFFFFFF;
5486    }
5487    else if ( aExp <= 0x7E ) {
5488        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5489        return 0;
5490    }
5491    aSig = ( aSig | 0x800000 )<<8;
5492    z = aSig>>( - shiftCount );
5493    if ( aSig<<( shiftCount & 31 ) ) {
5494        float_exception_flags |= float_flag_inexact;
5495    }
5496    return z;
5497
5498}
5499
5500#endif
5501