softfloat.c revision 230363
1/* $NetBSD: softfloat.c,v 1.8 2011/07/10 04:52:23 matt Exp $ */
2
3/*
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 *  itself).
7 */
8
9/*
10 * Things you may want to define:
11 *
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
14 *   properly renamed.
15 */
16
17/*
18===============================================================================
19
20This C source file is part of the SoftFloat IEC/IEEE Floating-point
21Arithmetic Package, Release 2a.
22
23Written by John R. Hauser.  This work was made possible in part by the
24International Computer Science Institute, located at Suite 600, 1947 Center
25Street, Berkeley, California 94704.  Funding was partially provided by the
26National Science Foundation under grant MIP-9311980.  The original version
27of this code was written as part of a project to build a fixed-point vector
28processor in collaboration with the University of California at Berkeley,
29overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31arithmetic/SoftFloat.html'.
32
33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39Derivative works are acceptable, even for commercial purposes, so long as
40(1) they include prominent notice that the work is derivative, and (2) they
41include prominent notice akin to these four paragraphs for those parts of
42this code that are retained.
43
44===============================================================================
45*/
46
47#include <sys/cdefs.h>
48__FBSDID("$FreeBSD: head/lib/libc/softfloat/bits64/softfloat.c 230363 2012-01-20 06:16:14Z das $");
49
50#ifdef SOFTFLOAT_FOR_GCC
51#include "softfloat-for-gcc.h"
52#endif
53
54#include "milieu.h"
55#include "softfloat.h"
56
57/*
58 * Conversions between floats as stored in memory and floats as
59 * SoftFloat uses them
60 */
61#ifndef FLOAT64_DEMANGLE
62#define FLOAT64_DEMANGLE(a)	(a)
63#endif
64#ifndef FLOAT64_MANGLE
65#define FLOAT64_MANGLE(a)	(a)
66#endif
67
68/*
69-------------------------------------------------------------------------------
70Floating-point rounding mode, extended double-precision rounding precision,
71and exception flags.
72-------------------------------------------------------------------------------
73*/
74int float_rounding_mode = float_round_nearest_even;
75int float_exception_flags = 0;
76#ifdef FLOATX80
77int8 floatx80_rounding_precision = 80;
78#endif
79
80/*
81-------------------------------------------------------------------------------
82Primitive arithmetic functions, including multi-word arithmetic, and
83division and square root approximations.  (Can be specialized to target if
84desired.)
85-------------------------------------------------------------------------------
86*/
87#include "softfloat-macros"
88
89/*
90-------------------------------------------------------------------------------
91Functions and definitions to determine:  (1) whether tininess for underflow
92is detected before or after rounding by default, (2) what (if anything)
93happens when exceptions are raised, (3) how signaling NaNs are distinguished
94from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
95are propagated from function inputs to output.  These details are target-
96specific.
97-------------------------------------------------------------------------------
98*/
99#include "softfloat-specialize"
100
101#if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
102/*
103-------------------------------------------------------------------------------
104Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
105and 7, and returns the properly rounded 32-bit integer corresponding to the
106input.  If `zSign' is 1, the input is negated before being converted to an
107integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
108is simply rounded to an integer, with the inexact exception raised if the
109input cannot be represented exactly as an integer.  However, if the fixed-
110point input is too large, the invalid exception is raised and the largest
111positive or negative integer is returned.
112-------------------------------------------------------------------------------
113*/
114static int32 roundAndPackInt32( flag zSign, bits64 absZ )
115{
116    int8 roundingMode;
117    flag roundNearestEven;
118    int8 roundIncrement, roundBits;
119    int32 z;
120
121    roundingMode = float_rounding_mode;
122    roundNearestEven = ( roundingMode == float_round_nearest_even );
123    roundIncrement = 0x40;
124    if ( ! roundNearestEven ) {
125        if ( roundingMode == float_round_to_zero ) {
126            roundIncrement = 0;
127        }
128        else {
129            roundIncrement = 0x7F;
130            if ( zSign ) {
131                if ( roundingMode == float_round_up ) roundIncrement = 0;
132            }
133            else {
134                if ( roundingMode == float_round_down ) roundIncrement = 0;
135            }
136        }
137    }
138    roundBits = absZ & 0x7F;
139    absZ = ( absZ + roundIncrement )>>7;
140    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
141    z = absZ;
142    if ( zSign ) z = - z;
143    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
144        float_raise( float_flag_invalid );
145        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
146    }
147    if ( roundBits ) float_exception_flags |= float_flag_inexact;
148    return z;
149
150}
151
152/*
153-------------------------------------------------------------------------------
154Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155`absZ1', with binary point between bits 63 and 64 (between the input words),
156and returns the properly rounded 64-bit integer corresponding to the input.
157If `zSign' is 1, the input is negated before being converted to an integer.
158Ordinarily, the fixed-point input is simply rounded to an integer, with
159the inexact exception raised if the input cannot be represented exactly as
160an integer.  However, if the fixed-point input is too large, the invalid
161exception is raised and the largest positive or negative integer is
162returned.
163-------------------------------------------------------------------------------
164*/
165static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
166{
167    int8 roundingMode;
168    flag roundNearestEven, increment;
169    int64 z;
170
171    roundingMode = float_rounding_mode;
172    roundNearestEven = ( roundingMode == float_round_nearest_even );
173    increment = ( (sbits64) absZ1 < 0 );
174    if ( ! roundNearestEven ) {
175        if ( roundingMode == float_round_to_zero ) {
176            increment = 0;
177        }
178        else {
179            if ( zSign ) {
180                increment = ( roundingMode == float_round_down ) && absZ1;
181            }
182            else {
183                increment = ( roundingMode == float_round_up ) && absZ1;
184            }
185        }
186    }
187    if ( increment ) {
188        ++absZ0;
189        if ( absZ0 == 0 ) goto overflow;
190        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
191    }
192    z = absZ0;
193    if ( zSign ) z = - z;
194    if ( z && ( ( z < 0 ) ^ zSign ) ) {
195 overflow:
196        float_raise( float_flag_invalid );
197        return
198              zSign ? (sbits64) LIT64( 0x8000000000000000 )
199            : LIT64( 0x7FFFFFFFFFFFFFFF );
200    }
201    if ( absZ1 ) float_exception_flags |= float_flag_inexact;
202    return z;
203
204}
205#endif
206
207/*
208-------------------------------------------------------------------------------
209Returns the fraction bits of the single-precision floating-point value `a'.
210-------------------------------------------------------------------------------
211*/
212INLINE bits32 extractFloat32Frac( float32 a )
213{
214
215    return a & 0x007FFFFF;
216
217}
218
219/*
220-------------------------------------------------------------------------------
221Returns the exponent bits of the single-precision floating-point value `a'.
222-------------------------------------------------------------------------------
223*/
224INLINE int16 extractFloat32Exp( float32 a )
225{
226
227    return ( a>>23 ) & 0xFF;
228
229}
230
231/*
232-------------------------------------------------------------------------------
233Returns the sign bit of the single-precision floating-point value `a'.
234-------------------------------------------------------------------------------
235*/
236INLINE flag extractFloat32Sign( float32 a )
237{
238
239    return a>>31;
240
241}
242
243/*
244-------------------------------------------------------------------------------
245Normalizes the subnormal single-precision floating-point value represented
246by the denormalized significand `aSig'.  The normalized exponent and
247significand are stored at the locations pointed to by `zExpPtr' and
248`zSigPtr', respectively.
249-------------------------------------------------------------------------------
250*/
251static void
252 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
253{
254    int8 shiftCount;
255
256    shiftCount = countLeadingZeros32( aSig ) - 8;
257    *zSigPtr = aSig<<shiftCount;
258    *zExpPtr = 1 - shiftCount;
259
260}
261
262/*
263-------------------------------------------------------------------------------
264Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
265single-precision floating-point value, returning the result.  After being
266shifted into the proper positions, the three fields are simply added
267together to form the result.  This means that any integer portion of `zSig'
268will be added into the exponent.  Since a properly normalized significand
269will have an integer portion equal to 1, the `zExp' input should be 1 less
270than the desired result exponent whenever `zSig' is a complete, normalized
271significand.
272-------------------------------------------------------------------------------
273*/
274INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
275{
276
277    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
278
279}
280
281/*
282-------------------------------------------------------------------------------
283Takes an abstract floating-point value having sign `zSign', exponent `zExp',
284and significand `zSig', and returns the proper single-precision floating-
285point value corresponding to the abstract input.  Ordinarily, the abstract
286value is simply rounded and packed into the single-precision format, with
287the inexact exception raised if the abstract input cannot be represented
288exactly.  However, if the abstract value is too large, the overflow and
289inexact exceptions are raised and an infinity or maximal finite value is
290returned.  If the abstract value is too small, the input value is rounded to
291a subnormal number, and the underflow and inexact exceptions are raised if
292the abstract input cannot be represented exactly as a subnormal single-
293precision floating-point number.
294    The input significand `zSig' has its binary point between bits 30
295and 29, which is 7 bits to the left of the usual location.  This shifted
296significand must be normalized or smaller.  If `zSig' is not normalized,
297`zExp' must be 0; in that case, the result returned is a subnormal number,
298and it must not require rounding.  In the usual case that `zSig' is
299normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
300The handling of underflow and overflow follows the IEC/IEEE Standard for
301Binary Floating-Point Arithmetic.
302-------------------------------------------------------------------------------
303*/
304static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
305{
306    int8 roundingMode;
307    flag roundNearestEven;
308    int8 roundIncrement, roundBits;
309    flag isTiny;
310
311    roundingMode = float_rounding_mode;
312    roundNearestEven = ( roundingMode == float_round_nearest_even );
313    roundIncrement = 0x40;
314    if ( ! roundNearestEven ) {
315        if ( roundingMode == float_round_to_zero ) {
316            roundIncrement = 0;
317        }
318        else {
319            roundIncrement = 0x7F;
320            if ( zSign ) {
321                if ( roundingMode == float_round_up ) roundIncrement = 0;
322            }
323            else {
324                if ( roundingMode == float_round_down ) roundIncrement = 0;
325            }
326        }
327    }
328    roundBits = zSig & 0x7F;
329    if ( 0xFD <= (bits16) zExp ) {
330        if (    ( 0xFD < zExp )
331             || (    ( zExp == 0xFD )
332                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
333           ) {
334            float_raise( float_flag_overflow | float_flag_inexact );
335            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
336        }
337        if ( zExp < 0 ) {
338            isTiny =
339                   ( float_detect_tininess == float_tininess_before_rounding )
340                || ( zExp < -1 )
341                || ( zSig + roundIncrement < 0x80000000 );
342            shift32RightJamming( zSig, - zExp, &zSig );
343            zExp = 0;
344            roundBits = zSig & 0x7F;
345            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
346        }
347    }
348    if ( roundBits ) float_exception_flags |= float_flag_inexact;
349    zSig = ( zSig + roundIncrement )>>7;
350    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
351    if ( zSig == 0 ) zExp = 0;
352    return packFloat32( zSign, zExp, zSig );
353
354}
355
356/*
357-------------------------------------------------------------------------------
358Takes an abstract floating-point value having sign `zSign', exponent `zExp',
359and significand `zSig', and returns the proper single-precision floating-
360point value corresponding to the abstract input.  This routine is just like
361`roundAndPackFloat32' except that `zSig' does not have to be normalized.
362Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
363floating-point exponent.
364-------------------------------------------------------------------------------
365*/
366static float32
367 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
368{
369    int8 shiftCount;
370
371    shiftCount = countLeadingZeros32( zSig ) - 1;
372    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
373
374}
375
376/*
377-------------------------------------------------------------------------------
378Returns the fraction bits of the double-precision floating-point value `a'.
379-------------------------------------------------------------------------------
380*/
381INLINE bits64 extractFloat64Frac( float64 a )
382{
383
384    return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
385
386}
387
388/*
389-------------------------------------------------------------------------------
390Returns the exponent bits of the double-precision floating-point value `a'.
391-------------------------------------------------------------------------------
392*/
393INLINE int16 extractFloat64Exp( float64 a )
394{
395
396    return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
397
398}
399
400/*
401-------------------------------------------------------------------------------
402Returns the sign bit of the double-precision floating-point value `a'.
403-------------------------------------------------------------------------------
404*/
405INLINE flag extractFloat64Sign( float64 a )
406{
407
408    return FLOAT64_DEMANGLE(a)>>63;
409
410}
411
412/*
413-------------------------------------------------------------------------------
414Normalizes the subnormal double-precision floating-point value represented
415by the denormalized significand `aSig'.  The normalized exponent and
416significand are stored at the locations pointed to by `zExpPtr' and
417`zSigPtr', respectively.
418-------------------------------------------------------------------------------
419*/
420static void
421 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
422{
423    int8 shiftCount;
424
425    shiftCount = countLeadingZeros64( aSig ) - 11;
426    *zSigPtr = aSig<<shiftCount;
427    *zExpPtr = 1 - shiftCount;
428
429}
430
431/*
432-------------------------------------------------------------------------------
433Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
434double-precision floating-point value, returning the result.  After being
435shifted into the proper positions, the three fields are simply added
436together to form the result.  This means that any integer portion of `zSig'
437will be added into the exponent.  Since a properly normalized significand
438will have an integer portion equal to 1, the `zExp' input should be 1 less
439than the desired result exponent whenever `zSig' is a complete, normalized
440significand.
441-------------------------------------------------------------------------------
442*/
443INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
444{
445
446    return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
447			   ( ( (bits64) zExp )<<52 ) + zSig );
448
449}
450
451/*
452-------------------------------------------------------------------------------
453Takes an abstract floating-point value having sign `zSign', exponent `zExp',
454and significand `zSig', and returns the proper double-precision floating-
455point value corresponding to the abstract input.  Ordinarily, the abstract
456value is simply rounded and packed into the double-precision format, with
457the inexact exception raised if the abstract input cannot be represented
458exactly.  However, if the abstract value is too large, the overflow and
459inexact exceptions are raised and an infinity or maximal finite value is
460returned.  If the abstract value is too small, the input value is rounded to
461a subnormal number, and the underflow and inexact exceptions are raised if
462the abstract input cannot be represented exactly as a subnormal double-
463precision floating-point number.
464    The input significand `zSig' has its binary point between bits 62
465and 61, which is 10 bits to the left of the usual location.  This shifted
466significand must be normalized or smaller.  If `zSig' is not normalized,
467`zExp' must be 0; in that case, the result returned is a subnormal number,
468and it must not require rounding.  In the usual case that `zSig' is
469normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
470The handling of underflow and overflow follows the IEC/IEEE Standard for
471Binary Floating-Point Arithmetic.
472-------------------------------------------------------------------------------
473*/
474static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
475{
476    int8 roundingMode;
477    flag roundNearestEven;
478    int16 roundIncrement, roundBits;
479    flag isTiny;
480
481    roundingMode = float_rounding_mode;
482    roundNearestEven = ( roundingMode == float_round_nearest_even );
483    roundIncrement = 0x200;
484    if ( ! roundNearestEven ) {
485        if ( roundingMode == float_round_to_zero ) {
486            roundIncrement = 0;
487        }
488        else {
489            roundIncrement = 0x3FF;
490            if ( zSign ) {
491                if ( roundingMode == float_round_up ) roundIncrement = 0;
492            }
493            else {
494                if ( roundingMode == float_round_down ) roundIncrement = 0;
495            }
496        }
497    }
498    roundBits = zSig & 0x3FF;
499    if ( 0x7FD <= (bits16) zExp ) {
500        if (    ( 0x7FD < zExp )
501             || (    ( zExp == 0x7FD )
502                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
503           ) {
504            float_raise( float_flag_overflow | float_flag_inexact );
505            return FLOAT64_MANGLE(
506		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
507		( roundIncrement == 0 ));
508        }
509        if ( zExp < 0 ) {
510            isTiny =
511                   ( float_detect_tininess == float_tininess_before_rounding )
512                || ( zExp < -1 )
513                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
514            shift64RightJamming( zSig, - zExp, &zSig );
515            zExp = 0;
516            roundBits = zSig & 0x3FF;
517            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
518        }
519    }
520    if ( roundBits ) float_exception_flags |= float_flag_inexact;
521    zSig = ( zSig + roundIncrement )>>10;
522    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
523    if ( zSig == 0 ) zExp = 0;
524    return packFloat64( zSign, zExp, zSig );
525
526}
527
528/*
529-------------------------------------------------------------------------------
530Takes an abstract floating-point value having sign `zSign', exponent `zExp',
531and significand `zSig', and returns the proper double-precision floating-
532point value corresponding to the abstract input.  This routine is just like
533`roundAndPackFloat64' except that `zSig' does not have to be normalized.
534Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
535floating-point exponent.
536-------------------------------------------------------------------------------
537*/
538static float64
539 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
540{
541    int8 shiftCount;
542
543    shiftCount = countLeadingZeros64( zSig ) - 1;
544    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
545
546}
547
548#ifdef FLOATX80
549
550/*
551-------------------------------------------------------------------------------
552Returns the fraction bits of the extended double-precision floating-point
553value `a'.
554-------------------------------------------------------------------------------
555*/
556INLINE bits64 extractFloatx80Frac( floatx80 a )
557{
558
559    return a.low;
560
561}
562
563/*
564-------------------------------------------------------------------------------
565Returns the exponent bits of the extended double-precision floating-point
566value `a'.
567-------------------------------------------------------------------------------
568*/
569INLINE int32 extractFloatx80Exp( floatx80 a )
570{
571
572    return a.high & 0x7FFF;
573
574}
575
576/*
577-------------------------------------------------------------------------------
578Returns the sign bit of the extended double-precision floating-point value
579`a'.
580-------------------------------------------------------------------------------
581*/
582INLINE flag extractFloatx80Sign( floatx80 a )
583{
584
585    return a.high>>15;
586
587}
588
589/*
590-------------------------------------------------------------------------------
591Normalizes the subnormal extended double-precision floating-point value
592represented by the denormalized significand `aSig'.  The normalized exponent
593and significand are stored at the locations pointed to by `zExpPtr' and
594`zSigPtr', respectively.
595-------------------------------------------------------------------------------
596*/
597static void
598 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
599{
600    int8 shiftCount;
601
602    shiftCount = countLeadingZeros64( aSig );
603    *zSigPtr = aSig<<shiftCount;
604    *zExpPtr = 1 - shiftCount;
605
606}
607
608/*
609-------------------------------------------------------------------------------
610Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
611extended double-precision floating-point value, returning the result.
612-------------------------------------------------------------------------------
613*/
614INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
615{
616    floatx80 z;
617
618    z.low = zSig;
619    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
620    return z;
621
622}
623
624/*
625-------------------------------------------------------------------------------
626Takes an abstract floating-point value having sign `zSign', exponent `zExp',
627and extended significand formed by the concatenation of `zSig0' and `zSig1',
628and returns the proper extended double-precision floating-point value
629corresponding to the abstract input.  Ordinarily, the abstract value is
630rounded and packed into the extended double-precision format, with the
631inexact exception raised if the abstract input cannot be represented
632exactly.  However, if the abstract value is too large, the overflow and
633inexact exceptions are raised and an infinity or maximal finite value is
634returned.  If the abstract value is too small, the input value is rounded to
635a subnormal number, and the underflow and inexact exceptions are raised if
636the abstract input cannot be represented exactly as a subnormal extended
637double-precision floating-point number.
638    If `roundingPrecision' is 32 or 64, the result is rounded to the same
639number of bits as single or double precision, respectively.  Otherwise, the
640result is rounded to the full precision of the extended double-precision
641format.
642    The input significand must be normalized or smaller.  If the input
643significand is not normalized, `zExp' must be 0; in that case, the result
644returned is a subnormal number, and it must not require rounding.  The
645handling of underflow and overflow follows the IEC/IEEE Standard for Binary
646Floating-Point Arithmetic.
647-------------------------------------------------------------------------------
648*/
649static floatx80
650 roundAndPackFloatx80(
651     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
652 )
653{
654    int8 roundingMode;
655    flag roundNearestEven, increment, isTiny;
656    int64 roundIncrement, roundMask, roundBits;
657
658    roundingMode = float_rounding_mode;
659    roundNearestEven = ( roundingMode == float_round_nearest_even );
660    if ( roundingPrecision == 80 ) goto precision80;
661    if ( roundingPrecision == 64 ) {
662        roundIncrement = LIT64( 0x0000000000000400 );
663        roundMask = LIT64( 0x00000000000007FF );
664    }
665    else if ( roundingPrecision == 32 ) {
666        roundIncrement = LIT64( 0x0000008000000000 );
667        roundMask = LIT64( 0x000000FFFFFFFFFF );
668    }
669    else {
670        goto precision80;
671    }
672    zSig0 |= ( zSig1 != 0 );
673    if ( ! roundNearestEven ) {
674        if ( roundingMode == float_round_to_zero ) {
675            roundIncrement = 0;
676        }
677        else {
678            roundIncrement = roundMask;
679            if ( zSign ) {
680                if ( roundingMode == float_round_up ) roundIncrement = 0;
681            }
682            else {
683                if ( roundingMode == float_round_down ) roundIncrement = 0;
684            }
685        }
686    }
687    roundBits = zSig0 & roundMask;
688    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
689        if (    ( 0x7FFE < zExp )
690             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
691           ) {
692            goto overflow;
693        }
694        if ( zExp <= 0 ) {
695            isTiny =
696                   ( float_detect_tininess == float_tininess_before_rounding )
697                || ( zExp < 0 )
698                || ( zSig0 <= zSig0 + roundIncrement );
699            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
700            zExp = 0;
701            roundBits = zSig0 & roundMask;
702            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
703            if ( roundBits ) float_exception_flags |= float_flag_inexact;
704            zSig0 += roundIncrement;
705            if ( (sbits64) zSig0 < 0 ) zExp = 1;
706            roundIncrement = roundMask + 1;
707            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
708                roundMask |= roundIncrement;
709            }
710            zSig0 &= ~ roundMask;
711            return packFloatx80( zSign, zExp, zSig0 );
712        }
713    }
714    if ( roundBits ) float_exception_flags |= float_flag_inexact;
715    zSig0 += roundIncrement;
716    if ( zSig0 < roundIncrement ) {
717        ++zExp;
718        zSig0 = LIT64( 0x8000000000000000 );
719    }
720    roundIncrement = roundMask + 1;
721    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
722        roundMask |= roundIncrement;
723    }
724    zSig0 &= ~ roundMask;
725    if ( zSig0 == 0 ) zExp = 0;
726    return packFloatx80( zSign, zExp, zSig0 );
727 precision80:
728    increment = ( (sbits64) zSig1 < 0 );
729    if ( ! roundNearestEven ) {
730        if ( roundingMode == float_round_to_zero ) {
731            increment = 0;
732        }
733        else {
734            if ( zSign ) {
735                increment = ( roundingMode == float_round_down ) && zSig1;
736            }
737            else {
738                increment = ( roundingMode == float_round_up ) && zSig1;
739            }
740        }
741    }
742    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
743        if (    ( 0x7FFE < zExp )
744             || (    ( zExp == 0x7FFE )
745                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
746                  && increment
747                )
748           ) {
749            roundMask = 0;
750 overflow:
751            float_raise( float_flag_overflow | float_flag_inexact );
752            if (    ( roundingMode == float_round_to_zero )
753                 || ( zSign && ( roundingMode == float_round_up ) )
754                 || ( ! zSign && ( roundingMode == float_round_down ) )
755               ) {
756                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
757            }
758            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
759        }
760        if ( zExp <= 0 ) {
761            isTiny =
762                   ( float_detect_tininess == float_tininess_before_rounding )
763                || ( zExp < 0 )
764                || ! increment
765                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
766            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
767            zExp = 0;
768            if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
769            if ( zSig1 ) float_exception_flags |= float_flag_inexact;
770            if ( roundNearestEven ) {
771                increment = ( (sbits64) zSig1 < 0 );
772            }
773            else {
774                if ( zSign ) {
775                    increment = ( roundingMode == float_round_down ) && zSig1;
776                }
777                else {
778                    increment = ( roundingMode == float_round_up ) && zSig1;
779                }
780            }
781            if ( increment ) {
782                ++zSig0;
783                zSig0 &=
784                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
785                if ( (sbits64) zSig0 < 0 ) zExp = 1;
786            }
787            return packFloatx80( zSign, zExp, zSig0 );
788        }
789    }
790    if ( zSig1 ) float_exception_flags |= float_flag_inexact;
791    if ( increment ) {
792        ++zSig0;
793        if ( zSig0 == 0 ) {
794            ++zExp;
795            zSig0 = LIT64( 0x8000000000000000 );
796        }
797        else {
798            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
799        }
800    }
801    else {
802        if ( zSig0 == 0 ) zExp = 0;
803    }
804    return packFloatx80( zSign, zExp, zSig0 );
805
806}
807
808/*
809-------------------------------------------------------------------------------
810Takes an abstract floating-point value having sign `zSign', exponent
811`zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
812and returns the proper extended double-precision floating-point value
813corresponding to the abstract input.  This routine is just like
814`roundAndPackFloatx80' except that the input significand does not have to be
815normalized.
816-------------------------------------------------------------------------------
817*/
818static floatx80
819 normalizeRoundAndPackFloatx80(
820     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
821 )
822{
823    int8 shiftCount;
824
825    if ( zSig0 == 0 ) {
826        zSig0 = zSig1;
827        zSig1 = 0;
828        zExp -= 64;
829    }
830    shiftCount = countLeadingZeros64( zSig0 );
831    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
832    zExp -= shiftCount;
833    return
834        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
835
836}
837
838#endif
839
840#ifdef FLOAT128
841
842/*
843-------------------------------------------------------------------------------
844Returns the least-significant 64 fraction bits of the quadruple-precision
845floating-point value `a'.
846-------------------------------------------------------------------------------
847*/
848INLINE bits64 extractFloat128Frac1( float128 a )
849{
850
851    return a.low;
852
853}
854
855/*
856-------------------------------------------------------------------------------
857Returns the most-significant 48 fraction bits of the quadruple-precision
858floating-point value `a'.
859-------------------------------------------------------------------------------
860*/
861INLINE bits64 extractFloat128Frac0( float128 a )
862{
863
864    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
865
866}
867
868/*
869-------------------------------------------------------------------------------
870Returns the exponent bits of the quadruple-precision floating-point value
871`a'.
872-------------------------------------------------------------------------------
873*/
874INLINE int32 extractFloat128Exp( float128 a )
875{
876
877    return ( a.high>>48 ) & 0x7FFF;
878
879}
880
881/*
882-------------------------------------------------------------------------------
883Returns the sign bit of the quadruple-precision floating-point value `a'.
884-------------------------------------------------------------------------------
885*/
886INLINE flag extractFloat128Sign( float128 a )
887{
888
889    return a.high>>63;
890
891}
892
893/*
894-------------------------------------------------------------------------------
895Normalizes the subnormal quadruple-precision floating-point value
896represented by the denormalized significand formed by the concatenation of
897`aSig0' and `aSig1'.  The normalized exponent is stored at the location
898pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
899significand are stored at the location pointed to by `zSig0Ptr', and the
900least significant 64 bits of the normalized significand are stored at the
901location pointed to by `zSig1Ptr'.
902-------------------------------------------------------------------------------
903*/
904static void
905 normalizeFloat128Subnormal(
906     bits64 aSig0,
907     bits64 aSig1,
908     int32 *zExpPtr,
909     bits64 *zSig0Ptr,
910     bits64 *zSig1Ptr
911 )
912{
913    int8 shiftCount;
914
915    if ( aSig0 == 0 ) {
916        shiftCount = countLeadingZeros64( aSig1 ) - 15;
917        if ( shiftCount < 0 ) {
918            *zSig0Ptr = aSig1>>( - shiftCount );
919            *zSig1Ptr = aSig1<<( shiftCount & 63 );
920        }
921        else {
922            *zSig0Ptr = aSig1<<shiftCount;
923            *zSig1Ptr = 0;
924        }
925        *zExpPtr = - shiftCount - 63;
926    }
927    else {
928        shiftCount = countLeadingZeros64( aSig0 ) - 15;
929        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
930        *zExpPtr = 1 - shiftCount;
931    }
932
933}
934
935/*
936-------------------------------------------------------------------------------
937Packs the sign `zSign', the exponent `zExp', and the significand formed
938by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
939floating-point value, returning the result.  After being shifted into the
940proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
941added together to form the most significant 32 bits of the result.  This
942means that any integer portion of `zSig0' will be added into the exponent.
943Since a properly normalized significand will have an integer portion equal
944to 1, the `zExp' input should be 1 less than the desired result exponent
945whenever `zSig0' and `zSig1' concatenated form a complete, normalized
946significand.
947-------------------------------------------------------------------------------
948*/
949INLINE float128
950 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
951{
952    float128 z;
953
954    z.low = zSig1;
955    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
956    return z;
957
958}
959
960/*
961-------------------------------------------------------------------------------
962Takes an abstract floating-point value having sign `zSign', exponent `zExp',
963and extended significand formed by the concatenation of `zSig0', `zSig1',
964and `zSig2', and returns the proper quadruple-precision floating-point value
965corresponding to the abstract input.  Ordinarily, the abstract value is
966simply rounded and packed into the quadruple-precision format, with the
967inexact exception raised if the abstract input cannot be represented
968exactly.  However, if the abstract value is too large, the overflow and
969inexact exceptions are raised and an infinity or maximal finite value is
970returned.  If the abstract value is too small, the input value is rounded to
971a subnormal number, and the underflow and inexact exceptions are raised if
972the abstract input cannot be represented exactly as a subnormal quadruple-
973precision floating-point number.
974    The input significand must be normalized or smaller.  If the input
975significand is not normalized, `zExp' must be 0; in that case, the result
976returned is a subnormal number, and it must not require rounding.  In the
977usual case that the input significand is normalized, `zExp' must be 1 less
978than the ``true'' floating-point exponent.  The handling of underflow and
979overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
980-------------------------------------------------------------------------------
981*/
982static float128
983 roundAndPackFloat128(
984     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
985{
986    int8 roundingMode;
987    flag roundNearestEven, increment, isTiny;
988
989    roundingMode = float_rounding_mode;
990    roundNearestEven = ( roundingMode == float_round_nearest_even );
991    increment = ( (sbits64) zSig2 < 0 );
992    if ( ! roundNearestEven ) {
993        if ( roundingMode == float_round_to_zero ) {
994            increment = 0;
995        }
996        else {
997            if ( zSign ) {
998                increment = ( roundingMode == float_round_down ) && zSig2;
999            }
1000            else {
1001                increment = ( roundingMode == float_round_up ) && zSig2;
1002            }
1003        }
1004    }
1005    if ( 0x7FFD <= (bits32) zExp ) {
1006        if (    ( 0x7FFD < zExp )
1007             || (    ( zExp == 0x7FFD )
1008                  && eq128(
1009                         LIT64( 0x0001FFFFFFFFFFFF ),
1010                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1011                         zSig0,
1012                         zSig1
1013                     )
1014                  && increment
1015                )
1016           ) {
1017            float_raise( float_flag_overflow | float_flag_inexact );
1018            if (    ( roundingMode == float_round_to_zero )
1019                 || ( zSign && ( roundingMode == float_round_up ) )
1020                 || ( ! zSign && ( roundingMode == float_round_down ) )
1021               ) {
1022                return
1023                    packFloat128(
1024                        zSign,
1025                        0x7FFE,
1026                        LIT64( 0x0000FFFFFFFFFFFF ),
1027                        LIT64( 0xFFFFFFFFFFFFFFFF )
1028                    );
1029            }
1030            return packFloat128( zSign, 0x7FFF, 0, 0 );
1031        }
1032        if ( zExp < 0 ) {
1033            isTiny =
1034                   ( float_detect_tininess == float_tininess_before_rounding )
1035                || ( zExp < -1 )
1036                || ! increment
1037                || lt128(
1038                       zSig0,
1039                       zSig1,
1040                       LIT64( 0x0001FFFFFFFFFFFF ),
1041                       LIT64( 0xFFFFFFFFFFFFFFFF )
1042                   );
1043            shift128ExtraRightJamming(
1044                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1045            zExp = 0;
1046            if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1047            if ( roundNearestEven ) {
1048                increment = ( (sbits64) zSig2 < 0 );
1049            }
1050            else {
1051                if ( zSign ) {
1052                    increment = ( roundingMode == float_round_down ) && zSig2;
1053                }
1054                else {
1055                    increment = ( roundingMode == float_round_up ) && zSig2;
1056                }
1057            }
1058        }
1059    }
1060    if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1061    if ( increment ) {
1062        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1063        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1064    }
1065    else {
1066        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1067    }
1068    return packFloat128( zSign, zExp, zSig0, zSig1 );
1069
1070}
1071
1072/*
1073-------------------------------------------------------------------------------
1074Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075and significand formed by the concatenation of `zSig0' and `zSig1', and
1076returns the proper quadruple-precision floating-point value corresponding
1077to the abstract input.  This routine is just like `roundAndPackFloat128'
1078except that the input significand has fewer bits and does not have to be
1079normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1080point exponent.
1081-------------------------------------------------------------------------------
1082*/
1083static float128
1084 normalizeRoundAndPackFloat128(
1085     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1086{
1087    int8 shiftCount;
1088    bits64 zSig2;
1089
1090    if ( zSig0 == 0 ) {
1091        zSig0 = zSig1;
1092        zSig1 = 0;
1093        zExp -= 64;
1094    }
1095    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096    if ( 0 <= shiftCount ) {
1097        zSig2 = 0;
1098        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1099    }
1100    else {
1101        shift128ExtraRightJamming(
1102            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1103    }
1104    zExp -= shiftCount;
1105    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1106
1107}
1108
1109#endif
1110
1111/*
1112-------------------------------------------------------------------------------
1113Returns the result of converting the 32-bit two's complement integer `a'
1114to the single-precision floating-point format.  The conversion is performed
1115according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1116-------------------------------------------------------------------------------
1117*/
1118float32 int32_to_float32( int32 a )
1119{
1120    flag zSign;
1121
1122    if ( a == 0 ) return 0;
1123    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1124    zSign = ( a < 0 );
1125    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1126
1127}
1128
1129float32 uint32_to_float32( uint32 a )
1130{
1131    if ( a == 0 ) return 0;
1132    if ( a & (bits32) 0x80000000 )
1133	return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1134    return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1135}
1136
1137
1138/*
1139-------------------------------------------------------------------------------
1140Returns the result of converting the 32-bit two's complement integer `a'
1141to the double-precision floating-point format.  The conversion is performed
1142according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1143-------------------------------------------------------------------------------
1144*/
1145float64 int32_to_float64( int32 a )
1146{
1147    flag zSign;
1148    uint32 absA;
1149    int8 shiftCount;
1150    bits64 zSig;
1151
1152    if ( a == 0 ) return 0;
1153    zSign = ( a < 0 );
1154    absA = zSign ? - a : a;
1155    shiftCount = countLeadingZeros32( absA ) + 21;
1156    zSig = absA;
1157    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1158
1159}
1160
1161float64 uint32_to_float64( uint32 a )
1162{
1163    int8 shiftCount;
1164    bits64 zSig = a;
1165
1166    if ( a == 0 ) return 0;
1167    shiftCount = countLeadingZeros32( a ) + 21;
1168    return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1169
1170}
1171
1172#ifdef FLOATX80
1173
1174/*
1175-------------------------------------------------------------------------------
1176Returns the result of converting the 32-bit two's complement integer `a'
1177to the extended double-precision floating-point format.  The conversion
1178is performed according to the IEC/IEEE Standard for Binary Floating-Point
1179Arithmetic.
1180-------------------------------------------------------------------------------
1181*/
1182floatx80 int32_to_floatx80( int32 a )
1183{
1184    flag zSign;
1185    uint32 absA;
1186    int8 shiftCount;
1187    bits64 zSig;
1188
1189    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1190    zSign = ( a < 0 );
1191    absA = zSign ? - a : a;
1192    shiftCount = countLeadingZeros32( absA ) + 32;
1193    zSig = absA;
1194    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1195
1196}
1197
1198floatx80 uint32_to_floatx80( uint32 a )
1199{
1200    int8 shiftCount;
1201    bits64 zSig = a;
1202
1203    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1204    shiftCount = countLeadingZeros32( a ) + 32;
1205    return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1206
1207}
1208
1209#endif
1210
1211#ifdef FLOAT128
1212
1213/*
1214-------------------------------------------------------------------------------
1215Returns the result of converting the 32-bit two's complement integer `a' to
1216the quadruple-precision floating-point format.  The conversion is performed
1217according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1218-------------------------------------------------------------------------------
1219*/
1220float128 int32_to_float128( int32 a )
1221{
1222    flag zSign;
1223    uint32 absA;
1224    int8 shiftCount;
1225    bits64 zSig0;
1226
1227    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1228    zSign = ( a < 0 );
1229    absA = zSign ? - a : a;
1230    shiftCount = countLeadingZeros32( absA ) + 17;
1231    zSig0 = absA;
1232    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1233
1234}
1235
1236float128 uint32_to_float128( uint32 a )
1237{
1238    int8 shiftCount;
1239    bits64 zSig0 = a;
1240
1241    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1242    shiftCount = countLeadingZeros32( a ) + 17;
1243    return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1244
1245}
1246
1247#endif
1248
1249#ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1250/*
1251-------------------------------------------------------------------------------
1252Returns the result of converting the 64-bit two's complement integer `a'
1253to the single-precision floating-point format.  The conversion is performed
1254according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255-------------------------------------------------------------------------------
1256*/
1257float32 int64_to_float32( int64 a )
1258{
1259    flag zSign;
1260    uint64 absA;
1261    int8 shiftCount;
1262
1263    if ( a == 0 ) return 0;
1264    zSign = ( a < 0 );
1265    absA = zSign ? - a : a;
1266    shiftCount = countLeadingZeros64( absA ) - 40;
1267    if ( 0 <= shiftCount ) {
1268        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1269    }
1270    else {
1271        shiftCount += 7;
1272        if ( shiftCount < 0 ) {
1273            shift64RightJamming( absA, - shiftCount, &absA );
1274        }
1275        else {
1276            absA <<= shiftCount;
1277        }
1278        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1279    }
1280
1281}
1282
1283/*
1284-------------------------------------------------------------------------------
1285Returns the result of converting the 64-bit two's complement integer `a'
1286to the double-precision floating-point format.  The conversion is performed
1287according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1288-------------------------------------------------------------------------------
1289*/
1290float64 int64_to_float64( int64 a )
1291{
1292    flag zSign;
1293
1294    if ( a == 0 ) return 0;
1295    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1296        return packFloat64( 1, 0x43E, 0 );
1297    }
1298    zSign = ( a < 0 );
1299    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1300
1301}
1302
1303#ifdef FLOATX80
1304
1305/*
1306-------------------------------------------------------------------------------
1307Returns the result of converting the 64-bit two's complement integer `a'
1308to the extended double-precision floating-point format.  The conversion
1309is performed according to the IEC/IEEE Standard for Binary Floating-Point
1310Arithmetic.
1311-------------------------------------------------------------------------------
1312*/
1313floatx80 int64_to_floatx80( int64 a )
1314{
1315    flag zSign;
1316    uint64 absA;
1317    int8 shiftCount;
1318
1319    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1320    zSign = ( a < 0 );
1321    absA = zSign ? - a : a;
1322    shiftCount = countLeadingZeros64( absA );
1323    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1324
1325}
1326
1327#endif
1328
1329#endif /* !SOFTFLOAT_FOR_GCC */
1330
1331#ifdef FLOAT128
1332
1333/*
1334-------------------------------------------------------------------------------
1335Returns the result of converting the 64-bit two's complement integer `a' to
1336the quadruple-precision floating-point format.  The conversion is performed
1337according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1338-------------------------------------------------------------------------------
1339*/
1340float128 int64_to_float128( int64 a )
1341{
1342    flag zSign;
1343    uint64 absA;
1344    int8 shiftCount;
1345    int32 zExp;
1346    bits64 zSig0, zSig1;
1347
1348    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1349    zSign = ( a < 0 );
1350    absA = zSign ? - a : a;
1351    shiftCount = countLeadingZeros64( absA ) + 49;
1352    zExp = 0x406E - shiftCount;
1353    if ( 64 <= shiftCount ) {
1354        zSig1 = 0;
1355        zSig0 = absA;
1356        shiftCount -= 64;
1357    }
1358    else {
1359        zSig1 = absA;
1360        zSig0 = 0;
1361    }
1362    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1363    return packFloat128( zSign, zExp, zSig0, zSig1 );
1364
1365}
1366
1367#endif
1368
1369#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1370/*
1371-------------------------------------------------------------------------------
1372Returns the result of converting the single-precision floating-point value
1373`a' to the 32-bit two's complement integer format.  The conversion is
1374performed according to the IEC/IEEE Standard for Binary Floating-Point
1375Arithmetic---which means in particular that the conversion is rounded
1376according to the current rounding mode.  If `a' is a NaN, the largest
1377positive integer is returned.  Otherwise, if the conversion overflows, the
1378largest integer with the same sign as `a' is returned.
1379-------------------------------------------------------------------------------
1380*/
1381int32 float32_to_int32( float32 a )
1382{
1383    flag aSign;
1384    int16 aExp, shiftCount;
1385    bits32 aSig;
1386    bits64 aSig64;
1387
1388    aSig = extractFloat32Frac( a );
1389    aExp = extractFloat32Exp( a );
1390    aSign = extractFloat32Sign( a );
1391    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1392    if ( aExp ) aSig |= 0x00800000;
1393    shiftCount = 0xAF - aExp;
1394    aSig64 = aSig;
1395    aSig64 <<= 32;
1396    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1397    return roundAndPackInt32( aSign, aSig64 );
1398
1399}
1400#endif /* !SOFTFLOAT_FOR_GCC */
1401
1402/*
1403-------------------------------------------------------------------------------
1404Returns the result of converting the single-precision floating-point value
1405`a' to the 32-bit two's complement integer format.  The conversion is
1406performed according to the IEC/IEEE Standard for Binary Floating-Point
1407Arithmetic, except that the conversion is always rounded toward zero.
1408If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1409the conversion overflows, the largest integer with the same sign as `a' is
1410returned.
1411-------------------------------------------------------------------------------
1412*/
1413int32 float32_to_int32_round_to_zero( float32 a )
1414{
1415    flag aSign;
1416    int16 aExp, shiftCount;
1417    bits32 aSig;
1418    int32 z;
1419
1420    aSig = extractFloat32Frac( a );
1421    aExp = extractFloat32Exp( a );
1422    aSign = extractFloat32Sign( a );
1423    shiftCount = aExp - 0x9E;
1424    if ( 0 <= shiftCount ) {
1425        if ( a != 0xCF000000 ) {
1426            float_raise( float_flag_invalid );
1427            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1428        }
1429        return (sbits32) 0x80000000;
1430    }
1431    else if ( aExp <= 0x7E ) {
1432        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1433        return 0;
1434    }
1435    aSig = ( aSig | 0x00800000 )<<8;
1436    z = aSig>>( - shiftCount );
1437    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1438        float_exception_flags |= float_flag_inexact;
1439    }
1440    if ( aSign ) z = - z;
1441    return z;
1442
1443}
1444
1445#ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1446/*
1447-------------------------------------------------------------------------------
1448Returns the result of converting the single-precision floating-point value
1449`a' to the 64-bit two's complement integer format.  The conversion is
1450performed according to the IEC/IEEE Standard for Binary Floating-Point
1451Arithmetic---which means in particular that the conversion is rounded
1452according to the current rounding mode.  If `a' is a NaN, the largest
1453positive integer is returned.  Otherwise, if the conversion overflows, the
1454largest integer with the same sign as `a' is returned.
1455-------------------------------------------------------------------------------
1456*/
1457int64 float32_to_int64( float32 a )
1458{
1459    flag aSign;
1460    int16 aExp, shiftCount;
1461    bits32 aSig;
1462    bits64 aSig64, aSigExtra;
1463
1464    aSig = extractFloat32Frac( a );
1465    aExp = extractFloat32Exp( a );
1466    aSign = extractFloat32Sign( a );
1467    shiftCount = 0xBE - aExp;
1468    if ( shiftCount < 0 ) {
1469        float_raise( float_flag_invalid );
1470        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1471            return LIT64( 0x7FFFFFFFFFFFFFFF );
1472        }
1473        return (sbits64) LIT64( 0x8000000000000000 );
1474    }
1475    if ( aExp ) aSig |= 0x00800000;
1476    aSig64 = aSig;
1477    aSig64 <<= 40;
1478    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1479    return roundAndPackInt64( aSign, aSig64, aSigExtra );
1480
1481}
1482
1483/*
1484-------------------------------------------------------------------------------
1485Returns the result of converting the single-precision floating-point value
1486`a' to the 64-bit two's complement integer format.  The conversion is
1487performed according to the IEC/IEEE Standard for Binary Floating-Point
1488Arithmetic, except that the conversion is always rounded toward zero.  If
1489`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1490conversion overflows, the largest integer with the same sign as `a' is
1491returned.
1492-------------------------------------------------------------------------------
1493*/
1494int64 float32_to_int64_round_to_zero( float32 a )
1495{
1496    flag aSign;
1497    int16 aExp, shiftCount;
1498    bits32 aSig;
1499    bits64 aSig64;
1500    int64 z;
1501
1502    aSig = extractFloat32Frac( a );
1503    aExp = extractFloat32Exp( a );
1504    aSign = extractFloat32Sign( a );
1505    shiftCount = aExp - 0xBE;
1506    if ( 0 <= shiftCount ) {
1507        if ( a != 0xDF000000 ) {
1508            float_raise( float_flag_invalid );
1509            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1510                return LIT64( 0x7FFFFFFFFFFFFFFF );
1511            }
1512        }
1513        return (sbits64) LIT64( 0x8000000000000000 );
1514    }
1515    else if ( aExp <= 0x7E ) {
1516        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1517        return 0;
1518    }
1519    aSig64 = aSig | 0x00800000;
1520    aSig64 <<= 40;
1521    z = aSig64>>( - shiftCount );
1522    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1523        float_exception_flags |= float_flag_inexact;
1524    }
1525    if ( aSign ) z = - z;
1526    return z;
1527
1528}
1529#endif /* !SOFTFLOAT_FOR_GCC */
1530
1531/*
1532-------------------------------------------------------------------------------
1533Returns the result of converting the single-precision floating-point value
1534`a' to the double-precision floating-point format.  The conversion is
1535performed according to the IEC/IEEE Standard for Binary Floating-Point
1536Arithmetic.
1537-------------------------------------------------------------------------------
1538*/
1539float64 float32_to_float64( float32 a )
1540{
1541    flag aSign;
1542    int16 aExp;
1543    bits32 aSig;
1544
1545    aSig = extractFloat32Frac( a );
1546    aExp = extractFloat32Exp( a );
1547    aSign = extractFloat32Sign( a );
1548    if ( aExp == 0xFF ) {
1549        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1550        return packFloat64( aSign, 0x7FF, 0 );
1551    }
1552    if ( aExp == 0 ) {
1553        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1554        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1555        --aExp;
1556    }
1557    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1558
1559}
1560
1561#ifdef FLOATX80
1562
1563/*
1564-------------------------------------------------------------------------------
1565Returns the result of converting the single-precision floating-point value
1566`a' to the extended double-precision floating-point format.  The conversion
1567is performed according to the IEC/IEEE Standard for Binary Floating-Point
1568Arithmetic.
1569-------------------------------------------------------------------------------
1570*/
1571floatx80 float32_to_floatx80( float32 a )
1572{
1573    flag aSign;
1574    int16 aExp;
1575    bits32 aSig;
1576
1577    aSig = extractFloat32Frac( a );
1578    aExp = extractFloat32Exp( a );
1579    aSign = extractFloat32Sign( a );
1580    if ( aExp == 0xFF ) {
1581        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1582        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1583    }
1584    if ( aExp == 0 ) {
1585        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1586        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1587    }
1588    aSig |= 0x00800000;
1589    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1590
1591}
1592
1593#endif
1594
1595#ifdef FLOAT128
1596
1597/*
1598-------------------------------------------------------------------------------
1599Returns the result of converting the single-precision floating-point value
1600`a' to the double-precision floating-point format.  The conversion is
1601performed according to the IEC/IEEE Standard for Binary Floating-Point
1602Arithmetic.
1603-------------------------------------------------------------------------------
1604*/
1605float128 float32_to_float128( float32 a )
1606{
1607    flag aSign;
1608    int16 aExp;
1609    bits32 aSig;
1610
1611    aSig = extractFloat32Frac( a );
1612    aExp = extractFloat32Exp( a );
1613    aSign = extractFloat32Sign( a );
1614    if ( aExp == 0xFF ) {
1615        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1616        return packFloat128( aSign, 0x7FFF, 0, 0 );
1617    }
1618    if ( aExp == 0 ) {
1619        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1620        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1621        --aExp;
1622    }
1623    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1624
1625}
1626
1627#endif
1628
1629#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1630/*
1631-------------------------------------------------------------------------------
1632Rounds the single-precision floating-point value `a' to an integer, and
1633returns the result as a single-precision floating-point value.  The
1634operation is performed according to the IEC/IEEE Standard for Binary
1635Floating-Point Arithmetic.
1636-------------------------------------------------------------------------------
1637*/
1638float32 float32_round_to_int( float32 a )
1639{
1640    flag aSign;
1641    int16 aExp;
1642    bits32 lastBitMask, roundBitsMask;
1643    int8 roundingMode;
1644    float32 z;
1645
1646    aExp = extractFloat32Exp( a );
1647    if ( 0x96 <= aExp ) {
1648        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1649            return propagateFloat32NaN( a, a );
1650        }
1651        return a;
1652    }
1653    if ( aExp <= 0x7E ) {
1654        if ( (bits32) ( a<<1 ) == 0 ) return a;
1655        float_exception_flags |= float_flag_inexact;
1656        aSign = extractFloat32Sign( a );
1657        switch ( float_rounding_mode ) {
1658         case float_round_nearest_even:
1659            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1660                return packFloat32( aSign, 0x7F, 0 );
1661            }
1662            break;
1663	 case float_round_to_zero:
1664	    break;
1665         case float_round_down:
1666            return aSign ? 0xBF800000 : 0;
1667         case float_round_up:
1668            return aSign ? 0x80000000 : 0x3F800000;
1669        }
1670        return packFloat32( aSign, 0, 0 );
1671    }
1672    lastBitMask = 1;
1673    lastBitMask <<= 0x96 - aExp;
1674    roundBitsMask = lastBitMask - 1;
1675    z = a;
1676    roundingMode = float_rounding_mode;
1677    if ( roundingMode == float_round_nearest_even ) {
1678        z += lastBitMask>>1;
1679        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1680    }
1681    else if ( roundingMode != float_round_to_zero ) {
1682        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1683            z += roundBitsMask;
1684        }
1685    }
1686    z &= ~ roundBitsMask;
1687    if ( z != a ) float_exception_flags |= float_flag_inexact;
1688    return z;
1689
1690}
1691#endif /* !SOFTFLOAT_FOR_GCC */
1692
1693/*
1694-------------------------------------------------------------------------------
1695Returns the result of adding the absolute values of the single-precision
1696floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1697before being returned.  `zSign' is ignored if the result is a NaN.
1698The addition is performed according to the IEC/IEEE Standard for Binary
1699Floating-Point Arithmetic.
1700-------------------------------------------------------------------------------
1701*/
1702static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1703{
1704    int16 aExp, bExp, zExp;
1705    bits32 aSig, bSig, zSig;
1706    int16 expDiff;
1707
1708    aSig = extractFloat32Frac( a );
1709    aExp = extractFloat32Exp( a );
1710    bSig = extractFloat32Frac( b );
1711    bExp = extractFloat32Exp( b );
1712    expDiff = aExp - bExp;
1713    aSig <<= 6;
1714    bSig <<= 6;
1715    if ( 0 < expDiff ) {
1716        if ( aExp == 0xFF ) {
1717            if ( aSig ) return propagateFloat32NaN( a, b );
1718            return a;
1719        }
1720        if ( bExp == 0 ) {
1721            --expDiff;
1722        }
1723        else {
1724            bSig |= 0x20000000;
1725        }
1726        shift32RightJamming( bSig, expDiff, &bSig );
1727        zExp = aExp;
1728    }
1729    else if ( expDiff < 0 ) {
1730        if ( bExp == 0xFF ) {
1731            if ( bSig ) return propagateFloat32NaN( a, b );
1732            return packFloat32( zSign, 0xFF, 0 );
1733        }
1734        if ( aExp == 0 ) {
1735            ++expDiff;
1736        }
1737        else {
1738            aSig |= 0x20000000;
1739        }
1740        shift32RightJamming( aSig, - expDiff, &aSig );
1741        zExp = bExp;
1742    }
1743    else {
1744        if ( aExp == 0xFF ) {
1745            if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1746            return a;
1747        }
1748        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1749        zSig = 0x40000000 + aSig + bSig;
1750        zExp = aExp;
1751        goto roundAndPack;
1752    }
1753    aSig |= 0x20000000;
1754    zSig = ( aSig + bSig )<<1;
1755    --zExp;
1756    if ( (sbits32) zSig < 0 ) {
1757        zSig = aSig + bSig;
1758        ++zExp;
1759    }
1760 roundAndPack:
1761    return roundAndPackFloat32( zSign, zExp, zSig );
1762
1763}
1764
1765/*
1766-------------------------------------------------------------------------------
1767Returns the result of subtracting the absolute values of the single-
1768precision floating-point values `a' and `b'.  If `zSign' is 1, the
1769difference is negated before being returned.  `zSign' is ignored if the
1770result is a NaN.  The subtraction is performed according to the IEC/IEEE
1771Standard for Binary Floating-Point Arithmetic.
1772-------------------------------------------------------------------------------
1773*/
1774static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1775{
1776    int16 aExp, bExp, zExp;
1777    bits32 aSig, bSig, zSig;
1778    int16 expDiff;
1779
1780    aSig = extractFloat32Frac( a );
1781    aExp = extractFloat32Exp( a );
1782    bSig = extractFloat32Frac( b );
1783    bExp = extractFloat32Exp( b );
1784    expDiff = aExp - bExp;
1785    aSig <<= 7;
1786    bSig <<= 7;
1787    if ( 0 < expDiff ) goto aExpBigger;
1788    if ( expDiff < 0 ) goto bExpBigger;
1789    if ( aExp == 0xFF ) {
1790        if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1791        float_raise( float_flag_invalid );
1792        return float32_default_nan;
1793    }
1794    if ( aExp == 0 ) {
1795        aExp = 1;
1796        bExp = 1;
1797    }
1798    if ( bSig < aSig ) goto aBigger;
1799    if ( aSig < bSig ) goto bBigger;
1800    return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1801 bExpBigger:
1802    if ( bExp == 0xFF ) {
1803        if ( bSig ) return propagateFloat32NaN( a, b );
1804        return packFloat32( zSign ^ 1, 0xFF, 0 );
1805    }
1806    if ( aExp == 0 ) {
1807        ++expDiff;
1808    }
1809    else {
1810        aSig |= 0x40000000;
1811    }
1812    shift32RightJamming( aSig, - expDiff, &aSig );
1813    bSig |= 0x40000000;
1814 bBigger:
1815    zSig = bSig - aSig;
1816    zExp = bExp;
1817    zSign ^= 1;
1818    goto normalizeRoundAndPack;
1819 aExpBigger:
1820    if ( aExp == 0xFF ) {
1821        if ( aSig ) return propagateFloat32NaN( a, b );
1822        return a;
1823    }
1824    if ( bExp == 0 ) {
1825        --expDiff;
1826    }
1827    else {
1828        bSig |= 0x40000000;
1829    }
1830    shift32RightJamming( bSig, expDiff, &bSig );
1831    aSig |= 0x40000000;
1832 aBigger:
1833    zSig = aSig - bSig;
1834    zExp = aExp;
1835 normalizeRoundAndPack:
1836    --zExp;
1837    return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1838
1839}
1840
1841/*
1842-------------------------------------------------------------------------------
1843Returns the result of adding the single-precision floating-point values `a'
1844and `b'.  The operation is performed according to the IEC/IEEE Standard for
1845Binary Floating-Point Arithmetic.
1846-------------------------------------------------------------------------------
1847*/
1848float32 float32_add( float32 a, float32 b )
1849{
1850    flag aSign, bSign;
1851
1852    aSign = extractFloat32Sign( a );
1853    bSign = extractFloat32Sign( b );
1854    if ( aSign == bSign ) {
1855        return addFloat32Sigs( a, b, aSign );
1856    }
1857    else {
1858        return subFloat32Sigs( a, b, aSign );
1859    }
1860
1861}
1862
1863/*
1864-------------------------------------------------------------------------------
1865Returns the result of subtracting the single-precision floating-point values
1866`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1867for Binary Floating-Point Arithmetic.
1868-------------------------------------------------------------------------------
1869*/
1870float32 float32_sub( float32 a, float32 b )
1871{
1872    flag aSign, bSign;
1873
1874    aSign = extractFloat32Sign( a );
1875    bSign = extractFloat32Sign( b );
1876    if ( aSign == bSign ) {
1877        return subFloat32Sigs( a, b, aSign );
1878    }
1879    else {
1880        return addFloat32Sigs( a, b, aSign );
1881    }
1882
1883}
1884
1885/*
1886-------------------------------------------------------------------------------
1887Returns the result of multiplying the single-precision floating-point values
1888`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1889for Binary Floating-Point Arithmetic.
1890-------------------------------------------------------------------------------
1891*/
1892float32 float32_mul( float32 a, float32 b )
1893{
1894    flag aSign, bSign, zSign;
1895    int16 aExp, bExp, zExp;
1896    bits32 aSig, bSig;
1897    bits64 zSig64;
1898    bits32 zSig;
1899
1900    aSig = extractFloat32Frac( a );
1901    aExp = extractFloat32Exp( a );
1902    aSign = extractFloat32Sign( a );
1903    bSig = extractFloat32Frac( b );
1904    bExp = extractFloat32Exp( b );
1905    bSign = extractFloat32Sign( b );
1906    zSign = aSign ^ bSign;
1907    if ( aExp == 0xFF ) {
1908        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1909            return propagateFloat32NaN( a, b );
1910        }
1911        if ( ( bExp | bSig ) == 0 ) {
1912            float_raise( float_flag_invalid );
1913            return float32_default_nan;
1914        }
1915        return packFloat32( zSign, 0xFF, 0 );
1916    }
1917    if ( bExp == 0xFF ) {
1918        if ( bSig ) return propagateFloat32NaN( a, b );
1919        if ( ( aExp | aSig ) == 0 ) {
1920            float_raise( float_flag_invalid );
1921            return float32_default_nan;
1922        }
1923        return packFloat32( zSign, 0xFF, 0 );
1924    }
1925    if ( aExp == 0 ) {
1926        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1927        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1928    }
1929    if ( bExp == 0 ) {
1930        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1931        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1932    }
1933    zExp = aExp + bExp - 0x7F;
1934    aSig = ( aSig | 0x00800000 )<<7;
1935    bSig = ( bSig | 0x00800000 )<<8;
1936    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1937    zSig = zSig64;
1938    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1939        zSig <<= 1;
1940        --zExp;
1941    }
1942    return roundAndPackFloat32( zSign, zExp, zSig );
1943
1944}
1945
1946/*
1947-------------------------------------------------------------------------------
1948Returns the result of dividing the single-precision floating-point value `a'
1949by the corresponding value `b'.  The operation is performed according to the
1950IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1951-------------------------------------------------------------------------------
1952*/
1953float32 float32_div( float32 a, float32 b )
1954{
1955    flag aSign, bSign, zSign;
1956    int16 aExp, bExp, zExp;
1957    bits32 aSig, bSig, zSig;
1958
1959    aSig = extractFloat32Frac( a );
1960    aExp = extractFloat32Exp( a );
1961    aSign = extractFloat32Sign( a );
1962    bSig = extractFloat32Frac( b );
1963    bExp = extractFloat32Exp( b );
1964    bSign = extractFloat32Sign( b );
1965    zSign = aSign ^ bSign;
1966    if ( aExp == 0xFF ) {
1967        if ( aSig ) return propagateFloat32NaN( a, b );
1968        if ( bExp == 0xFF ) {
1969            if ( bSig ) return propagateFloat32NaN( a, b );
1970            float_raise( float_flag_invalid );
1971            return float32_default_nan;
1972        }
1973        return packFloat32( zSign, 0xFF, 0 );
1974    }
1975    if ( bExp == 0xFF ) {
1976        if ( bSig ) return propagateFloat32NaN( a, b );
1977        return packFloat32( zSign, 0, 0 );
1978    }
1979    if ( bExp == 0 ) {
1980        if ( bSig == 0 ) {
1981            if ( ( aExp | aSig ) == 0 ) {
1982                float_raise( float_flag_invalid );
1983                return float32_default_nan;
1984            }
1985            float_raise( float_flag_divbyzero );
1986            return packFloat32( zSign, 0xFF, 0 );
1987        }
1988        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1989    }
1990    if ( aExp == 0 ) {
1991        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1992        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1993    }
1994    zExp = aExp - bExp + 0x7D;
1995    aSig = ( aSig | 0x00800000 )<<7;
1996    bSig = ( bSig | 0x00800000 )<<8;
1997    if ( bSig <= ( aSig + aSig ) ) {
1998        aSig >>= 1;
1999        ++zExp;
2000    }
2001    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2002    if ( ( zSig & 0x3F ) == 0 ) {
2003        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2004    }
2005    return roundAndPackFloat32( zSign, zExp, zSig );
2006
2007}
2008
2009#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2010/*
2011-------------------------------------------------------------------------------
2012Returns the remainder of the single-precision floating-point value `a'
2013with respect to the corresponding value `b'.  The operation is performed
2014according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2015-------------------------------------------------------------------------------
2016*/
2017float32 float32_rem( float32 a, float32 b )
2018{
2019    flag aSign, bSign, zSign;
2020    int16 aExp, bExp, expDiff;
2021    bits32 aSig, bSig;
2022    bits32 q;
2023    bits64 aSig64, bSig64, q64;
2024    bits32 alternateASig;
2025    sbits32 sigMean;
2026
2027    aSig = extractFloat32Frac( a );
2028    aExp = extractFloat32Exp( a );
2029    aSign = extractFloat32Sign( a );
2030    bSig = extractFloat32Frac( b );
2031    bExp = extractFloat32Exp( b );
2032    bSign = extractFloat32Sign( b );
2033    if ( aExp == 0xFF ) {
2034        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2035            return propagateFloat32NaN( a, b );
2036        }
2037        float_raise( float_flag_invalid );
2038        return float32_default_nan;
2039    }
2040    if ( bExp == 0xFF ) {
2041        if ( bSig ) return propagateFloat32NaN( a, b );
2042        return a;
2043    }
2044    if ( bExp == 0 ) {
2045        if ( bSig == 0 ) {
2046            float_raise( float_flag_invalid );
2047            return float32_default_nan;
2048        }
2049        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2050    }
2051    if ( aExp == 0 ) {
2052        if ( aSig == 0 ) return a;
2053        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2054    }
2055    expDiff = aExp - bExp;
2056    aSig |= 0x00800000;
2057    bSig |= 0x00800000;
2058    if ( expDiff < 32 ) {
2059        aSig <<= 8;
2060        bSig <<= 8;
2061        if ( expDiff < 0 ) {
2062            if ( expDiff < -1 ) return a;
2063            aSig >>= 1;
2064        }
2065        q = ( bSig <= aSig );
2066        if ( q ) aSig -= bSig;
2067        if ( 0 < expDiff ) {
2068            q = ( ( (bits64) aSig )<<32 ) / bSig;
2069            q >>= 32 - expDiff;
2070            bSig >>= 2;
2071            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2072        }
2073        else {
2074            aSig >>= 2;
2075            bSig >>= 2;
2076        }
2077    }
2078    else {
2079        if ( bSig <= aSig ) aSig -= bSig;
2080        aSig64 = ( (bits64) aSig )<<40;
2081        bSig64 = ( (bits64) bSig )<<40;
2082        expDiff -= 64;
2083        while ( 0 < expDiff ) {
2084            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2085            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2086            aSig64 = - ( ( bSig * q64 )<<38 );
2087            expDiff -= 62;
2088        }
2089        expDiff += 64;
2090        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2091        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2092        q = q64>>( 64 - expDiff );
2093        bSig <<= 6;
2094        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2095    }
2096    do {
2097        alternateASig = aSig;
2098        ++q;
2099        aSig -= bSig;
2100    } while ( 0 <= (sbits32) aSig );
2101    sigMean = aSig + alternateASig;
2102    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2103        aSig = alternateASig;
2104    }
2105    zSign = ( (sbits32) aSig < 0 );
2106    if ( zSign ) aSig = - aSig;
2107    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2108
2109}
2110#endif /* !SOFTFLOAT_FOR_GCC */
2111
2112#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2113/*
2114-------------------------------------------------------------------------------
2115Returns the square root of the single-precision floating-point value `a'.
2116The operation is performed according to the IEC/IEEE Standard for Binary
2117Floating-Point Arithmetic.
2118-------------------------------------------------------------------------------
2119*/
2120float32 float32_sqrt( float32 a )
2121{
2122    flag aSign;
2123    int16 aExp, zExp;
2124    bits32 aSig, zSig;
2125    bits64 rem, term;
2126
2127    aSig = extractFloat32Frac( a );
2128    aExp = extractFloat32Exp( a );
2129    aSign = extractFloat32Sign( a );
2130    if ( aExp == 0xFF ) {
2131        if ( aSig ) return propagateFloat32NaN( a, 0 );
2132        if ( ! aSign ) return a;
2133        float_raise( float_flag_invalid );
2134        return float32_default_nan;
2135    }
2136    if ( aSign ) {
2137        if ( ( aExp | aSig ) == 0 ) return a;
2138        float_raise( float_flag_invalid );
2139        return float32_default_nan;
2140    }
2141    if ( aExp == 0 ) {
2142        if ( aSig == 0 ) return 0;
2143        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2144    }
2145    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2146    aSig = ( aSig | 0x00800000 )<<8;
2147    zSig = estimateSqrt32( aExp, aSig ) + 2;
2148    if ( ( zSig & 0x7F ) <= 5 ) {
2149        if ( zSig < 2 ) {
2150            zSig = 0x7FFFFFFF;
2151            goto roundAndPack;
2152        }
2153        aSig >>= aExp & 1;
2154        term = ( (bits64) zSig ) * zSig;
2155        rem = ( ( (bits64) aSig )<<32 ) - term;
2156        while ( (sbits64) rem < 0 ) {
2157            --zSig;
2158            rem += ( ( (bits64) zSig )<<1 ) | 1;
2159        }
2160        zSig |= ( rem != 0 );
2161    }
2162    shift32RightJamming( zSig, 1, &zSig );
2163 roundAndPack:
2164    return roundAndPackFloat32( 0, zExp, zSig );
2165
2166}
2167#endif /* !SOFTFLOAT_FOR_GCC */
2168
2169/*
2170-------------------------------------------------------------------------------
2171Returns 1 if the single-precision floating-point value `a' is equal to
2172the corresponding value `b', and 0 otherwise.  The comparison is performed
2173according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2174-------------------------------------------------------------------------------
2175*/
2176flag float32_eq( float32 a, float32 b )
2177{
2178
2179    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2180         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2181       ) {
2182        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2183            float_raise( float_flag_invalid );
2184        }
2185        return 0;
2186    }
2187    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2188
2189}
2190
2191/*
2192-------------------------------------------------------------------------------
2193Returns 1 if the single-precision floating-point value `a' is less than
2194or equal to the corresponding value `b', and 0 otherwise.  The comparison
2195is performed according to the IEC/IEEE Standard for Binary Floating-Point
2196Arithmetic.
2197-------------------------------------------------------------------------------
2198*/
2199flag float32_le( float32 a, float32 b )
2200{
2201    flag aSign, bSign;
2202
2203    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2204         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2205       ) {
2206        float_raise( float_flag_invalid );
2207        return 0;
2208    }
2209    aSign = extractFloat32Sign( a );
2210    bSign = extractFloat32Sign( b );
2211    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2212    return ( a == b ) || ( aSign ^ ( a < b ) );
2213
2214}
2215
2216/*
2217-------------------------------------------------------------------------------
2218Returns 1 if the single-precision floating-point value `a' is less than
2219the corresponding value `b', and 0 otherwise.  The comparison is performed
2220according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2221-------------------------------------------------------------------------------
2222*/
2223flag float32_lt( float32 a, float32 b )
2224{
2225    flag aSign, bSign;
2226
2227    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2228         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2229       ) {
2230        float_raise( float_flag_invalid );
2231        return 0;
2232    }
2233    aSign = extractFloat32Sign( a );
2234    bSign = extractFloat32Sign( b );
2235    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2236    return ( a != b ) && ( aSign ^ ( a < b ) );
2237
2238}
2239
2240#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2241/*
2242-------------------------------------------------------------------------------
2243Returns 1 if the single-precision floating-point value `a' is equal to
2244the corresponding value `b', and 0 otherwise.  The invalid exception is
2245raised if either operand is a NaN.  Otherwise, the comparison is performed
2246according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2247-------------------------------------------------------------------------------
2248*/
2249flag float32_eq_signaling( float32 a, float32 b )
2250{
2251
2252    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2253         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2254       ) {
2255        float_raise( float_flag_invalid );
2256        return 0;
2257    }
2258    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2259
2260}
2261
2262/*
2263-------------------------------------------------------------------------------
2264Returns 1 if the single-precision floating-point value `a' is less than or
2265equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2266cause an exception.  Otherwise, the comparison is performed according to the
2267IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2268-------------------------------------------------------------------------------
2269*/
2270flag float32_le_quiet( float32 a, float32 b )
2271{
2272    flag aSign, bSign;
2273
2274    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2275         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2276       ) {
2277        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2278            float_raise( float_flag_invalid );
2279        }
2280        return 0;
2281    }
2282    aSign = extractFloat32Sign( a );
2283    bSign = extractFloat32Sign( b );
2284    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2285    return ( a == b ) || ( aSign ^ ( a < b ) );
2286
2287}
2288
2289/*
2290-------------------------------------------------------------------------------
2291Returns 1 if the single-precision floating-point value `a' is less than
2292the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2293exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2294Standard for Binary Floating-Point Arithmetic.
2295-------------------------------------------------------------------------------
2296*/
2297flag float32_lt_quiet( float32 a, float32 b )
2298{
2299    flag aSign, bSign;
2300
2301    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2302         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2303       ) {
2304        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2305            float_raise( float_flag_invalid );
2306        }
2307        return 0;
2308    }
2309    aSign = extractFloat32Sign( a );
2310    bSign = extractFloat32Sign( b );
2311    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2312    return ( a != b ) && ( aSign ^ ( a < b ) );
2313
2314}
2315#endif /* !SOFTFLOAT_FOR_GCC */
2316
2317#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2318/*
2319-------------------------------------------------------------------------------
2320Returns the result of converting the double-precision floating-point value
2321`a' to the 32-bit two's complement integer format.  The conversion is
2322performed according to the IEC/IEEE Standard for Binary Floating-Point
2323Arithmetic---which means in particular that the conversion is rounded
2324according to the current rounding mode.  If `a' is a NaN, the largest
2325positive integer is returned.  Otherwise, if the conversion overflows, the
2326largest integer with the same sign as `a' is returned.
2327-------------------------------------------------------------------------------
2328*/
2329int32 float64_to_int32( float64 a )
2330{
2331    flag aSign;
2332    int16 aExp, shiftCount;
2333    bits64 aSig;
2334
2335    aSig = extractFloat64Frac( a );
2336    aExp = extractFloat64Exp( a );
2337    aSign = extractFloat64Sign( a );
2338    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2339    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2340    shiftCount = 0x42C - aExp;
2341    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2342    return roundAndPackInt32( aSign, aSig );
2343
2344}
2345#endif /* !SOFTFLOAT_FOR_GCC */
2346
2347/*
2348-------------------------------------------------------------------------------
2349Returns the result of converting the double-precision floating-point value
2350`a' to the 32-bit two's complement integer format.  The conversion is
2351performed according to the IEC/IEEE Standard for Binary Floating-Point
2352Arithmetic, except that the conversion is always rounded toward zero.
2353If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2354the conversion overflows, the largest integer with the same sign as `a' is
2355returned.
2356-------------------------------------------------------------------------------
2357*/
2358int32 float64_to_int32_round_to_zero( float64 a )
2359{
2360    flag aSign;
2361    int16 aExp, shiftCount;
2362    bits64 aSig, savedASig;
2363    int32 z;
2364
2365    aSig = extractFloat64Frac( a );
2366    aExp = extractFloat64Exp( a );
2367    aSign = extractFloat64Sign( a );
2368    if ( 0x41E < aExp ) {
2369        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2370        goto invalid;
2371    }
2372    else if ( aExp < 0x3FF ) {
2373        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2374        return 0;
2375    }
2376    aSig |= LIT64( 0x0010000000000000 );
2377    shiftCount = 0x433 - aExp;
2378    savedASig = aSig;
2379    aSig >>= shiftCount;
2380    z = aSig;
2381    if ( aSign ) z = - z;
2382    if ( ( z < 0 ) ^ aSign ) {
2383 invalid:
2384        float_raise( float_flag_invalid );
2385        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2386    }
2387    if ( ( aSig<<shiftCount ) != savedASig ) {
2388        float_exception_flags |= float_flag_inexact;
2389    }
2390    return z;
2391
2392}
2393
2394#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2395/*
2396-------------------------------------------------------------------------------
2397Returns the result of converting the double-precision floating-point value
2398`a' to the 64-bit two's complement integer format.  The conversion is
2399performed according to the IEC/IEEE Standard for Binary Floating-Point
2400Arithmetic---which means in particular that the conversion is rounded
2401according to the current rounding mode.  If `a' is a NaN, the largest
2402positive integer is returned.  Otherwise, if the conversion overflows, the
2403largest integer with the same sign as `a' is returned.
2404-------------------------------------------------------------------------------
2405*/
2406int64 float64_to_int64( float64 a )
2407{
2408    flag aSign;
2409    int16 aExp, shiftCount;
2410    bits64 aSig, aSigExtra;
2411
2412    aSig = extractFloat64Frac( a );
2413    aExp = extractFloat64Exp( a );
2414    aSign = extractFloat64Sign( a );
2415    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2416    shiftCount = 0x433 - aExp;
2417    if ( shiftCount <= 0 ) {
2418        if ( 0x43E < aExp ) {
2419            float_raise( float_flag_invalid );
2420            if (    ! aSign
2421                 || (    ( aExp == 0x7FF )
2422                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2423               ) {
2424                return LIT64( 0x7FFFFFFFFFFFFFFF );
2425            }
2426            return (sbits64) LIT64( 0x8000000000000000 );
2427        }
2428        aSigExtra = 0;
2429        aSig <<= - shiftCount;
2430    }
2431    else {
2432        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2433    }
2434    return roundAndPackInt64( aSign, aSig, aSigExtra );
2435
2436}
2437
2438/*
2439-------------------------------------------------------------------------------
2440Returns the result of converting the double-precision floating-point value
2441`a' to the 64-bit two's complement integer format.  The conversion is
2442performed according to the IEC/IEEE Standard for Binary Floating-Point
2443Arithmetic, except that the conversion is always rounded toward zero.
2444If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2445the conversion overflows, the largest integer with the same sign as `a' is
2446returned.
2447-------------------------------------------------------------------------------
2448*/
2449int64 float64_to_int64_round_to_zero( float64 a )
2450{
2451    flag aSign;
2452    int16 aExp, shiftCount;
2453    bits64 aSig;
2454    int64 z;
2455
2456    aSig = extractFloat64Frac( a );
2457    aExp = extractFloat64Exp( a );
2458    aSign = extractFloat64Sign( a );
2459    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2460    shiftCount = aExp - 0x433;
2461    if ( 0 <= shiftCount ) {
2462        if ( 0x43E <= aExp ) {
2463            if ( a != LIT64( 0xC3E0000000000000 ) ) {
2464                float_raise( float_flag_invalid );
2465                if (    ! aSign
2466                     || (    ( aExp == 0x7FF )
2467                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2468                   ) {
2469                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2470                }
2471            }
2472            return (sbits64) LIT64( 0x8000000000000000 );
2473        }
2474        z = aSig<<shiftCount;
2475    }
2476    else {
2477        if ( aExp < 0x3FE ) {
2478            if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2479            return 0;
2480        }
2481        z = aSig>>( - shiftCount );
2482        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2483            float_exception_flags |= float_flag_inexact;
2484        }
2485    }
2486    if ( aSign ) z = - z;
2487    return z;
2488
2489}
2490#endif /* !SOFTFLOAT_FOR_GCC */
2491
2492/*
2493-------------------------------------------------------------------------------
2494Returns the result of converting the double-precision floating-point value
2495`a' to the single-precision floating-point format.  The conversion is
2496performed according to the IEC/IEEE Standard for Binary Floating-Point
2497Arithmetic.
2498-------------------------------------------------------------------------------
2499*/
2500float32 float64_to_float32( float64 a )
2501{
2502    flag aSign;
2503    int16 aExp;
2504    bits64 aSig;
2505    bits32 zSig;
2506
2507    aSig = extractFloat64Frac( a );
2508    aExp = extractFloat64Exp( a );
2509    aSign = extractFloat64Sign( a );
2510    if ( aExp == 0x7FF ) {
2511        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2512        return packFloat32( aSign, 0xFF, 0 );
2513    }
2514    shift64RightJamming( aSig, 22, &aSig );
2515    zSig = aSig;
2516    if ( aExp || zSig ) {
2517        zSig |= 0x40000000;
2518        aExp -= 0x381;
2519    }
2520    return roundAndPackFloat32( aSign, aExp, zSig );
2521
2522}
2523
2524#ifdef FLOATX80
2525
2526/*
2527-------------------------------------------------------------------------------
2528Returns the result of converting the double-precision floating-point value
2529`a' to the extended double-precision floating-point format.  The conversion
2530is performed according to the IEC/IEEE Standard for Binary Floating-Point
2531Arithmetic.
2532-------------------------------------------------------------------------------
2533*/
2534floatx80 float64_to_floatx80( float64 a )
2535{
2536    flag aSign;
2537    int16 aExp;
2538    bits64 aSig;
2539
2540    aSig = extractFloat64Frac( a );
2541    aExp = extractFloat64Exp( a );
2542    aSign = extractFloat64Sign( a );
2543    if ( aExp == 0x7FF ) {
2544        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2545        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2546    }
2547    if ( aExp == 0 ) {
2548        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2549        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2550    }
2551    return
2552        packFloatx80(
2553            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2554
2555}
2556
2557#endif
2558
2559#ifdef FLOAT128
2560
2561/*
2562-------------------------------------------------------------------------------
2563Returns the result of converting the double-precision floating-point value
2564`a' to the quadruple-precision floating-point format.  The conversion is
2565performed according to the IEC/IEEE Standard for Binary Floating-Point
2566Arithmetic.
2567-------------------------------------------------------------------------------
2568*/
2569float128 float64_to_float128( float64 a )
2570{
2571    flag aSign;
2572    int16 aExp;
2573    bits64 aSig, zSig0, zSig1;
2574
2575    aSig = extractFloat64Frac( a );
2576    aExp = extractFloat64Exp( a );
2577    aSign = extractFloat64Sign( a );
2578    if ( aExp == 0x7FF ) {
2579        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2580        return packFloat128( aSign, 0x7FFF, 0, 0 );
2581    }
2582    if ( aExp == 0 ) {
2583        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2584        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2585        --aExp;
2586    }
2587    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2588    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2589
2590}
2591
2592#endif
2593
2594#ifndef SOFTFLOAT_FOR_GCC
2595/*
2596-------------------------------------------------------------------------------
2597Rounds the double-precision floating-point value `a' to an integer, and
2598returns the result as a double-precision floating-point value.  The
2599operation is performed according to the IEC/IEEE Standard for Binary
2600Floating-Point Arithmetic.
2601-------------------------------------------------------------------------------
2602*/
2603float64 float64_round_to_int( float64 a )
2604{
2605    flag aSign;
2606    int16 aExp;
2607    bits64 lastBitMask, roundBitsMask;
2608    int8 roundingMode;
2609    float64 z;
2610
2611    aExp = extractFloat64Exp( a );
2612    if ( 0x433 <= aExp ) {
2613        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2614            return propagateFloat64NaN( a, a );
2615        }
2616        return a;
2617    }
2618    if ( aExp < 0x3FF ) {
2619        if ( (bits64) ( a<<1 ) == 0 ) return a;
2620        float_exception_flags |= float_flag_inexact;
2621        aSign = extractFloat64Sign( a );
2622        switch ( float_rounding_mode ) {
2623         case float_round_nearest_even:
2624            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2625                return packFloat64( aSign, 0x3FF, 0 );
2626            }
2627            break;
2628	 case float_round_to_zero:
2629	    break;
2630         case float_round_down:
2631            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2632         case float_round_up:
2633            return
2634            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2635        }
2636        return packFloat64( aSign, 0, 0 );
2637    }
2638    lastBitMask = 1;
2639    lastBitMask <<= 0x433 - aExp;
2640    roundBitsMask = lastBitMask - 1;
2641    z = a;
2642    roundingMode = float_rounding_mode;
2643    if ( roundingMode == float_round_nearest_even ) {
2644        z += lastBitMask>>1;
2645        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2646    }
2647    else if ( roundingMode != float_round_to_zero ) {
2648        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2649            z += roundBitsMask;
2650        }
2651    }
2652    z &= ~ roundBitsMask;
2653    if ( z != a ) float_exception_flags |= float_flag_inexact;
2654    return z;
2655
2656}
2657#endif
2658
2659/*
2660-------------------------------------------------------------------------------
2661Returns the result of adding the absolute values of the double-precision
2662floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2663before being returned.  `zSign' is ignored if the result is a NaN.
2664The addition is performed according to the IEC/IEEE Standard for Binary
2665Floating-Point Arithmetic.
2666-------------------------------------------------------------------------------
2667*/
2668static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2669{
2670    int16 aExp, bExp, zExp;
2671    bits64 aSig, bSig, zSig;
2672    int16 expDiff;
2673
2674    aSig = extractFloat64Frac( a );
2675    aExp = extractFloat64Exp( a );
2676    bSig = extractFloat64Frac( b );
2677    bExp = extractFloat64Exp( b );
2678    expDiff = aExp - bExp;
2679    aSig <<= 9;
2680    bSig <<= 9;
2681    if ( 0 < expDiff ) {
2682        if ( aExp == 0x7FF ) {
2683            if ( aSig ) return propagateFloat64NaN( a, b );
2684            return a;
2685        }
2686        if ( bExp == 0 ) {
2687            --expDiff;
2688        }
2689        else {
2690            bSig |= LIT64( 0x2000000000000000 );
2691        }
2692        shift64RightJamming( bSig, expDiff, &bSig );
2693        zExp = aExp;
2694    }
2695    else if ( expDiff < 0 ) {
2696        if ( bExp == 0x7FF ) {
2697            if ( bSig ) return propagateFloat64NaN( a, b );
2698            return packFloat64( zSign, 0x7FF, 0 );
2699        }
2700        if ( aExp == 0 ) {
2701            ++expDiff;
2702        }
2703        else {
2704            aSig |= LIT64( 0x2000000000000000 );
2705        }
2706        shift64RightJamming( aSig, - expDiff, &aSig );
2707        zExp = bExp;
2708    }
2709    else {
2710        if ( aExp == 0x7FF ) {
2711            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2712            return a;
2713        }
2714        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2715        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2716        zExp = aExp;
2717        goto roundAndPack;
2718    }
2719    aSig |= LIT64( 0x2000000000000000 );
2720    zSig = ( aSig + bSig )<<1;
2721    --zExp;
2722    if ( (sbits64) zSig < 0 ) {
2723        zSig = aSig + bSig;
2724        ++zExp;
2725    }
2726 roundAndPack:
2727    return roundAndPackFloat64( zSign, zExp, zSig );
2728
2729}
2730
2731/*
2732-------------------------------------------------------------------------------
2733Returns the result of subtracting the absolute values of the double-
2734precision floating-point values `a' and `b'.  If `zSign' is 1, the
2735difference is negated before being returned.  `zSign' is ignored if the
2736result is a NaN.  The subtraction is performed according to the IEC/IEEE
2737Standard for Binary Floating-Point Arithmetic.
2738-------------------------------------------------------------------------------
2739*/
2740static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2741{
2742    int16 aExp, bExp, zExp;
2743    bits64 aSig, bSig, zSig;
2744    int16 expDiff;
2745
2746    aSig = extractFloat64Frac( a );
2747    aExp = extractFloat64Exp( a );
2748    bSig = extractFloat64Frac( b );
2749    bExp = extractFloat64Exp( b );
2750    expDiff = aExp - bExp;
2751    aSig <<= 10;
2752    bSig <<= 10;
2753    if ( 0 < expDiff ) goto aExpBigger;
2754    if ( expDiff < 0 ) goto bExpBigger;
2755    if ( aExp == 0x7FF ) {
2756        if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2757        float_raise( float_flag_invalid );
2758        return float64_default_nan;
2759    }
2760    if ( aExp == 0 ) {
2761        aExp = 1;
2762        bExp = 1;
2763    }
2764    if ( bSig < aSig ) goto aBigger;
2765    if ( aSig < bSig ) goto bBigger;
2766    return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2767 bExpBigger:
2768    if ( bExp == 0x7FF ) {
2769        if ( bSig ) return propagateFloat64NaN( a, b );
2770        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2771    }
2772    if ( aExp == 0 ) {
2773        ++expDiff;
2774    }
2775    else {
2776        aSig |= LIT64( 0x4000000000000000 );
2777    }
2778    shift64RightJamming( aSig, - expDiff, &aSig );
2779    bSig |= LIT64( 0x4000000000000000 );
2780 bBigger:
2781    zSig = bSig - aSig;
2782    zExp = bExp;
2783    zSign ^= 1;
2784    goto normalizeRoundAndPack;
2785 aExpBigger:
2786    if ( aExp == 0x7FF ) {
2787        if ( aSig ) return propagateFloat64NaN( a, b );
2788        return a;
2789    }
2790    if ( bExp == 0 ) {
2791        --expDiff;
2792    }
2793    else {
2794        bSig |= LIT64( 0x4000000000000000 );
2795    }
2796    shift64RightJamming( bSig, expDiff, &bSig );
2797    aSig |= LIT64( 0x4000000000000000 );
2798 aBigger:
2799    zSig = aSig - bSig;
2800    zExp = aExp;
2801 normalizeRoundAndPack:
2802    --zExp;
2803    return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2804
2805}
2806
2807/*
2808-------------------------------------------------------------------------------
2809Returns the result of adding the double-precision floating-point values `a'
2810and `b'.  The operation is performed according to the IEC/IEEE Standard for
2811Binary Floating-Point Arithmetic.
2812-------------------------------------------------------------------------------
2813*/
2814float64 float64_add( float64 a, float64 b )
2815{
2816    flag aSign, bSign;
2817
2818    aSign = extractFloat64Sign( a );
2819    bSign = extractFloat64Sign( b );
2820    if ( aSign == bSign ) {
2821        return addFloat64Sigs( a, b, aSign );
2822    }
2823    else {
2824        return subFloat64Sigs( a, b, aSign );
2825    }
2826
2827}
2828
2829/*
2830-------------------------------------------------------------------------------
2831Returns the result of subtracting the double-precision floating-point values
2832`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2833for Binary Floating-Point Arithmetic.
2834-------------------------------------------------------------------------------
2835*/
2836float64 float64_sub( float64 a, float64 b )
2837{
2838    flag aSign, bSign;
2839
2840    aSign = extractFloat64Sign( a );
2841    bSign = extractFloat64Sign( b );
2842    if ( aSign == bSign ) {
2843        return subFloat64Sigs( a, b, aSign );
2844    }
2845    else {
2846        return addFloat64Sigs( a, b, aSign );
2847    }
2848
2849}
2850
2851/*
2852-------------------------------------------------------------------------------
2853Returns the result of multiplying the double-precision floating-point values
2854`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2855for Binary Floating-Point Arithmetic.
2856-------------------------------------------------------------------------------
2857*/
2858float64 float64_mul( float64 a, float64 b )
2859{
2860    flag aSign, bSign, zSign;
2861    int16 aExp, bExp, zExp;
2862    bits64 aSig, bSig, zSig0, zSig1;
2863
2864    aSig = extractFloat64Frac( a );
2865    aExp = extractFloat64Exp( a );
2866    aSign = extractFloat64Sign( a );
2867    bSig = extractFloat64Frac( b );
2868    bExp = extractFloat64Exp( b );
2869    bSign = extractFloat64Sign( b );
2870    zSign = aSign ^ bSign;
2871    if ( aExp == 0x7FF ) {
2872        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2873            return propagateFloat64NaN( a, b );
2874        }
2875        if ( ( bExp | bSig ) == 0 ) {
2876            float_raise( float_flag_invalid );
2877            return float64_default_nan;
2878        }
2879        return packFloat64( zSign, 0x7FF, 0 );
2880    }
2881    if ( bExp == 0x7FF ) {
2882        if ( bSig ) return propagateFloat64NaN( a, b );
2883        if ( ( aExp | aSig ) == 0 ) {
2884            float_raise( float_flag_invalid );
2885            return float64_default_nan;
2886        }
2887        return packFloat64( zSign, 0x7FF, 0 );
2888    }
2889    if ( aExp == 0 ) {
2890        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2891        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2892    }
2893    if ( bExp == 0 ) {
2894        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2895        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2896    }
2897    zExp = aExp + bExp - 0x3FF;
2898    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2899    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2900    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2901    zSig0 |= ( zSig1 != 0 );
2902    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2903        zSig0 <<= 1;
2904        --zExp;
2905    }
2906    return roundAndPackFloat64( zSign, zExp, zSig0 );
2907
2908}
2909
2910/*
2911-------------------------------------------------------------------------------
2912Returns the result of dividing the double-precision floating-point value `a'
2913by the corresponding value `b'.  The operation is performed according to
2914the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2915-------------------------------------------------------------------------------
2916*/
2917float64 float64_div( float64 a, float64 b )
2918{
2919    flag aSign, bSign, zSign;
2920    int16 aExp, bExp, zExp;
2921    bits64 aSig, bSig, zSig;
2922    bits64 rem0, rem1;
2923    bits64 term0, term1;
2924
2925    aSig = extractFloat64Frac( a );
2926    aExp = extractFloat64Exp( a );
2927    aSign = extractFloat64Sign( a );
2928    bSig = extractFloat64Frac( b );
2929    bExp = extractFloat64Exp( b );
2930    bSign = extractFloat64Sign( b );
2931    zSign = aSign ^ bSign;
2932    if ( aExp == 0x7FF ) {
2933        if ( aSig ) return propagateFloat64NaN( a, b );
2934        if ( bExp == 0x7FF ) {
2935            if ( bSig ) return propagateFloat64NaN( a, b );
2936            float_raise( float_flag_invalid );
2937            return float64_default_nan;
2938        }
2939        return packFloat64( zSign, 0x7FF, 0 );
2940    }
2941    if ( bExp == 0x7FF ) {
2942        if ( bSig ) return propagateFloat64NaN( a, b );
2943        return packFloat64( zSign, 0, 0 );
2944    }
2945    if ( bExp == 0 ) {
2946        if ( bSig == 0 ) {
2947            if ( ( aExp | aSig ) == 0 ) {
2948                float_raise( float_flag_invalid );
2949                return float64_default_nan;
2950            }
2951            float_raise( float_flag_divbyzero );
2952            return packFloat64( zSign, 0x7FF, 0 );
2953        }
2954        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2955    }
2956    if ( aExp == 0 ) {
2957        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2958        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2959    }
2960    zExp = aExp - bExp + 0x3FD;
2961    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2962    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2963    if ( bSig <= ( aSig + aSig ) ) {
2964        aSig >>= 1;
2965        ++zExp;
2966    }
2967    zSig = estimateDiv128To64( aSig, 0, bSig );
2968    if ( ( zSig & 0x1FF ) <= 2 ) {
2969        mul64To128( bSig, zSig, &term0, &term1 );
2970        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2971        while ( (sbits64) rem0 < 0 ) {
2972            --zSig;
2973            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2974        }
2975        zSig |= ( rem1 != 0 );
2976    }
2977    return roundAndPackFloat64( zSign, zExp, zSig );
2978
2979}
2980
2981#ifndef SOFTFLOAT_FOR_GCC
2982/*
2983-------------------------------------------------------------------------------
2984Returns the remainder of the double-precision floating-point value `a'
2985with respect to the corresponding value `b'.  The operation is performed
2986according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2987-------------------------------------------------------------------------------
2988*/
2989float64 float64_rem( float64 a, float64 b )
2990{
2991    flag aSign, bSign, zSign;
2992    int16 aExp, bExp, expDiff;
2993    bits64 aSig, bSig;
2994    bits64 q, alternateASig;
2995    sbits64 sigMean;
2996
2997    aSig = extractFloat64Frac( a );
2998    aExp = extractFloat64Exp( a );
2999    aSign = extractFloat64Sign( a );
3000    bSig = extractFloat64Frac( b );
3001    bExp = extractFloat64Exp( b );
3002    bSign = extractFloat64Sign( b );
3003    if ( aExp == 0x7FF ) {
3004        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3005            return propagateFloat64NaN( a, b );
3006        }
3007        float_raise( float_flag_invalid );
3008        return float64_default_nan;
3009    }
3010    if ( bExp == 0x7FF ) {
3011        if ( bSig ) return propagateFloat64NaN( a, b );
3012        return a;
3013    }
3014    if ( bExp == 0 ) {
3015        if ( bSig == 0 ) {
3016            float_raise( float_flag_invalid );
3017            return float64_default_nan;
3018        }
3019        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3020    }
3021    if ( aExp == 0 ) {
3022        if ( aSig == 0 ) return a;
3023        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3024    }
3025    expDiff = aExp - bExp;
3026    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3027    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3028    if ( expDiff < 0 ) {
3029        if ( expDiff < -1 ) return a;
3030        aSig >>= 1;
3031    }
3032    q = ( bSig <= aSig );
3033    if ( q ) aSig -= bSig;
3034    expDiff -= 64;
3035    while ( 0 < expDiff ) {
3036        q = estimateDiv128To64( aSig, 0, bSig );
3037        q = ( 2 < q ) ? q - 2 : 0;
3038        aSig = - ( ( bSig>>2 ) * q );
3039        expDiff -= 62;
3040    }
3041    expDiff += 64;
3042    if ( 0 < expDiff ) {
3043        q = estimateDiv128To64( aSig, 0, bSig );
3044        q = ( 2 < q ) ? q - 2 : 0;
3045        q >>= 64 - expDiff;
3046        bSig >>= 2;
3047        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3048    }
3049    else {
3050        aSig >>= 2;
3051        bSig >>= 2;
3052    }
3053    do {
3054        alternateASig = aSig;
3055        ++q;
3056        aSig -= bSig;
3057    } while ( 0 <= (sbits64) aSig );
3058    sigMean = aSig + alternateASig;
3059    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3060        aSig = alternateASig;
3061    }
3062    zSign = ( (sbits64) aSig < 0 );
3063    if ( zSign ) aSig = - aSig;
3064    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3065
3066}
3067
3068/*
3069-------------------------------------------------------------------------------
3070Returns the square root of the double-precision floating-point value `a'.
3071The operation is performed according to the IEC/IEEE Standard for Binary
3072Floating-Point Arithmetic.
3073-------------------------------------------------------------------------------
3074*/
3075float64 float64_sqrt( float64 a )
3076{
3077    flag aSign;
3078    int16 aExp, zExp;
3079    bits64 aSig, zSig, doubleZSig;
3080    bits64 rem0, rem1, term0, term1;
3081
3082    aSig = extractFloat64Frac( a );
3083    aExp = extractFloat64Exp( a );
3084    aSign = extractFloat64Sign( a );
3085    if ( aExp == 0x7FF ) {
3086        if ( aSig ) return propagateFloat64NaN( a, a );
3087        if ( ! aSign ) return a;
3088        float_raise( float_flag_invalid );
3089        return float64_default_nan;
3090    }
3091    if ( aSign ) {
3092        if ( ( aExp | aSig ) == 0 ) return a;
3093        float_raise( float_flag_invalid );
3094        return float64_default_nan;
3095    }
3096    if ( aExp == 0 ) {
3097        if ( aSig == 0 ) return 0;
3098        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3099    }
3100    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3101    aSig |= LIT64( 0x0010000000000000 );
3102    zSig = estimateSqrt32( aExp, aSig>>21 );
3103    aSig <<= 9 - ( aExp & 1 );
3104    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3105    if ( ( zSig & 0x1FF ) <= 5 ) {
3106        doubleZSig = zSig<<1;
3107        mul64To128( zSig, zSig, &term0, &term1 );
3108        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3109        while ( (sbits64) rem0 < 0 ) {
3110            --zSig;
3111            doubleZSig -= 2;
3112            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3113        }
3114        zSig |= ( ( rem0 | rem1 ) != 0 );
3115    }
3116    return roundAndPackFloat64( 0, zExp, zSig );
3117
3118}
3119#endif
3120
3121/*
3122-------------------------------------------------------------------------------
3123Returns 1 if the double-precision floating-point value `a' is equal to the
3124corresponding value `b', and 0 otherwise.  The comparison is performed
3125according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3126-------------------------------------------------------------------------------
3127*/
3128flag float64_eq( float64 a, float64 b )
3129{
3130
3131    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3132         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3133       ) {
3134        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3135            float_raise( float_flag_invalid );
3136        }
3137        return 0;
3138    }
3139    return ( a == b ) ||
3140	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3141
3142}
3143
3144/*
3145-------------------------------------------------------------------------------
3146Returns 1 if the double-precision floating-point value `a' is less than or
3147equal to the corresponding value `b', and 0 otherwise.  The comparison is
3148performed according to the IEC/IEEE Standard for Binary Floating-Point
3149Arithmetic.
3150-------------------------------------------------------------------------------
3151*/
3152flag float64_le( float64 a, float64 b )
3153{
3154    flag aSign, bSign;
3155
3156    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3157         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3158       ) {
3159        float_raise( float_flag_invalid );
3160        return 0;
3161    }
3162    aSign = extractFloat64Sign( a );
3163    bSign = extractFloat64Sign( b );
3164    if ( aSign != bSign )
3165	return aSign ||
3166	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3167	      0 );
3168    return ( a == b ) ||
3169	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3170
3171}
3172
3173/*
3174-------------------------------------------------------------------------------
3175Returns 1 if the double-precision floating-point value `a' is less than
3176the corresponding value `b', and 0 otherwise.  The comparison is performed
3177according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3178-------------------------------------------------------------------------------
3179*/
3180flag float64_lt( float64 a, float64 b )
3181{
3182    flag aSign, bSign;
3183
3184    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3185         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3186       ) {
3187        float_raise( float_flag_invalid );
3188        return 0;
3189    }
3190    aSign = extractFloat64Sign( a );
3191    bSign = extractFloat64Sign( b );
3192    if ( aSign != bSign )
3193	return aSign &&
3194	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3195	      0 );
3196    return ( a != b ) &&
3197	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3198
3199}
3200
3201#ifndef SOFTFLOAT_FOR_GCC
3202/*
3203-------------------------------------------------------------------------------
3204Returns 1 if the double-precision floating-point value `a' is equal to the
3205corresponding value `b', and 0 otherwise.  The invalid exception is raised
3206if either operand is a NaN.  Otherwise, the comparison is performed
3207according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3208-------------------------------------------------------------------------------
3209*/
3210flag float64_eq_signaling( float64 a, float64 b )
3211{
3212
3213    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3214         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3215       ) {
3216        float_raise( float_flag_invalid );
3217        return 0;
3218    }
3219    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3220
3221}
3222
3223/*
3224-------------------------------------------------------------------------------
3225Returns 1 if the double-precision floating-point value `a' is less than or
3226equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3227cause an exception.  Otherwise, the comparison is performed according to the
3228IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3229-------------------------------------------------------------------------------
3230*/
3231flag float64_le_quiet( float64 a, float64 b )
3232{
3233    flag aSign, bSign;
3234
3235    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3236         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3237       ) {
3238        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3239            float_raise( float_flag_invalid );
3240        }
3241        return 0;
3242    }
3243    aSign = extractFloat64Sign( a );
3244    bSign = extractFloat64Sign( b );
3245    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3246    return ( a == b ) || ( aSign ^ ( a < b ) );
3247
3248}
3249
3250/*
3251-------------------------------------------------------------------------------
3252Returns 1 if the double-precision floating-point value `a' is less than
3253the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3254exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3255Standard for Binary Floating-Point Arithmetic.
3256-------------------------------------------------------------------------------
3257*/
3258flag float64_lt_quiet( float64 a, float64 b )
3259{
3260    flag aSign, bSign;
3261
3262    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3263         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3264       ) {
3265        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3266            float_raise( float_flag_invalid );
3267        }
3268        return 0;
3269    }
3270    aSign = extractFloat64Sign( a );
3271    bSign = extractFloat64Sign( b );
3272    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3273    return ( a != b ) && ( aSign ^ ( a < b ) );
3274
3275}
3276#endif
3277
3278#ifdef FLOATX80
3279
3280/*
3281-------------------------------------------------------------------------------
3282Returns the result of converting the extended double-precision floating-
3283point value `a' to the 32-bit two's complement integer format.  The
3284conversion is performed according to the IEC/IEEE Standard for Binary
3285Floating-Point Arithmetic---which means in particular that the conversion
3286is rounded according to the current rounding mode.  If `a' is a NaN, the
3287largest positive integer is returned.  Otherwise, if the conversion
3288overflows, the largest integer with the same sign as `a' is returned.
3289-------------------------------------------------------------------------------
3290*/
3291int32 floatx80_to_int32( floatx80 a )
3292{
3293    flag aSign;
3294    int32 aExp, shiftCount;
3295    bits64 aSig;
3296
3297    aSig = extractFloatx80Frac( a );
3298    aExp = extractFloatx80Exp( a );
3299    aSign = extractFloatx80Sign( a );
3300    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3301    shiftCount = 0x4037 - aExp;
3302    if ( shiftCount <= 0 ) shiftCount = 1;
3303    shift64RightJamming( aSig, shiftCount, &aSig );
3304    return roundAndPackInt32( aSign, aSig );
3305
3306}
3307
3308/*
3309-------------------------------------------------------------------------------
3310Returns the result of converting the extended double-precision floating-
3311point value `a' to the 32-bit two's complement integer format.  The
3312conversion is performed according to the IEC/IEEE Standard for Binary
3313Floating-Point Arithmetic, except that the conversion is always rounded
3314toward zero.  If `a' is a NaN, the largest positive integer is returned.
3315Otherwise, if the conversion overflows, the largest integer with the same
3316sign as `a' is returned.
3317-------------------------------------------------------------------------------
3318*/
3319int32 floatx80_to_int32_round_to_zero( floatx80 a )
3320{
3321    flag aSign;
3322    int32 aExp, shiftCount;
3323    bits64 aSig, savedASig;
3324    int32 z;
3325
3326    aSig = extractFloatx80Frac( a );
3327    aExp = extractFloatx80Exp( a );
3328    aSign = extractFloatx80Sign( a );
3329    if ( 0x401E < aExp ) {
3330        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3331        goto invalid;
3332    }
3333    else if ( aExp < 0x3FFF ) {
3334        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3335        return 0;
3336    }
3337    shiftCount = 0x403E - aExp;
3338    savedASig = aSig;
3339    aSig >>= shiftCount;
3340    z = aSig;
3341    if ( aSign ) z = - z;
3342    if ( ( z < 0 ) ^ aSign ) {
3343 invalid:
3344        float_raise( float_flag_invalid );
3345        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3346    }
3347    if ( ( aSig<<shiftCount ) != savedASig ) {
3348        float_exception_flags |= float_flag_inexact;
3349    }
3350    return z;
3351
3352}
3353
3354/*
3355-------------------------------------------------------------------------------
3356Returns the result of converting the extended double-precision floating-
3357point value `a' to the 64-bit two's complement integer format.  The
3358conversion is performed according to the IEC/IEEE Standard for Binary
3359Floating-Point Arithmetic---which means in particular that the conversion
3360is rounded according to the current rounding mode.  If `a' is a NaN,
3361the largest positive integer is returned.  Otherwise, if the conversion
3362overflows, the largest integer with the same sign as `a' is returned.
3363-------------------------------------------------------------------------------
3364*/
3365int64 floatx80_to_int64( floatx80 a )
3366{
3367    flag aSign;
3368    int32 aExp, shiftCount;
3369    bits64 aSig, aSigExtra;
3370
3371    aSig = extractFloatx80Frac( a );
3372    aExp = extractFloatx80Exp( a );
3373    aSign = extractFloatx80Sign( a );
3374    shiftCount = 0x403E - aExp;
3375    if ( shiftCount <= 0 ) {
3376        if ( shiftCount ) {
3377            float_raise( float_flag_invalid );
3378            if (    ! aSign
3379                 || (    ( aExp == 0x7FFF )
3380                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3381               ) {
3382                return LIT64( 0x7FFFFFFFFFFFFFFF );
3383            }
3384            return (sbits64) LIT64( 0x8000000000000000 );
3385        }
3386        aSigExtra = 0;
3387    }
3388    else {
3389        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3390    }
3391    return roundAndPackInt64( aSign, aSig, aSigExtra );
3392
3393}
3394
3395/*
3396-------------------------------------------------------------------------------
3397Returns the result of converting the extended double-precision floating-
3398point value `a' to the 64-bit two's complement integer format.  The
3399conversion is performed according to the IEC/IEEE Standard for Binary
3400Floating-Point Arithmetic, except that the conversion is always rounded
3401toward zero.  If `a' is a NaN, the largest positive integer is returned.
3402Otherwise, if the conversion overflows, the largest integer with the same
3403sign as `a' is returned.
3404-------------------------------------------------------------------------------
3405*/
3406int64 floatx80_to_int64_round_to_zero( floatx80 a )
3407{
3408    flag aSign;
3409    int32 aExp, shiftCount;
3410    bits64 aSig;
3411    int64 z;
3412
3413    aSig = extractFloatx80Frac( a );
3414    aExp = extractFloatx80Exp( a );
3415    aSign = extractFloatx80Sign( a );
3416    shiftCount = aExp - 0x403E;
3417    if ( 0 <= shiftCount ) {
3418        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3419        if ( ( a.high != 0xC03E ) || aSig ) {
3420            float_raise( float_flag_invalid );
3421            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3422                return LIT64( 0x7FFFFFFFFFFFFFFF );
3423            }
3424        }
3425        return (sbits64) LIT64( 0x8000000000000000 );
3426    }
3427    else if ( aExp < 0x3FFF ) {
3428        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3429        return 0;
3430    }
3431    z = aSig>>( - shiftCount );
3432    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3433        float_exception_flags |= float_flag_inexact;
3434    }
3435    if ( aSign ) z = - z;
3436    return z;
3437
3438}
3439
3440/*
3441-------------------------------------------------------------------------------
3442Returns the result of converting the extended double-precision floating-
3443point value `a' to the single-precision floating-point format.  The
3444conversion is performed according to the IEC/IEEE Standard for Binary
3445Floating-Point Arithmetic.
3446-------------------------------------------------------------------------------
3447*/
3448float32 floatx80_to_float32( floatx80 a )
3449{
3450    flag aSign;
3451    int32 aExp;
3452    bits64 aSig;
3453
3454    aSig = extractFloatx80Frac( a );
3455    aExp = extractFloatx80Exp( a );
3456    aSign = extractFloatx80Sign( a );
3457    if ( aExp == 0x7FFF ) {
3458        if ( (bits64) ( aSig<<1 ) ) {
3459            return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3460        }
3461        return packFloat32( aSign, 0xFF, 0 );
3462    }
3463    shift64RightJamming( aSig, 33, &aSig );
3464    if ( aExp || aSig ) aExp -= 0x3F81;
3465    return roundAndPackFloat32( aSign, aExp, aSig );
3466
3467}
3468
3469/*
3470-------------------------------------------------------------------------------
3471Returns the result of converting the extended double-precision floating-
3472point value `a' to the double-precision floating-point format.  The
3473conversion is performed according to the IEC/IEEE Standard for Binary
3474Floating-Point Arithmetic.
3475-------------------------------------------------------------------------------
3476*/
3477float64 floatx80_to_float64( floatx80 a )
3478{
3479    flag aSign;
3480    int32 aExp;
3481    bits64 aSig, zSig;
3482
3483    aSig = extractFloatx80Frac( a );
3484    aExp = extractFloatx80Exp( a );
3485    aSign = extractFloatx80Sign( a );
3486    if ( aExp == 0x7FFF ) {
3487        if ( (bits64) ( aSig<<1 ) ) {
3488            return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3489        }
3490        return packFloat64( aSign, 0x7FF, 0 );
3491    }
3492    shift64RightJamming( aSig, 1, &zSig );
3493    if ( aExp || aSig ) aExp -= 0x3C01;
3494    return roundAndPackFloat64( aSign, aExp, zSig );
3495
3496}
3497
3498#ifdef FLOAT128
3499
3500/*
3501-------------------------------------------------------------------------------
3502Returns the result of converting the extended double-precision floating-
3503point value `a' to the quadruple-precision floating-point format.  The
3504conversion is performed according to the IEC/IEEE Standard for Binary
3505Floating-Point Arithmetic.
3506-------------------------------------------------------------------------------
3507*/
3508float128 floatx80_to_float128( floatx80 a )
3509{
3510    flag aSign;
3511    int16 aExp;
3512    bits64 aSig, zSig0, zSig1;
3513
3514    aSig = extractFloatx80Frac( a );
3515    aExp = extractFloatx80Exp( a );
3516    aSign = extractFloatx80Sign( a );
3517    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3518        return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3519    }
3520    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3521    return packFloat128( aSign, aExp, zSig0, zSig1 );
3522
3523}
3524
3525#endif
3526
3527/*
3528-------------------------------------------------------------------------------
3529Rounds the extended double-precision floating-point value `a' to an integer,
3530and returns the result as an extended quadruple-precision floating-point
3531value.  The operation is performed according to the IEC/IEEE Standard for
3532Binary Floating-Point Arithmetic.
3533-------------------------------------------------------------------------------
3534*/
3535floatx80 floatx80_round_to_int( floatx80 a )
3536{
3537    flag aSign;
3538    int32 aExp;
3539    bits64 lastBitMask, roundBitsMask;
3540    int8 roundingMode;
3541    floatx80 z;
3542
3543    aExp = extractFloatx80Exp( a );
3544    if ( 0x403E <= aExp ) {
3545        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3546            return propagateFloatx80NaN( a, a );
3547        }
3548        return a;
3549    }
3550    if ( aExp < 0x3FFF ) {
3551        if (    ( aExp == 0 )
3552             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3553            return a;
3554        }
3555        float_exception_flags |= float_flag_inexact;
3556        aSign = extractFloatx80Sign( a );
3557        switch ( float_rounding_mode ) {
3558         case float_round_nearest_even:
3559            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3560               ) {
3561                return
3562                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3563            }
3564            break;
3565	 case float_round_to_zero:
3566	    break;
3567         case float_round_down:
3568            return
3569                  aSign ?
3570                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3571                : packFloatx80( 0, 0, 0 );
3572         case float_round_up:
3573            return
3574                  aSign ? packFloatx80( 1, 0, 0 )
3575                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3576        }
3577        return packFloatx80( aSign, 0, 0 );
3578    }
3579    lastBitMask = 1;
3580    lastBitMask <<= 0x403E - aExp;
3581    roundBitsMask = lastBitMask - 1;
3582    z = a;
3583    roundingMode = float_rounding_mode;
3584    if ( roundingMode == float_round_nearest_even ) {
3585        z.low += lastBitMask>>1;
3586        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3587    }
3588    else if ( roundingMode != float_round_to_zero ) {
3589        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3590            z.low += roundBitsMask;
3591        }
3592    }
3593    z.low &= ~ roundBitsMask;
3594    if ( z.low == 0 ) {
3595        ++z.high;
3596        z.low = LIT64( 0x8000000000000000 );
3597    }
3598    if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3599    return z;
3600
3601}
3602
3603/*
3604-------------------------------------------------------------------------------
3605Returns the result of adding the absolute values of the extended double-
3606precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3607negated before being returned.  `zSign' is ignored if the result is a NaN.
3608The addition is performed according to the IEC/IEEE Standard for Binary
3609Floating-Point Arithmetic.
3610-------------------------------------------------------------------------------
3611*/
3612static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3613{
3614    int32 aExp, bExp, zExp;
3615    bits64 aSig, bSig, zSig0, zSig1;
3616    int32 expDiff;
3617
3618    aSig = extractFloatx80Frac( a );
3619    aExp = extractFloatx80Exp( a );
3620    bSig = extractFloatx80Frac( b );
3621    bExp = extractFloatx80Exp( b );
3622    expDiff = aExp - bExp;
3623    if ( 0 < expDiff ) {
3624        if ( aExp == 0x7FFF ) {
3625            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3626            return a;
3627        }
3628        if ( bExp == 0 ) --expDiff;
3629        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3630        zExp = aExp;
3631    }
3632    else if ( expDiff < 0 ) {
3633        if ( bExp == 0x7FFF ) {
3634            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3635            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3636        }
3637        if ( aExp == 0 ) ++expDiff;
3638        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3639        zExp = bExp;
3640    }
3641    else {
3642        if ( aExp == 0x7FFF ) {
3643            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3644                return propagateFloatx80NaN( a, b );
3645            }
3646            return a;
3647        }
3648        zSig1 = 0;
3649        zSig0 = aSig + bSig;
3650        if ( aExp == 0 ) {
3651            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3652            goto roundAndPack;
3653        }
3654        zExp = aExp;
3655        goto shiftRight1;
3656    }
3657    zSig0 = aSig + bSig;
3658    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3659 shiftRight1:
3660    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3661    zSig0 |= LIT64( 0x8000000000000000 );
3662    ++zExp;
3663 roundAndPack:
3664    return
3665        roundAndPackFloatx80(
3666            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3667
3668}
3669
3670/*
3671-------------------------------------------------------------------------------
3672Returns the result of subtracting the absolute values of the extended
3673double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3674difference is negated before being returned.  `zSign' is ignored if the
3675result is a NaN.  The subtraction is performed according to the IEC/IEEE
3676Standard for Binary Floating-Point Arithmetic.
3677-------------------------------------------------------------------------------
3678*/
3679static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3680{
3681    int32 aExp, bExp, zExp;
3682    bits64 aSig, bSig, zSig0, zSig1;
3683    int32 expDiff;
3684    floatx80 z;
3685
3686    aSig = extractFloatx80Frac( a );
3687    aExp = extractFloatx80Exp( a );
3688    bSig = extractFloatx80Frac( b );
3689    bExp = extractFloatx80Exp( b );
3690    expDiff = aExp - bExp;
3691    if ( 0 < expDiff ) goto aExpBigger;
3692    if ( expDiff < 0 ) goto bExpBigger;
3693    if ( aExp == 0x7FFF ) {
3694        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3695            return propagateFloatx80NaN( a, b );
3696        }
3697        float_raise( float_flag_invalid );
3698        z.low = floatx80_default_nan_low;
3699        z.high = floatx80_default_nan_high;
3700        return z;
3701    }
3702    if ( aExp == 0 ) {
3703        aExp = 1;
3704        bExp = 1;
3705    }
3706    zSig1 = 0;
3707    if ( bSig < aSig ) goto aBigger;
3708    if ( aSig < bSig ) goto bBigger;
3709    return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3710 bExpBigger:
3711    if ( bExp == 0x7FFF ) {
3712        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3713        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3714    }
3715    if ( aExp == 0 ) ++expDiff;
3716    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3717 bBigger:
3718    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3719    zExp = bExp;
3720    zSign ^= 1;
3721    goto normalizeRoundAndPack;
3722 aExpBigger:
3723    if ( aExp == 0x7FFF ) {
3724        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3725        return a;
3726    }
3727    if ( bExp == 0 ) --expDiff;
3728    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3729 aBigger:
3730    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3731    zExp = aExp;
3732 normalizeRoundAndPack:
3733    return
3734        normalizeRoundAndPackFloatx80(
3735            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3736
3737}
3738
3739/*
3740-------------------------------------------------------------------------------
3741Returns the result of adding the extended double-precision floating-point
3742values `a' and `b'.  The operation is performed according to the IEC/IEEE
3743Standard for Binary Floating-Point Arithmetic.
3744-------------------------------------------------------------------------------
3745*/
3746floatx80 floatx80_add( floatx80 a, floatx80 b )
3747{
3748    flag aSign, bSign;
3749
3750    aSign = extractFloatx80Sign( a );
3751    bSign = extractFloatx80Sign( b );
3752    if ( aSign == bSign ) {
3753        return addFloatx80Sigs( a, b, aSign );
3754    }
3755    else {
3756        return subFloatx80Sigs( a, b, aSign );
3757    }
3758
3759}
3760
3761/*
3762-------------------------------------------------------------------------------
3763Returns the result of subtracting the extended double-precision floating-
3764point values `a' and `b'.  The operation is performed according to the
3765IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3766-------------------------------------------------------------------------------
3767*/
3768floatx80 floatx80_sub( floatx80 a, floatx80 b )
3769{
3770    flag aSign, bSign;
3771
3772    aSign = extractFloatx80Sign( a );
3773    bSign = extractFloatx80Sign( b );
3774    if ( aSign == bSign ) {
3775        return subFloatx80Sigs( a, b, aSign );
3776    }
3777    else {
3778        return addFloatx80Sigs( a, b, aSign );
3779    }
3780
3781}
3782
3783/*
3784-------------------------------------------------------------------------------
3785Returns the result of multiplying the extended double-precision floating-
3786point values `a' and `b'.  The operation is performed according to the
3787IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3788-------------------------------------------------------------------------------
3789*/
3790floatx80 floatx80_mul( floatx80 a, floatx80 b )
3791{
3792    flag aSign, bSign, zSign;
3793    int32 aExp, bExp, zExp;
3794    bits64 aSig, bSig, zSig0, zSig1;
3795    floatx80 z;
3796
3797    aSig = extractFloatx80Frac( a );
3798    aExp = extractFloatx80Exp( a );
3799    aSign = extractFloatx80Sign( a );
3800    bSig = extractFloatx80Frac( b );
3801    bExp = extractFloatx80Exp( b );
3802    bSign = extractFloatx80Sign( b );
3803    zSign = aSign ^ bSign;
3804    if ( aExp == 0x7FFF ) {
3805        if (    (bits64) ( aSig<<1 )
3806             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3807            return propagateFloatx80NaN( a, b );
3808        }
3809        if ( ( bExp | bSig ) == 0 ) goto invalid;
3810        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3811    }
3812    if ( bExp == 0x7FFF ) {
3813        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3814        if ( ( aExp | aSig ) == 0 ) {
3815 invalid:
3816            float_raise( float_flag_invalid );
3817            z.low = floatx80_default_nan_low;
3818            z.high = floatx80_default_nan_high;
3819            return z;
3820        }
3821        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3822    }
3823    if ( aExp == 0 ) {
3824        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3825        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3826    }
3827    if ( bExp == 0 ) {
3828        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3829        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3830    }
3831    zExp = aExp + bExp - 0x3FFE;
3832    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3833    if ( 0 < (sbits64) zSig0 ) {
3834        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3835        --zExp;
3836    }
3837    return
3838        roundAndPackFloatx80(
3839            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3840
3841}
3842
3843/*
3844-------------------------------------------------------------------------------
3845Returns the result of dividing the extended double-precision floating-point
3846value `a' by the corresponding value `b'.  The operation is performed
3847according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3848-------------------------------------------------------------------------------
3849*/
3850floatx80 floatx80_div( floatx80 a, floatx80 b )
3851{
3852    flag aSign, bSign, zSign;
3853    int32 aExp, bExp, zExp;
3854    bits64 aSig, bSig, zSig0, zSig1;
3855    bits64 rem0, rem1, rem2, term0, term1, term2;
3856    floatx80 z;
3857
3858    aSig = extractFloatx80Frac( a );
3859    aExp = extractFloatx80Exp( a );
3860    aSign = extractFloatx80Sign( a );
3861    bSig = extractFloatx80Frac( b );
3862    bExp = extractFloatx80Exp( b );
3863    bSign = extractFloatx80Sign( b );
3864    zSign = aSign ^ bSign;
3865    if ( aExp == 0x7FFF ) {
3866        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3867        if ( bExp == 0x7FFF ) {
3868            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3869            goto invalid;
3870        }
3871        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3872    }
3873    if ( bExp == 0x7FFF ) {
3874        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3875        return packFloatx80( zSign, 0, 0 );
3876    }
3877    if ( bExp == 0 ) {
3878        if ( bSig == 0 ) {
3879            if ( ( aExp | aSig ) == 0 ) {
3880 invalid:
3881                float_raise( float_flag_invalid );
3882                z.low = floatx80_default_nan_low;
3883                z.high = floatx80_default_nan_high;
3884                return z;
3885            }
3886            float_raise( float_flag_divbyzero );
3887            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3888        }
3889        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3890    }
3891    if ( aExp == 0 ) {
3892        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3893        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3894    }
3895    zExp = aExp - bExp + 0x3FFE;
3896    rem1 = 0;
3897    if ( bSig <= aSig ) {
3898        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3899        ++zExp;
3900    }
3901    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3902    mul64To128( bSig, zSig0, &term0, &term1 );
3903    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3904    while ( (sbits64) rem0 < 0 ) {
3905        --zSig0;
3906        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3907    }
3908    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3909    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3910        mul64To128( bSig, zSig1, &term1, &term2 );
3911        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3912        while ( (sbits64) rem1 < 0 ) {
3913            --zSig1;
3914            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3915        }
3916        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3917    }
3918    return
3919        roundAndPackFloatx80(
3920            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3921
3922}
3923
3924/*
3925-------------------------------------------------------------------------------
3926Returns the remainder of the extended double-precision floating-point value
3927`a' with respect to the corresponding value `b'.  The operation is performed
3928according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3929-------------------------------------------------------------------------------
3930*/
3931floatx80 floatx80_rem( floatx80 a, floatx80 b )
3932{
3933    flag aSign, bSign, zSign;
3934    int32 aExp, bExp, expDiff;
3935    bits64 aSig0, aSig1, bSig;
3936    bits64 q, term0, term1, alternateASig0, alternateASig1;
3937    floatx80 z;
3938
3939    aSig0 = extractFloatx80Frac( a );
3940    aExp = extractFloatx80Exp( a );
3941    aSign = extractFloatx80Sign( a );
3942    bSig = extractFloatx80Frac( b );
3943    bExp = extractFloatx80Exp( b );
3944    bSign = extractFloatx80Sign( b );
3945    if ( aExp == 0x7FFF ) {
3946        if (    (bits64) ( aSig0<<1 )
3947             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3948            return propagateFloatx80NaN( a, b );
3949        }
3950        goto invalid;
3951    }
3952    if ( bExp == 0x7FFF ) {
3953        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3954        return a;
3955    }
3956    if ( bExp == 0 ) {
3957        if ( bSig == 0 ) {
3958 invalid:
3959            float_raise( float_flag_invalid );
3960            z.low = floatx80_default_nan_low;
3961            z.high = floatx80_default_nan_high;
3962            return z;
3963        }
3964        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3965    }
3966    if ( aExp == 0 ) {
3967        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3968        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3969    }
3970    bSig |= LIT64( 0x8000000000000000 );
3971    zSign = aSign;
3972    expDiff = aExp - bExp;
3973    aSig1 = 0;
3974    if ( expDiff < 0 ) {
3975        if ( expDiff < -1 ) return a;
3976        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3977        expDiff = 0;
3978    }
3979    q = ( bSig <= aSig0 );
3980    if ( q ) aSig0 -= bSig;
3981    expDiff -= 64;
3982    while ( 0 < expDiff ) {
3983        q = estimateDiv128To64( aSig0, aSig1, bSig );
3984        q = ( 2 < q ) ? q - 2 : 0;
3985        mul64To128( bSig, q, &term0, &term1 );
3986        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3987        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3988        expDiff -= 62;
3989    }
3990    expDiff += 64;
3991    if ( 0 < expDiff ) {
3992        q = estimateDiv128To64( aSig0, aSig1, bSig );
3993        q = ( 2 < q ) ? q - 2 : 0;
3994        q >>= 64 - expDiff;
3995        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3996        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3997        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3998        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3999            ++q;
4000            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4001        }
4002    }
4003    else {
4004        term1 = 0;
4005        term0 = bSig;
4006    }
4007    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4008    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4009         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4010              && ( q & 1 ) )
4011       ) {
4012        aSig0 = alternateASig0;
4013        aSig1 = alternateASig1;
4014        zSign = ! zSign;
4015    }
4016    return
4017        normalizeRoundAndPackFloatx80(
4018            80, zSign, bExp + expDiff, aSig0, aSig1 );
4019
4020}
4021
4022/*
4023-------------------------------------------------------------------------------
4024Returns the square root of the extended double-precision floating-point
4025value `a'.  The operation is performed according to the IEC/IEEE Standard
4026for Binary Floating-Point Arithmetic.
4027-------------------------------------------------------------------------------
4028*/
4029floatx80 floatx80_sqrt( floatx80 a )
4030{
4031    flag aSign;
4032    int32 aExp, zExp;
4033    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4034    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4035    floatx80 z;
4036
4037    aSig0 = extractFloatx80Frac( a );
4038    aExp = extractFloatx80Exp( a );
4039    aSign = extractFloatx80Sign( a );
4040    if ( aExp == 0x7FFF ) {
4041        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4042        if ( ! aSign ) return a;
4043        goto invalid;
4044    }
4045    if ( aSign ) {
4046        if ( ( aExp | aSig0 ) == 0 ) return a;
4047 invalid:
4048        float_raise( float_flag_invalid );
4049        z.low = floatx80_default_nan_low;
4050        z.high = floatx80_default_nan_high;
4051        return z;
4052    }
4053    if ( aExp == 0 ) {
4054        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4055        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4056    }
4057    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4058    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4059    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4060    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4061    doubleZSig0 = zSig0<<1;
4062    mul64To128( zSig0, zSig0, &term0, &term1 );
4063    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4064    while ( (sbits64) rem0 < 0 ) {
4065        --zSig0;
4066        doubleZSig0 -= 2;
4067        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4068    }
4069    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4070    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4071        if ( zSig1 == 0 ) zSig1 = 1;
4072        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4073        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4074        mul64To128( zSig1, zSig1, &term2, &term3 );
4075        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4076        while ( (sbits64) rem1 < 0 ) {
4077            --zSig1;
4078            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4079            term3 |= 1;
4080            term2 |= doubleZSig0;
4081            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4082        }
4083        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4084    }
4085    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4086    zSig0 |= doubleZSig0;
4087    return
4088        roundAndPackFloatx80(
4089            floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4090
4091}
4092
4093/*
4094-------------------------------------------------------------------------------
4095Returns 1 if the extended double-precision floating-point value `a' is
4096equal to the corresponding value `b', and 0 otherwise.  The comparison is
4097performed according to the IEC/IEEE Standard for Binary Floating-Point
4098Arithmetic.
4099-------------------------------------------------------------------------------
4100*/
4101flag floatx80_eq( floatx80 a, floatx80 b )
4102{
4103
4104    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4105              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4106         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4107              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4108       ) {
4109        if (    floatx80_is_signaling_nan( a )
4110             || floatx80_is_signaling_nan( b ) ) {
4111            float_raise( float_flag_invalid );
4112        }
4113        return 0;
4114    }
4115    return
4116           ( a.low == b.low )
4117        && (    ( a.high == b.high )
4118             || (    ( a.low == 0 )
4119                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4120           );
4121
4122}
4123
4124/*
4125-------------------------------------------------------------------------------
4126Returns 1 if the extended double-precision floating-point value `a' is
4127less than or equal to the corresponding value `b', and 0 otherwise.  The
4128comparison is performed according to the IEC/IEEE Standard for Binary
4129Floating-Point Arithmetic.
4130-------------------------------------------------------------------------------
4131*/
4132flag floatx80_le( floatx80 a, floatx80 b )
4133{
4134    flag aSign, bSign;
4135
4136    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4137              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4138         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4139              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4140       ) {
4141        float_raise( float_flag_invalid );
4142        return 0;
4143    }
4144    aSign = extractFloatx80Sign( a );
4145    bSign = extractFloatx80Sign( b );
4146    if ( aSign != bSign ) {
4147        return
4148               aSign
4149            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4150                 == 0 );
4151    }
4152    return
4153          aSign ? le128( b.high, b.low, a.high, a.low )
4154        : le128( a.high, a.low, b.high, b.low );
4155
4156}
4157
4158/*
4159-------------------------------------------------------------------------------
4160Returns 1 if the extended double-precision floating-point value `a' is
4161less than the corresponding value `b', and 0 otherwise.  The comparison
4162is performed according to the IEC/IEEE Standard for Binary Floating-Point
4163Arithmetic.
4164-------------------------------------------------------------------------------
4165*/
4166flag floatx80_lt( floatx80 a, floatx80 b )
4167{
4168    flag aSign, bSign;
4169
4170    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4171              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4172         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4173              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4174       ) {
4175        float_raise( float_flag_invalid );
4176        return 0;
4177    }
4178    aSign = extractFloatx80Sign( a );
4179    bSign = extractFloatx80Sign( b );
4180    if ( aSign != bSign ) {
4181        return
4182               aSign
4183            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4184                 != 0 );
4185    }
4186    return
4187          aSign ? lt128( b.high, b.low, a.high, a.low )
4188        : lt128( a.high, a.low, b.high, b.low );
4189
4190}
4191
4192/*
4193-------------------------------------------------------------------------------
4194Returns 1 if the extended double-precision floating-point value `a' is equal
4195to the corresponding value `b', and 0 otherwise.  The invalid exception is
4196raised if either operand is a NaN.  Otherwise, the comparison is performed
4197according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4198-------------------------------------------------------------------------------
4199*/
4200flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4201{
4202
4203    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4204              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4205         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4206              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4207       ) {
4208        float_raise( float_flag_invalid );
4209        return 0;
4210    }
4211    return
4212           ( a.low == b.low )
4213        && (    ( a.high == b.high )
4214             || (    ( a.low == 0 )
4215                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4216           );
4217
4218}
4219
4220/*
4221-------------------------------------------------------------------------------
4222Returns 1 if the extended double-precision floating-point value `a' is less
4223than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4224do not cause an exception.  Otherwise, the comparison is performed according
4225to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4226-------------------------------------------------------------------------------
4227*/
4228flag floatx80_le_quiet( floatx80 a, floatx80 b )
4229{
4230    flag aSign, bSign;
4231
4232    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4233              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4234         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4235              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4236       ) {
4237        if (    floatx80_is_signaling_nan( a )
4238             || floatx80_is_signaling_nan( b ) ) {
4239            float_raise( float_flag_invalid );
4240        }
4241        return 0;
4242    }
4243    aSign = extractFloatx80Sign( a );
4244    bSign = extractFloatx80Sign( b );
4245    if ( aSign != bSign ) {
4246        return
4247               aSign
4248            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4249                 == 0 );
4250    }
4251    return
4252          aSign ? le128( b.high, b.low, a.high, a.low )
4253        : le128( a.high, a.low, b.high, b.low );
4254
4255}
4256
4257/*
4258-------------------------------------------------------------------------------
4259Returns 1 if the extended double-precision floating-point value `a' is less
4260than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4261an exception.  Otherwise, the comparison is performed according to the
4262IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4263-------------------------------------------------------------------------------
4264*/
4265flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4266{
4267    flag aSign, bSign;
4268
4269    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4270              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4271         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4272              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4273       ) {
4274        if (    floatx80_is_signaling_nan( a )
4275             || floatx80_is_signaling_nan( b ) ) {
4276            float_raise( float_flag_invalid );
4277        }
4278        return 0;
4279    }
4280    aSign = extractFloatx80Sign( a );
4281    bSign = extractFloatx80Sign( b );
4282    if ( aSign != bSign ) {
4283        return
4284               aSign
4285            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4286                 != 0 );
4287    }
4288    return
4289          aSign ? lt128( b.high, b.low, a.high, a.low )
4290        : lt128( a.high, a.low, b.high, b.low );
4291
4292}
4293
4294#endif
4295
4296#ifdef FLOAT128
4297
4298/*
4299-------------------------------------------------------------------------------
4300Returns the result of converting the quadruple-precision floating-point
4301value `a' to the 32-bit two's complement integer format.  The conversion
4302is performed according to the IEC/IEEE Standard for Binary Floating-Point
4303Arithmetic---which means in particular that the conversion is rounded
4304according to the current rounding mode.  If `a' is a NaN, the largest
4305positive integer is returned.  Otherwise, if the conversion overflows, the
4306largest integer with the same sign as `a' is returned.
4307-------------------------------------------------------------------------------
4308*/
4309int32 float128_to_int32( float128 a )
4310{
4311    flag aSign;
4312    int32 aExp, shiftCount;
4313    bits64 aSig0, aSig1;
4314
4315    aSig1 = extractFloat128Frac1( a );
4316    aSig0 = extractFloat128Frac0( a );
4317    aExp = extractFloat128Exp( a );
4318    aSign = extractFloat128Sign( a );
4319    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4320    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4321    aSig0 |= ( aSig1 != 0 );
4322    shiftCount = 0x4028 - aExp;
4323    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4324    return roundAndPackInt32( aSign, aSig0 );
4325
4326}
4327
4328/*
4329-------------------------------------------------------------------------------
4330Returns the result of converting the quadruple-precision floating-point
4331value `a' to the 32-bit two's complement integer format.  The conversion
4332is performed according to the IEC/IEEE Standard for Binary Floating-Point
4333Arithmetic, except that the conversion is always rounded toward zero.  If
4334`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4335conversion overflows, the largest integer with the same sign as `a' is
4336returned.
4337-------------------------------------------------------------------------------
4338*/
4339int32 float128_to_int32_round_to_zero( float128 a )
4340{
4341    flag aSign;
4342    int32 aExp, shiftCount;
4343    bits64 aSig0, aSig1, savedASig;
4344    int32 z;
4345
4346    aSig1 = extractFloat128Frac1( a );
4347    aSig0 = extractFloat128Frac0( a );
4348    aExp = extractFloat128Exp( a );
4349    aSign = extractFloat128Sign( a );
4350    aSig0 |= ( aSig1 != 0 );
4351    if ( 0x401E < aExp ) {
4352        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4353        goto invalid;
4354    }
4355    else if ( aExp < 0x3FFF ) {
4356        if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4357        return 0;
4358    }
4359    aSig0 |= LIT64( 0x0001000000000000 );
4360    shiftCount = 0x402F - aExp;
4361    savedASig = aSig0;
4362    aSig0 >>= shiftCount;
4363    z = aSig0;
4364    if ( aSign ) z = - z;
4365    if ( ( z < 0 ) ^ aSign ) {
4366 invalid:
4367        float_raise( float_flag_invalid );
4368        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4369    }
4370    if ( ( aSig0<<shiftCount ) != savedASig ) {
4371        float_exception_flags |= float_flag_inexact;
4372    }
4373    return z;
4374
4375}
4376
4377/*
4378-------------------------------------------------------------------------------
4379Returns the result of converting the quadruple-precision floating-point
4380value `a' to the 64-bit two's complement integer format.  The conversion
4381is performed according to the IEC/IEEE Standard for Binary Floating-Point
4382Arithmetic---which means in particular that the conversion is rounded
4383according to the current rounding mode.  If `a' is a NaN, the largest
4384positive integer is returned.  Otherwise, if the conversion overflows, the
4385largest integer with the same sign as `a' is returned.
4386-------------------------------------------------------------------------------
4387*/
4388int64 float128_to_int64( float128 a )
4389{
4390    flag aSign;
4391    int32 aExp, shiftCount;
4392    bits64 aSig0, aSig1;
4393
4394    aSig1 = extractFloat128Frac1( a );
4395    aSig0 = extractFloat128Frac0( a );
4396    aExp = extractFloat128Exp( a );
4397    aSign = extractFloat128Sign( a );
4398    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4399    shiftCount = 0x402F - aExp;
4400    if ( shiftCount <= 0 ) {
4401        if ( 0x403E < aExp ) {
4402            float_raise( float_flag_invalid );
4403            if (    ! aSign
4404                 || (    ( aExp == 0x7FFF )
4405                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4406                    )
4407               ) {
4408                return LIT64( 0x7FFFFFFFFFFFFFFF );
4409            }
4410            return (sbits64) LIT64( 0x8000000000000000 );
4411        }
4412        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4413    }
4414    else {
4415        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4416    }
4417    return roundAndPackInt64( aSign, aSig0, aSig1 );
4418
4419}
4420
4421/*
4422-------------------------------------------------------------------------------
4423Returns the result of converting the quadruple-precision floating-point
4424value `a' to the 64-bit two's complement integer format.  The conversion
4425is performed according to the IEC/IEEE Standard for Binary Floating-Point
4426Arithmetic, except that the conversion is always rounded toward zero.
4427If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4428the conversion overflows, the largest integer with the same sign as `a' is
4429returned.
4430-------------------------------------------------------------------------------
4431*/
4432int64 float128_to_int64_round_to_zero( float128 a )
4433{
4434    flag aSign;
4435    int32 aExp, shiftCount;
4436    bits64 aSig0, aSig1;
4437    int64 z;
4438
4439    aSig1 = extractFloat128Frac1( a );
4440    aSig0 = extractFloat128Frac0( a );
4441    aExp = extractFloat128Exp( a );
4442    aSign = extractFloat128Sign( a );
4443    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4444    shiftCount = aExp - 0x402F;
4445    if ( 0 < shiftCount ) {
4446        if ( 0x403E <= aExp ) {
4447            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4448            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4449                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4450                if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4451            }
4452            else {
4453                float_raise( float_flag_invalid );
4454                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4455                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4456                }
4457            }
4458            return (sbits64) LIT64( 0x8000000000000000 );
4459        }
4460        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4461        if ( (bits64) ( aSig1<<shiftCount ) ) {
4462            float_exception_flags |= float_flag_inexact;
4463        }
4464    }
4465    else {
4466        if ( aExp < 0x3FFF ) {
4467            if ( aExp | aSig0 | aSig1 ) {
4468                float_exception_flags |= float_flag_inexact;
4469            }
4470            return 0;
4471        }
4472        z = aSig0>>( - shiftCount );
4473        if (    aSig1
4474             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4475            float_exception_flags |= float_flag_inexact;
4476        }
4477    }
4478    if ( aSign ) z = - z;
4479    return z;
4480
4481}
4482
4483#if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4484    && defined(SOFTFLOAT_NEED_FIXUNS)
4485/*
4486 * just like above - but do not care for overflow of signed results
4487 */
4488uint64 float128_to_uint64_round_to_zero( float128 a )
4489{
4490    flag aSign;
4491    int32 aExp, shiftCount;
4492    bits64 aSig0, aSig1;
4493    uint64 z;
4494
4495    aSig1 = extractFloat128Frac1( a );
4496    aSig0 = extractFloat128Frac0( a );
4497    aExp = extractFloat128Exp( a );
4498    aSign = extractFloat128Sign( a );
4499    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4500    shiftCount = aExp - 0x402F;
4501    if ( 0 < shiftCount ) {
4502        if ( 0x403F <= aExp ) {
4503            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4504            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4505                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4506                if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4507            }
4508            else {
4509                float_raise( float_flag_invalid );
4510            }
4511            return LIT64( 0xFFFFFFFFFFFFFFFF );
4512        }
4513        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4514        if ( (bits64) ( aSig1<<shiftCount ) ) {
4515            float_exception_flags |= float_flag_inexact;
4516        }
4517    }
4518    else {
4519        if ( aExp < 0x3FFF ) {
4520            if ( aExp | aSig0 | aSig1 ) {
4521                float_exception_flags |= float_flag_inexact;
4522            }
4523            return 0;
4524        }
4525        z = aSig0>>( - shiftCount );
4526        if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4527            float_exception_flags |= float_flag_inexact;
4528        }
4529    }
4530    if ( aSign ) z = - z;
4531    return z;
4532
4533}
4534#endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4535
4536/*
4537-------------------------------------------------------------------------------
4538Returns the result of converting the quadruple-precision floating-point
4539value `a' to the single-precision floating-point format.  The conversion
4540is performed according to the IEC/IEEE Standard for Binary Floating-Point
4541Arithmetic.
4542-------------------------------------------------------------------------------
4543*/
4544float32 float128_to_float32( float128 a )
4545{
4546    flag aSign;
4547    int32 aExp;
4548    bits64 aSig0, aSig1;
4549    bits32 zSig;
4550
4551    aSig1 = extractFloat128Frac1( a );
4552    aSig0 = extractFloat128Frac0( a );
4553    aExp = extractFloat128Exp( a );
4554    aSign = extractFloat128Sign( a );
4555    if ( aExp == 0x7FFF ) {
4556        if ( aSig0 | aSig1 ) {
4557            return commonNaNToFloat32( float128ToCommonNaN( a ) );
4558        }
4559        return packFloat32( aSign, 0xFF, 0 );
4560    }
4561    aSig0 |= ( aSig1 != 0 );
4562    shift64RightJamming( aSig0, 18, &aSig0 );
4563    zSig = aSig0;
4564    if ( aExp || zSig ) {
4565        zSig |= 0x40000000;
4566        aExp -= 0x3F81;
4567    }
4568    return roundAndPackFloat32( aSign, aExp, zSig );
4569
4570}
4571
4572/*
4573-------------------------------------------------------------------------------
4574Returns the result of converting the quadruple-precision floating-point
4575value `a' to the double-precision floating-point format.  The conversion
4576is performed according to the IEC/IEEE Standard for Binary Floating-Point
4577Arithmetic.
4578-------------------------------------------------------------------------------
4579*/
4580float64 float128_to_float64( float128 a )
4581{
4582    flag aSign;
4583    int32 aExp;
4584    bits64 aSig0, aSig1;
4585
4586    aSig1 = extractFloat128Frac1( a );
4587    aSig0 = extractFloat128Frac0( a );
4588    aExp = extractFloat128Exp( a );
4589    aSign = extractFloat128Sign( a );
4590    if ( aExp == 0x7FFF ) {
4591        if ( aSig0 | aSig1 ) {
4592            return commonNaNToFloat64( float128ToCommonNaN( a ) );
4593        }
4594        return packFloat64( aSign, 0x7FF, 0 );
4595    }
4596    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4597    aSig0 |= ( aSig1 != 0 );
4598    if ( aExp || aSig0 ) {
4599        aSig0 |= LIT64( 0x4000000000000000 );
4600        aExp -= 0x3C01;
4601    }
4602    return roundAndPackFloat64( aSign, aExp, aSig0 );
4603
4604}
4605
4606#ifdef FLOATX80
4607
4608/*
4609-------------------------------------------------------------------------------
4610Returns the result of converting the quadruple-precision floating-point
4611value `a' to the extended double-precision floating-point format.  The
4612conversion is performed according to the IEC/IEEE Standard for Binary
4613Floating-Point Arithmetic.
4614-------------------------------------------------------------------------------
4615*/
4616floatx80 float128_to_floatx80( float128 a )
4617{
4618    flag aSign;
4619    int32 aExp;
4620    bits64 aSig0, aSig1;
4621
4622    aSig1 = extractFloat128Frac1( a );
4623    aSig0 = extractFloat128Frac0( a );
4624    aExp = extractFloat128Exp( a );
4625    aSign = extractFloat128Sign( a );
4626    if ( aExp == 0x7FFF ) {
4627        if ( aSig0 | aSig1 ) {
4628            return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4629        }
4630        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4631    }
4632    if ( aExp == 0 ) {
4633        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4634        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4635    }
4636    else {
4637        aSig0 |= LIT64( 0x0001000000000000 );
4638    }
4639    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4640    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4641
4642}
4643
4644#endif
4645
4646/*
4647-------------------------------------------------------------------------------
4648Rounds the quadruple-precision floating-point value `a' to an integer, and
4649returns the result as a quadruple-precision floating-point value.  The
4650operation is performed according to the IEC/IEEE Standard for Binary
4651Floating-Point Arithmetic.
4652-------------------------------------------------------------------------------
4653*/
4654float128 float128_round_to_int( float128 a )
4655{
4656    flag aSign;
4657    int32 aExp;
4658    bits64 lastBitMask, roundBitsMask;
4659    int8 roundingMode;
4660    float128 z;
4661
4662    aExp = extractFloat128Exp( a );
4663    if ( 0x402F <= aExp ) {
4664        if ( 0x406F <= aExp ) {
4665            if (    ( aExp == 0x7FFF )
4666                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4667               ) {
4668                return propagateFloat128NaN( a, a );
4669            }
4670            return a;
4671        }
4672        lastBitMask = 1;
4673        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4674        roundBitsMask = lastBitMask - 1;
4675        z = a;
4676        roundingMode = float_rounding_mode;
4677        if ( roundingMode == float_round_nearest_even ) {
4678            if ( lastBitMask ) {
4679                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4680                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4681            }
4682            else {
4683                if ( (sbits64) z.low < 0 ) {
4684                    ++z.high;
4685                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4686                }
4687            }
4688        }
4689        else if ( roundingMode != float_round_to_zero ) {
4690            if (   extractFloat128Sign( z )
4691                 ^ ( roundingMode == float_round_up ) ) {
4692                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4693            }
4694        }
4695        z.low &= ~ roundBitsMask;
4696    }
4697    else {
4698        if ( aExp < 0x3FFF ) {
4699            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4700            float_exception_flags |= float_flag_inexact;
4701            aSign = extractFloat128Sign( a );
4702            switch ( float_rounding_mode ) {
4703             case float_round_nearest_even:
4704                if (    ( aExp == 0x3FFE )
4705                     && (   extractFloat128Frac0( a )
4706                          | extractFloat128Frac1( a ) )
4707                   ) {
4708                    return packFloat128( aSign, 0x3FFF, 0, 0 );
4709                }
4710                break;
4711	     case float_round_to_zero:
4712		break;
4713             case float_round_down:
4714                return
4715                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4716                    : packFloat128( 0, 0, 0, 0 );
4717             case float_round_up:
4718                return
4719                      aSign ? packFloat128( 1, 0, 0, 0 )
4720                    : packFloat128( 0, 0x3FFF, 0, 0 );
4721            }
4722            return packFloat128( aSign, 0, 0, 0 );
4723        }
4724        lastBitMask = 1;
4725        lastBitMask <<= 0x402F - aExp;
4726        roundBitsMask = lastBitMask - 1;
4727        z.low = 0;
4728        z.high = a.high;
4729        roundingMode = float_rounding_mode;
4730        if ( roundingMode == float_round_nearest_even ) {
4731            z.high += lastBitMask>>1;
4732            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4733                z.high &= ~ lastBitMask;
4734            }
4735        }
4736        else if ( roundingMode != float_round_to_zero ) {
4737            if (   extractFloat128Sign( z )
4738                 ^ ( roundingMode == float_round_up ) ) {
4739                z.high |= ( a.low != 0 );
4740                z.high += roundBitsMask;
4741            }
4742        }
4743        z.high &= ~ roundBitsMask;
4744    }
4745    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4746        float_exception_flags |= float_flag_inexact;
4747    }
4748    return z;
4749
4750}
4751
4752/*
4753-------------------------------------------------------------------------------
4754Returns the result of adding the absolute values of the quadruple-precision
4755floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4756before being returned.  `zSign' is ignored if the result is a NaN.
4757The addition is performed according to the IEC/IEEE Standard for Binary
4758Floating-Point Arithmetic.
4759-------------------------------------------------------------------------------
4760*/
4761static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4762{
4763    int32 aExp, bExp, zExp;
4764    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4765    int32 expDiff;
4766
4767    aSig1 = extractFloat128Frac1( a );
4768    aSig0 = extractFloat128Frac0( a );
4769    aExp = extractFloat128Exp( a );
4770    bSig1 = extractFloat128Frac1( b );
4771    bSig0 = extractFloat128Frac0( b );
4772    bExp = extractFloat128Exp( b );
4773    expDiff = aExp - bExp;
4774    if ( 0 < expDiff ) {
4775        if ( aExp == 0x7FFF ) {
4776            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4777            return a;
4778        }
4779        if ( bExp == 0 ) {
4780            --expDiff;
4781        }
4782        else {
4783            bSig0 |= LIT64( 0x0001000000000000 );
4784        }
4785        shift128ExtraRightJamming(
4786            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4787        zExp = aExp;
4788    }
4789    else if ( expDiff < 0 ) {
4790        if ( bExp == 0x7FFF ) {
4791            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4792            return packFloat128( zSign, 0x7FFF, 0, 0 );
4793        }
4794        if ( aExp == 0 ) {
4795            ++expDiff;
4796        }
4797        else {
4798            aSig0 |= LIT64( 0x0001000000000000 );
4799        }
4800        shift128ExtraRightJamming(
4801            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4802        zExp = bExp;
4803    }
4804    else {
4805        if ( aExp == 0x7FFF ) {
4806            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4807                return propagateFloat128NaN( a, b );
4808            }
4809            return a;
4810        }
4811        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4812        if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4813        zSig2 = 0;
4814        zSig0 |= LIT64( 0x0002000000000000 );
4815        zExp = aExp;
4816        goto shiftRight1;
4817    }
4818    aSig0 |= LIT64( 0x0001000000000000 );
4819    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4820    --zExp;
4821    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4822    ++zExp;
4823 shiftRight1:
4824    shift128ExtraRightJamming(
4825        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4826 roundAndPack:
4827    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4828
4829}
4830
4831/*
4832-------------------------------------------------------------------------------
4833Returns the result of subtracting the absolute values of the quadruple-
4834precision floating-point values `a' and `b'.  If `zSign' is 1, the
4835difference is negated before being returned.  `zSign' is ignored if the
4836result is a NaN.  The subtraction is performed according to the IEC/IEEE
4837Standard for Binary Floating-Point Arithmetic.
4838-------------------------------------------------------------------------------
4839*/
4840static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4841{
4842    int32 aExp, bExp, zExp;
4843    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4844    int32 expDiff;
4845    float128 z;
4846
4847    aSig1 = extractFloat128Frac1( a );
4848    aSig0 = extractFloat128Frac0( a );
4849    aExp = extractFloat128Exp( a );
4850    bSig1 = extractFloat128Frac1( b );
4851    bSig0 = extractFloat128Frac0( b );
4852    bExp = extractFloat128Exp( b );
4853    expDiff = aExp - bExp;
4854    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4855    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4856    if ( 0 < expDiff ) goto aExpBigger;
4857    if ( expDiff < 0 ) goto bExpBigger;
4858    if ( aExp == 0x7FFF ) {
4859        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4860            return propagateFloat128NaN( a, b );
4861        }
4862        float_raise( float_flag_invalid );
4863        z.low = float128_default_nan_low;
4864        z.high = float128_default_nan_high;
4865        return z;
4866    }
4867    if ( aExp == 0 ) {
4868        aExp = 1;
4869        bExp = 1;
4870    }
4871    if ( bSig0 < aSig0 ) goto aBigger;
4872    if ( aSig0 < bSig0 ) goto bBigger;
4873    if ( bSig1 < aSig1 ) goto aBigger;
4874    if ( aSig1 < bSig1 ) goto bBigger;
4875    return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4876 bExpBigger:
4877    if ( bExp == 0x7FFF ) {
4878        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4879        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4880    }
4881    if ( aExp == 0 ) {
4882        ++expDiff;
4883    }
4884    else {
4885        aSig0 |= LIT64( 0x4000000000000000 );
4886    }
4887    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4888    bSig0 |= LIT64( 0x4000000000000000 );
4889 bBigger:
4890    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4891    zExp = bExp;
4892    zSign ^= 1;
4893    goto normalizeRoundAndPack;
4894 aExpBigger:
4895    if ( aExp == 0x7FFF ) {
4896        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4897        return a;
4898    }
4899    if ( bExp == 0 ) {
4900        --expDiff;
4901    }
4902    else {
4903        bSig0 |= LIT64( 0x4000000000000000 );
4904    }
4905    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4906    aSig0 |= LIT64( 0x4000000000000000 );
4907 aBigger:
4908    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4909    zExp = aExp;
4910 normalizeRoundAndPack:
4911    --zExp;
4912    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4913
4914}
4915
4916/*
4917-------------------------------------------------------------------------------
4918Returns the result of adding the quadruple-precision floating-point values
4919`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4920for Binary Floating-Point Arithmetic.
4921-------------------------------------------------------------------------------
4922*/
4923float128 float128_add( float128 a, float128 b )
4924{
4925    flag aSign, bSign;
4926
4927    aSign = extractFloat128Sign( a );
4928    bSign = extractFloat128Sign( b );
4929    if ( aSign == bSign ) {
4930        return addFloat128Sigs( a, b, aSign );
4931    }
4932    else {
4933        return subFloat128Sigs( a, b, aSign );
4934    }
4935
4936}
4937
4938/*
4939-------------------------------------------------------------------------------
4940Returns the result of subtracting the quadruple-precision floating-point
4941values `a' and `b'.  The operation is performed according to the IEC/IEEE
4942Standard for Binary Floating-Point Arithmetic.
4943-------------------------------------------------------------------------------
4944*/
4945float128 float128_sub( float128 a, float128 b )
4946{
4947    flag aSign, bSign;
4948
4949    aSign = extractFloat128Sign( a );
4950    bSign = extractFloat128Sign( b );
4951    if ( aSign == bSign ) {
4952        return subFloat128Sigs( a, b, aSign );
4953    }
4954    else {
4955        return addFloat128Sigs( a, b, aSign );
4956    }
4957
4958}
4959
4960/*
4961-------------------------------------------------------------------------------
4962Returns the result of multiplying the quadruple-precision floating-point
4963values `a' and `b'.  The operation is performed according to the IEC/IEEE
4964Standard for Binary Floating-Point Arithmetic.
4965-------------------------------------------------------------------------------
4966*/
4967float128 float128_mul( float128 a, float128 b )
4968{
4969    flag aSign, bSign, zSign;
4970    int32 aExp, bExp, zExp;
4971    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4972    float128 z;
4973
4974    aSig1 = extractFloat128Frac1( a );
4975    aSig0 = extractFloat128Frac0( a );
4976    aExp = extractFloat128Exp( a );
4977    aSign = extractFloat128Sign( a );
4978    bSig1 = extractFloat128Frac1( b );
4979    bSig0 = extractFloat128Frac0( b );
4980    bExp = extractFloat128Exp( b );
4981    bSign = extractFloat128Sign( b );
4982    zSign = aSign ^ bSign;
4983    if ( aExp == 0x7FFF ) {
4984        if (    ( aSig0 | aSig1 )
4985             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4986            return propagateFloat128NaN( a, b );
4987        }
4988        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4989        return packFloat128( zSign, 0x7FFF, 0, 0 );
4990    }
4991    if ( bExp == 0x7FFF ) {
4992        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4993        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4994 invalid:
4995            float_raise( float_flag_invalid );
4996            z.low = float128_default_nan_low;
4997            z.high = float128_default_nan_high;
4998            return z;
4999        }
5000        return packFloat128( zSign, 0x7FFF, 0, 0 );
5001    }
5002    if ( aExp == 0 ) {
5003        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5004        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5005    }
5006    if ( bExp == 0 ) {
5007        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5008        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5009    }
5010    zExp = aExp + bExp - 0x4000;
5011    aSig0 |= LIT64( 0x0001000000000000 );
5012    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5013    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5014    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5015    zSig2 |= ( zSig3 != 0 );
5016    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5017        shift128ExtraRightJamming(
5018            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5019        ++zExp;
5020    }
5021    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5022
5023}
5024
5025/*
5026-------------------------------------------------------------------------------
5027Returns the result of dividing the quadruple-precision floating-point value
5028`a' by the corresponding value `b'.  The operation is performed according to
5029the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5030-------------------------------------------------------------------------------
5031*/
5032float128 float128_div( float128 a, float128 b )
5033{
5034    flag aSign, bSign, zSign;
5035    int32 aExp, bExp, zExp;
5036    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5037    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5038    float128 z;
5039
5040    aSig1 = extractFloat128Frac1( a );
5041    aSig0 = extractFloat128Frac0( a );
5042    aExp = extractFloat128Exp( a );
5043    aSign = extractFloat128Sign( a );
5044    bSig1 = extractFloat128Frac1( b );
5045    bSig0 = extractFloat128Frac0( b );
5046    bExp = extractFloat128Exp( b );
5047    bSign = extractFloat128Sign( b );
5048    zSign = aSign ^ bSign;
5049    if ( aExp == 0x7FFF ) {
5050        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5051        if ( bExp == 0x7FFF ) {
5052            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5053            goto invalid;
5054        }
5055        return packFloat128( zSign, 0x7FFF, 0, 0 );
5056    }
5057    if ( bExp == 0x7FFF ) {
5058        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5059        return packFloat128( zSign, 0, 0, 0 );
5060    }
5061    if ( bExp == 0 ) {
5062        if ( ( bSig0 | bSig1 ) == 0 ) {
5063            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5064 invalid:
5065                float_raise( float_flag_invalid );
5066                z.low = float128_default_nan_low;
5067                z.high = float128_default_nan_high;
5068                return z;
5069            }
5070            float_raise( float_flag_divbyzero );
5071            return packFloat128( zSign, 0x7FFF, 0, 0 );
5072        }
5073        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5074    }
5075    if ( aExp == 0 ) {
5076        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5077        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5078    }
5079    zExp = aExp - bExp + 0x3FFD;
5080    shortShift128Left(
5081        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5082    shortShift128Left(
5083        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5084    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5085        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5086        ++zExp;
5087    }
5088    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5089    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5090    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5091    while ( (sbits64) rem0 < 0 ) {
5092        --zSig0;
5093        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5094    }
5095    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5096    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5097        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5098        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5099        while ( (sbits64) rem1 < 0 ) {
5100            --zSig1;
5101            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5102        }
5103        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5104    }
5105    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5106    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5107
5108}
5109
5110/*
5111-------------------------------------------------------------------------------
5112Returns the remainder of the quadruple-precision floating-point value `a'
5113with respect to the corresponding value `b'.  The operation is performed
5114according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5115-------------------------------------------------------------------------------
5116*/
5117float128 float128_rem( float128 a, float128 b )
5118{
5119    flag aSign, bSign, zSign;
5120    int32 aExp, bExp, expDiff;
5121    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5122    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5123    sbits64 sigMean0;
5124    float128 z;
5125
5126    aSig1 = extractFloat128Frac1( a );
5127    aSig0 = extractFloat128Frac0( a );
5128    aExp = extractFloat128Exp( a );
5129    aSign = extractFloat128Sign( a );
5130    bSig1 = extractFloat128Frac1( b );
5131    bSig0 = extractFloat128Frac0( b );
5132    bExp = extractFloat128Exp( b );
5133    bSign = extractFloat128Sign( b );
5134    if ( aExp == 0x7FFF ) {
5135        if (    ( aSig0 | aSig1 )
5136             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5137            return propagateFloat128NaN( a, b );
5138        }
5139        goto invalid;
5140    }
5141    if ( bExp == 0x7FFF ) {
5142        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5143        return a;
5144    }
5145    if ( bExp == 0 ) {
5146        if ( ( bSig0 | bSig1 ) == 0 ) {
5147 invalid:
5148            float_raise( float_flag_invalid );
5149            z.low = float128_default_nan_low;
5150            z.high = float128_default_nan_high;
5151            return z;
5152        }
5153        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5154    }
5155    if ( aExp == 0 ) {
5156        if ( ( aSig0 | aSig1 ) == 0 ) return a;
5157        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5158    }
5159    expDiff = aExp - bExp;
5160    if ( expDiff < -1 ) return a;
5161    shortShift128Left(
5162        aSig0 | LIT64( 0x0001000000000000 ),
5163        aSig1,
5164        15 - ( expDiff < 0 ),
5165        &aSig0,
5166        &aSig1
5167    );
5168    shortShift128Left(
5169        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5170    q = le128( bSig0, bSig1, aSig0, aSig1 );
5171    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5172    expDiff -= 64;
5173    while ( 0 < expDiff ) {
5174        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5175        q = ( 4 < q ) ? q - 4 : 0;
5176        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5177        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5178        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5179        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5180        expDiff -= 61;
5181    }
5182    if ( -64 < expDiff ) {
5183        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5184        q = ( 4 < q ) ? q - 4 : 0;
5185        q >>= - expDiff;
5186        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5187        expDiff += 52;
5188        if ( expDiff < 0 ) {
5189            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5190        }
5191        else {
5192            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5193        }
5194        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5195        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5196    }
5197    else {
5198        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5199        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5200    }
5201    do {
5202        alternateASig0 = aSig0;
5203        alternateASig1 = aSig1;
5204        ++q;
5205        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5206    } while ( 0 <= (sbits64) aSig0 );
5207    add128(
5208        aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5209    if (    ( sigMean0 < 0 )
5210         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5211        aSig0 = alternateASig0;
5212        aSig1 = alternateASig1;
5213    }
5214    zSign = ( (sbits64) aSig0 < 0 );
5215    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5216    return
5217        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5218
5219}
5220
5221/*
5222-------------------------------------------------------------------------------
5223Returns the square root of the quadruple-precision floating-point value `a'.
5224The operation is performed according to the IEC/IEEE Standard for Binary
5225Floating-Point Arithmetic.
5226-------------------------------------------------------------------------------
5227*/
5228float128 float128_sqrt( float128 a )
5229{
5230    flag aSign;
5231    int32 aExp, zExp;
5232    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5233    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5234    float128 z;
5235
5236    aSig1 = extractFloat128Frac1( a );
5237    aSig0 = extractFloat128Frac0( a );
5238    aExp = extractFloat128Exp( a );
5239    aSign = extractFloat128Sign( a );
5240    if ( aExp == 0x7FFF ) {
5241        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5242        if ( ! aSign ) return a;
5243        goto invalid;
5244    }
5245    if ( aSign ) {
5246        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5247 invalid:
5248        float_raise( float_flag_invalid );
5249        z.low = float128_default_nan_low;
5250        z.high = float128_default_nan_high;
5251        return z;
5252    }
5253    if ( aExp == 0 ) {
5254        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5255        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5256    }
5257    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5258    aSig0 |= LIT64( 0x0001000000000000 );
5259    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5260    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5261    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5262    doubleZSig0 = zSig0<<1;
5263    mul64To128( zSig0, zSig0, &term0, &term1 );
5264    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5265    while ( (sbits64) rem0 < 0 ) {
5266        --zSig0;
5267        doubleZSig0 -= 2;
5268        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5269    }
5270    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5271    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5272        if ( zSig1 == 0 ) zSig1 = 1;
5273        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5274        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5275        mul64To128( zSig1, zSig1, &term2, &term3 );
5276        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5277        while ( (sbits64) rem1 < 0 ) {
5278            --zSig1;
5279            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5280            term3 |= 1;
5281            term2 |= doubleZSig0;
5282            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5283        }
5284        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5285    }
5286    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5287    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5288
5289}
5290
5291/*
5292-------------------------------------------------------------------------------
5293Returns 1 if the quadruple-precision floating-point value `a' is equal to
5294the corresponding value `b', and 0 otherwise.  The comparison is performed
5295according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5296-------------------------------------------------------------------------------
5297*/
5298flag float128_eq( float128 a, float128 b )
5299{
5300
5301    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5302              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5303         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5304              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5305       ) {
5306        if (    float128_is_signaling_nan( a )
5307             || float128_is_signaling_nan( b ) ) {
5308            float_raise( float_flag_invalid );
5309        }
5310        return 0;
5311    }
5312    return
5313           ( a.low == b.low )
5314        && (    ( a.high == b.high )
5315             || (    ( a.low == 0 )
5316                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5317           );
5318
5319}
5320
5321/*
5322-------------------------------------------------------------------------------
5323Returns 1 if the quadruple-precision floating-point value `a' is less than
5324or equal to the corresponding value `b', and 0 otherwise.  The comparison
5325is performed according to the IEC/IEEE Standard for Binary Floating-Point
5326Arithmetic.
5327-------------------------------------------------------------------------------
5328*/
5329flag float128_le( float128 a, float128 b )
5330{
5331    flag aSign, bSign;
5332
5333    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5334              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5335         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5336              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5337       ) {
5338        float_raise( float_flag_invalid );
5339        return 0;
5340    }
5341    aSign = extractFloat128Sign( a );
5342    bSign = extractFloat128Sign( b );
5343    if ( aSign != bSign ) {
5344        return
5345               aSign
5346            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5347                 == 0 );
5348    }
5349    return
5350          aSign ? le128( b.high, b.low, a.high, a.low )
5351        : le128( a.high, a.low, b.high, b.low );
5352
5353}
5354
5355/*
5356-------------------------------------------------------------------------------
5357Returns 1 if the quadruple-precision floating-point value `a' is less than
5358the corresponding value `b', and 0 otherwise.  The comparison is performed
5359according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5360-------------------------------------------------------------------------------
5361*/
5362flag float128_lt( float128 a, float128 b )
5363{
5364    flag aSign, bSign;
5365
5366    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5367              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5368         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5369              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5370       ) {
5371        float_raise( float_flag_invalid );
5372        return 0;
5373    }
5374    aSign = extractFloat128Sign( a );
5375    bSign = extractFloat128Sign( b );
5376    if ( aSign != bSign ) {
5377        return
5378               aSign
5379            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5380                 != 0 );
5381    }
5382    return
5383          aSign ? lt128( b.high, b.low, a.high, a.low )
5384        : lt128( a.high, a.low, b.high, b.low );
5385
5386}
5387
5388/*
5389-------------------------------------------------------------------------------
5390Returns 1 if the quadruple-precision floating-point value `a' is equal to
5391the corresponding value `b', and 0 otherwise.  The invalid exception is
5392raised if either operand is a NaN.  Otherwise, the comparison is performed
5393according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5394-------------------------------------------------------------------------------
5395*/
5396flag float128_eq_signaling( float128 a, float128 b )
5397{
5398
5399    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5400              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5401         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5402              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5403       ) {
5404        float_raise( float_flag_invalid );
5405        return 0;
5406    }
5407    return
5408           ( a.low == b.low )
5409        && (    ( a.high == b.high )
5410             || (    ( a.low == 0 )
5411                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5412           );
5413
5414}
5415
5416/*
5417-------------------------------------------------------------------------------
5418Returns 1 if the quadruple-precision floating-point value `a' is less than
5419or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5420cause an exception.  Otherwise, the comparison is performed according to the
5421IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5422-------------------------------------------------------------------------------
5423*/
5424flag float128_le_quiet( float128 a, float128 b )
5425{
5426    flag aSign, bSign;
5427
5428    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5429              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5430         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5431              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5432       ) {
5433        if (    float128_is_signaling_nan( a )
5434             || float128_is_signaling_nan( b ) ) {
5435            float_raise( float_flag_invalid );
5436        }
5437        return 0;
5438    }
5439    aSign = extractFloat128Sign( a );
5440    bSign = extractFloat128Sign( b );
5441    if ( aSign != bSign ) {
5442        return
5443               aSign
5444            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5445                 == 0 );
5446    }
5447    return
5448          aSign ? le128( b.high, b.low, a.high, a.low )
5449        : le128( a.high, a.low, b.high, b.low );
5450
5451}
5452
5453/*
5454-------------------------------------------------------------------------------
5455Returns 1 if the quadruple-precision floating-point value `a' is less than
5456the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5457exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5458Standard for Binary Floating-Point Arithmetic.
5459-------------------------------------------------------------------------------
5460*/
5461flag float128_lt_quiet( float128 a, float128 b )
5462{
5463    flag aSign, bSign;
5464
5465    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5466              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5467         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5468              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5469       ) {
5470        if (    float128_is_signaling_nan( a )
5471             || float128_is_signaling_nan( b ) ) {
5472            float_raise( float_flag_invalid );
5473        }
5474        return 0;
5475    }
5476    aSign = extractFloat128Sign( a );
5477    bSign = extractFloat128Sign( b );
5478    if ( aSign != bSign ) {
5479        return
5480               aSign
5481            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5482                 != 0 );
5483    }
5484    return
5485          aSign ? lt128( b.high, b.low, a.high, a.low )
5486        : lt128( a.high, a.low, b.high, b.low );
5487
5488}
5489
5490#endif
5491
5492
5493#if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5494
5495/*
5496 * These two routines are not part of the original softfloat distribution.
5497 *
5498 * They are based on the corresponding conversions to integer but return
5499 * unsigned numbers instead since these functions are required by GCC.
5500 *
5501 * Added by Mark Brinicombe <mark@NetBSD.org>	27/09/97
5502 *
5503 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5504 */
5505
5506/*
5507-------------------------------------------------------------------------------
5508Returns the result of converting the double-precision floating-point value
5509`a' to the 32-bit unsigned integer format.  The conversion is
5510performed according to the IEC/IEEE Standard for Binary Floating-point
5511Arithmetic, except that the conversion is always rounded toward zero.  If
5512`a' is a NaN, the largest positive integer is returned.  If the conversion
5513overflows, the largest integer positive is returned.
5514-------------------------------------------------------------------------------
5515*/
5516uint32 float64_to_uint32_round_to_zero( float64 a )
5517{
5518    flag aSign;
5519    int16 aExp, shiftCount;
5520    bits64 aSig, savedASig;
5521    uint32 z;
5522
5523    aSig = extractFloat64Frac( a );
5524    aExp = extractFloat64Exp( a );
5525    aSign = extractFloat64Sign( a );
5526
5527    if (aSign) {
5528        float_raise( float_flag_invalid );
5529    	return(0);
5530    }
5531
5532    if ( 0x41E < aExp ) {
5533        float_raise( float_flag_invalid );
5534        return 0xffffffff;
5535    }
5536    else if ( aExp < 0x3FF ) {
5537        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5538        return 0;
5539    }
5540    aSig |= LIT64( 0x0010000000000000 );
5541    shiftCount = 0x433 - aExp;
5542    savedASig = aSig;
5543    aSig >>= shiftCount;
5544    z = aSig;
5545    if ( ( aSig<<shiftCount ) != savedASig ) {
5546        float_exception_flags |= float_flag_inexact;
5547    }
5548    return z;
5549
5550}
5551
5552/*
5553-------------------------------------------------------------------------------
5554Returns the result of converting the single-precision floating-point value
5555`a' to the 32-bit unsigned integer format.  The conversion is
5556performed according to the IEC/IEEE Standard for Binary Floating-point
5557Arithmetic, except that the conversion is always rounded toward zero.  If
5558`a' is a NaN, the largest positive integer is returned.  If the conversion
5559overflows, the largest positive integer is returned.
5560-------------------------------------------------------------------------------
5561*/
5562uint32 float32_to_uint32_round_to_zero( float32 a )
5563{
5564    flag aSign;
5565    int16 aExp, shiftCount;
5566    bits32 aSig;
5567    uint32 z;
5568
5569    aSig = extractFloat32Frac( a );
5570    aExp = extractFloat32Exp( a );
5571    aSign = extractFloat32Sign( a );
5572    shiftCount = aExp - 0x9E;
5573
5574    if (aSign) {
5575        float_raise( float_flag_invalid );
5576    	return(0);
5577    }
5578    if ( 0 < shiftCount ) {
5579        float_raise( float_flag_invalid );
5580        return 0xFFFFFFFF;
5581    }
5582    else if ( aExp <= 0x7E ) {
5583        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5584        return 0;
5585    }
5586    aSig = ( aSig | 0x800000 )<<8;
5587    z = aSig>>( - shiftCount );
5588    if ( aSig<<( shiftCount & 31 ) ) {
5589        float_exception_flags |= float_flag_inexact;
5590    }
5591    return z;
5592
5593}
5594
5595#endif
5596