softfloat.c revision 230363
1/* $NetBSD: softfloat.c,v 1.8 2011/07/10 04:52:23 matt Exp $ */ 2 3/* 4 * This version hacked for use with gcc -msoft-float by bjh21. 5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 6 * itself). 7 */ 8 9/* 10 * Things you may want to define: 11 * 12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 14 * properly renamed. 15 */ 16 17/* 18=============================================================================== 19 20This C source file is part of the SoftFloat IEC/IEEE Floating-point 21Arithmetic Package, Release 2a. 22 23Written by John R. Hauser. This work was made possible in part by the 24International Computer Science Institute, located at Suite 600, 1947 Center 25Street, Berkeley, California 94704. Funding was partially provided by the 26National Science Foundation under grant MIP-9311980. The original version 27of this code was written as part of a project to build a fixed-point vector 28processor in collaboration with the University of California at Berkeley, 29overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31arithmetic/SoftFloat.html'. 32 33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39Derivative works are acceptable, even for commercial purposes, so long as 40(1) they include prominent notice that the work is derivative, and (2) they 41include prominent notice akin to these four paragraphs for those parts of 42this code that are retained. 43 44=============================================================================== 45*/ 46 47#include <sys/cdefs.h> 48__FBSDID("$FreeBSD: head/lib/libc/softfloat/bits64/softfloat.c 230363 2012-01-20 06:16:14Z das $"); 49 50#ifdef SOFTFLOAT_FOR_GCC 51#include "softfloat-for-gcc.h" 52#endif 53 54#include "milieu.h" 55#include "softfloat.h" 56 57/* 58 * Conversions between floats as stored in memory and floats as 59 * SoftFloat uses them 60 */ 61#ifndef FLOAT64_DEMANGLE 62#define FLOAT64_DEMANGLE(a) (a) 63#endif 64#ifndef FLOAT64_MANGLE 65#define FLOAT64_MANGLE(a) (a) 66#endif 67 68/* 69------------------------------------------------------------------------------- 70Floating-point rounding mode, extended double-precision rounding precision, 71and exception flags. 72------------------------------------------------------------------------------- 73*/ 74int float_rounding_mode = float_round_nearest_even; 75int float_exception_flags = 0; 76#ifdef FLOATX80 77int8 floatx80_rounding_precision = 80; 78#endif 79 80/* 81------------------------------------------------------------------------------- 82Primitive arithmetic functions, including multi-word arithmetic, and 83division and square root approximations. (Can be specialized to target if 84desired.) 85------------------------------------------------------------------------------- 86*/ 87#include "softfloat-macros" 88 89/* 90------------------------------------------------------------------------------- 91Functions and definitions to determine: (1) whether tininess for underflow 92is detected before or after rounding by default, (2) what (if anything) 93happens when exceptions are raised, (3) how signaling NaNs are distinguished 94from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 95are propagated from function inputs to output. These details are target- 96specific. 97------------------------------------------------------------------------------- 98*/ 99#include "softfloat-specialize" 100 101#if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128) 102/* 103------------------------------------------------------------------------------- 104Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 105and 7, and returns the properly rounded 32-bit integer corresponding to the 106input. If `zSign' is 1, the input is negated before being converted to an 107integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 108is simply rounded to an integer, with the inexact exception raised if the 109input cannot be represented exactly as an integer. However, if the fixed- 110point input is too large, the invalid exception is raised and the largest 111positive or negative integer is returned. 112------------------------------------------------------------------------------- 113*/ 114static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 115{ 116 int8 roundingMode; 117 flag roundNearestEven; 118 int8 roundIncrement, roundBits; 119 int32 z; 120 121 roundingMode = float_rounding_mode; 122 roundNearestEven = ( roundingMode == float_round_nearest_even ); 123 roundIncrement = 0x40; 124 if ( ! roundNearestEven ) { 125 if ( roundingMode == float_round_to_zero ) { 126 roundIncrement = 0; 127 } 128 else { 129 roundIncrement = 0x7F; 130 if ( zSign ) { 131 if ( roundingMode == float_round_up ) roundIncrement = 0; 132 } 133 else { 134 if ( roundingMode == float_round_down ) roundIncrement = 0; 135 } 136 } 137 } 138 roundBits = absZ & 0x7F; 139 absZ = ( absZ + roundIncrement )>>7; 140 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 141 z = absZ; 142 if ( zSign ) z = - z; 143 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 144 float_raise( float_flag_invalid ); 145 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 146 } 147 if ( roundBits ) float_exception_flags |= float_flag_inexact; 148 return z; 149 150} 151 152/* 153------------------------------------------------------------------------------- 154Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 155`absZ1', with binary point between bits 63 and 64 (between the input words), 156and returns the properly rounded 64-bit integer corresponding to the input. 157If `zSign' is 1, the input is negated before being converted to an integer. 158Ordinarily, the fixed-point input is simply rounded to an integer, with 159the inexact exception raised if the input cannot be represented exactly as 160an integer. However, if the fixed-point input is too large, the invalid 161exception is raised and the largest positive or negative integer is 162returned. 163------------------------------------------------------------------------------- 164*/ 165static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 166{ 167 int8 roundingMode; 168 flag roundNearestEven, increment; 169 int64 z; 170 171 roundingMode = float_rounding_mode; 172 roundNearestEven = ( roundingMode == float_round_nearest_even ); 173 increment = ( (sbits64) absZ1 < 0 ); 174 if ( ! roundNearestEven ) { 175 if ( roundingMode == float_round_to_zero ) { 176 increment = 0; 177 } 178 else { 179 if ( zSign ) { 180 increment = ( roundingMode == float_round_down ) && absZ1; 181 } 182 else { 183 increment = ( roundingMode == float_round_up ) && absZ1; 184 } 185 } 186 } 187 if ( increment ) { 188 ++absZ0; 189 if ( absZ0 == 0 ) goto overflow; 190 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 191 } 192 z = absZ0; 193 if ( zSign ) z = - z; 194 if ( z && ( ( z < 0 ) ^ zSign ) ) { 195 overflow: 196 float_raise( float_flag_invalid ); 197 return 198 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 199 : LIT64( 0x7FFFFFFFFFFFFFFF ); 200 } 201 if ( absZ1 ) float_exception_flags |= float_flag_inexact; 202 return z; 203 204} 205#endif 206 207/* 208------------------------------------------------------------------------------- 209Returns the fraction bits of the single-precision floating-point value `a'. 210------------------------------------------------------------------------------- 211*/ 212INLINE bits32 extractFloat32Frac( float32 a ) 213{ 214 215 return a & 0x007FFFFF; 216 217} 218 219/* 220------------------------------------------------------------------------------- 221Returns the exponent bits of the single-precision floating-point value `a'. 222------------------------------------------------------------------------------- 223*/ 224INLINE int16 extractFloat32Exp( float32 a ) 225{ 226 227 return ( a>>23 ) & 0xFF; 228 229} 230 231/* 232------------------------------------------------------------------------------- 233Returns the sign bit of the single-precision floating-point value `a'. 234------------------------------------------------------------------------------- 235*/ 236INLINE flag extractFloat32Sign( float32 a ) 237{ 238 239 return a>>31; 240 241} 242 243/* 244------------------------------------------------------------------------------- 245Normalizes the subnormal single-precision floating-point value represented 246by the denormalized significand `aSig'. The normalized exponent and 247significand are stored at the locations pointed to by `zExpPtr' and 248`zSigPtr', respectively. 249------------------------------------------------------------------------------- 250*/ 251static void 252 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 253{ 254 int8 shiftCount; 255 256 shiftCount = countLeadingZeros32( aSig ) - 8; 257 *zSigPtr = aSig<<shiftCount; 258 *zExpPtr = 1 - shiftCount; 259 260} 261 262/* 263------------------------------------------------------------------------------- 264Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 265single-precision floating-point value, returning the result. After being 266shifted into the proper positions, the three fields are simply added 267together to form the result. This means that any integer portion of `zSig' 268will be added into the exponent. Since a properly normalized significand 269will have an integer portion equal to 1, the `zExp' input should be 1 less 270than the desired result exponent whenever `zSig' is a complete, normalized 271significand. 272------------------------------------------------------------------------------- 273*/ 274INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 275{ 276 277 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 278 279} 280 281/* 282------------------------------------------------------------------------------- 283Takes an abstract floating-point value having sign `zSign', exponent `zExp', 284and significand `zSig', and returns the proper single-precision floating- 285point value corresponding to the abstract input. Ordinarily, the abstract 286value is simply rounded and packed into the single-precision format, with 287the inexact exception raised if the abstract input cannot be represented 288exactly. However, if the abstract value is too large, the overflow and 289inexact exceptions are raised and an infinity or maximal finite value is 290returned. If the abstract value is too small, the input value is rounded to 291a subnormal number, and the underflow and inexact exceptions are raised if 292the abstract input cannot be represented exactly as a subnormal single- 293precision floating-point number. 294 The input significand `zSig' has its binary point between bits 30 295and 29, which is 7 bits to the left of the usual location. This shifted 296significand must be normalized or smaller. If `zSig' is not normalized, 297`zExp' must be 0; in that case, the result returned is a subnormal number, 298and it must not require rounding. In the usual case that `zSig' is 299normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 300The handling of underflow and overflow follows the IEC/IEEE Standard for 301Binary Floating-Point Arithmetic. 302------------------------------------------------------------------------------- 303*/ 304static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 305{ 306 int8 roundingMode; 307 flag roundNearestEven; 308 int8 roundIncrement, roundBits; 309 flag isTiny; 310 311 roundingMode = float_rounding_mode; 312 roundNearestEven = ( roundingMode == float_round_nearest_even ); 313 roundIncrement = 0x40; 314 if ( ! roundNearestEven ) { 315 if ( roundingMode == float_round_to_zero ) { 316 roundIncrement = 0; 317 } 318 else { 319 roundIncrement = 0x7F; 320 if ( zSign ) { 321 if ( roundingMode == float_round_up ) roundIncrement = 0; 322 } 323 else { 324 if ( roundingMode == float_round_down ) roundIncrement = 0; 325 } 326 } 327 } 328 roundBits = zSig & 0x7F; 329 if ( 0xFD <= (bits16) zExp ) { 330 if ( ( 0xFD < zExp ) 331 || ( ( zExp == 0xFD ) 332 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 333 ) { 334 float_raise( float_flag_overflow | float_flag_inexact ); 335 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 336 } 337 if ( zExp < 0 ) { 338 isTiny = 339 ( float_detect_tininess == float_tininess_before_rounding ) 340 || ( zExp < -1 ) 341 || ( zSig + roundIncrement < 0x80000000 ); 342 shift32RightJamming( zSig, - zExp, &zSig ); 343 zExp = 0; 344 roundBits = zSig & 0x7F; 345 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 346 } 347 } 348 if ( roundBits ) float_exception_flags |= float_flag_inexact; 349 zSig = ( zSig + roundIncrement )>>7; 350 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 351 if ( zSig == 0 ) zExp = 0; 352 return packFloat32( zSign, zExp, zSig ); 353 354} 355 356/* 357------------------------------------------------------------------------------- 358Takes an abstract floating-point value having sign `zSign', exponent `zExp', 359and significand `zSig', and returns the proper single-precision floating- 360point value corresponding to the abstract input. This routine is just like 361`roundAndPackFloat32' except that `zSig' does not have to be normalized. 362Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 363floating-point exponent. 364------------------------------------------------------------------------------- 365*/ 366static float32 367 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 368{ 369 int8 shiftCount; 370 371 shiftCount = countLeadingZeros32( zSig ) - 1; 372 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 373 374} 375 376/* 377------------------------------------------------------------------------------- 378Returns the fraction bits of the double-precision floating-point value `a'. 379------------------------------------------------------------------------------- 380*/ 381INLINE bits64 extractFloat64Frac( float64 a ) 382{ 383 384 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 385 386} 387 388/* 389------------------------------------------------------------------------------- 390Returns the exponent bits of the double-precision floating-point value `a'. 391------------------------------------------------------------------------------- 392*/ 393INLINE int16 extractFloat64Exp( float64 a ) 394{ 395 396 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 397 398} 399 400/* 401------------------------------------------------------------------------------- 402Returns the sign bit of the double-precision floating-point value `a'. 403------------------------------------------------------------------------------- 404*/ 405INLINE flag extractFloat64Sign( float64 a ) 406{ 407 408 return FLOAT64_DEMANGLE(a)>>63; 409 410} 411 412/* 413------------------------------------------------------------------------------- 414Normalizes the subnormal double-precision floating-point value represented 415by the denormalized significand `aSig'. The normalized exponent and 416significand are stored at the locations pointed to by `zExpPtr' and 417`zSigPtr', respectively. 418------------------------------------------------------------------------------- 419*/ 420static void 421 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 422{ 423 int8 shiftCount; 424 425 shiftCount = countLeadingZeros64( aSig ) - 11; 426 *zSigPtr = aSig<<shiftCount; 427 *zExpPtr = 1 - shiftCount; 428 429} 430 431/* 432------------------------------------------------------------------------------- 433Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 434double-precision floating-point value, returning the result. After being 435shifted into the proper positions, the three fields are simply added 436together to form the result. This means that any integer portion of `zSig' 437will be added into the exponent. Since a properly normalized significand 438will have an integer portion equal to 1, the `zExp' input should be 1 less 439than the desired result exponent whenever `zSig' is a complete, normalized 440significand. 441------------------------------------------------------------------------------- 442*/ 443INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 444{ 445 446 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 447 ( ( (bits64) zExp )<<52 ) + zSig ); 448 449} 450 451/* 452------------------------------------------------------------------------------- 453Takes an abstract floating-point value having sign `zSign', exponent `zExp', 454and significand `zSig', and returns the proper double-precision floating- 455point value corresponding to the abstract input. Ordinarily, the abstract 456value is simply rounded and packed into the double-precision format, with 457the inexact exception raised if the abstract input cannot be represented 458exactly. However, if the abstract value is too large, the overflow and 459inexact exceptions are raised and an infinity or maximal finite value is 460returned. If the abstract value is too small, the input value is rounded to 461a subnormal number, and the underflow and inexact exceptions are raised if 462the abstract input cannot be represented exactly as a subnormal double- 463precision floating-point number. 464 The input significand `zSig' has its binary point between bits 62 465and 61, which is 10 bits to the left of the usual location. This shifted 466significand must be normalized or smaller. If `zSig' is not normalized, 467`zExp' must be 0; in that case, the result returned is a subnormal number, 468and it must not require rounding. In the usual case that `zSig' is 469normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 470The handling of underflow and overflow follows the IEC/IEEE Standard for 471Binary Floating-Point Arithmetic. 472------------------------------------------------------------------------------- 473*/ 474static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 475{ 476 int8 roundingMode; 477 flag roundNearestEven; 478 int16 roundIncrement, roundBits; 479 flag isTiny; 480 481 roundingMode = float_rounding_mode; 482 roundNearestEven = ( roundingMode == float_round_nearest_even ); 483 roundIncrement = 0x200; 484 if ( ! roundNearestEven ) { 485 if ( roundingMode == float_round_to_zero ) { 486 roundIncrement = 0; 487 } 488 else { 489 roundIncrement = 0x3FF; 490 if ( zSign ) { 491 if ( roundingMode == float_round_up ) roundIncrement = 0; 492 } 493 else { 494 if ( roundingMode == float_round_down ) roundIncrement = 0; 495 } 496 } 497 } 498 roundBits = zSig & 0x3FF; 499 if ( 0x7FD <= (bits16) zExp ) { 500 if ( ( 0x7FD < zExp ) 501 || ( ( zExp == 0x7FD ) 502 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 503 ) { 504 float_raise( float_flag_overflow | float_flag_inexact ); 505 return FLOAT64_MANGLE( 506 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 507 ( roundIncrement == 0 )); 508 } 509 if ( zExp < 0 ) { 510 isTiny = 511 ( float_detect_tininess == float_tininess_before_rounding ) 512 || ( zExp < -1 ) 513 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 514 shift64RightJamming( zSig, - zExp, &zSig ); 515 zExp = 0; 516 roundBits = zSig & 0x3FF; 517 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 518 } 519 } 520 if ( roundBits ) float_exception_flags |= float_flag_inexact; 521 zSig = ( zSig + roundIncrement )>>10; 522 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 523 if ( zSig == 0 ) zExp = 0; 524 return packFloat64( zSign, zExp, zSig ); 525 526} 527 528/* 529------------------------------------------------------------------------------- 530Takes an abstract floating-point value having sign `zSign', exponent `zExp', 531and significand `zSig', and returns the proper double-precision floating- 532point value corresponding to the abstract input. This routine is just like 533`roundAndPackFloat64' except that `zSig' does not have to be normalized. 534Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 535floating-point exponent. 536------------------------------------------------------------------------------- 537*/ 538static float64 539 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 540{ 541 int8 shiftCount; 542 543 shiftCount = countLeadingZeros64( zSig ) - 1; 544 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 545 546} 547 548#ifdef FLOATX80 549 550/* 551------------------------------------------------------------------------------- 552Returns the fraction bits of the extended double-precision floating-point 553value `a'. 554------------------------------------------------------------------------------- 555*/ 556INLINE bits64 extractFloatx80Frac( floatx80 a ) 557{ 558 559 return a.low; 560 561} 562 563/* 564------------------------------------------------------------------------------- 565Returns the exponent bits of the extended double-precision floating-point 566value `a'. 567------------------------------------------------------------------------------- 568*/ 569INLINE int32 extractFloatx80Exp( floatx80 a ) 570{ 571 572 return a.high & 0x7FFF; 573 574} 575 576/* 577------------------------------------------------------------------------------- 578Returns the sign bit of the extended double-precision floating-point value 579`a'. 580------------------------------------------------------------------------------- 581*/ 582INLINE flag extractFloatx80Sign( floatx80 a ) 583{ 584 585 return a.high>>15; 586 587} 588 589/* 590------------------------------------------------------------------------------- 591Normalizes the subnormal extended double-precision floating-point value 592represented by the denormalized significand `aSig'. The normalized exponent 593and significand are stored at the locations pointed to by `zExpPtr' and 594`zSigPtr', respectively. 595------------------------------------------------------------------------------- 596*/ 597static void 598 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 599{ 600 int8 shiftCount; 601 602 shiftCount = countLeadingZeros64( aSig ); 603 *zSigPtr = aSig<<shiftCount; 604 *zExpPtr = 1 - shiftCount; 605 606} 607 608/* 609------------------------------------------------------------------------------- 610Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 611extended double-precision floating-point value, returning the result. 612------------------------------------------------------------------------------- 613*/ 614INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 615{ 616 floatx80 z; 617 618 z.low = zSig; 619 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 620 return z; 621 622} 623 624/* 625------------------------------------------------------------------------------- 626Takes an abstract floating-point value having sign `zSign', exponent `zExp', 627and extended significand formed by the concatenation of `zSig0' and `zSig1', 628and returns the proper extended double-precision floating-point value 629corresponding to the abstract input. Ordinarily, the abstract value is 630rounded and packed into the extended double-precision format, with the 631inexact exception raised if the abstract input cannot be represented 632exactly. However, if the abstract value is too large, the overflow and 633inexact exceptions are raised and an infinity or maximal finite value is 634returned. If the abstract value is too small, the input value is rounded to 635a subnormal number, and the underflow and inexact exceptions are raised if 636the abstract input cannot be represented exactly as a subnormal extended 637double-precision floating-point number. 638 If `roundingPrecision' is 32 or 64, the result is rounded to the same 639number of bits as single or double precision, respectively. Otherwise, the 640result is rounded to the full precision of the extended double-precision 641format. 642 The input significand must be normalized or smaller. If the input 643significand is not normalized, `zExp' must be 0; in that case, the result 644returned is a subnormal number, and it must not require rounding. The 645handling of underflow and overflow follows the IEC/IEEE Standard for Binary 646Floating-Point Arithmetic. 647------------------------------------------------------------------------------- 648*/ 649static floatx80 650 roundAndPackFloatx80( 651 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 652 ) 653{ 654 int8 roundingMode; 655 flag roundNearestEven, increment, isTiny; 656 int64 roundIncrement, roundMask, roundBits; 657 658 roundingMode = float_rounding_mode; 659 roundNearestEven = ( roundingMode == float_round_nearest_even ); 660 if ( roundingPrecision == 80 ) goto precision80; 661 if ( roundingPrecision == 64 ) { 662 roundIncrement = LIT64( 0x0000000000000400 ); 663 roundMask = LIT64( 0x00000000000007FF ); 664 } 665 else if ( roundingPrecision == 32 ) { 666 roundIncrement = LIT64( 0x0000008000000000 ); 667 roundMask = LIT64( 0x000000FFFFFFFFFF ); 668 } 669 else { 670 goto precision80; 671 } 672 zSig0 |= ( zSig1 != 0 ); 673 if ( ! roundNearestEven ) { 674 if ( roundingMode == float_round_to_zero ) { 675 roundIncrement = 0; 676 } 677 else { 678 roundIncrement = roundMask; 679 if ( zSign ) { 680 if ( roundingMode == float_round_up ) roundIncrement = 0; 681 } 682 else { 683 if ( roundingMode == float_round_down ) roundIncrement = 0; 684 } 685 } 686 } 687 roundBits = zSig0 & roundMask; 688 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 689 if ( ( 0x7FFE < zExp ) 690 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 691 ) { 692 goto overflow; 693 } 694 if ( zExp <= 0 ) { 695 isTiny = 696 ( float_detect_tininess == float_tininess_before_rounding ) 697 || ( zExp < 0 ) 698 || ( zSig0 <= zSig0 + roundIncrement ); 699 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 700 zExp = 0; 701 roundBits = zSig0 & roundMask; 702 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 703 if ( roundBits ) float_exception_flags |= float_flag_inexact; 704 zSig0 += roundIncrement; 705 if ( (sbits64) zSig0 < 0 ) zExp = 1; 706 roundIncrement = roundMask + 1; 707 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 708 roundMask |= roundIncrement; 709 } 710 zSig0 &= ~ roundMask; 711 return packFloatx80( zSign, zExp, zSig0 ); 712 } 713 } 714 if ( roundBits ) float_exception_flags |= float_flag_inexact; 715 zSig0 += roundIncrement; 716 if ( zSig0 < roundIncrement ) { 717 ++zExp; 718 zSig0 = LIT64( 0x8000000000000000 ); 719 } 720 roundIncrement = roundMask + 1; 721 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 722 roundMask |= roundIncrement; 723 } 724 zSig0 &= ~ roundMask; 725 if ( zSig0 == 0 ) zExp = 0; 726 return packFloatx80( zSign, zExp, zSig0 ); 727 precision80: 728 increment = ( (sbits64) zSig1 < 0 ); 729 if ( ! roundNearestEven ) { 730 if ( roundingMode == float_round_to_zero ) { 731 increment = 0; 732 } 733 else { 734 if ( zSign ) { 735 increment = ( roundingMode == float_round_down ) && zSig1; 736 } 737 else { 738 increment = ( roundingMode == float_round_up ) && zSig1; 739 } 740 } 741 } 742 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 743 if ( ( 0x7FFE < zExp ) 744 || ( ( zExp == 0x7FFE ) 745 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 746 && increment 747 ) 748 ) { 749 roundMask = 0; 750 overflow: 751 float_raise( float_flag_overflow | float_flag_inexact ); 752 if ( ( roundingMode == float_round_to_zero ) 753 || ( zSign && ( roundingMode == float_round_up ) ) 754 || ( ! zSign && ( roundingMode == float_round_down ) ) 755 ) { 756 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 757 } 758 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 759 } 760 if ( zExp <= 0 ) { 761 isTiny = 762 ( float_detect_tininess == float_tininess_before_rounding ) 763 || ( zExp < 0 ) 764 || ! increment 765 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 766 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 767 zExp = 0; 768 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 769 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 770 if ( roundNearestEven ) { 771 increment = ( (sbits64) zSig1 < 0 ); 772 } 773 else { 774 if ( zSign ) { 775 increment = ( roundingMode == float_round_down ) && zSig1; 776 } 777 else { 778 increment = ( roundingMode == float_round_up ) && zSig1; 779 } 780 } 781 if ( increment ) { 782 ++zSig0; 783 zSig0 &= 784 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 785 if ( (sbits64) zSig0 < 0 ) zExp = 1; 786 } 787 return packFloatx80( zSign, zExp, zSig0 ); 788 } 789 } 790 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 791 if ( increment ) { 792 ++zSig0; 793 if ( zSig0 == 0 ) { 794 ++zExp; 795 zSig0 = LIT64( 0x8000000000000000 ); 796 } 797 else { 798 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 799 } 800 } 801 else { 802 if ( zSig0 == 0 ) zExp = 0; 803 } 804 return packFloatx80( zSign, zExp, zSig0 ); 805 806} 807 808/* 809------------------------------------------------------------------------------- 810Takes an abstract floating-point value having sign `zSign', exponent 811`zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 812and returns the proper extended double-precision floating-point value 813corresponding to the abstract input. This routine is just like 814`roundAndPackFloatx80' except that the input significand does not have to be 815normalized. 816------------------------------------------------------------------------------- 817*/ 818static floatx80 819 normalizeRoundAndPackFloatx80( 820 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 821 ) 822{ 823 int8 shiftCount; 824 825 if ( zSig0 == 0 ) { 826 zSig0 = zSig1; 827 zSig1 = 0; 828 zExp -= 64; 829 } 830 shiftCount = countLeadingZeros64( zSig0 ); 831 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 832 zExp -= shiftCount; 833 return 834 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 835 836} 837 838#endif 839 840#ifdef FLOAT128 841 842/* 843------------------------------------------------------------------------------- 844Returns the least-significant 64 fraction bits of the quadruple-precision 845floating-point value `a'. 846------------------------------------------------------------------------------- 847*/ 848INLINE bits64 extractFloat128Frac1( float128 a ) 849{ 850 851 return a.low; 852 853} 854 855/* 856------------------------------------------------------------------------------- 857Returns the most-significant 48 fraction bits of the quadruple-precision 858floating-point value `a'. 859------------------------------------------------------------------------------- 860*/ 861INLINE bits64 extractFloat128Frac0( float128 a ) 862{ 863 864 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 865 866} 867 868/* 869------------------------------------------------------------------------------- 870Returns the exponent bits of the quadruple-precision floating-point value 871`a'. 872------------------------------------------------------------------------------- 873*/ 874INLINE int32 extractFloat128Exp( float128 a ) 875{ 876 877 return ( a.high>>48 ) & 0x7FFF; 878 879} 880 881/* 882------------------------------------------------------------------------------- 883Returns the sign bit of the quadruple-precision floating-point value `a'. 884------------------------------------------------------------------------------- 885*/ 886INLINE flag extractFloat128Sign( float128 a ) 887{ 888 889 return a.high>>63; 890 891} 892 893/* 894------------------------------------------------------------------------------- 895Normalizes the subnormal quadruple-precision floating-point value 896represented by the denormalized significand formed by the concatenation of 897`aSig0' and `aSig1'. The normalized exponent is stored at the location 898pointed to by `zExpPtr'. The most significant 49 bits of the normalized 899significand are stored at the location pointed to by `zSig0Ptr', and the 900least significant 64 bits of the normalized significand are stored at the 901location pointed to by `zSig1Ptr'. 902------------------------------------------------------------------------------- 903*/ 904static void 905 normalizeFloat128Subnormal( 906 bits64 aSig0, 907 bits64 aSig1, 908 int32 *zExpPtr, 909 bits64 *zSig0Ptr, 910 bits64 *zSig1Ptr 911 ) 912{ 913 int8 shiftCount; 914 915 if ( aSig0 == 0 ) { 916 shiftCount = countLeadingZeros64( aSig1 ) - 15; 917 if ( shiftCount < 0 ) { 918 *zSig0Ptr = aSig1>>( - shiftCount ); 919 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 920 } 921 else { 922 *zSig0Ptr = aSig1<<shiftCount; 923 *zSig1Ptr = 0; 924 } 925 *zExpPtr = - shiftCount - 63; 926 } 927 else { 928 shiftCount = countLeadingZeros64( aSig0 ) - 15; 929 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 930 *zExpPtr = 1 - shiftCount; 931 } 932 933} 934 935/* 936------------------------------------------------------------------------------- 937Packs the sign `zSign', the exponent `zExp', and the significand formed 938by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 939floating-point value, returning the result. After being shifted into the 940proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 941added together to form the most significant 32 bits of the result. This 942means that any integer portion of `zSig0' will be added into the exponent. 943Since a properly normalized significand will have an integer portion equal 944to 1, the `zExp' input should be 1 less than the desired result exponent 945whenever `zSig0' and `zSig1' concatenated form a complete, normalized 946significand. 947------------------------------------------------------------------------------- 948*/ 949INLINE float128 950 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 951{ 952 float128 z; 953 954 z.low = zSig1; 955 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 956 return z; 957 958} 959 960/* 961------------------------------------------------------------------------------- 962Takes an abstract floating-point value having sign `zSign', exponent `zExp', 963and extended significand formed by the concatenation of `zSig0', `zSig1', 964and `zSig2', and returns the proper quadruple-precision floating-point value 965corresponding to the abstract input. Ordinarily, the abstract value is 966simply rounded and packed into the quadruple-precision format, with the 967inexact exception raised if the abstract input cannot be represented 968exactly. However, if the abstract value is too large, the overflow and 969inexact exceptions are raised and an infinity or maximal finite value is 970returned. If the abstract value is too small, the input value is rounded to 971a subnormal number, and the underflow and inexact exceptions are raised if 972the abstract input cannot be represented exactly as a subnormal quadruple- 973precision floating-point number. 974 The input significand must be normalized or smaller. If the input 975significand is not normalized, `zExp' must be 0; in that case, the result 976returned is a subnormal number, and it must not require rounding. In the 977usual case that the input significand is normalized, `zExp' must be 1 less 978than the ``true'' floating-point exponent. The handling of underflow and 979overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 980------------------------------------------------------------------------------- 981*/ 982static float128 983 roundAndPackFloat128( 984 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 985{ 986 int8 roundingMode; 987 flag roundNearestEven, increment, isTiny; 988 989 roundingMode = float_rounding_mode; 990 roundNearestEven = ( roundingMode == float_round_nearest_even ); 991 increment = ( (sbits64) zSig2 < 0 ); 992 if ( ! roundNearestEven ) { 993 if ( roundingMode == float_round_to_zero ) { 994 increment = 0; 995 } 996 else { 997 if ( zSign ) { 998 increment = ( roundingMode == float_round_down ) && zSig2; 999 } 1000 else { 1001 increment = ( roundingMode == float_round_up ) && zSig2; 1002 } 1003 } 1004 } 1005 if ( 0x7FFD <= (bits32) zExp ) { 1006 if ( ( 0x7FFD < zExp ) 1007 || ( ( zExp == 0x7FFD ) 1008 && eq128( 1009 LIT64( 0x0001FFFFFFFFFFFF ), 1010 LIT64( 0xFFFFFFFFFFFFFFFF ), 1011 zSig0, 1012 zSig1 1013 ) 1014 && increment 1015 ) 1016 ) { 1017 float_raise( float_flag_overflow | float_flag_inexact ); 1018 if ( ( roundingMode == float_round_to_zero ) 1019 || ( zSign && ( roundingMode == float_round_up ) ) 1020 || ( ! zSign && ( roundingMode == float_round_down ) ) 1021 ) { 1022 return 1023 packFloat128( 1024 zSign, 1025 0x7FFE, 1026 LIT64( 0x0000FFFFFFFFFFFF ), 1027 LIT64( 0xFFFFFFFFFFFFFFFF ) 1028 ); 1029 } 1030 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1031 } 1032 if ( zExp < 0 ) { 1033 isTiny = 1034 ( float_detect_tininess == float_tininess_before_rounding ) 1035 || ( zExp < -1 ) 1036 || ! increment 1037 || lt128( 1038 zSig0, 1039 zSig1, 1040 LIT64( 0x0001FFFFFFFFFFFF ), 1041 LIT64( 0xFFFFFFFFFFFFFFFF ) 1042 ); 1043 shift128ExtraRightJamming( 1044 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1045 zExp = 0; 1046 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1047 if ( roundNearestEven ) { 1048 increment = ( (sbits64) zSig2 < 0 ); 1049 } 1050 else { 1051 if ( zSign ) { 1052 increment = ( roundingMode == float_round_down ) && zSig2; 1053 } 1054 else { 1055 increment = ( roundingMode == float_round_up ) && zSig2; 1056 } 1057 } 1058 } 1059 } 1060 if ( zSig2 ) float_exception_flags |= float_flag_inexact; 1061 if ( increment ) { 1062 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1063 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1064 } 1065 else { 1066 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1067 } 1068 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1069 1070} 1071 1072/* 1073------------------------------------------------------------------------------- 1074Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1075and significand formed by the concatenation of `zSig0' and `zSig1', and 1076returns the proper quadruple-precision floating-point value corresponding 1077to the abstract input. This routine is just like `roundAndPackFloat128' 1078except that the input significand has fewer bits and does not have to be 1079normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1080point exponent. 1081------------------------------------------------------------------------------- 1082*/ 1083static float128 1084 normalizeRoundAndPackFloat128( 1085 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1086{ 1087 int8 shiftCount; 1088 bits64 zSig2; 1089 1090 if ( zSig0 == 0 ) { 1091 zSig0 = zSig1; 1092 zSig1 = 0; 1093 zExp -= 64; 1094 } 1095 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1096 if ( 0 <= shiftCount ) { 1097 zSig2 = 0; 1098 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1099 } 1100 else { 1101 shift128ExtraRightJamming( 1102 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1103 } 1104 zExp -= shiftCount; 1105 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1106 1107} 1108 1109#endif 1110 1111/* 1112------------------------------------------------------------------------------- 1113Returns the result of converting the 32-bit two's complement integer `a' 1114to the single-precision floating-point format. The conversion is performed 1115according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1116------------------------------------------------------------------------------- 1117*/ 1118float32 int32_to_float32( int32 a ) 1119{ 1120 flag zSign; 1121 1122 if ( a == 0 ) return 0; 1123 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1124 zSign = ( a < 0 ); 1125 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 1126 1127} 1128 1129float32 uint32_to_float32( uint32 a ) 1130{ 1131 if ( a == 0 ) return 0; 1132 if ( a & (bits32) 0x80000000 ) 1133 return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 ); 1134 return normalizeRoundAndPackFloat32( 0, 0x9C, a ); 1135} 1136 1137 1138/* 1139------------------------------------------------------------------------------- 1140Returns the result of converting the 32-bit two's complement integer `a' 1141to the double-precision floating-point format. The conversion is performed 1142according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1143------------------------------------------------------------------------------- 1144*/ 1145float64 int32_to_float64( int32 a ) 1146{ 1147 flag zSign; 1148 uint32 absA; 1149 int8 shiftCount; 1150 bits64 zSig; 1151 1152 if ( a == 0 ) return 0; 1153 zSign = ( a < 0 ); 1154 absA = zSign ? - a : a; 1155 shiftCount = countLeadingZeros32( absA ) + 21; 1156 zSig = absA; 1157 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1158 1159} 1160 1161float64 uint32_to_float64( uint32 a ) 1162{ 1163 int8 shiftCount; 1164 bits64 zSig = a; 1165 1166 if ( a == 0 ) return 0; 1167 shiftCount = countLeadingZeros32( a ) + 21; 1168 return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount ); 1169 1170} 1171 1172#ifdef FLOATX80 1173 1174/* 1175------------------------------------------------------------------------------- 1176Returns the result of converting the 32-bit two's complement integer `a' 1177to the extended double-precision floating-point format. The conversion 1178is performed according to the IEC/IEEE Standard for Binary Floating-Point 1179Arithmetic. 1180------------------------------------------------------------------------------- 1181*/ 1182floatx80 int32_to_floatx80( int32 a ) 1183{ 1184 flag zSign; 1185 uint32 absA; 1186 int8 shiftCount; 1187 bits64 zSig; 1188 1189 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1190 zSign = ( a < 0 ); 1191 absA = zSign ? - a : a; 1192 shiftCount = countLeadingZeros32( absA ) + 32; 1193 zSig = absA; 1194 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1195 1196} 1197 1198floatx80 uint32_to_floatx80( uint32 a ) 1199{ 1200 int8 shiftCount; 1201 bits64 zSig = a; 1202 1203 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1204 shiftCount = countLeadingZeros32( a ) + 32; 1205 return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount ); 1206 1207} 1208 1209#endif 1210 1211#ifdef FLOAT128 1212 1213/* 1214------------------------------------------------------------------------------- 1215Returns the result of converting the 32-bit two's complement integer `a' to 1216the quadruple-precision floating-point format. The conversion is performed 1217according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1218------------------------------------------------------------------------------- 1219*/ 1220float128 int32_to_float128( int32 a ) 1221{ 1222 flag zSign; 1223 uint32 absA; 1224 int8 shiftCount; 1225 bits64 zSig0; 1226 1227 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1228 zSign = ( a < 0 ); 1229 absA = zSign ? - a : a; 1230 shiftCount = countLeadingZeros32( absA ) + 17; 1231 zSig0 = absA; 1232 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1233 1234} 1235 1236float128 uint32_to_float128( uint32 a ) 1237{ 1238 int8 shiftCount; 1239 bits64 zSig0 = a; 1240 1241 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1242 shiftCount = countLeadingZeros32( a ) + 17; 1243 return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1244 1245} 1246 1247#endif 1248 1249#ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1250/* 1251------------------------------------------------------------------------------- 1252Returns the result of converting the 64-bit two's complement integer `a' 1253to the single-precision floating-point format. The conversion is performed 1254according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1255------------------------------------------------------------------------------- 1256*/ 1257float32 int64_to_float32( int64 a ) 1258{ 1259 flag zSign; 1260 uint64 absA; 1261 int8 shiftCount; 1262 1263 if ( a == 0 ) return 0; 1264 zSign = ( a < 0 ); 1265 absA = zSign ? - a : a; 1266 shiftCount = countLeadingZeros64( absA ) - 40; 1267 if ( 0 <= shiftCount ) { 1268 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1269 } 1270 else { 1271 shiftCount += 7; 1272 if ( shiftCount < 0 ) { 1273 shift64RightJamming( absA, - shiftCount, &absA ); 1274 } 1275 else { 1276 absA <<= shiftCount; 1277 } 1278 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1279 } 1280 1281} 1282 1283/* 1284------------------------------------------------------------------------------- 1285Returns the result of converting the 64-bit two's complement integer `a' 1286to the double-precision floating-point format. The conversion is performed 1287according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1288------------------------------------------------------------------------------- 1289*/ 1290float64 int64_to_float64( int64 a ) 1291{ 1292 flag zSign; 1293 1294 if ( a == 0 ) return 0; 1295 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1296 return packFloat64( 1, 0x43E, 0 ); 1297 } 1298 zSign = ( a < 0 ); 1299 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1300 1301} 1302 1303#ifdef FLOATX80 1304 1305/* 1306------------------------------------------------------------------------------- 1307Returns the result of converting the 64-bit two's complement integer `a' 1308to the extended double-precision floating-point format. The conversion 1309is performed according to the IEC/IEEE Standard for Binary Floating-Point 1310Arithmetic. 1311------------------------------------------------------------------------------- 1312*/ 1313floatx80 int64_to_floatx80( int64 a ) 1314{ 1315 flag zSign; 1316 uint64 absA; 1317 int8 shiftCount; 1318 1319 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1320 zSign = ( a < 0 ); 1321 absA = zSign ? - a : a; 1322 shiftCount = countLeadingZeros64( absA ); 1323 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1324 1325} 1326 1327#endif 1328 1329#endif /* !SOFTFLOAT_FOR_GCC */ 1330 1331#ifdef FLOAT128 1332 1333/* 1334------------------------------------------------------------------------------- 1335Returns the result of converting the 64-bit two's complement integer `a' to 1336the quadruple-precision floating-point format. The conversion is performed 1337according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1338------------------------------------------------------------------------------- 1339*/ 1340float128 int64_to_float128( int64 a ) 1341{ 1342 flag zSign; 1343 uint64 absA; 1344 int8 shiftCount; 1345 int32 zExp; 1346 bits64 zSig0, zSig1; 1347 1348 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1349 zSign = ( a < 0 ); 1350 absA = zSign ? - a : a; 1351 shiftCount = countLeadingZeros64( absA ) + 49; 1352 zExp = 0x406E - shiftCount; 1353 if ( 64 <= shiftCount ) { 1354 zSig1 = 0; 1355 zSig0 = absA; 1356 shiftCount -= 64; 1357 } 1358 else { 1359 zSig1 = absA; 1360 zSig0 = 0; 1361 } 1362 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1363 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1364 1365} 1366 1367#endif 1368 1369#ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1370/* 1371------------------------------------------------------------------------------- 1372Returns the result of converting the single-precision floating-point value 1373`a' to the 32-bit two's complement integer format. The conversion is 1374performed according to the IEC/IEEE Standard for Binary Floating-Point 1375Arithmetic---which means in particular that the conversion is rounded 1376according to the current rounding mode. If `a' is a NaN, the largest 1377positive integer is returned. Otherwise, if the conversion overflows, the 1378largest integer with the same sign as `a' is returned. 1379------------------------------------------------------------------------------- 1380*/ 1381int32 float32_to_int32( float32 a ) 1382{ 1383 flag aSign; 1384 int16 aExp, shiftCount; 1385 bits32 aSig; 1386 bits64 aSig64; 1387 1388 aSig = extractFloat32Frac( a ); 1389 aExp = extractFloat32Exp( a ); 1390 aSign = extractFloat32Sign( a ); 1391 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1392 if ( aExp ) aSig |= 0x00800000; 1393 shiftCount = 0xAF - aExp; 1394 aSig64 = aSig; 1395 aSig64 <<= 32; 1396 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1397 return roundAndPackInt32( aSign, aSig64 ); 1398 1399} 1400#endif /* !SOFTFLOAT_FOR_GCC */ 1401 1402/* 1403------------------------------------------------------------------------------- 1404Returns the result of converting the single-precision floating-point value 1405`a' to the 32-bit two's complement integer format. The conversion is 1406performed according to the IEC/IEEE Standard for Binary Floating-Point 1407Arithmetic, except that the conversion is always rounded toward zero. 1408If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1409the conversion overflows, the largest integer with the same sign as `a' is 1410returned. 1411------------------------------------------------------------------------------- 1412*/ 1413int32 float32_to_int32_round_to_zero( float32 a ) 1414{ 1415 flag aSign; 1416 int16 aExp, shiftCount; 1417 bits32 aSig; 1418 int32 z; 1419 1420 aSig = extractFloat32Frac( a ); 1421 aExp = extractFloat32Exp( a ); 1422 aSign = extractFloat32Sign( a ); 1423 shiftCount = aExp - 0x9E; 1424 if ( 0 <= shiftCount ) { 1425 if ( a != 0xCF000000 ) { 1426 float_raise( float_flag_invalid ); 1427 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1428 } 1429 return (sbits32) 0x80000000; 1430 } 1431 else if ( aExp <= 0x7E ) { 1432 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 1433 return 0; 1434 } 1435 aSig = ( aSig | 0x00800000 )<<8; 1436 z = aSig>>( - shiftCount ); 1437 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1438 float_exception_flags |= float_flag_inexact; 1439 } 1440 if ( aSign ) z = - z; 1441 return z; 1442 1443} 1444 1445#ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1446/* 1447------------------------------------------------------------------------------- 1448Returns the result of converting the single-precision floating-point value 1449`a' to the 64-bit two's complement integer format. The conversion is 1450performed according to the IEC/IEEE Standard for Binary Floating-Point 1451Arithmetic---which means in particular that the conversion is rounded 1452according to the current rounding mode. If `a' is a NaN, the largest 1453positive integer is returned. Otherwise, if the conversion overflows, the 1454largest integer with the same sign as `a' is returned. 1455------------------------------------------------------------------------------- 1456*/ 1457int64 float32_to_int64( float32 a ) 1458{ 1459 flag aSign; 1460 int16 aExp, shiftCount; 1461 bits32 aSig; 1462 bits64 aSig64, aSigExtra; 1463 1464 aSig = extractFloat32Frac( a ); 1465 aExp = extractFloat32Exp( a ); 1466 aSign = extractFloat32Sign( a ); 1467 shiftCount = 0xBE - aExp; 1468 if ( shiftCount < 0 ) { 1469 float_raise( float_flag_invalid ); 1470 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1471 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1472 } 1473 return (sbits64) LIT64( 0x8000000000000000 ); 1474 } 1475 if ( aExp ) aSig |= 0x00800000; 1476 aSig64 = aSig; 1477 aSig64 <<= 40; 1478 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1479 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1480 1481} 1482 1483/* 1484------------------------------------------------------------------------------- 1485Returns the result of converting the single-precision floating-point value 1486`a' to the 64-bit two's complement integer format. The conversion is 1487performed according to the IEC/IEEE Standard for Binary Floating-Point 1488Arithmetic, except that the conversion is always rounded toward zero. If 1489`a' is a NaN, the largest positive integer is returned. Otherwise, if the 1490conversion overflows, the largest integer with the same sign as `a' is 1491returned. 1492------------------------------------------------------------------------------- 1493*/ 1494int64 float32_to_int64_round_to_zero( float32 a ) 1495{ 1496 flag aSign; 1497 int16 aExp, shiftCount; 1498 bits32 aSig; 1499 bits64 aSig64; 1500 int64 z; 1501 1502 aSig = extractFloat32Frac( a ); 1503 aExp = extractFloat32Exp( a ); 1504 aSign = extractFloat32Sign( a ); 1505 shiftCount = aExp - 0xBE; 1506 if ( 0 <= shiftCount ) { 1507 if ( a != 0xDF000000 ) { 1508 float_raise( float_flag_invalid ); 1509 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1510 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1511 } 1512 } 1513 return (sbits64) LIT64( 0x8000000000000000 ); 1514 } 1515 else if ( aExp <= 0x7E ) { 1516 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 1517 return 0; 1518 } 1519 aSig64 = aSig | 0x00800000; 1520 aSig64 <<= 40; 1521 z = aSig64>>( - shiftCount ); 1522 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1523 float_exception_flags |= float_flag_inexact; 1524 } 1525 if ( aSign ) z = - z; 1526 return z; 1527 1528} 1529#endif /* !SOFTFLOAT_FOR_GCC */ 1530 1531/* 1532------------------------------------------------------------------------------- 1533Returns the result of converting the single-precision floating-point value 1534`a' to the double-precision floating-point format. The conversion is 1535performed according to the IEC/IEEE Standard for Binary Floating-Point 1536Arithmetic. 1537------------------------------------------------------------------------------- 1538*/ 1539float64 float32_to_float64( float32 a ) 1540{ 1541 flag aSign; 1542 int16 aExp; 1543 bits32 aSig; 1544 1545 aSig = extractFloat32Frac( a ); 1546 aExp = extractFloat32Exp( a ); 1547 aSign = extractFloat32Sign( a ); 1548 if ( aExp == 0xFF ) { 1549 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1550 return packFloat64( aSign, 0x7FF, 0 ); 1551 } 1552 if ( aExp == 0 ) { 1553 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1554 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1555 --aExp; 1556 } 1557 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1558 1559} 1560 1561#ifdef FLOATX80 1562 1563/* 1564------------------------------------------------------------------------------- 1565Returns the result of converting the single-precision floating-point value 1566`a' to the extended double-precision floating-point format. The conversion 1567is performed according to the IEC/IEEE Standard for Binary Floating-Point 1568Arithmetic. 1569------------------------------------------------------------------------------- 1570*/ 1571floatx80 float32_to_floatx80( float32 a ) 1572{ 1573 flag aSign; 1574 int16 aExp; 1575 bits32 aSig; 1576 1577 aSig = extractFloat32Frac( a ); 1578 aExp = extractFloat32Exp( a ); 1579 aSign = extractFloat32Sign( a ); 1580 if ( aExp == 0xFF ) { 1581 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1582 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1583 } 1584 if ( aExp == 0 ) { 1585 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1586 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1587 } 1588 aSig |= 0x00800000; 1589 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1590 1591} 1592 1593#endif 1594 1595#ifdef FLOAT128 1596 1597/* 1598------------------------------------------------------------------------------- 1599Returns the result of converting the single-precision floating-point value 1600`a' to the double-precision floating-point format. The conversion is 1601performed according to the IEC/IEEE Standard for Binary Floating-Point 1602Arithmetic. 1603------------------------------------------------------------------------------- 1604*/ 1605float128 float32_to_float128( float32 a ) 1606{ 1607 flag aSign; 1608 int16 aExp; 1609 bits32 aSig; 1610 1611 aSig = extractFloat32Frac( a ); 1612 aExp = extractFloat32Exp( a ); 1613 aSign = extractFloat32Sign( a ); 1614 if ( aExp == 0xFF ) { 1615 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1616 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1617 } 1618 if ( aExp == 0 ) { 1619 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1620 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1621 --aExp; 1622 } 1623 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1624 1625} 1626 1627#endif 1628 1629#ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1630/* 1631------------------------------------------------------------------------------- 1632Rounds the single-precision floating-point value `a' to an integer, and 1633returns the result as a single-precision floating-point value. The 1634operation is performed according to the IEC/IEEE Standard for Binary 1635Floating-Point Arithmetic. 1636------------------------------------------------------------------------------- 1637*/ 1638float32 float32_round_to_int( float32 a ) 1639{ 1640 flag aSign; 1641 int16 aExp; 1642 bits32 lastBitMask, roundBitsMask; 1643 int8 roundingMode; 1644 float32 z; 1645 1646 aExp = extractFloat32Exp( a ); 1647 if ( 0x96 <= aExp ) { 1648 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1649 return propagateFloat32NaN( a, a ); 1650 } 1651 return a; 1652 } 1653 if ( aExp <= 0x7E ) { 1654 if ( (bits32) ( a<<1 ) == 0 ) return a; 1655 float_exception_flags |= float_flag_inexact; 1656 aSign = extractFloat32Sign( a ); 1657 switch ( float_rounding_mode ) { 1658 case float_round_nearest_even: 1659 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1660 return packFloat32( aSign, 0x7F, 0 ); 1661 } 1662 break; 1663 case float_round_to_zero: 1664 break; 1665 case float_round_down: 1666 return aSign ? 0xBF800000 : 0; 1667 case float_round_up: 1668 return aSign ? 0x80000000 : 0x3F800000; 1669 } 1670 return packFloat32( aSign, 0, 0 ); 1671 } 1672 lastBitMask = 1; 1673 lastBitMask <<= 0x96 - aExp; 1674 roundBitsMask = lastBitMask - 1; 1675 z = a; 1676 roundingMode = float_rounding_mode; 1677 if ( roundingMode == float_round_nearest_even ) { 1678 z += lastBitMask>>1; 1679 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1680 } 1681 else if ( roundingMode != float_round_to_zero ) { 1682 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1683 z += roundBitsMask; 1684 } 1685 } 1686 z &= ~ roundBitsMask; 1687 if ( z != a ) float_exception_flags |= float_flag_inexact; 1688 return z; 1689 1690} 1691#endif /* !SOFTFLOAT_FOR_GCC */ 1692 1693/* 1694------------------------------------------------------------------------------- 1695Returns the result of adding the absolute values of the single-precision 1696floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1697before being returned. `zSign' is ignored if the result is a NaN. 1698The addition is performed according to the IEC/IEEE Standard for Binary 1699Floating-Point Arithmetic. 1700------------------------------------------------------------------------------- 1701*/ 1702static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1703{ 1704 int16 aExp, bExp, zExp; 1705 bits32 aSig, bSig, zSig; 1706 int16 expDiff; 1707 1708 aSig = extractFloat32Frac( a ); 1709 aExp = extractFloat32Exp( a ); 1710 bSig = extractFloat32Frac( b ); 1711 bExp = extractFloat32Exp( b ); 1712 expDiff = aExp - bExp; 1713 aSig <<= 6; 1714 bSig <<= 6; 1715 if ( 0 < expDiff ) { 1716 if ( aExp == 0xFF ) { 1717 if ( aSig ) return propagateFloat32NaN( a, b ); 1718 return a; 1719 } 1720 if ( bExp == 0 ) { 1721 --expDiff; 1722 } 1723 else { 1724 bSig |= 0x20000000; 1725 } 1726 shift32RightJamming( bSig, expDiff, &bSig ); 1727 zExp = aExp; 1728 } 1729 else if ( expDiff < 0 ) { 1730 if ( bExp == 0xFF ) { 1731 if ( bSig ) return propagateFloat32NaN( a, b ); 1732 return packFloat32( zSign, 0xFF, 0 ); 1733 } 1734 if ( aExp == 0 ) { 1735 ++expDiff; 1736 } 1737 else { 1738 aSig |= 0x20000000; 1739 } 1740 shift32RightJamming( aSig, - expDiff, &aSig ); 1741 zExp = bExp; 1742 } 1743 else { 1744 if ( aExp == 0xFF ) { 1745 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1746 return a; 1747 } 1748 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1749 zSig = 0x40000000 + aSig + bSig; 1750 zExp = aExp; 1751 goto roundAndPack; 1752 } 1753 aSig |= 0x20000000; 1754 zSig = ( aSig + bSig )<<1; 1755 --zExp; 1756 if ( (sbits32) zSig < 0 ) { 1757 zSig = aSig + bSig; 1758 ++zExp; 1759 } 1760 roundAndPack: 1761 return roundAndPackFloat32( zSign, zExp, zSig ); 1762 1763} 1764 1765/* 1766------------------------------------------------------------------------------- 1767Returns the result of subtracting the absolute values of the single- 1768precision floating-point values `a' and `b'. If `zSign' is 1, the 1769difference is negated before being returned. `zSign' is ignored if the 1770result is a NaN. The subtraction is performed according to the IEC/IEEE 1771Standard for Binary Floating-Point Arithmetic. 1772------------------------------------------------------------------------------- 1773*/ 1774static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1775{ 1776 int16 aExp, bExp, zExp; 1777 bits32 aSig, bSig, zSig; 1778 int16 expDiff; 1779 1780 aSig = extractFloat32Frac( a ); 1781 aExp = extractFloat32Exp( a ); 1782 bSig = extractFloat32Frac( b ); 1783 bExp = extractFloat32Exp( b ); 1784 expDiff = aExp - bExp; 1785 aSig <<= 7; 1786 bSig <<= 7; 1787 if ( 0 < expDiff ) goto aExpBigger; 1788 if ( expDiff < 0 ) goto bExpBigger; 1789 if ( aExp == 0xFF ) { 1790 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1791 float_raise( float_flag_invalid ); 1792 return float32_default_nan; 1793 } 1794 if ( aExp == 0 ) { 1795 aExp = 1; 1796 bExp = 1; 1797 } 1798 if ( bSig < aSig ) goto aBigger; 1799 if ( aSig < bSig ) goto bBigger; 1800 return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); 1801 bExpBigger: 1802 if ( bExp == 0xFF ) { 1803 if ( bSig ) return propagateFloat32NaN( a, b ); 1804 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1805 } 1806 if ( aExp == 0 ) { 1807 ++expDiff; 1808 } 1809 else { 1810 aSig |= 0x40000000; 1811 } 1812 shift32RightJamming( aSig, - expDiff, &aSig ); 1813 bSig |= 0x40000000; 1814 bBigger: 1815 zSig = bSig - aSig; 1816 zExp = bExp; 1817 zSign ^= 1; 1818 goto normalizeRoundAndPack; 1819 aExpBigger: 1820 if ( aExp == 0xFF ) { 1821 if ( aSig ) return propagateFloat32NaN( a, b ); 1822 return a; 1823 } 1824 if ( bExp == 0 ) { 1825 --expDiff; 1826 } 1827 else { 1828 bSig |= 0x40000000; 1829 } 1830 shift32RightJamming( bSig, expDiff, &bSig ); 1831 aSig |= 0x40000000; 1832 aBigger: 1833 zSig = aSig - bSig; 1834 zExp = aExp; 1835 normalizeRoundAndPack: 1836 --zExp; 1837 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1838 1839} 1840 1841/* 1842------------------------------------------------------------------------------- 1843Returns the result of adding the single-precision floating-point values `a' 1844and `b'. The operation is performed according to the IEC/IEEE Standard for 1845Binary Floating-Point Arithmetic. 1846------------------------------------------------------------------------------- 1847*/ 1848float32 float32_add( float32 a, float32 b ) 1849{ 1850 flag aSign, bSign; 1851 1852 aSign = extractFloat32Sign( a ); 1853 bSign = extractFloat32Sign( b ); 1854 if ( aSign == bSign ) { 1855 return addFloat32Sigs( a, b, aSign ); 1856 } 1857 else { 1858 return subFloat32Sigs( a, b, aSign ); 1859 } 1860 1861} 1862 1863/* 1864------------------------------------------------------------------------------- 1865Returns the result of subtracting the single-precision floating-point values 1866`a' and `b'. The operation is performed according to the IEC/IEEE Standard 1867for Binary Floating-Point Arithmetic. 1868------------------------------------------------------------------------------- 1869*/ 1870float32 float32_sub( float32 a, float32 b ) 1871{ 1872 flag aSign, bSign; 1873 1874 aSign = extractFloat32Sign( a ); 1875 bSign = extractFloat32Sign( b ); 1876 if ( aSign == bSign ) { 1877 return subFloat32Sigs( a, b, aSign ); 1878 } 1879 else { 1880 return addFloat32Sigs( a, b, aSign ); 1881 } 1882 1883} 1884 1885/* 1886------------------------------------------------------------------------------- 1887Returns the result of multiplying the single-precision floating-point values 1888`a' and `b'. The operation is performed according to the IEC/IEEE Standard 1889for Binary Floating-Point Arithmetic. 1890------------------------------------------------------------------------------- 1891*/ 1892float32 float32_mul( float32 a, float32 b ) 1893{ 1894 flag aSign, bSign, zSign; 1895 int16 aExp, bExp, zExp; 1896 bits32 aSig, bSig; 1897 bits64 zSig64; 1898 bits32 zSig; 1899 1900 aSig = extractFloat32Frac( a ); 1901 aExp = extractFloat32Exp( a ); 1902 aSign = extractFloat32Sign( a ); 1903 bSig = extractFloat32Frac( b ); 1904 bExp = extractFloat32Exp( b ); 1905 bSign = extractFloat32Sign( b ); 1906 zSign = aSign ^ bSign; 1907 if ( aExp == 0xFF ) { 1908 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1909 return propagateFloat32NaN( a, b ); 1910 } 1911 if ( ( bExp | bSig ) == 0 ) { 1912 float_raise( float_flag_invalid ); 1913 return float32_default_nan; 1914 } 1915 return packFloat32( zSign, 0xFF, 0 ); 1916 } 1917 if ( bExp == 0xFF ) { 1918 if ( bSig ) return propagateFloat32NaN( a, b ); 1919 if ( ( aExp | aSig ) == 0 ) { 1920 float_raise( float_flag_invalid ); 1921 return float32_default_nan; 1922 } 1923 return packFloat32( zSign, 0xFF, 0 ); 1924 } 1925 if ( aExp == 0 ) { 1926 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1927 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1928 } 1929 if ( bExp == 0 ) { 1930 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1931 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1932 } 1933 zExp = aExp + bExp - 0x7F; 1934 aSig = ( aSig | 0x00800000 )<<7; 1935 bSig = ( bSig | 0x00800000 )<<8; 1936 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1937 zSig = zSig64; 1938 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1939 zSig <<= 1; 1940 --zExp; 1941 } 1942 return roundAndPackFloat32( zSign, zExp, zSig ); 1943 1944} 1945 1946/* 1947------------------------------------------------------------------------------- 1948Returns the result of dividing the single-precision floating-point value `a' 1949by the corresponding value `b'. The operation is performed according to the 1950IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1951------------------------------------------------------------------------------- 1952*/ 1953float32 float32_div( float32 a, float32 b ) 1954{ 1955 flag aSign, bSign, zSign; 1956 int16 aExp, bExp, zExp; 1957 bits32 aSig, bSig, zSig; 1958 1959 aSig = extractFloat32Frac( a ); 1960 aExp = extractFloat32Exp( a ); 1961 aSign = extractFloat32Sign( a ); 1962 bSig = extractFloat32Frac( b ); 1963 bExp = extractFloat32Exp( b ); 1964 bSign = extractFloat32Sign( b ); 1965 zSign = aSign ^ bSign; 1966 if ( aExp == 0xFF ) { 1967 if ( aSig ) return propagateFloat32NaN( a, b ); 1968 if ( bExp == 0xFF ) { 1969 if ( bSig ) return propagateFloat32NaN( a, b ); 1970 float_raise( float_flag_invalid ); 1971 return float32_default_nan; 1972 } 1973 return packFloat32( zSign, 0xFF, 0 ); 1974 } 1975 if ( bExp == 0xFF ) { 1976 if ( bSig ) return propagateFloat32NaN( a, b ); 1977 return packFloat32( zSign, 0, 0 ); 1978 } 1979 if ( bExp == 0 ) { 1980 if ( bSig == 0 ) { 1981 if ( ( aExp | aSig ) == 0 ) { 1982 float_raise( float_flag_invalid ); 1983 return float32_default_nan; 1984 } 1985 float_raise( float_flag_divbyzero ); 1986 return packFloat32( zSign, 0xFF, 0 ); 1987 } 1988 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1989 } 1990 if ( aExp == 0 ) { 1991 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1992 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1993 } 1994 zExp = aExp - bExp + 0x7D; 1995 aSig = ( aSig | 0x00800000 )<<7; 1996 bSig = ( bSig | 0x00800000 )<<8; 1997 if ( bSig <= ( aSig + aSig ) ) { 1998 aSig >>= 1; 1999 ++zExp; 2000 } 2001 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 2002 if ( ( zSig & 0x3F ) == 0 ) { 2003 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 2004 } 2005 return roundAndPackFloat32( zSign, zExp, zSig ); 2006 2007} 2008 2009#ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2010/* 2011------------------------------------------------------------------------------- 2012Returns the remainder of the single-precision floating-point value `a' 2013with respect to the corresponding value `b'. The operation is performed 2014according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2015------------------------------------------------------------------------------- 2016*/ 2017float32 float32_rem( float32 a, float32 b ) 2018{ 2019 flag aSign, bSign, zSign; 2020 int16 aExp, bExp, expDiff; 2021 bits32 aSig, bSig; 2022 bits32 q; 2023 bits64 aSig64, bSig64, q64; 2024 bits32 alternateASig; 2025 sbits32 sigMean; 2026 2027 aSig = extractFloat32Frac( a ); 2028 aExp = extractFloat32Exp( a ); 2029 aSign = extractFloat32Sign( a ); 2030 bSig = extractFloat32Frac( b ); 2031 bExp = extractFloat32Exp( b ); 2032 bSign = extractFloat32Sign( b ); 2033 if ( aExp == 0xFF ) { 2034 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2035 return propagateFloat32NaN( a, b ); 2036 } 2037 float_raise( float_flag_invalid ); 2038 return float32_default_nan; 2039 } 2040 if ( bExp == 0xFF ) { 2041 if ( bSig ) return propagateFloat32NaN( a, b ); 2042 return a; 2043 } 2044 if ( bExp == 0 ) { 2045 if ( bSig == 0 ) { 2046 float_raise( float_flag_invalid ); 2047 return float32_default_nan; 2048 } 2049 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2050 } 2051 if ( aExp == 0 ) { 2052 if ( aSig == 0 ) return a; 2053 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2054 } 2055 expDiff = aExp - bExp; 2056 aSig |= 0x00800000; 2057 bSig |= 0x00800000; 2058 if ( expDiff < 32 ) { 2059 aSig <<= 8; 2060 bSig <<= 8; 2061 if ( expDiff < 0 ) { 2062 if ( expDiff < -1 ) return a; 2063 aSig >>= 1; 2064 } 2065 q = ( bSig <= aSig ); 2066 if ( q ) aSig -= bSig; 2067 if ( 0 < expDiff ) { 2068 q = ( ( (bits64) aSig )<<32 ) / bSig; 2069 q >>= 32 - expDiff; 2070 bSig >>= 2; 2071 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2072 } 2073 else { 2074 aSig >>= 2; 2075 bSig >>= 2; 2076 } 2077 } 2078 else { 2079 if ( bSig <= aSig ) aSig -= bSig; 2080 aSig64 = ( (bits64) aSig )<<40; 2081 bSig64 = ( (bits64) bSig )<<40; 2082 expDiff -= 64; 2083 while ( 0 < expDiff ) { 2084 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2085 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2086 aSig64 = - ( ( bSig * q64 )<<38 ); 2087 expDiff -= 62; 2088 } 2089 expDiff += 64; 2090 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2091 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2092 q = q64>>( 64 - expDiff ); 2093 bSig <<= 6; 2094 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2095 } 2096 do { 2097 alternateASig = aSig; 2098 ++q; 2099 aSig -= bSig; 2100 } while ( 0 <= (sbits32) aSig ); 2101 sigMean = aSig + alternateASig; 2102 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2103 aSig = alternateASig; 2104 } 2105 zSign = ( (sbits32) aSig < 0 ); 2106 if ( zSign ) aSig = - aSig; 2107 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2108 2109} 2110#endif /* !SOFTFLOAT_FOR_GCC */ 2111 2112#ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2113/* 2114------------------------------------------------------------------------------- 2115Returns the square root of the single-precision floating-point value `a'. 2116The operation is performed according to the IEC/IEEE Standard for Binary 2117Floating-Point Arithmetic. 2118------------------------------------------------------------------------------- 2119*/ 2120float32 float32_sqrt( float32 a ) 2121{ 2122 flag aSign; 2123 int16 aExp, zExp; 2124 bits32 aSig, zSig; 2125 bits64 rem, term; 2126 2127 aSig = extractFloat32Frac( a ); 2128 aExp = extractFloat32Exp( a ); 2129 aSign = extractFloat32Sign( a ); 2130 if ( aExp == 0xFF ) { 2131 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2132 if ( ! aSign ) return a; 2133 float_raise( float_flag_invalid ); 2134 return float32_default_nan; 2135 } 2136 if ( aSign ) { 2137 if ( ( aExp | aSig ) == 0 ) return a; 2138 float_raise( float_flag_invalid ); 2139 return float32_default_nan; 2140 } 2141 if ( aExp == 0 ) { 2142 if ( aSig == 0 ) return 0; 2143 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2144 } 2145 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2146 aSig = ( aSig | 0x00800000 )<<8; 2147 zSig = estimateSqrt32( aExp, aSig ) + 2; 2148 if ( ( zSig & 0x7F ) <= 5 ) { 2149 if ( zSig < 2 ) { 2150 zSig = 0x7FFFFFFF; 2151 goto roundAndPack; 2152 } 2153 aSig >>= aExp & 1; 2154 term = ( (bits64) zSig ) * zSig; 2155 rem = ( ( (bits64) aSig )<<32 ) - term; 2156 while ( (sbits64) rem < 0 ) { 2157 --zSig; 2158 rem += ( ( (bits64) zSig )<<1 ) | 1; 2159 } 2160 zSig |= ( rem != 0 ); 2161 } 2162 shift32RightJamming( zSig, 1, &zSig ); 2163 roundAndPack: 2164 return roundAndPackFloat32( 0, zExp, zSig ); 2165 2166} 2167#endif /* !SOFTFLOAT_FOR_GCC */ 2168 2169/* 2170------------------------------------------------------------------------------- 2171Returns 1 if the single-precision floating-point value `a' is equal to 2172the corresponding value `b', and 0 otherwise. The comparison is performed 2173according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2174------------------------------------------------------------------------------- 2175*/ 2176flag float32_eq( float32 a, float32 b ) 2177{ 2178 2179 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2180 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2181 ) { 2182 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2183 float_raise( float_flag_invalid ); 2184 } 2185 return 0; 2186 } 2187 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2188 2189} 2190 2191/* 2192------------------------------------------------------------------------------- 2193Returns 1 if the single-precision floating-point value `a' is less than 2194or equal to the corresponding value `b', and 0 otherwise. The comparison 2195is performed according to the IEC/IEEE Standard for Binary Floating-Point 2196Arithmetic. 2197------------------------------------------------------------------------------- 2198*/ 2199flag float32_le( float32 a, float32 b ) 2200{ 2201 flag aSign, bSign; 2202 2203 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2204 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2205 ) { 2206 float_raise( float_flag_invalid ); 2207 return 0; 2208 } 2209 aSign = extractFloat32Sign( a ); 2210 bSign = extractFloat32Sign( b ); 2211 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2212 return ( a == b ) || ( aSign ^ ( a < b ) ); 2213 2214} 2215 2216/* 2217------------------------------------------------------------------------------- 2218Returns 1 if the single-precision floating-point value `a' is less than 2219the corresponding value `b', and 0 otherwise. The comparison is performed 2220according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2221------------------------------------------------------------------------------- 2222*/ 2223flag float32_lt( float32 a, float32 b ) 2224{ 2225 flag aSign, bSign; 2226 2227 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2228 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2229 ) { 2230 float_raise( float_flag_invalid ); 2231 return 0; 2232 } 2233 aSign = extractFloat32Sign( a ); 2234 bSign = extractFloat32Sign( b ); 2235 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2236 return ( a != b ) && ( aSign ^ ( a < b ) ); 2237 2238} 2239 2240#ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2241/* 2242------------------------------------------------------------------------------- 2243Returns 1 if the single-precision floating-point value `a' is equal to 2244the corresponding value `b', and 0 otherwise. The invalid exception is 2245raised if either operand is a NaN. Otherwise, the comparison is performed 2246according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2247------------------------------------------------------------------------------- 2248*/ 2249flag float32_eq_signaling( float32 a, float32 b ) 2250{ 2251 2252 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2253 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2254 ) { 2255 float_raise( float_flag_invalid ); 2256 return 0; 2257 } 2258 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2259 2260} 2261 2262/* 2263------------------------------------------------------------------------------- 2264Returns 1 if the single-precision floating-point value `a' is less than or 2265equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2266cause an exception. Otherwise, the comparison is performed according to the 2267IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2268------------------------------------------------------------------------------- 2269*/ 2270flag float32_le_quiet( float32 a, float32 b ) 2271{ 2272 flag aSign, bSign; 2273 2274 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2275 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2276 ) { 2277 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2278 float_raise( float_flag_invalid ); 2279 } 2280 return 0; 2281 } 2282 aSign = extractFloat32Sign( a ); 2283 bSign = extractFloat32Sign( b ); 2284 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2285 return ( a == b ) || ( aSign ^ ( a < b ) ); 2286 2287} 2288 2289/* 2290------------------------------------------------------------------------------- 2291Returns 1 if the single-precision floating-point value `a' is less than 2292the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2293exception. Otherwise, the comparison is performed according to the IEC/IEEE 2294Standard for Binary Floating-Point Arithmetic. 2295------------------------------------------------------------------------------- 2296*/ 2297flag float32_lt_quiet( float32 a, float32 b ) 2298{ 2299 flag aSign, bSign; 2300 2301 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2302 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2303 ) { 2304 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2305 float_raise( float_flag_invalid ); 2306 } 2307 return 0; 2308 } 2309 aSign = extractFloat32Sign( a ); 2310 bSign = extractFloat32Sign( b ); 2311 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2312 return ( a != b ) && ( aSign ^ ( a < b ) ); 2313 2314} 2315#endif /* !SOFTFLOAT_FOR_GCC */ 2316 2317#ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2318/* 2319------------------------------------------------------------------------------- 2320Returns the result of converting the double-precision floating-point value 2321`a' to the 32-bit two's complement integer format. The conversion is 2322performed according to the IEC/IEEE Standard for Binary Floating-Point 2323Arithmetic---which means in particular that the conversion is rounded 2324according to the current rounding mode. If `a' is a NaN, the largest 2325positive integer is returned. Otherwise, if the conversion overflows, the 2326largest integer with the same sign as `a' is returned. 2327------------------------------------------------------------------------------- 2328*/ 2329int32 float64_to_int32( float64 a ) 2330{ 2331 flag aSign; 2332 int16 aExp, shiftCount; 2333 bits64 aSig; 2334 2335 aSig = extractFloat64Frac( a ); 2336 aExp = extractFloat64Exp( a ); 2337 aSign = extractFloat64Sign( a ); 2338 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2339 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2340 shiftCount = 0x42C - aExp; 2341 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2342 return roundAndPackInt32( aSign, aSig ); 2343 2344} 2345#endif /* !SOFTFLOAT_FOR_GCC */ 2346 2347/* 2348------------------------------------------------------------------------------- 2349Returns the result of converting the double-precision floating-point value 2350`a' to the 32-bit two's complement integer format. The conversion is 2351performed according to the IEC/IEEE Standard for Binary Floating-Point 2352Arithmetic, except that the conversion is always rounded toward zero. 2353If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2354the conversion overflows, the largest integer with the same sign as `a' is 2355returned. 2356------------------------------------------------------------------------------- 2357*/ 2358int32 float64_to_int32_round_to_zero( float64 a ) 2359{ 2360 flag aSign; 2361 int16 aExp, shiftCount; 2362 bits64 aSig, savedASig; 2363 int32 z; 2364 2365 aSig = extractFloat64Frac( a ); 2366 aExp = extractFloat64Exp( a ); 2367 aSign = extractFloat64Sign( a ); 2368 if ( 0x41E < aExp ) { 2369 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2370 goto invalid; 2371 } 2372 else if ( aExp < 0x3FF ) { 2373 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 2374 return 0; 2375 } 2376 aSig |= LIT64( 0x0010000000000000 ); 2377 shiftCount = 0x433 - aExp; 2378 savedASig = aSig; 2379 aSig >>= shiftCount; 2380 z = aSig; 2381 if ( aSign ) z = - z; 2382 if ( ( z < 0 ) ^ aSign ) { 2383 invalid: 2384 float_raise( float_flag_invalid ); 2385 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2386 } 2387 if ( ( aSig<<shiftCount ) != savedASig ) { 2388 float_exception_flags |= float_flag_inexact; 2389 } 2390 return z; 2391 2392} 2393 2394#ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2395/* 2396------------------------------------------------------------------------------- 2397Returns the result of converting the double-precision floating-point value 2398`a' to the 64-bit two's complement integer format. The conversion is 2399performed according to the IEC/IEEE Standard for Binary Floating-Point 2400Arithmetic---which means in particular that the conversion is rounded 2401according to the current rounding mode. If `a' is a NaN, the largest 2402positive integer is returned. Otherwise, if the conversion overflows, the 2403largest integer with the same sign as `a' is returned. 2404------------------------------------------------------------------------------- 2405*/ 2406int64 float64_to_int64( float64 a ) 2407{ 2408 flag aSign; 2409 int16 aExp, shiftCount; 2410 bits64 aSig, aSigExtra; 2411 2412 aSig = extractFloat64Frac( a ); 2413 aExp = extractFloat64Exp( a ); 2414 aSign = extractFloat64Sign( a ); 2415 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2416 shiftCount = 0x433 - aExp; 2417 if ( shiftCount <= 0 ) { 2418 if ( 0x43E < aExp ) { 2419 float_raise( float_flag_invalid ); 2420 if ( ! aSign 2421 || ( ( aExp == 0x7FF ) 2422 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2423 ) { 2424 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2425 } 2426 return (sbits64) LIT64( 0x8000000000000000 ); 2427 } 2428 aSigExtra = 0; 2429 aSig <<= - shiftCount; 2430 } 2431 else { 2432 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2433 } 2434 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2435 2436} 2437 2438/* 2439------------------------------------------------------------------------------- 2440Returns the result of converting the double-precision floating-point value 2441`a' to the 64-bit two's complement integer format. The conversion is 2442performed according to the IEC/IEEE Standard for Binary Floating-Point 2443Arithmetic, except that the conversion is always rounded toward zero. 2444If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2445the conversion overflows, the largest integer with the same sign as `a' is 2446returned. 2447------------------------------------------------------------------------------- 2448*/ 2449int64 float64_to_int64_round_to_zero( float64 a ) 2450{ 2451 flag aSign; 2452 int16 aExp, shiftCount; 2453 bits64 aSig; 2454 int64 z; 2455 2456 aSig = extractFloat64Frac( a ); 2457 aExp = extractFloat64Exp( a ); 2458 aSign = extractFloat64Sign( a ); 2459 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2460 shiftCount = aExp - 0x433; 2461 if ( 0 <= shiftCount ) { 2462 if ( 0x43E <= aExp ) { 2463 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2464 float_raise( float_flag_invalid ); 2465 if ( ! aSign 2466 || ( ( aExp == 0x7FF ) 2467 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2468 ) { 2469 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2470 } 2471 } 2472 return (sbits64) LIT64( 0x8000000000000000 ); 2473 } 2474 z = aSig<<shiftCount; 2475 } 2476 else { 2477 if ( aExp < 0x3FE ) { 2478 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 2479 return 0; 2480 } 2481 z = aSig>>( - shiftCount ); 2482 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2483 float_exception_flags |= float_flag_inexact; 2484 } 2485 } 2486 if ( aSign ) z = - z; 2487 return z; 2488 2489} 2490#endif /* !SOFTFLOAT_FOR_GCC */ 2491 2492/* 2493------------------------------------------------------------------------------- 2494Returns the result of converting the double-precision floating-point value 2495`a' to the single-precision floating-point format. The conversion is 2496performed according to the IEC/IEEE Standard for Binary Floating-Point 2497Arithmetic. 2498------------------------------------------------------------------------------- 2499*/ 2500float32 float64_to_float32( float64 a ) 2501{ 2502 flag aSign; 2503 int16 aExp; 2504 bits64 aSig; 2505 bits32 zSig; 2506 2507 aSig = extractFloat64Frac( a ); 2508 aExp = extractFloat64Exp( a ); 2509 aSign = extractFloat64Sign( a ); 2510 if ( aExp == 0x7FF ) { 2511 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2512 return packFloat32( aSign, 0xFF, 0 ); 2513 } 2514 shift64RightJamming( aSig, 22, &aSig ); 2515 zSig = aSig; 2516 if ( aExp || zSig ) { 2517 zSig |= 0x40000000; 2518 aExp -= 0x381; 2519 } 2520 return roundAndPackFloat32( aSign, aExp, zSig ); 2521 2522} 2523 2524#ifdef FLOATX80 2525 2526/* 2527------------------------------------------------------------------------------- 2528Returns the result of converting the double-precision floating-point value 2529`a' to the extended double-precision floating-point format. The conversion 2530is performed according to the IEC/IEEE Standard for Binary Floating-Point 2531Arithmetic. 2532------------------------------------------------------------------------------- 2533*/ 2534floatx80 float64_to_floatx80( float64 a ) 2535{ 2536 flag aSign; 2537 int16 aExp; 2538 bits64 aSig; 2539 2540 aSig = extractFloat64Frac( a ); 2541 aExp = extractFloat64Exp( a ); 2542 aSign = extractFloat64Sign( a ); 2543 if ( aExp == 0x7FF ) { 2544 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2545 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2546 } 2547 if ( aExp == 0 ) { 2548 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2549 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2550 } 2551 return 2552 packFloatx80( 2553 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2554 2555} 2556 2557#endif 2558 2559#ifdef FLOAT128 2560 2561/* 2562------------------------------------------------------------------------------- 2563Returns the result of converting the double-precision floating-point value 2564`a' to the quadruple-precision floating-point format. The conversion is 2565performed according to the IEC/IEEE Standard for Binary Floating-Point 2566Arithmetic. 2567------------------------------------------------------------------------------- 2568*/ 2569float128 float64_to_float128( float64 a ) 2570{ 2571 flag aSign; 2572 int16 aExp; 2573 bits64 aSig, zSig0, zSig1; 2574 2575 aSig = extractFloat64Frac( a ); 2576 aExp = extractFloat64Exp( a ); 2577 aSign = extractFloat64Sign( a ); 2578 if ( aExp == 0x7FF ) { 2579 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2580 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2581 } 2582 if ( aExp == 0 ) { 2583 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2584 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2585 --aExp; 2586 } 2587 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2588 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2589 2590} 2591 2592#endif 2593 2594#ifndef SOFTFLOAT_FOR_GCC 2595/* 2596------------------------------------------------------------------------------- 2597Rounds the double-precision floating-point value `a' to an integer, and 2598returns the result as a double-precision floating-point value. The 2599operation is performed according to the IEC/IEEE Standard for Binary 2600Floating-Point Arithmetic. 2601------------------------------------------------------------------------------- 2602*/ 2603float64 float64_round_to_int( float64 a ) 2604{ 2605 flag aSign; 2606 int16 aExp; 2607 bits64 lastBitMask, roundBitsMask; 2608 int8 roundingMode; 2609 float64 z; 2610 2611 aExp = extractFloat64Exp( a ); 2612 if ( 0x433 <= aExp ) { 2613 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2614 return propagateFloat64NaN( a, a ); 2615 } 2616 return a; 2617 } 2618 if ( aExp < 0x3FF ) { 2619 if ( (bits64) ( a<<1 ) == 0 ) return a; 2620 float_exception_flags |= float_flag_inexact; 2621 aSign = extractFloat64Sign( a ); 2622 switch ( float_rounding_mode ) { 2623 case float_round_nearest_even: 2624 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2625 return packFloat64( aSign, 0x3FF, 0 ); 2626 } 2627 break; 2628 case float_round_to_zero: 2629 break; 2630 case float_round_down: 2631 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2632 case float_round_up: 2633 return 2634 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2635 } 2636 return packFloat64( aSign, 0, 0 ); 2637 } 2638 lastBitMask = 1; 2639 lastBitMask <<= 0x433 - aExp; 2640 roundBitsMask = lastBitMask - 1; 2641 z = a; 2642 roundingMode = float_rounding_mode; 2643 if ( roundingMode == float_round_nearest_even ) { 2644 z += lastBitMask>>1; 2645 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2646 } 2647 else if ( roundingMode != float_round_to_zero ) { 2648 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2649 z += roundBitsMask; 2650 } 2651 } 2652 z &= ~ roundBitsMask; 2653 if ( z != a ) float_exception_flags |= float_flag_inexact; 2654 return z; 2655 2656} 2657#endif 2658 2659/* 2660------------------------------------------------------------------------------- 2661Returns the result of adding the absolute values of the double-precision 2662floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2663before being returned. `zSign' is ignored if the result is a NaN. 2664The addition is performed according to the IEC/IEEE Standard for Binary 2665Floating-Point Arithmetic. 2666------------------------------------------------------------------------------- 2667*/ 2668static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2669{ 2670 int16 aExp, bExp, zExp; 2671 bits64 aSig, bSig, zSig; 2672 int16 expDiff; 2673 2674 aSig = extractFloat64Frac( a ); 2675 aExp = extractFloat64Exp( a ); 2676 bSig = extractFloat64Frac( b ); 2677 bExp = extractFloat64Exp( b ); 2678 expDiff = aExp - bExp; 2679 aSig <<= 9; 2680 bSig <<= 9; 2681 if ( 0 < expDiff ) { 2682 if ( aExp == 0x7FF ) { 2683 if ( aSig ) return propagateFloat64NaN( a, b ); 2684 return a; 2685 } 2686 if ( bExp == 0 ) { 2687 --expDiff; 2688 } 2689 else { 2690 bSig |= LIT64( 0x2000000000000000 ); 2691 } 2692 shift64RightJamming( bSig, expDiff, &bSig ); 2693 zExp = aExp; 2694 } 2695 else if ( expDiff < 0 ) { 2696 if ( bExp == 0x7FF ) { 2697 if ( bSig ) return propagateFloat64NaN( a, b ); 2698 return packFloat64( zSign, 0x7FF, 0 ); 2699 } 2700 if ( aExp == 0 ) { 2701 ++expDiff; 2702 } 2703 else { 2704 aSig |= LIT64( 0x2000000000000000 ); 2705 } 2706 shift64RightJamming( aSig, - expDiff, &aSig ); 2707 zExp = bExp; 2708 } 2709 else { 2710 if ( aExp == 0x7FF ) { 2711 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2712 return a; 2713 } 2714 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2715 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2716 zExp = aExp; 2717 goto roundAndPack; 2718 } 2719 aSig |= LIT64( 0x2000000000000000 ); 2720 zSig = ( aSig + bSig )<<1; 2721 --zExp; 2722 if ( (sbits64) zSig < 0 ) { 2723 zSig = aSig + bSig; 2724 ++zExp; 2725 } 2726 roundAndPack: 2727 return roundAndPackFloat64( zSign, zExp, zSig ); 2728 2729} 2730 2731/* 2732------------------------------------------------------------------------------- 2733Returns the result of subtracting the absolute values of the double- 2734precision floating-point values `a' and `b'. If `zSign' is 1, the 2735difference is negated before being returned. `zSign' is ignored if the 2736result is a NaN. The subtraction is performed according to the IEC/IEEE 2737Standard for Binary Floating-Point Arithmetic. 2738------------------------------------------------------------------------------- 2739*/ 2740static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2741{ 2742 int16 aExp, bExp, zExp; 2743 bits64 aSig, bSig, zSig; 2744 int16 expDiff; 2745 2746 aSig = extractFloat64Frac( a ); 2747 aExp = extractFloat64Exp( a ); 2748 bSig = extractFloat64Frac( b ); 2749 bExp = extractFloat64Exp( b ); 2750 expDiff = aExp - bExp; 2751 aSig <<= 10; 2752 bSig <<= 10; 2753 if ( 0 < expDiff ) goto aExpBigger; 2754 if ( expDiff < 0 ) goto bExpBigger; 2755 if ( aExp == 0x7FF ) { 2756 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2757 float_raise( float_flag_invalid ); 2758 return float64_default_nan; 2759 } 2760 if ( aExp == 0 ) { 2761 aExp = 1; 2762 bExp = 1; 2763 } 2764 if ( bSig < aSig ) goto aBigger; 2765 if ( aSig < bSig ) goto bBigger; 2766 return packFloat64( float_rounding_mode == float_round_down, 0, 0 ); 2767 bExpBigger: 2768 if ( bExp == 0x7FF ) { 2769 if ( bSig ) return propagateFloat64NaN( a, b ); 2770 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2771 } 2772 if ( aExp == 0 ) { 2773 ++expDiff; 2774 } 2775 else { 2776 aSig |= LIT64( 0x4000000000000000 ); 2777 } 2778 shift64RightJamming( aSig, - expDiff, &aSig ); 2779 bSig |= LIT64( 0x4000000000000000 ); 2780 bBigger: 2781 zSig = bSig - aSig; 2782 zExp = bExp; 2783 zSign ^= 1; 2784 goto normalizeRoundAndPack; 2785 aExpBigger: 2786 if ( aExp == 0x7FF ) { 2787 if ( aSig ) return propagateFloat64NaN( a, b ); 2788 return a; 2789 } 2790 if ( bExp == 0 ) { 2791 --expDiff; 2792 } 2793 else { 2794 bSig |= LIT64( 0x4000000000000000 ); 2795 } 2796 shift64RightJamming( bSig, expDiff, &bSig ); 2797 aSig |= LIT64( 0x4000000000000000 ); 2798 aBigger: 2799 zSig = aSig - bSig; 2800 zExp = aExp; 2801 normalizeRoundAndPack: 2802 --zExp; 2803 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2804 2805} 2806 2807/* 2808------------------------------------------------------------------------------- 2809Returns the result of adding the double-precision floating-point values `a' 2810and `b'. The operation is performed according to the IEC/IEEE Standard for 2811Binary Floating-Point Arithmetic. 2812------------------------------------------------------------------------------- 2813*/ 2814float64 float64_add( float64 a, float64 b ) 2815{ 2816 flag aSign, bSign; 2817 2818 aSign = extractFloat64Sign( a ); 2819 bSign = extractFloat64Sign( b ); 2820 if ( aSign == bSign ) { 2821 return addFloat64Sigs( a, b, aSign ); 2822 } 2823 else { 2824 return subFloat64Sigs( a, b, aSign ); 2825 } 2826 2827} 2828 2829/* 2830------------------------------------------------------------------------------- 2831Returns the result of subtracting the double-precision floating-point values 2832`a' and `b'. The operation is performed according to the IEC/IEEE Standard 2833for Binary Floating-Point Arithmetic. 2834------------------------------------------------------------------------------- 2835*/ 2836float64 float64_sub( float64 a, float64 b ) 2837{ 2838 flag aSign, bSign; 2839 2840 aSign = extractFloat64Sign( a ); 2841 bSign = extractFloat64Sign( b ); 2842 if ( aSign == bSign ) { 2843 return subFloat64Sigs( a, b, aSign ); 2844 } 2845 else { 2846 return addFloat64Sigs( a, b, aSign ); 2847 } 2848 2849} 2850 2851/* 2852------------------------------------------------------------------------------- 2853Returns the result of multiplying the double-precision floating-point values 2854`a' and `b'. The operation is performed according to the IEC/IEEE Standard 2855for Binary Floating-Point Arithmetic. 2856------------------------------------------------------------------------------- 2857*/ 2858float64 float64_mul( float64 a, float64 b ) 2859{ 2860 flag aSign, bSign, zSign; 2861 int16 aExp, bExp, zExp; 2862 bits64 aSig, bSig, zSig0, zSig1; 2863 2864 aSig = extractFloat64Frac( a ); 2865 aExp = extractFloat64Exp( a ); 2866 aSign = extractFloat64Sign( a ); 2867 bSig = extractFloat64Frac( b ); 2868 bExp = extractFloat64Exp( b ); 2869 bSign = extractFloat64Sign( b ); 2870 zSign = aSign ^ bSign; 2871 if ( aExp == 0x7FF ) { 2872 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2873 return propagateFloat64NaN( a, b ); 2874 } 2875 if ( ( bExp | bSig ) == 0 ) { 2876 float_raise( float_flag_invalid ); 2877 return float64_default_nan; 2878 } 2879 return packFloat64( zSign, 0x7FF, 0 ); 2880 } 2881 if ( bExp == 0x7FF ) { 2882 if ( bSig ) return propagateFloat64NaN( a, b ); 2883 if ( ( aExp | aSig ) == 0 ) { 2884 float_raise( float_flag_invalid ); 2885 return float64_default_nan; 2886 } 2887 return packFloat64( zSign, 0x7FF, 0 ); 2888 } 2889 if ( aExp == 0 ) { 2890 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2891 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2892 } 2893 if ( bExp == 0 ) { 2894 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2895 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2896 } 2897 zExp = aExp + bExp - 0x3FF; 2898 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2899 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2900 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2901 zSig0 |= ( zSig1 != 0 ); 2902 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2903 zSig0 <<= 1; 2904 --zExp; 2905 } 2906 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2907 2908} 2909 2910/* 2911------------------------------------------------------------------------------- 2912Returns the result of dividing the double-precision floating-point value `a' 2913by the corresponding value `b'. The operation is performed according to 2914the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2915------------------------------------------------------------------------------- 2916*/ 2917float64 float64_div( float64 a, float64 b ) 2918{ 2919 flag aSign, bSign, zSign; 2920 int16 aExp, bExp, zExp; 2921 bits64 aSig, bSig, zSig; 2922 bits64 rem0, rem1; 2923 bits64 term0, term1; 2924 2925 aSig = extractFloat64Frac( a ); 2926 aExp = extractFloat64Exp( a ); 2927 aSign = extractFloat64Sign( a ); 2928 bSig = extractFloat64Frac( b ); 2929 bExp = extractFloat64Exp( b ); 2930 bSign = extractFloat64Sign( b ); 2931 zSign = aSign ^ bSign; 2932 if ( aExp == 0x7FF ) { 2933 if ( aSig ) return propagateFloat64NaN( a, b ); 2934 if ( bExp == 0x7FF ) { 2935 if ( bSig ) return propagateFloat64NaN( a, b ); 2936 float_raise( float_flag_invalid ); 2937 return float64_default_nan; 2938 } 2939 return packFloat64( zSign, 0x7FF, 0 ); 2940 } 2941 if ( bExp == 0x7FF ) { 2942 if ( bSig ) return propagateFloat64NaN( a, b ); 2943 return packFloat64( zSign, 0, 0 ); 2944 } 2945 if ( bExp == 0 ) { 2946 if ( bSig == 0 ) { 2947 if ( ( aExp | aSig ) == 0 ) { 2948 float_raise( float_flag_invalid ); 2949 return float64_default_nan; 2950 } 2951 float_raise( float_flag_divbyzero ); 2952 return packFloat64( zSign, 0x7FF, 0 ); 2953 } 2954 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2955 } 2956 if ( aExp == 0 ) { 2957 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2958 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2959 } 2960 zExp = aExp - bExp + 0x3FD; 2961 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2962 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2963 if ( bSig <= ( aSig + aSig ) ) { 2964 aSig >>= 1; 2965 ++zExp; 2966 } 2967 zSig = estimateDiv128To64( aSig, 0, bSig ); 2968 if ( ( zSig & 0x1FF ) <= 2 ) { 2969 mul64To128( bSig, zSig, &term0, &term1 ); 2970 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2971 while ( (sbits64) rem0 < 0 ) { 2972 --zSig; 2973 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2974 } 2975 zSig |= ( rem1 != 0 ); 2976 } 2977 return roundAndPackFloat64( zSign, zExp, zSig ); 2978 2979} 2980 2981#ifndef SOFTFLOAT_FOR_GCC 2982/* 2983------------------------------------------------------------------------------- 2984Returns the remainder of the double-precision floating-point value `a' 2985with respect to the corresponding value `b'. The operation is performed 2986according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2987------------------------------------------------------------------------------- 2988*/ 2989float64 float64_rem( float64 a, float64 b ) 2990{ 2991 flag aSign, bSign, zSign; 2992 int16 aExp, bExp, expDiff; 2993 bits64 aSig, bSig; 2994 bits64 q, alternateASig; 2995 sbits64 sigMean; 2996 2997 aSig = extractFloat64Frac( a ); 2998 aExp = extractFloat64Exp( a ); 2999 aSign = extractFloat64Sign( a ); 3000 bSig = extractFloat64Frac( b ); 3001 bExp = extractFloat64Exp( b ); 3002 bSign = extractFloat64Sign( b ); 3003 if ( aExp == 0x7FF ) { 3004 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3005 return propagateFloat64NaN( a, b ); 3006 } 3007 float_raise( float_flag_invalid ); 3008 return float64_default_nan; 3009 } 3010 if ( bExp == 0x7FF ) { 3011 if ( bSig ) return propagateFloat64NaN( a, b ); 3012 return a; 3013 } 3014 if ( bExp == 0 ) { 3015 if ( bSig == 0 ) { 3016 float_raise( float_flag_invalid ); 3017 return float64_default_nan; 3018 } 3019 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3020 } 3021 if ( aExp == 0 ) { 3022 if ( aSig == 0 ) return a; 3023 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3024 } 3025 expDiff = aExp - bExp; 3026 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3027 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3028 if ( expDiff < 0 ) { 3029 if ( expDiff < -1 ) return a; 3030 aSig >>= 1; 3031 } 3032 q = ( bSig <= aSig ); 3033 if ( q ) aSig -= bSig; 3034 expDiff -= 64; 3035 while ( 0 < expDiff ) { 3036 q = estimateDiv128To64( aSig, 0, bSig ); 3037 q = ( 2 < q ) ? q - 2 : 0; 3038 aSig = - ( ( bSig>>2 ) * q ); 3039 expDiff -= 62; 3040 } 3041 expDiff += 64; 3042 if ( 0 < expDiff ) { 3043 q = estimateDiv128To64( aSig, 0, bSig ); 3044 q = ( 2 < q ) ? q - 2 : 0; 3045 q >>= 64 - expDiff; 3046 bSig >>= 2; 3047 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3048 } 3049 else { 3050 aSig >>= 2; 3051 bSig >>= 2; 3052 } 3053 do { 3054 alternateASig = aSig; 3055 ++q; 3056 aSig -= bSig; 3057 } while ( 0 <= (sbits64) aSig ); 3058 sigMean = aSig + alternateASig; 3059 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3060 aSig = alternateASig; 3061 } 3062 zSign = ( (sbits64) aSig < 0 ); 3063 if ( zSign ) aSig = - aSig; 3064 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3065 3066} 3067 3068/* 3069------------------------------------------------------------------------------- 3070Returns the square root of the double-precision floating-point value `a'. 3071The operation is performed according to the IEC/IEEE Standard for Binary 3072Floating-Point Arithmetic. 3073------------------------------------------------------------------------------- 3074*/ 3075float64 float64_sqrt( float64 a ) 3076{ 3077 flag aSign; 3078 int16 aExp, zExp; 3079 bits64 aSig, zSig, doubleZSig; 3080 bits64 rem0, rem1, term0, term1; 3081 3082 aSig = extractFloat64Frac( a ); 3083 aExp = extractFloat64Exp( a ); 3084 aSign = extractFloat64Sign( a ); 3085 if ( aExp == 0x7FF ) { 3086 if ( aSig ) return propagateFloat64NaN( a, a ); 3087 if ( ! aSign ) return a; 3088 float_raise( float_flag_invalid ); 3089 return float64_default_nan; 3090 } 3091 if ( aSign ) { 3092 if ( ( aExp | aSig ) == 0 ) return a; 3093 float_raise( float_flag_invalid ); 3094 return float64_default_nan; 3095 } 3096 if ( aExp == 0 ) { 3097 if ( aSig == 0 ) return 0; 3098 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3099 } 3100 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3101 aSig |= LIT64( 0x0010000000000000 ); 3102 zSig = estimateSqrt32( aExp, aSig>>21 ); 3103 aSig <<= 9 - ( aExp & 1 ); 3104 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3105 if ( ( zSig & 0x1FF ) <= 5 ) { 3106 doubleZSig = zSig<<1; 3107 mul64To128( zSig, zSig, &term0, &term1 ); 3108 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3109 while ( (sbits64) rem0 < 0 ) { 3110 --zSig; 3111 doubleZSig -= 2; 3112 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3113 } 3114 zSig |= ( ( rem0 | rem1 ) != 0 ); 3115 } 3116 return roundAndPackFloat64( 0, zExp, zSig ); 3117 3118} 3119#endif 3120 3121/* 3122------------------------------------------------------------------------------- 3123Returns 1 if the double-precision floating-point value `a' is equal to the 3124corresponding value `b', and 0 otherwise. The comparison is performed 3125according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3126------------------------------------------------------------------------------- 3127*/ 3128flag float64_eq( float64 a, float64 b ) 3129{ 3130 3131 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3132 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3133 ) { 3134 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3135 float_raise( float_flag_invalid ); 3136 } 3137 return 0; 3138 } 3139 return ( a == b ) || 3140 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3141 3142} 3143 3144/* 3145------------------------------------------------------------------------------- 3146Returns 1 if the double-precision floating-point value `a' is less than or 3147equal to the corresponding value `b', and 0 otherwise. The comparison is 3148performed according to the IEC/IEEE Standard for Binary Floating-Point 3149Arithmetic. 3150------------------------------------------------------------------------------- 3151*/ 3152flag float64_le( float64 a, float64 b ) 3153{ 3154 flag aSign, bSign; 3155 3156 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3157 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3158 ) { 3159 float_raise( float_flag_invalid ); 3160 return 0; 3161 } 3162 aSign = extractFloat64Sign( a ); 3163 bSign = extractFloat64Sign( b ); 3164 if ( aSign != bSign ) 3165 return aSign || 3166 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3167 0 ); 3168 return ( a == b ) || 3169 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3170 3171} 3172 3173/* 3174------------------------------------------------------------------------------- 3175Returns 1 if the double-precision floating-point value `a' is less than 3176the corresponding value `b', and 0 otherwise. The comparison is performed 3177according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3178------------------------------------------------------------------------------- 3179*/ 3180flag float64_lt( float64 a, float64 b ) 3181{ 3182 flag aSign, bSign; 3183 3184 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3185 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3186 ) { 3187 float_raise( float_flag_invalid ); 3188 return 0; 3189 } 3190 aSign = extractFloat64Sign( a ); 3191 bSign = extractFloat64Sign( b ); 3192 if ( aSign != bSign ) 3193 return aSign && 3194 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3195 0 ); 3196 return ( a != b ) && 3197 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3198 3199} 3200 3201#ifndef SOFTFLOAT_FOR_GCC 3202/* 3203------------------------------------------------------------------------------- 3204Returns 1 if the double-precision floating-point value `a' is equal to the 3205corresponding value `b', and 0 otherwise. The invalid exception is raised 3206if either operand is a NaN. Otherwise, the comparison is performed 3207according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3208------------------------------------------------------------------------------- 3209*/ 3210flag float64_eq_signaling( float64 a, float64 b ) 3211{ 3212 3213 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3214 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3215 ) { 3216 float_raise( float_flag_invalid ); 3217 return 0; 3218 } 3219 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3220 3221} 3222 3223/* 3224------------------------------------------------------------------------------- 3225Returns 1 if the double-precision floating-point value `a' is less than or 3226equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3227cause an exception. Otherwise, the comparison is performed according to the 3228IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3229------------------------------------------------------------------------------- 3230*/ 3231flag float64_le_quiet( float64 a, float64 b ) 3232{ 3233 flag aSign, bSign; 3234 3235 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3236 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3237 ) { 3238 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3239 float_raise( float_flag_invalid ); 3240 } 3241 return 0; 3242 } 3243 aSign = extractFloat64Sign( a ); 3244 bSign = extractFloat64Sign( b ); 3245 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3246 return ( a == b ) || ( aSign ^ ( a < b ) ); 3247 3248} 3249 3250/* 3251------------------------------------------------------------------------------- 3252Returns 1 if the double-precision floating-point value `a' is less than 3253the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3254exception. Otherwise, the comparison is performed according to the IEC/IEEE 3255Standard for Binary Floating-Point Arithmetic. 3256------------------------------------------------------------------------------- 3257*/ 3258flag float64_lt_quiet( float64 a, float64 b ) 3259{ 3260 flag aSign, bSign; 3261 3262 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3263 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3264 ) { 3265 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3266 float_raise( float_flag_invalid ); 3267 } 3268 return 0; 3269 } 3270 aSign = extractFloat64Sign( a ); 3271 bSign = extractFloat64Sign( b ); 3272 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3273 return ( a != b ) && ( aSign ^ ( a < b ) ); 3274 3275} 3276#endif 3277 3278#ifdef FLOATX80 3279 3280/* 3281------------------------------------------------------------------------------- 3282Returns the result of converting the extended double-precision floating- 3283point value `a' to the 32-bit two's complement integer format. The 3284conversion is performed according to the IEC/IEEE Standard for Binary 3285Floating-Point Arithmetic---which means in particular that the conversion 3286is rounded according to the current rounding mode. If `a' is a NaN, the 3287largest positive integer is returned. Otherwise, if the conversion 3288overflows, the largest integer with the same sign as `a' is returned. 3289------------------------------------------------------------------------------- 3290*/ 3291int32 floatx80_to_int32( floatx80 a ) 3292{ 3293 flag aSign; 3294 int32 aExp, shiftCount; 3295 bits64 aSig; 3296 3297 aSig = extractFloatx80Frac( a ); 3298 aExp = extractFloatx80Exp( a ); 3299 aSign = extractFloatx80Sign( a ); 3300 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3301 shiftCount = 0x4037 - aExp; 3302 if ( shiftCount <= 0 ) shiftCount = 1; 3303 shift64RightJamming( aSig, shiftCount, &aSig ); 3304 return roundAndPackInt32( aSign, aSig ); 3305 3306} 3307 3308/* 3309------------------------------------------------------------------------------- 3310Returns the result of converting the extended double-precision floating- 3311point value `a' to the 32-bit two's complement integer format. The 3312conversion is performed according to the IEC/IEEE Standard for Binary 3313Floating-Point Arithmetic, except that the conversion is always rounded 3314toward zero. If `a' is a NaN, the largest positive integer is returned. 3315Otherwise, if the conversion overflows, the largest integer with the same 3316sign as `a' is returned. 3317------------------------------------------------------------------------------- 3318*/ 3319int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3320{ 3321 flag aSign; 3322 int32 aExp, shiftCount; 3323 bits64 aSig, savedASig; 3324 int32 z; 3325 3326 aSig = extractFloatx80Frac( a ); 3327 aExp = extractFloatx80Exp( a ); 3328 aSign = extractFloatx80Sign( a ); 3329 if ( 0x401E < aExp ) { 3330 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3331 goto invalid; 3332 } 3333 else if ( aExp < 0x3FFF ) { 3334 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 3335 return 0; 3336 } 3337 shiftCount = 0x403E - aExp; 3338 savedASig = aSig; 3339 aSig >>= shiftCount; 3340 z = aSig; 3341 if ( aSign ) z = - z; 3342 if ( ( z < 0 ) ^ aSign ) { 3343 invalid: 3344 float_raise( float_flag_invalid ); 3345 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3346 } 3347 if ( ( aSig<<shiftCount ) != savedASig ) { 3348 float_exception_flags |= float_flag_inexact; 3349 } 3350 return z; 3351 3352} 3353 3354/* 3355------------------------------------------------------------------------------- 3356Returns the result of converting the extended double-precision floating- 3357point value `a' to the 64-bit two's complement integer format. The 3358conversion is performed according to the IEC/IEEE Standard for Binary 3359Floating-Point Arithmetic---which means in particular that the conversion 3360is rounded according to the current rounding mode. If `a' is a NaN, 3361the largest positive integer is returned. Otherwise, if the conversion 3362overflows, the largest integer with the same sign as `a' is returned. 3363------------------------------------------------------------------------------- 3364*/ 3365int64 floatx80_to_int64( floatx80 a ) 3366{ 3367 flag aSign; 3368 int32 aExp, shiftCount; 3369 bits64 aSig, aSigExtra; 3370 3371 aSig = extractFloatx80Frac( a ); 3372 aExp = extractFloatx80Exp( a ); 3373 aSign = extractFloatx80Sign( a ); 3374 shiftCount = 0x403E - aExp; 3375 if ( shiftCount <= 0 ) { 3376 if ( shiftCount ) { 3377 float_raise( float_flag_invalid ); 3378 if ( ! aSign 3379 || ( ( aExp == 0x7FFF ) 3380 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3381 ) { 3382 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3383 } 3384 return (sbits64) LIT64( 0x8000000000000000 ); 3385 } 3386 aSigExtra = 0; 3387 } 3388 else { 3389 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3390 } 3391 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3392 3393} 3394 3395/* 3396------------------------------------------------------------------------------- 3397Returns the result of converting the extended double-precision floating- 3398point value `a' to the 64-bit two's complement integer format. The 3399conversion is performed according to the IEC/IEEE Standard for Binary 3400Floating-Point Arithmetic, except that the conversion is always rounded 3401toward zero. If `a' is a NaN, the largest positive integer is returned. 3402Otherwise, if the conversion overflows, the largest integer with the same 3403sign as `a' is returned. 3404------------------------------------------------------------------------------- 3405*/ 3406int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3407{ 3408 flag aSign; 3409 int32 aExp, shiftCount; 3410 bits64 aSig; 3411 int64 z; 3412 3413 aSig = extractFloatx80Frac( a ); 3414 aExp = extractFloatx80Exp( a ); 3415 aSign = extractFloatx80Sign( a ); 3416 shiftCount = aExp - 0x403E; 3417 if ( 0 <= shiftCount ) { 3418 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3419 if ( ( a.high != 0xC03E ) || aSig ) { 3420 float_raise( float_flag_invalid ); 3421 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3422 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3423 } 3424 } 3425 return (sbits64) LIT64( 0x8000000000000000 ); 3426 } 3427 else if ( aExp < 0x3FFF ) { 3428 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 3429 return 0; 3430 } 3431 z = aSig>>( - shiftCount ); 3432 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3433 float_exception_flags |= float_flag_inexact; 3434 } 3435 if ( aSign ) z = - z; 3436 return z; 3437 3438} 3439 3440/* 3441------------------------------------------------------------------------------- 3442Returns the result of converting the extended double-precision floating- 3443point value `a' to the single-precision floating-point format. The 3444conversion is performed according to the IEC/IEEE Standard for Binary 3445Floating-Point Arithmetic. 3446------------------------------------------------------------------------------- 3447*/ 3448float32 floatx80_to_float32( floatx80 a ) 3449{ 3450 flag aSign; 3451 int32 aExp; 3452 bits64 aSig; 3453 3454 aSig = extractFloatx80Frac( a ); 3455 aExp = extractFloatx80Exp( a ); 3456 aSign = extractFloatx80Sign( a ); 3457 if ( aExp == 0x7FFF ) { 3458 if ( (bits64) ( aSig<<1 ) ) { 3459 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3460 } 3461 return packFloat32( aSign, 0xFF, 0 ); 3462 } 3463 shift64RightJamming( aSig, 33, &aSig ); 3464 if ( aExp || aSig ) aExp -= 0x3F81; 3465 return roundAndPackFloat32( aSign, aExp, aSig ); 3466 3467} 3468 3469/* 3470------------------------------------------------------------------------------- 3471Returns the result of converting the extended double-precision floating- 3472point value `a' to the double-precision floating-point format. The 3473conversion is performed according to the IEC/IEEE Standard for Binary 3474Floating-Point Arithmetic. 3475------------------------------------------------------------------------------- 3476*/ 3477float64 floatx80_to_float64( floatx80 a ) 3478{ 3479 flag aSign; 3480 int32 aExp; 3481 bits64 aSig, zSig; 3482 3483 aSig = extractFloatx80Frac( a ); 3484 aExp = extractFloatx80Exp( a ); 3485 aSign = extractFloatx80Sign( a ); 3486 if ( aExp == 0x7FFF ) { 3487 if ( (bits64) ( aSig<<1 ) ) { 3488 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3489 } 3490 return packFloat64( aSign, 0x7FF, 0 ); 3491 } 3492 shift64RightJamming( aSig, 1, &zSig ); 3493 if ( aExp || aSig ) aExp -= 0x3C01; 3494 return roundAndPackFloat64( aSign, aExp, zSig ); 3495 3496} 3497 3498#ifdef FLOAT128 3499 3500/* 3501------------------------------------------------------------------------------- 3502Returns the result of converting the extended double-precision floating- 3503point value `a' to the quadruple-precision floating-point format. The 3504conversion is performed according to the IEC/IEEE Standard for Binary 3505Floating-Point Arithmetic. 3506------------------------------------------------------------------------------- 3507*/ 3508float128 floatx80_to_float128( floatx80 a ) 3509{ 3510 flag aSign; 3511 int16 aExp; 3512 bits64 aSig, zSig0, zSig1; 3513 3514 aSig = extractFloatx80Frac( a ); 3515 aExp = extractFloatx80Exp( a ); 3516 aSign = extractFloatx80Sign( a ); 3517 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3518 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3519 } 3520 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3521 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3522 3523} 3524 3525#endif 3526 3527/* 3528------------------------------------------------------------------------------- 3529Rounds the extended double-precision floating-point value `a' to an integer, 3530and returns the result as an extended quadruple-precision floating-point 3531value. The operation is performed according to the IEC/IEEE Standard for 3532Binary Floating-Point Arithmetic. 3533------------------------------------------------------------------------------- 3534*/ 3535floatx80 floatx80_round_to_int( floatx80 a ) 3536{ 3537 flag aSign; 3538 int32 aExp; 3539 bits64 lastBitMask, roundBitsMask; 3540 int8 roundingMode; 3541 floatx80 z; 3542 3543 aExp = extractFloatx80Exp( a ); 3544 if ( 0x403E <= aExp ) { 3545 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3546 return propagateFloatx80NaN( a, a ); 3547 } 3548 return a; 3549 } 3550 if ( aExp < 0x3FFF ) { 3551 if ( ( aExp == 0 ) 3552 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3553 return a; 3554 } 3555 float_exception_flags |= float_flag_inexact; 3556 aSign = extractFloatx80Sign( a ); 3557 switch ( float_rounding_mode ) { 3558 case float_round_nearest_even: 3559 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3560 ) { 3561 return 3562 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3563 } 3564 break; 3565 case float_round_to_zero: 3566 break; 3567 case float_round_down: 3568 return 3569 aSign ? 3570 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3571 : packFloatx80( 0, 0, 0 ); 3572 case float_round_up: 3573 return 3574 aSign ? packFloatx80( 1, 0, 0 ) 3575 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3576 } 3577 return packFloatx80( aSign, 0, 0 ); 3578 } 3579 lastBitMask = 1; 3580 lastBitMask <<= 0x403E - aExp; 3581 roundBitsMask = lastBitMask - 1; 3582 z = a; 3583 roundingMode = float_rounding_mode; 3584 if ( roundingMode == float_round_nearest_even ) { 3585 z.low += lastBitMask>>1; 3586 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3587 } 3588 else if ( roundingMode != float_round_to_zero ) { 3589 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3590 z.low += roundBitsMask; 3591 } 3592 } 3593 z.low &= ~ roundBitsMask; 3594 if ( z.low == 0 ) { 3595 ++z.high; 3596 z.low = LIT64( 0x8000000000000000 ); 3597 } 3598 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact; 3599 return z; 3600 3601} 3602 3603/* 3604------------------------------------------------------------------------------- 3605Returns the result of adding the absolute values of the extended double- 3606precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3607negated before being returned. `zSign' is ignored if the result is a NaN. 3608The addition is performed according to the IEC/IEEE Standard for Binary 3609Floating-Point Arithmetic. 3610------------------------------------------------------------------------------- 3611*/ 3612static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3613{ 3614 int32 aExp, bExp, zExp; 3615 bits64 aSig, bSig, zSig0, zSig1; 3616 int32 expDiff; 3617 3618 aSig = extractFloatx80Frac( a ); 3619 aExp = extractFloatx80Exp( a ); 3620 bSig = extractFloatx80Frac( b ); 3621 bExp = extractFloatx80Exp( b ); 3622 expDiff = aExp - bExp; 3623 if ( 0 < expDiff ) { 3624 if ( aExp == 0x7FFF ) { 3625 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3626 return a; 3627 } 3628 if ( bExp == 0 ) --expDiff; 3629 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3630 zExp = aExp; 3631 } 3632 else if ( expDiff < 0 ) { 3633 if ( bExp == 0x7FFF ) { 3634 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3635 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3636 } 3637 if ( aExp == 0 ) ++expDiff; 3638 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3639 zExp = bExp; 3640 } 3641 else { 3642 if ( aExp == 0x7FFF ) { 3643 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3644 return propagateFloatx80NaN( a, b ); 3645 } 3646 return a; 3647 } 3648 zSig1 = 0; 3649 zSig0 = aSig + bSig; 3650 if ( aExp == 0 ) { 3651 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3652 goto roundAndPack; 3653 } 3654 zExp = aExp; 3655 goto shiftRight1; 3656 } 3657 zSig0 = aSig + bSig; 3658 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3659 shiftRight1: 3660 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3661 zSig0 |= LIT64( 0x8000000000000000 ); 3662 ++zExp; 3663 roundAndPack: 3664 return 3665 roundAndPackFloatx80( 3666 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3667 3668} 3669 3670/* 3671------------------------------------------------------------------------------- 3672Returns the result of subtracting the absolute values of the extended 3673double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3674difference is negated before being returned. `zSign' is ignored if the 3675result is a NaN. The subtraction is performed according to the IEC/IEEE 3676Standard for Binary Floating-Point Arithmetic. 3677------------------------------------------------------------------------------- 3678*/ 3679static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3680{ 3681 int32 aExp, bExp, zExp; 3682 bits64 aSig, bSig, zSig0, zSig1; 3683 int32 expDiff; 3684 floatx80 z; 3685 3686 aSig = extractFloatx80Frac( a ); 3687 aExp = extractFloatx80Exp( a ); 3688 bSig = extractFloatx80Frac( b ); 3689 bExp = extractFloatx80Exp( b ); 3690 expDiff = aExp - bExp; 3691 if ( 0 < expDiff ) goto aExpBigger; 3692 if ( expDiff < 0 ) goto bExpBigger; 3693 if ( aExp == 0x7FFF ) { 3694 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3695 return propagateFloatx80NaN( a, b ); 3696 } 3697 float_raise( float_flag_invalid ); 3698 z.low = floatx80_default_nan_low; 3699 z.high = floatx80_default_nan_high; 3700 return z; 3701 } 3702 if ( aExp == 0 ) { 3703 aExp = 1; 3704 bExp = 1; 3705 } 3706 zSig1 = 0; 3707 if ( bSig < aSig ) goto aBigger; 3708 if ( aSig < bSig ) goto bBigger; 3709 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 ); 3710 bExpBigger: 3711 if ( bExp == 0x7FFF ) { 3712 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3713 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3714 } 3715 if ( aExp == 0 ) ++expDiff; 3716 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3717 bBigger: 3718 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3719 zExp = bExp; 3720 zSign ^= 1; 3721 goto normalizeRoundAndPack; 3722 aExpBigger: 3723 if ( aExp == 0x7FFF ) { 3724 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3725 return a; 3726 } 3727 if ( bExp == 0 ) --expDiff; 3728 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3729 aBigger: 3730 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3731 zExp = aExp; 3732 normalizeRoundAndPack: 3733 return 3734 normalizeRoundAndPackFloatx80( 3735 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3736 3737} 3738 3739/* 3740------------------------------------------------------------------------------- 3741Returns the result of adding the extended double-precision floating-point 3742values `a' and `b'. The operation is performed according to the IEC/IEEE 3743Standard for Binary Floating-Point Arithmetic. 3744------------------------------------------------------------------------------- 3745*/ 3746floatx80 floatx80_add( floatx80 a, floatx80 b ) 3747{ 3748 flag aSign, bSign; 3749 3750 aSign = extractFloatx80Sign( a ); 3751 bSign = extractFloatx80Sign( b ); 3752 if ( aSign == bSign ) { 3753 return addFloatx80Sigs( a, b, aSign ); 3754 } 3755 else { 3756 return subFloatx80Sigs( a, b, aSign ); 3757 } 3758 3759} 3760 3761/* 3762------------------------------------------------------------------------------- 3763Returns the result of subtracting the extended double-precision floating- 3764point values `a' and `b'. The operation is performed according to the 3765IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3766------------------------------------------------------------------------------- 3767*/ 3768floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3769{ 3770 flag aSign, bSign; 3771 3772 aSign = extractFloatx80Sign( a ); 3773 bSign = extractFloatx80Sign( b ); 3774 if ( aSign == bSign ) { 3775 return subFloatx80Sigs( a, b, aSign ); 3776 } 3777 else { 3778 return addFloatx80Sigs( a, b, aSign ); 3779 } 3780 3781} 3782 3783/* 3784------------------------------------------------------------------------------- 3785Returns the result of multiplying the extended double-precision floating- 3786point values `a' and `b'. The operation is performed according to the 3787IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3788------------------------------------------------------------------------------- 3789*/ 3790floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3791{ 3792 flag aSign, bSign, zSign; 3793 int32 aExp, bExp, zExp; 3794 bits64 aSig, bSig, zSig0, zSig1; 3795 floatx80 z; 3796 3797 aSig = extractFloatx80Frac( a ); 3798 aExp = extractFloatx80Exp( a ); 3799 aSign = extractFloatx80Sign( a ); 3800 bSig = extractFloatx80Frac( b ); 3801 bExp = extractFloatx80Exp( b ); 3802 bSign = extractFloatx80Sign( b ); 3803 zSign = aSign ^ bSign; 3804 if ( aExp == 0x7FFF ) { 3805 if ( (bits64) ( aSig<<1 ) 3806 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3807 return propagateFloatx80NaN( a, b ); 3808 } 3809 if ( ( bExp | bSig ) == 0 ) goto invalid; 3810 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3811 } 3812 if ( bExp == 0x7FFF ) { 3813 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3814 if ( ( aExp | aSig ) == 0 ) { 3815 invalid: 3816 float_raise( float_flag_invalid ); 3817 z.low = floatx80_default_nan_low; 3818 z.high = floatx80_default_nan_high; 3819 return z; 3820 } 3821 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3822 } 3823 if ( aExp == 0 ) { 3824 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3825 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3826 } 3827 if ( bExp == 0 ) { 3828 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3829 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3830 } 3831 zExp = aExp + bExp - 0x3FFE; 3832 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3833 if ( 0 < (sbits64) zSig0 ) { 3834 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3835 --zExp; 3836 } 3837 return 3838 roundAndPackFloatx80( 3839 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3840 3841} 3842 3843/* 3844------------------------------------------------------------------------------- 3845Returns the result of dividing the extended double-precision floating-point 3846value `a' by the corresponding value `b'. The operation is performed 3847according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3848------------------------------------------------------------------------------- 3849*/ 3850floatx80 floatx80_div( floatx80 a, floatx80 b ) 3851{ 3852 flag aSign, bSign, zSign; 3853 int32 aExp, bExp, zExp; 3854 bits64 aSig, bSig, zSig0, zSig1; 3855 bits64 rem0, rem1, rem2, term0, term1, term2; 3856 floatx80 z; 3857 3858 aSig = extractFloatx80Frac( a ); 3859 aExp = extractFloatx80Exp( a ); 3860 aSign = extractFloatx80Sign( a ); 3861 bSig = extractFloatx80Frac( b ); 3862 bExp = extractFloatx80Exp( b ); 3863 bSign = extractFloatx80Sign( b ); 3864 zSign = aSign ^ bSign; 3865 if ( aExp == 0x7FFF ) { 3866 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3867 if ( bExp == 0x7FFF ) { 3868 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3869 goto invalid; 3870 } 3871 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3872 } 3873 if ( bExp == 0x7FFF ) { 3874 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3875 return packFloatx80( zSign, 0, 0 ); 3876 } 3877 if ( bExp == 0 ) { 3878 if ( bSig == 0 ) { 3879 if ( ( aExp | aSig ) == 0 ) { 3880 invalid: 3881 float_raise( float_flag_invalid ); 3882 z.low = floatx80_default_nan_low; 3883 z.high = floatx80_default_nan_high; 3884 return z; 3885 } 3886 float_raise( float_flag_divbyzero ); 3887 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3888 } 3889 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3890 } 3891 if ( aExp == 0 ) { 3892 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3893 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3894 } 3895 zExp = aExp - bExp + 0x3FFE; 3896 rem1 = 0; 3897 if ( bSig <= aSig ) { 3898 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3899 ++zExp; 3900 } 3901 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3902 mul64To128( bSig, zSig0, &term0, &term1 ); 3903 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3904 while ( (sbits64) rem0 < 0 ) { 3905 --zSig0; 3906 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3907 } 3908 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3909 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3910 mul64To128( bSig, zSig1, &term1, &term2 ); 3911 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3912 while ( (sbits64) rem1 < 0 ) { 3913 --zSig1; 3914 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3915 } 3916 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3917 } 3918 return 3919 roundAndPackFloatx80( 3920 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3921 3922} 3923 3924/* 3925------------------------------------------------------------------------------- 3926Returns the remainder of the extended double-precision floating-point value 3927`a' with respect to the corresponding value `b'. The operation is performed 3928according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3929------------------------------------------------------------------------------- 3930*/ 3931floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3932{ 3933 flag aSign, bSign, zSign; 3934 int32 aExp, bExp, expDiff; 3935 bits64 aSig0, aSig1, bSig; 3936 bits64 q, term0, term1, alternateASig0, alternateASig1; 3937 floatx80 z; 3938 3939 aSig0 = extractFloatx80Frac( a ); 3940 aExp = extractFloatx80Exp( a ); 3941 aSign = extractFloatx80Sign( a ); 3942 bSig = extractFloatx80Frac( b ); 3943 bExp = extractFloatx80Exp( b ); 3944 bSign = extractFloatx80Sign( b ); 3945 if ( aExp == 0x7FFF ) { 3946 if ( (bits64) ( aSig0<<1 ) 3947 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3948 return propagateFloatx80NaN( a, b ); 3949 } 3950 goto invalid; 3951 } 3952 if ( bExp == 0x7FFF ) { 3953 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3954 return a; 3955 } 3956 if ( bExp == 0 ) { 3957 if ( bSig == 0 ) { 3958 invalid: 3959 float_raise( float_flag_invalid ); 3960 z.low = floatx80_default_nan_low; 3961 z.high = floatx80_default_nan_high; 3962 return z; 3963 } 3964 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3965 } 3966 if ( aExp == 0 ) { 3967 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3968 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3969 } 3970 bSig |= LIT64( 0x8000000000000000 ); 3971 zSign = aSign; 3972 expDiff = aExp - bExp; 3973 aSig1 = 0; 3974 if ( expDiff < 0 ) { 3975 if ( expDiff < -1 ) return a; 3976 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3977 expDiff = 0; 3978 } 3979 q = ( bSig <= aSig0 ); 3980 if ( q ) aSig0 -= bSig; 3981 expDiff -= 64; 3982 while ( 0 < expDiff ) { 3983 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3984 q = ( 2 < q ) ? q - 2 : 0; 3985 mul64To128( bSig, q, &term0, &term1 ); 3986 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3987 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3988 expDiff -= 62; 3989 } 3990 expDiff += 64; 3991 if ( 0 < expDiff ) { 3992 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3993 q = ( 2 < q ) ? q - 2 : 0; 3994 q >>= 64 - expDiff; 3995 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 3996 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3997 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 3998 while ( le128( term0, term1, aSig0, aSig1 ) ) { 3999 ++q; 4000 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4001 } 4002 } 4003 else { 4004 term1 = 0; 4005 term0 = bSig; 4006 } 4007 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 4008 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4009 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4010 && ( q & 1 ) ) 4011 ) { 4012 aSig0 = alternateASig0; 4013 aSig1 = alternateASig1; 4014 zSign = ! zSign; 4015 } 4016 return 4017 normalizeRoundAndPackFloatx80( 4018 80, zSign, bExp + expDiff, aSig0, aSig1 ); 4019 4020} 4021 4022/* 4023------------------------------------------------------------------------------- 4024Returns the square root of the extended double-precision floating-point 4025value `a'. The operation is performed according to the IEC/IEEE Standard 4026for Binary Floating-Point Arithmetic. 4027------------------------------------------------------------------------------- 4028*/ 4029floatx80 floatx80_sqrt( floatx80 a ) 4030{ 4031 flag aSign; 4032 int32 aExp, zExp; 4033 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 4034 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4035 floatx80 z; 4036 4037 aSig0 = extractFloatx80Frac( a ); 4038 aExp = extractFloatx80Exp( a ); 4039 aSign = extractFloatx80Sign( a ); 4040 if ( aExp == 0x7FFF ) { 4041 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 4042 if ( ! aSign ) return a; 4043 goto invalid; 4044 } 4045 if ( aSign ) { 4046 if ( ( aExp | aSig0 ) == 0 ) return a; 4047 invalid: 4048 float_raise( float_flag_invalid ); 4049 z.low = floatx80_default_nan_low; 4050 z.high = floatx80_default_nan_high; 4051 return z; 4052 } 4053 if ( aExp == 0 ) { 4054 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4055 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4056 } 4057 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4058 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4059 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4060 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4061 doubleZSig0 = zSig0<<1; 4062 mul64To128( zSig0, zSig0, &term0, &term1 ); 4063 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4064 while ( (sbits64) rem0 < 0 ) { 4065 --zSig0; 4066 doubleZSig0 -= 2; 4067 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4068 } 4069 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4070 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4071 if ( zSig1 == 0 ) zSig1 = 1; 4072 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4073 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4074 mul64To128( zSig1, zSig1, &term2, &term3 ); 4075 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4076 while ( (sbits64) rem1 < 0 ) { 4077 --zSig1; 4078 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4079 term3 |= 1; 4080 term2 |= doubleZSig0; 4081 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4082 } 4083 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4084 } 4085 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4086 zSig0 |= doubleZSig0; 4087 return 4088 roundAndPackFloatx80( 4089 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4090 4091} 4092 4093/* 4094------------------------------------------------------------------------------- 4095Returns 1 if the extended double-precision floating-point value `a' is 4096equal to the corresponding value `b', and 0 otherwise. The comparison is 4097performed according to the IEC/IEEE Standard for Binary Floating-Point 4098Arithmetic. 4099------------------------------------------------------------------------------- 4100*/ 4101flag floatx80_eq( floatx80 a, floatx80 b ) 4102{ 4103 4104 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4105 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4106 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4107 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4108 ) { 4109 if ( floatx80_is_signaling_nan( a ) 4110 || floatx80_is_signaling_nan( b ) ) { 4111 float_raise( float_flag_invalid ); 4112 } 4113 return 0; 4114 } 4115 return 4116 ( a.low == b.low ) 4117 && ( ( a.high == b.high ) 4118 || ( ( a.low == 0 ) 4119 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4120 ); 4121 4122} 4123 4124/* 4125------------------------------------------------------------------------------- 4126Returns 1 if the extended double-precision floating-point value `a' is 4127less than or equal to the corresponding value `b', and 0 otherwise. The 4128comparison is performed according to the IEC/IEEE Standard for Binary 4129Floating-Point Arithmetic. 4130------------------------------------------------------------------------------- 4131*/ 4132flag floatx80_le( floatx80 a, floatx80 b ) 4133{ 4134 flag aSign, bSign; 4135 4136 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4137 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4138 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4139 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4140 ) { 4141 float_raise( float_flag_invalid ); 4142 return 0; 4143 } 4144 aSign = extractFloatx80Sign( a ); 4145 bSign = extractFloatx80Sign( b ); 4146 if ( aSign != bSign ) { 4147 return 4148 aSign 4149 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4150 == 0 ); 4151 } 4152 return 4153 aSign ? le128( b.high, b.low, a.high, a.low ) 4154 : le128( a.high, a.low, b.high, b.low ); 4155 4156} 4157 4158/* 4159------------------------------------------------------------------------------- 4160Returns 1 if the extended double-precision floating-point value `a' is 4161less than the corresponding value `b', and 0 otherwise. The comparison 4162is performed according to the IEC/IEEE Standard for Binary Floating-Point 4163Arithmetic. 4164------------------------------------------------------------------------------- 4165*/ 4166flag floatx80_lt( floatx80 a, floatx80 b ) 4167{ 4168 flag aSign, bSign; 4169 4170 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4171 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4172 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4173 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4174 ) { 4175 float_raise( float_flag_invalid ); 4176 return 0; 4177 } 4178 aSign = extractFloatx80Sign( a ); 4179 bSign = extractFloatx80Sign( b ); 4180 if ( aSign != bSign ) { 4181 return 4182 aSign 4183 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4184 != 0 ); 4185 } 4186 return 4187 aSign ? lt128( b.high, b.low, a.high, a.low ) 4188 : lt128( a.high, a.low, b.high, b.low ); 4189 4190} 4191 4192/* 4193------------------------------------------------------------------------------- 4194Returns 1 if the extended double-precision floating-point value `a' is equal 4195to the corresponding value `b', and 0 otherwise. The invalid exception is 4196raised if either operand is a NaN. Otherwise, the comparison is performed 4197according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4198------------------------------------------------------------------------------- 4199*/ 4200flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4201{ 4202 4203 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4204 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4205 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4206 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4207 ) { 4208 float_raise( float_flag_invalid ); 4209 return 0; 4210 } 4211 return 4212 ( a.low == b.low ) 4213 && ( ( a.high == b.high ) 4214 || ( ( a.low == 0 ) 4215 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4216 ); 4217 4218} 4219 4220/* 4221------------------------------------------------------------------------------- 4222Returns 1 if the extended double-precision floating-point value `a' is less 4223than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4224do not cause an exception. Otherwise, the comparison is performed according 4225to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4226------------------------------------------------------------------------------- 4227*/ 4228flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4229{ 4230 flag aSign, bSign; 4231 4232 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4233 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4234 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4235 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4236 ) { 4237 if ( floatx80_is_signaling_nan( a ) 4238 || floatx80_is_signaling_nan( b ) ) { 4239 float_raise( float_flag_invalid ); 4240 } 4241 return 0; 4242 } 4243 aSign = extractFloatx80Sign( a ); 4244 bSign = extractFloatx80Sign( b ); 4245 if ( aSign != bSign ) { 4246 return 4247 aSign 4248 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4249 == 0 ); 4250 } 4251 return 4252 aSign ? le128( b.high, b.low, a.high, a.low ) 4253 : le128( a.high, a.low, b.high, b.low ); 4254 4255} 4256 4257/* 4258------------------------------------------------------------------------------- 4259Returns 1 if the extended double-precision floating-point value `a' is less 4260than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4261an exception. Otherwise, the comparison is performed according to the 4262IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4263------------------------------------------------------------------------------- 4264*/ 4265flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4266{ 4267 flag aSign, bSign; 4268 4269 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4270 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4271 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4272 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4273 ) { 4274 if ( floatx80_is_signaling_nan( a ) 4275 || floatx80_is_signaling_nan( b ) ) { 4276 float_raise( float_flag_invalid ); 4277 } 4278 return 0; 4279 } 4280 aSign = extractFloatx80Sign( a ); 4281 bSign = extractFloatx80Sign( b ); 4282 if ( aSign != bSign ) { 4283 return 4284 aSign 4285 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4286 != 0 ); 4287 } 4288 return 4289 aSign ? lt128( b.high, b.low, a.high, a.low ) 4290 : lt128( a.high, a.low, b.high, b.low ); 4291 4292} 4293 4294#endif 4295 4296#ifdef FLOAT128 4297 4298/* 4299------------------------------------------------------------------------------- 4300Returns the result of converting the quadruple-precision floating-point 4301value `a' to the 32-bit two's complement integer format. The conversion 4302is performed according to the IEC/IEEE Standard for Binary Floating-Point 4303Arithmetic---which means in particular that the conversion is rounded 4304according to the current rounding mode. If `a' is a NaN, the largest 4305positive integer is returned. Otherwise, if the conversion overflows, the 4306largest integer with the same sign as `a' is returned. 4307------------------------------------------------------------------------------- 4308*/ 4309int32 float128_to_int32( float128 a ) 4310{ 4311 flag aSign; 4312 int32 aExp, shiftCount; 4313 bits64 aSig0, aSig1; 4314 4315 aSig1 = extractFloat128Frac1( a ); 4316 aSig0 = extractFloat128Frac0( a ); 4317 aExp = extractFloat128Exp( a ); 4318 aSign = extractFloat128Sign( a ); 4319 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4320 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4321 aSig0 |= ( aSig1 != 0 ); 4322 shiftCount = 0x4028 - aExp; 4323 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4324 return roundAndPackInt32( aSign, aSig0 ); 4325 4326} 4327 4328/* 4329------------------------------------------------------------------------------- 4330Returns the result of converting the quadruple-precision floating-point 4331value `a' to the 32-bit two's complement integer format. The conversion 4332is performed according to the IEC/IEEE Standard for Binary Floating-Point 4333Arithmetic, except that the conversion is always rounded toward zero. If 4334`a' is a NaN, the largest positive integer is returned. Otherwise, if the 4335conversion overflows, the largest integer with the same sign as `a' is 4336returned. 4337------------------------------------------------------------------------------- 4338*/ 4339int32 float128_to_int32_round_to_zero( float128 a ) 4340{ 4341 flag aSign; 4342 int32 aExp, shiftCount; 4343 bits64 aSig0, aSig1, savedASig; 4344 int32 z; 4345 4346 aSig1 = extractFloat128Frac1( a ); 4347 aSig0 = extractFloat128Frac0( a ); 4348 aExp = extractFloat128Exp( a ); 4349 aSign = extractFloat128Sign( a ); 4350 aSig0 |= ( aSig1 != 0 ); 4351 if ( 0x401E < aExp ) { 4352 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4353 goto invalid; 4354 } 4355 else if ( aExp < 0x3FFF ) { 4356 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact; 4357 return 0; 4358 } 4359 aSig0 |= LIT64( 0x0001000000000000 ); 4360 shiftCount = 0x402F - aExp; 4361 savedASig = aSig0; 4362 aSig0 >>= shiftCount; 4363 z = aSig0; 4364 if ( aSign ) z = - z; 4365 if ( ( z < 0 ) ^ aSign ) { 4366 invalid: 4367 float_raise( float_flag_invalid ); 4368 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4369 } 4370 if ( ( aSig0<<shiftCount ) != savedASig ) { 4371 float_exception_flags |= float_flag_inexact; 4372 } 4373 return z; 4374 4375} 4376 4377/* 4378------------------------------------------------------------------------------- 4379Returns the result of converting the quadruple-precision floating-point 4380value `a' to the 64-bit two's complement integer format. The conversion 4381is performed according to the IEC/IEEE Standard for Binary Floating-Point 4382Arithmetic---which means in particular that the conversion is rounded 4383according to the current rounding mode. If `a' is a NaN, the largest 4384positive integer is returned. Otherwise, if the conversion overflows, the 4385largest integer with the same sign as `a' is returned. 4386------------------------------------------------------------------------------- 4387*/ 4388int64 float128_to_int64( float128 a ) 4389{ 4390 flag aSign; 4391 int32 aExp, shiftCount; 4392 bits64 aSig0, aSig1; 4393 4394 aSig1 = extractFloat128Frac1( a ); 4395 aSig0 = extractFloat128Frac0( a ); 4396 aExp = extractFloat128Exp( a ); 4397 aSign = extractFloat128Sign( a ); 4398 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4399 shiftCount = 0x402F - aExp; 4400 if ( shiftCount <= 0 ) { 4401 if ( 0x403E < aExp ) { 4402 float_raise( float_flag_invalid ); 4403 if ( ! aSign 4404 || ( ( aExp == 0x7FFF ) 4405 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4406 ) 4407 ) { 4408 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4409 } 4410 return (sbits64) LIT64( 0x8000000000000000 ); 4411 } 4412 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4413 } 4414 else { 4415 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4416 } 4417 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4418 4419} 4420 4421/* 4422------------------------------------------------------------------------------- 4423Returns the result of converting the quadruple-precision floating-point 4424value `a' to the 64-bit two's complement integer format. The conversion 4425is performed according to the IEC/IEEE Standard for Binary Floating-Point 4426Arithmetic, except that the conversion is always rounded toward zero. 4427If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4428the conversion overflows, the largest integer with the same sign as `a' is 4429returned. 4430------------------------------------------------------------------------------- 4431*/ 4432int64 float128_to_int64_round_to_zero( float128 a ) 4433{ 4434 flag aSign; 4435 int32 aExp, shiftCount; 4436 bits64 aSig0, aSig1; 4437 int64 z; 4438 4439 aSig1 = extractFloat128Frac1( a ); 4440 aSig0 = extractFloat128Frac0( a ); 4441 aExp = extractFloat128Exp( a ); 4442 aSign = extractFloat128Sign( a ); 4443 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4444 shiftCount = aExp - 0x402F; 4445 if ( 0 < shiftCount ) { 4446 if ( 0x403E <= aExp ) { 4447 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4448 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4449 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4450 if ( aSig1 ) float_exception_flags |= float_flag_inexact; 4451 } 4452 else { 4453 float_raise( float_flag_invalid ); 4454 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4455 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4456 } 4457 } 4458 return (sbits64) LIT64( 0x8000000000000000 ); 4459 } 4460 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4461 if ( (bits64) ( aSig1<<shiftCount ) ) { 4462 float_exception_flags |= float_flag_inexact; 4463 } 4464 } 4465 else { 4466 if ( aExp < 0x3FFF ) { 4467 if ( aExp | aSig0 | aSig1 ) { 4468 float_exception_flags |= float_flag_inexact; 4469 } 4470 return 0; 4471 } 4472 z = aSig0>>( - shiftCount ); 4473 if ( aSig1 4474 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4475 float_exception_flags |= float_flag_inexact; 4476 } 4477 } 4478 if ( aSign ) z = - z; 4479 return z; 4480 4481} 4482 4483#if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \ 4484 && defined(SOFTFLOAT_NEED_FIXUNS) 4485/* 4486 * just like above - but do not care for overflow of signed results 4487 */ 4488uint64 float128_to_uint64_round_to_zero( float128 a ) 4489{ 4490 flag aSign; 4491 int32 aExp, shiftCount; 4492 bits64 aSig0, aSig1; 4493 uint64 z; 4494 4495 aSig1 = extractFloat128Frac1( a ); 4496 aSig0 = extractFloat128Frac0( a ); 4497 aExp = extractFloat128Exp( a ); 4498 aSign = extractFloat128Sign( a ); 4499 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4500 shiftCount = aExp - 0x402F; 4501 if ( 0 < shiftCount ) { 4502 if ( 0x403F <= aExp ) { 4503 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4504 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4505 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4506 if ( aSig1 ) float_exception_flags |= float_flag_inexact; 4507 } 4508 else { 4509 float_raise( float_flag_invalid ); 4510 } 4511 return LIT64( 0xFFFFFFFFFFFFFFFF ); 4512 } 4513 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4514 if ( (bits64) ( aSig1<<shiftCount ) ) { 4515 float_exception_flags |= float_flag_inexact; 4516 } 4517 } 4518 else { 4519 if ( aExp < 0x3FFF ) { 4520 if ( aExp | aSig0 | aSig1 ) { 4521 float_exception_flags |= float_flag_inexact; 4522 } 4523 return 0; 4524 } 4525 z = aSig0>>( - shiftCount ); 4526 if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4527 float_exception_flags |= float_flag_inexact; 4528 } 4529 } 4530 if ( aSign ) z = - z; 4531 return z; 4532 4533} 4534#endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */ 4535 4536/* 4537------------------------------------------------------------------------------- 4538Returns the result of converting the quadruple-precision floating-point 4539value `a' to the single-precision floating-point format. The conversion 4540is performed according to the IEC/IEEE Standard for Binary Floating-Point 4541Arithmetic. 4542------------------------------------------------------------------------------- 4543*/ 4544float32 float128_to_float32( float128 a ) 4545{ 4546 flag aSign; 4547 int32 aExp; 4548 bits64 aSig0, aSig1; 4549 bits32 zSig; 4550 4551 aSig1 = extractFloat128Frac1( a ); 4552 aSig0 = extractFloat128Frac0( a ); 4553 aExp = extractFloat128Exp( a ); 4554 aSign = extractFloat128Sign( a ); 4555 if ( aExp == 0x7FFF ) { 4556 if ( aSig0 | aSig1 ) { 4557 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4558 } 4559 return packFloat32( aSign, 0xFF, 0 ); 4560 } 4561 aSig0 |= ( aSig1 != 0 ); 4562 shift64RightJamming( aSig0, 18, &aSig0 ); 4563 zSig = aSig0; 4564 if ( aExp || zSig ) { 4565 zSig |= 0x40000000; 4566 aExp -= 0x3F81; 4567 } 4568 return roundAndPackFloat32( aSign, aExp, zSig ); 4569 4570} 4571 4572/* 4573------------------------------------------------------------------------------- 4574Returns the result of converting the quadruple-precision floating-point 4575value `a' to the double-precision floating-point format. The conversion 4576is performed according to the IEC/IEEE Standard for Binary Floating-Point 4577Arithmetic. 4578------------------------------------------------------------------------------- 4579*/ 4580float64 float128_to_float64( float128 a ) 4581{ 4582 flag aSign; 4583 int32 aExp; 4584 bits64 aSig0, aSig1; 4585 4586 aSig1 = extractFloat128Frac1( a ); 4587 aSig0 = extractFloat128Frac0( a ); 4588 aExp = extractFloat128Exp( a ); 4589 aSign = extractFloat128Sign( a ); 4590 if ( aExp == 0x7FFF ) { 4591 if ( aSig0 | aSig1 ) { 4592 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4593 } 4594 return packFloat64( aSign, 0x7FF, 0 ); 4595 } 4596 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4597 aSig0 |= ( aSig1 != 0 ); 4598 if ( aExp || aSig0 ) { 4599 aSig0 |= LIT64( 0x4000000000000000 ); 4600 aExp -= 0x3C01; 4601 } 4602 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4603 4604} 4605 4606#ifdef FLOATX80 4607 4608/* 4609------------------------------------------------------------------------------- 4610Returns the result of converting the quadruple-precision floating-point 4611value `a' to the extended double-precision floating-point format. The 4612conversion is performed according to the IEC/IEEE Standard for Binary 4613Floating-Point Arithmetic. 4614------------------------------------------------------------------------------- 4615*/ 4616floatx80 float128_to_floatx80( float128 a ) 4617{ 4618 flag aSign; 4619 int32 aExp; 4620 bits64 aSig0, aSig1; 4621 4622 aSig1 = extractFloat128Frac1( a ); 4623 aSig0 = extractFloat128Frac0( a ); 4624 aExp = extractFloat128Exp( a ); 4625 aSign = extractFloat128Sign( a ); 4626 if ( aExp == 0x7FFF ) { 4627 if ( aSig0 | aSig1 ) { 4628 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4629 } 4630 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4631 } 4632 if ( aExp == 0 ) { 4633 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4634 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4635 } 4636 else { 4637 aSig0 |= LIT64( 0x0001000000000000 ); 4638 } 4639 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4640 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4641 4642} 4643 4644#endif 4645 4646/* 4647------------------------------------------------------------------------------- 4648Rounds the quadruple-precision floating-point value `a' to an integer, and 4649returns the result as a quadruple-precision floating-point value. The 4650operation is performed according to the IEC/IEEE Standard for Binary 4651Floating-Point Arithmetic. 4652------------------------------------------------------------------------------- 4653*/ 4654float128 float128_round_to_int( float128 a ) 4655{ 4656 flag aSign; 4657 int32 aExp; 4658 bits64 lastBitMask, roundBitsMask; 4659 int8 roundingMode; 4660 float128 z; 4661 4662 aExp = extractFloat128Exp( a ); 4663 if ( 0x402F <= aExp ) { 4664 if ( 0x406F <= aExp ) { 4665 if ( ( aExp == 0x7FFF ) 4666 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4667 ) { 4668 return propagateFloat128NaN( a, a ); 4669 } 4670 return a; 4671 } 4672 lastBitMask = 1; 4673 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4674 roundBitsMask = lastBitMask - 1; 4675 z = a; 4676 roundingMode = float_rounding_mode; 4677 if ( roundingMode == float_round_nearest_even ) { 4678 if ( lastBitMask ) { 4679 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4680 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4681 } 4682 else { 4683 if ( (sbits64) z.low < 0 ) { 4684 ++z.high; 4685 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4686 } 4687 } 4688 } 4689 else if ( roundingMode != float_round_to_zero ) { 4690 if ( extractFloat128Sign( z ) 4691 ^ ( roundingMode == float_round_up ) ) { 4692 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4693 } 4694 } 4695 z.low &= ~ roundBitsMask; 4696 } 4697 else { 4698 if ( aExp < 0x3FFF ) { 4699 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4700 float_exception_flags |= float_flag_inexact; 4701 aSign = extractFloat128Sign( a ); 4702 switch ( float_rounding_mode ) { 4703 case float_round_nearest_even: 4704 if ( ( aExp == 0x3FFE ) 4705 && ( extractFloat128Frac0( a ) 4706 | extractFloat128Frac1( a ) ) 4707 ) { 4708 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4709 } 4710 break; 4711 case float_round_to_zero: 4712 break; 4713 case float_round_down: 4714 return 4715 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4716 : packFloat128( 0, 0, 0, 0 ); 4717 case float_round_up: 4718 return 4719 aSign ? packFloat128( 1, 0, 0, 0 ) 4720 : packFloat128( 0, 0x3FFF, 0, 0 ); 4721 } 4722 return packFloat128( aSign, 0, 0, 0 ); 4723 } 4724 lastBitMask = 1; 4725 lastBitMask <<= 0x402F - aExp; 4726 roundBitsMask = lastBitMask - 1; 4727 z.low = 0; 4728 z.high = a.high; 4729 roundingMode = float_rounding_mode; 4730 if ( roundingMode == float_round_nearest_even ) { 4731 z.high += lastBitMask>>1; 4732 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4733 z.high &= ~ lastBitMask; 4734 } 4735 } 4736 else if ( roundingMode != float_round_to_zero ) { 4737 if ( extractFloat128Sign( z ) 4738 ^ ( roundingMode == float_round_up ) ) { 4739 z.high |= ( a.low != 0 ); 4740 z.high += roundBitsMask; 4741 } 4742 } 4743 z.high &= ~ roundBitsMask; 4744 } 4745 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4746 float_exception_flags |= float_flag_inexact; 4747 } 4748 return z; 4749 4750} 4751 4752/* 4753------------------------------------------------------------------------------- 4754Returns the result of adding the absolute values of the quadruple-precision 4755floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4756before being returned. `zSign' is ignored if the result is a NaN. 4757The addition is performed according to the IEC/IEEE Standard for Binary 4758Floating-Point Arithmetic. 4759------------------------------------------------------------------------------- 4760*/ 4761static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4762{ 4763 int32 aExp, bExp, zExp; 4764 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4765 int32 expDiff; 4766 4767 aSig1 = extractFloat128Frac1( a ); 4768 aSig0 = extractFloat128Frac0( a ); 4769 aExp = extractFloat128Exp( a ); 4770 bSig1 = extractFloat128Frac1( b ); 4771 bSig0 = extractFloat128Frac0( b ); 4772 bExp = extractFloat128Exp( b ); 4773 expDiff = aExp - bExp; 4774 if ( 0 < expDiff ) { 4775 if ( aExp == 0x7FFF ) { 4776 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4777 return a; 4778 } 4779 if ( bExp == 0 ) { 4780 --expDiff; 4781 } 4782 else { 4783 bSig0 |= LIT64( 0x0001000000000000 ); 4784 } 4785 shift128ExtraRightJamming( 4786 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4787 zExp = aExp; 4788 } 4789 else if ( expDiff < 0 ) { 4790 if ( bExp == 0x7FFF ) { 4791 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4792 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4793 } 4794 if ( aExp == 0 ) { 4795 ++expDiff; 4796 } 4797 else { 4798 aSig0 |= LIT64( 0x0001000000000000 ); 4799 } 4800 shift128ExtraRightJamming( 4801 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4802 zExp = bExp; 4803 } 4804 else { 4805 if ( aExp == 0x7FFF ) { 4806 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4807 return propagateFloat128NaN( a, b ); 4808 } 4809 return a; 4810 } 4811 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4812 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4813 zSig2 = 0; 4814 zSig0 |= LIT64( 0x0002000000000000 ); 4815 zExp = aExp; 4816 goto shiftRight1; 4817 } 4818 aSig0 |= LIT64( 0x0001000000000000 ); 4819 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4820 --zExp; 4821 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4822 ++zExp; 4823 shiftRight1: 4824 shift128ExtraRightJamming( 4825 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4826 roundAndPack: 4827 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4828 4829} 4830 4831/* 4832------------------------------------------------------------------------------- 4833Returns the result of subtracting the absolute values of the quadruple- 4834precision floating-point values `a' and `b'. If `zSign' is 1, the 4835difference is negated before being returned. `zSign' is ignored if the 4836result is a NaN. The subtraction is performed according to the IEC/IEEE 4837Standard for Binary Floating-Point Arithmetic. 4838------------------------------------------------------------------------------- 4839*/ 4840static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4841{ 4842 int32 aExp, bExp, zExp; 4843 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4844 int32 expDiff; 4845 float128 z; 4846 4847 aSig1 = extractFloat128Frac1( a ); 4848 aSig0 = extractFloat128Frac0( a ); 4849 aExp = extractFloat128Exp( a ); 4850 bSig1 = extractFloat128Frac1( b ); 4851 bSig0 = extractFloat128Frac0( b ); 4852 bExp = extractFloat128Exp( b ); 4853 expDiff = aExp - bExp; 4854 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4855 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4856 if ( 0 < expDiff ) goto aExpBigger; 4857 if ( expDiff < 0 ) goto bExpBigger; 4858 if ( aExp == 0x7FFF ) { 4859 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4860 return propagateFloat128NaN( a, b ); 4861 } 4862 float_raise( float_flag_invalid ); 4863 z.low = float128_default_nan_low; 4864 z.high = float128_default_nan_high; 4865 return z; 4866 } 4867 if ( aExp == 0 ) { 4868 aExp = 1; 4869 bExp = 1; 4870 } 4871 if ( bSig0 < aSig0 ) goto aBigger; 4872 if ( aSig0 < bSig0 ) goto bBigger; 4873 if ( bSig1 < aSig1 ) goto aBigger; 4874 if ( aSig1 < bSig1 ) goto bBigger; 4875 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 ); 4876 bExpBigger: 4877 if ( bExp == 0x7FFF ) { 4878 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4879 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4880 } 4881 if ( aExp == 0 ) { 4882 ++expDiff; 4883 } 4884 else { 4885 aSig0 |= LIT64( 0x4000000000000000 ); 4886 } 4887 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4888 bSig0 |= LIT64( 0x4000000000000000 ); 4889 bBigger: 4890 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4891 zExp = bExp; 4892 zSign ^= 1; 4893 goto normalizeRoundAndPack; 4894 aExpBigger: 4895 if ( aExp == 0x7FFF ) { 4896 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4897 return a; 4898 } 4899 if ( bExp == 0 ) { 4900 --expDiff; 4901 } 4902 else { 4903 bSig0 |= LIT64( 0x4000000000000000 ); 4904 } 4905 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4906 aSig0 |= LIT64( 0x4000000000000000 ); 4907 aBigger: 4908 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4909 zExp = aExp; 4910 normalizeRoundAndPack: 4911 --zExp; 4912 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4913 4914} 4915 4916/* 4917------------------------------------------------------------------------------- 4918Returns the result of adding the quadruple-precision floating-point values 4919`a' and `b'. The operation is performed according to the IEC/IEEE Standard 4920for Binary Floating-Point Arithmetic. 4921------------------------------------------------------------------------------- 4922*/ 4923float128 float128_add( float128 a, float128 b ) 4924{ 4925 flag aSign, bSign; 4926 4927 aSign = extractFloat128Sign( a ); 4928 bSign = extractFloat128Sign( b ); 4929 if ( aSign == bSign ) { 4930 return addFloat128Sigs( a, b, aSign ); 4931 } 4932 else { 4933 return subFloat128Sigs( a, b, aSign ); 4934 } 4935 4936} 4937 4938/* 4939------------------------------------------------------------------------------- 4940Returns the result of subtracting the quadruple-precision floating-point 4941values `a' and `b'. The operation is performed according to the IEC/IEEE 4942Standard for Binary Floating-Point Arithmetic. 4943------------------------------------------------------------------------------- 4944*/ 4945float128 float128_sub( float128 a, float128 b ) 4946{ 4947 flag aSign, bSign; 4948 4949 aSign = extractFloat128Sign( a ); 4950 bSign = extractFloat128Sign( b ); 4951 if ( aSign == bSign ) { 4952 return subFloat128Sigs( a, b, aSign ); 4953 } 4954 else { 4955 return addFloat128Sigs( a, b, aSign ); 4956 } 4957 4958} 4959 4960/* 4961------------------------------------------------------------------------------- 4962Returns the result of multiplying the quadruple-precision floating-point 4963values `a' and `b'. The operation is performed according to the IEC/IEEE 4964Standard for Binary Floating-Point Arithmetic. 4965------------------------------------------------------------------------------- 4966*/ 4967float128 float128_mul( float128 a, float128 b ) 4968{ 4969 flag aSign, bSign, zSign; 4970 int32 aExp, bExp, zExp; 4971 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4972 float128 z; 4973 4974 aSig1 = extractFloat128Frac1( a ); 4975 aSig0 = extractFloat128Frac0( a ); 4976 aExp = extractFloat128Exp( a ); 4977 aSign = extractFloat128Sign( a ); 4978 bSig1 = extractFloat128Frac1( b ); 4979 bSig0 = extractFloat128Frac0( b ); 4980 bExp = extractFloat128Exp( b ); 4981 bSign = extractFloat128Sign( b ); 4982 zSign = aSign ^ bSign; 4983 if ( aExp == 0x7FFF ) { 4984 if ( ( aSig0 | aSig1 ) 4985 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4986 return propagateFloat128NaN( a, b ); 4987 } 4988 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4989 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4990 } 4991 if ( bExp == 0x7FFF ) { 4992 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4993 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4994 invalid: 4995 float_raise( float_flag_invalid ); 4996 z.low = float128_default_nan_low; 4997 z.high = float128_default_nan_high; 4998 return z; 4999 } 5000 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5001 } 5002 if ( aExp == 0 ) { 5003 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5004 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5005 } 5006 if ( bExp == 0 ) { 5007 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5008 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5009 } 5010 zExp = aExp + bExp - 0x4000; 5011 aSig0 |= LIT64( 0x0001000000000000 ); 5012 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 5013 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 5014 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5015 zSig2 |= ( zSig3 != 0 ); 5016 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 5017 shift128ExtraRightJamming( 5018 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5019 ++zExp; 5020 } 5021 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5022 5023} 5024 5025/* 5026------------------------------------------------------------------------------- 5027Returns the result of dividing the quadruple-precision floating-point value 5028`a' by the corresponding value `b'. The operation is performed according to 5029the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5030------------------------------------------------------------------------------- 5031*/ 5032float128 float128_div( float128 a, float128 b ) 5033{ 5034 flag aSign, bSign, zSign; 5035 int32 aExp, bExp, zExp; 5036 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5037 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5038 float128 z; 5039 5040 aSig1 = extractFloat128Frac1( a ); 5041 aSig0 = extractFloat128Frac0( a ); 5042 aExp = extractFloat128Exp( a ); 5043 aSign = extractFloat128Sign( a ); 5044 bSig1 = extractFloat128Frac1( b ); 5045 bSig0 = extractFloat128Frac0( b ); 5046 bExp = extractFloat128Exp( b ); 5047 bSign = extractFloat128Sign( b ); 5048 zSign = aSign ^ bSign; 5049 if ( aExp == 0x7FFF ) { 5050 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 5051 if ( bExp == 0x7FFF ) { 5052 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5053 goto invalid; 5054 } 5055 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5056 } 5057 if ( bExp == 0x7FFF ) { 5058 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5059 return packFloat128( zSign, 0, 0, 0 ); 5060 } 5061 if ( bExp == 0 ) { 5062 if ( ( bSig0 | bSig1 ) == 0 ) { 5063 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5064 invalid: 5065 float_raise( float_flag_invalid ); 5066 z.low = float128_default_nan_low; 5067 z.high = float128_default_nan_high; 5068 return z; 5069 } 5070 float_raise( float_flag_divbyzero ); 5071 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5072 } 5073 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5074 } 5075 if ( aExp == 0 ) { 5076 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5077 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5078 } 5079 zExp = aExp - bExp + 0x3FFD; 5080 shortShift128Left( 5081 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 5082 shortShift128Left( 5083 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5084 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 5085 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 5086 ++zExp; 5087 } 5088 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5089 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 5090 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 5091 while ( (sbits64) rem0 < 0 ) { 5092 --zSig0; 5093 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5094 } 5095 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5096 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5097 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5098 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5099 while ( (sbits64) rem1 < 0 ) { 5100 --zSig1; 5101 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5102 } 5103 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5104 } 5105 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5106 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5107 5108} 5109 5110/* 5111------------------------------------------------------------------------------- 5112Returns the remainder of the quadruple-precision floating-point value `a' 5113with respect to the corresponding value `b'. The operation is performed 5114according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5115------------------------------------------------------------------------------- 5116*/ 5117float128 float128_rem( float128 a, float128 b ) 5118{ 5119 flag aSign, bSign, zSign; 5120 int32 aExp, bExp, expDiff; 5121 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5122 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5123 sbits64 sigMean0; 5124 float128 z; 5125 5126 aSig1 = extractFloat128Frac1( a ); 5127 aSig0 = extractFloat128Frac0( a ); 5128 aExp = extractFloat128Exp( a ); 5129 aSign = extractFloat128Sign( a ); 5130 bSig1 = extractFloat128Frac1( b ); 5131 bSig0 = extractFloat128Frac0( b ); 5132 bExp = extractFloat128Exp( b ); 5133 bSign = extractFloat128Sign( b ); 5134 if ( aExp == 0x7FFF ) { 5135 if ( ( aSig0 | aSig1 ) 5136 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5137 return propagateFloat128NaN( a, b ); 5138 } 5139 goto invalid; 5140 } 5141 if ( bExp == 0x7FFF ) { 5142 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5143 return a; 5144 } 5145 if ( bExp == 0 ) { 5146 if ( ( bSig0 | bSig1 ) == 0 ) { 5147 invalid: 5148 float_raise( float_flag_invalid ); 5149 z.low = float128_default_nan_low; 5150 z.high = float128_default_nan_high; 5151 return z; 5152 } 5153 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5154 } 5155 if ( aExp == 0 ) { 5156 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5157 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5158 } 5159 expDiff = aExp - bExp; 5160 if ( expDiff < -1 ) return a; 5161 shortShift128Left( 5162 aSig0 | LIT64( 0x0001000000000000 ), 5163 aSig1, 5164 15 - ( expDiff < 0 ), 5165 &aSig0, 5166 &aSig1 5167 ); 5168 shortShift128Left( 5169 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5170 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5171 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5172 expDiff -= 64; 5173 while ( 0 < expDiff ) { 5174 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5175 q = ( 4 < q ) ? q - 4 : 0; 5176 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5177 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5178 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5179 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5180 expDiff -= 61; 5181 } 5182 if ( -64 < expDiff ) { 5183 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5184 q = ( 4 < q ) ? q - 4 : 0; 5185 q >>= - expDiff; 5186 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5187 expDiff += 52; 5188 if ( expDiff < 0 ) { 5189 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5190 } 5191 else { 5192 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5193 } 5194 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5195 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5196 } 5197 else { 5198 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5199 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5200 } 5201 do { 5202 alternateASig0 = aSig0; 5203 alternateASig1 = aSig1; 5204 ++q; 5205 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5206 } while ( 0 <= (sbits64) aSig0 ); 5207 add128( 5208 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 ); 5209 if ( ( sigMean0 < 0 ) 5210 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5211 aSig0 = alternateASig0; 5212 aSig1 = alternateASig1; 5213 } 5214 zSign = ( (sbits64) aSig0 < 0 ); 5215 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5216 return 5217 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5218 5219} 5220 5221/* 5222------------------------------------------------------------------------------- 5223Returns the square root of the quadruple-precision floating-point value `a'. 5224The operation is performed according to the IEC/IEEE Standard for Binary 5225Floating-Point Arithmetic. 5226------------------------------------------------------------------------------- 5227*/ 5228float128 float128_sqrt( float128 a ) 5229{ 5230 flag aSign; 5231 int32 aExp, zExp; 5232 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5233 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5234 float128 z; 5235 5236 aSig1 = extractFloat128Frac1( a ); 5237 aSig0 = extractFloat128Frac0( a ); 5238 aExp = extractFloat128Exp( a ); 5239 aSign = extractFloat128Sign( a ); 5240 if ( aExp == 0x7FFF ) { 5241 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5242 if ( ! aSign ) return a; 5243 goto invalid; 5244 } 5245 if ( aSign ) { 5246 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5247 invalid: 5248 float_raise( float_flag_invalid ); 5249 z.low = float128_default_nan_low; 5250 z.high = float128_default_nan_high; 5251 return z; 5252 } 5253 if ( aExp == 0 ) { 5254 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5255 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5256 } 5257 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5258 aSig0 |= LIT64( 0x0001000000000000 ); 5259 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5260 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5261 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5262 doubleZSig0 = zSig0<<1; 5263 mul64To128( zSig0, zSig0, &term0, &term1 ); 5264 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5265 while ( (sbits64) rem0 < 0 ) { 5266 --zSig0; 5267 doubleZSig0 -= 2; 5268 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5269 } 5270 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5271 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5272 if ( zSig1 == 0 ) zSig1 = 1; 5273 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5274 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5275 mul64To128( zSig1, zSig1, &term2, &term3 ); 5276 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5277 while ( (sbits64) rem1 < 0 ) { 5278 --zSig1; 5279 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5280 term3 |= 1; 5281 term2 |= doubleZSig0; 5282 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5283 } 5284 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5285 } 5286 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5287 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5288 5289} 5290 5291/* 5292------------------------------------------------------------------------------- 5293Returns 1 if the quadruple-precision floating-point value `a' is equal to 5294the corresponding value `b', and 0 otherwise. The comparison is performed 5295according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5296------------------------------------------------------------------------------- 5297*/ 5298flag float128_eq( float128 a, float128 b ) 5299{ 5300 5301 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5302 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5303 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5304 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5305 ) { 5306 if ( float128_is_signaling_nan( a ) 5307 || float128_is_signaling_nan( b ) ) { 5308 float_raise( float_flag_invalid ); 5309 } 5310 return 0; 5311 } 5312 return 5313 ( a.low == b.low ) 5314 && ( ( a.high == b.high ) 5315 || ( ( a.low == 0 ) 5316 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5317 ); 5318 5319} 5320 5321/* 5322------------------------------------------------------------------------------- 5323Returns 1 if the quadruple-precision floating-point value `a' is less than 5324or equal to the corresponding value `b', and 0 otherwise. The comparison 5325is performed according to the IEC/IEEE Standard for Binary Floating-Point 5326Arithmetic. 5327------------------------------------------------------------------------------- 5328*/ 5329flag float128_le( float128 a, float128 b ) 5330{ 5331 flag aSign, bSign; 5332 5333 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5334 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5335 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5336 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5337 ) { 5338 float_raise( float_flag_invalid ); 5339 return 0; 5340 } 5341 aSign = extractFloat128Sign( a ); 5342 bSign = extractFloat128Sign( b ); 5343 if ( aSign != bSign ) { 5344 return 5345 aSign 5346 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5347 == 0 ); 5348 } 5349 return 5350 aSign ? le128( b.high, b.low, a.high, a.low ) 5351 : le128( a.high, a.low, b.high, b.low ); 5352 5353} 5354 5355/* 5356------------------------------------------------------------------------------- 5357Returns 1 if the quadruple-precision floating-point value `a' is less than 5358the corresponding value `b', and 0 otherwise. The comparison is performed 5359according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5360------------------------------------------------------------------------------- 5361*/ 5362flag float128_lt( float128 a, float128 b ) 5363{ 5364 flag aSign, bSign; 5365 5366 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5367 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5368 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5369 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5370 ) { 5371 float_raise( float_flag_invalid ); 5372 return 0; 5373 } 5374 aSign = extractFloat128Sign( a ); 5375 bSign = extractFloat128Sign( b ); 5376 if ( aSign != bSign ) { 5377 return 5378 aSign 5379 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5380 != 0 ); 5381 } 5382 return 5383 aSign ? lt128( b.high, b.low, a.high, a.low ) 5384 : lt128( a.high, a.low, b.high, b.low ); 5385 5386} 5387 5388/* 5389------------------------------------------------------------------------------- 5390Returns 1 if the quadruple-precision floating-point value `a' is equal to 5391the corresponding value `b', and 0 otherwise. The invalid exception is 5392raised if either operand is a NaN. Otherwise, the comparison is performed 5393according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5394------------------------------------------------------------------------------- 5395*/ 5396flag float128_eq_signaling( float128 a, float128 b ) 5397{ 5398 5399 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5400 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5401 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5402 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5403 ) { 5404 float_raise( float_flag_invalid ); 5405 return 0; 5406 } 5407 return 5408 ( a.low == b.low ) 5409 && ( ( a.high == b.high ) 5410 || ( ( a.low == 0 ) 5411 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5412 ); 5413 5414} 5415 5416/* 5417------------------------------------------------------------------------------- 5418Returns 1 if the quadruple-precision floating-point value `a' is less than 5419or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5420cause an exception. Otherwise, the comparison is performed according to the 5421IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5422------------------------------------------------------------------------------- 5423*/ 5424flag float128_le_quiet( float128 a, float128 b ) 5425{ 5426 flag aSign, bSign; 5427 5428 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5429 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5430 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5431 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5432 ) { 5433 if ( float128_is_signaling_nan( a ) 5434 || float128_is_signaling_nan( b ) ) { 5435 float_raise( float_flag_invalid ); 5436 } 5437 return 0; 5438 } 5439 aSign = extractFloat128Sign( a ); 5440 bSign = extractFloat128Sign( b ); 5441 if ( aSign != bSign ) { 5442 return 5443 aSign 5444 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5445 == 0 ); 5446 } 5447 return 5448 aSign ? le128( b.high, b.low, a.high, a.low ) 5449 : le128( a.high, a.low, b.high, b.low ); 5450 5451} 5452 5453/* 5454------------------------------------------------------------------------------- 5455Returns 1 if the quadruple-precision floating-point value `a' is less than 5456the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5457exception. Otherwise, the comparison is performed according to the IEC/IEEE 5458Standard for Binary Floating-Point Arithmetic. 5459------------------------------------------------------------------------------- 5460*/ 5461flag float128_lt_quiet( float128 a, float128 b ) 5462{ 5463 flag aSign, bSign; 5464 5465 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5466 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5467 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5468 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5469 ) { 5470 if ( float128_is_signaling_nan( a ) 5471 || float128_is_signaling_nan( b ) ) { 5472 float_raise( float_flag_invalid ); 5473 } 5474 return 0; 5475 } 5476 aSign = extractFloat128Sign( a ); 5477 bSign = extractFloat128Sign( b ); 5478 if ( aSign != bSign ) { 5479 return 5480 aSign 5481 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5482 != 0 ); 5483 } 5484 return 5485 aSign ? lt128( b.high, b.low, a.high, a.low ) 5486 : lt128( a.high, a.low, b.high, b.low ); 5487 5488} 5489 5490#endif 5491 5492 5493#if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5494 5495/* 5496 * These two routines are not part of the original softfloat distribution. 5497 * 5498 * They are based on the corresponding conversions to integer but return 5499 * unsigned numbers instead since these functions are required by GCC. 5500 * 5501 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97 5502 * 5503 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5504 */ 5505 5506/* 5507------------------------------------------------------------------------------- 5508Returns the result of converting the double-precision floating-point value 5509`a' to the 32-bit unsigned integer format. The conversion is 5510performed according to the IEC/IEEE Standard for Binary Floating-point 5511Arithmetic, except that the conversion is always rounded toward zero. If 5512`a' is a NaN, the largest positive integer is returned. If the conversion 5513overflows, the largest integer positive is returned. 5514------------------------------------------------------------------------------- 5515*/ 5516uint32 float64_to_uint32_round_to_zero( float64 a ) 5517{ 5518 flag aSign; 5519 int16 aExp, shiftCount; 5520 bits64 aSig, savedASig; 5521 uint32 z; 5522 5523 aSig = extractFloat64Frac( a ); 5524 aExp = extractFloat64Exp( a ); 5525 aSign = extractFloat64Sign( a ); 5526 5527 if (aSign) { 5528 float_raise( float_flag_invalid ); 5529 return(0); 5530 } 5531 5532 if ( 0x41E < aExp ) { 5533 float_raise( float_flag_invalid ); 5534 return 0xffffffff; 5535 } 5536 else if ( aExp < 0x3FF ) { 5537 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 5538 return 0; 5539 } 5540 aSig |= LIT64( 0x0010000000000000 ); 5541 shiftCount = 0x433 - aExp; 5542 savedASig = aSig; 5543 aSig >>= shiftCount; 5544 z = aSig; 5545 if ( ( aSig<<shiftCount ) != savedASig ) { 5546 float_exception_flags |= float_flag_inexact; 5547 } 5548 return z; 5549 5550} 5551 5552/* 5553------------------------------------------------------------------------------- 5554Returns the result of converting the single-precision floating-point value 5555`a' to the 32-bit unsigned integer format. The conversion is 5556performed according to the IEC/IEEE Standard for Binary Floating-point 5557Arithmetic, except that the conversion is always rounded toward zero. If 5558`a' is a NaN, the largest positive integer is returned. If the conversion 5559overflows, the largest positive integer is returned. 5560------------------------------------------------------------------------------- 5561*/ 5562uint32 float32_to_uint32_round_to_zero( float32 a ) 5563{ 5564 flag aSign; 5565 int16 aExp, shiftCount; 5566 bits32 aSig; 5567 uint32 z; 5568 5569 aSig = extractFloat32Frac( a ); 5570 aExp = extractFloat32Exp( a ); 5571 aSign = extractFloat32Sign( a ); 5572 shiftCount = aExp - 0x9E; 5573 5574 if (aSign) { 5575 float_raise( float_flag_invalid ); 5576 return(0); 5577 } 5578 if ( 0 < shiftCount ) { 5579 float_raise( float_flag_invalid ); 5580 return 0xFFFFFFFF; 5581 } 5582 else if ( aExp <= 0x7E ) { 5583 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 5584 return 0; 5585 } 5586 aSig = ( aSig | 0x800000 )<<8; 5587 z = aSig>>( - shiftCount ); 5588 if ( aSig<<( shiftCount & 31 ) ) { 5589 float_exception_flags |= float_flag_inexact; 5590 } 5591 return z; 5592 5593} 5594 5595#endif 5596