1/* 2=============================================================================== 3 4This C source file is part of the SoftFloat IEC/IEEE Floating-point 5Arithmetic Package, Release 2. 6 7Written by John R. Hauser. This work was made possible in part by the 8International Computer Science Institute, located at Suite 600, 1947 Center 9Street, Berkeley, California 94704. Funding was partially provided by the 10National Science Foundation under grant MIP-9311980. The original version 11of this code was written as part of a project to build a fixed-point vector 12processor in collaboration with the University of California at Berkeley, 13overseen by Profs. Nelson Morgan and John Wawrzynek. More information 14is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 15arithmetic/softfloat.html'. 16 17THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 18has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 19TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 20PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 21AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 22 23Derivative works are acceptable, even for commercial purposes, so long as 24(1) they include prominent notice that the work is derivative, and (2) they 25include prominent notice akin to these three paragraphs for those parts of 26this code that are retained. 27 28=============================================================================== 29*/ 30 31#include "fpa11.h" 32#include "milieu.h" 33#include "softfloat.h" 34 35/* 36------------------------------------------------------------------------------- 37Floating-point rounding mode, extended double-precision rounding precision, 38and exception flags. 39------------------------------------------------------------------------------- 40*/ 41int8 float_rounding_mode = float_round_nearest_even; 42int8 floatx80_rounding_precision = 80; 43int8 float_exception_flags; 44 45/* 46------------------------------------------------------------------------------- 47Primitive arithmetic functions, including multi-word arithmetic, and 48division and square root approximations. (Can be specialized to target if 49desired.) 50------------------------------------------------------------------------------- 51*/ 52#include "softfloat-macros" 53 54/* 55------------------------------------------------------------------------------- 56Functions and definitions to determine: (1) whether tininess for underflow 57is detected before or after rounding by default, (2) what (if anything) 58happens when exceptions are raised, (3) how signaling NaNs are distinguished 59from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 60are propagated from function inputs to output. These details are target- 61specific. 62------------------------------------------------------------------------------- 63*/ 64#include "softfloat-specialize" 65 66/* 67------------------------------------------------------------------------------- 68Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 69and 7, and returns the properly rounded 32-bit integer corresponding to the 70input. If `zSign' is nonzero, the input is negated before being converted 71to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point 72input is simply rounded to an integer, with the inexact exception raised if 73the input cannot be represented exactly as an integer. If the fixed-point 74input is too large, however, the invalid exception is raised and the largest 75positive or negative integer is returned. 76------------------------------------------------------------------------------- 77*/ 78static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 79{ 80 int8 roundingMode; 81 flag roundNearestEven; 82 int8 roundIncrement, roundBits; 83 int32 z; 84 85 roundingMode = float_rounding_mode; 86 roundNearestEven = ( roundingMode == float_round_nearest_even ); 87 roundIncrement = 0x40; 88 if ( ! roundNearestEven ) { 89 if ( roundingMode == float_round_to_zero ) { 90 roundIncrement = 0; 91 } 92 else { 93 roundIncrement = 0x7F; 94 if ( zSign ) { 95 if ( roundingMode == float_round_up ) roundIncrement = 0; 96 } 97 else { 98 if ( roundingMode == float_round_down ) roundIncrement = 0; 99 } 100 } 101 } 102 roundBits = absZ & 0x7F; 103 absZ = ( absZ + roundIncrement )>>7; 104 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 105 z = absZ; 106 if ( zSign ) z = - z; 107 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 108 float_exception_flags |= float_flag_invalid; 109 return zSign ? 0x80000000 : 0x7FFFFFFF; 110 } 111 if ( roundBits ) float_exception_flags |= float_flag_inexact; 112 return z; 113 114} 115 116/* 117------------------------------------------------------------------------------- 118Returns the fraction bits of the single-precision floating-point value `a'. 119------------------------------------------------------------------------------- 120*/ 121INLINE bits32 extractFloat32Frac( float32 a ) 122{ 123 124 return a & 0x007FFFFF; 125 126} 127 128/* 129------------------------------------------------------------------------------- 130Returns the exponent bits of the single-precision floating-point value `a'. 131------------------------------------------------------------------------------- 132*/ 133INLINE int16 extractFloat32Exp( float32 a ) 134{ 135 136 return ( a>>23 ) & 0xFF; 137 138} 139 140/* 141------------------------------------------------------------------------------- 142Returns the sign bit of the single-precision floating-point value `a'. 143------------------------------------------------------------------------------- 144*/ 145INLINE flag extractFloat32Sign( float32 a ) 146{ 147 148 return a>>31; 149 150} 151 152/* 153------------------------------------------------------------------------------- 154Normalizes the subnormal single-precision floating-point value represented 155by the denormalized significand `aSig'. The normalized exponent and 156significand are stored at the locations pointed to by `zExpPtr' and 157`zSigPtr', respectively. 158------------------------------------------------------------------------------- 159*/ 160static void 161 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 162{ 163 int8 shiftCount; 164 165 shiftCount = countLeadingZeros32( aSig ) - 8; 166 *zSigPtr = aSig<<shiftCount; 167 *zExpPtr = 1 - shiftCount; 168 169} 170 171/* 172------------------------------------------------------------------------------- 173Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 174single-precision floating-point value, returning the result. After being 175shifted into the proper positions, the three fields are simply added 176together to form the result. This means that any integer portion of `zSig' 177will be added into the exponent. Since a properly normalized significand 178will have an integer portion equal to 1, the `zExp' input should be 1 less 179than the desired result exponent whenever `zSig' is a complete, normalized 180significand. 181------------------------------------------------------------------------------- 182*/ 183INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 184{ 185 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 186} 187 188/* 189------------------------------------------------------------------------------- 190Takes an abstract floating-point value having sign `zSign', exponent `zExp', 191and significand `zSig', and returns the proper single-precision floating- 192point value corresponding to the abstract input. Ordinarily, the abstract 193value is simply rounded and packed into the single-precision format, with 194the inexact exception raised if the abstract input cannot be represented 195exactly. If the abstract value is too large, however, the overflow and 196inexact exceptions are raised and an infinity or maximal finite value is 197returned. If the abstract value is too small, the input value is rounded to 198a subnormal number, and the underflow and inexact exceptions are raised if 199the abstract input cannot be represented exactly as a subnormal single- 200precision floating-point number. 201 The input significand `zSig' has its binary point between bits 30 202and 29, which is 7 bits to the left of the usual location. This shifted 203significand must be normalized or smaller. If `zSig' is not normalized, 204`zExp' must be 0; in that case, the result returned is a subnormal number, 205and it must not require rounding. In the usual case that `zSig' is 206normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 207The handling of underflow and overflow follows the IEC/IEEE Standard for 208Binary Floating-point Arithmetic. 209------------------------------------------------------------------------------- 210*/ 211static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 212{ 213 int8 roundingMode; 214 flag roundNearestEven; 215 int8 roundIncrement, roundBits; 216 flag isTiny; 217 218 roundingMode = float_rounding_mode; 219 roundNearestEven = ( roundingMode == float_round_nearest_even ); 220 roundIncrement = 0x40; 221 if ( ! roundNearestEven ) { 222 if ( roundingMode == float_round_to_zero ) { 223 roundIncrement = 0; 224 } 225 else { 226 roundIncrement = 0x7F; 227 if ( zSign ) { 228 if ( roundingMode == float_round_up ) roundIncrement = 0; 229 } 230 else { 231 if ( roundingMode == float_round_down ) roundIncrement = 0; 232 } 233 } 234 } 235 roundBits = zSig & 0x7F; 236 if ( 0xFD <= (bits16) zExp ) { 237 if ( ( 0xFD < zExp ) 238 || ( ( zExp == 0xFD ) 239 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 240 ) { 241 float_raise( float_flag_overflow | float_flag_inexact ); 242 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 243 } 244 if ( zExp < 0 ) { 245 isTiny = 246 ( float_detect_tininess == float_tininess_before_rounding ) 247 || ( zExp < -1 ) 248 || ( zSig + roundIncrement < 0x80000000 ); 249 shift32RightJamming( zSig, - zExp, &zSig ); 250 zExp = 0; 251 roundBits = zSig & 0x7F; 252 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 253 } 254 } 255 if ( roundBits ) float_exception_flags |= float_flag_inexact; 256 zSig = ( zSig + roundIncrement )>>7; 257 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 258 if ( zSig == 0 ) zExp = 0; 259 return packFloat32( zSign, zExp, zSig ); 260 261} 262 263/* 264------------------------------------------------------------------------------- 265Takes an abstract floating-point value having sign `zSign', exponent `zExp', 266and significand `zSig', and returns the proper single-precision floating- 267point value corresponding to the abstract input. This routine is just like 268`roundAndPackFloat32' except that `zSig' does not have to be normalized in 269any way. In all cases, `zExp' must be 1 less than the ``true'' floating- 270point exponent. 271------------------------------------------------------------------------------- 272*/ 273static float32 274 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 275{ 276 int8 shiftCount; 277 278 shiftCount = countLeadingZeros32( zSig ) - 1; 279 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 280 281} 282 283/* 284------------------------------------------------------------------------------- 285Returns the fraction bits of the double-precision floating-point value `a'. 286------------------------------------------------------------------------------- 287*/ 288INLINE bits64 extractFloat64Frac( float64 a ) 289{ 290 291 return a & LIT64( 0x000FFFFFFFFFFFFF ); 292 293} 294 295/* 296------------------------------------------------------------------------------- 297Returns the exponent bits of the double-precision floating-point value `a'. 298------------------------------------------------------------------------------- 299*/ 300INLINE int16 extractFloat64Exp( float64 a ) 301{ 302 303 return ( a>>52 ) & 0x7FF; 304 305} 306 307/* 308------------------------------------------------------------------------------- 309Returns the sign bit of the double-precision floating-point value `a'. 310------------------------------------------------------------------------------- 311*/ 312INLINE flag extractFloat64Sign( float64 a ) 313{ 314 315 return a>>63; 316 317} 318 319/* 320------------------------------------------------------------------------------- 321Normalizes the subnormal double-precision floating-point value represented 322by the denormalized significand `aSig'. The normalized exponent and 323significand are stored at the locations pointed to by `zExpPtr' and 324`zSigPtr', respectively. 325------------------------------------------------------------------------------- 326*/ 327static void 328 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 329{ 330 int8 shiftCount; 331 332 shiftCount = countLeadingZeros64( aSig ) - 11; 333 *zSigPtr = aSig<<shiftCount; 334 *zExpPtr = 1 - shiftCount; 335 336} 337 338/* 339------------------------------------------------------------------------------- 340Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 341double-precision floating-point value, returning the result. After being 342shifted into the proper positions, the three fields are simply added 343together to form the result. This means that any integer portion of `zSig' 344will be added into the exponent. Since a properly normalized significand 345will have an integer portion equal to 1, the `zExp' input should be 1 less 346than the desired result exponent whenever `zSig' is a complete, normalized 347significand. 348------------------------------------------------------------------------------- 349*/ 350INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 351{ 352 353 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig; 354 355} 356 357/* 358------------------------------------------------------------------------------- 359Takes an abstract floating-point value having sign `zSign', exponent `zExp', 360and significand `zSig', and returns the proper double-precision floating- 361point value corresponding to the abstract input. Ordinarily, the abstract 362value is simply rounded and packed into the double-precision format, with 363the inexact exception raised if the abstract input cannot be represented 364exactly. If the abstract value is too large, however, the overflow and 365inexact exceptions are raised and an infinity or maximal finite value is 366returned. If the abstract value is too small, the input value is rounded to 367a subnormal number, and the underflow and inexact exceptions are raised if 368the abstract input cannot be represented exactly as a subnormal double- 369precision floating-point number. 370 The input significand `zSig' has its binary point between bits 62 371and 61, which is 10 bits to the left of the usual location. This shifted 372significand must be normalized or smaller. If `zSig' is not normalized, 373`zExp' must be 0; in that case, the result returned is a subnormal number, 374and it must not require rounding. In the usual case that `zSig' is 375normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 376The handling of underflow and overflow follows the IEC/IEEE Standard for 377Binary Floating-point Arithmetic. 378------------------------------------------------------------------------------- 379*/ 380static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 381{ 382 int8 roundingMode; 383 flag roundNearestEven; 384 int16 roundIncrement, roundBits; 385 flag isTiny; 386 387 roundingMode = float_rounding_mode; 388 roundNearestEven = ( roundingMode == float_round_nearest_even ); 389 roundIncrement = 0x200; 390 if ( ! roundNearestEven ) { 391 if ( roundingMode == float_round_to_zero ) { 392 roundIncrement = 0; 393 } 394 else { 395 roundIncrement = 0x3FF; 396 if ( zSign ) { 397 if ( roundingMode == float_round_up ) roundIncrement = 0; 398 } 399 else { 400 if ( roundingMode == float_round_down ) roundIncrement = 0; 401 } 402 } 403 } 404 roundBits = zSig & 0x3FF; 405 if ( 0x7FD <= (bits16) zExp ) { 406 if ( ( 0x7FD < zExp ) 407 || ( ( zExp == 0x7FD ) 408 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 409 ) { 410 //register int lr = __builtin_return_address(0); 411 //printk("roundAndPackFloat64 called from 0x%08x\n",lr); 412 float_raise( float_flag_overflow | float_flag_inexact ); 413 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 ); 414 } 415 if ( zExp < 0 ) { 416 isTiny = 417 ( float_detect_tininess == float_tininess_before_rounding ) 418 || ( zExp < -1 ) 419 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 420 shift64RightJamming( zSig, - zExp, &zSig ); 421 zExp = 0; 422 roundBits = zSig & 0x3FF; 423 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 424 } 425 } 426 if ( roundBits ) float_exception_flags |= float_flag_inexact; 427 zSig = ( zSig + roundIncrement )>>10; 428 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 429 if ( zSig == 0 ) zExp = 0; 430 return packFloat64( zSign, zExp, zSig ); 431 432} 433 434/* 435------------------------------------------------------------------------------- 436Takes an abstract floating-point value having sign `zSign', exponent `zExp', 437and significand `zSig', and returns the proper double-precision floating- 438point value corresponding to the abstract input. This routine is just like 439`roundAndPackFloat64' except that `zSig' does not have to be normalized in 440any way. In all cases, `zExp' must be 1 less than the ``true'' floating- 441point exponent. 442------------------------------------------------------------------------------- 443*/ 444static float64 445 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 446{ 447 int8 shiftCount; 448 449 shiftCount = countLeadingZeros64( zSig ) - 1; 450 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 451 452} 453 454#ifdef FLOATX80 455 456/* 457------------------------------------------------------------------------------- 458Returns the fraction bits of the extended double-precision floating-point 459value `a'. 460------------------------------------------------------------------------------- 461*/ 462INLINE bits64 extractFloatx80Frac( floatx80 a ) 463{ 464 465 return a.low; 466 467} 468 469/* 470------------------------------------------------------------------------------- 471Returns the exponent bits of the extended double-precision floating-point 472value `a'. 473------------------------------------------------------------------------------- 474*/ 475INLINE int32 extractFloatx80Exp( floatx80 a ) 476{ 477 478 return a.high & 0x7FFF; 479 480} 481 482/* 483------------------------------------------------------------------------------- 484Returns the sign bit of the extended double-precision floating-point value 485`a'. 486------------------------------------------------------------------------------- 487*/ 488INLINE flag extractFloatx80Sign( floatx80 a ) 489{ 490 491 return a.high>>15; 492 493} 494 495/* 496------------------------------------------------------------------------------- 497Normalizes the subnormal extended double-precision floating-point value 498represented by the denormalized significand `aSig'. The normalized exponent 499and significand are stored at the locations pointed to by `zExpPtr' and 500`zSigPtr', respectively. 501------------------------------------------------------------------------------- 502*/ 503static void 504 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 505{ 506 int8 shiftCount; 507 508 shiftCount = countLeadingZeros64( aSig ); 509 *zSigPtr = aSig<<shiftCount; 510 *zExpPtr = 1 - shiftCount; 511 512} 513 514/* 515------------------------------------------------------------------------------- 516Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 517extended double-precision floating-point value, returning the result. 518------------------------------------------------------------------------------- 519*/ 520INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 521{ 522 floatx80 z; 523 524 z.low = zSig; 525 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 526 return z; 527 528} 529 530/* 531------------------------------------------------------------------------------- 532Takes an abstract floating-point value having sign `zSign', exponent `zExp', 533and extended significand formed by the concatenation of `zSig0' and `zSig1', 534and returns the proper extended double-precision floating-point value 535corresponding to the abstract input. Ordinarily, the abstract value is 536rounded and packed into the extended double-precision format, with the 537inexact exception raised if the abstract input cannot be represented 538exactly. If the abstract value is too large, however, the overflow and 539inexact exceptions are raised and an infinity or maximal finite value is 540returned. If the abstract value is too small, the input value is rounded to 541a subnormal number, and the underflow and inexact exceptions are raised if 542the abstract input cannot be represented exactly as a subnormal extended 543double-precision floating-point number. 544 If `roundingPrecision' is 32 or 64, the result is rounded to the same 545number of bits as single or double precision, respectively. Otherwise, the 546result is rounded to the full precision of the extended double-precision 547format. 548 The input significand must be normalized or smaller. If the input 549significand is not normalized, `zExp' must be 0; in that case, the result 550returned is a subnormal number, and it must not require rounding. The 551handling of underflow and overflow follows the IEC/IEEE Standard for Binary 552Floating-point Arithmetic. 553------------------------------------------------------------------------------- 554*/ 555static floatx80 556 roundAndPackFloatx80( 557 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 558 ) 559{ 560 int8 roundingMode; 561 flag roundNearestEven, increment, isTiny; 562 int64 roundIncrement, roundMask, roundBits; 563 564 roundingMode = float_rounding_mode; 565 roundNearestEven = ( roundingMode == float_round_nearest_even ); 566 if ( roundingPrecision == 80 ) goto precision80; 567 if ( roundingPrecision == 64 ) { 568 roundIncrement = LIT64( 0x0000000000000400 ); 569 roundMask = LIT64( 0x00000000000007FF ); 570 } 571 else if ( roundingPrecision == 32 ) { 572 roundIncrement = LIT64( 0x0000008000000000 ); 573 roundMask = LIT64( 0x000000FFFFFFFFFF ); 574 } 575 else { 576 goto precision80; 577 } 578 zSig0 |= ( zSig1 != 0 ); 579 if ( ! roundNearestEven ) { 580 if ( roundingMode == float_round_to_zero ) { 581 roundIncrement = 0; 582 } 583 else { 584 roundIncrement = roundMask; 585 if ( zSign ) { 586 if ( roundingMode == float_round_up ) roundIncrement = 0; 587 } 588 else { 589 if ( roundingMode == float_round_down ) roundIncrement = 0; 590 } 591 } 592 } 593 roundBits = zSig0 & roundMask; 594 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 595 if ( ( 0x7FFE < zExp ) 596 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 597 ) { 598 goto overflow; 599 } 600 if ( zExp <= 0 ) { 601 isTiny = 602 ( float_detect_tininess == float_tininess_before_rounding ) 603 || ( zExp < 0 ) 604 || ( zSig0 <= zSig0 + roundIncrement ); 605 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 606 zExp = 0; 607 roundBits = zSig0 & roundMask; 608 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 609 if ( roundBits ) float_exception_flags |= float_flag_inexact; 610 zSig0 += roundIncrement; 611 if ( (sbits64) zSig0 < 0 ) zExp = 1; 612 roundIncrement = roundMask + 1; 613 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 614 roundMask |= roundIncrement; 615 } 616 zSig0 &= ~ roundMask; 617 return packFloatx80( zSign, zExp, zSig0 ); 618 } 619 } 620 if ( roundBits ) float_exception_flags |= float_flag_inexact; 621 zSig0 += roundIncrement; 622 if ( zSig0 < roundIncrement ) { 623 ++zExp; 624 zSig0 = LIT64( 0x8000000000000000 ); 625 } 626 roundIncrement = roundMask + 1; 627 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 628 roundMask |= roundIncrement; 629 } 630 zSig0 &= ~ roundMask; 631 if ( zSig0 == 0 ) zExp = 0; 632 return packFloatx80( zSign, zExp, zSig0 ); 633 precision80: 634 increment = ( (sbits64) zSig1 < 0 ); 635 if ( ! roundNearestEven ) { 636 if ( roundingMode == float_round_to_zero ) { 637 increment = 0; 638 } 639 else { 640 if ( zSign ) { 641 increment = ( roundingMode == float_round_down ) && zSig1; 642 } 643 else { 644 increment = ( roundingMode == float_round_up ) && zSig1; 645 } 646 } 647 } 648 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 649 if ( ( 0x7FFE < zExp ) 650 || ( ( zExp == 0x7FFE ) 651 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 652 && increment 653 ) 654 ) { 655 roundMask = 0; 656 overflow: 657 float_raise( float_flag_overflow | float_flag_inexact ); 658 if ( ( roundingMode == float_round_to_zero ) 659 || ( zSign && ( roundingMode == float_round_up ) ) 660 || ( ! zSign && ( roundingMode == float_round_down ) ) 661 ) { 662 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 663 } 664 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 665 } 666 if ( zExp <= 0 ) { 667 isTiny = 668 ( float_detect_tininess == float_tininess_before_rounding ) 669 || ( zExp < 0 ) 670 || ! increment 671 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 672 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 673 zExp = 0; 674 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 675 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 676 if ( roundNearestEven ) { 677 increment = ( (sbits64) zSig1 < 0 ); 678 } 679 else { 680 if ( zSign ) { 681 increment = ( roundingMode == float_round_down ) && zSig1; 682 } 683 else { 684 increment = ( roundingMode == float_round_up ) && zSig1; 685 } 686 } 687 if ( increment ) { 688 ++zSig0; 689 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven ); 690 if ( (sbits64) zSig0 < 0 ) zExp = 1; 691 } 692 return packFloatx80( zSign, zExp, zSig0 ); 693 } 694 } 695 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 696 if ( increment ) { 697 ++zSig0; 698 if ( zSig0 == 0 ) { 699 ++zExp; 700 zSig0 = LIT64( 0x8000000000000000 ); 701 } 702 else { 703 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven ); 704 } 705 } 706 else { 707 if ( zSig0 == 0 ) zExp = 0; 708 } 709 710 return packFloatx80( zSign, zExp, zSig0 ); 711} 712 713/* 714------------------------------------------------------------------------------- 715Takes an abstract floating-point value having sign `zSign', exponent 716`zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 717and returns the proper extended double-precision floating-point value 718corresponding to the abstract input. This routine is just like 719`roundAndPackFloatx80' except that the input significand does not have to be 720normalized. 721------------------------------------------------------------------------------- 722*/ 723static floatx80 724 normalizeRoundAndPackFloatx80( 725 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 726 ) 727{ 728 int8 shiftCount; 729 730 if ( zSig0 == 0 ) { 731 zSig0 = zSig1; 732 zSig1 = 0; 733 zExp -= 64; 734 } 735 shiftCount = countLeadingZeros64( zSig0 ); 736 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 737 zExp -= shiftCount; 738 return 739 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 740 741} 742 743#endif 744 745/* 746------------------------------------------------------------------------------- 747Returns the result of converting the 32-bit two's complement integer `a' to 748the single-precision floating-point format. The conversion is performed 749according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 750------------------------------------------------------------------------------- 751*/ 752float32 int32_to_float32( int32 a ) 753{ 754 flag zSign; 755 756 if ( a == 0 ) return 0; 757 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 758 zSign = ( a < 0 ); 759 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 760 761} 762 763/* 764------------------------------------------------------------------------------- 765Returns the result of converting the 32-bit two's complement integer `a' to 766the double-precision floating-point format. The conversion is performed 767according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 768------------------------------------------------------------------------------- 769*/ 770float64 int32_to_float64( int32 a ) 771{ 772 flag aSign; 773 uint32 absA; 774 int8 shiftCount; 775 bits64 zSig; 776 777 if ( a == 0 ) return 0; 778 aSign = ( a < 0 ); 779 absA = aSign ? - a : a; 780 shiftCount = countLeadingZeros32( absA ) + 21; 781 zSig = absA; 782 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount ); 783 784} 785 786#ifdef FLOATX80 787 788/* 789------------------------------------------------------------------------------- 790Returns the result of converting the 32-bit two's complement integer `a' 791to the extended double-precision floating-point format. The conversion 792is performed according to the IEC/IEEE Standard for Binary Floating-point 793Arithmetic. 794------------------------------------------------------------------------------- 795*/ 796floatx80 int32_to_floatx80( int32 a ) 797{ 798 flag zSign; 799 uint32 absA; 800 int8 shiftCount; 801 bits64 zSig; 802 803 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 804 zSign = ( a < 0 ); 805 absA = zSign ? - a : a; 806 shiftCount = countLeadingZeros32( absA ) + 32; 807 zSig = absA; 808 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 809 810} 811 812#endif 813 814/* 815------------------------------------------------------------------------------- 816Returns the result of converting the single-precision floating-point value 817`a' to the 32-bit two's complement integer format. The conversion is 818performed according to the IEC/IEEE Standard for Binary Floating-point 819Arithmetic---which means in particular that the conversion is rounded 820according to the current rounding mode. If `a' is a NaN, the largest 821positive integer is returned. Otherwise, if the conversion overflows, the 822largest integer with the same sign as `a' is returned. 823------------------------------------------------------------------------------- 824*/ 825int32 float32_to_int32( float32 a ) 826{ 827 flag aSign; 828 int16 aExp, shiftCount; 829 bits32 aSig; 830 bits64 zSig; 831 832 aSig = extractFloat32Frac( a ); 833 aExp = extractFloat32Exp( a ); 834 aSign = extractFloat32Sign( a ); 835 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 836 if ( aExp ) aSig |= 0x00800000; 837 shiftCount = 0xAF - aExp; 838 zSig = aSig; 839 zSig <<= 32; 840 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig ); 841 return roundAndPackInt32( aSign, zSig ); 842 843} 844 845/* 846------------------------------------------------------------------------------- 847Returns the result of converting the single-precision floating-point value 848`a' to the 32-bit two's complement integer format. The conversion is 849performed according to the IEC/IEEE Standard for Binary Floating-point 850Arithmetic, except that the conversion is always rounded toward zero. If 851`a' is a NaN, the largest positive integer is returned. Otherwise, if the 852conversion overflows, the largest integer with the same sign as `a' is 853returned. 854------------------------------------------------------------------------------- 855*/ 856int32 float32_to_int32_round_to_zero( float32 a ) 857{ 858 flag aSign; 859 int16 aExp, shiftCount; 860 bits32 aSig; 861 int32 z; 862 863 aSig = extractFloat32Frac( a ); 864 aExp = extractFloat32Exp( a ); 865 aSign = extractFloat32Sign( a ); 866 shiftCount = aExp - 0x9E; 867 if ( 0 <= shiftCount ) { 868 if ( a == 0xCF000000 ) return 0x80000000; 869 float_raise( float_flag_invalid ); 870 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 871 return 0x80000000; 872 } 873 else if ( aExp <= 0x7E ) { 874 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 875 return 0; 876 } 877 aSig = ( aSig | 0x00800000 )<<8; 878 z = aSig>>( - shiftCount ); 879 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 880 float_exception_flags |= float_flag_inexact; 881 } 882 return aSign ? - z : z; 883 884} 885 886/* 887------------------------------------------------------------------------------- 888Returns the result of converting the single-precision floating-point value 889`a' to the double-precision floating-point format. The conversion is 890performed according to the IEC/IEEE Standard for Binary Floating-point 891Arithmetic. 892------------------------------------------------------------------------------- 893*/ 894float64 float32_to_float64( float32 a ) 895{ 896 flag aSign; 897 int16 aExp; 898 bits32 aSig; 899 900 aSig = extractFloat32Frac( a ); 901 aExp = extractFloat32Exp( a ); 902 aSign = extractFloat32Sign( a ); 903 if ( aExp == 0xFF ) { 904 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 905 return packFloat64( aSign, 0x7FF, 0 ); 906 } 907 if ( aExp == 0 ) { 908 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 909 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 910 --aExp; 911 } 912 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 913 914} 915 916#ifdef FLOATX80 917 918/* 919------------------------------------------------------------------------------- 920Returns the result of converting the single-precision floating-point value 921`a' to the extended double-precision floating-point format. The conversion 922is performed according to the IEC/IEEE Standard for Binary Floating-point 923Arithmetic. 924------------------------------------------------------------------------------- 925*/ 926floatx80 float32_to_floatx80( float32 a ) 927{ 928 flag aSign; 929 int16 aExp; 930 bits32 aSig; 931 932 aSig = extractFloat32Frac( a ); 933 aExp = extractFloat32Exp( a ); 934 aSign = extractFloat32Sign( a ); 935 if ( aExp == 0xFF ) { 936 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 937 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 938 } 939 if ( aExp == 0 ) { 940 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 941 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 942 } 943 aSig |= 0x00800000; 944 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 945 946} 947 948#endif 949 950/* 951------------------------------------------------------------------------------- 952Rounds the single-precision floating-point value `a' to an integer, and 953returns the result as a single-precision floating-point value. The 954operation is performed according to the IEC/IEEE Standard for Binary 955Floating-point Arithmetic. 956------------------------------------------------------------------------------- 957*/ 958float32 float32_round_to_int( float32 a ) 959{ 960 flag aSign; 961 int16 aExp; 962 bits32 lastBitMask, roundBitsMask; 963 int8 roundingMode; 964 float32 z; 965 966 aExp = extractFloat32Exp( a ); 967 if ( 0x96 <= aExp ) { 968 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 969 return propagateFloat32NaN( a, a ); 970 } 971 return a; 972 } 973 if ( aExp <= 0x7E ) { 974 if ( (bits32) ( a<<1 ) == 0 ) return a; 975 float_exception_flags |= float_flag_inexact; 976 aSign = extractFloat32Sign( a ); 977 switch ( float_rounding_mode ) { 978 case float_round_nearest_even: 979 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 980 return packFloat32( aSign, 0x7F, 0 ); 981 } 982 break; 983 case float_round_down: 984 return aSign ? 0xBF800000 : 0; 985 case float_round_up: 986 return aSign ? 0x80000000 : 0x3F800000; 987 } 988 return packFloat32( aSign, 0, 0 ); 989 } 990 lastBitMask = 1; 991 lastBitMask <<= 0x96 - aExp; 992 roundBitsMask = lastBitMask - 1; 993 z = a; 994 roundingMode = float_rounding_mode; 995 if ( roundingMode == float_round_nearest_even ) { 996 z += lastBitMask>>1; 997 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 998 } 999 else if ( roundingMode != float_round_to_zero ) { 1000 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1001 z += roundBitsMask; 1002 } 1003 } 1004 z &= ~ roundBitsMask; 1005 if ( z != a ) float_exception_flags |= float_flag_inexact; 1006 return z; 1007 1008} 1009 1010/* 1011------------------------------------------------------------------------------- 1012Returns the result of adding the absolute values of the single-precision 1013floating-point values `a' and `b'. If `zSign' is true, the sum is negated 1014before being returned. `zSign' is ignored if the result is a NaN. The 1015addition is performed according to the IEC/IEEE Standard for Binary 1016Floating-point Arithmetic. 1017------------------------------------------------------------------------------- 1018*/ 1019static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1020{ 1021 int16 aExp, bExp, zExp; 1022 bits32 aSig, bSig, zSig; 1023 int16 expDiff; 1024 1025 aSig = extractFloat32Frac( a ); 1026 aExp = extractFloat32Exp( a ); 1027 bSig = extractFloat32Frac( b ); 1028 bExp = extractFloat32Exp( b ); 1029 expDiff = aExp - bExp; 1030 aSig <<= 6; 1031 bSig <<= 6; 1032 if ( 0 < expDiff ) { 1033 if ( aExp == 0xFF ) { 1034 if ( aSig ) return propagateFloat32NaN( a, b ); 1035 return a; 1036 } 1037 if ( bExp == 0 ) { 1038 --expDiff; 1039 } 1040 else { 1041 bSig |= 0x20000000; 1042 } 1043 shift32RightJamming( bSig, expDiff, &bSig ); 1044 zExp = aExp; 1045 } 1046 else if ( expDiff < 0 ) { 1047 if ( bExp == 0xFF ) { 1048 if ( bSig ) return propagateFloat32NaN( a, b ); 1049 return packFloat32( zSign, 0xFF, 0 ); 1050 } 1051 if ( aExp == 0 ) { 1052 ++expDiff; 1053 } 1054 else { 1055 aSig |= 0x20000000; 1056 } 1057 shift32RightJamming( aSig, - expDiff, &aSig ); 1058 zExp = bExp; 1059 } 1060 else { 1061 if ( aExp == 0xFF ) { 1062 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1063 return a; 1064 } 1065 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1066 zSig = 0x40000000 + aSig + bSig; 1067 zExp = aExp; 1068 goto roundAndPack; 1069 } 1070 aSig |= 0x20000000; 1071 zSig = ( aSig + bSig )<<1; 1072 --zExp; 1073 if ( (sbits32) zSig < 0 ) { 1074 zSig = aSig + bSig; 1075 ++zExp; 1076 } 1077 roundAndPack: 1078 return roundAndPackFloat32( zSign, zExp, zSig ); 1079 1080} 1081 1082/* 1083------------------------------------------------------------------------------- 1084Returns the result of subtracting the absolute values of the single- 1085precision floating-point values `a' and `b'. If `zSign' is true, the 1086difference is negated before being returned. `zSign' is ignored if the 1087result is a NaN. The subtraction is performed according to the IEC/IEEE 1088Standard for Binary Floating-point Arithmetic. 1089------------------------------------------------------------------------------- 1090*/ 1091static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1092{ 1093 int16 aExp, bExp, zExp; 1094 bits32 aSig, bSig, zSig; 1095 int16 expDiff; 1096 1097 aSig = extractFloat32Frac( a ); 1098 aExp = extractFloat32Exp( a ); 1099 bSig = extractFloat32Frac( b ); 1100 bExp = extractFloat32Exp( b ); 1101 expDiff = aExp - bExp; 1102 aSig <<= 7; 1103 bSig <<= 7; 1104 if ( 0 < expDiff ) goto aExpBigger; 1105 if ( expDiff < 0 ) goto bExpBigger; 1106 if ( aExp == 0xFF ) { 1107 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1108 float_raise( float_flag_invalid ); 1109 return float32_default_nan; 1110 } 1111 if ( aExp == 0 ) { 1112 aExp = 1; 1113 bExp = 1; 1114 } 1115 if ( bSig < aSig ) goto aBigger; 1116 if ( aSig < bSig ) goto bBigger; 1117 return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); 1118 bExpBigger: 1119 if ( bExp == 0xFF ) { 1120 if ( bSig ) return propagateFloat32NaN( a, b ); 1121 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1122 } 1123 if ( aExp == 0 ) { 1124 ++expDiff; 1125 } 1126 else { 1127 aSig |= 0x40000000; 1128 } 1129 shift32RightJamming( aSig, - expDiff, &aSig ); 1130 bSig |= 0x40000000; 1131 bBigger: 1132 zSig = bSig - aSig; 1133 zExp = bExp; 1134 zSign ^= 1; 1135 goto normalizeRoundAndPack; 1136 aExpBigger: 1137 if ( aExp == 0xFF ) { 1138 if ( aSig ) return propagateFloat32NaN( a, b ); 1139 return a; 1140 } 1141 if ( bExp == 0 ) { 1142 --expDiff; 1143 } 1144 else { 1145 bSig |= 0x40000000; 1146 } 1147 shift32RightJamming( bSig, expDiff, &bSig ); 1148 aSig |= 0x40000000; 1149 aBigger: 1150 zSig = aSig - bSig; 1151 zExp = aExp; 1152 normalizeRoundAndPack: 1153 --zExp; 1154 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1155 1156} 1157 1158/* 1159------------------------------------------------------------------------------- 1160Returns the result of adding the single-precision floating-point values `a' 1161and `b'. The operation is performed according to the IEC/IEEE Standard for 1162Binary Floating-point Arithmetic. 1163------------------------------------------------------------------------------- 1164*/ 1165float32 float32_add( float32 a, float32 b ) 1166{ 1167 flag aSign, bSign; 1168 1169 aSign = extractFloat32Sign( a ); 1170 bSign = extractFloat32Sign( b ); 1171 if ( aSign == bSign ) { 1172 return addFloat32Sigs( a, b, aSign ); 1173 } 1174 else { 1175 return subFloat32Sigs( a, b, aSign ); 1176 } 1177 1178} 1179 1180/* 1181------------------------------------------------------------------------------- 1182Returns the result of subtracting the single-precision floating-point values 1183`a' and `b'. The operation is performed according to the IEC/IEEE Standard 1184for Binary Floating-point Arithmetic. 1185------------------------------------------------------------------------------- 1186*/ 1187float32 float32_sub( float32 a, float32 b ) 1188{ 1189 flag aSign, bSign; 1190 1191 aSign = extractFloat32Sign( a ); 1192 bSign = extractFloat32Sign( b ); 1193 if ( aSign == bSign ) { 1194 return subFloat32Sigs( a, b, aSign ); 1195 } 1196 else { 1197 return addFloat32Sigs( a, b, aSign ); 1198 } 1199 1200} 1201 1202/* 1203------------------------------------------------------------------------------- 1204Returns the result of multiplying the single-precision floating-point values 1205`a' and `b'. The operation is performed according to the IEC/IEEE Standard 1206for Binary Floating-point Arithmetic. 1207------------------------------------------------------------------------------- 1208*/ 1209float32 float32_mul( float32 a, float32 b ) 1210{ 1211 flag aSign, bSign, zSign; 1212 int16 aExp, bExp, zExp; 1213 bits32 aSig, bSig; 1214 bits64 zSig64; 1215 bits32 zSig; 1216 1217 aSig = extractFloat32Frac( a ); 1218 aExp = extractFloat32Exp( a ); 1219 aSign = extractFloat32Sign( a ); 1220 bSig = extractFloat32Frac( b ); 1221 bExp = extractFloat32Exp( b ); 1222 bSign = extractFloat32Sign( b ); 1223 zSign = aSign ^ bSign; 1224 if ( aExp == 0xFF ) { 1225 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1226 return propagateFloat32NaN( a, b ); 1227 } 1228 if ( ( bExp | bSig ) == 0 ) { 1229 float_raise( float_flag_invalid ); 1230 return float32_default_nan; 1231 } 1232 return packFloat32( zSign, 0xFF, 0 ); 1233 } 1234 if ( bExp == 0xFF ) { 1235 if ( bSig ) return propagateFloat32NaN( a, b ); 1236 if ( ( aExp | aSig ) == 0 ) { 1237 float_raise( float_flag_invalid ); 1238 return float32_default_nan; 1239 } 1240 return packFloat32( zSign, 0xFF, 0 ); 1241 } 1242 if ( aExp == 0 ) { 1243 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1244 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1245 } 1246 if ( bExp == 0 ) { 1247 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1248 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1249 } 1250 zExp = aExp + bExp - 0x7F; 1251 aSig = ( aSig | 0x00800000 )<<7; 1252 bSig = ( bSig | 0x00800000 )<<8; 1253 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1254 zSig = zSig64; 1255 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1256 zSig <<= 1; 1257 --zExp; 1258 } 1259 return roundAndPackFloat32( zSign, zExp, zSig ); 1260 1261} 1262 1263/* 1264------------------------------------------------------------------------------- 1265Returns the result of dividing the single-precision floating-point value `a' 1266by the corresponding value `b'. The operation is performed according to the 1267IEC/IEEE Standard for Binary Floating-point Arithmetic. 1268------------------------------------------------------------------------------- 1269*/ 1270float32 float32_div( float32 a, float32 b ) 1271{ 1272 flag aSign, bSign, zSign; 1273 int16 aExp, bExp, zExp; 1274 bits32 aSig, bSig, zSig; 1275 1276 aSig = extractFloat32Frac( a ); 1277 aExp = extractFloat32Exp( a ); 1278 aSign = extractFloat32Sign( a ); 1279 bSig = extractFloat32Frac( b ); 1280 bExp = extractFloat32Exp( b ); 1281 bSign = extractFloat32Sign( b ); 1282 zSign = aSign ^ bSign; 1283 if ( aExp == 0xFF ) { 1284 if ( aSig ) return propagateFloat32NaN( a, b ); 1285 if ( bExp == 0xFF ) { 1286 if ( bSig ) return propagateFloat32NaN( a, b ); 1287 float_raise( float_flag_invalid ); 1288 return float32_default_nan; 1289 } 1290 return packFloat32( zSign, 0xFF, 0 ); 1291 } 1292 if ( bExp == 0xFF ) { 1293 if ( bSig ) return propagateFloat32NaN( a, b ); 1294 return packFloat32( zSign, 0, 0 ); 1295 } 1296 if ( bExp == 0 ) { 1297 if ( bSig == 0 ) { 1298 if ( ( aExp | aSig ) == 0 ) { 1299 float_raise( float_flag_invalid ); 1300 return float32_default_nan; 1301 } 1302 float_raise( float_flag_divbyzero ); 1303 return packFloat32( zSign, 0xFF, 0 ); 1304 } 1305 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1306 } 1307 if ( aExp == 0 ) { 1308 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1309 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1310 } 1311 zExp = aExp - bExp + 0x7D; 1312 aSig = ( aSig | 0x00800000 )<<7; 1313 bSig = ( bSig | 0x00800000 )<<8; 1314 if ( bSig <= ( aSig + aSig ) ) { 1315 aSig >>= 1; 1316 ++zExp; 1317 } 1318 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 1319 if ( ( zSig & 0x3F ) == 0 ) { 1320 zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 ); 1321 } 1322 return roundAndPackFloat32( zSign, zExp, zSig ); 1323 1324} 1325 1326/* 1327------------------------------------------------------------------------------- 1328Returns the remainder of the single-precision floating-point value `a' 1329with respect to the corresponding value `b'. The operation is performed 1330according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 1331------------------------------------------------------------------------------- 1332*/ 1333float32 float32_rem( float32 a, float32 b ) 1334{ 1335 flag aSign, bSign, zSign; 1336 int16 aExp, bExp, expDiff; 1337 bits32 aSig, bSig; 1338 bits32 q; 1339 bits64 aSig64, bSig64, q64; 1340 bits32 alternateASig; 1341 sbits32 sigMean; 1342 1343 aSig = extractFloat32Frac( a ); 1344 aExp = extractFloat32Exp( a ); 1345 aSign = extractFloat32Sign( a ); 1346 bSig = extractFloat32Frac( b ); 1347 bExp = extractFloat32Exp( b ); 1348 bSign = extractFloat32Sign( b ); 1349 if ( aExp == 0xFF ) { 1350 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1351 return propagateFloat32NaN( a, b ); 1352 } 1353 float_raise( float_flag_invalid ); 1354 return float32_default_nan; 1355 } 1356 if ( bExp == 0xFF ) { 1357 if ( bSig ) return propagateFloat32NaN( a, b ); 1358 return a; 1359 } 1360 if ( bExp == 0 ) { 1361 if ( bSig == 0 ) { 1362 float_raise( float_flag_invalid ); 1363 return float32_default_nan; 1364 } 1365 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1366 } 1367 if ( aExp == 0 ) { 1368 if ( aSig == 0 ) return a; 1369 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1370 } 1371 expDiff = aExp - bExp; 1372 aSig |= 0x00800000; 1373 bSig |= 0x00800000; 1374 if ( expDiff < 32 ) { 1375 aSig <<= 8; 1376 bSig <<= 8; 1377 if ( expDiff < 0 ) { 1378 if ( expDiff < -1 ) return a; 1379 aSig >>= 1; 1380 } 1381 q = ( bSig <= aSig ); 1382 if ( q ) aSig -= bSig; 1383 if ( 0 < expDiff ) { 1384 q = ( ( (bits64) aSig )<<32 ) / bSig; 1385 q >>= 32 - expDiff; 1386 bSig >>= 2; 1387 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 1388 } 1389 else { 1390 aSig >>= 2; 1391 bSig >>= 2; 1392 } 1393 } 1394 else { 1395 if ( bSig <= aSig ) aSig -= bSig; 1396 aSig64 = ( (bits64) aSig )<<40; 1397 bSig64 = ( (bits64) bSig )<<40; 1398 expDiff -= 64; 1399 while ( 0 < expDiff ) { 1400 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 1401 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 1402 aSig64 = - ( ( bSig * q64 )<<38 ); 1403 expDiff -= 62; 1404 } 1405 expDiff += 64; 1406 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 1407 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 1408 q = q64>>( 64 - expDiff ); 1409 bSig <<= 6; 1410 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 1411 } 1412 do { 1413 alternateASig = aSig; 1414 ++q; 1415 aSig -= bSig; 1416 } while ( 0 <= (sbits32) aSig ); 1417 sigMean = aSig + alternateASig; 1418 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 1419 aSig = alternateASig; 1420 } 1421 zSign = ( (sbits32) aSig < 0 ); 1422 if ( zSign ) aSig = - aSig; 1423 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 1424 1425} 1426 1427/* 1428------------------------------------------------------------------------------- 1429Returns the square root of the single-precision floating-point value `a'. 1430The operation is performed according to the IEC/IEEE Standard for Binary 1431Floating-point Arithmetic. 1432------------------------------------------------------------------------------- 1433*/ 1434float32 float32_sqrt( float32 a ) 1435{ 1436 flag aSign; 1437 int16 aExp, zExp; 1438 bits32 aSig, zSig; 1439 bits64 rem, term; 1440 1441 aSig = extractFloat32Frac( a ); 1442 aExp = extractFloat32Exp( a ); 1443 aSign = extractFloat32Sign( a ); 1444 if ( aExp == 0xFF ) { 1445 if ( aSig ) return propagateFloat32NaN( a, 0 ); 1446 if ( ! aSign ) return a; 1447 float_raise( float_flag_invalid ); 1448 return float32_default_nan; 1449 } 1450 if ( aSign ) { 1451 if ( ( aExp | aSig ) == 0 ) return a; 1452 float_raise( float_flag_invalid ); 1453 return float32_default_nan; 1454 } 1455 if ( aExp == 0 ) { 1456 if ( aSig == 0 ) return 0; 1457 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1458 } 1459 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 1460 aSig = ( aSig | 0x00800000 )<<8; 1461 zSig = estimateSqrt32( aExp, aSig ) + 2; 1462 if ( ( zSig & 0x7F ) <= 5 ) { 1463 if ( zSig < 2 ) { 1464 zSig = 0xFFFFFFFF; 1465 } 1466 else { 1467 aSig >>= aExp & 1; 1468 term = ( (bits64) zSig ) * zSig; 1469 rem = ( ( (bits64) aSig )<<32 ) - term; 1470 while ( (sbits64) rem < 0 ) { 1471 --zSig; 1472 rem += ( ( (bits64) zSig )<<1 ) | 1; 1473 } 1474 zSig |= ( rem != 0 ); 1475 } 1476 } 1477 shift32RightJamming( zSig, 1, &zSig ); 1478 return roundAndPackFloat32( 0, zExp, zSig ); 1479 1480} 1481 1482/* 1483------------------------------------------------------------------------------- 1484Returns 1 if the single-precision floating-point value `a' is equal to the 1485corresponding value `b', and 0 otherwise. The comparison is performed 1486according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 1487------------------------------------------------------------------------------- 1488*/ 1489flag float32_eq( float32 a, float32 b ) 1490{ 1491 1492 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1493 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1494 ) { 1495 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 1496 float_raise( float_flag_invalid ); 1497 } 1498 return 0; 1499 } 1500 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1501 1502} 1503 1504/* 1505------------------------------------------------------------------------------- 1506Returns 1 if the single-precision floating-point value `a' is less than or 1507equal to the corresponding value `b', and 0 otherwise. The comparison is 1508performed according to the IEC/IEEE Standard for Binary Floating-point 1509Arithmetic. 1510------------------------------------------------------------------------------- 1511*/ 1512flag float32_le( float32 a, float32 b ) 1513{ 1514 flag aSign, bSign; 1515 1516 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1517 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1518 ) { 1519 float_raise( float_flag_invalid ); 1520 return 0; 1521 } 1522 aSign = extractFloat32Sign( a ); 1523 bSign = extractFloat32Sign( b ); 1524 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1525 return ( a == b ) || ( aSign ^ ( a < b ) ); 1526 1527} 1528 1529/* 1530------------------------------------------------------------------------------- 1531Returns 1 if the single-precision floating-point value `a' is less than 1532the corresponding value `b', and 0 otherwise. The comparison is performed 1533according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 1534------------------------------------------------------------------------------- 1535*/ 1536flag float32_lt( float32 a, float32 b ) 1537{ 1538 flag aSign, bSign; 1539 1540 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1541 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1542 ) { 1543 float_raise( float_flag_invalid ); 1544 return 0; 1545 } 1546 aSign = extractFloat32Sign( a ); 1547 bSign = extractFloat32Sign( b ); 1548 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 1549 return ( a != b ) && ( aSign ^ ( a < b ) ); 1550 1551} 1552 1553/* 1554------------------------------------------------------------------------------- 1555Returns 1 if the single-precision floating-point value `a' is equal to the 1556corresponding value `b', and 0 otherwise. The invalid exception is raised 1557if either operand is a NaN. Otherwise, the comparison is performed 1558according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 1559------------------------------------------------------------------------------- 1560*/ 1561flag float32_eq_signaling( float32 a, float32 b ) 1562{ 1563 1564 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1565 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1566 ) { 1567 float_raise( float_flag_invalid ); 1568 return 0; 1569 } 1570 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1571 1572} 1573 1574/* 1575------------------------------------------------------------------------------- 1576Returns 1 if the single-precision floating-point value `a' is less than or 1577equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 1578cause an exception. Otherwise, the comparison is performed according to the 1579IEC/IEEE Standard for Binary Floating-point Arithmetic. 1580------------------------------------------------------------------------------- 1581*/ 1582flag float32_le_quiet( float32 a, float32 b ) 1583{ 1584 flag aSign, bSign; 1585 //int16 aExp, bExp; 1586 1587 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1588 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1589 ) { 1590 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 1591 float_raise( float_flag_invalid ); 1592 } 1593 return 0; 1594 } 1595 aSign = extractFloat32Sign( a ); 1596 bSign = extractFloat32Sign( b ); 1597 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1598 return ( a == b ) || ( aSign ^ ( a < b ) ); 1599 1600} 1601 1602/* 1603------------------------------------------------------------------------------- 1604Returns 1 if the single-precision floating-point value `a' is less than 1605the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 1606exception. Otherwise, the comparison is performed according to the IEC/IEEE 1607Standard for Binary Floating-point Arithmetic. 1608------------------------------------------------------------------------------- 1609*/ 1610flag float32_lt_quiet( float32 a, float32 b ) 1611{ 1612 flag aSign, bSign; 1613 1614 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1615 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1616 ) { 1617 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 1618 float_raise( float_flag_invalid ); 1619 } 1620 return 0; 1621 } 1622 aSign = extractFloat32Sign( a ); 1623 bSign = extractFloat32Sign( b ); 1624 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 1625 return ( a != b ) && ( aSign ^ ( a < b ) ); 1626 1627} 1628 1629/* 1630------------------------------------------------------------------------------- 1631Returns the result of converting the double-precision floating-point value 1632`a' to the 32-bit two's complement integer format. The conversion is 1633performed according to the IEC/IEEE Standard for Binary Floating-point 1634Arithmetic---which means in particular that the conversion is rounded 1635according to the current rounding mode. If `a' is a NaN, the largest 1636positive integer is returned. Otherwise, if the conversion overflows, the 1637largest integer with the same sign as `a' is returned. 1638------------------------------------------------------------------------------- 1639*/ 1640int32 float64_to_int32( float64 a ) 1641{ 1642 flag aSign; 1643 int16 aExp, shiftCount; 1644 bits64 aSig; 1645 1646 aSig = extractFloat64Frac( a ); 1647 aExp = extractFloat64Exp( a ); 1648 aSign = extractFloat64Sign( a ); 1649 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 1650 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 1651 shiftCount = 0x42C - aExp; 1652 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 1653 return roundAndPackInt32( aSign, aSig ); 1654 1655} 1656 1657/* 1658------------------------------------------------------------------------------- 1659Returns the result of converting the double-precision floating-point value 1660`a' to the 32-bit two's complement integer format. The conversion is 1661performed according to the IEC/IEEE Standard for Binary Floating-point 1662Arithmetic, except that the conversion is always rounded toward zero. If 1663`a' is a NaN, the largest positive integer is returned. Otherwise, if the 1664conversion overflows, the largest integer with the same sign as `a' is 1665returned. 1666------------------------------------------------------------------------------- 1667*/ 1668int32 float64_to_int32_round_to_zero( float64 a ) 1669{ 1670 flag aSign; 1671 int16 aExp, shiftCount; 1672 bits64 aSig, savedASig; 1673 int32 z; 1674 1675 aSig = extractFloat64Frac( a ); 1676 aExp = extractFloat64Exp( a ); 1677 aSign = extractFloat64Sign( a ); 1678 shiftCount = 0x433 - aExp; 1679 if ( shiftCount < 21 ) { 1680 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 1681 goto invalid; 1682 } 1683 else if ( 52 < shiftCount ) { 1684 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 1685 return 0; 1686 } 1687 aSig |= LIT64( 0x0010000000000000 ); 1688 savedASig = aSig; 1689 aSig >>= shiftCount; 1690 z = aSig; 1691 if ( aSign ) z = - z; 1692 if ( ( z < 0 ) ^ aSign ) { 1693 invalid: 1694 float_exception_flags |= float_flag_invalid; 1695 return aSign ? 0x80000000 : 0x7FFFFFFF; 1696 } 1697 if ( ( aSig<<shiftCount ) != savedASig ) { 1698 float_exception_flags |= float_flag_inexact; 1699 } 1700 return z; 1701 1702} 1703 1704/* 1705------------------------------------------------------------------------------- 1706Returns the result of converting the double-precision floating-point value 1707`a' to the 32-bit two's complement unsigned integer format. The conversion 1708is performed according to the IEC/IEEE Standard for Binary Floating-point 1709Arithmetic---which means in particular that the conversion is rounded 1710according to the current rounding mode. If `a' is a NaN, the largest 1711positive integer is returned. Otherwise, if the conversion overflows, the 1712largest positive integer is returned. 1713------------------------------------------------------------------------------- 1714*/ 1715int32 float64_to_uint32( float64 a ) 1716{ 1717 flag aSign; 1718 int16 aExp, shiftCount; 1719 bits64 aSig; 1720 1721 aSig = extractFloat64Frac( a ); 1722 aExp = extractFloat64Exp( a ); 1723 aSign = 0; //extractFloat64Sign( a ); 1724 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 1725 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 1726 shiftCount = 0x42C - aExp; 1727 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 1728 return roundAndPackInt32( aSign, aSig ); 1729} 1730 1731/* 1732------------------------------------------------------------------------------- 1733Returns the result of converting the double-precision floating-point value 1734`a' to the 32-bit two's complement integer format. The conversion is 1735performed according to the IEC/IEEE Standard for Binary Floating-point 1736Arithmetic, except that the conversion is always rounded toward zero. If 1737`a' is a NaN, the largest positive integer is returned. Otherwise, if the 1738conversion overflows, the largest positive integer is returned. 1739------------------------------------------------------------------------------- 1740*/ 1741int32 float64_to_uint32_round_to_zero( float64 a ) 1742{ 1743 flag aSign; 1744 int16 aExp, shiftCount; 1745 bits64 aSig, savedASig; 1746 int32 z; 1747 1748 aSig = extractFloat64Frac( a ); 1749 aExp = extractFloat64Exp( a ); 1750 aSign = extractFloat64Sign( a ); 1751 shiftCount = 0x433 - aExp; 1752 if ( shiftCount < 21 ) { 1753 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 1754 goto invalid; 1755 } 1756 else if ( 52 < shiftCount ) { 1757 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 1758 return 0; 1759 } 1760 aSig |= LIT64( 0x0010000000000000 ); 1761 savedASig = aSig; 1762 aSig >>= shiftCount; 1763 z = aSig; 1764 if ( aSign ) z = - z; 1765 if ( ( z < 0 ) ^ aSign ) { 1766 invalid: 1767 float_exception_flags |= float_flag_invalid; 1768 return aSign ? 0x80000000 : 0x7FFFFFFF; 1769 } 1770 if ( ( aSig<<shiftCount ) != savedASig ) { 1771 float_exception_flags |= float_flag_inexact; 1772 } 1773 return z; 1774} 1775 1776/* 1777------------------------------------------------------------------------------- 1778Returns the result of converting the double-precision floating-point value 1779`a' to the single-precision floating-point format. The conversion is 1780performed according to the IEC/IEEE Standard for Binary Floating-point 1781Arithmetic. 1782------------------------------------------------------------------------------- 1783*/ 1784float32 float64_to_float32( float64 a ) 1785{ 1786 flag aSign; 1787 int16 aExp; 1788 bits64 aSig; 1789 bits32 zSig; 1790 1791 aSig = extractFloat64Frac( a ); 1792 aExp = extractFloat64Exp( a ); 1793 aSign = extractFloat64Sign( a ); 1794 if ( aExp == 0x7FF ) { 1795 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 1796 return packFloat32( aSign, 0xFF, 0 ); 1797 } 1798 shift64RightJamming( aSig, 22, &aSig ); 1799 zSig = aSig; 1800 if ( aExp || zSig ) { 1801 zSig |= 0x40000000; 1802 aExp -= 0x381; 1803 } 1804 return roundAndPackFloat32( aSign, aExp, zSig ); 1805 1806} 1807 1808#ifdef FLOATX80 1809 1810/* 1811------------------------------------------------------------------------------- 1812Returns the result of converting the double-precision floating-point value 1813`a' to the extended double-precision floating-point format. The conversion 1814is performed according to the IEC/IEEE Standard for Binary Floating-point 1815Arithmetic. 1816------------------------------------------------------------------------------- 1817*/ 1818floatx80 float64_to_floatx80( float64 a ) 1819{ 1820 flag aSign; 1821 int16 aExp; 1822 bits64 aSig; 1823 1824 aSig = extractFloat64Frac( a ); 1825 aExp = extractFloat64Exp( a ); 1826 aSign = extractFloat64Sign( a ); 1827 if ( aExp == 0x7FF ) { 1828 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 1829 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1830 } 1831 if ( aExp == 0 ) { 1832 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1833 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 1834 } 1835 return 1836 packFloatx80( 1837 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 1838 1839} 1840 1841#endif 1842 1843/* 1844------------------------------------------------------------------------------- 1845Rounds the double-precision floating-point value `a' to an integer, and 1846returns the result as a double-precision floating-point value. The 1847operation is performed according to the IEC/IEEE Standard for Binary 1848Floating-point Arithmetic. 1849------------------------------------------------------------------------------- 1850*/ 1851float64 float64_round_to_int( float64 a ) 1852{ 1853 flag aSign; 1854 int16 aExp; 1855 bits64 lastBitMask, roundBitsMask; 1856 int8 roundingMode; 1857 float64 z; 1858 1859 aExp = extractFloat64Exp( a ); 1860 if ( 0x433 <= aExp ) { 1861 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 1862 return propagateFloat64NaN( a, a ); 1863 } 1864 return a; 1865 } 1866 if ( aExp <= 0x3FE ) { 1867 if ( (bits64) ( a<<1 ) == 0 ) return a; 1868 float_exception_flags |= float_flag_inexact; 1869 aSign = extractFloat64Sign( a ); 1870 switch ( float_rounding_mode ) { 1871 case float_round_nearest_even: 1872 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 1873 return packFloat64( aSign, 0x3FF, 0 ); 1874 } 1875 break; 1876 case float_round_down: 1877 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 1878 case float_round_up: 1879 return 1880 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 1881 } 1882 return packFloat64( aSign, 0, 0 ); 1883 } 1884 lastBitMask = 1; 1885 lastBitMask <<= 0x433 - aExp; 1886 roundBitsMask = lastBitMask - 1; 1887 z = a; 1888 roundingMode = float_rounding_mode; 1889 if ( roundingMode == float_round_nearest_even ) { 1890 z += lastBitMask>>1; 1891 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1892 } 1893 else if ( roundingMode != float_round_to_zero ) { 1894 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1895 z += roundBitsMask; 1896 } 1897 } 1898 z &= ~ roundBitsMask; 1899 if ( z != a ) float_exception_flags |= float_flag_inexact; 1900 return z; 1901 1902} 1903 1904/* 1905------------------------------------------------------------------------------- 1906Returns the result of adding the absolute values of the double-precision 1907floating-point values `a' and `b'. If `zSign' is true, the sum is negated 1908before being returned. `zSign' is ignored if the result is a NaN. The 1909addition is performed according to the IEC/IEEE Standard for Binary 1910Floating-point Arithmetic. 1911------------------------------------------------------------------------------- 1912*/ 1913static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 1914{ 1915 int16 aExp, bExp, zExp; 1916 bits64 aSig, bSig, zSig; 1917 int16 expDiff; 1918 1919 aSig = extractFloat64Frac( a ); 1920 aExp = extractFloat64Exp( a ); 1921 bSig = extractFloat64Frac( b ); 1922 bExp = extractFloat64Exp( b ); 1923 expDiff = aExp - bExp; 1924 aSig <<= 9; 1925 bSig <<= 9; 1926 if ( 0 < expDiff ) { 1927 if ( aExp == 0x7FF ) { 1928 if ( aSig ) return propagateFloat64NaN( a, b ); 1929 return a; 1930 } 1931 if ( bExp == 0 ) { 1932 --expDiff; 1933 } 1934 else { 1935 bSig |= LIT64( 0x2000000000000000 ); 1936 } 1937 shift64RightJamming( bSig, expDiff, &bSig ); 1938 zExp = aExp; 1939 } 1940 else if ( expDiff < 0 ) { 1941 if ( bExp == 0x7FF ) { 1942 if ( bSig ) return propagateFloat64NaN( a, b ); 1943 return packFloat64( zSign, 0x7FF, 0 ); 1944 } 1945 if ( aExp == 0 ) { 1946 ++expDiff; 1947 } 1948 else { 1949 aSig |= LIT64( 0x2000000000000000 ); 1950 } 1951 shift64RightJamming( aSig, - expDiff, &aSig ); 1952 zExp = bExp; 1953 } 1954 else { 1955 if ( aExp == 0x7FF ) { 1956 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 1957 return a; 1958 } 1959 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 1960 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 1961 zExp = aExp; 1962 goto roundAndPack; 1963 } 1964 aSig |= LIT64( 0x2000000000000000 ); 1965 zSig = ( aSig + bSig )<<1; 1966 --zExp; 1967 if ( (sbits64) zSig < 0 ) { 1968 zSig = aSig + bSig; 1969 ++zExp; 1970 } 1971 roundAndPack: 1972 return roundAndPackFloat64( zSign, zExp, zSig ); 1973 1974} 1975 1976/* 1977------------------------------------------------------------------------------- 1978Returns the result of subtracting the absolute values of the double- 1979precision floating-point values `a' and `b'. If `zSign' is true, the 1980difference is negated before being returned. `zSign' is ignored if the 1981result is a NaN. The subtraction is performed according to the IEC/IEEE 1982Standard for Binary Floating-point Arithmetic. 1983------------------------------------------------------------------------------- 1984*/ 1985static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 1986{ 1987 int16 aExp, bExp, zExp; 1988 bits64 aSig, bSig, zSig; 1989 int16 expDiff; 1990 1991 aSig = extractFloat64Frac( a ); 1992 aExp = extractFloat64Exp( a ); 1993 bSig = extractFloat64Frac( b ); 1994 bExp = extractFloat64Exp( b ); 1995 expDiff = aExp - bExp; 1996 aSig <<= 10; 1997 bSig <<= 10; 1998 if ( 0 < expDiff ) goto aExpBigger; 1999 if ( expDiff < 0 ) goto bExpBigger; 2000 if ( aExp == 0x7FF ) { 2001 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2002 float_raise( float_flag_invalid ); 2003 return float64_default_nan; 2004 } 2005 if ( aExp == 0 ) { 2006 aExp = 1; 2007 bExp = 1; 2008 } 2009 if ( bSig < aSig ) goto aBigger; 2010 if ( aSig < bSig ) goto bBigger; 2011 return packFloat64( float_rounding_mode == float_round_down, 0, 0 ); 2012 bExpBigger: 2013 if ( bExp == 0x7FF ) { 2014 if ( bSig ) return propagateFloat64NaN( a, b ); 2015 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2016 } 2017 if ( aExp == 0 ) { 2018 ++expDiff; 2019 } 2020 else { 2021 aSig |= LIT64( 0x4000000000000000 ); 2022 } 2023 shift64RightJamming( aSig, - expDiff, &aSig ); 2024 bSig |= LIT64( 0x4000000000000000 ); 2025 bBigger: 2026 zSig = bSig - aSig; 2027 zExp = bExp; 2028 zSign ^= 1; 2029 goto normalizeRoundAndPack; 2030 aExpBigger: 2031 if ( aExp == 0x7FF ) { 2032 if ( aSig ) return propagateFloat64NaN( a, b ); 2033 return a; 2034 } 2035 if ( bExp == 0 ) { 2036 --expDiff; 2037 } 2038 else { 2039 bSig |= LIT64( 0x4000000000000000 ); 2040 } 2041 shift64RightJamming( bSig, expDiff, &bSig ); 2042 aSig |= LIT64( 0x4000000000000000 ); 2043 aBigger: 2044 zSig = aSig - bSig; 2045 zExp = aExp; 2046 normalizeRoundAndPack: 2047 --zExp; 2048 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2049 2050} 2051 2052/* 2053------------------------------------------------------------------------------- 2054Returns the result of adding the double-precision floating-point values `a' 2055and `b'. The operation is performed according to the IEC/IEEE Standard for 2056Binary Floating-point Arithmetic. 2057------------------------------------------------------------------------------- 2058*/ 2059float64 float64_add( float64 a, float64 b ) 2060{ 2061 flag aSign, bSign; 2062 2063 aSign = extractFloat64Sign( a ); 2064 bSign = extractFloat64Sign( b ); 2065 if ( aSign == bSign ) { 2066 return addFloat64Sigs( a, b, aSign ); 2067 } 2068 else { 2069 return subFloat64Sigs( a, b, aSign ); 2070 } 2071 2072} 2073 2074/* 2075------------------------------------------------------------------------------- 2076Returns the result of subtracting the double-precision floating-point values 2077`a' and `b'. The operation is performed according to the IEC/IEEE Standard 2078for Binary Floating-point Arithmetic. 2079------------------------------------------------------------------------------- 2080*/ 2081float64 float64_sub( float64 a, float64 b ) 2082{ 2083 flag aSign, bSign; 2084 2085 aSign = extractFloat64Sign( a ); 2086 bSign = extractFloat64Sign( b ); 2087 if ( aSign == bSign ) { 2088 return subFloat64Sigs( a, b, aSign ); 2089 } 2090 else { 2091 return addFloat64Sigs( a, b, aSign ); 2092 } 2093 2094} 2095 2096/* 2097------------------------------------------------------------------------------- 2098Returns the result of multiplying the double-precision floating-point values 2099`a' and `b'. The operation is performed according to the IEC/IEEE Standard 2100for Binary Floating-point Arithmetic. 2101------------------------------------------------------------------------------- 2102*/ 2103float64 float64_mul( float64 a, float64 b ) 2104{ 2105 flag aSign, bSign, zSign; 2106 int16 aExp, bExp, zExp; 2107 bits64 aSig, bSig, zSig0, zSig1; 2108 2109 aSig = extractFloat64Frac( a ); 2110 aExp = extractFloat64Exp( a ); 2111 aSign = extractFloat64Sign( a ); 2112 bSig = extractFloat64Frac( b ); 2113 bExp = extractFloat64Exp( b ); 2114 bSign = extractFloat64Sign( b ); 2115 zSign = aSign ^ bSign; 2116 if ( aExp == 0x7FF ) { 2117 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2118 return propagateFloat64NaN( a, b ); 2119 } 2120 if ( ( bExp | bSig ) == 0 ) { 2121 float_raise( float_flag_invalid ); 2122 return float64_default_nan; 2123 } 2124 return packFloat64( zSign, 0x7FF, 0 ); 2125 } 2126 if ( bExp == 0x7FF ) { 2127 if ( bSig ) return propagateFloat64NaN( a, b ); 2128 if ( ( aExp | aSig ) == 0 ) { 2129 float_raise( float_flag_invalid ); 2130 return float64_default_nan; 2131 } 2132 return packFloat64( zSign, 0x7FF, 0 ); 2133 } 2134 if ( aExp == 0 ) { 2135 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2136 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2137 } 2138 if ( bExp == 0 ) { 2139 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2140 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2141 } 2142 zExp = aExp + bExp - 0x3FF; 2143 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2144 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2145 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2146 zSig0 |= ( zSig1 != 0 ); 2147 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2148 zSig0 <<= 1; 2149 --zExp; 2150 } 2151 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2152 2153} 2154 2155/* 2156------------------------------------------------------------------------------- 2157Returns the result of dividing the double-precision floating-point value `a' 2158by the corresponding value `b'. The operation is performed according to 2159the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2160------------------------------------------------------------------------------- 2161*/ 2162float64 float64_div( float64 a, float64 b ) 2163{ 2164 flag aSign, bSign, zSign; 2165 int16 aExp, bExp, zExp; 2166 bits64 aSig, bSig, zSig; 2167 bits64 rem0, rem1; 2168 bits64 term0, term1; 2169 2170 aSig = extractFloat64Frac( a ); 2171 aExp = extractFloat64Exp( a ); 2172 aSign = extractFloat64Sign( a ); 2173 bSig = extractFloat64Frac( b ); 2174 bExp = extractFloat64Exp( b ); 2175 bSign = extractFloat64Sign( b ); 2176 zSign = aSign ^ bSign; 2177 if ( aExp == 0x7FF ) { 2178 if ( aSig ) return propagateFloat64NaN( a, b ); 2179 if ( bExp == 0x7FF ) { 2180 if ( bSig ) return propagateFloat64NaN( a, b ); 2181 float_raise( float_flag_invalid ); 2182 return float64_default_nan; 2183 } 2184 return packFloat64( zSign, 0x7FF, 0 ); 2185 } 2186 if ( bExp == 0x7FF ) { 2187 if ( bSig ) return propagateFloat64NaN( a, b ); 2188 return packFloat64( zSign, 0, 0 ); 2189 } 2190 if ( bExp == 0 ) { 2191 if ( bSig == 0 ) { 2192 if ( ( aExp | aSig ) == 0 ) { 2193 float_raise( float_flag_invalid ); 2194 return float64_default_nan; 2195 } 2196 float_raise( float_flag_divbyzero ); 2197 return packFloat64( zSign, 0x7FF, 0 ); 2198 } 2199 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2200 } 2201 if ( aExp == 0 ) { 2202 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2203 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2204 } 2205 zExp = aExp - bExp + 0x3FD; 2206 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2207 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2208 if ( bSig <= ( aSig + aSig ) ) { 2209 aSig >>= 1; 2210 ++zExp; 2211 } 2212 zSig = estimateDiv128To64( aSig, 0, bSig ); 2213 if ( ( zSig & 0x1FF ) <= 2 ) { 2214 mul64To128( bSig, zSig, &term0, &term1 ); 2215 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2216 while ( (sbits64) rem0 < 0 ) { 2217 --zSig; 2218 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2219 } 2220 zSig |= ( rem1 != 0 ); 2221 } 2222 return roundAndPackFloat64( zSign, zExp, zSig ); 2223 2224} 2225 2226/* 2227------------------------------------------------------------------------------- 2228Returns the remainder of the double-precision floating-point value `a' 2229with respect to the corresponding value `b'. The operation is performed 2230according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2231------------------------------------------------------------------------------- 2232*/ 2233float64 float64_rem( float64 a, float64 b ) 2234{ 2235 flag aSign, bSign, zSign; 2236 int16 aExp, bExp, expDiff; 2237 bits64 aSig, bSig; 2238 bits64 q, alternateASig; 2239 sbits64 sigMean; 2240 2241 aSig = extractFloat64Frac( a ); 2242 aExp = extractFloat64Exp( a ); 2243 aSign = extractFloat64Sign( a ); 2244 bSig = extractFloat64Frac( b ); 2245 bExp = extractFloat64Exp( b ); 2246 bSign = extractFloat64Sign( b ); 2247 if ( aExp == 0x7FF ) { 2248 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2249 return propagateFloat64NaN( a, b ); 2250 } 2251 float_raise( float_flag_invalid ); 2252 return float64_default_nan; 2253 } 2254 if ( bExp == 0x7FF ) { 2255 if ( bSig ) return propagateFloat64NaN( a, b ); 2256 return a; 2257 } 2258 if ( bExp == 0 ) { 2259 if ( bSig == 0 ) { 2260 float_raise( float_flag_invalid ); 2261 return float64_default_nan; 2262 } 2263 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2264 } 2265 if ( aExp == 0 ) { 2266 if ( aSig == 0 ) return a; 2267 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2268 } 2269 expDiff = aExp - bExp; 2270 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 2271 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2272 if ( expDiff < 0 ) { 2273 if ( expDiff < -1 ) return a; 2274 aSig >>= 1; 2275 } 2276 q = ( bSig <= aSig ); 2277 if ( q ) aSig -= bSig; 2278 expDiff -= 64; 2279 while ( 0 < expDiff ) { 2280 q = estimateDiv128To64( aSig, 0, bSig ); 2281 q = ( 2 < q ) ? q - 2 : 0; 2282 aSig = - ( ( bSig>>2 ) * q ); 2283 expDiff -= 62; 2284 } 2285 expDiff += 64; 2286 if ( 0 < expDiff ) { 2287 q = estimateDiv128To64( aSig, 0, bSig ); 2288 q = ( 2 < q ) ? q - 2 : 0; 2289 q >>= 64 - expDiff; 2290 bSig >>= 2; 2291 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2292 } 2293 else { 2294 aSig >>= 2; 2295 bSig >>= 2; 2296 } 2297 do { 2298 alternateASig = aSig; 2299 ++q; 2300 aSig -= bSig; 2301 } while ( 0 <= (sbits64) aSig ); 2302 sigMean = aSig + alternateASig; 2303 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2304 aSig = alternateASig; 2305 } 2306 zSign = ( (sbits64) aSig < 0 ); 2307 if ( zSign ) aSig = - aSig; 2308 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 2309 2310} 2311 2312/* 2313------------------------------------------------------------------------------- 2314Returns the square root of the double-precision floating-point value `a'. 2315The operation is performed according to the IEC/IEEE Standard for Binary 2316Floating-point Arithmetic. 2317------------------------------------------------------------------------------- 2318*/ 2319float64 float64_sqrt( float64 a ) 2320{ 2321 flag aSign; 2322 int16 aExp, zExp; 2323 bits64 aSig, zSig; 2324 bits64 rem0, rem1, term0, term1; //, shiftedRem; 2325 //float64 z; 2326 2327 aSig = extractFloat64Frac( a ); 2328 aExp = extractFloat64Exp( a ); 2329 aSign = extractFloat64Sign( a ); 2330 if ( aExp == 0x7FF ) { 2331 if ( aSig ) return propagateFloat64NaN( a, a ); 2332 if ( ! aSign ) return a; 2333 float_raise( float_flag_invalid ); 2334 return float64_default_nan; 2335 } 2336 if ( aSign ) { 2337 if ( ( aExp | aSig ) == 0 ) return a; 2338 float_raise( float_flag_invalid ); 2339 return float64_default_nan; 2340 } 2341 if ( aExp == 0 ) { 2342 if ( aSig == 0 ) return 0; 2343 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2344 } 2345 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 2346 aSig |= LIT64( 0x0010000000000000 ); 2347 zSig = estimateSqrt32( aExp, aSig>>21 ); 2348 zSig <<= 31; 2349 aSig <<= 9 - ( aExp & 1 ); 2350 zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2; 2351 if ( ( zSig & 0x3FF ) <= 5 ) { 2352 if ( zSig < 2 ) { 2353 zSig = LIT64( 0xFFFFFFFFFFFFFFFF ); 2354 } 2355 else { 2356 aSig <<= 2; 2357 mul64To128( zSig, zSig, &term0, &term1 ); 2358 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2359 while ( (sbits64) rem0 < 0 ) { 2360 --zSig; 2361 shortShift128Left( 0, zSig, 1, &term0, &term1 ); 2362 term1 |= 1; 2363 add128( rem0, rem1, term0, term1, &rem0, &rem1 ); 2364 } 2365 zSig |= ( ( rem0 | rem1 ) != 0 ); 2366 } 2367 } 2368 shift64RightJamming( zSig, 1, &zSig ); 2369 return roundAndPackFloat64( 0, zExp, zSig ); 2370 2371} 2372 2373/* 2374------------------------------------------------------------------------------- 2375Returns 1 if the double-precision floating-point value `a' is equal to the 2376corresponding value `b', and 0 otherwise. The comparison is performed 2377according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2378------------------------------------------------------------------------------- 2379*/ 2380flag float64_eq( float64 a, float64 b ) 2381{ 2382 2383 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2384 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2385 ) { 2386 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 2387 float_raise( float_flag_invalid ); 2388 } 2389 return 0; 2390 } 2391 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2392 2393} 2394 2395/* 2396------------------------------------------------------------------------------- 2397Returns 1 if the double-precision floating-point value `a' is less than or 2398equal to the corresponding value `b', and 0 otherwise. The comparison is 2399performed according to the IEC/IEEE Standard for Binary Floating-point 2400Arithmetic. 2401------------------------------------------------------------------------------- 2402*/ 2403flag float64_le( float64 a, float64 b ) 2404{ 2405 flag aSign, bSign; 2406 2407 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2408 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2409 ) { 2410 float_raise( float_flag_invalid ); 2411 return 0; 2412 } 2413 aSign = extractFloat64Sign( a ); 2414 bSign = extractFloat64Sign( b ); 2415 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2416 return ( a == b ) || ( aSign ^ ( a < b ) ); 2417 2418} 2419 2420/* 2421------------------------------------------------------------------------------- 2422Returns 1 if the double-precision floating-point value `a' is less than 2423the corresponding value `b', and 0 otherwise. The comparison is performed 2424according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2425------------------------------------------------------------------------------- 2426*/ 2427flag float64_lt( float64 a, float64 b ) 2428{ 2429 flag aSign, bSign; 2430 2431 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2432 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2433 ) { 2434 float_raise( float_flag_invalid ); 2435 return 0; 2436 } 2437 aSign = extractFloat64Sign( a ); 2438 bSign = extractFloat64Sign( b ); 2439 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 2440 return ( a != b ) && ( aSign ^ ( a < b ) ); 2441 2442} 2443 2444/* 2445------------------------------------------------------------------------------- 2446Returns 1 if the double-precision floating-point value `a' is equal to the 2447corresponding value `b', and 0 otherwise. The invalid exception is raised 2448if either operand is a NaN. Otherwise, the comparison is performed 2449according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2450------------------------------------------------------------------------------- 2451*/ 2452flag float64_eq_signaling( float64 a, float64 b ) 2453{ 2454 2455 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2456 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2457 ) { 2458 float_raise( float_flag_invalid ); 2459 return 0; 2460 } 2461 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2462 2463} 2464 2465/* 2466------------------------------------------------------------------------------- 2467Returns 1 if the double-precision floating-point value `a' is less than or 2468equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2469cause an exception. Otherwise, the comparison is performed according to the 2470IEC/IEEE Standard for Binary Floating-point Arithmetic. 2471------------------------------------------------------------------------------- 2472*/ 2473flag float64_le_quiet( float64 a, float64 b ) 2474{ 2475 flag aSign, bSign; 2476 //int16 aExp, bExp; 2477 2478 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2479 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2480 ) { 2481 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 2482 float_raise( float_flag_invalid ); 2483 } 2484 return 0; 2485 } 2486 aSign = extractFloat64Sign( a ); 2487 bSign = extractFloat64Sign( b ); 2488 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2489 return ( a == b ) || ( aSign ^ ( a < b ) ); 2490 2491} 2492 2493/* 2494------------------------------------------------------------------------------- 2495Returns 1 if the double-precision floating-point value `a' is less than 2496the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2497exception. Otherwise, the comparison is performed according to the IEC/IEEE 2498Standard for Binary Floating-point Arithmetic. 2499------------------------------------------------------------------------------- 2500*/ 2501flag float64_lt_quiet( float64 a, float64 b ) 2502{ 2503 flag aSign, bSign; 2504 2505 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2506 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2507 ) { 2508 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 2509 float_raise( float_flag_invalid ); 2510 } 2511 return 0; 2512 } 2513 aSign = extractFloat64Sign( a ); 2514 bSign = extractFloat64Sign( b ); 2515 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 2516 return ( a != b ) && ( aSign ^ ( a < b ) ); 2517 2518} 2519 2520#ifdef FLOATX80 2521 2522/* 2523------------------------------------------------------------------------------- 2524Returns the result of converting the extended double-precision floating- 2525point value `a' to the 32-bit two's complement integer format. The 2526conversion is performed according to the IEC/IEEE Standard for Binary 2527Floating-point Arithmetic---which means in particular that the conversion 2528is rounded according to the current rounding mode. If `a' is a NaN, the 2529largest positive integer is returned. Otherwise, if the conversion 2530overflows, the largest integer with the same sign as `a' is returned. 2531------------------------------------------------------------------------------- 2532*/ 2533int32 floatx80_to_int32( floatx80 a ) 2534{ 2535 flag aSign; 2536 int32 aExp, shiftCount; 2537 bits64 aSig; 2538 2539 aSig = extractFloatx80Frac( a ); 2540 aExp = extractFloatx80Exp( a ); 2541 aSign = extractFloatx80Sign( a ); 2542 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 2543 shiftCount = 0x4037 - aExp; 2544 if ( shiftCount <= 0 ) shiftCount = 1; 2545 shift64RightJamming( aSig, shiftCount, &aSig ); 2546 return roundAndPackInt32( aSign, aSig ); 2547 2548} 2549 2550/* 2551------------------------------------------------------------------------------- 2552Returns the result of converting the extended double-precision floating- 2553point value `a' to the 32-bit two's complement integer format. The 2554conversion is performed according to the IEC/IEEE Standard for Binary 2555Floating-point Arithmetic, except that the conversion is always rounded 2556toward zero. If `a' is a NaN, the largest positive integer is returned. 2557Otherwise, if the conversion overflows, the largest integer with the same 2558sign as `a' is returned. 2559------------------------------------------------------------------------------- 2560*/ 2561int32 floatx80_to_int32_round_to_zero( floatx80 a ) 2562{ 2563 flag aSign; 2564 int32 aExp, shiftCount; 2565 bits64 aSig, savedASig; 2566 int32 z; 2567 2568 aSig = extractFloatx80Frac( a ); 2569 aExp = extractFloatx80Exp( a ); 2570 aSign = extractFloatx80Sign( a ); 2571 shiftCount = 0x403E - aExp; 2572 if ( shiftCount < 32 ) { 2573 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 2574 goto invalid; 2575 } 2576 else if ( 63 < shiftCount ) { 2577 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 2578 return 0; 2579 } 2580 savedASig = aSig; 2581 aSig >>= shiftCount; 2582 z = aSig; 2583 if ( aSign ) z = - z; 2584 if ( ( z < 0 ) ^ aSign ) { 2585 invalid: 2586 float_exception_flags |= float_flag_invalid; 2587 return aSign ? 0x80000000 : 0x7FFFFFFF; 2588 } 2589 if ( ( aSig<<shiftCount ) != savedASig ) { 2590 float_exception_flags |= float_flag_inexact; 2591 } 2592 return z; 2593 2594} 2595 2596/* 2597------------------------------------------------------------------------------- 2598Returns the result of converting the extended double-precision floating- 2599point value `a' to the single-precision floating-point format. The 2600conversion is performed according to the IEC/IEEE Standard for Binary 2601Floating-point Arithmetic. 2602------------------------------------------------------------------------------- 2603*/ 2604float32 floatx80_to_float32( floatx80 a ) 2605{ 2606 flag aSign; 2607 int32 aExp; 2608 bits64 aSig; 2609 2610 aSig = extractFloatx80Frac( a ); 2611 aExp = extractFloatx80Exp( a ); 2612 aSign = extractFloatx80Sign( a ); 2613 if ( aExp == 0x7FFF ) { 2614 if ( (bits64) ( aSig<<1 ) ) { 2615 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 2616 } 2617 return packFloat32( aSign, 0xFF, 0 ); 2618 } 2619 shift64RightJamming( aSig, 33, &aSig ); 2620 if ( aExp || aSig ) aExp -= 0x3F81; 2621 return roundAndPackFloat32( aSign, aExp, aSig ); 2622 2623} 2624 2625/* 2626------------------------------------------------------------------------------- 2627Returns the result of converting the extended double-precision floating- 2628point value `a' to the double-precision floating-point format. The 2629conversion is performed according to the IEC/IEEE Standard for Binary 2630Floating-point Arithmetic. 2631------------------------------------------------------------------------------- 2632*/ 2633float64 floatx80_to_float64( floatx80 a ) 2634{ 2635 flag aSign; 2636 int32 aExp; 2637 bits64 aSig, zSig; 2638 2639 aSig = extractFloatx80Frac( a ); 2640 aExp = extractFloatx80Exp( a ); 2641 aSign = extractFloatx80Sign( a ); 2642 if ( aExp == 0x7FFF ) { 2643 if ( (bits64) ( aSig<<1 ) ) { 2644 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 2645 } 2646 return packFloat64( aSign, 0x7FF, 0 ); 2647 } 2648 shift64RightJamming( aSig, 1, &zSig ); 2649 if ( aExp || aSig ) aExp -= 0x3C01; 2650 return roundAndPackFloat64( aSign, aExp, zSig ); 2651 2652} 2653 2654/* 2655------------------------------------------------------------------------------- 2656Rounds the extended double-precision floating-point value `a' to an integer, 2657and returns the result as an extended quadruple-precision floating-point 2658value. The operation is performed according to the IEC/IEEE Standard for 2659Binary Floating-point Arithmetic. 2660------------------------------------------------------------------------------- 2661*/ 2662floatx80 floatx80_round_to_int( floatx80 a ) 2663{ 2664 flag aSign; 2665 int32 aExp; 2666 bits64 lastBitMask, roundBitsMask; 2667 int8 roundingMode; 2668 floatx80 z; 2669 2670 aExp = extractFloatx80Exp( a ); 2671 if ( 0x403E <= aExp ) { 2672 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 2673 return propagateFloatx80NaN( a, a ); 2674 } 2675 return a; 2676 } 2677 if ( aExp <= 0x3FFE ) { 2678 if ( ( aExp == 0 ) 2679 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 2680 return a; 2681 } 2682 float_exception_flags |= float_flag_inexact; 2683 aSign = extractFloatx80Sign( a ); 2684 switch ( float_rounding_mode ) { 2685 case float_round_nearest_even: 2686 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 2687 ) { 2688 return 2689 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 2690 } 2691 break; 2692 case float_round_down: 2693 return 2694 aSign ? 2695 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 2696 : packFloatx80( 0, 0, 0 ); 2697 case float_round_up: 2698 return 2699 aSign ? packFloatx80( 1, 0, 0 ) 2700 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 2701 } 2702 return packFloatx80( aSign, 0, 0 ); 2703 } 2704 lastBitMask = 1; 2705 lastBitMask <<= 0x403E - aExp; 2706 roundBitsMask = lastBitMask - 1; 2707 z = a; 2708 roundingMode = float_rounding_mode; 2709 if ( roundingMode == float_round_nearest_even ) { 2710 z.low += lastBitMask>>1; 2711 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 2712 } 2713 else if ( roundingMode != float_round_to_zero ) { 2714 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2715 z.low += roundBitsMask; 2716 } 2717 } 2718 z.low &= ~ roundBitsMask; 2719 if ( z.low == 0 ) { 2720 ++z.high; 2721 z.low = LIT64( 0x8000000000000000 ); 2722 } 2723 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact; 2724 return z; 2725 2726} 2727 2728/* 2729------------------------------------------------------------------------------- 2730Returns the result of adding the absolute values of the extended double- 2731precision floating-point values `a' and `b'. If `zSign' is true, the sum is 2732negated before being returned. `zSign' is ignored if the result is a NaN. 2733The addition is performed according to the IEC/IEEE Standard for Binary 2734Floating-point Arithmetic. 2735------------------------------------------------------------------------------- 2736*/ 2737static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 2738{ 2739 int32 aExp, bExp, zExp; 2740 bits64 aSig, bSig, zSig0, zSig1; 2741 int32 expDiff; 2742 2743 aSig = extractFloatx80Frac( a ); 2744 aExp = extractFloatx80Exp( a ); 2745 bSig = extractFloatx80Frac( b ); 2746 bExp = extractFloatx80Exp( b ); 2747 expDiff = aExp - bExp; 2748 if ( 0 < expDiff ) { 2749 if ( aExp == 0x7FFF ) { 2750 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2751 return a; 2752 } 2753 if ( bExp == 0 ) --expDiff; 2754 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 2755 zExp = aExp; 2756 } 2757 else if ( expDiff < 0 ) { 2758 if ( bExp == 0x7FFF ) { 2759 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2760 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2761 } 2762 if ( aExp == 0 ) ++expDiff; 2763 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 2764 zExp = bExp; 2765 } 2766 else { 2767 if ( aExp == 0x7FFF ) { 2768 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 2769 return propagateFloatx80NaN( a, b ); 2770 } 2771 return a; 2772 } 2773 zSig1 = 0; 2774 zSig0 = aSig + bSig; 2775 if ( aExp == 0 ) { 2776 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 2777 goto roundAndPack; 2778 } 2779 zExp = aExp; 2780 goto shiftRight1; 2781 } 2782 2783 zSig0 = aSig + bSig; 2784 2785 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 2786 shiftRight1: 2787 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 2788 zSig0 |= LIT64( 0x8000000000000000 ); 2789 ++zExp; 2790 roundAndPack: 2791 return 2792 roundAndPackFloatx80( 2793 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 2794 2795} 2796 2797/* 2798------------------------------------------------------------------------------- 2799Returns the result of subtracting the absolute values of the extended 2800double-precision floating-point values `a' and `b'. If `zSign' is true, 2801the difference is negated before being returned. `zSign' is ignored if the 2802result is a NaN. The subtraction is performed according to the IEC/IEEE 2803Standard for Binary Floating-point Arithmetic. 2804------------------------------------------------------------------------------- 2805*/ 2806static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 2807{ 2808 int32 aExp, bExp, zExp; 2809 bits64 aSig, bSig, zSig0, zSig1; 2810 int32 expDiff; 2811 floatx80 z; 2812 2813 aSig = extractFloatx80Frac( a ); 2814 aExp = extractFloatx80Exp( a ); 2815 bSig = extractFloatx80Frac( b ); 2816 bExp = extractFloatx80Exp( b ); 2817 expDiff = aExp - bExp; 2818 if ( 0 < expDiff ) goto aExpBigger; 2819 if ( expDiff < 0 ) goto bExpBigger; 2820 if ( aExp == 0x7FFF ) { 2821 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 2822 return propagateFloatx80NaN( a, b ); 2823 } 2824 float_raise( float_flag_invalid ); 2825 z.low = floatx80_default_nan_low; 2826 z.high = floatx80_default_nan_high; 2827 return z; 2828 } 2829 if ( aExp == 0 ) { 2830 aExp = 1; 2831 bExp = 1; 2832 } 2833 zSig1 = 0; 2834 if ( bSig < aSig ) goto aBigger; 2835 if ( aSig < bSig ) goto bBigger; 2836 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 ); 2837 bExpBigger: 2838 if ( bExp == 0x7FFF ) { 2839 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2840 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2841 } 2842 if ( aExp == 0 ) ++expDiff; 2843 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 2844 bBigger: 2845 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 2846 zExp = bExp; 2847 zSign ^= 1; 2848 goto normalizeRoundAndPack; 2849 aExpBigger: 2850 if ( aExp == 0x7FFF ) { 2851 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2852 return a; 2853 } 2854 if ( bExp == 0 ) --expDiff; 2855 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 2856 aBigger: 2857 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 2858 zExp = aExp; 2859 normalizeRoundAndPack: 2860 return 2861 normalizeRoundAndPackFloatx80( 2862 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 2863 2864} 2865 2866/* 2867------------------------------------------------------------------------------- 2868Returns the result of adding the extended double-precision floating-point 2869values `a' and `b'. The operation is performed according to the IEC/IEEE 2870Standard for Binary Floating-point Arithmetic. 2871------------------------------------------------------------------------------- 2872*/ 2873floatx80 floatx80_add( floatx80 a, floatx80 b ) 2874{ 2875 flag aSign, bSign; 2876 2877 aSign = extractFloatx80Sign( a ); 2878 bSign = extractFloatx80Sign( b ); 2879 if ( aSign == bSign ) { 2880 return addFloatx80Sigs( a, b, aSign ); 2881 } 2882 else { 2883 return subFloatx80Sigs( a, b, aSign ); 2884 } 2885 2886} 2887 2888/* 2889------------------------------------------------------------------------------- 2890Returns the result of subtracting the extended double-precision floating- 2891point values `a' and `b'. The operation is performed according to the 2892IEC/IEEE Standard for Binary Floating-point Arithmetic. 2893------------------------------------------------------------------------------- 2894*/ 2895floatx80 floatx80_sub( floatx80 a, floatx80 b ) 2896{ 2897 flag aSign, bSign; 2898 2899 aSign = extractFloatx80Sign( a ); 2900 bSign = extractFloatx80Sign( b ); 2901 if ( aSign == bSign ) { 2902 return subFloatx80Sigs( a, b, aSign ); 2903 } 2904 else { 2905 return addFloatx80Sigs( a, b, aSign ); 2906 } 2907 2908} 2909 2910/* 2911------------------------------------------------------------------------------- 2912Returns the result of multiplying the extended double-precision floating- 2913point values `a' and `b'. The operation is performed according to the 2914IEC/IEEE Standard for Binary Floating-point Arithmetic. 2915------------------------------------------------------------------------------- 2916*/ 2917floatx80 floatx80_mul( floatx80 a, floatx80 b ) 2918{ 2919 flag aSign, bSign, zSign; 2920 int32 aExp, bExp, zExp; 2921 bits64 aSig, bSig, zSig0, zSig1; 2922 floatx80 z; 2923 2924 aSig = extractFloatx80Frac( a ); 2925 aExp = extractFloatx80Exp( a ); 2926 aSign = extractFloatx80Sign( a ); 2927 bSig = extractFloatx80Frac( b ); 2928 bExp = extractFloatx80Exp( b ); 2929 bSign = extractFloatx80Sign( b ); 2930 zSign = aSign ^ bSign; 2931 if ( aExp == 0x7FFF ) { 2932 if ( (bits64) ( aSig<<1 ) 2933 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 2934 return propagateFloatx80NaN( a, b ); 2935 } 2936 if ( ( bExp | bSig ) == 0 ) goto invalid; 2937 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2938 } 2939 if ( bExp == 0x7FFF ) { 2940 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2941 if ( ( aExp | aSig ) == 0 ) { 2942 invalid: 2943 float_raise( float_flag_invalid ); 2944 z.low = floatx80_default_nan_low; 2945 z.high = floatx80_default_nan_high; 2946 return z; 2947 } 2948 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2949 } 2950 if ( aExp == 0 ) { 2951 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 2952 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 2953 } 2954 if ( bExp == 0 ) { 2955 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 2956 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 2957 } 2958 zExp = aExp + bExp - 0x3FFE; 2959 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2960 if ( 0 < (sbits64) zSig0 ) { 2961 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 2962 --zExp; 2963 } 2964 return 2965 roundAndPackFloatx80( 2966 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 2967 2968} 2969 2970/* 2971------------------------------------------------------------------------------- 2972Returns the result of dividing the extended double-precision floating-point 2973value `a' by the corresponding value `b'. The operation is performed 2974according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2975------------------------------------------------------------------------------- 2976*/ 2977floatx80 floatx80_div( floatx80 a, floatx80 b ) 2978{ 2979 flag aSign, bSign, zSign; 2980 int32 aExp, bExp, zExp; 2981 bits64 aSig, bSig, zSig0, zSig1; 2982 bits64 rem0, rem1, rem2, term0, term1, term2; 2983 floatx80 z; 2984 2985 aSig = extractFloatx80Frac( a ); 2986 aExp = extractFloatx80Exp( a ); 2987 aSign = extractFloatx80Sign( a ); 2988 bSig = extractFloatx80Frac( b ); 2989 bExp = extractFloatx80Exp( b ); 2990 bSign = extractFloatx80Sign( b ); 2991 zSign = aSign ^ bSign; 2992 if ( aExp == 0x7FFF ) { 2993 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2994 if ( bExp == 0x7FFF ) { 2995 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2996 goto invalid; 2997 } 2998 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2999 } 3000 if ( bExp == 0x7FFF ) { 3001 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3002 return packFloatx80( zSign, 0, 0 ); 3003 } 3004 if ( bExp == 0 ) { 3005 if ( bSig == 0 ) { 3006 if ( ( aExp | aSig ) == 0 ) { 3007 invalid: 3008 float_raise( float_flag_invalid ); 3009 z.low = floatx80_default_nan_low; 3010 z.high = floatx80_default_nan_high; 3011 return z; 3012 } 3013 float_raise( float_flag_divbyzero ); 3014 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3015 } 3016 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3017 } 3018 if ( aExp == 0 ) { 3019 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3020 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3021 } 3022 zExp = aExp - bExp + 0x3FFE; 3023 rem1 = 0; 3024 if ( bSig <= aSig ) { 3025 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3026 ++zExp; 3027 } 3028 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3029 mul64To128( bSig, zSig0, &term0, &term1 ); 3030 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3031 while ( (sbits64) rem0 < 0 ) { 3032 --zSig0; 3033 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3034 } 3035 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3036 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3037 mul64To128( bSig, zSig1, &term1, &term2 ); 3038 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3039 while ( (sbits64) rem1 < 0 ) { 3040 --zSig1; 3041 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3042 } 3043 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3044 } 3045 return 3046 roundAndPackFloatx80( 3047 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3048 3049} 3050 3051/* 3052------------------------------------------------------------------------------- 3053Returns the remainder of the extended double-precision floating-point value 3054`a' with respect to the corresponding value `b'. The operation is performed 3055according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 3056------------------------------------------------------------------------------- 3057*/ 3058floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3059{ 3060 flag aSign, bSign, zSign; 3061 int32 aExp, bExp, expDiff; 3062 bits64 aSig0, aSig1, bSig; 3063 bits64 q, term0, term1, alternateASig0, alternateASig1; 3064 floatx80 z; 3065 3066 aSig0 = extractFloatx80Frac( a ); 3067 aExp = extractFloatx80Exp( a ); 3068 aSign = extractFloatx80Sign( a ); 3069 bSig = extractFloatx80Frac( b ); 3070 bExp = extractFloatx80Exp( b ); 3071 bSign = extractFloatx80Sign( b ); 3072 if ( aExp == 0x7FFF ) { 3073 if ( (bits64) ( aSig0<<1 ) 3074 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3075 return propagateFloatx80NaN( a, b ); 3076 } 3077 goto invalid; 3078 } 3079 if ( bExp == 0x7FFF ) { 3080 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3081 return a; 3082 } 3083 if ( bExp == 0 ) { 3084 if ( bSig == 0 ) { 3085 invalid: 3086 float_raise( float_flag_invalid ); 3087 z.low = floatx80_default_nan_low; 3088 z.high = floatx80_default_nan_high; 3089 return z; 3090 } 3091 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3092 } 3093 if ( aExp == 0 ) { 3094 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3095 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3096 } 3097 bSig |= LIT64( 0x8000000000000000 ); 3098 zSign = aSign; 3099 expDiff = aExp - bExp; 3100 aSig1 = 0; 3101 if ( expDiff < 0 ) { 3102 if ( expDiff < -1 ) return a; 3103 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3104 expDiff = 0; 3105 } 3106 q = ( bSig <= aSig0 ); 3107 if ( q ) aSig0 -= bSig; 3108 expDiff -= 64; 3109 while ( 0 < expDiff ) { 3110 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3111 q = ( 2 < q ) ? q - 2 : 0; 3112 mul64To128( bSig, q, &term0, &term1 ); 3113 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3114 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3115 expDiff -= 62; 3116 } 3117 expDiff += 64; 3118 if ( 0 < expDiff ) { 3119 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3120 q = ( 2 < q ) ? q - 2 : 0; 3121 q >>= 64 - expDiff; 3122 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 3123 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3124 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 3125 while ( le128( term0, term1, aSig0, aSig1 ) ) { 3126 ++q; 3127 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3128 } 3129 } 3130 else { 3131 term1 = 0; 3132 term0 = bSig; 3133 } 3134 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 3135 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3136 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3137 && ( q & 1 ) ) 3138 ) { 3139 aSig0 = alternateASig0; 3140 aSig1 = alternateASig1; 3141 zSign = ! zSign; 3142 } 3143 return 3144 normalizeRoundAndPackFloatx80( 3145 80, zSign, bExp + expDiff, aSig0, aSig1 ); 3146 3147} 3148 3149/* 3150------------------------------------------------------------------------------- 3151Returns the square root of the extended double-precision floating-point 3152value `a'. The operation is performed according to the IEC/IEEE Standard 3153for Binary Floating-point Arithmetic. 3154------------------------------------------------------------------------------- 3155*/ 3156floatx80 floatx80_sqrt( floatx80 a ) 3157{ 3158 flag aSign; 3159 int32 aExp, zExp; 3160 bits64 aSig0, aSig1, zSig0, zSig1; 3161 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 3162 bits64 shiftedRem0, shiftedRem1; 3163 floatx80 z; 3164 3165 aSig0 = extractFloatx80Frac( a ); 3166 aExp = extractFloatx80Exp( a ); 3167 aSign = extractFloatx80Sign( a ); 3168 if ( aExp == 0x7FFF ) { 3169 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 3170 if ( ! aSign ) return a; 3171 goto invalid; 3172 } 3173 if ( aSign ) { 3174 if ( ( aExp | aSig0 ) == 0 ) return a; 3175 invalid: 3176 float_raise( float_flag_invalid ); 3177 z.low = floatx80_default_nan_low; 3178 z.high = floatx80_default_nan_high; 3179 return z; 3180 } 3181 if ( aExp == 0 ) { 3182 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 3183 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3184 } 3185 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 3186 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 3187 zSig0 <<= 31; 3188 aSig1 = 0; 3189 shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 ); 3190 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4; 3191 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF ); 3192 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 ); 3193 mul64To128( zSig0, zSig0, &term0, &term1 ); 3194 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 3195 while ( (sbits64) rem0 < 0 ) { 3196 --zSig0; 3197 shortShift128Left( 0, zSig0, 1, &term0, &term1 ); 3198 term1 |= 1; 3199 add128( rem0, rem1, term0, term1, &rem0, &rem1 ); 3200 } 3201 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 ); 3202 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 ); 3203 if ( (bits64) ( zSig1<<1 ) <= 10 ) { 3204 if ( zSig1 == 0 ) zSig1 = 1; 3205 mul64To128( zSig0, zSig1, &term1, &term2 ); 3206 shortShift128Left( term1, term2, 1, &term1, &term2 ); 3207 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3208 mul64To128( zSig1, zSig1, &term2, &term3 ); 3209 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 3210 while ( (sbits64) rem1 < 0 ) { 3211 --zSig1; 3212 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 ); 3213 term3 |= 1; 3214 add192( 3215 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 ); 3216 } 3217 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 3218 } 3219 return 3220 roundAndPackFloatx80( 3221 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 3222 3223} 3224 3225/* 3226------------------------------------------------------------------------------- 3227Returns 1 if the extended double-precision floating-point value `a' is 3228equal to the corresponding value `b', and 0 otherwise. The comparison is 3229performed according to the IEC/IEEE Standard for Binary Floating-point 3230Arithmetic. 3231------------------------------------------------------------------------------- 3232*/ 3233flag floatx80_eq( floatx80 a, floatx80 b ) 3234{ 3235 3236 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3237 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3238 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3239 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3240 ) { 3241 if ( floatx80_is_signaling_nan( a ) 3242 || floatx80_is_signaling_nan( b ) ) { 3243 float_raise( float_flag_invalid ); 3244 } 3245 return 0; 3246 } 3247 return 3248 ( a.low == b.low ) 3249 && ( ( a.high == b.high ) 3250 || ( ( a.low == 0 ) 3251 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 3252 ); 3253 3254} 3255 3256/* 3257------------------------------------------------------------------------------- 3258Returns 1 if the extended double-precision floating-point value `a' is 3259less than or equal to the corresponding value `b', and 0 otherwise. The 3260comparison is performed according to the IEC/IEEE Standard for Binary 3261Floating-point Arithmetic. 3262------------------------------------------------------------------------------- 3263*/ 3264flag floatx80_le( floatx80 a, floatx80 b ) 3265{ 3266 flag aSign, bSign; 3267 3268 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3269 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3270 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3271 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3272 ) { 3273 float_raise( float_flag_invalid ); 3274 return 0; 3275 } 3276 aSign = extractFloatx80Sign( a ); 3277 bSign = extractFloatx80Sign( b ); 3278 if ( aSign != bSign ) { 3279 return 3280 aSign 3281 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 3282 == 0 ); 3283 } 3284 return 3285 aSign ? le128( b.high, b.low, a.high, a.low ) 3286 : le128( a.high, a.low, b.high, b.low ); 3287 3288} 3289 3290/* 3291------------------------------------------------------------------------------- 3292Returns 1 if the extended double-precision floating-point value `a' is 3293less than the corresponding value `b', and 0 otherwise. The comparison 3294is performed according to the IEC/IEEE Standard for Binary Floating-point 3295Arithmetic. 3296------------------------------------------------------------------------------- 3297*/ 3298flag floatx80_lt( floatx80 a, floatx80 b ) 3299{ 3300 flag aSign, bSign; 3301 3302 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3303 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3304 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3305 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3306 ) { 3307 float_raise( float_flag_invalid ); 3308 return 0; 3309 } 3310 aSign = extractFloatx80Sign( a ); 3311 bSign = extractFloatx80Sign( b ); 3312 if ( aSign != bSign ) { 3313 return 3314 aSign 3315 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 3316 != 0 ); 3317 } 3318 return 3319 aSign ? lt128( b.high, b.low, a.high, a.low ) 3320 : lt128( a.high, a.low, b.high, b.low ); 3321 3322} 3323 3324/* 3325------------------------------------------------------------------------------- 3326Returns 1 if the extended double-precision floating-point value `a' is equal 3327to the corresponding value `b', and 0 otherwise. The invalid exception is 3328raised if either operand is a NaN. Otherwise, the comparison is performed 3329according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 3330------------------------------------------------------------------------------- 3331*/ 3332flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 3333{ 3334 3335 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3336 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3337 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3338 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3339 ) { 3340 float_raise( float_flag_invalid ); 3341 return 0; 3342 } 3343 return 3344 ( a.low == b.low ) 3345 && ( ( a.high == b.high ) 3346 || ( ( a.low == 0 ) 3347 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 3348 ); 3349 3350} 3351 3352/* 3353------------------------------------------------------------------------------- 3354Returns 1 if the extended double-precision floating-point value `a' is less 3355than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 3356do not cause an exception. Otherwise, the comparison is performed according 3357to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 3358------------------------------------------------------------------------------- 3359*/ 3360flag floatx80_le_quiet( floatx80 a, floatx80 b ) 3361{ 3362 flag aSign, bSign; 3363 3364 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3365 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3366 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3367 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3368 ) { 3369 if ( floatx80_is_signaling_nan( a ) 3370 || floatx80_is_signaling_nan( b ) ) { 3371 float_raise( float_flag_invalid ); 3372 } 3373 return 0; 3374 } 3375 aSign = extractFloatx80Sign( a ); 3376 bSign = extractFloatx80Sign( b ); 3377 if ( aSign != bSign ) { 3378 return 3379 aSign 3380 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 3381 == 0 ); 3382 } 3383 return 3384 aSign ? le128( b.high, b.low, a.high, a.low ) 3385 : le128( a.high, a.low, b.high, b.low ); 3386 3387} 3388 3389/* 3390------------------------------------------------------------------------------- 3391Returns 1 if the extended double-precision floating-point value `a' is less 3392than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 3393an exception. Otherwise, the comparison is performed according to the 3394IEC/IEEE Standard for Binary Floating-point Arithmetic. 3395------------------------------------------------------------------------------- 3396*/ 3397flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 3398{ 3399 flag aSign, bSign; 3400 3401 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3402 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3403 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3404 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3405 ) { 3406 if ( floatx80_is_signaling_nan( a ) 3407 || floatx80_is_signaling_nan( b ) ) { 3408 float_raise( float_flag_invalid ); 3409 } 3410 return 0; 3411 } 3412 aSign = extractFloatx80Sign( a ); 3413 bSign = extractFloatx80Sign( b ); 3414 if ( aSign != bSign ) { 3415 return 3416 aSign 3417 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 3418 != 0 ); 3419 } 3420 return 3421 aSign ? lt128( b.high, b.low, a.high, a.low ) 3422 : lt128( a.high, a.low, b.high, b.low ); 3423 3424} 3425 3426#endif 3427