1/* 2=============================================================================== 3 4This C source file is part of the SoftFloat IEC/IEEE Floating-point 5Arithmetic Package, Release 2. 6 7Written by John R. Hauser. This work was made possible in part by the 8International Computer Science Institute, located at Suite 600, 1947 Center 9Street, Berkeley, California 94704. Funding was partially provided by the 10National Science Foundation under grant MIP-9311980. The original version 11of this code was written as part of a project to build a fixed-point vector 12processor in collaboration with the University of California at Berkeley, 13overseen by Profs. Nelson Morgan and John Wawrzynek. More information 14is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 15arithmetic/softfloat.html'. 16 17THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 18has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 19TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 20PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 21AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 22 23Derivative works are acceptable, even for commercial purposes, so long as 24(1) they include prominent notice that the work is derivative, and (2) they 25include prominent notice akin to these three paragraphs for those parts of 26this code that are retained. 27 28=============================================================================== 29*/ 30 31#include <asm/div64.h> 32 33#include "fpa11.h" 34//#include "milieu.h" 35//#include "softfloat.h" 36 37/* 38------------------------------------------------------------------------------- 39Primitive arithmetic functions, including multi-word arithmetic, and 40division and square root approximations. (Can be specialized to target if 41desired.) 42------------------------------------------------------------------------------- 43*/ 44#include "softfloat-macros" 45 46/* 47------------------------------------------------------------------------------- 48Functions and definitions to determine: (1) whether tininess for underflow 49is detected before or after rounding by default, (2) what (if anything) 50happens when exceptions are raised, (3) how signaling NaNs are distinguished 51from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 52are propagated from function inputs to output. These details are target- 53specific. 54------------------------------------------------------------------------------- 55*/ 56#include "softfloat-specialize" 57 58/* 59------------------------------------------------------------------------------- 60Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 61and 7, and returns the properly rounded 32-bit integer corresponding to the 62input. If `zSign' is nonzero, the input is negated before being converted 63to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point 64input is simply rounded to an integer, with the inexact exception raised if 65the input cannot be represented exactly as an integer. If the fixed-point 66input is too large, however, the invalid exception is raised and the largest 67positive or negative integer is returned. 68------------------------------------------------------------------------------- 69*/ 70static int32 roundAndPackInt32( struct roundingData *roundData, flag zSign, bits64 absZ ) 71{ 72 int8 roundingMode; 73 flag roundNearestEven; 74 int8 roundIncrement, roundBits; 75 int32 z; 76 77 roundingMode = roundData->mode; 78 roundNearestEven = ( roundingMode == float_round_nearest_even ); 79 roundIncrement = 0x40; 80 if ( ! roundNearestEven ) { 81 if ( roundingMode == float_round_to_zero ) { 82 roundIncrement = 0; 83 } 84 else { 85 roundIncrement = 0x7F; 86 if ( zSign ) { 87 if ( roundingMode == float_round_up ) roundIncrement = 0; 88 } 89 else { 90 if ( roundingMode == float_round_down ) roundIncrement = 0; 91 } 92 } 93 } 94 roundBits = absZ & 0x7F; 95 absZ = ( absZ + roundIncrement )>>7; 96 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 97 z = absZ; 98 if ( zSign ) z = - z; 99 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 100 roundData->exception |= float_flag_invalid; 101 return zSign ? 0x80000000 : 0x7FFFFFFF; 102 } 103 if ( roundBits ) roundData->exception |= float_flag_inexact; 104 return z; 105 106} 107 108/* 109------------------------------------------------------------------------------- 110Returns the fraction bits of the single-precision floating-point value `a'. 111------------------------------------------------------------------------------- 112*/ 113INLINE bits32 extractFloat32Frac( float32 a ) 114{ 115 116 return a & 0x007FFFFF; 117 118} 119 120/* 121------------------------------------------------------------------------------- 122Returns the exponent bits of the single-precision floating-point value `a'. 123------------------------------------------------------------------------------- 124*/ 125INLINE int16 extractFloat32Exp( float32 a ) 126{ 127 128 return ( a>>23 ) & 0xFF; 129 130} 131 132/* 133------------------------------------------------------------------------------- 134Returns the sign bit of the single-precision floating-point value `a'. 135------------------------------------------------------------------------------- 136*/ 137 138/* 139------------------------------------------------------------------------------- 140Normalizes the subnormal single-precision floating-point value represented 141by the denormalized significand `aSig'. The normalized exponent and 142significand are stored at the locations pointed to by `zExpPtr' and 143`zSigPtr', respectively. 144------------------------------------------------------------------------------- 145*/ 146static void 147 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 148{ 149 int8 shiftCount; 150 151 shiftCount = countLeadingZeros32( aSig ) - 8; 152 *zSigPtr = aSig<<shiftCount; 153 *zExpPtr = 1 - shiftCount; 154 155} 156 157/* 158------------------------------------------------------------------------------- 159Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 160single-precision floating-point value, returning the result. After being 161shifted into the proper positions, the three fields are simply added 162together to form the result. This means that any integer portion of `zSig' 163will be added into the exponent. Since a properly normalized significand 164will have an integer portion equal to 1, the `zExp' input should be 1 less 165than the desired result exponent whenever `zSig' is a complete, normalized 166significand. 167------------------------------------------------------------------------------- 168*/ 169INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 170{ 171 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 172} 173 174/* 175------------------------------------------------------------------------------- 176Takes an abstract floating-point value having sign `zSign', exponent `zExp', 177and significand `zSig', and returns the proper single-precision floating- 178point value corresponding to the abstract input. Ordinarily, the abstract 179value is simply rounded and packed into the single-precision format, with 180the inexact exception raised if the abstract input cannot be represented 181exactly. If the abstract value is too large, however, the overflow and 182inexact exceptions are raised and an infinity or maximal finite value is 183returned. If the abstract value is too small, the input value is rounded to 184a subnormal number, and the underflow and inexact exceptions are raised if 185the abstract input cannot be represented exactly as a subnormal single- 186precision floating-point number. 187 The input significand `zSig' has its binary point between bits 30 188and 29, which is 7 bits to the left of the usual location. This shifted 189significand must be normalized or smaller. If `zSig' is not normalized, 190`zExp' must be 0; in that case, the result returned is a subnormal number, 191and it must not require rounding. In the usual case that `zSig' is 192normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 193The handling of underflow and overflow follows the IEC/IEEE Standard for 194Binary Floating-point Arithmetic. 195------------------------------------------------------------------------------- 196*/ 197static float32 roundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig ) 198{ 199 int8 roundingMode; 200 flag roundNearestEven; 201 int8 roundIncrement, roundBits; 202 flag isTiny; 203 204 roundingMode = roundData->mode; 205 roundNearestEven = ( roundingMode == float_round_nearest_even ); 206 roundIncrement = 0x40; 207 if ( ! roundNearestEven ) { 208 if ( roundingMode == float_round_to_zero ) { 209 roundIncrement = 0; 210 } 211 else { 212 roundIncrement = 0x7F; 213 if ( zSign ) { 214 if ( roundingMode == float_round_up ) roundIncrement = 0; 215 } 216 else { 217 if ( roundingMode == float_round_down ) roundIncrement = 0; 218 } 219 } 220 } 221 roundBits = zSig & 0x7F; 222 if ( 0xFD <= (bits16) zExp ) { 223 if ( ( 0xFD < zExp ) 224 || ( ( zExp == 0xFD ) 225 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 226 ) { 227 roundData->exception |= float_flag_overflow | float_flag_inexact; 228 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 229 } 230 if ( zExp < 0 ) { 231 isTiny = 232 ( float_detect_tininess == float_tininess_before_rounding ) 233 || ( zExp < -1 ) 234 || ( zSig + roundIncrement < 0x80000000 ); 235 shift32RightJamming( zSig, - zExp, &zSig ); 236 zExp = 0; 237 roundBits = zSig & 0x7F; 238 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow; 239 } 240 } 241 if ( roundBits ) roundData->exception |= float_flag_inexact; 242 zSig = ( zSig + roundIncrement )>>7; 243 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 244 if ( zSig == 0 ) zExp = 0; 245 return packFloat32( zSign, zExp, zSig ); 246 247} 248 249/* 250------------------------------------------------------------------------------- 251Takes an abstract floating-point value having sign `zSign', exponent `zExp', 252and significand `zSig', and returns the proper single-precision floating- 253point value corresponding to the abstract input. This routine is just like 254`roundAndPackFloat32' except that `zSig' does not have to be normalized in 255any way. In all cases, `zExp' must be 1 less than the ``true'' floating- 256point exponent. 257------------------------------------------------------------------------------- 258*/ 259static float32 260 normalizeRoundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig ) 261{ 262 int8 shiftCount; 263 264 shiftCount = countLeadingZeros32( zSig ) - 1; 265 return roundAndPackFloat32( roundData, zSign, zExp - shiftCount, zSig<<shiftCount ); 266 267} 268 269/* 270------------------------------------------------------------------------------- 271Returns the fraction bits of the double-precision floating-point value `a'. 272------------------------------------------------------------------------------- 273*/ 274INLINE bits64 extractFloat64Frac( float64 a ) 275{ 276 277 return a & LIT64( 0x000FFFFFFFFFFFFF ); 278 279} 280 281/* 282------------------------------------------------------------------------------- 283Returns the exponent bits of the double-precision floating-point value `a'. 284------------------------------------------------------------------------------- 285*/ 286INLINE int16 extractFloat64Exp( float64 a ) 287{ 288 289 return ( a>>52 ) & 0x7FF; 290 291} 292 293/* 294------------------------------------------------------------------------------- 295Returns the sign bit of the double-precision floating-point value `a'. 296------------------------------------------------------------------------------- 297*/ 298 299/* 300------------------------------------------------------------------------------- 301Normalizes the subnormal double-precision floating-point value represented 302by the denormalized significand `aSig'. The normalized exponent and 303significand are stored at the locations pointed to by `zExpPtr' and 304`zSigPtr', respectively. 305------------------------------------------------------------------------------- 306*/ 307static void 308 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 309{ 310 int8 shiftCount; 311 312 shiftCount = countLeadingZeros64( aSig ) - 11; 313 *zSigPtr = aSig<<shiftCount; 314 *zExpPtr = 1 - shiftCount; 315 316} 317 318/* 319------------------------------------------------------------------------------- 320Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 321double-precision floating-point value, returning the result. After being 322shifted into the proper positions, the three fields are simply added 323together to form the result. This means that any integer portion of `zSig' 324will be added into the exponent. Since a properly normalized significand 325will have an integer portion equal to 1, the `zExp' input should be 1 less 326than the desired result exponent whenever `zSig' is a complete, normalized 327significand. 328------------------------------------------------------------------------------- 329*/ 330INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 331{ 332 333 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig; 334 335} 336 337/* 338------------------------------------------------------------------------------- 339Takes an abstract floating-point value having sign `zSign', exponent `zExp', 340and significand `zSig', and returns the proper double-precision floating- 341point value corresponding to the abstract input. Ordinarily, the abstract 342value is simply rounded and packed into the double-precision format, with 343the inexact exception raised if the abstract input cannot be represented 344exactly. If the abstract value is too large, however, the overflow and 345inexact exceptions are raised and an infinity or maximal finite value is 346returned. If the abstract value is too small, the input value is rounded to 347a subnormal number, and the underflow and inexact exceptions are raised if 348the abstract input cannot be represented exactly as a subnormal double- 349precision floating-point number. 350 The input significand `zSig' has its binary point between bits 62 351and 61, which is 10 bits to the left of the usual location. This shifted 352significand must be normalized or smaller. If `zSig' is not normalized, 353`zExp' must be 0; in that case, the result returned is a subnormal number, 354and it must not require rounding. In the usual case that `zSig' is 355normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 356The handling of underflow and overflow follows the IEC/IEEE Standard for 357Binary Floating-point Arithmetic. 358------------------------------------------------------------------------------- 359*/ 360static float64 roundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig ) 361{ 362 int8 roundingMode; 363 flag roundNearestEven; 364 int16 roundIncrement, roundBits; 365 flag isTiny; 366 367 roundingMode = roundData->mode; 368 roundNearestEven = ( roundingMode == float_round_nearest_even ); 369 roundIncrement = 0x200; 370 if ( ! roundNearestEven ) { 371 if ( roundingMode == float_round_to_zero ) { 372 roundIncrement = 0; 373 } 374 else { 375 roundIncrement = 0x3FF; 376 if ( zSign ) { 377 if ( roundingMode == float_round_up ) roundIncrement = 0; 378 } 379 else { 380 if ( roundingMode == float_round_down ) roundIncrement = 0; 381 } 382 } 383 } 384 roundBits = zSig & 0x3FF; 385 if ( 0x7FD <= (bits16) zExp ) { 386 if ( ( 0x7FD < zExp ) 387 || ( ( zExp == 0x7FD ) 388 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 389 ) { 390 //register int lr = __builtin_return_address(0); 391 //printk("roundAndPackFloat64 called from 0x%08x\n",lr); 392 roundData->exception |= float_flag_overflow | float_flag_inexact; 393 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 ); 394 } 395 if ( zExp < 0 ) { 396 isTiny = 397 ( float_detect_tininess == float_tininess_before_rounding ) 398 || ( zExp < -1 ) 399 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 400 shift64RightJamming( zSig, - zExp, &zSig ); 401 zExp = 0; 402 roundBits = zSig & 0x3FF; 403 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow; 404 } 405 } 406 if ( roundBits ) roundData->exception |= float_flag_inexact; 407 zSig = ( zSig + roundIncrement )>>10; 408 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 409 if ( zSig == 0 ) zExp = 0; 410 return packFloat64( zSign, zExp, zSig ); 411 412} 413 414/* 415------------------------------------------------------------------------------- 416Takes an abstract floating-point value having sign `zSign', exponent `zExp', 417and significand `zSig', and returns the proper double-precision floating- 418point value corresponding to the abstract input. This routine is just like 419`roundAndPackFloat64' except that `zSig' does not have to be normalized in 420any way. In all cases, `zExp' must be 1 less than the ``true'' floating- 421point exponent. 422------------------------------------------------------------------------------- 423*/ 424static float64 425 normalizeRoundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig ) 426{ 427 int8 shiftCount; 428 429 shiftCount = countLeadingZeros64( zSig ) - 1; 430 return roundAndPackFloat64( roundData, zSign, zExp - shiftCount, zSig<<shiftCount ); 431 432} 433 434#ifdef FLOATX80 435 436/* 437------------------------------------------------------------------------------- 438Returns the fraction bits of the extended double-precision floating-point 439value `a'. 440------------------------------------------------------------------------------- 441*/ 442INLINE bits64 extractFloatx80Frac( floatx80 a ) 443{ 444 445 return a.low; 446 447} 448 449/* 450------------------------------------------------------------------------------- 451Returns the exponent bits of the extended double-precision floating-point 452value `a'. 453------------------------------------------------------------------------------- 454*/ 455INLINE int32 extractFloatx80Exp( floatx80 a ) 456{ 457 458 return a.high & 0x7FFF; 459 460} 461 462/* 463------------------------------------------------------------------------------- 464Returns the sign bit of the extended double-precision floating-point value 465`a'. 466------------------------------------------------------------------------------- 467*/ 468INLINE flag extractFloatx80Sign( floatx80 a ) 469{ 470 471 return a.high>>15; 472 473} 474 475/* 476------------------------------------------------------------------------------- 477Normalizes the subnormal extended double-precision floating-point value 478represented by the denormalized significand `aSig'. The normalized exponent 479and significand are stored at the locations pointed to by `zExpPtr' and 480`zSigPtr', respectively. 481------------------------------------------------------------------------------- 482*/ 483static void 484 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 485{ 486 int8 shiftCount; 487 488 shiftCount = countLeadingZeros64( aSig ); 489 *zSigPtr = aSig<<shiftCount; 490 *zExpPtr = 1 - shiftCount; 491 492} 493 494/* 495------------------------------------------------------------------------------- 496Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 497extended double-precision floating-point value, returning the result. 498------------------------------------------------------------------------------- 499*/ 500INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 501{ 502 floatx80 z; 503 504 z.low = zSig; 505 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 506 z.__padding = 0; 507 return z; 508 509} 510 511/* 512------------------------------------------------------------------------------- 513Takes an abstract floating-point value having sign `zSign', exponent `zExp', 514and extended significand formed by the concatenation of `zSig0' and `zSig1', 515and returns the proper extended double-precision floating-point value 516corresponding to the abstract input. Ordinarily, the abstract value is 517rounded and packed into the extended double-precision format, with the 518inexact exception raised if the abstract input cannot be represented 519exactly. If the abstract value is too large, however, the overflow and 520inexact exceptions are raised and an infinity or maximal finite value is 521returned. If the abstract value is too small, the input value is rounded to 522a subnormal number, and the underflow and inexact exceptions are raised if 523the abstract input cannot be represented exactly as a subnormal extended 524double-precision floating-point number. 525 If `roundingPrecision' is 32 or 64, the result is rounded to the same 526number of bits as single or double precision, respectively. Otherwise, the 527result is rounded to the full precision of the extended double-precision 528format. 529 The input significand must be normalized or smaller. If the input 530significand is not normalized, `zExp' must be 0; in that case, the result 531returned is a subnormal number, and it must not require rounding. The 532handling of underflow and overflow follows the IEC/IEEE Standard for Binary 533Floating-point Arithmetic. 534------------------------------------------------------------------------------- 535*/ 536static floatx80 537 roundAndPackFloatx80( 538 struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 539 ) 540{ 541 int8 roundingMode, roundingPrecision; 542 flag roundNearestEven, increment, isTiny; 543 int64 roundIncrement, roundMask, roundBits; 544 545 roundingMode = roundData->mode; 546 roundingPrecision = roundData->precision; 547 roundNearestEven = ( roundingMode == float_round_nearest_even ); 548 if ( roundingPrecision == 80 ) goto precision80; 549 if ( roundingPrecision == 64 ) { 550 roundIncrement = LIT64( 0x0000000000000400 ); 551 roundMask = LIT64( 0x00000000000007FF ); 552 } 553 else if ( roundingPrecision == 32 ) { 554 roundIncrement = LIT64( 0x0000008000000000 ); 555 roundMask = LIT64( 0x000000FFFFFFFFFF ); 556 } 557 else { 558 goto precision80; 559 } 560 zSig0 |= ( zSig1 != 0 ); 561 if ( ! roundNearestEven ) { 562 if ( roundingMode == float_round_to_zero ) { 563 roundIncrement = 0; 564 } 565 else { 566 roundIncrement = roundMask; 567 if ( zSign ) { 568 if ( roundingMode == float_round_up ) roundIncrement = 0; 569 } 570 else { 571 if ( roundingMode == float_round_down ) roundIncrement = 0; 572 } 573 } 574 } 575 roundBits = zSig0 & roundMask; 576 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 577 if ( ( 0x7FFE < zExp ) 578 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 579 ) { 580 goto overflow; 581 } 582 if ( zExp <= 0 ) { 583 isTiny = 584 ( float_detect_tininess == float_tininess_before_rounding ) 585 || ( zExp < 0 ) 586 || ( zSig0 <= zSig0 + roundIncrement ); 587 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 588 zExp = 0; 589 roundBits = zSig0 & roundMask; 590 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow; 591 if ( roundBits ) roundData->exception |= float_flag_inexact; 592 zSig0 += roundIncrement; 593 if ( (sbits64) zSig0 < 0 ) zExp = 1; 594 roundIncrement = roundMask + 1; 595 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 596 roundMask |= roundIncrement; 597 } 598 zSig0 &= ~ roundMask; 599 return packFloatx80( zSign, zExp, zSig0 ); 600 } 601 } 602 if ( roundBits ) roundData->exception |= float_flag_inexact; 603 zSig0 += roundIncrement; 604 if ( zSig0 < roundIncrement ) { 605 ++zExp; 606 zSig0 = LIT64( 0x8000000000000000 ); 607 } 608 roundIncrement = roundMask + 1; 609 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 610 roundMask |= roundIncrement; 611 } 612 zSig0 &= ~ roundMask; 613 if ( zSig0 == 0 ) zExp = 0; 614 return packFloatx80( zSign, zExp, zSig0 ); 615 precision80: 616 increment = ( (sbits64) zSig1 < 0 ); 617 if ( ! roundNearestEven ) { 618 if ( roundingMode == float_round_to_zero ) { 619 increment = 0; 620 } 621 else { 622 if ( zSign ) { 623 increment = ( roundingMode == float_round_down ) && zSig1; 624 } 625 else { 626 increment = ( roundingMode == float_round_up ) && zSig1; 627 } 628 } 629 } 630 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 631 if ( ( 0x7FFE < zExp ) 632 || ( ( zExp == 0x7FFE ) 633 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 634 && increment 635 ) 636 ) { 637 roundMask = 0; 638 overflow: 639 roundData->exception |= float_flag_overflow | float_flag_inexact; 640 if ( ( roundingMode == float_round_to_zero ) 641 || ( zSign && ( roundingMode == float_round_up ) ) 642 || ( ! zSign && ( roundingMode == float_round_down ) ) 643 ) { 644 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 645 } 646 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 647 } 648 if ( zExp <= 0 ) { 649 isTiny = 650 ( float_detect_tininess == float_tininess_before_rounding ) 651 || ( zExp < 0 ) 652 || ! increment 653 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 654 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 655 zExp = 0; 656 if ( isTiny && zSig1 ) roundData->exception |= float_flag_underflow; 657 if ( zSig1 ) roundData->exception |= float_flag_inexact; 658 if ( roundNearestEven ) { 659 increment = ( (sbits64) zSig1 < 0 ); 660 } 661 else { 662 if ( zSign ) { 663 increment = ( roundingMode == float_round_down ) && zSig1; 664 } 665 else { 666 increment = ( roundingMode == float_round_up ) && zSig1; 667 } 668 } 669 if ( increment ) { 670 ++zSig0; 671 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven ); 672 if ( (sbits64) zSig0 < 0 ) zExp = 1; 673 } 674 return packFloatx80( zSign, zExp, zSig0 ); 675 } 676 } 677 if ( zSig1 ) roundData->exception |= float_flag_inexact; 678 if ( increment ) { 679 ++zSig0; 680 if ( zSig0 == 0 ) { 681 ++zExp; 682 zSig0 = LIT64( 0x8000000000000000 ); 683 } 684 else { 685 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven ); 686 } 687 } 688 else { 689 if ( zSig0 == 0 ) zExp = 0; 690 } 691 692 return packFloatx80( zSign, zExp, zSig0 ); 693} 694 695/* 696------------------------------------------------------------------------------- 697Takes an abstract floating-point value having sign `zSign', exponent 698`zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 699and returns the proper extended double-precision floating-point value 700corresponding to the abstract input. This routine is just like 701`roundAndPackFloatx80' except that the input significand does not have to be 702normalized. 703------------------------------------------------------------------------------- 704*/ 705static floatx80 706 normalizeRoundAndPackFloatx80( 707 struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 708 ) 709{ 710 int8 shiftCount; 711 712 if ( zSig0 == 0 ) { 713 zSig0 = zSig1; 714 zSig1 = 0; 715 zExp -= 64; 716 } 717 shiftCount = countLeadingZeros64( zSig0 ); 718 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 719 zExp -= shiftCount; 720 return 721 roundAndPackFloatx80( roundData, zSign, zExp, zSig0, zSig1 ); 722 723} 724 725#endif 726 727/* 728------------------------------------------------------------------------------- 729Returns the result of converting the 32-bit two's complement integer `a' to 730the single-precision floating-point format. The conversion is performed 731according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 732------------------------------------------------------------------------------- 733*/ 734float32 int32_to_float32(struct roundingData *roundData, int32 a) 735{ 736 flag zSign; 737 738 if ( a == 0 ) return 0; 739 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 740 zSign = ( a < 0 ); 741 return normalizeRoundAndPackFloat32( roundData, zSign, 0x9C, zSign ? - a : a ); 742 743} 744 745/* 746------------------------------------------------------------------------------- 747Returns the result of converting the 32-bit two's complement integer `a' to 748the double-precision floating-point format. The conversion is performed 749according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 750------------------------------------------------------------------------------- 751*/ 752float64 int32_to_float64( int32 a ) 753{ 754 flag aSign; 755 uint32 absA; 756 int8 shiftCount; 757 bits64 zSig; 758 759 if ( a == 0 ) return 0; 760 aSign = ( a < 0 ); 761 absA = aSign ? - a : a; 762 shiftCount = countLeadingZeros32( absA ) + 21; 763 zSig = absA; 764 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount ); 765 766} 767 768#ifdef FLOATX80 769 770/* 771------------------------------------------------------------------------------- 772Returns the result of converting the 32-bit two's complement integer `a' 773to the extended double-precision floating-point format. The conversion 774is performed according to the IEC/IEEE Standard for Binary Floating-point 775Arithmetic. 776------------------------------------------------------------------------------- 777*/ 778floatx80 int32_to_floatx80( int32 a ) 779{ 780 flag zSign; 781 uint32 absA; 782 int8 shiftCount; 783 bits64 zSig; 784 785 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 786 zSign = ( a < 0 ); 787 absA = zSign ? - a : a; 788 shiftCount = countLeadingZeros32( absA ) + 32; 789 zSig = absA; 790 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 791 792} 793 794#endif 795 796/* 797------------------------------------------------------------------------------- 798Returns the result of converting the single-precision floating-point value 799`a' to the 32-bit two's complement integer format. The conversion is 800performed according to the IEC/IEEE Standard for Binary Floating-point 801Arithmetic---which means in particular that the conversion is rounded 802according to the current rounding mode. If `a' is a NaN, the largest 803positive integer is returned. Otherwise, if the conversion overflows, the 804largest integer with the same sign as `a' is returned. 805------------------------------------------------------------------------------- 806*/ 807int32 float32_to_int32( struct roundingData *roundData, float32 a ) 808{ 809 flag aSign; 810 int16 aExp, shiftCount; 811 bits32 aSig; 812 bits64 zSig; 813 814 aSig = extractFloat32Frac( a ); 815 aExp = extractFloat32Exp( a ); 816 aSign = extractFloat32Sign( a ); 817 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 818 if ( aExp ) aSig |= 0x00800000; 819 shiftCount = 0xAF - aExp; 820 zSig = aSig; 821 zSig <<= 32; 822 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig ); 823 return roundAndPackInt32( roundData, aSign, zSig ); 824 825} 826 827/* 828------------------------------------------------------------------------------- 829Returns the result of converting the single-precision floating-point value 830`a' to the 32-bit two's complement integer format. The conversion is 831performed according to the IEC/IEEE Standard for Binary Floating-point 832Arithmetic, except that the conversion is always rounded toward zero. If 833`a' is a NaN, the largest positive integer is returned. Otherwise, if the 834conversion overflows, the largest integer with the same sign as `a' is 835returned. 836------------------------------------------------------------------------------- 837*/ 838int32 float32_to_int32_round_to_zero( float32 a ) 839{ 840 flag aSign; 841 int16 aExp, shiftCount; 842 bits32 aSig; 843 int32 z; 844 845 aSig = extractFloat32Frac( a ); 846 aExp = extractFloat32Exp( a ); 847 aSign = extractFloat32Sign( a ); 848 shiftCount = aExp - 0x9E; 849 if ( 0 <= shiftCount ) { 850 if ( a == 0xCF000000 ) return 0x80000000; 851 float_raise( float_flag_invalid ); 852 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 853 return 0x80000000; 854 } 855 else if ( aExp <= 0x7E ) { 856 if ( aExp | aSig ) float_raise( float_flag_inexact ); 857 return 0; 858 } 859 aSig = ( aSig | 0x00800000 )<<8; 860 z = aSig>>( - shiftCount ); 861 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 862 float_raise( float_flag_inexact ); 863 } 864 return aSign ? - z : z; 865 866} 867 868/* 869------------------------------------------------------------------------------- 870Returns the result of converting the single-precision floating-point value 871`a' to the double-precision floating-point format. The conversion is 872performed according to the IEC/IEEE Standard for Binary Floating-point 873Arithmetic. 874------------------------------------------------------------------------------- 875*/ 876float64 float32_to_float64( float32 a ) 877{ 878 flag aSign; 879 int16 aExp; 880 bits32 aSig; 881 882 aSig = extractFloat32Frac( a ); 883 aExp = extractFloat32Exp( a ); 884 aSign = extractFloat32Sign( a ); 885 if ( aExp == 0xFF ) { 886 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 887 return packFloat64( aSign, 0x7FF, 0 ); 888 } 889 if ( aExp == 0 ) { 890 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 891 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 892 --aExp; 893 } 894 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 895 896} 897 898#ifdef FLOATX80 899 900/* 901------------------------------------------------------------------------------- 902Returns the result of converting the single-precision floating-point value 903`a' to the extended double-precision floating-point format. The conversion 904is performed according to the IEC/IEEE Standard for Binary Floating-point 905Arithmetic. 906------------------------------------------------------------------------------- 907*/ 908floatx80 float32_to_floatx80( float32 a ) 909{ 910 flag aSign; 911 int16 aExp; 912 bits32 aSig; 913 914 aSig = extractFloat32Frac( a ); 915 aExp = extractFloat32Exp( a ); 916 aSign = extractFloat32Sign( a ); 917 if ( aExp == 0xFF ) { 918 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 919 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 920 } 921 if ( aExp == 0 ) { 922 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 923 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 924 } 925 aSig |= 0x00800000; 926 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 927 928} 929 930#endif 931 932/* 933------------------------------------------------------------------------------- 934Rounds the single-precision floating-point value `a' to an integer, and 935returns the result as a single-precision floating-point value. The 936operation is performed according to the IEC/IEEE Standard for Binary 937Floating-point Arithmetic. 938------------------------------------------------------------------------------- 939*/ 940float32 float32_round_to_int( struct roundingData *roundData, float32 a ) 941{ 942 flag aSign; 943 int16 aExp; 944 bits32 lastBitMask, roundBitsMask; 945 int8 roundingMode; 946 float32 z; 947 948 aExp = extractFloat32Exp( a ); 949 if ( 0x96 <= aExp ) { 950 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 951 return propagateFloat32NaN( a, a ); 952 } 953 return a; 954 } 955 roundingMode = roundData->mode; 956 if ( aExp <= 0x7E ) { 957 if ( (bits32) ( a<<1 ) == 0 ) return a; 958 roundData->exception |= float_flag_inexact; 959 aSign = extractFloat32Sign( a ); 960 switch ( roundingMode ) { 961 case float_round_nearest_even: 962 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 963 return packFloat32( aSign, 0x7F, 0 ); 964 } 965 break; 966 case float_round_down: 967 return aSign ? 0xBF800000 : 0; 968 case float_round_up: 969 return aSign ? 0x80000000 : 0x3F800000; 970 } 971 return packFloat32( aSign, 0, 0 ); 972 } 973 lastBitMask = 1; 974 lastBitMask <<= 0x96 - aExp; 975 roundBitsMask = lastBitMask - 1; 976 z = a; 977 if ( roundingMode == float_round_nearest_even ) { 978 z += lastBitMask>>1; 979 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 980 } 981 else if ( roundingMode != float_round_to_zero ) { 982 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 983 z += roundBitsMask; 984 } 985 } 986 z &= ~ roundBitsMask; 987 if ( z != a ) roundData->exception |= float_flag_inexact; 988 return z; 989 990} 991 992/* 993------------------------------------------------------------------------------- 994Returns the result of adding the absolute values of the single-precision 995floating-point values `a' and `b'. If `zSign' is true, the sum is negated 996before being returned. `zSign' is ignored if the result is a NaN. The 997addition is performed according to the IEC/IEEE Standard for Binary 998Floating-point Arithmetic. 999------------------------------------------------------------------------------- 1000*/ 1001static float32 addFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign ) 1002{ 1003 int16 aExp, bExp, zExp; 1004 bits32 aSig, bSig, zSig; 1005 int16 expDiff; 1006 1007 aSig = extractFloat32Frac( a ); 1008 aExp = extractFloat32Exp( a ); 1009 bSig = extractFloat32Frac( b ); 1010 bExp = extractFloat32Exp( b ); 1011 expDiff = aExp - bExp; 1012 aSig <<= 6; 1013 bSig <<= 6; 1014 if ( 0 < expDiff ) { 1015 if ( aExp == 0xFF ) { 1016 if ( aSig ) return propagateFloat32NaN( a, b ); 1017 return a; 1018 } 1019 if ( bExp == 0 ) { 1020 --expDiff; 1021 } 1022 else { 1023 bSig |= 0x20000000; 1024 } 1025 shift32RightJamming( bSig, expDiff, &bSig ); 1026 zExp = aExp; 1027 } 1028 else if ( expDiff < 0 ) { 1029 if ( bExp == 0xFF ) { 1030 if ( bSig ) return propagateFloat32NaN( a, b ); 1031 return packFloat32( zSign, 0xFF, 0 ); 1032 } 1033 if ( aExp == 0 ) { 1034 ++expDiff; 1035 } 1036 else { 1037 aSig |= 0x20000000; 1038 } 1039 shift32RightJamming( aSig, - expDiff, &aSig ); 1040 zExp = bExp; 1041 } 1042 else { 1043 if ( aExp == 0xFF ) { 1044 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1045 return a; 1046 } 1047 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1048 zSig = 0x40000000 + aSig + bSig; 1049 zExp = aExp; 1050 goto roundAndPack; 1051 } 1052 aSig |= 0x20000000; 1053 zSig = ( aSig + bSig )<<1; 1054 --zExp; 1055 if ( (sbits32) zSig < 0 ) { 1056 zSig = aSig + bSig; 1057 ++zExp; 1058 } 1059 roundAndPack: 1060 return roundAndPackFloat32( roundData, zSign, zExp, zSig ); 1061 1062} 1063 1064/* 1065------------------------------------------------------------------------------- 1066Returns the result of subtracting the absolute values of the single- 1067precision floating-point values `a' and `b'. If `zSign' is true, the 1068difference is negated before being returned. `zSign' is ignored if the 1069result is a NaN. The subtraction is performed according to the IEC/IEEE 1070Standard for Binary Floating-point Arithmetic. 1071------------------------------------------------------------------------------- 1072*/ 1073static float32 subFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign ) 1074{ 1075 int16 aExp, bExp, zExp; 1076 bits32 aSig, bSig, zSig; 1077 int16 expDiff; 1078 1079 aSig = extractFloat32Frac( a ); 1080 aExp = extractFloat32Exp( a ); 1081 bSig = extractFloat32Frac( b ); 1082 bExp = extractFloat32Exp( b ); 1083 expDiff = aExp - bExp; 1084 aSig <<= 7; 1085 bSig <<= 7; 1086 if ( 0 < expDiff ) goto aExpBigger; 1087 if ( expDiff < 0 ) goto bExpBigger; 1088 if ( aExp == 0xFF ) { 1089 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1090 roundData->exception |= float_flag_invalid; 1091 return float32_default_nan; 1092 } 1093 if ( aExp == 0 ) { 1094 aExp = 1; 1095 bExp = 1; 1096 } 1097 if ( bSig < aSig ) goto aBigger; 1098 if ( aSig < bSig ) goto bBigger; 1099 return packFloat32( roundData->mode == float_round_down, 0, 0 ); 1100 bExpBigger: 1101 if ( bExp == 0xFF ) { 1102 if ( bSig ) return propagateFloat32NaN( a, b ); 1103 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1104 } 1105 if ( aExp == 0 ) { 1106 ++expDiff; 1107 } 1108 else { 1109 aSig |= 0x40000000; 1110 } 1111 shift32RightJamming( aSig, - expDiff, &aSig ); 1112 bSig |= 0x40000000; 1113 bBigger: 1114 zSig = bSig - aSig; 1115 zExp = bExp; 1116 zSign ^= 1; 1117 goto normalizeRoundAndPack; 1118 aExpBigger: 1119 if ( aExp == 0xFF ) { 1120 if ( aSig ) return propagateFloat32NaN( a, b ); 1121 return a; 1122 } 1123 if ( bExp == 0 ) { 1124 --expDiff; 1125 } 1126 else { 1127 bSig |= 0x40000000; 1128 } 1129 shift32RightJamming( bSig, expDiff, &bSig ); 1130 aSig |= 0x40000000; 1131 aBigger: 1132 zSig = aSig - bSig; 1133 zExp = aExp; 1134 normalizeRoundAndPack: 1135 --zExp; 1136 return normalizeRoundAndPackFloat32( roundData, zSign, zExp, zSig ); 1137 1138} 1139 1140/* 1141------------------------------------------------------------------------------- 1142Returns the result of adding the single-precision floating-point values `a' 1143and `b'. The operation is performed according to the IEC/IEEE Standard for 1144Binary Floating-point Arithmetic. 1145------------------------------------------------------------------------------- 1146*/ 1147float32 float32_add( struct roundingData *roundData, float32 a, float32 b ) 1148{ 1149 flag aSign, bSign; 1150 1151 aSign = extractFloat32Sign( a ); 1152 bSign = extractFloat32Sign( b ); 1153 if ( aSign == bSign ) { 1154 return addFloat32Sigs( roundData, a, b, aSign ); 1155 } 1156 else { 1157 return subFloat32Sigs( roundData, a, b, aSign ); 1158 } 1159 1160} 1161 1162/* 1163------------------------------------------------------------------------------- 1164Returns the result of subtracting the single-precision floating-point values 1165`a' and `b'. The operation is performed according to the IEC/IEEE Standard 1166for Binary Floating-point Arithmetic. 1167------------------------------------------------------------------------------- 1168*/ 1169float32 float32_sub( struct roundingData *roundData, float32 a, float32 b ) 1170{ 1171 flag aSign, bSign; 1172 1173 aSign = extractFloat32Sign( a ); 1174 bSign = extractFloat32Sign( b ); 1175 if ( aSign == bSign ) { 1176 return subFloat32Sigs( roundData, a, b, aSign ); 1177 } 1178 else { 1179 return addFloat32Sigs( roundData, a, b, aSign ); 1180 } 1181 1182} 1183 1184/* 1185------------------------------------------------------------------------------- 1186Returns the result of multiplying the single-precision floating-point values 1187`a' and `b'. The operation is performed according to the IEC/IEEE Standard 1188for Binary Floating-point Arithmetic. 1189------------------------------------------------------------------------------- 1190*/ 1191float32 float32_mul( struct roundingData *roundData, float32 a, float32 b ) 1192{ 1193 flag aSign, bSign, zSign; 1194 int16 aExp, bExp, zExp; 1195 bits32 aSig, bSig; 1196 bits64 zSig64; 1197 bits32 zSig; 1198 1199 aSig = extractFloat32Frac( a ); 1200 aExp = extractFloat32Exp( a ); 1201 aSign = extractFloat32Sign( a ); 1202 bSig = extractFloat32Frac( b ); 1203 bExp = extractFloat32Exp( b ); 1204 bSign = extractFloat32Sign( b ); 1205 zSign = aSign ^ bSign; 1206 if ( aExp == 0xFF ) { 1207 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1208 return propagateFloat32NaN( a, b ); 1209 } 1210 if ( ( bExp | bSig ) == 0 ) { 1211 roundData->exception |= float_flag_invalid; 1212 return float32_default_nan; 1213 } 1214 return packFloat32( zSign, 0xFF, 0 ); 1215 } 1216 if ( bExp == 0xFF ) { 1217 if ( bSig ) return propagateFloat32NaN( a, b ); 1218 if ( ( aExp | aSig ) == 0 ) { 1219 roundData->exception |= float_flag_invalid; 1220 return float32_default_nan; 1221 } 1222 return packFloat32( zSign, 0xFF, 0 ); 1223 } 1224 if ( aExp == 0 ) { 1225 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1226 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1227 } 1228 if ( bExp == 0 ) { 1229 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1230 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1231 } 1232 zExp = aExp + bExp - 0x7F; 1233 aSig = ( aSig | 0x00800000 )<<7; 1234 bSig = ( bSig | 0x00800000 )<<8; 1235 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1236 zSig = zSig64; 1237 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1238 zSig <<= 1; 1239 --zExp; 1240 } 1241 return roundAndPackFloat32( roundData, zSign, zExp, zSig ); 1242 1243} 1244 1245/* 1246------------------------------------------------------------------------------- 1247Returns the result of dividing the single-precision floating-point value `a' 1248by the corresponding value `b'. The operation is performed according to the 1249IEC/IEEE Standard for Binary Floating-point Arithmetic. 1250------------------------------------------------------------------------------- 1251*/ 1252float32 float32_div( struct roundingData *roundData, float32 a, float32 b ) 1253{ 1254 flag aSign, bSign, zSign; 1255 int16 aExp, bExp, zExp; 1256 bits32 aSig, bSig, zSig; 1257 1258 aSig = extractFloat32Frac( a ); 1259 aExp = extractFloat32Exp( a ); 1260 aSign = extractFloat32Sign( a ); 1261 bSig = extractFloat32Frac( b ); 1262 bExp = extractFloat32Exp( b ); 1263 bSign = extractFloat32Sign( b ); 1264 zSign = aSign ^ bSign; 1265 if ( aExp == 0xFF ) { 1266 if ( aSig ) return propagateFloat32NaN( a, b ); 1267 if ( bExp == 0xFF ) { 1268 if ( bSig ) return propagateFloat32NaN( a, b ); 1269 roundData->exception |= float_flag_invalid; 1270 return float32_default_nan; 1271 } 1272 return packFloat32( zSign, 0xFF, 0 ); 1273 } 1274 if ( bExp == 0xFF ) { 1275 if ( bSig ) return propagateFloat32NaN( a, b ); 1276 return packFloat32( zSign, 0, 0 ); 1277 } 1278 if ( bExp == 0 ) { 1279 if ( bSig == 0 ) { 1280 if ( ( aExp | aSig ) == 0 ) { 1281 roundData->exception |= float_flag_invalid; 1282 return float32_default_nan; 1283 } 1284 roundData->exception |= float_flag_divbyzero; 1285 return packFloat32( zSign, 0xFF, 0 ); 1286 } 1287 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1288 } 1289 if ( aExp == 0 ) { 1290 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1291 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1292 } 1293 zExp = aExp - bExp + 0x7D; 1294 aSig = ( aSig | 0x00800000 )<<7; 1295 bSig = ( bSig | 0x00800000 )<<8; 1296 if ( bSig <= ( aSig + aSig ) ) { 1297 aSig >>= 1; 1298 ++zExp; 1299 } 1300 { 1301 bits64 tmp = ( (bits64) aSig )<<32; 1302 do_div( tmp, bSig ); 1303 zSig = tmp; 1304 } 1305 if ( ( zSig & 0x3F ) == 0 ) { 1306 zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 ); 1307 } 1308 return roundAndPackFloat32( roundData, zSign, zExp, zSig ); 1309 1310} 1311 1312/* 1313------------------------------------------------------------------------------- 1314Returns the remainder of the single-precision floating-point value `a' 1315with respect to the corresponding value `b'. The operation is performed 1316according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 1317------------------------------------------------------------------------------- 1318*/ 1319float32 float32_rem( struct roundingData *roundData, float32 a, float32 b ) 1320{ 1321 flag aSign, bSign, zSign; 1322 int16 aExp, bExp, expDiff; 1323 bits32 aSig, bSig; 1324 bits32 q; 1325 bits64 aSig64, bSig64, q64; 1326 bits32 alternateASig; 1327 sbits32 sigMean; 1328 1329 aSig = extractFloat32Frac( a ); 1330 aExp = extractFloat32Exp( a ); 1331 aSign = extractFloat32Sign( a ); 1332 bSig = extractFloat32Frac( b ); 1333 bExp = extractFloat32Exp( b ); 1334 bSign = extractFloat32Sign( b ); 1335 if ( aExp == 0xFF ) { 1336 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1337 return propagateFloat32NaN( a, b ); 1338 } 1339 roundData->exception |= float_flag_invalid; 1340 return float32_default_nan; 1341 } 1342 if ( bExp == 0xFF ) { 1343 if ( bSig ) return propagateFloat32NaN( a, b ); 1344 return a; 1345 } 1346 if ( bExp == 0 ) { 1347 if ( bSig == 0 ) { 1348 roundData->exception |= float_flag_invalid; 1349 return float32_default_nan; 1350 } 1351 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1352 } 1353 if ( aExp == 0 ) { 1354 if ( aSig == 0 ) return a; 1355 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1356 } 1357 expDiff = aExp - bExp; 1358 aSig |= 0x00800000; 1359 bSig |= 0x00800000; 1360 if ( expDiff < 32 ) { 1361 aSig <<= 8; 1362 bSig <<= 8; 1363 if ( expDiff < 0 ) { 1364 if ( expDiff < -1 ) return a; 1365 aSig >>= 1; 1366 } 1367 q = ( bSig <= aSig ); 1368 if ( q ) aSig -= bSig; 1369 if ( 0 < expDiff ) { 1370 bits64 tmp = ( (bits64) aSig )<<32; 1371 do_div( tmp, bSig ); 1372 q = tmp; 1373 q >>= 32 - expDiff; 1374 bSig >>= 2; 1375 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 1376 } 1377 else { 1378 aSig >>= 2; 1379 bSig >>= 2; 1380 } 1381 } 1382 else { 1383 if ( bSig <= aSig ) aSig -= bSig; 1384 aSig64 = ( (bits64) aSig )<<40; 1385 bSig64 = ( (bits64) bSig )<<40; 1386 expDiff -= 64; 1387 while ( 0 < expDiff ) { 1388 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 1389 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 1390 aSig64 = - ( ( bSig * q64 )<<38 ); 1391 expDiff -= 62; 1392 } 1393 expDiff += 64; 1394 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 1395 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 1396 q = q64>>( 64 - expDiff ); 1397 bSig <<= 6; 1398 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 1399 } 1400 do { 1401 alternateASig = aSig; 1402 ++q; 1403 aSig -= bSig; 1404 } while ( 0 <= (sbits32) aSig ); 1405 sigMean = aSig + alternateASig; 1406 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 1407 aSig = alternateASig; 1408 } 1409 zSign = ( (sbits32) aSig < 0 ); 1410 if ( zSign ) aSig = - aSig; 1411 return normalizeRoundAndPackFloat32( roundData, aSign ^ zSign, bExp, aSig ); 1412 1413} 1414 1415/* 1416------------------------------------------------------------------------------- 1417Returns the square root of the single-precision floating-point value `a'. 1418The operation is performed according to the IEC/IEEE Standard for Binary 1419Floating-point Arithmetic. 1420------------------------------------------------------------------------------- 1421*/ 1422float32 float32_sqrt( struct roundingData *roundData, float32 a ) 1423{ 1424 flag aSign; 1425 int16 aExp, zExp; 1426 bits32 aSig, zSig; 1427 bits64 rem, term; 1428 1429 aSig = extractFloat32Frac( a ); 1430 aExp = extractFloat32Exp( a ); 1431 aSign = extractFloat32Sign( a ); 1432 if ( aExp == 0xFF ) { 1433 if ( aSig ) return propagateFloat32NaN( a, 0 ); 1434 if ( ! aSign ) return a; 1435 roundData->exception |= float_flag_invalid; 1436 return float32_default_nan; 1437 } 1438 if ( aSign ) { 1439 if ( ( aExp | aSig ) == 0 ) return a; 1440 roundData->exception |= float_flag_invalid; 1441 return float32_default_nan; 1442 } 1443 if ( aExp == 0 ) { 1444 if ( aSig == 0 ) return 0; 1445 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1446 } 1447 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 1448 aSig = ( aSig | 0x00800000 )<<8; 1449 zSig = estimateSqrt32( aExp, aSig ) + 2; 1450 if ( ( zSig & 0x7F ) <= 5 ) { 1451 if ( zSig < 2 ) { 1452 zSig = 0xFFFFFFFF; 1453 } 1454 else { 1455 aSig >>= aExp & 1; 1456 term = ( (bits64) zSig ) * zSig; 1457 rem = ( ( (bits64) aSig )<<32 ) - term; 1458 while ( (sbits64) rem < 0 ) { 1459 --zSig; 1460 rem += ( ( (bits64) zSig )<<1 ) | 1; 1461 } 1462 zSig |= ( rem != 0 ); 1463 } 1464 } 1465 shift32RightJamming( zSig, 1, &zSig ); 1466 return roundAndPackFloat32( roundData, 0, zExp, zSig ); 1467 1468} 1469 1470/* 1471------------------------------------------------------------------------------- 1472Returns 1 if the single-precision floating-point value `a' is equal to the 1473corresponding value `b', and 0 otherwise. The comparison is performed 1474according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 1475------------------------------------------------------------------------------- 1476*/ 1477flag float32_eq( float32 a, float32 b ) 1478{ 1479 1480 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1481 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1482 ) { 1483 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 1484 float_raise( float_flag_invalid ); 1485 } 1486 return 0; 1487 } 1488 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1489 1490} 1491 1492/* 1493------------------------------------------------------------------------------- 1494Returns 1 if the single-precision floating-point value `a' is less than or 1495equal to the corresponding value `b', and 0 otherwise. The comparison is 1496performed according to the IEC/IEEE Standard for Binary Floating-point 1497Arithmetic. 1498------------------------------------------------------------------------------- 1499*/ 1500flag float32_le( float32 a, float32 b ) 1501{ 1502 flag aSign, bSign; 1503 1504 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1505 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1506 ) { 1507 float_raise( float_flag_invalid ); 1508 return 0; 1509 } 1510 aSign = extractFloat32Sign( a ); 1511 bSign = extractFloat32Sign( b ); 1512 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1513 return ( a == b ) || ( aSign ^ ( a < b ) ); 1514 1515} 1516 1517/* 1518------------------------------------------------------------------------------- 1519Returns 1 if the single-precision floating-point value `a' is less than 1520the corresponding value `b', and 0 otherwise. The comparison is performed 1521according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 1522------------------------------------------------------------------------------- 1523*/ 1524flag float32_lt( float32 a, float32 b ) 1525{ 1526 flag aSign, bSign; 1527 1528 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1529 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1530 ) { 1531 float_raise( float_flag_invalid ); 1532 return 0; 1533 } 1534 aSign = extractFloat32Sign( a ); 1535 bSign = extractFloat32Sign( b ); 1536 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 1537 return ( a != b ) && ( aSign ^ ( a < b ) ); 1538 1539} 1540 1541/* 1542------------------------------------------------------------------------------- 1543Returns 1 if the single-precision floating-point value `a' is equal to the 1544corresponding value `b', and 0 otherwise. The invalid exception is raised 1545if either operand is a NaN. Otherwise, the comparison is performed 1546according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 1547------------------------------------------------------------------------------- 1548*/ 1549flag float32_eq_signaling( float32 a, float32 b ) 1550{ 1551 1552 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1553 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1554 ) { 1555 float_raise( float_flag_invalid ); 1556 return 0; 1557 } 1558 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1559 1560} 1561 1562/* 1563------------------------------------------------------------------------------- 1564Returns 1 if the single-precision floating-point value `a' is less than or 1565equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 1566cause an exception. Otherwise, the comparison is performed according to the 1567IEC/IEEE Standard for Binary Floating-point Arithmetic. 1568------------------------------------------------------------------------------- 1569*/ 1570flag float32_le_quiet( float32 a, float32 b ) 1571{ 1572 flag aSign, bSign; 1573 //int16 aExp, bExp; 1574 1575 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1576 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1577 ) { 1578 /* Do nothing, even if NaN as we're quiet */ 1579 return 0; 1580 } 1581 aSign = extractFloat32Sign( a ); 1582 bSign = extractFloat32Sign( b ); 1583 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1584 return ( a == b ) || ( aSign ^ ( a < b ) ); 1585 1586} 1587 1588/* 1589------------------------------------------------------------------------------- 1590Returns 1 if the single-precision floating-point value `a' is less than 1591the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 1592exception. Otherwise, the comparison is performed according to the IEC/IEEE 1593Standard for Binary Floating-point Arithmetic. 1594------------------------------------------------------------------------------- 1595*/ 1596flag float32_lt_quiet( float32 a, float32 b ) 1597{ 1598 flag aSign, bSign; 1599 1600 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1601 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1602 ) { 1603 /* Do nothing, even if NaN as we're quiet */ 1604 return 0; 1605 } 1606 aSign = extractFloat32Sign( a ); 1607 bSign = extractFloat32Sign( b ); 1608 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 1609 return ( a != b ) && ( aSign ^ ( a < b ) ); 1610 1611} 1612 1613/* 1614------------------------------------------------------------------------------- 1615Returns the result of converting the double-precision floating-point value 1616`a' to the 32-bit two's complement integer format. The conversion is 1617performed according to the IEC/IEEE Standard for Binary Floating-point 1618Arithmetic---which means in particular that the conversion is rounded 1619according to the current rounding mode. If `a' is a NaN, the largest 1620positive integer is returned. Otherwise, if the conversion overflows, the 1621largest integer with the same sign as `a' is returned. 1622------------------------------------------------------------------------------- 1623*/ 1624int32 float64_to_int32( struct roundingData *roundData, float64 a ) 1625{ 1626 flag aSign; 1627 int16 aExp, shiftCount; 1628 bits64 aSig; 1629 1630 aSig = extractFloat64Frac( a ); 1631 aExp = extractFloat64Exp( a ); 1632 aSign = extractFloat64Sign( a ); 1633 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 1634 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 1635 shiftCount = 0x42C - aExp; 1636 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 1637 return roundAndPackInt32( roundData, aSign, aSig ); 1638 1639} 1640 1641/* 1642------------------------------------------------------------------------------- 1643Returns the result of converting the double-precision floating-point value 1644`a' to the 32-bit two's complement integer format. The conversion is 1645performed according to the IEC/IEEE Standard for Binary Floating-point 1646Arithmetic, except that the conversion is always rounded toward zero. If 1647`a' is a NaN, the largest positive integer is returned. Otherwise, if the 1648conversion overflows, the largest integer with the same sign as `a' is 1649returned. 1650------------------------------------------------------------------------------- 1651*/ 1652int32 float64_to_int32_round_to_zero( float64 a ) 1653{ 1654 flag aSign; 1655 int16 aExp, shiftCount; 1656 bits64 aSig, savedASig; 1657 int32 z; 1658 1659 aSig = extractFloat64Frac( a ); 1660 aExp = extractFloat64Exp( a ); 1661 aSign = extractFloat64Sign( a ); 1662 shiftCount = 0x433 - aExp; 1663 if ( shiftCount < 21 ) { 1664 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 1665 goto invalid; 1666 } 1667 else if ( 52 < shiftCount ) { 1668 if ( aExp || aSig ) float_raise( float_flag_inexact ); 1669 return 0; 1670 } 1671 aSig |= LIT64( 0x0010000000000000 ); 1672 savedASig = aSig; 1673 aSig >>= shiftCount; 1674 z = aSig; 1675 if ( aSign ) z = - z; 1676 if ( ( z < 0 ) ^ aSign ) { 1677 invalid: 1678 float_raise( float_flag_invalid ); 1679 return aSign ? 0x80000000 : 0x7FFFFFFF; 1680 } 1681 if ( ( aSig<<shiftCount ) != savedASig ) { 1682 float_raise( float_flag_inexact ); 1683 } 1684 return z; 1685 1686} 1687 1688/* 1689------------------------------------------------------------------------------- 1690Returns the result of converting the double-precision floating-point value 1691`a' to the 32-bit two's complement unsigned integer format. The conversion 1692is performed according to the IEC/IEEE Standard for Binary Floating-point 1693Arithmetic---which means in particular that the conversion is rounded 1694according to the current rounding mode. If `a' is a NaN, the largest 1695positive integer is returned. Otherwise, if the conversion overflows, the 1696largest positive integer is returned. 1697------------------------------------------------------------------------------- 1698*/ 1699int32 float64_to_uint32( struct roundingData *roundData, float64 a ) 1700{ 1701 flag aSign; 1702 int16 aExp, shiftCount; 1703 bits64 aSig; 1704 1705 aSig = extractFloat64Frac( a ); 1706 aExp = extractFloat64Exp( a ); 1707 aSign = 0; //extractFloat64Sign( a ); 1708 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 1709 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 1710 shiftCount = 0x42C - aExp; 1711 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 1712 return roundAndPackInt32( roundData, aSign, aSig ); 1713} 1714 1715/* 1716------------------------------------------------------------------------------- 1717Returns the result of converting the double-precision floating-point value 1718`a' to the 32-bit two's complement integer format. The conversion is 1719performed according to the IEC/IEEE Standard for Binary Floating-point 1720Arithmetic, except that the conversion is always rounded toward zero. If 1721`a' is a NaN, the largest positive integer is returned. Otherwise, if the 1722conversion overflows, the largest positive integer is returned. 1723------------------------------------------------------------------------------- 1724*/ 1725int32 float64_to_uint32_round_to_zero( float64 a ) 1726{ 1727 flag aSign; 1728 int16 aExp, shiftCount; 1729 bits64 aSig, savedASig; 1730 int32 z; 1731 1732 aSig = extractFloat64Frac( a ); 1733 aExp = extractFloat64Exp( a ); 1734 aSign = extractFloat64Sign( a ); 1735 shiftCount = 0x433 - aExp; 1736 if ( shiftCount < 21 ) { 1737 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 1738 goto invalid; 1739 } 1740 else if ( 52 < shiftCount ) { 1741 if ( aExp || aSig ) float_raise( float_flag_inexact ); 1742 return 0; 1743 } 1744 aSig |= LIT64( 0x0010000000000000 ); 1745 savedASig = aSig; 1746 aSig >>= shiftCount; 1747 z = aSig; 1748 if ( aSign ) z = - z; 1749 if ( ( z < 0 ) ^ aSign ) { 1750 invalid: 1751 float_raise( float_flag_invalid ); 1752 return aSign ? 0x80000000 : 0x7FFFFFFF; 1753 } 1754 if ( ( aSig<<shiftCount ) != savedASig ) { 1755 float_raise( float_flag_inexact ); 1756 } 1757 return z; 1758} 1759 1760/* 1761------------------------------------------------------------------------------- 1762Returns the result of converting the double-precision floating-point value 1763`a' to the single-precision floating-point format. The conversion is 1764performed according to the IEC/IEEE Standard for Binary Floating-point 1765Arithmetic. 1766------------------------------------------------------------------------------- 1767*/ 1768float32 float64_to_float32( struct roundingData *roundData, float64 a ) 1769{ 1770 flag aSign; 1771 int16 aExp; 1772 bits64 aSig; 1773 bits32 zSig; 1774 1775 aSig = extractFloat64Frac( a ); 1776 aExp = extractFloat64Exp( a ); 1777 aSign = extractFloat64Sign( a ); 1778 if ( aExp == 0x7FF ) { 1779 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 1780 return packFloat32( aSign, 0xFF, 0 ); 1781 } 1782 shift64RightJamming( aSig, 22, &aSig ); 1783 zSig = aSig; 1784 if ( aExp || zSig ) { 1785 zSig |= 0x40000000; 1786 aExp -= 0x381; 1787 } 1788 return roundAndPackFloat32( roundData, aSign, aExp, zSig ); 1789 1790} 1791 1792#ifdef FLOATX80 1793 1794/* 1795------------------------------------------------------------------------------- 1796Returns the result of converting the double-precision floating-point value 1797`a' to the extended double-precision floating-point format. The conversion 1798is performed according to the IEC/IEEE Standard for Binary Floating-point 1799Arithmetic. 1800------------------------------------------------------------------------------- 1801*/ 1802floatx80 float64_to_floatx80( float64 a ) 1803{ 1804 flag aSign; 1805 int16 aExp; 1806 bits64 aSig; 1807 1808 aSig = extractFloat64Frac( a ); 1809 aExp = extractFloat64Exp( a ); 1810 aSign = extractFloat64Sign( a ); 1811 if ( aExp == 0x7FF ) { 1812 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 1813 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1814 } 1815 if ( aExp == 0 ) { 1816 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1817 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 1818 } 1819 return 1820 packFloatx80( 1821 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 1822 1823} 1824 1825#endif 1826 1827/* 1828------------------------------------------------------------------------------- 1829Rounds the double-precision floating-point value `a' to an integer, and 1830returns the result as a double-precision floating-point value. The 1831operation is performed according to the IEC/IEEE Standard for Binary 1832Floating-point Arithmetic. 1833------------------------------------------------------------------------------- 1834*/ 1835float64 float64_round_to_int( struct roundingData *roundData, float64 a ) 1836{ 1837 flag aSign; 1838 int16 aExp; 1839 bits64 lastBitMask, roundBitsMask; 1840 int8 roundingMode; 1841 float64 z; 1842 1843 aExp = extractFloat64Exp( a ); 1844 if ( 0x433 <= aExp ) { 1845 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 1846 return propagateFloat64NaN( a, a ); 1847 } 1848 return a; 1849 } 1850 if ( aExp <= 0x3FE ) { 1851 if ( (bits64) ( a<<1 ) == 0 ) return a; 1852 roundData->exception |= float_flag_inexact; 1853 aSign = extractFloat64Sign( a ); 1854 switch ( roundData->mode ) { 1855 case float_round_nearest_even: 1856 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 1857 return packFloat64( aSign, 0x3FF, 0 ); 1858 } 1859 break; 1860 case float_round_down: 1861 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 1862 case float_round_up: 1863 return 1864 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 1865 } 1866 return packFloat64( aSign, 0, 0 ); 1867 } 1868 lastBitMask = 1; 1869 lastBitMask <<= 0x433 - aExp; 1870 roundBitsMask = lastBitMask - 1; 1871 z = a; 1872 roundingMode = roundData->mode; 1873 if ( roundingMode == float_round_nearest_even ) { 1874 z += lastBitMask>>1; 1875 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1876 } 1877 else if ( roundingMode != float_round_to_zero ) { 1878 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1879 z += roundBitsMask; 1880 } 1881 } 1882 z &= ~ roundBitsMask; 1883 if ( z != a ) roundData->exception |= float_flag_inexact; 1884 return z; 1885 1886} 1887 1888/* 1889------------------------------------------------------------------------------- 1890Returns the result of adding the absolute values of the double-precision 1891floating-point values `a' and `b'. If `zSign' is true, the sum is negated 1892before being returned. `zSign' is ignored if the result is a NaN. The 1893addition is performed according to the IEC/IEEE Standard for Binary 1894Floating-point Arithmetic. 1895------------------------------------------------------------------------------- 1896*/ 1897static float64 addFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign ) 1898{ 1899 int16 aExp, bExp, zExp; 1900 bits64 aSig, bSig, zSig; 1901 int16 expDiff; 1902 1903 aSig = extractFloat64Frac( a ); 1904 aExp = extractFloat64Exp( a ); 1905 bSig = extractFloat64Frac( b ); 1906 bExp = extractFloat64Exp( b ); 1907 expDiff = aExp - bExp; 1908 aSig <<= 9; 1909 bSig <<= 9; 1910 if ( 0 < expDiff ) { 1911 if ( aExp == 0x7FF ) { 1912 if ( aSig ) return propagateFloat64NaN( a, b ); 1913 return a; 1914 } 1915 if ( bExp == 0 ) { 1916 --expDiff; 1917 } 1918 else { 1919 bSig |= LIT64( 0x2000000000000000 ); 1920 } 1921 shift64RightJamming( bSig, expDiff, &bSig ); 1922 zExp = aExp; 1923 } 1924 else if ( expDiff < 0 ) { 1925 if ( bExp == 0x7FF ) { 1926 if ( bSig ) return propagateFloat64NaN( a, b ); 1927 return packFloat64( zSign, 0x7FF, 0 ); 1928 } 1929 if ( aExp == 0 ) { 1930 ++expDiff; 1931 } 1932 else { 1933 aSig |= LIT64( 0x2000000000000000 ); 1934 } 1935 shift64RightJamming( aSig, - expDiff, &aSig ); 1936 zExp = bExp; 1937 } 1938 else { 1939 if ( aExp == 0x7FF ) { 1940 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 1941 return a; 1942 } 1943 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 1944 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 1945 zExp = aExp; 1946 goto roundAndPack; 1947 } 1948 aSig |= LIT64( 0x2000000000000000 ); 1949 zSig = ( aSig + bSig )<<1; 1950 --zExp; 1951 if ( (sbits64) zSig < 0 ) { 1952 zSig = aSig + bSig; 1953 ++zExp; 1954 } 1955 roundAndPack: 1956 return roundAndPackFloat64( roundData, zSign, zExp, zSig ); 1957 1958} 1959 1960/* 1961------------------------------------------------------------------------------- 1962Returns the result of subtracting the absolute values of the double- 1963precision floating-point values `a' and `b'. If `zSign' is true, the 1964difference is negated before being returned. `zSign' is ignored if the 1965result is a NaN. The subtraction is performed according to the IEC/IEEE 1966Standard for Binary Floating-point Arithmetic. 1967------------------------------------------------------------------------------- 1968*/ 1969static float64 subFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign ) 1970{ 1971 int16 aExp, bExp, zExp; 1972 bits64 aSig, bSig, zSig; 1973 int16 expDiff; 1974 1975 aSig = extractFloat64Frac( a ); 1976 aExp = extractFloat64Exp( a ); 1977 bSig = extractFloat64Frac( b ); 1978 bExp = extractFloat64Exp( b ); 1979 expDiff = aExp - bExp; 1980 aSig <<= 10; 1981 bSig <<= 10; 1982 if ( 0 < expDiff ) goto aExpBigger; 1983 if ( expDiff < 0 ) goto bExpBigger; 1984 if ( aExp == 0x7FF ) { 1985 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 1986 roundData->exception |= float_flag_invalid; 1987 return float64_default_nan; 1988 } 1989 if ( aExp == 0 ) { 1990 aExp = 1; 1991 bExp = 1; 1992 } 1993 if ( bSig < aSig ) goto aBigger; 1994 if ( aSig < bSig ) goto bBigger; 1995 return packFloat64( roundData->mode == float_round_down, 0, 0 ); 1996 bExpBigger: 1997 if ( bExp == 0x7FF ) { 1998 if ( bSig ) return propagateFloat64NaN( a, b ); 1999 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2000 } 2001 if ( aExp == 0 ) { 2002 ++expDiff; 2003 } 2004 else { 2005 aSig |= LIT64( 0x4000000000000000 ); 2006 } 2007 shift64RightJamming( aSig, - expDiff, &aSig ); 2008 bSig |= LIT64( 0x4000000000000000 ); 2009 bBigger: 2010 zSig = bSig - aSig; 2011 zExp = bExp; 2012 zSign ^= 1; 2013 goto normalizeRoundAndPack; 2014 aExpBigger: 2015 if ( aExp == 0x7FF ) { 2016 if ( aSig ) return propagateFloat64NaN( a, b ); 2017 return a; 2018 } 2019 if ( bExp == 0 ) { 2020 --expDiff; 2021 } 2022 else { 2023 bSig |= LIT64( 0x4000000000000000 ); 2024 } 2025 shift64RightJamming( bSig, expDiff, &bSig ); 2026 aSig |= LIT64( 0x4000000000000000 ); 2027 aBigger: 2028 zSig = aSig - bSig; 2029 zExp = aExp; 2030 normalizeRoundAndPack: 2031 --zExp; 2032 return normalizeRoundAndPackFloat64( roundData, zSign, zExp, zSig ); 2033 2034} 2035 2036/* 2037------------------------------------------------------------------------------- 2038Returns the result of adding the double-precision floating-point values `a' 2039and `b'. The operation is performed according to the IEC/IEEE Standard for 2040Binary Floating-point Arithmetic. 2041------------------------------------------------------------------------------- 2042*/ 2043float64 float64_add( struct roundingData *roundData, float64 a, float64 b ) 2044{ 2045 flag aSign, bSign; 2046 2047 aSign = extractFloat64Sign( a ); 2048 bSign = extractFloat64Sign( b ); 2049 if ( aSign == bSign ) { 2050 return addFloat64Sigs( roundData, a, b, aSign ); 2051 } 2052 else { 2053 return subFloat64Sigs( roundData, a, b, aSign ); 2054 } 2055 2056} 2057 2058/* 2059------------------------------------------------------------------------------- 2060Returns the result of subtracting the double-precision floating-point values 2061`a' and `b'. The operation is performed according to the IEC/IEEE Standard 2062for Binary Floating-point Arithmetic. 2063------------------------------------------------------------------------------- 2064*/ 2065float64 float64_sub( struct roundingData *roundData, float64 a, float64 b ) 2066{ 2067 flag aSign, bSign; 2068 2069 aSign = extractFloat64Sign( a ); 2070 bSign = extractFloat64Sign( b ); 2071 if ( aSign == bSign ) { 2072 return subFloat64Sigs( roundData, a, b, aSign ); 2073 } 2074 else { 2075 return addFloat64Sigs( roundData, a, b, aSign ); 2076 } 2077 2078} 2079 2080/* 2081------------------------------------------------------------------------------- 2082Returns the result of multiplying the double-precision floating-point values 2083`a' and `b'. The operation is performed according to the IEC/IEEE Standard 2084for Binary Floating-point Arithmetic. 2085------------------------------------------------------------------------------- 2086*/ 2087float64 float64_mul( struct roundingData *roundData, float64 a, float64 b ) 2088{ 2089 flag aSign, bSign, zSign; 2090 int16 aExp, bExp, zExp; 2091 bits64 aSig, bSig, zSig0, zSig1; 2092 2093 aSig = extractFloat64Frac( a ); 2094 aExp = extractFloat64Exp( a ); 2095 aSign = extractFloat64Sign( a ); 2096 bSig = extractFloat64Frac( b ); 2097 bExp = extractFloat64Exp( b ); 2098 bSign = extractFloat64Sign( b ); 2099 zSign = aSign ^ bSign; 2100 if ( aExp == 0x7FF ) { 2101 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2102 return propagateFloat64NaN( a, b ); 2103 } 2104 if ( ( bExp | bSig ) == 0 ) { 2105 roundData->exception |= float_flag_invalid; 2106 return float64_default_nan; 2107 } 2108 return packFloat64( zSign, 0x7FF, 0 ); 2109 } 2110 if ( bExp == 0x7FF ) { 2111 if ( bSig ) return propagateFloat64NaN( a, b ); 2112 if ( ( aExp | aSig ) == 0 ) { 2113 roundData->exception |= float_flag_invalid; 2114 return float64_default_nan; 2115 } 2116 return packFloat64( zSign, 0x7FF, 0 ); 2117 } 2118 if ( aExp == 0 ) { 2119 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2120 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2121 } 2122 if ( bExp == 0 ) { 2123 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2124 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2125 } 2126 zExp = aExp + bExp - 0x3FF; 2127 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2128 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2129 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2130 zSig0 |= ( zSig1 != 0 ); 2131 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2132 zSig0 <<= 1; 2133 --zExp; 2134 } 2135 return roundAndPackFloat64( roundData, zSign, zExp, zSig0 ); 2136 2137} 2138 2139/* 2140------------------------------------------------------------------------------- 2141Returns the result of dividing the double-precision floating-point value `a' 2142by the corresponding value `b'. The operation is performed according to 2143the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2144------------------------------------------------------------------------------- 2145*/ 2146float64 float64_div( struct roundingData *roundData, float64 a, float64 b ) 2147{ 2148 flag aSign, bSign, zSign; 2149 int16 aExp, bExp, zExp; 2150 bits64 aSig, bSig, zSig; 2151 bits64 rem0, rem1; 2152 bits64 term0, term1; 2153 2154 aSig = extractFloat64Frac( a ); 2155 aExp = extractFloat64Exp( a ); 2156 aSign = extractFloat64Sign( a ); 2157 bSig = extractFloat64Frac( b ); 2158 bExp = extractFloat64Exp( b ); 2159 bSign = extractFloat64Sign( b ); 2160 zSign = aSign ^ bSign; 2161 if ( aExp == 0x7FF ) { 2162 if ( aSig ) return propagateFloat64NaN( a, b ); 2163 if ( bExp == 0x7FF ) { 2164 if ( bSig ) return propagateFloat64NaN( a, b ); 2165 roundData->exception |= float_flag_invalid; 2166 return float64_default_nan; 2167 } 2168 return packFloat64( zSign, 0x7FF, 0 ); 2169 } 2170 if ( bExp == 0x7FF ) { 2171 if ( bSig ) return propagateFloat64NaN( a, b ); 2172 return packFloat64( zSign, 0, 0 ); 2173 } 2174 if ( bExp == 0 ) { 2175 if ( bSig == 0 ) { 2176 if ( ( aExp | aSig ) == 0 ) { 2177 roundData->exception |= float_flag_invalid; 2178 return float64_default_nan; 2179 } 2180 roundData->exception |= float_flag_divbyzero; 2181 return packFloat64( zSign, 0x7FF, 0 ); 2182 } 2183 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2184 } 2185 if ( aExp == 0 ) { 2186 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2187 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2188 } 2189 zExp = aExp - bExp + 0x3FD; 2190 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2191 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2192 if ( bSig <= ( aSig + aSig ) ) { 2193 aSig >>= 1; 2194 ++zExp; 2195 } 2196 zSig = estimateDiv128To64( aSig, 0, bSig ); 2197 if ( ( zSig & 0x1FF ) <= 2 ) { 2198 mul64To128( bSig, zSig, &term0, &term1 ); 2199 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2200 while ( (sbits64) rem0 < 0 ) { 2201 --zSig; 2202 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2203 } 2204 zSig |= ( rem1 != 0 ); 2205 } 2206 return roundAndPackFloat64( roundData, zSign, zExp, zSig ); 2207 2208} 2209 2210/* 2211------------------------------------------------------------------------------- 2212Returns the remainder of the double-precision floating-point value `a' 2213with respect to the corresponding value `b'. The operation is performed 2214according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2215------------------------------------------------------------------------------- 2216*/ 2217float64 float64_rem( struct roundingData *roundData, float64 a, float64 b ) 2218{ 2219 flag aSign, bSign, zSign; 2220 int16 aExp, bExp, expDiff; 2221 bits64 aSig, bSig; 2222 bits64 q, alternateASig; 2223 sbits64 sigMean; 2224 2225 aSig = extractFloat64Frac( a ); 2226 aExp = extractFloat64Exp( a ); 2227 aSign = extractFloat64Sign( a ); 2228 bSig = extractFloat64Frac( b ); 2229 bExp = extractFloat64Exp( b ); 2230 bSign = extractFloat64Sign( b ); 2231 if ( aExp == 0x7FF ) { 2232 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2233 return propagateFloat64NaN( a, b ); 2234 } 2235 roundData->exception |= float_flag_invalid; 2236 return float64_default_nan; 2237 } 2238 if ( bExp == 0x7FF ) { 2239 if ( bSig ) return propagateFloat64NaN( a, b ); 2240 return a; 2241 } 2242 if ( bExp == 0 ) { 2243 if ( bSig == 0 ) { 2244 roundData->exception |= float_flag_invalid; 2245 return float64_default_nan; 2246 } 2247 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2248 } 2249 if ( aExp == 0 ) { 2250 if ( aSig == 0 ) return a; 2251 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2252 } 2253 expDiff = aExp - bExp; 2254 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 2255 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2256 if ( expDiff < 0 ) { 2257 if ( expDiff < -1 ) return a; 2258 aSig >>= 1; 2259 } 2260 q = ( bSig <= aSig ); 2261 if ( q ) aSig -= bSig; 2262 expDiff -= 64; 2263 while ( 0 < expDiff ) { 2264 q = estimateDiv128To64( aSig, 0, bSig ); 2265 q = ( 2 < q ) ? q - 2 : 0; 2266 aSig = - ( ( bSig>>2 ) * q ); 2267 expDiff -= 62; 2268 } 2269 expDiff += 64; 2270 if ( 0 < expDiff ) { 2271 q = estimateDiv128To64( aSig, 0, bSig ); 2272 q = ( 2 < q ) ? q - 2 : 0; 2273 q >>= 64 - expDiff; 2274 bSig >>= 2; 2275 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2276 } 2277 else { 2278 aSig >>= 2; 2279 bSig >>= 2; 2280 } 2281 do { 2282 alternateASig = aSig; 2283 ++q; 2284 aSig -= bSig; 2285 } while ( 0 <= (sbits64) aSig ); 2286 sigMean = aSig + alternateASig; 2287 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2288 aSig = alternateASig; 2289 } 2290 zSign = ( (sbits64) aSig < 0 ); 2291 if ( zSign ) aSig = - aSig; 2292 return normalizeRoundAndPackFloat64( roundData, aSign ^ zSign, bExp, aSig ); 2293 2294} 2295 2296/* 2297------------------------------------------------------------------------------- 2298Returns the square root of the double-precision floating-point value `a'. 2299The operation is performed according to the IEC/IEEE Standard for Binary 2300Floating-point Arithmetic. 2301------------------------------------------------------------------------------- 2302*/ 2303float64 float64_sqrt( struct roundingData *roundData, float64 a ) 2304{ 2305 flag aSign; 2306 int16 aExp, zExp; 2307 bits64 aSig, zSig; 2308 bits64 rem0, rem1, term0, term1; //, shiftedRem; 2309 //float64 z; 2310 2311 aSig = extractFloat64Frac( a ); 2312 aExp = extractFloat64Exp( a ); 2313 aSign = extractFloat64Sign( a ); 2314 if ( aExp == 0x7FF ) { 2315 if ( aSig ) return propagateFloat64NaN( a, a ); 2316 if ( ! aSign ) return a; 2317 roundData->exception |= float_flag_invalid; 2318 return float64_default_nan; 2319 } 2320 if ( aSign ) { 2321 if ( ( aExp | aSig ) == 0 ) return a; 2322 roundData->exception |= float_flag_invalid; 2323 return float64_default_nan; 2324 } 2325 if ( aExp == 0 ) { 2326 if ( aSig == 0 ) return 0; 2327 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2328 } 2329 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 2330 aSig |= LIT64( 0x0010000000000000 ); 2331 zSig = estimateSqrt32( aExp, aSig>>21 ); 2332 zSig <<= 31; 2333 aSig <<= 9 - ( aExp & 1 ); 2334 zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2; 2335 if ( ( zSig & 0x3FF ) <= 5 ) { 2336 if ( zSig < 2 ) { 2337 zSig = LIT64( 0xFFFFFFFFFFFFFFFF ); 2338 } 2339 else { 2340 aSig <<= 2; 2341 mul64To128( zSig, zSig, &term0, &term1 ); 2342 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2343 while ( (sbits64) rem0 < 0 ) { 2344 --zSig; 2345 shortShift128Left( 0, zSig, 1, &term0, &term1 ); 2346 term1 |= 1; 2347 add128( rem0, rem1, term0, term1, &rem0, &rem1 ); 2348 } 2349 zSig |= ( ( rem0 | rem1 ) != 0 ); 2350 } 2351 } 2352 shift64RightJamming( zSig, 1, &zSig ); 2353 return roundAndPackFloat64( roundData, 0, zExp, zSig ); 2354 2355} 2356 2357/* 2358------------------------------------------------------------------------------- 2359Returns 1 if the double-precision floating-point value `a' is equal to the 2360corresponding value `b', and 0 otherwise. The comparison is performed 2361according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2362------------------------------------------------------------------------------- 2363*/ 2364flag float64_eq( float64 a, float64 b ) 2365{ 2366 2367 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2368 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2369 ) { 2370 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 2371 float_raise( float_flag_invalid ); 2372 } 2373 return 0; 2374 } 2375 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2376 2377} 2378 2379/* 2380------------------------------------------------------------------------------- 2381Returns 1 if the double-precision floating-point value `a' is less than or 2382equal to the corresponding value `b', and 0 otherwise. The comparison is 2383performed according to the IEC/IEEE Standard for Binary Floating-point 2384Arithmetic. 2385------------------------------------------------------------------------------- 2386*/ 2387flag float64_le( float64 a, float64 b ) 2388{ 2389 flag aSign, bSign; 2390 2391 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2392 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2393 ) { 2394 float_raise( float_flag_invalid ); 2395 return 0; 2396 } 2397 aSign = extractFloat64Sign( a ); 2398 bSign = extractFloat64Sign( b ); 2399 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2400 return ( a == b ) || ( aSign ^ ( a < b ) ); 2401 2402} 2403 2404/* 2405------------------------------------------------------------------------------- 2406Returns 1 if the double-precision floating-point value `a' is less than 2407the corresponding value `b', and 0 otherwise. The comparison is performed 2408according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2409------------------------------------------------------------------------------- 2410*/ 2411flag float64_lt( float64 a, float64 b ) 2412{ 2413 flag aSign, bSign; 2414 2415 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2416 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2417 ) { 2418 float_raise( float_flag_invalid ); 2419 return 0; 2420 } 2421 aSign = extractFloat64Sign( a ); 2422 bSign = extractFloat64Sign( b ); 2423 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 2424 return ( a != b ) && ( aSign ^ ( a < b ) ); 2425 2426} 2427 2428/* 2429------------------------------------------------------------------------------- 2430Returns 1 if the double-precision floating-point value `a' is equal to the 2431corresponding value `b', and 0 otherwise. The invalid exception is raised 2432if either operand is a NaN. Otherwise, the comparison is performed 2433according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2434------------------------------------------------------------------------------- 2435*/ 2436flag float64_eq_signaling( float64 a, float64 b ) 2437{ 2438 2439 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2440 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2441 ) { 2442 float_raise( float_flag_invalid ); 2443 return 0; 2444 } 2445 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2446 2447} 2448 2449/* 2450------------------------------------------------------------------------------- 2451Returns 1 if the double-precision floating-point value `a' is less than or 2452equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2453cause an exception. Otherwise, the comparison is performed according to the 2454IEC/IEEE Standard for Binary Floating-point Arithmetic. 2455------------------------------------------------------------------------------- 2456*/ 2457flag float64_le_quiet( float64 a, float64 b ) 2458{ 2459 flag aSign, bSign; 2460 //int16 aExp, bExp; 2461 2462 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2463 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2464 ) { 2465 /* Do nothing, even if NaN as we're quiet */ 2466 return 0; 2467 } 2468 aSign = extractFloat64Sign( a ); 2469 bSign = extractFloat64Sign( b ); 2470 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2471 return ( a == b ) || ( aSign ^ ( a < b ) ); 2472 2473} 2474 2475/* 2476------------------------------------------------------------------------------- 2477Returns 1 if the double-precision floating-point value `a' is less than 2478the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2479exception. Otherwise, the comparison is performed according to the IEC/IEEE 2480Standard for Binary Floating-point Arithmetic. 2481------------------------------------------------------------------------------- 2482*/ 2483flag float64_lt_quiet( float64 a, float64 b ) 2484{ 2485 flag aSign, bSign; 2486 2487 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2488 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2489 ) { 2490 /* Do nothing, even if NaN as we're quiet */ 2491 return 0; 2492 } 2493 aSign = extractFloat64Sign( a ); 2494 bSign = extractFloat64Sign( b ); 2495 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 2496 return ( a != b ) && ( aSign ^ ( a < b ) ); 2497 2498} 2499 2500#ifdef FLOATX80 2501 2502/* 2503------------------------------------------------------------------------------- 2504Returns the result of converting the extended double-precision floating- 2505point value `a' to the 32-bit two's complement integer format. The 2506conversion is performed according to the IEC/IEEE Standard for Binary 2507Floating-point Arithmetic---which means in particular that the conversion 2508is rounded according to the current rounding mode. If `a' is a NaN, the 2509largest positive integer is returned. Otherwise, if the conversion 2510overflows, the largest integer with the same sign as `a' is returned. 2511------------------------------------------------------------------------------- 2512*/ 2513int32 floatx80_to_int32( struct roundingData *roundData, floatx80 a ) 2514{ 2515 flag aSign; 2516 int32 aExp, shiftCount; 2517 bits64 aSig; 2518 2519 aSig = extractFloatx80Frac( a ); 2520 aExp = extractFloatx80Exp( a ); 2521 aSign = extractFloatx80Sign( a ); 2522 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 2523 shiftCount = 0x4037 - aExp; 2524 if ( shiftCount <= 0 ) shiftCount = 1; 2525 shift64RightJamming( aSig, shiftCount, &aSig ); 2526 return roundAndPackInt32( roundData, aSign, aSig ); 2527 2528} 2529 2530/* 2531------------------------------------------------------------------------------- 2532Returns the result of converting the extended double-precision floating- 2533point value `a' to the 32-bit two's complement integer format. The 2534conversion is performed according to the IEC/IEEE Standard for Binary 2535Floating-point Arithmetic, except that the conversion is always rounded 2536toward zero. If `a' is a NaN, the largest positive integer is returned. 2537Otherwise, if the conversion overflows, the largest integer with the same 2538sign as `a' is returned. 2539------------------------------------------------------------------------------- 2540*/ 2541int32 floatx80_to_int32_round_to_zero( floatx80 a ) 2542{ 2543 flag aSign; 2544 int32 aExp, shiftCount; 2545 bits64 aSig, savedASig; 2546 int32 z; 2547 2548 aSig = extractFloatx80Frac( a ); 2549 aExp = extractFloatx80Exp( a ); 2550 aSign = extractFloatx80Sign( a ); 2551 shiftCount = 0x403E - aExp; 2552 if ( shiftCount < 32 ) { 2553 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 2554 goto invalid; 2555 } 2556 else if ( 63 < shiftCount ) { 2557 if ( aExp || aSig ) float_raise( float_flag_inexact ); 2558 return 0; 2559 } 2560 savedASig = aSig; 2561 aSig >>= shiftCount; 2562 z = aSig; 2563 if ( aSign ) z = - z; 2564 if ( ( z < 0 ) ^ aSign ) { 2565 invalid: 2566 float_raise( float_flag_invalid ); 2567 return aSign ? 0x80000000 : 0x7FFFFFFF; 2568 } 2569 if ( ( aSig<<shiftCount ) != savedASig ) { 2570 float_raise( float_flag_inexact ); 2571 } 2572 return z; 2573 2574} 2575 2576/* 2577------------------------------------------------------------------------------- 2578Returns the result of converting the extended double-precision floating- 2579point value `a' to the single-precision floating-point format. The 2580conversion is performed according to the IEC/IEEE Standard for Binary 2581Floating-point Arithmetic. 2582------------------------------------------------------------------------------- 2583*/ 2584float32 floatx80_to_float32( struct roundingData *roundData, floatx80 a ) 2585{ 2586 flag aSign; 2587 int32 aExp; 2588 bits64 aSig; 2589 2590 aSig = extractFloatx80Frac( a ); 2591 aExp = extractFloatx80Exp( a ); 2592 aSign = extractFloatx80Sign( a ); 2593 if ( aExp == 0x7FFF ) { 2594 if ( (bits64) ( aSig<<1 ) ) { 2595 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 2596 } 2597 return packFloat32( aSign, 0xFF, 0 ); 2598 } 2599 shift64RightJamming( aSig, 33, &aSig ); 2600 if ( aExp || aSig ) aExp -= 0x3F81; 2601 return roundAndPackFloat32( roundData, aSign, aExp, aSig ); 2602 2603} 2604 2605/* 2606------------------------------------------------------------------------------- 2607Returns the result of converting the extended double-precision floating- 2608point value `a' to the double-precision floating-point format. The 2609conversion is performed according to the IEC/IEEE Standard for Binary 2610Floating-point Arithmetic. 2611------------------------------------------------------------------------------- 2612*/ 2613float64 floatx80_to_float64( struct roundingData *roundData, floatx80 a ) 2614{ 2615 flag aSign; 2616 int32 aExp; 2617 bits64 aSig, zSig; 2618 2619 aSig = extractFloatx80Frac( a ); 2620 aExp = extractFloatx80Exp( a ); 2621 aSign = extractFloatx80Sign( a ); 2622 if ( aExp == 0x7FFF ) { 2623 if ( (bits64) ( aSig<<1 ) ) { 2624 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 2625 } 2626 return packFloat64( aSign, 0x7FF, 0 ); 2627 } 2628 shift64RightJamming( aSig, 1, &zSig ); 2629 if ( aExp || aSig ) aExp -= 0x3C01; 2630 return roundAndPackFloat64( roundData, aSign, aExp, zSig ); 2631 2632} 2633 2634/* 2635------------------------------------------------------------------------------- 2636Rounds the extended double-precision floating-point value `a' to an integer, 2637and returns the result as an extended quadruple-precision floating-point 2638value. The operation is performed according to the IEC/IEEE Standard for 2639Binary Floating-point Arithmetic. 2640------------------------------------------------------------------------------- 2641*/ 2642floatx80 floatx80_round_to_int( struct roundingData *roundData, floatx80 a ) 2643{ 2644 flag aSign; 2645 int32 aExp; 2646 bits64 lastBitMask, roundBitsMask; 2647 int8 roundingMode; 2648 floatx80 z; 2649 2650 aExp = extractFloatx80Exp( a ); 2651 if ( 0x403E <= aExp ) { 2652 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 2653 return propagateFloatx80NaN( a, a ); 2654 } 2655 return a; 2656 } 2657 if ( aExp <= 0x3FFE ) { 2658 if ( ( aExp == 0 ) 2659 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 2660 return a; 2661 } 2662 roundData->exception |= float_flag_inexact; 2663 aSign = extractFloatx80Sign( a ); 2664 switch ( roundData->mode ) { 2665 case float_round_nearest_even: 2666 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 2667 ) { 2668 return 2669 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 2670 } 2671 break; 2672 case float_round_down: 2673 return 2674 aSign ? 2675 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 2676 : packFloatx80( 0, 0, 0 ); 2677 case float_round_up: 2678 return 2679 aSign ? packFloatx80( 1, 0, 0 ) 2680 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 2681 } 2682 return packFloatx80( aSign, 0, 0 ); 2683 } 2684 lastBitMask = 1; 2685 lastBitMask <<= 0x403E - aExp; 2686 roundBitsMask = lastBitMask - 1; 2687 z = a; 2688 roundingMode = roundData->mode; 2689 if ( roundingMode == float_round_nearest_even ) { 2690 z.low += lastBitMask>>1; 2691 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 2692 } 2693 else if ( roundingMode != float_round_to_zero ) { 2694 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2695 z.low += roundBitsMask; 2696 } 2697 } 2698 z.low &= ~ roundBitsMask; 2699 if ( z.low == 0 ) { 2700 ++z.high; 2701 z.low = LIT64( 0x8000000000000000 ); 2702 } 2703 if ( z.low != a.low ) roundData->exception |= float_flag_inexact; 2704 return z; 2705 2706} 2707 2708/* 2709------------------------------------------------------------------------------- 2710Returns the result of adding the absolute values of the extended double- 2711precision floating-point values `a' and `b'. If `zSign' is true, the sum is 2712negated before being returned. `zSign' is ignored if the result is a NaN. 2713The addition is performed according to the IEC/IEEE Standard for Binary 2714Floating-point Arithmetic. 2715------------------------------------------------------------------------------- 2716*/ 2717static floatx80 addFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign ) 2718{ 2719 int32 aExp, bExp, zExp; 2720 bits64 aSig, bSig, zSig0, zSig1; 2721 int32 expDiff; 2722 2723 aSig = extractFloatx80Frac( a ); 2724 aExp = extractFloatx80Exp( a ); 2725 bSig = extractFloatx80Frac( b ); 2726 bExp = extractFloatx80Exp( b ); 2727 expDiff = aExp - bExp; 2728 if ( 0 < expDiff ) { 2729 if ( aExp == 0x7FFF ) { 2730 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2731 return a; 2732 } 2733 if ( bExp == 0 ) --expDiff; 2734 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 2735 zExp = aExp; 2736 } 2737 else if ( expDiff < 0 ) { 2738 if ( bExp == 0x7FFF ) { 2739 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2740 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2741 } 2742 if ( aExp == 0 ) ++expDiff; 2743 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 2744 zExp = bExp; 2745 } 2746 else { 2747 if ( aExp == 0x7FFF ) { 2748 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 2749 return propagateFloatx80NaN( a, b ); 2750 } 2751 return a; 2752 } 2753 zSig1 = 0; 2754 zSig0 = aSig + bSig; 2755 if ( aExp == 0 ) { 2756 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 2757 goto roundAndPack; 2758 } 2759 zExp = aExp; 2760 goto shiftRight1; 2761 } 2762 2763 zSig0 = aSig + bSig; 2764 2765 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 2766 shiftRight1: 2767 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 2768 zSig0 |= LIT64( 0x8000000000000000 ); 2769 ++zExp; 2770 roundAndPack: 2771 return 2772 roundAndPackFloatx80( 2773 roundData, zSign, zExp, zSig0, zSig1 ); 2774 2775} 2776 2777/* 2778------------------------------------------------------------------------------- 2779Returns the result of subtracting the absolute values of the extended 2780double-precision floating-point values `a' and `b'. If `zSign' is true, 2781the difference is negated before being returned. `zSign' is ignored if the 2782result is a NaN. The subtraction is performed according to the IEC/IEEE 2783Standard for Binary Floating-point Arithmetic. 2784------------------------------------------------------------------------------- 2785*/ 2786static floatx80 subFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign ) 2787{ 2788 int32 aExp, bExp, zExp; 2789 bits64 aSig, bSig, zSig0, zSig1; 2790 int32 expDiff; 2791 floatx80 z; 2792 2793 aSig = extractFloatx80Frac( a ); 2794 aExp = extractFloatx80Exp( a ); 2795 bSig = extractFloatx80Frac( b ); 2796 bExp = extractFloatx80Exp( b ); 2797 expDiff = aExp - bExp; 2798 if ( 0 < expDiff ) goto aExpBigger; 2799 if ( expDiff < 0 ) goto bExpBigger; 2800 if ( aExp == 0x7FFF ) { 2801 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 2802 return propagateFloatx80NaN( a, b ); 2803 } 2804 roundData->exception |= float_flag_invalid; 2805 z.low = floatx80_default_nan_low; 2806 z.high = floatx80_default_nan_high; 2807 z.__padding = 0; 2808 return z; 2809 } 2810 if ( aExp == 0 ) { 2811 aExp = 1; 2812 bExp = 1; 2813 } 2814 zSig1 = 0; 2815 if ( bSig < aSig ) goto aBigger; 2816 if ( aSig < bSig ) goto bBigger; 2817 return packFloatx80( roundData->mode == float_round_down, 0, 0 ); 2818 bExpBigger: 2819 if ( bExp == 0x7FFF ) { 2820 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2821 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2822 } 2823 if ( aExp == 0 ) ++expDiff; 2824 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 2825 bBigger: 2826 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 2827 zExp = bExp; 2828 zSign ^= 1; 2829 goto normalizeRoundAndPack; 2830 aExpBigger: 2831 if ( aExp == 0x7FFF ) { 2832 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2833 return a; 2834 } 2835 if ( bExp == 0 ) --expDiff; 2836 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 2837 aBigger: 2838 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 2839 zExp = aExp; 2840 normalizeRoundAndPack: 2841 return 2842 normalizeRoundAndPackFloatx80( 2843 roundData, zSign, zExp, zSig0, zSig1 ); 2844 2845} 2846 2847/* 2848------------------------------------------------------------------------------- 2849Returns the result of adding the extended double-precision floating-point 2850values `a' and `b'. The operation is performed according to the IEC/IEEE 2851Standard for Binary Floating-point Arithmetic. 2852------------------------------------------------------------------------------- 2853*/ 2854floatx80 floatx80_add( struct roundingData *roundData, floatx80 a, floatx80 b ) 2855{ 2856 flag aSign, bSign; 2857 2858 aSign = extractFloatx80Sign( a ); 2859 bSign = extractFloatx80Sign( b ); 2860 if ( aSign == bSign ) { 2861 return addFloatx80Sigs( roundData, a, b, aSign ); 2862 } 2863 else { 2864 return subFloatx80Sigs( roundData, a, b, aSign ); 2865 } 2866 2867} 2868 2869/* 2870------------------------------------------------------------------------------- 2871Returns the result of subtracting the extended double-precision floating- 2872point values `a' and `b'. The operation is performed according to the 2873IEC/IEEE Standard for Binary Floating-point Arithmetic. 2874------------------------------------------------------------------------------- 2875*/ 2876floatx80 floatx80_sub( struct roundingData *roundData, floatx80 a, floatx80 b ) 2877{ 2878 flag aSign, bSign; 2879 2880 aSign = extractFloatx80Sign( a ); 2881 bSign = extractFloatx80Sign( b ); 2882 if ( aSign == bSign ) { 2883 return subFloatx80Sigs( roundData, a, b, aSign ); 2884 } 2885 else { 2886 return addFloatx80Sigs( roundData, a, b, aSign ); 2887 } 2888 2889} 2890 2891/* 2892------------------------------------------------------------------------------- 2893Returns the result of multiplying the extended double-precision floating- 2894point values `a' and `b'. The operation is performed according to the 2895IEC/IEEE Standard for Binary Floating-point Arithmetic. 2896------------------------------------------------------------------------------- 2897*/ 2898floatx80 floatx80_mul( struct roundingData *roundData, floatx80 a, floatx80 b ) 2899{ 2900 flag aSign, bSign, zSign; 2901 int32 aExp, bExp, zExp; 2902 bits64 aSig, bSig, zSig0, zSig1; 2903 floatx80 z; 2904 2905 aSig = extractFloatx80Frac( a ); 2906 aExp = extractFloatx80Exp( a ); 2907 aSign = extractFloatx80Sign( a ); 2908 bSig = extractFloatx80Frac( b ); 2909 bExp = extractFloatx80Exp( b ); 2910 bSign = extractFloatx80Sign( b ); 2911 zSign = aSign ^ bSign; 2912 if ( aExp == 0x7FFF ) { 2913 if ( (bits64) ( aSig<<1 ) 2914 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 2915 return propagateFloatx80NaN( a, b ); 2916 } 2917 if ( ( bExp | bSig ) == 0 ) goto invalid; 2918 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2919 } 2920 if ( bExp == 0x7FFF ) { 2921 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2922 if ( ( aExp | aSig ) == 0 ) { 2923 invalid: 2924 roundData->exception |= float_flag_invalid; 2925 z.low = floatx80_default_nan_low; 2926 z.high = floatx80_default_nan_high; 2927 z.__padding = 0; 2928 return z; 2929 } 2930 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2931 } 2932 if ( aExp == 0 ) { 2933 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 2934 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 2935 } 2936 if ( bExp == 0 ) { 2937 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 2938 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 2939 } 2940 zExp = aExp + bExp - 0x3FFE; 2941 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2942 if ( 0 < (sbits64) zSig0 ) { 2943 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 2944 --zExp; 2945 } 2946 return 2947 roundAndPackFloatx80( 2948 roundData, zSign, zExp, zSig0, zSig1 ); 2949 2950} 2951 2952/* 2953------------------------------------------------------------------------------- 2954Returns the result of dividing the extended double-precision floating-point 2955value `a' by the corresponding value `b'. The operation is performed 2956according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2957------------------------------------------------------------------------------- 2958*/ 2959floatx80 floatx80_div( struct roundingData *roundData, floatx80 a, floatx80 b ) 2960{ 2961 flag aSign, bSign, zSign; 2962 int32 aExp, bExp, zExp; 2963 bits64 aSig, bSig, zSig0, zSig1; 2964 bits64 rem0, rem1, rem2, term0, term1, term2; 2965 floatx80 z; 2966 2967 aSig = extractFloatx80Frac( a ); 2968 aExp = extractFloatx80Exp( a ); 2969 aSign = extractFloatx80Sign( a ); 2970 bSig = extractFloatx80Frac( b ); 2971 bExp = extractFloatx80Exp( b ); 2972 bSign = extractFloatx80Sign( b ); 2973 zSign = aSign ^ bSign; 2974 if ( aExp == 0x7FFF ) { 2975 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2976 if ( bExp == 0x7FFF ) { 2977 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2978 goto invalid; 2979 } 2980 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2981 } 2982 if ( bExp == 0x7FFF ) { 2983 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2984 return packFloatx80( zSign, 0, 0 ); 2985 } 2986 if ( bExp == 0 ) { 2987 if ( bSig == 0 ) { 2988 if ( ( aExp | aSig ) == 0 ) { 2989 invalid: 2990 roundData->exception |= float_flag_invalid; 2991 z.low = floatx80_default_nan_low; 2992 z.high = floatx80_default_nan_high; 2993 z.__padding = 0; 2994 return z; 2995 } 2996 roundData->exception |= float_flag_divbyzero; 2997 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2998 } 2999 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3000 } 3001 if ( aExp == 0 ) { 3002 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3003 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3004 } 3005 zExp = aExp - bExp + 0x3FFE; 3006 rem1 = 0; 3007 if ( bSig <= aSig ) { 3008 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3009 ++zExp; 3010 } 3011 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3012 mul64To128( bSig, zSig0, &term0, &term1 ); 3013 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3014 while ( (sbits64) rem0 < 0 ) { 3015 --zSig0; 3016 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3017 } 3018 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3019 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3020 mul64To128( bSig, zSig1, &term1, &term2 ); 3021 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3022 while ( (sbits64) rem1 < 0 ) { 3023 --zSig1; 3024 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3025 } 3026 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3027 } 3028 return 3029 roundAndPackFloatx80( 3030 roundData, zSign, zExp, zSig0, zSig1 ); 3031 3032} 3033 3034/* 3035------------------------------------------------------------------------------- 3036Returns the remainder of the extended double-precision floating-point value 3037`a' with respect to the corresponding value `b'. The operation is performed 3038according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 3039------------------------------------------------------------------------------- 3040*/ 3041floatx80 floatx80_rem( struct roundingData *roundData, floatx80 a, floatx80 b ) 3042{ 3043 flag aSign, bSign, zSign; 3044 int32 aExp, bExp, expDiff; 3045 bits64 aSig0, aSig1, bSig; 3046 bits64 q, term0, term1, alternateASig0, alternateASig1; 3047 floatx80 z; 3048 3049 aSig0 = extractFloatx80Frac( a ); 3050 aExp = extractFloatx80Exp( a ); 3051 aSign = extractFloatx80Sign( a ); 3052 bSig = extractFloatx80Frac( b ); 3053 bExp = extractFloatx80Exp( b ); 3054 bSign = extractFloatx80Sign( b ); 3055 if ( aExp == 0x7FFF ) { 3056 if ( (bits64) ( aSig0<<1 ) 3057 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3058 return propagateFloatx80NaN( a, b ); 3059 } 3060 goto invalid; 3061 } 3062 if ( bExp == 0x7FFF ) { 3063 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3064 return a; 3065 } 3066 if ( bExp == 0 ) { 3067 if ( bSig == 0 ) { 3068 invalid: 3069 roundData->exception |= float_flag_invalid; 3070 z.low = floatx80_default_nan_low; 3071 z.high = floatx80_default_nan_high; 3072 z.__padding = 0; 3073 return z; 3074 } 3075 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3076 } 3077 if ( aExp == 0 ) { 3078 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3079 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3080 } 3081 bSig |= LIT64( 0x8000000000000000 ); 3082 zSign = aSign; 3083 expDiff = aExp - bExp; 3084 aSig1 = 0; 3085 if ( expDiff < 0 ) { 3086 if ( expDiff < -1 ) return a; 3087 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3088 expDiff = 0; 3089 } 3090 q = ( bSig <= aSig0 ); 3091 if ( q ) aSig0 -= bSig; 3092 expDiff -= 64; 3093 while ( 0 < expDiff ) { 3094 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3095 q = ( 2 < q ) ? q - 2 : 0; 3096 mul64To128( bSig, q, &term0, &term1 ); 3097 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3098 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3099 expDiff -= 62; 3100 } 3101 expDiff += 64; 3102 if ( 0 < expDiff ) { 3103 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3104 q = ( 2 < q ) ? q - 2 : 0; 3105 q >>= 64 - expDiff; 3106 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 3107 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3108 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 3109 while ( le128( term0, term1, aSig0, aSig1 ) ) { 3110 ++q; 3111 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3112 } 3113 } 3114 else { 3115 term1 = 0; 3116 term0 = bSig; 3117 } 3118 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 3119 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3120 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3121 && ( q & 1 ) ) 3122 ) { 3123 aSig0 = alternateASig0; 3124 aSig1 = alternateASig1; 3125 zSign = ! zSign; 3126 } 3127 3128 return 3129 normalizeRoundAndPackFloatx80( 3130 roundData, zSign, bExp + expDiff, aSig0, aSig1 ); 3131 3132} 3133 3134/* 3135------------------------------------------------------------------------------- 3136Returns the square root of the extended double-precision floating-point 3137value `a'. The operation is performed according to the IEC/IEEE Standard 3138for Binary Floating-point Arithmetic. 3139------------------------------------------------------------------------------- 3140*/ 3141floatx80 floatx80_sqrt( struct roundingData *roundData, floatx80 a ) 3142{ 3143 flag aSign; 3144 int32 aExp, zExp; 3145 bits64 aSig0, aSig1, zSig0, zSig1; 3146 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 3147 bits64 shiftedRem0, shiftedRem1; 3148 floatx80 z; 3149 3150 aSig0 = extractFloatx80Frac( a ); 3151 aExp = extractFloatx80Exp( a ); 3152 aSign = extractFloatx80Sign( a ); 3153 if ( aExp == 0x7FFF ) { 3154 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 3155 if ( ! aSign ) return a; 3156 goto invalid; 3157 } 3158 if ( aSign ) { 3159 if ( ( aExp | aSig0 ) == 0 ) return a; 3160 invalid: 3161 roundData->exception |= float_flag_invalid; 3162 z.low = floatx80_default_nan_low; 3163 z.high = floatx80_default_nan_high; 3164 z.__padding = 0; 3165 return z; 3166 } 3167 if ( aExp == 0 ) { 3168 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 3169 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3170 } 3171 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 3172 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 3173 zSig0 <<= 31; 3174 aSig1 = 0; 3175 shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 ); 3176 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4; 3177 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF ); 3178 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 ); 3179 mul64To128( zSig0, zSig0, &term0, &term1 ); 3180 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 3181 while ( (sbits64) rem0 < 0 ) { 3182 --zSig0; 3183 shortShift128Left( 0, zSig0, 1, &term0, &term1 ); 3184 term1 |= 1; 3185 add128( rem0, rem1, term0, term1, &rem0, &rem1 ); 3186 } 3187 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 ); 3188 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 ); 3189 if ( (bits64) ( zSig1<<1 ) <= 10 ) { 3190 if ( zSig1 == 0 ) zSig1 = 1; 3191 mul64To128( zSig0, zSig1, &term1, &term2 ); 3192 shortShift128Left( term1, term2, 1, &term1, &term2 ); 3193 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3194 mul64To128( zSig1, zSig1, &term2, &term3 ); 3195 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 3196 while ( (sbits64) rem1 < 0 ) { 3197 --zSig1; 3198 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 ); 3199 term3 |= 1; 3200 add192( 3201 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 ); 3202 } 3203 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 3204 } 3205 return 3206 roundAndPackFloatx80( 3207 roundData, 0, zExp, zSig0, zSig1 ); 3208 3209} 3210 3211/* 3212------------------------------------------------------------------------------- 3213Returns 1 if the extended double-precision floating-point value `a' is 3214equal to the corresponding value `b', and 0 otherwise. The comparison is 3215performed according to the IEC/IEEE Standard for Binary Floating-point 3216Arithmetic. 3217------------------------------------------------------------------------------- 3218*/ 3219flag floatx80_eq( floatx80 a, floatx80 b ) 3220{ 3221 3222 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3223 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3224 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3225 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3226 ) { 3227 if ( floatx80_is_signaling_nan( a ) 3228 || floatx80_is_signaling_nan( b ) ) { 3229 float_raise( float_flag_invalid ); 3230 } 3231 return 0; 3232 } 3233 return 3234 ( a.low == b.low ) 3235 && ( ( a.high == b.high ) 3236 || ( ( a.low == 0 ) 3237 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 3238 ); 3239 3240} 3241 3242/* 3243------------------------------------------------------------------------------- 3244Returns 1 if the extended double-precision floating-point value `a' is 3245less than or equal to the corresponding value `b', and 0 otherwise. The 3246comparison is performed according to the IEC/IEEE Standard for Binary 3247Floating-point Arithmetic. 3248------------------------------------------------------------------------------- 3249*/ 3250flag floatx80_le( floatx80 a, floatx80 b ) 3251{ 3252 flag aSign, bSign; 3253 3254 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3255 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3256 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3257 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3258 ) { 3259 float_raise( float_flag_invalid ); 3260 return 0; 3261 } 3262 aSign = extractFloatx80Sign( a ); 3263 bSign = extractFloatx80Sign( b ); 3264 if ( aSign != bSign ) { 3265 return 3266 aSign 3267 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 3268 == 0 ); 3269 } 3270 return 3271 aSign ? le128( b.high, b.low, a.high, a.low ) 3272 : le128( a.high, a.low, b.high, b.low ); 3273 3274} 3275 3276/* 3277------------------------------------------------------------------------------- 3278Returns 1 if the extended double-precision floating-point value `a' is 3279less than the corresponding value `b', and 0 otherwise. The comparison 3280is performed according to the IEC/IEEE Standard for Binary Floating-point 3281Arithmetic. 3282------------------------------------------------------------------------------- 3283*/ 3284flag floatx80_lt( floatx80 a, floatx80 b ) 3285{ 3286 flag aSign, bSign; 3287 3288 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3289 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3290 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3291 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3292 ) { 3293 float_raise( float_flag_invalid ); 3294 return 0; 3295 } 3296 aSign = extractFloatx80Sign( a ); 3297 bSign = extractFloatx80Sign( b ); 3298 if ( aSign != bSign ) { 3299 return 3300 aSign 3301 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 3302 != 0 ); 3303 } 3304 return 3305 aSign ? lt128( b.high, b.low, a.high, a.low ) 3306 : lt128( a.high, a.low, b.high, b.low ); 3307 3308} 3309 3310/* 3311------------------------------------------------------------------------------- 3312Returns 1 if the extended double-precision floating-point value `a' is equal 3313to the corresponding value `b', and 0 otherwise. The invalid exception is 3314raised if either operand is a NaN. Otherwise, the comparison is performed 3315according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 3316------------------------------------------------------------------------------- 3317*/ 3318flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 3319{ 3320 3321 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3322 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3323 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3324 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3325 ) { 3326 float_raise( float_flag_invalid ); 3327 return 0; 3328 } 3329 return 3330 ( a.low == b.low ) 3331 && ( ( a.high == b.high ) 3332 || ( ( a.low == 0 ) 3333 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 3334 ); 3335 3336} 3337 3338/* 3339------------------------------------------------------------------------------- 3340Returns 1 if the extended double-precision floating-point value `a' is less 3341than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 3342do not cause an exception. Otherwise, the comparison is performed according 3343to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 3344------------------------------------------------------------------------------- 3345*/ 3346flag floatx80_le_quiet( floatx80 a, floatx80 b ) 3347{ 3348 flag aSign, bSign; 3349 3350 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3351 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3352 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3353 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3354 ) { 3355 /* Do nothing, even if NaN as we're quiet */ 3356 return 0; 3357 } 3358 aSign = extractFloatx80Sign( a ); 3359 bSign = extractFloatx80Sign( b ); 3360 if ( aSign != bSign ) { 3361 return 3362 aSign 3363 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 3364 == 0 ); 3365 } 3366 return 3367 aSign ? le128( b.high, b.low, a.high, a.low ) 3368 : le128( a.high, a.low, b.high, b.low ); 3369 3370} 3371 3372/* 3373------------------------------------------------------------------------------- 3374Returns 1 if the extended double-precision floating-point value `a' is less 3375than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 3376an exception. Otherwise, the comparison is performed according to the 3377IEC/IEEE Standard for Binary Floating-point Arithmetic. 3378------------------------------------------------------------------------------- 3379*/ 3380flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 3381{ 3382 flag aSign, bSign; 3383 3384 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3385 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3386 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3387 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3388 ) { 3389 /* Do nothing, even if NaN as we're quiet */ 3390 return 0; 3391 } 3392 aSign = extractFloatx80Sign( a ); 3393 bSign = extractFloatx80Sign( b ); 3394 if ( aSign != bSign ) { 3395 return 3396 aSign 3397 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 3398 != 0 ); 3399 } 3400 return 3401 aSign ? lt128( b.high, b.low, a.high, a.low ) 3402 : lt128( a.high, a.low, b.high, b.low ); 3403 3404} 3405 3406#endif 3407