1/* Copyright (C) 2007-2020 Free Software Foundation, Inc. 2 3This file is part of GCC. 4 5GCC is free software; you can redistribute it and/or modify it under 6the terms of the GNU General Public License as published by the Free 7Software Foundation; either version 3, or (at your option) any later 8version. 9 10GCC is distributed in the hope that it will be useful, but WITHOUT ANY 11WARRANTY; without even the implied warranty of MERCHANTABILITY or 12FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13for more details. 14 15Under Section 7 of GPL version 3, you are granted additional 16permissions described in the GCC Runtime Library Exception, version 173.1, as published by the Free Software Foundation. 18 19You should have received a copy of the GNU General Public License and 20a copy of the GCC Runtime Library Exception along with this program; 21see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 22<http://www.gnu.org/licenses/>. */ 23 24/***************************************************************************** 25 * BID64 square root 26 ***************************************************************************** 27 * 28 * Algorithm description: 29 * 30 * if(exponent_x is odd) 31 * scale coefficient_x by 10, adjust exponent 32 * - get lower estimate for number of digits in coefficient_x 33 * - scale coefficient x to between 31 and 33 decimal digits 34 * - in parallel, check for exact case and return if true 35 * - get high part of result coefficient using double precision sqrt 36 * - compute remainder and refine coefficient in one iteration (which 37 * modifies it by at most 1) 38 * - result exponent is easy to compute from the adjusted arg. exponent 39 * 40 ****************************************************************************/ 41 42#include "bid_internal.h" 43#include "bid_sqrt_macros.h" 44#ifdef UNCHANGED_BINARY_STATUS_FLAGS 45#include <fenv.h> 46 47#define FE_ALL_FLAGS FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW|FE_INEXACT 48#endif 49 50extern double sqrt (double); 51 52#if DECIMAL_CALL_BY_REFERENCE 53 54void 55bid64_sqrt (UINT64 * pres, 56 UINT64 * 57 px _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM 58 _EXC_INFO_PARAM) { 59 UINT64 x; 60#else 61 62UINT64 63bid64_sqrt (UINT64 x _RND_MODE_PARAM _EXC_FLAGS_PARAM 64 _EXC_MASKS_PARAM _EXC_INFO_PARAM) { 65#endif 66 UINT128 CA, CT; 67 UINT64 sign_x, coefficient_x; 68 UINT64 Q, Q2, A10, C4, R, R2, QE, res; 69 SINT64 D; 70 int_double t_scale; 71 int_float tempx; 72 double da, dq, da_h, da_l, dqe; 73 int exponent_x, exponent_q, bin_expon_cx; 74 int digits_x; 75 int scale; 76#ifdef UNCHANGED_BINARY_STATUS_FLAGS 77 fexcept_t binaryflags = 0; 78#endif 79 80#if DECIMAL_CALL_BY_REFERENCE 81#if !DECIMAL_GLOBAL_ROUNDING 82 _IDEC_round rnd_mode = *prnd_mode; 83#endif 84 x = *px; 85#endif 86 87 // unpack arguments, check for NaN or Infinity 88 if (!unpack_BID64 (&sign_x, &exponent_x, &coefficient_x, x)) { 89 // x is Inf. or NaN or 0 90 if ((x & INFINITY_MASK64) == INFINITY_MASK64) { 91 res = coefficient_x; 92 if ((coefficient_x & SSNAN_MASK64) == SINFINITY_MASK64) // -Infinity 93 { 94 res = NAN_MASK64; 95#ifdef SET_STATUS_FLAGS 96 __set_status_flags (pfpsf, INVALID_EXCEPTION); 97#endif 98 } 99#ifdef SET_STATUS_FLAGS 100 if ((x & SNAN_MASK64) == SNAN_MASK64) // sNaN 101 __set_status_flags (pfpsf, INVALID_EXCEPTION); 102#endif 103 BID_RETURN (res & QUIET_MASK64); 104 } 105 // x is 0 106 exponent_x = (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1; 107 res = sign_x | (((UINT64) exponent_x) << 53); 108 BID_RETURN (res); 109 } 110 // x<0? 111 if (sign_x && coefficient_x) { 112 res = NAN_MASK64; 113#ifdef SET_STATUS_FLAGS 114 __set_status_flags (pfpsf, INVALID_EXCEPTION); 115#endif 116 BID_RETURN (res); 117 } 118#ifdef UNCHANGED_BINARY_STATUS_FLAGS 119 (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS); 120#endif 121 //--- get number of bits in the coefficient of x --- 122 tempx.d = (float) coefficient_x; 123 bin_expon_cx = ((tempx.i >> 23) & 0xff) - 0x7f; 124 digits_x = estimate_decimal_digits[bin_expon_cx]; 125 // add test for range 126 if (coefficient_x >= power10_index_binexp[bin_expon_cx]) 127 digits_x++; 128 129 A10 = coefficient_x; 130 if (exponent_x & 1) { 131 A10 = (A10 << 2) + A10; 132 A10 += A10; 133 } 134 135 dqe = sqrt ((double) A10); 136 QE = (UINT32) dqe; 137 if (QE * QE == A10) { 138 res = 139 very_fast_get_BID64 (0, (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1, 140 QE); 141#ifdef UNCHANGED_BINARY_STATUS_FLAGS 142 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS); 143#endif 144 BID_RETURN (res); 145 } 146 // if exponent is odd, scale coefficient by 10 147 scale = 31 - digits_x; 148 exponent_q = exponent_x - scale; 149 scale += (exponent_q & 1); // exp. bias is even 150 151 CT = power10_table_128[scale]; 152 __mul_64x128_short (CA, coefficient_x, CT); 153 154 // 2^64 155 t_scale.i = 0x43f0000000000000ull; 156 // convert CA to DP 157 da_h = CA.w[1]; 158 da_l = CA.w[0]; 159 da = da_h * t_scale.d + da_l; 160 161 dq = sqrt (da); 162 163 Q = (UINT64) dq; 164 165 // get sign(sqrt(CA)-Q) 166 R = CA.w[0] - Q * Q; 167 R = ((SINT64) R) >> 63; 168 D = R + R + 1; 169 170 exponent_q = (exponent_q + DECIMAL_EXPONENT_BIAS) >> 1; 171 172#ifdef SET_STATUS_FLAGS 173 __set_status_flags (pfpsf, INEXACT_EXCEPTION); 174#endif 175 176#ifndef IEEE_ROUND_NEAREST 177#ifndef IEEE_ROUND_NEAREST_TIES_AWAY 178 if (!((rnd_mode) & 3)) { 179#endif 180#endif 181 182 // midpoint to check 183 Q2 = Q + Q + D; 184 C4 = CA.w[0] << 2; 185 186 // get sign(-sqrt(CA)+Midpoint) 187 R2 = Q2 * Q2 - C4; 188 R2 = ((SINT64) R2) >> 63; 189 190 // adjust Q if R!=R2 191 Q += (D & (R ^ R2)); 192#ifndef IEEE_ROUND_NEAREST 193#ifndef IEEE_ROUND_NEAREST_TIES_AWAY 194 } else { 195 C4 = CA.w[0]; 196 Q += D; 197 if ((SINT64) (Q * Q - C4) > 0) 198 Q--; 199 if (rnd_mode == ROUNDING_UP) 200 Q++; 201 } 202#endif 203#endif 204 205 res = fast_get_BID64 (0, exponent_q, Q); 206#ifdef UNCHANGED_BINARY_STATUS_FLAGS 207 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS); 208#endif 209 BID_RETURN (res); 210} 211 212 213TYPE0_FUNCTION_ARG1 (UINT64, bid64q_sqrt, x) 214 215 UINT256 M256, C4, C8; 216 UINT128 CX, CX2, A10, S2, T128, CS, CSM, CS2, C256, CS1, 217 mul_factor2_long = { {0x0ull, 0x0ull} }, QH, Tmp, TP128, Qh, Ql; 218UINT64 sign_x, Carry, B10, res, mul_factor, mul_factor2 = 0x0ull, CS0; 219SINT64 D; 220int_float fx, f64; 221int exponent_x, bin_expon_cx, done = 0; 222int digits, scale, exponent_q = 0, exact = 1, amount, extra_digits; 223#ifdef UNCHANGED_BINARY_STATUS_FLAGS 224fexcept_t binaryflags = 0; 225#endif 226 227 // unpack arguments, check for NaN or Infinity 228if (!unpack_BID128_value (&sign_x, &exponent_x, &CX, x)) { 229 res = CX.w[1]; 230 // NaN ? 231 if ((x.w[1] & 0x7c00000000000000ull) == 0x7c00000000000000ull) { 232#ifdef SET_STATUS_FLAGS 233 if ((x.w[1] & 0x7e00000000000000ull) == 0x7e00000000000000ull) // sNaN 234 __set_status_flags (pfpsf, INVALID_EXCEPTION); 235#endif 236 Tmp.w[1] = (CX.w[1] & 0x00003fffffffffffull); 237 Tmp.w[0] = CX.w[0]; 238 TP128 = reciprocals10_128[18]; 239 __mul_128x128_full (Qh, Ql, Tmp, TP128); 240 amount = recip_scale[18]; 241 __shr_128 (Tmp, Qh, amount); 242 res = (CX.w[1] & 0xfc00000000000000ull) | Tmp.w[0]; 243 BID_RETURN (res); 244 } 245 // x is Infinity? 246 if ((x.w[1] & 0x7800000000000000ull) == 0x7800000000000000ull) { 247 if (sign_x) { 248 // -Inf, return NaN 249 res = 0x7c00000000000000ull; 250#ifdef SET_STATUS_FLAGS 251 __set_status_flags (pfpsf, INVALID_EXCEPTION); 252#endif 253 } 254 BID_RETURN (res); 255 } 256 // x is 0 otherwise 257 258 exponent_x = 259 ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) + 260 DECIMAL_EXPONENT_BIAS; 261 if (exponent_x < 0) 262 exponent_x = 0; 263 if (exponent_x > DECIMAL_MAX_EXPON_64) 264 exponent_x = DECIMAL_MAX_EXPON_64; 265 //res= sign_x | (((UINT64)exponent_x)<<53); 266 res = get_BID64 (sign_x, exponent_x, 0, rnd_mode, pfpsf); 267 BID_RETURN (res); 268} 269if (sign_x) { 270 res = 0x7c00000000000000ull; 271#ifdef SET_STATUS_FLAGS 272 __set_status_flags (pfpsf, INVALID_EXCEPTION); 273#endif 274 BID_RETURN (res); 275} 276#ifdef UNCHANGED_BINARY_STATUS_FLAGS 277(void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS); 278#endif 279 280 // 2^64 281f64.i = 0x5f800000; 282 283 // fx ~ CX 284fx.d = (float) CX.w[1] * f64.d + (float) CX.w[0]; 285bin_expon_cx = ((fx.i >> 23) & 0xff) - 0x7f; 286digits = estimate_decimal_digits[bin_expon_cx]; 287 288A10 = CX; 289if (exponent_x & 1) { 290 A10.w[1] = (CX.w[1] << 3) | (CX.w[0] >> 61); 291 A10.w[0] = CX.w[0] << 3; 292 CX2.w[1] = (CX.w[1] << 1) | (CX.w[0] >> 63); 293 CX2.w[0] = CX.w[0] << 1; 294 __add_128_128 (A10, A10, CX2); 295} 296 297C256.w[1] = A10.w[1]; 298C256.w[0] = A10.w[0]; 299CS.w[0] = short_sqrt128 (A10); 300CS.w[1] = 0; 301mul_factor = 0; 302 // check for exact result 303if (CS.w[0] < 10000000000000000ull) { 304 if (CS.w[0] * CS.w[0] == A10.w[0]) { 305 __sqr64_fast (S2, CS.w[0]); 306 if (S2.w[1] == A10.w[1]) // && S2.w[0]==A10.w[0]) 307 { 308 res = 309 get_BID64 (0, 310 ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) + 311 DECIMAL_EXPONENT_BIAS, CS.w[0], rnd_mode, pfpsf); 312#ifdef UNCHANGED_BINARY_STATUS_FLAGS 313 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS); 314#endif 315 BID_RETURN (res); 316 } 317 } 318 if (CS.w[0] >= 1000000000000000ull) { 319 done = 1; 320 exponent_q = exponent_x; 321 C256.w[1] = A10.w[1]; 322 C256.w[0] = A10.w[0]; 323 } 324#ifdef SET_STATUS_FLAGS 325 __set_status_flags (pfpsf, INEXACT_EXCEPTION); 326#endif 327 exact = 0; 328} else { 329 B10 = 0x3333333333333334ull; 330 __mul_64x64_to_128_full (CS2, CS.w[0], B10); 331 CS0 = CS2.w[1] >> 1; 332 if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) { 333#ifdef SET_STATUS_FLAGS 334 __set_status_flags (pfpsf, INEXACT_EXCEPTION); 335#endif 336 exact = 0; 337 } 338 done = 1; 339 CS.w[0] = CS0; 340 exponent_q = exponent_x + 2; 341 mul_factor = 10; 342 mul_factor2 = 100; 343 if (CS.w[0] >= 10000000000000000ull) { 344 __mul_64x64_to_128_full (CS2, CS.w[0], B10); 345 CS0 = CS2.w[1] >> 1; 346 if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) { 347#ifdef SET_STATUS_FLAGS 348 __set_status_flags (pfpsf, INEXACT_EXCEPTION); 349#endif 350 exact = 0; 351 } 352 exponent_q += 2; 353 CS.w[0] = CS0; 354 mul_factor = 100; 355 mul_factor2 = 10000; 356 } 357 if (exact) { 358 CS0 = CS.w[0] * mul_factor; 359 __sqr64_fast (CS1, CS0) 360 if ((CS1.w[0] != A10.w[0]) || (CS1.w[1] != A10.w[1])) { 361#ifdef SET_STATUS_FLAGS 362 __set_status_flags (pfpsf, INEXACT_EXCEPTION); 363#endif 364 exact = 0; 365 } 366 } 367} 368 369if (!done) { 370 // get number of digits in CX 371 D = CX.w[1] - power10_index_binexp_128[bin_expon_cx].w[1]; 372 if (D > 0 373 || (!D && CX.w[0] >= power10_index_binexp_128[bin_expon_cx].w[0])) 374 digits++; 375 376 // if exponent is odd, scale coefficient by 10 377 scale = 31 - digits; 378 exponent_q = exponent_x - scale; 379 scale += (exponent_q & 1); // exp. bias is even 380 381 T128 = power10_table_128[scale]; 382 __mul_128x128_low (C256, CX, T128); 383 384 385 CS.w[0] = short_sqrt128 (C256); 386} 387 //printf("CS=%016I64x\n",CS.w[0]); 388 389exponent_q = 390 ((exponent_q - DECIMAL_EXPONENT_BIAS_128) >> 1) + 391 DECIMAL_EXPONENT_BIAS; 392if ((exponent_q < 0) && (exponent_q + MAX_FORMAT_DIGITS >= 0)) { 393 extra_digits = -exponent_q; 394 exponent_q = 0; 395 396 // get coeff*(2^M[extra_digits])/10^extra_digits 397 __mul_64x64_to_128 (QH, CS.w[0], reciprocals10_64[extra_digits]); 398 399 // now get P/10^extra_digits: shift Q_high right by M[extra_digits]-128 400 amount = short_recip_scale[extra_digits]; 401 402 CS0 = QH.w[1] >> amount; 403 404#ifdef SET_STATUS_FLAGS 405 if (exact) { 406 if (CS.w[0] != CS0 * power10_table_128[extra_digits].w[0]) 407 exact = 0; 408 } 409 if (!exact) 410 __set_status_flags (pfpsf, UNDERFLOW_EXCEPTION | INEXACT_EXCEPTION); 411#endif 412 413 CS.w[0] = CS0; 414 if (!mul_factor) 415 mul_factor = 1; 416 mul_factor *= power10_table_128[extra_digits].w[0]; 417 __mul_64x64_to_128 (mul_factor2_long, mul_factor, mul_factor); 418 if (mul_factor2_long.w[1]) 419 mul_factor2 = 0; 420 else 421 mul_factor2 = mul_factor2_long.w[1]; 422} 423 // 4*C256 424C4.w[1] = (C256.w[1] << 2) | (C256.w[0] >> 62); 425C4.w[0] = C256.w[0] << 2; 426 427#ifndef IEEE_ROUND_NEAREST 428#ifndef IEEE_ROUND_NEAREST_TIES_AWAY 429if (!((rnd_mode) & 3)) { 430#endif 431#endif 432 // compare to midpoints 433 CSM.w[0] = (CS.w[0] + CS.w[0]) | 1; 434 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C4.w[1],C4.w[0],CSM.w[1],CSM.w[0], CS.w[0]); 435 if (mul_factor) 436 CSM.w[0] *= mul_factor; 437 // CSM^2 438 __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]); 439 //__mul_128x128_to_256(M256, CSM, CSM); 440 441 if (C4.w[1] > M256.w[1] || 442 (C4.w[1] == M256.w[1] && C4.w[0] > M256.w[0])) { 443 // round up 444 CS.w[0]++; 445 } else { 446 C8.w[0] = CS.w[0] << 3; 447 C8.w[1] = 0; 448 if (mul_factor) { 449 if (mul_factor2) { 450 __mul_64x64_to_128 (C8, C8.w[0], mul_factor2); 451 } else { 452 __mul_64x128_low (C8, C8.w[0], mul_factor2_long); 453 } 454 } 455 // M256 - 8*CSM 456 __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]); 457 M256.w[1] = M256.w[1] - C8.w[1] - Carry; 458 459 // if CSM' > C256, round up 460 if (M256.w[1] > C4.w[1] || 461 (M256.w[1] == C4.w[1] && M256.w[0] > C4.w[0])) { 462 // round down 463 if (CS.w[0]) 464 CS.w[0]--; 465 } 466 } 467#ifndef IEEE_ROUND_NEAREST 468#ifndef IEEE_ROUND_NEAREST_TIES_AWAY 469} else { 470 CS.w[0]++; 471 CSM.w[0] = CS.w[0]; 472 C8.w[0] = CSM.w[0] << 1; 473 if (mul_factor) 474 CSM.w[0] *= mul_factor; 475 __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]); 476 C8.w[1] = 0; 477 if (mul_factor) { 478 if (mul_factor2) { 479 __mul_64x64_to_128 (C8, C8.w[0], mul_factor2); 480 } else { 481 __mul_64x128_low (C8, C8.w[0], mul_factor2_long); 482 } 483 } 484 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C256.w[1],C256.w[0],M256.w[1],M256.w[0], CS.w[0]); 485 486 if (M256.w[1] > C256.w[1] || 487 (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0])) { 488 __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]); 489 M256.w[1] = M256.w[1] - Carry - C8.w[1]; 490 M256.w[0]++; 491 if (!M256.w[0]) { 492 M256.w[1]++; 493 494 } 495 496 if ((M256.w[1] > C256.w[1] || 497 (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0])) 498 && (CS.w[0] > 1)) { 499 500 CS.w[0]--; 501 502 if (CS.w[0] > 1) { 503 __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]); 504 M256.w[1] = M256.w[1] - Carry - C8.w[1]; 505 M256.w[0]++; 506 if (!M256.w[0]) { 507 M256.w[1]++; 508 } 509 510 if (M256.w[1] > C256.w[1] || 511 (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0])) 512 CS.w[0]--; 513 } 514 } 515 } 516 517 else { 518 /*__add_carry_out(M256.w[0], Carry, M256.w[0], C8.w[0]); 519 M256.w[1] = M256.w[1] + Carry + C8.w[1]; 520 M256.w[0]++; 521 if(!M256.w[0]) 522 { 523 M256.w[1]++; 524 } 525 CS.w[0]++; 526 if(M256.w[1]<C256.w[1] || 527 (M256.w[1]==C256.w[1] && M256.w[0]<=C256.w[0])) 528 { 529 CS.w[0]++; 530 }*/ 531 CS.w[0]++; 532 } 533 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact); 534 // RU? 535 if (((rnd_mode) != ROUNDING_UP) || exact) { 536 if (CS.w[0]) 537 CS.w[0]--; 538 } 539 540} 541#endif 542#endif 543 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact); 544 545res = get_BID64 (0, exponent_q, CS.w[0], rnd_mode, pfpsf); 546#ifdef UNCHANGED_BINARY_STATUS_FLAGS 547(void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS); 548#endif 549BID_RETURN (res); 550 551 552} 553