1#include <fenv.h> 2#include "libm.h" 3 4#if LDBL_MANT_DIG==64 && LDBL_MAX_EXP==16384 5/* exact add, assumes exponent_x >= exponent_y */ 6static void add(long double *hi, long double *lo, long double x, long double y) 7{ 8 long double r; 9 10 r = x + y; 11 *hi = r; 12 r -= x; 13 *lo = y - r; 14} 15 16/* exact mul, assumes no over/underflow */ 17static void mul(long double *hi, long double *lo, long double x, long double y) 18{ 19 static const long double c = 1.0 + 0x1p32L; 20 long double cx, xh, xl, cy, yh, yl; 21 22 cx = c*x; 23 xh = (x - cx) + cx; 24 xl = x - xh; 25 cy = c*y; 26 yh = (y - cy) + cy; 27 yl = y - yh; 28 *hi = x*y; 29 *lo = (xh*yh - *hi) + xh*yl + xl*yh + xl*yl; 30} 31 32/* 33assume (long double)(hi+lo) == hi 34return an adjusted hi so that rounding it to double (or less) precision is correct 35*/ 36static long double adjust(long double hi, long double lo) 37{ 38 union ldshape uhi, ulo; 39 40 if (lo == 0) 41 return hi; 42 uhi.f = hi; 43 if (uhi.i.m & 0x3ff) 44 return hi; 45 ulo.f = lo; 46 if ((uhi.i.se & 0x8000) == (ulo.i.se & 0x8000)) 47 uhi.i.m++; 48 else { 49 /* handle underflow and take care of ld80 implicit msb */ 50 if (uhi.i.m << 1 == 0) { 51 uhi.i.m = 0; 52 uhi.i.se--; 53 } 54 uhi.i.m--; 55 } 56 return uhi.f; 57} 58 59/* adjusted add so the result is correct when rounded to double (or less) precision */ 60static long double dadd(long double x, long double y) 61{ 62 add(&x, &y, x, y); 63 return adjust(x, y); 64} 65 66/* adjusted mul so the result is correct when rounded to double (or less) precision */ 67static long double dmul(long double x, long double y) 68{ 69 mul(&x, &y, x, y); 70 return adjust(x, y); 71} 72 73static int getexp(long double x) 74{ 75 union ldshape u; 76 u.f = x; 77 return u.i.se & 0x7fff; 78} 79 80double fma(double x, double y, double z) 81{ 82 #pragma STDC FENV_ACCESS ON 83 long double hi, lo1, lo2, xy; 84 int round, ez, exy; 85 86 /* handle +-inf,nan */ 87 if (!isfinite(x) || !isfinite(y)) 88 return x*y + z; 89 if (!isfinite(z)) 90 return z; 91 /* handle +-0 */ 92 if (x == 0.0 || y == 0.0) 93 return x*y + z; 94 round = fegetround(); 95 if (z == 0.0) { 96 if (round == FE_TONEAREST) 97 return dmul(x, y); 98 return x*y; 99 } 100 101 /* exact mul and add require nearest rounding */ 102 /* spurious inexact exceptions may be raised */ 103 fesetround(FE_TONEAREST); 104 mul(&xy, &lo1, x, y); 105 exy = getexp(xy); 106 ez = getexp(z); 107 if (ez > exy) { 108 add(&hi, &lo2, z, xy); 109 } else if (ez > exy - 12) { 110 add(&hi, &lo2, xy, z); 111 if (hi == 0) { 112 /* 113 xy + z is 0, but it should be calculated with the 114 original rounding mode so the sign is correct, if the 115 compiler does not support FENV_ACCESS ON it does not 116 know about the changed rounding mode and eliminates 117 the xy + z below without the volatile memory access 118 */ 119 volatile double z_; 120 fesetround(round); 121 z_ = z; 122 return (xy + z_) + lo1; 123 } 124 } else { 125 /* 126 ez <= exy - 12 127 the 12 extra bits (1guard, 11round+sticky) are needed so with 128 lo = dadd(lo1, lo2) 129 elo <= ehi - 11, and we use the last 10 bits in adjust so 130 dadd(hi, lo) 131 gives correct result when rounded to double 132 */ 133 hi = xy; 134 lo2 = z; 135 } 136 /* 137 the result is stored before return for correct precision and exceptions 138 139 one corner case is when the underflow flag should be raised because 140 the precise result is an inexact subnormal double, but the calculated 141 long double result is an exact subnormal double 142 (so rounding to double does not raise exceptions) 143 144 in nearest rounding mode dadd takes care of this: the last bit of the 145 result is adjusted so rounding sees an inexact value when it should 146 147 in non-nearest rounding mode fenv is used for the workaround 148 */ 149 fesetround(round); 150 if (round == FE_TONEAREST) 151 z = dadd(hi, dadd(lo1, lo2)); 152 else { 153#if defined(FE_INEXACT) && defined(FE_UNDERFLOW) 154 int e = fetestexcept(FE_INEXACT); 155 feclearexcept(FE_INEXACT); 156#endif 157 z = hi + (lo1 + lo2); 158#if defined(FE_INEXACT) && defined(FE_UNDERFLOW) 159 if (getexp(z) < 0x3fff-1022 && fetestexcept(FE_INEXACT)) 160 feraiseexcept(FE_UNDERFLOW); 161 else if (e) 162 feraiseexcept(FE_INEXACT); 163#endif 164 } 165 return z; 166} 167#else 168/* origin: FreeBSD /usr/src/lib/msun/src/s_fma.c */ 169/*- 170 * Copyright (c) 2005-2011 David Schultz <das@FreeBSD.ORG> 171 * All rights reserved. 172 * 173 * Redistribution and use in source and binary forms, with or without 174 * modification, are permitted provided that the following conditions 175 * are met: 176 * 1. Redistributions of source code must retain the above copyright 177 * notice, this list of conditions and the following disclaimer. 178 * 2. Redistributions in binary form must reproduce the above copyright 179 * notice, this list of conditions and the following disclaimer in the 180 * documentation and/or other materials provided with the distribution. 181 * 182 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 183 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 184 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 185 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 186 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 187 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 188 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 189 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 190 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 191 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 192 * SUCH DAMAGE. 193 */ 194 195/* 196 * A struct dd represents a floating-point number with twice the precision 197 * of a double. We maintain the invariant that "hi" stores the 53 high-order 198 * bits of the result. 199 */ 200struct dd { 201 double hi; 202 double lo; 203}; 204 205/* 206 * Compute a+b exactly, returning the exact result in a struct dd. We assume 207 * that both a and b are finite, but make no assumptions about their relative 208 * magnitudes. 209 */ 210static inline struct dd dd_add(double a, double b) 211{ 212 struct dd ret; 213 double s; 214 215 ret.hi = a + b; 216 s = ret.hi - a; 217 ret.lo = (a - (ret.hi - s)) + (b - s); 218 return (ret); 219} 220 221/* 222 * Compute a+b, with a small tweak: The least significant bit of the 223 * result is adjusted into a sticky bit summarizing all the bits that 224 * were lost to rounding. This adjustment negates the effects of double 225 * rounding when the result is added to another number with a higher 226 * exponent. For an explanation of round and sticky bits, see any reference 227 * on FPU design, e.g., 228 * 229 * J. Coonen. An Implementation Guide to a Proposed Standard for 230 * Floating-Point Arithmetic. Computer, vol. 13, no. 1, Jan 1980. 231 */ 232static inline double add_adjusted(double a, double b) 233{ 234 struct dd sum; 235 union {double f; uint64_t i;} uhi, ulo; 236 237 sum = dd_add(a, b); 238 if (sum.lo != 0) { 239 uhi.f = sum.hi; 240 if ((uhi.i & 1) == 0) { 241 /* hibits += (int)copysign(1.0, sum.hi * sum.lo) */ 242 ulo.f = sum.lo; 243 uhi.i += 1 - ((uhi.i ^ ulo.i) >> 62); 244 sum.hi = uhi.f; 245 } 246 } 247 return (sum.hi); 248} 249 250/* 251 * Compute ldexp(a+b, scale) with a single rounding error. It is assumed 252 * that the result will be subnormal, and care is taken to ensure that 253 * double rounding does not occur. 254 */ 255static inline double add_and_denormalize(double a, double b, int scale) 256{ 257 struct dd sum; 258 union {double f; uint64_t i;} uhi, ulo; 259 int bits_lost; 260 261 sum = dd_add(a, b); 262 263 /* 264 * If we are losing at least two bits of accuracy to denormalization, 265 * then the first lost bit becomes a round bit, and we adjust the 266 * lowest bit of sum.hi to make it a sticky bit summarizing all the 267 * bits in sum.lo. With the sticky bit adjusted, the hardware will 268 * break any ties in the correct direction. 269 * 270 * If we are losing only one bit to denormalization, however, we must 271 * break the ties manually. 272 */ 273 if (sum.lo != 0) { 274 uhi.f = sum.hi; 275 bits_lost = -((int)(uhi.i >> 52) & 0x7ff) - scale + 1; 276 if ((bits_lost != 1) ^ (int)(uhi.i & 1)) { 277 /* hibits += (int)copysign(1.0, sum.hi * sum.lo) */ 278 ulo.f = sum.lo; 279 uhi.i += 1 - (((uhi.i ^ ulo.i) >> 62) & 2); 280 sum.hi = uhi.f; 281 } 282 } 283 return scalbn(sum.hi, scale); 284} 285 286/* 287 * Compute a*b exactly, returning the exact result in a struct dd. We assume 288 * that both a and b are normalized, so no underflow or overflow will occur. 289 * The current rounding mode must be round-to-nearest. 290 */ 291static inline struct dd dd_mul(double a, double b) 292{ 293 static const double split = 0x1p27 + 1.0; 294 struct dd ret; 295 double ha, hb, la, lb, p, q; 296 297 p = a * split; 298 ha = a - p; 299 ha += p; 300 la = a - ha; 301 302 p = b * split; 303 hb = b - p; 304 hb += p; 305 lb = b - hb; 306 307 p = ha * hb; 308 q = ha * lb + la * hb; 309 310 ret.hi = p + q; 311 ret.lo = p - ret.hi + q + la * lb; 312 return (ret); 313} 314 315/* 316 * Fused multiply-add: Compute x * y + z with a single rounding error. 317 * 318 * We use scaling to avoid overflow/underflow, along with the 319 * canonical precision-doubling technique adapted from: 320 * 321 * Dekker, T. A Floating-Point Technique for Extending the 322 * Available Precision. Numer. Math. 18, 224-242 (1971). 323 * 324 * This algorithm is sensitive to the rounding precision. FPUs such 325 * as the i387 must be set in double-precision mode if variables are 326 * to be stored in FP registers in order to avoid incorrect results. 327 * This is the default on FreeBSD, but not on many other systems. 328 * 329 * Hardware instructions should be used on architectures that support it, 330 * since this implementation will likely be several times slower. 331 */ 332double fma(double x, double y, double z) 333{ 334 #pragma STDC FENV_ACCESS ON 335 double xs, ys, zs, adj; 336 struct dd xy, r; 337 int oround; 338 int ex, ey, ez; 339 int spread; 340 341 /* 342 * Handle special cases. The order of operations and the particular 343 * return values here are crucial in handling special cases involving 344 * infinities, NaNs, overflows, and signed zeroes correctly. 345 */ 346 if (!isfinite(x) || !isfinite(y)) 347 return (x * y + z); 348 if (!isfinite(z)) 349 return (z); 350 if (x == 0.0 || y == 0.0) 351 return (x * y + z); 352 if (z == 0.0) 353 return (x * y); 354 355 xs = frexp(x, &ex); 356 ys = frexp(y, &ey); 357 zs = frexp(z, &ez); 358 oround = fegetround(); 359 spread = ex + ey - ez; 360 361 /* 362 * If x * y and z are many orders of magnitude apart, the scaling 363 * will overflow, so we handle these cases specially. Rounding 364 * modes other than FE_TONEAREST are painful. 365 */ 366 if (spread < -DBL_MANT_DIG) { 367#ifdef FE_INEXACT 368 feraiseexcept(FE_INEXACT); 369#endif 370#ifdef FE_UNDERFLOW 371 if (!isnormal(z)) 372 feraiseexcept(FE_UNDERFLOW); 373#endif 374 switch (oround) { 375 default: /* FE_TONEAREST */ 376 return (z); 377#ifdef FE_TOWARDZERO 378 case FE_TOWARDZERO: 379 if (x > 0.0 ^ y < 0.0 ^ z < 0.0) 380 return (z); 381 else 382 return (nextafter(z, 0)); 383#endif 384#ifdef FE_DOWNWARD 385 case FE_DOWNWARD: 386 if (x > 0.0 ^ y < 0.0) 387 return (z); 388 else 389 return (nextafter(z, -INFINITY)); 390#endif 391#ifdef FE_UPWARD 392 case FE_UPWARD: 393 if (x > 0.0 ^ y < 0.0) 394 return (nextafter(z, INFINITY)); 395 else 396 return (z); 397#endif 398 } 399 } 400 if (spread <= DBL_MANT_DIG * 2) 401 zs = scalbn(zs, -spread); 402 else 403 zs = copysign(DBL_MIN, zs); 404 405 fesetround(FE_TONEAREST); 406 407 /* 408 * Basic approach for round-to-nearest: 409 * 410 * (xy.hi, xy.lo) = x * y (exact) 411 * (r.hi, r.lo) = xy.hi + z (exact) 412 * adj = xy.lo + r.lo (inexact; low bit is sticky) 413 * result = r.hi + adj (correctly rounded) 414 */ 415 xy = dd_mul(xs, ys); 416 r = dd_add(xy.hi, zs); 417 418 spread = ex + ey; 419 420 if (r.hi == 0.0) { 421 /* 422 * When the addends cancel to 0, ensure that the result has 423 * the correct sign. 424 */ 425 fesetround(oround); 426 volatile double vzs = zs; /* XXX gcc CSE bug workaround */ 427 return xy.hi + vzs + scalbn(xy.lo, spread); 428 } 429 430 if (oround != FE_TONEAREST) { 431 /* 432 * There is no need to worry about double rounding in directed 433 * rounding modes. 434 * But underflow may not be raised properly, example in downward rounding: 435 * fma(0x1.000000001p-1000, 0x1.000000001p-30, -0x1p-1066) 436 */ 437 double ret; 438#if defined(FE_INEXACT) && defined(FE_UNDERFLOW) 439 int e = fetestexcept(FE_INEXACT); 440 feclearexcept(FE_INEXACT); 441#endif 442 fesetround(oround); 443 adj = r.lo + xy.lo; 444 ret = scalbn(r.hi + adj, spread); 445#if defined(FE_INEXACT) && defined(FE_UNDERFLOW) 446 if (ilogb(ret) < -1022 && fetestexcept(FE_INEXACT)) 447 feraiseexcept(FE_UNDERFLOW); 448 else if (e) 449 feraiseexcept(FE_INEXACT); 450#endif 451 return ret; 452 } 453 454 adj = add_adjusted(r.lo, xy.lo); 455 if (spread + ilogb(r.hi) > -1023) 456 return scalbn(r.hi + adj, spread); 457 else 458 return add_and_denormalize(r.hi, adj, spread); 459} 460#endif 461