1/* mpfr_exp_2 -- exponential of a floating-point number
2                 using algorithms in O(n^(1/2)*M(n)) and O(n^(1/3)*M(n))
3
4Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
5Contributed by the AriC and Caramel projects, INRIA.
6
7This file is part of the GNU MPFR Library.
8
9The GNU MPFR Library is free software; you can redistribute it and/or modify
10it under the terms of the GNU Lesser General Public License as published by
11the Free Software Foundation; either version 3 of the License, or (at your
12option) any later version.
13
14The GNU MPFR Library is distributed in the hope that it will be useful, but
15WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17License for more details.
18
19You should have received a copy of the GNU Lesser General Public License
20along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
21http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
2251 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
23
24/* #define DEBUG */
25#define MPFR_NEED_LONGLONG_H /* for count_leading_zeros */
26#include "mpfr-impl.h"
27
28static unsigned long
29mpfr_exp2_aux (mpz_t, mpfr_srcptr, mpfr_prec_t, mpfr_exp_t *);
30static unsigned long
31mpfr_exp2_aux2 (mpz_t, mpfr_srcptr, mpfr_prec_t, mpfr_exp_t *);
32static mpfr_exp_t
33mpz_normalize  (mpz_t, mpz_t, mpfr_exp_t);
34static mpfr_exp_t
35mpz_normalize2 (mpz_t, mpz_t, mpfr_exp_t, mpfr_exp_t);
36
37/* if k = the number of bits of z > q, divides z by 2^(k-q) and returns k-q.
38   Otherwise do nothing and return 0.
39 */
40static mpfr_exp_t
41mpz_normalize (mpz_t rop, mpz_t z, mpfr_exp_t q)
42{
43  size_t k;
44
45  MPFR_MPZ_SIZEINBASE2 (k, z);
46  MPFR_ASSERTD (k == (mpfr_uexp_t) k);
47  if (q < 0 || (mpfr_uexp_t) k > (mpfr_uexp_t) q)
48    {
49      mpz_fdiv_q_2exp (rop, z, (unsigned long) ((mpfr_uexp_t) k - q));
50      return (mpfr_exp_t) k - q;
51    }
52  if (MPFR_UNLIKELY(rop != z))
53    mpz_set (rop, z);
54  return 0;
55}
56
57/* if expz > target, shift z by (expz-target) bits to the left.
58   if expz < target, shift z by (target-expz) bits to the right.
59   Returns target.
60*/
61static mpfr_exp_t
62mpz_normalize2 (mpz_t rop, mpz_t z, mpfr_exp_t expz, mpfr_exp_t target)
63{
64  if (target > expz)
65    mpz_fdiv_q_2exp (rop, z, target - expz);
66  else
67    mpz_mul_2exp (rop, z, expz - target);
68  return target;
69}
70
71/* use Brent's formula exp(x) = (1+r+r^2/2!+r^3/3!+...)^(2^K)*2^n
72   where x = n*log(2)+(2^K)*r
73   together with the Paterson-Stockmeyer O(t^(1/2)) algorithm for the
74   evaluation of power series. The resulting complexity is O(n^(1/3)*M(n)).
75   This function returns with the exact flags due to exp.
76*/
77int
78mpfr_exp_2 (mpfr_ptr y, mpfr_srcptr x, mpfr_rnd_t rnd_mode)
79{
80  long n;
81  unsigned long K, k, l, err; /* FIXME: Which type ? */
82  int error_r;
83  mpfr_exp_t exps, expx;
84  mpfr_prec_t q, precy;
85  int inexact;
86  mpfr_t r, s;
87  mpz_t ss;
88  MPFR_ZIV_DECL (loop);
89
90  MPFR_LOG_FUNC
91    (("x[%Pu]=%.*Rg rnd=%d", mpfr_get_prec(x), mpfr_log_prec, x, rnd_mode),
92     ("y[%Pu]=%.*Rg inexact=%d", mpfr_get_prec(y), mpfr_log_prec, y,
93      inexact));
94
95  expx = MPFR_GET_EXP (x);
96  precy = MPFR_PREC(y);
97
98  /* Warning: we cannot use the 'double' type here, since on 64-bit machines
99     x may be as large as 2^62*log(2) without overflow, and then x/log(2)
100     is about 2^62: not every integer of that size can be represented as a
101     'double', thus the argument reduction would fail. */
102  if (expx <= -2)
103    /* |x| <= 0.25, thus n = round(x/log(2)) = 0 */
104    n = 0;
105  else
106    {
107      mpfr_init2 (r, sizeof (long) * CHAR_BIT);
108      mpfr_const_log2 (r, MPFR_RNDZ);
109      mpfr_div (r, x, r, MPFR_RNDN);
110      n = mpfr_get_si (r, MPFR_RNDN);
111      mpfr_clear (r);
112    }
113  /* we have |x| <= (|n|+1)*log(2) */
114  MPFR_LOG_MSG (("d(x)=%1.30e n=%ld\n", mpfr_get_d1(x), n));
115
116  /* error_r bounds the cancelled bits in x - n*log(2) */
117  if (MPFR_UNLIKELY (n == 0))
118    error_r = 0;
119  else
120    {
121      count_leading_zeros (error_r, (mp_limb_t) SAFE_ABS (unsigned long, n) + 1);
122      error_r = GMP_NUMB_BITS - error_r;
123      /* we have |x| <= 2^error_r * log(2) */
124    }
125
126  /* for the O(n^(1/2)*M(n)) method, the Taylor series computation of
127     n/K terms costs about n/(2K) multiplications when computed in fixed
128     point */
129  K = (precy < MPFR_EXP_2_THRESHOLD) ? __gmpfr_isqrt ((precy + 1) / 2)
130    : __gmpfr_cuberoot (4*precy);
131  l = (precy - 1) / K + 1;
132  err = K + MPFR_INT_CEIL_LOG2 (2 * l + 18);
133  /* add K extra bits, i.e. failure probability <= 1/2^K = O(1/precy) */
134  q = precy + err + K + 8;
135  /* if |x| >> 1, take into account the cancelled bits */
136  if (expx > 0)
137    q += expx;
138
139  /* Note: due to the mpfr_prec_round below, it is not possible to use
140     the MPFR_GROUP_* macros here. */
141
142  mpfr_init2 (r, q + error_r);
143  mpfr_init2 (s, q + error_r);
144
145  /* the algorithm consists in computing an upper bound of exp(x) using
146     a precision of q bits, and see if we can round to MPFR_PREC(y) taking
147     into account the maximal error. Otherwise we increase q. */
148  MPFR_ZIV_INIT (loop, q);
149  for (;;)
150    {
151      MPFR_LOG_MSG (("n=%ld K=%lu l=%lu q=%lu error_r=%d\n",
152                     n, K, l, (unsigned long) q, error_r));
153
154      /* First reduce the argument to r = x - n * log(2),
155         so that r is small in absolute value. We want an upper
156         bound on r to get an upper bound on exp(x). */
157
158      /* if n<0, we have to get an upper bound of log(2)
159         in order to get an upper bound of r = x-n*log(2) */
160      mpfr_const_log2 (s, (n >= 0) ? MPFR_RNDZ : MPFR_RNDU);
161      /* s is within 1 ulp(s) of log(2) */
162
163      mpfr_mul_ui (r, s, (n < 0) ? -n : n, (n >= 0) ? MPFR_RNDZ : MPFR_RNDU);
164      /* r is within 3 ulps of |n|*log(2) */
165      if (n < 0)
166        MPFR_CHANGE_SIGN (r);
167      /* r <= n*log(2), within 3 ulps */
168
169      MPFR_LOG_VAR (x);
170      MPFR_LOG_VAR (r);
171
172      mpfr_sub (r, x, r, MPFR_RNDU);
173
174      if (MPFR_IS_PURE_FP (r))
175        {
176          while (MPFR_IS_NEG (r))
177            { /* initial approximation n was too large */
178              n--;
179              mpfr_add (r, r, s, MPFR_RNDU);
180            }
181
182          /* since there was a cancellation in x - n*log(2), the low error_r
183             bits from r are zero and thus non significant, thus we can reduce
184             the working precision */
185          if (error_r > 0)
186            mpfr_prec_round (r, q, MPFR_RNDU);
187          /* the error on r is at most 3 ulps (3 ulps if error_r = 0,
188             and 1 + 3/2 if error_r > 0) */
189          MPFR_LOG_VAR (r);
190          MPFR_ASSERTD (MPFR_IS_POS (r));
191          mpfr_div_2ui (r, r, K, MPFR_RNDU); /* r = (x-n*log(2))/2^K, exact */
192
193          mpz_init (ss);
194          exps = mpfr_get_z_2exp (ss, s);
195          /* s <- 1 + r/1! + r^2/2! + ... + r^l/l! */
196          MPFR_ASSERTD (MPFR_IS_PURE_FP (r) && MPFR_EXP (r) < 0);
197          l = (precy < MPFR_EXP_2_THRESHOLD)
198            ? mpfr_exp2_aux (ss, r, q, &exps)   /* naive method */
199            : mpfr_exp2_aux2 (ss, r, q, &exps); /* Paterson/Stockmeyer meth */
200
201          MPFR_LOG_MSG (("l=%lu q=%lu (K+l)*q^2=%1.3e\n",
202                         l, (unsigned long) q, (K + l) * (double) q * q));
203
204          for (k = 0; k < K; k++)
205            {
206              mpz_mul (ss, ss, ss);
207              exps <<= 1;
208              exps += mpz_normalize (ss, ss, q);
209            }
210          mpfr_set_z (s, ss, MPFR_RNDN);
211
212          MPFR_SET_EXP(s, MPFR_GET_EXP (s) + exps);
213          mpz_clear (ss);
214
215          /* error is at most 2^K*l, plus 2 to take into account of
216             the error of 3 ulps on r */
217          err = K + MPFR_INT_CEIL_LOG2 (l) + 2;
218
219          MPFR_LOG_MSG (("before mult. by 2^n:\n", 0));
220          MPFR_LOG_VAR (s);
221          MPFR_LOG_MSG (("err=%lu bits\n", K));
222
223          if (MPFR_LIKELY (MPFR_CAN_ROUND (s, q - err, precy, rnd_mode)))
224            {
225              mpfr_clear_flags ();
226              inexact = mpfr_mul_2si (y, s, n, rnd_mode);
227              break;
228            }
229        }
230
231      MPFR_ZIV_NEXT (loop, q);
232      mpfr_set_prec (r, q + error_r);
233      mpfr_set_prec (s, q + error_r);
234    }
235  MPFR_ZIV_FREE (loop);
236
237  mpfr_clear (r);
238  mpfr_clear (s);
239
240  return inexact;
241}
242
243/* s <- 1 + r/1! + r^2/2! + ... + r^l/l! while MPFR_EXP(r^l/l!)+MPFR_EXPR(r)>-q
244   using naive method with O(l) multiplications.
245   Return the number of iterations l.
246   The absolute error on s is less than 3*l*(l+1)*2^(-q).
247   Version using fixed-point arithmetic with mpz instead
248   of mpfr for internal computations.
249   NOTE[VL]: the following sentence seems to be obsolete since MY_INIT_MPZ
250   is no longer used (r6919); qn was the number of limbs of q.
251   s must have at least qn+1 limbs (qn should be enough, but currently fails
252   since mpz_mul_2exp(s, s, q-1) reallocates qn+1 limbs)
253*/
254static unsigned long
255mpfr_exp2_aux (mpz_t s, mpfr_srcptr r, mpfr_prec_t q, mpfr_exp_t *exps)
256{
257  unsigned long l;
258  mpfr_exp_t dif, expt, expr;
259  mpz_t t, rr;
260  mp_size_t sbit, tbit;
261
262  MPFR_ASSERTN (MPFR_IS_PURE_FP (r));
263
264  expt = 0;
265  *exps = 1 - (mpfr_exp_t) q;                   /* s = 2^(q-1) */
266  mpz_init (t);
267  mpz_init (rr);
268  mpz_set_ui(t, 1);
269  mpz_set_ui(s, 1);
270  mpz_mul_2exp(s, s, q-1);
271  expr = mpfr_get_z_2exp(rr, r);               /* no error here */
272
273  l = 0;
274  for (;;) {
275    l++;
276    mpz_mul(t, t, rr);
277    expt += expr;
278    MPFR_MPZ_SIZEINBASE2 (sbit, s);
279    MPFR_MPZ_SIZEINBASE2 (tbit, t);
280    dif = *exps + sbit - expt - tbit;
281    /* truncates the bits of t which are < ulp(s) = 2^(1-q) */
282    expt += mpz_normalize(t, t, (mpfr_exp_t) q-dif); /* error at most 2^(1-q) */
283    mpz_fdiv_q_ui (t, t, l);                   /* error at most 2^(1-q) */
284    /* the error wrt t^l/l! is here at most 3*l*ulp(s) */
285    MPFR_ASSERTD (expt == *exps);
286    if (mpz_sgn (t) == 0)
287      break;
288    mpz_add(s, s, t);                      /* no error here: exact */
289    /* ensures rr has the same size as t: after several shifts, the error
290       on rr is still at most ulp(t)=ulp(s) */
291    MPFR_MPZ_SIZEINBASE2 (tbit, t);
292    expr += mpz_normalize(rr, rr, tbit);
293  }
294
295  mpz_clear (t);
296  mpz_clear (rr);
297
298  return 3 * l * (l + 1);
299}
300
301/* s <- 1 + r/1! + r^2/2! + ... + r^l/l! while MPFR_EXP(r^l/l!)+MPFR_EXPR(r)>-q
302   using Paterson-Stockmeyer algorithm with O(sqrt(l)) multiplications.
303   Return l.
304   Uses m multiplications of full size and 2l/m of decreasing size,
305   i.e. a total equivalent to about m+l/m full multiplications,
306   i.e. 2*sqrt(l) for m=sqrt(l).
307   NOTE[VL]: The following sentence seems to be obsolete since MY_INIT_MPZ
308   is no longer used (r6919); sizer was the number of limbs of r.
309   Version using mpz. ss must have at least (sizer+1) limbs.
310   The error is bounded by (l^2+4*l) ulps where l is the return value.
311*/
312static unsigned long
313mpfr_exp2_aux2 (mpz_t s, mpfr_srcptr r, mpfr_prec_t q, mpfr_exp_t *exps)
314{
315  mpfr_exp_t expr, *expR, expt;
316  mpfr_prec_t ql;
317  unsigned long l, m, i;
318  mpz_t t, *R, rr, tmp;
319  mp_size_t sbit, rrbit;
320  MPFR_TMP_DECL(marker);
321
322  /* estimate value of l */
323  MPFR_ASSERTD (MPFR_GET_EXP (r) < 0);
324  l = q / (- MPFR_GET_EXP (r));
325  m = __gmpfr_isqrt (l);
326  /* we access R[2], thus we need m >= 2 */
327  if (m < 2)
328    m = 2;
329
330  MPFR_TMP_MARK(marker);
331  R = (mpz_t*) MPFR_TMP_ALLOC ((m + 1) * sizeof (mpz_t));     /* R[i] is r^i */
332  expR = (mpfr_exp_t*) MPFR_TMP_ALLOC((m + 1) * sizeof (mpfr_exp_t));
333  /* expR[i] is the exponent for R[i] */
334  mpz_init (tmp);
335  mpz_init (rr);
336  mpz_init (t);
337  mpz_set_ui (s, 0);
338  *exps = 1 - q;                        /* 1 ulp = 2^(1-q) */
339  for (i = 0 ; i <= m ; i++)
340    mpz_init (R[i]);
341  expR[1] = mpfr_get_z_2exp (R[1], r); /* exact operation: no error */
342  expR[1] = mpz_normalize2 (R[1], R[1], expR[1], 1 - q); /* error <= 1 ulp */
343  mpz_mul (t, R[1], R[1]); /* err(t) <= 2 ulps */
344  mpz_fdiv_q_2exp (R[2], t, q - 1); /* err(R[2]) <= 3 ulps */
345  expR[2] = 1 - q;
346  for (i = 3 ; i <= m ; i++)
347    {
348      if ((i & 1) == 1)
349        mpz_mul (t, R[i-1], R[1]); /* err(t) <= 2*i-2 */
350      else
351        mpz_mul (t, R[i/2], R[i/2]);
352      mpz_fdiv_q_2exp (R[i], t, q - 1); /* err(R[i]) <= 2*i-1 ulps */
353      expR[i] = 1 - q;
354    }
355  mpz_set_ui (R[0], 1);
356  mpz_mul_2exp (R[0], R[0], q-1);
357  expR[0] = 1-q; /* R[0]=1 */
358  mpz_set_ui (rr, 1);
359  expr = 0; /* rr contains r^l/l! */
360  /* by induction: err(rr) <= 2*l ulps */
361
362  l = 0;
363  ql = q; /* precision used for current giant step */
364  do
365    {
366      /* all R[i] must have exponent 1-ql */
367      if (l != 0)
368        for (i = 0 ; i < m ; i++)
369          expR[i] = mpz_normalize2 (R[i], R[i], expR[i], 1 - ql);
370      /* the absolute error on R[i]*rr is still 2*i-1 ulps */
371      expt = mpz_normalize2 (t, R[m-1], expR[m-1], 1 - ql);
372      /* err(t) <= 2*m-1 ulps */
373      /* computes t = 1 + r/(l+1) + ... + r^(m-1)*l!/(l+m-1)!
374         using Horner's scheme */
375      for (i = m-1 ; i-- != 0 ; )
376        {
377          mpz_fdiv_q_ui (t, t, l+i+1); /* err(t) += 1 ulp */
378          mpz_add (t, t, R[i]);
379        }
380      /* now err(t) <= (3m-2) ulps */
381
382      /* now multiplies t by r^l/l! and adds to s */
383      mpz_mul (t, t, rr);
384      expt += expr;
385      expt = mpz_normalize2 (t, t, expt, *exps);
386      /* err(t) <= (3m-1) + err_rr(l) <= (3m-2) + 2*l */
387      MPFR_ASSERTD (expt == *exps);
388      mpz_add (s, s, t); /* no error here */
389
390      /* updates rr, the multiplication of the factors l+i could be done
391         using binary splitting too, but it is not sure it would save much */
392      mpz_mul (t, rr, R[m]); /* err(t) <= err(rr) + 2m-1 */
393      expr += expR[m];
394      mpz_set_ui (tmp, 1);
395      for (i = 1 ; i <= m ; i++)
396        mpz_mul_ui (tmp, tmp, l + i);
397      mpz_fdiv_q (t, t, tmp); /* err(t) <= err(rr) + 2m */
398      l += m;
399      if (MPFR_UNLIKELY (mpz_sgn (t) == 0))
400        break;
401      expr += mpz_normalize (rr, t, ql); /* err_rr(l+1) <= err_rr(l) + 2m+1 */
402      if (MPFR_UNLIKELY (mpz_sgn (rr) == 0))
403        rrbit = 1;
404      else
405        MPFR_MPZ_SIZEINBASE2 (rrbit, rr);
406      MPFR_MPZ_SIZEINBASE2 (sbit, s);
407      ql = q - *exps - sbit + expr + rrbit;
408      /* TODO: Wrong cast. I don't want what is right, but this is
409         certainly wrong */
410    }
411  while ((size_t) expr + rrbit > (size_t) -q);
412
413  for (i = 0 ; i <= m ; i++)
414    mpz_clear (R[i]);
415  MPFR_TMP_FREE(marker);
416  mpz_clear (rr);
417  mpz_clear (t);
418  mpz_clear (tmp);
419
420  return l * (l + 4);
421}
422