1/* Mulders' MulHigh function (short product)
2
3Copyright 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
4Contributed by the AriC and Caramel projects, INRIA.
5
6This file is part of the GNU MPFR Library.
7
8The GNU MPFR Library is free software; you can redistribute it and/or modify
9it under the terms of the GNU Lesser General Public License as published by
10the Free Software Foundation; either version 3 of the License, or (at your
11option) any later version.
12
13The GNU MPFR Library is distributed in the hope that it will be useful, but
14WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16License for more details.
17
18You should have received a copy of the GNU Lesser General Public License
19along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
20http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
2151 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
22
23/* References:
24   [1] Short Division of Long Integers, David Harvey and Paul Zimmermann,
25       Proceedings of the 20th Symposium on Computer Arithmetic (ARITH-20),
26       July 25-27, 2011, pages 7-14.
27*/
28
29#define MPFR_NEED_LONGLONG_H
30#include "mpfr-impl.h"
31
32#ifndef MUL_FFT_THRESHOLD
33#define MUL_FFT_THRESHOLD 8448
34#endif
35
36/* Don't use MPFR_MULHIGH_SIZE since it is handled by tuneup */
37#ifdef MPFR_MULHIGH_TAB_SIZE
38static short mulhigh_ktab[MPFR_MULHIGH_TAB_SIZE];
39#else
40static short mulhigh_ktab[] = {MPFR_MULHIGH_TAB};
41#define MPFR_MULHIGH_TAB_SIZE \
42  ((mp_size_t) (sizeof(mulhigh_ktab) / sizeof(mulhigh_ktab[0])))
43#endif
44
45/* Put in  rp[n..2n-1] an approximation of the n high limbs
46   of {up, n} * {vp, n}. The error is less than n ulps of rp[n] (and the
47   approximation is always less or equal to the truncated full product).
48   Assume 2n limbs are allocated at rp.
49
50   Implements Algorithm ShortMulNaive from [1].
51*/
52static void
53mpfr_mulhigh_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up,
54                         mpfr_limb_srcptr vp, mp_size_t n)
55{
56  mp_size_t i;
57
58  rp += n - 1;
59  umul_ppmm (rp[1], rp[0], up[n-1], vp[0]); /* we neglect up[0..n-2]*vp[0],
60                                               which is less than B^n */
61  for (i = 1 ; i < n ; i++)
62    /* here, we neglect up[0..n-i-2] * vp[i], which is less than B^n too */
63    rp[i + 1] = mpn_addmul_1 (rp, up + (n - i - 1), i + 1, vp[i]);
64  /* in total, we neglect less than n*B^n, i.e., n ulps of rp[n]. */
65}
66
67/* Put in  rp[0..n] the n+1 low limbs of {up, n} * {vp, n}.
68   Assume 2n limbs are allocated at rp. */
69static void
70mpfr_mullow_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up,
71                        mpfr_limb_srcptr vp, mp_size_t n)
72{
73  mp_size_t i;
74
75  rp[n] = mpn_mul_1 (rp, up, n, vp[0]);
76  for (i = 1 ; i < n ; i++)
77    mpn_addmul_1 (rp + i, up, n - i + 1, vp[i]);
78}
79
80/* Put in  rp[n..2n-1] an approximation of the n high limbs
81   of {np, n} * {mp, n}. The error is less than n ulps of rp[n] (and the
82   approximation is always less or equal to the truncated full product).
83
84   Implements Algorithm ShortMul from [1].
85*/
86void
87mpfr_mulhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
88                mp_size_t n)
89{
90  mp_size_t k;
91
92  MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
93  k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
94  /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates
95     into k >= (n+4)/2 in the C language. */
96  MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
97  if (k < 0)
98    mpn_mul_basecase (rp, np, n, mp, n); /* result is exact, no error */
99  else if (k == 0)
100    mpfr_mulhigh_n_basecase (rp, np, mp, n); /* basecase error < n ulps */
101  else if (n > MUL_FFT_THRESHOLD)
102    mpn_mul_n (rp, np, mp, n); /* result is exact, no error */
103  else
104    {
105      mp_size_t l = n - k;
106      mp_limb_t cy;
107
108      mpn_mul_n (rp + 2 * l, np + l, mp + l, k); /* fills rp[2l..2n-1] */
109      mpfr_mulhigh_n (rp, np + k, mp, l);        /* fills rp[l-1..2l-1] */
110      cy = mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
111      mpfr_mulhigh_n (rp, np, mp + k, l);        /* fills rp[l-1..2l-1] */
112      cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
113      mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
114    }
115}
116
117/* Put in  rp[0..n] the n+1 low limbs of {np, n} * {mp, n}.
118   Assume 2n limbs are allocated at rp. */
119void
120mpfr_mullow_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
121               mp_size_t n)
122{
123  mp_size_t k;
124
125  MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
126  k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
127  MPFR_ASSERTD (k == -1 || k == 0 || (2 * k >= n && k < n));
128  if (k < 0)
129    mpn_mul_basecase (rp, np, n, mp, n);
130  else if (k == 0)
131    mpfr_mullow_n_basecase (rp, np, mp, n);
132  else if (n > MUL_FFT_THRESHOLD)
133    mpn_mul_n (rp, np, mp, n);
134  else
135    {
136      mp_size_t l = n - k;
137
138      mpn_mul_n (rp, np, mp, k);                      /* fills rp[0..2k] */
139      mpfr_mullow_n (rp + n, np + k, mp, l);          /* fills rp[n..n+2l] */
140      mpn_add_n (rp + k, rp + k, rp + n, l + 1);
141      mpfr_mullow_n (rp + n, np, mp + k, l);          /* fills rp[n..n+2l] */
142      mpn_add_n (rp + k, rp + k, rp + n, l + 1);
143    }
144}
145
146#ifdef MPFR_SQRHIGH_TAB_SIZE
147static short sqrhigh_ktab[MPFR_SQRHIGH_TAB_SIZE];
148#else
149static short sqrhigh_ktab[] = {MPFR_SQRHIGH_TAB};
150#define MPFR_SQRHIGH_TAB_SIZE (sizeof(sqrhigh_ktab) / sizeof(sqrhigh_ktab[0]))
151#endif
152
153/* Put in  rp[n..2n-1] an approximation of the n high limbs
154   of {np, n}^2. The error is less than n ulps of rp[n]. */
155void
156mpfr_sqrhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mp_size_t n)
157{
158  mp_size_t k;
159
160  MPFR_ASSERTN (MPFR_SQRHIGH_TAB_SIZE > 2); /* ensures k < n */
161  k = MPFR_LIKELY (n < MPFR_SQRHIGH_TAB_SIZE) ? sqrhigh_ktab[n]
162    : (n+4)/2; /* ensures that k >= (n+3)/2 */
163  MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
164  if (k < 0)
165    /* we can't use mpn_sqr_basecase here, since it requires
166       n <= SQR_KARATSUBA_THRESHOLD, where SQR_KARATSUBA_THRESHOLD
167       is not exported by GMP */
168    mpn_sqr_n (rp, np, n);
169  else if (k == 0)
170    mpfr_mulhigh_n_basecase (rp, np, np, n);
171  else
172    {
173      mp_size_t l = n - k;
174      mp_limb_t cy;
175
176      mpn_sqr_n (rp + 2 * l, np + l, k);          /* fills rp[2l..2n-1] */
177      mpfr_mulhigh_n (rp, np, np + k, l);         /* fills rp[l-1..2l-1] */
178      /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */
179      cy = mpn_lshift (rp + l - 1, rp + l - 1, l + 1, 1);
180      cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
181      mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
182    }
183}
184
185#ifdef MPFR_DIVHIGH_TAB_SIZE
186static short divhigh_ktab[MPFR_DIVHIGH_TAB_SIZE];
187#else
188static short divhigh_ktab[] = {MPFR_DIVHIGH_TAB};
189#define MPFR_DIVHIGH_TAB_SIZE (sizeof(divhigh_ktab) / sizeof(divhigh_ktab[0]))
190#endif
191
192#ifndef __GMPFR_GMP_H__
193#define mpfr_pi1_t gmp_pi1_t /* with a GMP build */
194#endif
195
196#if !(defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q))
197/* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
198   with the most significant limb of the quotient as return value (0 or 1).
199   Assumes the most significant bit of D is set. Clobbers N.
200
201   The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4.
202*/
203static mp_limb_t
204mpfr_divhigh_n_basecase (mpfr_limb_ptr qp, mpfr_limb_ptr np,
205                         mpfr_limb_srcptr dp, mp_size_t n)
206{
207  mp_limb_t qh, d1, d0, dinv, q2, q1, q0;
208  mpfr_pi1_t dinv2;
209
210  np += n;
211
212  if ((qh = (mpn_cmp (np, dp, n) >= 0)))
213    mpn_sub_n (np, np, dp, n);
214
215  /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */
216
217  d1 = dp[n - 1];
218
219  if (n == 1)
220    {
221      invert_limb (dinv, d1);
222      umul_ppmm (q1, q0, np[0], dinv);
223      qp[0] = np[0] + q1;
224      return qh;
225    }
226
227  /* now n >= 2 */
228  d0 = dp[n - 2];
229  invert_pi1 (dinv2, d1, d0);
230  /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */
231  while (n > 1)
232    {
233      /* Invariant: it remains to reduce n limbs from N (in addition to the
234         initial low n limbs).
235         Since n >= 2 here, necessarily we had n >= 2 initially, which means
236         that in addition to the limb np[n-1] to reduce, we have at least 2
237         extra limbs, thus accessing np[n-3] is valid. */
238
239      /* warning: we can have np[n-1]=d1 and np[n-2]=d0, but since {np,n} < D,
240         the largest possible partial quotient is B-1 */
241      if (MPFR_UNLIKELY(np[n - 1] == d1 && np[n - 2] == d0))
242        q2 = ~ (mp_limb_t) 0;
243      else
244        udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3],
245                      d1, d0, dinv2.inv32);
246      /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)),
247         we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
248         thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
249         and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
250         thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
251         which proves that at most one correction is needed */
252      q0 = mpn_submul_1 (np - 1, dp, n, q2);
253      if (MPFR_UNLIKELY(q0 > np[n - 1]))
254        {
255          mpn_add_n (np - 1, np - 1, dp, n);
256          q2 --;
257        }
258      qp[--n] = q2;
259      dp ++;
260    }
261
262  /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1
263     q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1)
264        <= floor((np[0]*B+np[1])/d1)
265     thus q1 is not larger than the true quotient.
266     q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2
267     For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0))
268     thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e.,
269     (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0)
270     d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1 >= np[0]*B^2 - 2*d1*B - B^2
271     thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4.
272
273     For d1*B+d0 = B^2/2, dinv2 = B-1 thus q1 > np[0]*(2B-1)/B - 1 >
274     np[0]*B/d1 - 2.
275
276     In all cases, if q = floor((np[0]*B+np[1])/d1), we have:
277     q - 4 <= q1 <= q
278  */
279  umul_ppmm (q1, q0, np[0], dinv2.inv32);
280  qp[0] = np[0] + q1;
281
282  return qh;
283}
284#endif
285
286/* Put in {qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
287   with the most significant limb of the quotient as return value (0 or 1).
288   Assumes the most significant bit of D is set. Clobbers N.
289
290   This implements the ShortDiv algorithm from reference [1].
291*/
292#if 1
293mp_limb_t
294mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
295                mp_size_t n)
296{
297  mp_size_t k, l;
298  mp_limb_t qh, cy;
299  mpfr_limb_ptr tp;
300  MPFR_TMP_DECL(marker);
301
302  MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 15); /* so that 2*(n/3) >= (n+4)/2 */
303  k = MPFR_LIKELY (n < MPFR_DIVHIGH_TAB_SIZE) ? divhigh_ktab[n] : 2*(n/3);
304
305  if (k == 0)
306#if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q)
307  {
308    mpfr_pi1_t dinv2;
309    invert_pi1 (dinv2, dp[n - 1], dp[n - 2]);
310    return __gmpn_sbpi1_divappr_q (qp, np, n + n, dp, n, dinv2.inv32);
311  }
312#else /* use our own code for base-case short division */
313    return mpfr_divhigh_n_basecase (qp, np, dp, n);
314#endif
315  else if (k == n)
316    /* for k=n, we use a division with remainder (mpn_divrem),
317     which computes the exact quotient */
318    return mpn_divrem (qp, 0, np, 2 * n, dp, n);
319
320  MPFR_ASSERTD ((n+4)/2 <= k && k < n); /* bounds from [1] */
321  MPFR_TMP_MARK (marker);
322  l = n - k;
323  /* first divide the most significant 2k limbs from N by the most significant
324     k limbs of D */
325  qh = mpn_divrem (qp + l, 0, np + 2 * l, 2 * k, dp + l, k); /* exact */
326
327  /* it remains {np,2l+k} = {np,n+l} as remainder */
328
329  /* now we have to subtract high(Q1)*D0 where Q1=qh*B^k+{qp+l,k} and
330     D0={dp,l} */
331  tp = MPFR_TMP_LIMBS_ALLOC (2 * l);
332  mpfr_mulhigh_n (tp, qp + k, dp, l);
333  /* we are only interested in the upper l limbs from {tp,2l} */
334  cy = mpn_sub_n (np + n, np + n, tp + l, l);
335  if (qh)
336    cy += mpn_sub_n (np + n, np + n, dp, l);
337  while (cy > 0) /* Q1 was too large: subtract 1 to Q1 and add D to np+l */
338    {
339      qh -= mpn_sub_1 (qp + l, qp + l, k, MPFR_LIMB_ONE);
340      cy -= mpn_add_n (np + l, np + l, dp, n);
341    }
342
343  /* now it remains {np,n+l} to divide by D */
344  cy = mpfr_divhigh_n (qp, np + k, dp + k, l);
345  qh += mpn_add_1 (qp + l, qp + l, k, cy);
346  MPFR_TMP_FREE(marker);
347
348  return qh;
349}
350#else /* below is the FoldDiv(K) algorithm from [1] */
351mp_limb_t
352mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
353                mp_size_t n)
354{
355  mp_size_t k, r;
356  mpfr_limb_ptr ip, tp, up;
357  mp_limb_t qh = 0, cy, cc;
358  int count;
359  MPFR_TMP_DECL(marker);
360
361#define K 3
362  if (n < K)
363    return mpn_divrem (qp, 0, np, 2 * n, dp, n);
364
365  k = (n - 1) / K + 1; /* ceil(n/K) */
366
367  MPFR_TMP_MARK (marker);
368  ip = MPFR_TMP_LIMBS_ALLOC (k + 1);
369  tp = MPFR_TMP_LIMBS_ALLOC (n + k);
370  up = MPFR_TMP_LIMBS_ALLOC (2 * (k + 1));
371  mpn_invert (ip, dp + n - (k + 1), k + 1, NULL); /* takes about 13% for n=1000 */
372  /* {ip, k+1} = floor((B^(2k+2)-1)/D - B^(k+1) where D = {dp+n-(k+1),k+1} */
373  for (r = n, cc = 0UL; r > 0;)
374    {
375      /* cc is the carry at np[n+r] */
376      MPFR_ASSERTD(cc <= 1);
377      /* FIXME: why can we have cc as large as say 8? */
378      count = 0;
379      while (cc > 0)
380        {
381          count ++;
382          MPFR_ASSERTD(count <= 1);
383          /* subtract {dp+n-r,r} from {np+n,r} */
384          cc -= mpn_sub_n (np + n, np + n, dp + n - r, r);
385          /* add 1 at qp[r] */
386          qh += mpn_add_1 (qp + r, qp + r, n - r, 1UL);
387        }
388      /* it remains r limbs to reduce, i.e., the remainder is {np, n+r} */
389      if (r < k)
390        {
391          ip += k - r;
392          k = r;
393        }
394      /* now r >= k */
395      /* qp + r - 2 * k -> up */
396      mpfr_mulhigh_n (up, np + n + r - (k + 1), ip, k + 1);
397      /* take into account the term B^k in the inverse: B^k * {np+n+r-k, k} */
398      cy = mpn_add_n (qp + r - k, up + k + 2, np + n + r - k, k);
399      /* since we need only r limbs of tp (below), it suffices to consider
400         r high limbs of dp */
401      if (r > k)
402        {
403#if 0
404          mpn_mul (tp, dp + n - r, r, qp + r - k, k);
405#else  /* use a short product for the low k x k limbs */
406          /* we know the upper k limbs of the r-limb product cancel with the
407             remainder, thus we only need to compute the low r-k limbs */
408          if (r - k >= k)
409            mpn_mul (tp + k, dp + n - r + k, r - k, qp + r - k, k);
410          else /* r-k < k */
411            {
412/* #define LOW */
413#ifndef LOW
414              mpn_mul (tp + k, qp + r - k, k, dp + n - r + k, r - k);
415#else
416              mpfr_mullow_n_basecase (tp + k, qp + r - k, dp + n - r + k, r - k);
417              /* take into account qp[2r-2k] * dp[n - r + k] */
418              tp[r] += qp[2*r-2*k] * dp[n - r + k];
419#endif
420              /* tp[k..r] is filled */
421            }
422#if 0
423          mpfr_mulhigh_n (up, dp + n - r, qp + r - k, k);
424#else /* compute one more limb. FIXME: we could add one limb of dp in the
425         above, to save one mpn_addmul_1 call */
426          mpfr_mulhigh_n (up, dp + n - r, qp + r - k, k - 1); /* {up,2k-2} */
427          /* add {qp + r - k, k - 1} * dp[n-r+k-1] */
428          up[2*k-2] = mpn_addmul_1 (up + k - 1, qp + r - k, k-1, dp[n-r+k-1]);
429          /* add {dp+n-r, k} * qp[r-1] */
430          up[2*k-1] = mpn_addmul_1 (up + k - 1, dp + n - r, k, qp[r-1]);
431#endif
432#ifndef LOW
433          cc = mpn_add_n (tp + k, tp + k, up + k, k);
434          mpn_add_1 (tp + 2 * k, tp + 2 * k, r - k, cc);
435#else
436          /* update tp[k..r] */
437          if (r - k + 1 <= k)
438            mpn_add_n (tp + k, tp + k, up + k, r - k + 1);
439          else /* r - k >= k */
440            {
441              cc = mpn_add_n (tp + k, tp + k, up + k, k);
442              mpn_add_1 (tp + 2 * k, tp + 2 * k, r - 2 * k + 1, cc);
443            }
444#endif
445#endif
446        }
447      else /* last step: since we only want the quotient, no need to update,
448              just propagate the carry cy */
449        {
450          MPFR_ASSERTD(r < n);
451          if (cy > 0)
452            qh += mpn_add_1 (qp + r, qp + r, n - r, cy);
453          break;
454        }
455      /* subtract {tp, n+k} from {np+r-k, n+k}; however we only want to
456         update {np+n, n} */
457      /* we should have tp[r] = np[n+r-k] up to 1 */
458      MPFR_ASSERTD(tp[r] == np[n + r - k] || tp[r] + 1 == np[n + r - k]);
459#ifndef LOW
460      cc = mpn_sub_n (np + n - 1, np + n - 1, tp + k - 1, r + 1); /* borrow at np[n+r] */
461#else
462      cc = mpn_sub_n (np + n - 1, np + n - 1, tp + k - 1, r - k + 2);
463#endif
464      /* if cy = 1, subtract {dp, n} from {np+r, n}, thus
465         {dp+n-r,r} from {np+n,r} */
466      if (cy)
467        {
468          if (r < n)
469            cc += mpn_sub_n (np + n - 1, np + n - 1, dp + n - r - 1, r + 1);
470          else
471            cc += mpn_sub_n (np + n, np + n, dp + n - r, r);
472          /* propagate cy */
473          if (r == n)
474            qh = cy;
475          else
476            qh += mpn_add_1 (qp + r, qp + r, n - r, cy);
477        }
478      /* cc is the borrow at np[n+r] */
479      count = 0;
480      while (cc > 0) /* quotient was too large */
481        {
482          count++;
483          MPFR_ASSERTD (count <= 1);
484          cy = mpn_add_n (np + n, np + n, dp + n - (r - k), r - k);
485          cc -= mpn_add_1 (np + n + r - k, np + n + r - k, k, cy);
486          qh -= mpn_sub_1 (qp + r - k, qp + r - k, n - (r - k), 1UL);
487        }
488      r -= k;
489      cc = np[n + r];
490    }
491  MPFR_TMP_FREE(marker);
492
493  return qh;
494}
495#endif
496