/* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.

   Compute Q = floor(N / D) + e.  N is nn limbs, D is dn limbs and must be
   normalized, and Q must be nn-dn limbs, 0 <= e <= 4.  The requirement that Q
   is nn-dn limbs (and not nn-dn+1 limbs) was put in place so that N can be
   left unmodified during the operation.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/
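
/* Informal sketch of the main iteration (the notation here is introduced for
   exposition only, it is not part of the code): let B be the limb base, R_h
   the top 'in' limbs of the partial remainder R, and B^in + I the approximate
   inverse, where the leading 1 is implicit in the stored {ip,in}.  Each pass
   of the main loop develops a block of 'in' quotient limbs as roughly

     Q_block = floor (R_h * (B^in + I) / B^in)

   and then forms the next partial remainder from the old one, the next 'in'
   limbs of the dividend N, and the product Q_block * D, fixing up Q_block
   with a few conditional subtractions of D.  */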

/* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.

 Things to work on:

  * The itch/scratch scheme perhaps isn't as good an idea as it once seemed,
    as demonstrated by the fact that the mpn_invertappr function's scratch
    needs mean that we must keep a large allocation long after it is needed.
    Things are made worse by mpn_mul_fft not accepting any scratch parameter,
    which means we'll have a large memory hole while in mpn_mul_fft.  In
    general, a peak scratch need at the beginning of a function isn't
    well-handled by the itch/scratch scheme.
*/
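
/* A minimal caller-side sketch of the itch/scratch scheme discussed above
   (illustration only; qp, np, nn, dp, dn are hypothetical caller variables,
   and 0 for mua_k selects the automatic inverse-size choice):

     mp_ptr scratch;
     mp_limb_t qh;
     TMP_DECL;

     TMP_MARK;
     scratch = TMP_ALLOC_LIMBS (mpn_mu_divappr_q_itch (nn, dn, 0));
     qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, scratch);
     TMP_FREE;
*/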

#ifdef STAT
#undef STAT
#define STAT(x) x
#else
#define STAT(x)
#endif

#include <stdlib.h>		/* for NULL */
#include "gmp-impl.h"

static mp_limb_t mpn_preinv_mu_divappr_q (mp_ptr, mp_srcptr, mp_size_t,
			 mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr);
static mp_size_t mpn_mu_divappr_q_choose_in (mp_size_t, mp_size_t, int);

mp_limb_t
mpn_mu_divappr_q (mp_ptr qp,
		  mp_srcptr np,
		  mp_size_t nn,
		  mp_srcptr dp,
		  mp_size_t dn,
		  mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy, qh;
  mp_ptr ip, tp;

  ASSERT (dn > 1);

  qn = nn - dn;

  /* If Q is smaller in size than D, truncate operands. */
  if (qn + 1 < dn)
    {
      np += dn - (qn + 1);
      nn -= dn - (qn + 1);
      dp += dn - (qn + 1);
      dn = qn + 1;
    }

  /* Compute the inverse size.  */
  in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

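  /* Reminder (informal; see invertappr.c for the authoritative statement):
     mpn_invertappr (ip, dp, n, scratch) computes an n-limb approximate
     reciprocal of the normalized n-limb divisor {dp,n}, such that the
     (n+1)-limb value B^n + {ip,n} approximates B^(2n) / {dp,n} with a small,
     bounded error.  The code below computes such an inverse on in+1 limbs,
     then drops the low limb to get the in-limb inverse used by the main
     loop.  */
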
#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
     not adapted (3) mpn_invertappr scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* compute an approximate inverse on (in+1) limbs */
  if (dn == in)
    {
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invertappr (ip, tp, in + 1, tp + in + 1);
      MPN_COPY_INCR (ip, ip + 1, in);
    }
  else
    {
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
	MPN_ZERO (ip, in);
      else
	{
	  mpn_invertappr (ip, tp, in + 1, tp + in + 1);
	  MPN_COPY_INCR (ip, ip + 1, in);
	}
    }
#else
  /* This older inverse computation method gets slightly worse results than the
     one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
     inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

  qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);

  return qh;
}

static mp_limb_t
mpn_preinv_mu_divappr_q (mp_ptr qp,
			 mp_srcptr np,
			 mp_size_t nn,
			 mp_srcptr dp,
			 mp_size_t dn,
			 mp_srcptr ip,
			 mp_size_t in,
			 mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy, cx, qh;
  mp_limb_t r;
  mp_size_t tn, wn;

#define rp           scratch
#define tp           (scratch + dn)
#define scratch_out  (scratch + dn + tn)
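/* Scratch layout implied by the macros above (sizes are those actually used
   below; tn is only meaningful on the mpn_mulmod_bnm1 path):

     | rp: dn limbs | tp: tn (or dn+in) limbs | scratch_out: ... |
*/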

  qn = nn - dn;

  np += qn;
  qp += qn;

  qh = mpn_cmp (np, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (rp, np, dp, dn);
  else
    MPN_COPY (rp, np, dn);

  if (qn == 0)
    return qh;			/* Degenerate use.  Should we allow this? */

  while (qn > 0)
    {
      if (qn < in)
	{
	  ip += in - qn;
	  in = qn;
	}
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse I
	 by the upper part of the partial remainder R.  */
      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi  */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);

      qn -= in;
      if (qn == 0)
	break;

      /* Compute the product of the quotient block and the divisor D, to be
	 subtracted from the partial remainder combined with new limbs from the
	 dividend N.  We only really need the low dn limbs.  */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);		/* dn+in limbs, high 'in' cancels */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn + 1);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
	      cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
	      cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
	      ASSERT_ALWAYS (cx >= cy);
	      mpn_incr_u (tp, cx - cy);
	    }
	}

      r = rp[dn - in] - tp[dn];

      /* Subtract the product from the partial remainder combined with new
	 limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
	{
	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
	}
      else
	{
	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
	}

      STAT (int i; int err = 0;
	    static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
	{
	  /* We loop 0 times with about 69% probability, 1 time with about 31%
	     probability, 2 times with about 0.6% probability, if inverse is
	     computed as recommended.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  r -= cy;
	  STAT (err++);
	}
      if (mpn_cmp (rp, dp, dn) >= 0)
	{
	  /* This is executed with about 76% probability.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  STAT (err++);
	}

      STAT (
	    tot++;
	    errarr[err]++;
	    if (err > err_rec)
	      err_rec = err;
	    if (tot % 0x10000 == 0)
	      {
		for (i = 0; i <= err_rec; i++)
		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
		printf ("\n");
	      }
	    );
    }

  /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
     quotient.  For now, just make sure the returned quotient is >= the real
     quotient; add 3 with saturating arithmetic.  */
  qn = nn - dn;
  cy = mpn_add_1 (qp, qp, qn, 3);
  if (cy != 0)
    {
      if (qh != 0)
	{
	  /* Return a quotient of just 1-bits, with qh set.  */
	  mp_size_t i;
	  for (i = 0; i < qn; i++)
	    qp[i] = GMP_NUMB_MAX;
	}
      else
	{
	  /* Propagate carry into qh.  */
	  qh = 1;
	}
    }

  return qh;
}

/* In case k=0 (automatic choice), we distinguish 3 cases:
   (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
   (b) dn/3 < qn <= dn: in = ceil(qn / 2)
   (c) qn < dn/3:       in = qn
   In all cases we have in <= dn.
 */
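/* Worked example for case (a) (numbers chosen here purely for illustration):
   qn = 10, dn = 4 gives b = ceil(10/4) = 3 blocks and in = ceil(10/3) = 4,
   and indeed in <= dn.  The main loop then develops the quotient in blocks
   of 4, 4, and 2 limbs.  */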
static mp_size_t
mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
{
  mp_size_t in;

  if (k == 0)
    {
      mp_size_t b;
      if (qn > dn)
	{
	  /* Compute an inverse size that is a nice partition of the quotient.  */
	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
	}
      else if (3 * qn > dn)
	{
	  in = (qn - 1) / 2 + 1;	/* b = 2 */
	}
      else
	{
	  in = (qn - 1) / 1 + 1;	/* b = 1 */
	}
    }
  else
    {
      mp_size_t xn;
      xn = MIN (dn, qn);
      in = (xn - 1) / k + 1;
    }

  return in;
}

mp_size_t
mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
  mp_size_t qn, in, itch_local, itch_out, itch_invapp;

  qn = nn - dn;
  if (qn + 1 < dn)
    {
      dn = qn + 1;
    }
  in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k);

  itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
  itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
  itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */

  ASSERT (dn + itch_local + itch_out >= itch_invapp);
  return in + MAX (dn + itch_local + itch_out, itch_invapp);
}