mu_div_qr.c revision 1.1.1.1
1/* mpn_mu_div_qr, mpn_preinv_mu_div_qr.
2
3   Compute Q = floor(N / D) and R = N-QD.  N is nn limbs and D is dn limbs and
4   must be normalized, and Q must be nn-dn limbs.  The requirement that Q is
5   nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us to
6   let N be unmodified during the operation.
7
8   Contributed to the GNU project by Torbjorn Granlund.
9
10   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
11   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
12   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
13
14Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
15
16This file is part of the GNU MP Library.
17
18The GNU MP Library is free software; you can redistribute it and/or modify
19it under the terms of the GNU Lesser General Public License as published by
20the Free Software Foundation; either version 3 of the License, or (at your
21option) any later version.
22
23The GNU MP Library is distributed in the hope that it will be useful, but
24WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
26License for more details.
27
28You should have received a copy of the GNU Lesser General Public License
29along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
30
31
32/*
33   The idea of the algorithm used herein is to compute a smaller inverted value
34   than used in the standard Barrett algorithm, and thus save time in the
35   Newton iterations, and pay just a small price when using the inverted value
36   for developing quotient bits.  This algorithm was presented at ICMS 2006.
37*/
38
39/* CAUTION: This code and the code in mu_divappr_q.c should be edited in sync.
40
41 Things to work on:
42
43  * This isn't optimal when the quotient isn't needed, as it might take a lot
44    of space.  The computation is always needed, though, so there is no time to
45    save with special code.
46
47  * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
48    demonstrated by the fact that the mpn_invertappr function's scratch needs
49    mean that we need to keep a large allocation long after it is needed.
50    Things are worse as mpn_mul_fft does not accept any scratch parameter,
51    which means we'll have a large memory hole while in mpn_mul_fft.  In
52    general, a peak scratch need in the beginning of a function isn't
53    well-handled by the itch/scratch scheme.
54*/
55
56#ifdef STAT
57#undef STAT
58#define STAT(x) x
59#else
60#define STAT(x)
61#endif
62
63#include <stdlib.h>		/* for NULL */
64#include "gmp.h"
65#include "gmp-impl.h"
66
67
68/* FIXME: The MU_DIV_QR_SKEW_THRESHOLD was not analysed properly.  It gives a
69   speedup according to old measurements, but does the decision mechanism
70   really make sense?  It seem like the quotient between dn and qn might be
71   what we really should be checking.  */
72#ifndef MU_DIV_QR_SKEW_THRESHOLD
73#define MU_DIV_QR_SKEW_THRESHOLD 100
74#endif
75
76#ifdef CHECK				/* FIXME: Enable in minithres */
77#undef  MU_DIV_QR_SKEW_THRESHOLD
78#define MU_DIV_QR_SKEW_THRESHOLD 1
79#endif
80
81
82static mp_limb_t mpn_mu_div_qr2 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr);
83
84
85mp_limb_t
86mpn_mu_div_qr (mp_ptr qp,
87	       mp_ptr rp,
88	       mp_srcptr np,
89	       mp_size_t nn,
90	       mp_srcptr dp,
91	       mp_size_t dn,
92	       mp_ptr scratch)
93{
94  mp_size_t qn;
95  mp_limb_t cy, qh;
96
97  qn = nn - dn;
98  if (qn + MU_DIV_QR_SKEW_THRESHOLD < dn)
99    {
100      /* |______________|_ign_first__|   dividend			  nn
101		|_______|_ign_first__|   divisor			  dn
102
103		|______|	     quotient (prel)			  qn
104
105		 |___________________|   quotient * ignored-divisor-part  dn-1
106      */
107
108      /* Compute a preliminary quotient and a partial remainder by dividing the
109	 most significant limbs of each operand.  */
110      qh = mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1),
111			   np + nn - (2 * qn + 1), 2 * qn + 1,
112			   dp + dn - (qn + 1), qn + 1,
113			   scratch);
114
115      /* Multiply the quotient by the divisor limbs ignored above.  */
116      if (dn - (qn + 1) > qn)
117	mpn_mul (scratch, dp, dn - (qn + 1), qp, qn);  /* prod is dn-1 limbs */
118      else
119	mpn_mul (scratch, qp, qn, dp, dn - (qn + 1));  /* prod is dn-1 limbs */
120
121      if (qh)
122	cy = mpn_add_n (scratch + qn, scratch + qn, dp, dn - (qn + 1));
123      else
124	cy = 0;
125      scratch[dn - 1] = cy;
126
127      cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1));
128      cy = mpn_sub_nc (rp + nn - (2 * qn + 1),
129		       rp + nn - (2 * qn + 1),
130		       scratch + nn - (2 * qn + 1),
131		       qn + 1, cy);
132      if (cy)
133	{
134	  qh -= mpn_sub_1 (qp, qp, qn, 1);
135	  mpn_add_n (rp, rp, dp, dn);
136	}
137    }
138  else
139    {
140      qh = mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
141    }
142
143  return qh;
144}
145
146static mp_limb_t
147mpn_mu_div_qr2 (mp_ptr qp,
148		mp_ptr rp,
149		mp_srcptr np,
150		mp_size_t nn,
151		mp_srcptr dp,
152		mp_size_t dn,
153		mp_ptr scratch)
154{
155  mp_size_t qn, in;
156  mp_limb_t cy, qh;
157  mp_ptr ip, tp;
158
159  ASSERT (dn > 1);
160
161  qn = nn - dn;
162
163  /* Compute the inverse size.  */
164  in = mpn_mu_div_qr_choose_in (qn, dn, 0);
165  ASSERT (in <= dn);
166
167#if 1
168  /* This alternative inverse computation method gets slightly more accurate
169     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
170     not adapted (3) mpn_invertappr scratch needs not met.  */
171  ip = scratch;
172  tp = scratch + in + 1;
173
174  /* compute an approximate inverse on (in+1) limbs */
175  if (dn == in)
176    {
177      MPN_COPY (tp + 1, dp, in);
178      tp[0] = 1;
179      mpn_invertappr (ip, tp, in + 1, NULL);
180      MPN_COPY_INCR (ip, ip + 1, in);
181    }
182  else
183    {
184      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
185      if (UNLIKELY (cy != 0))
186	MPN_ZERO (ip, in);
187      else
188	{
189	  mpn_invertappr (ip, tp, in + 1, NULL);
190	  MPN_COPY_INCR (ip, ip + 1, in);
191	}
192    }
193#else
194  /* This older inverse computation method gets slightly worse results than the
195     one above.  */
196  ip = scratch;
197  tp = scratch + in;
198
199  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
200     inversion function should do this automatically.  */
201  if (dn == in)
202    {
203      tp[in + 1] = 0;
204      MPN_COPY (tp + in + 2, dp, in);
205      mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
206    }
207  else
208    {
209      mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
210    }
211  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
212  if (UNLIKELY (cy != 0))
213    MPN_ZERO (tp + 1, in);
214  MPN_COPY (ip, tp + 1, in);
215#endif
216
217  qh = mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in);
218
219  return qh;
220}
221
222mp_limb_t
223mpn_preinv_mu_div_qr (mp_ptr qp,
224		      mp_ptr rp,
225		      mp_srcptr np,
226		      mp_size_t nn,
227		      mp_srcptr dp,
228		      mp_size_t dn,
229		      mp_srcptr ip,
230		      mp_size_t in,
231		      mp_ptr scratch)
232{
233  mp_size_t qn;
234  mp_limb_t cy, cx, qh;
235  mp_limb_t r;
236  mp_size_t tn, wn;
237
238#define tp           scratch
239#define scratch_out  (scratch + tn)
240
241  qn = nn - dn;
242
243  np += qn;
244  qp += qn;
245
246  qh = mpn_cmp (np, dp, dn) >= 0;
247  if (qh != 0)
248    mpn_sub_n (rp, np, dp, dn);
249  else
250    MPN_COPY (rp, np, dn);
251
252  if (qn == 0)
253    return qh;			/* Degenerate use.  Should we allow this? */
254
255  while (qn > 0)
256    {
257      if (qn < in)
258	{
259	  ip += in - qn;
260	  in = qn;
261	}
262      np -= in;
263      qp -= in;
264
265      /* Compute the next block of quotient limbs by multiplying the inverse I
266	 by the upper part of the partial remainder R.  */
267      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi  */
268      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
269      ASSERT_ALWAYS (cy == 0);
270
271      qn -= in;
272
273      /* Compute the product of the quotient block and the divisor D, to be
274	 subtracted from the partial remainder combined with new limbs from the
275	 dividend N.  We only really need the low dn+1 limbs.  */
276
277      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
278	mpn_mul (tp, dp, dn, qp, in);		/* dn+in limbs, high 'in' cancels */
279      else
280	{
281	  tn = mpn_mulmod_bnm1_next_size (dn + 1);
282	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
283	  wn = dn + in - tn;			/* number of wrapped limbs */
284	  if (wn > 0)
285	    {
286	      cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
287	      cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
288	      cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
289	      ASSERT_ALWAYS (cx >= cy);
290	      mpn_incr_u (tp, cx - cy);
291	    }
292	}
293
294      r = rp[dn - in] - tp[dn];
295
296      /* Subtract the product from the partial remainder combined with new
297	 limbs from the dividend N, generating a new partial remainder R.  */
298      if (dn != in)
299	{
300	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
301	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
302	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
303	}
304      else
305	{
306	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
307	}
308
309      STAT (int i; int err = 0;
310	    static int errarr[5]; static int err_rec; static int tot);
311
312      /* Check the remainder R and adjust the quotient as needed.  */
313      r -= cy;
314      while (r != 0)
315	{
316	  /* We loop 0 times with about 69% probability, 1 time with about 31%
317	     probability, 2 times with about 0.6% probability, if inverse is
318	     computed as recommended.  */
319	  mpn_incr_u (qp, 1);
320	  cy = mpn_sub_n (rp, rp, dp, dn);
321	  r -= cy;
322	  STAT (err++);
323	}
324      if (mpn_cmp (rp, dp, dn) >= 0)
325	{
326	  /* This is executed with about 76% probability.  */
327	  mpn_incr_u (qp, 1);
328	  cy = mpn_sub_n (rp, rp, dp, dn);
329	  STAT (err++);
330	}
331
332      STAT (
333	    tot++;
334	    errarr[err]++;
335	    if (err > err_rec)
336	      err_rec = err;
337	    if (tot % 0x10000 == 0)
338	      {
339		for (i = 0; i <= err_rec; i++)
340		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
341		printf ("\n");
342	      }
343	    );
344    }
345
346  return qh;
347}
348
349/* In case k=0 (automatic choice), we distinguish 3 cases:
350   (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
351   (b) dn/3 < qn <= dn: in = ceil(qn / 2)
352   (c) qn < dn/3:       in = qn
353   In all cases we have in <= dn.
354 */
355mp_size_t
356mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k)
357{
358  mp_size_t in;
359
360  if (k == 0)
361    {
362      mp_size_t b;
363      if (qn > dn)
364	{
365	  /* Compute an inverse size that is a nice partition of the quotient.  */
366	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
367	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
368	}
369      else if (3 * qn > dn)
370	{
371	  in = (qn - 1) / 2 + 1;	/* b = 2 */
372	}
373      else
374	{
375	  in = (qn - 1) / 1 + 1;	/* b = 1 */
376	}
377    }
378  else
379    {
380      mp_size_t xn;
381      xn = MIN (dn, qn);
382      in = (xn - 1) / k + 1;
383    }
384
385  return in;
386}
387
388mp_size_t
389mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k)
390{
391  mp_size_t itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
392  mp_size_t in = mpn_mu_div_qr_choose_in (nn - dn, dn, mua_k);
393  mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
394
395  return in + itch_local + itch_out;
396}
397
398mp_size_t
399mpn_preinv_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, mp_size_t in)
400{
401  mp_size_t itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
402  mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
403
404  return itch_local + itch_out;
405}
406