1/* mpn_div_q -- division for arbitrary size operands.
2
3   Contributed to the GNU project by Torbjorn Granlund.
4
5   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
6   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
8
9Copyright 2009, 2010 Free Software Foundation, Inc.
10
11This file is part of the GNU MP Library.
12
13The GNU MP Library is free software; you can redistribute it and/or modify
14it under the terms of the GNU Lesser General Public License as published by
15the Free Software Foundation; either version 3 of the License, or (at your
16option) any later version.
17
18The GNU MP Library is distributed in the hope that it will be useful, but
19WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
21License for more details.
22
23You should have received a copy of the GNU Lesser General Public License
24along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
25
26#include "gmp.h"
27#include "gmp-impl.h"
28#include "longlong.h"
29
30
31/* Compute Q = N/D with truncation.
32     N = {np,nn}
33     D = {dp,dn}
34     Q = {qp,nn-dn+1}
35     T = {scratch,nn+1} is scratch space
36   N and D are both untouched by the computation.
37   N and T may overlap; pass the same space if N is irrelevant after the call,
38   but note that tp needs an extra limb.
39
40   Operand requirements:
41     N >= D > 0
42     dp[dn-1] != 0
43     No overlap between the N, D, and Q areas.
44
45   This division function does not clobber its input operands, since it is
46   intended to support average-O(qn) division, and for that to be effective, it
47   cannot put requirements on callers to copy a O(nn) operand.
48
49   If a caller does not care about the value of {np,nn+1} after calling this
50   function, it should pass np also for the scratch argument.  This function
51   will then save some time and space by avoiding allocation and copying.
52   (FIXME: Is this a good design?  We only really save any copying for
53   already-normalised divisors, which should be rare.  It also prevents us from
54   reasonably asking for all scratch space we need.)
55
56   We write nn-dn+1 limbs for the quotient, but return void.  Why not return
57   the most significant quotient limb?  Look at the 4 main code blocks below
58   (consisting of an outer if-else where each arm contains an if-else). It is
59   tricky for the first code block, since the mpn_*_div_q calls will typically
60   generate all nn-dn+1 and return 0 or 1.  I don't see how to fix that unless
61   we generate the most significant quotient limb here, before calling
62   mpn_*_div_q, or put the quotient in a temporary area.  Since this is a
63   critical division case (the SB sub-case in particular) copying is not a good
64   idea.
65
66   It might make sense to split the if-else parts of the (qn + FUDGE
67   >= dn) blocks into separate functions, since we could promise quite
68   different things to callers in these two cases.  The 'then' case
69   benefits from np=scratch, and it could perhaps even tolerate qp=np,
70   saving some headache for many callers.
71
72   FIXME: Scratch allocation leaves a lot to be desired.  E.g., for the MU size
73   operands, we do not reuse the huge scratch for adjustments.  This can be a
74   serious waste of memory for the largest operands.
75*/
76
/* FUDGE determines when to try getting an approximate quotient from the upper
   parts of the dividend and divisor, then adjust.  N.B. FUDGE must be >= 2
   for the code to be correct.  */
#define FUDGE 5			/* FIXME: tune this */

/* The plain-division thresholds are aliased to the corresponding divappr
   thresholds.  Note that MUPI_DIV_Q_THRESHOLD may be defined before the
   #ifndef fallback below; that is fine since object-like macros are expanded
   at their point of use, not at their point of definition.  */
#define DC_DIV_Q_THRESHOLD      DC_DIVAPPR_Q_THRESHOLD
#define MU_DIV_Q_THRESHOLD      MU_DIVAPPR_Q_THRESHOLD
#define MUPI_DIV_Q_THRESHOLD  MUPI_DIVAPPR_Q_THRESHOLD
#ifndef MUPI_DIVAPPR_Q_THRESHOLD
#define MUPI_DIVAPPR_Q_THRESHOLD  MUPI_DIV_QR_THRESHOLD
#endif
88
/* Compute the truncated quotient {qp, nn-dn+1} = {np,nn} / {dp,dn}.
   See the block comment above for the full operand contract; in short:
   nn >= dn > 0, dp[dn-1] != 0, Q must not overlap N or D, and scratch
   must be nn+1 limbs (it may alias np).  */
void
mpn_div_q (mp_ptr qp,
	   mp_srcptr np, mp_size_t nn,
	   mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  mp_ptr new_dp, new_np, tp, rp;
  mp_limb_t cy, dh, qh;
  mp_size_t new_nn, qn;
  gmp_pi1_t dinv;
  int cnt;
  TMP_DECL;
  TMP_MARK;

  ASSERT (nn >= dn);
  ASSERT (dn > 0);
  ASSERT (dp[dn - 1] != 0);
  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, np, nn));
  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, dp, dn));
  ASSERT (MPN_SAME_OR_SEPARATE_P (np, scratch, nn));

  /* The approximate-quotient path below relies on FUDGE >= 2 (see the
     definition of FUDGE above).  */
  ASSERT_ALWAYS (FUDGE >= 2);

  /* Single-limb divisor: defer entirely to mpn_divrem_1, discarding the
     remainder.  */
  if (dn == 1)
    {
      mpn_divrem_1 (qp, 0L, np, nn, dp[dn - 1]);
      return;
    }

  qn = nn - dn + 1;		/* Quotient size, high limb might be zero */

  if (qn + FUDGE >= dn)
    {
      /* The quotient is large relative to the divisor; a truncated-operand
	 approximation would not pay off, so do a full division.
	 |________________________|
	                 |_______|  */
      new_np = scratch;

      dh = dp[dn - 1];
      if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0))
	{
	  /* Divisor not normalised: shift both N and D left so that the
	     divisor's most significant bit becomes set, as required by the
	     sbpi1/dcpi1 division functions.  */
	  count_leading_zeros (cnt, dh);

	  cy = mpn_lshift (new_np, np, nn, cnt);
	  new_np[nn] = cy;	/* cy holds the bits shifted out of N's top limb */
	  new_nn = nn + (cy != 0);

	  new_dp = TMP_ALLOC_LIMBS (dn);
	  mpn_lshift (new_dp, dp, dn, cnt);

	  /* Pick the division algorithm by operand sizes/thresholds.  */
	  if (dn == 2)
	    {
	      qh = mpn_divrem_2 (qp, 0L, new_np, new_nn, new_dp);
	    }
	  else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) ||
		   BELOW_THRESHOLD (new_nn - dn, DC_DIV_Q_THRESHOLD))
	    {
	      /* Schoolbook division, driven by a precomputed 2/1 inverse.  */
	      invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]);
	      qh = mpn_sbpi1_div_q (qp, new_np, new_nn, new_dp, dn, dinv.inv32);
	    }
	  else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) ||   /* fast condition */
		   BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */
		   (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */
		   + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn)   /* ...condition */
	    {
	      /* Divide-and-conquer division.  */
	      invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]);
	      qh = mpn_dcpi1_div_q (qp, new_np, new_nn, new_dp, dn, &dinv);
	    }
	  else
	    {
	      /* Block-wise (mu) division with its own scratch area.  */
	      mp_size_t itch = mpn_mu_div_q_itch (new_nn, dn, 0);
	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
	      qh = mpn_mu_div_q (qp, new_np, new_nn, new_dp, dn, scratch);
	    }
	  if (cy == 0)
	    /* No extra limb was created by the shift; qh is the quotient's
	       high limb.  */
	    qp[qn - 1] = qh;
	  else if (UNLIKELY (qh != 0))
	    {
	      /* This happens only when the quotient is close to B^n and
		 mpn_*_divappr_q returned B^n.  Saturate the quotient limbs
		 instead.  */
	      mp_size_t i, n;
	      n = new_nn - dn;
	      for (i = 0; i < n; i++)
		qp[i] = GMP_NUMB_MAX;
	      qh = 0;		/* currently ignored */
	    }
	}
      else  /* divisor is already normalised */
	{
	  /* No shift needed; just make N available in writable space
	     (the division functions clobber the dividend area).  */
	  if (new_np != np)
	    MPN_COPY (new_np, np, nn);

	  if (dn == 2)
	    {
	      qh = mpn_divrem_2 (qp, 0L, new_np, nn, dp);
	    }
	  else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) ||
		   BELOW_THRESHOLD (nn - dn, DC_DIV_Q_THRESHOLD))
	    {
	      invert_pi1 (dinv, dh, dp[dn - 2]);
	      qh = mpn_sbpi1_div_q (qp, new_np, nn, dp, dn, dinv.inv32);
	    }
	  else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) ||   /* fast condition */
		   BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */
		   (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */
		   + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn)   /* ...condition */
	    {
	      invert_pi1 (dinv, dh, dp[dn - 2]);
	      qh = mpn_dcpi1_div_q (qp, new_np, nn, dp, dn, &dinv);
	    }
	  else
	    {
	      mp_size_t itch = mpn_mu_div_q_itch (nn, dn, 0);
	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
	      qh = mpn_mu_div_q (qp, np, nn, dp, dn, scratch);
	    }
	  qp[nn - dn] = qh;
	}
    }
  else
    {
      /* The divisor is much larger than the quotient: compute an
	 approximate quotient from the top 2*qn+1 limbs of N and the top
	 qn+1 limbs of D, then adjust it downward by at most 1 if needed.
	 |________________________|
	         |_________________|  */
      tp = TMP_ALLOC_LIMBS (qn + 1);	/* approximate quotient, one guard limb low */

      new_np = scratch;
      new_nn = 2 * qn + 1;
      if (new_np == np)
	/* We need {np,nn} to remain untouched until the final adjustment, so
	   we need to allocate separate space for new_np.  */
	new_np = TMP_ALLOC_LIMBS (new_nn + 1);


      dh = dp[dn - 1];
      if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0))
	{
	  /* Unnormalised divisor: shift the truncated operands left so the
	     truncated divisor's high bit is set.  */
	  count_leading_zeros (cnt, dh);

	  cy = mpn_lshift (new_np, np + nn - new_nn, new_nn, cnt);
	  new_np[new_nn] = cy;

	  new_nn += (cy != 0);

	  new_dp = TMP_ALLOC_LIMBS (qn + 1);
	  mpn_lshift (new_dp, dp + dn - (qn + 1), qn + 1, cnt);
	  /* Fill the vacated low bits of the truncated divisor with the high
	     bits of the next lower limb of D.  */
	  new_dp[0] |= dp[dn - (qn + 1) - 1] >> (GMP_NUMB_BITS - cnt);

	  if (qn + 1 == 2)
	    {
	      qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp);
	    }
	  else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1))
	    {
	      invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]);
	      qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32);
	    }
	  else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1))
	    {
	      invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]);
	      qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv);
	    }
	  else
	    {
	      mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0);
	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
	      qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch);
	    }
	  if (cy == 0)
	    tp[qn] = qh;
	  else if (UNLIKELY (qh != 0))
	    {
	      /* This happens only when the quotient is close to B^n and
		 mpn_*_divappr_q returned B^n.  Saturate instead.  */
	      mp_size_t i, n;
	      n = new_nn - (qn + 1);
	      for (i = 0; i < n; i++)
		tp[i] = GMP_NUMB_MAX;
	      qh = 0;		/* currently ignored */
	    }
	}
      else  /* divisor is already normalised */
	{
	  MPN_COPY (new_np, np + nn - new_nn, new_nn); /* pointless if MU will be used */

	  /* The truncated divisor can be used in place; the cast only drops
	     const, the divappr functions do not write through it.  */
	  new_dp = (mp_ptr) dp + dn - (qn + 1);

	  if (qn == 2 - 1)
	    {
	      qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp);
	    }
	  else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1))
	    {
	      invert_pi1 (dinv, dh, new_dp[qn - 1]);
	      qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32);
	    }
	  else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1))
	    {
	      invert_pi1 (dinv, dh, new_dp[qn - 1]);
	      qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv);
	    }
	  else
	    {
	      mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0);
	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
	      qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch);
	    }
	  tp[qn] = qh;
	}

      /* Drop the low guard limb tp[0] of the approximate quotient.  */
      MPN_COPY (qp, tp + 1, qn);
      if (tp[0] <= 4)
        {
	  /* The guard limb is small enough that the approximation could
	     exceed the true quotient (divappr may return a value that is
	     slightly too large).  Multiply back and compare against N;
	     decrement the quotient by 1 if the product exceeds N.
	     NOTE(review): the bound 4 presumably covers the maximum divappr
	     error here — confirm against the divappr error analysis.  */
	  mp_size_t rn;

          rp = TMP_ALLOC_LIMBS (dn + qn);
          mpn_mul (rp, dp, dn, tp + 1, qn);
	  rn = dn + qn;
	  rn -= rp[rn - 1] == 0;

          if (rn > nn || mpn_cmp (np, rp, nn) < 0)
            mpn_decr_u (qp, 1);
        }
    }

  TMP_FREE;
}
313