/* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^nn,
   storing the result in {qp,nn}.  Overlap allowed between Q and N; all other
   overlap disallowed.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/
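
/* Illustrative usage sketch, not part of the original file.  It assumes a
   caller inside GMP with the TMP allocation macros from gmp-impl.h in
   scope; qp, np, nn, dp, dn are the caller's operands.

     mp_ptr scratch;
     TMP_DECL;
     TMP_MARK;
     scratch = TMP_ALLOC_LIMBS (mpn_mu_bdiv_q_itch (nn, dn));
     mpn_mu_bdiv_q (qp, np, nn, dp, dn, scratch);
     TMP_FREE;
*/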

#include "gmp.h"
#include "gmp-impl.h"


/* N = {np,nn}
   D = {dp,dn}

   Requirements: N >= D
		 D >= 1
		 D odd
		 dn >= 2
		 nn >= 2
		 scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn).

   Write quotient to Q = {qp,nn}.

   FIXME: When iterating, perhaps do the small step before loop, not after.
   FIXME: Try to avoid the scalar divisions when computing inverse size.
   FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
	  particular, when dn==in, tp and rp could use the same space.
   FIXME: Trim final quotient calculation to qn limbs of precision.
*/
void
mpn_mu_bdiv_q (mp_ptr qp,
	       mp_srcptr np, mp_size_t nn,
	       mp_srcptr dp, mp_size_t dn,
	       mp_ptr scratch)
{
  mp_size_t qn;
  mp_size_t in;
  int cy, c0;
  mp_size_t tn, wn;

  qn = nn;

  ASSERT (dn >= 2);
  ASSERT (qn >= 2);

  if (qn > dn)
    {
      mp_size_t b;

      /* |_______________________|   dividend
			|________|   divisor  */

#define ip           scratch			/* in */
#define rp           (scratch + in)		/* dn or rest >= binvert_itch(in) */
#define tp           (scratch + in + dn)	/* dn+in or next_size(dn) */
#define scratch_out  (scratch + in + dn + tn)	/* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute an inverse size that is a nice partition of the quotient.  */
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
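
      /* Worked example (illustrative): qn = 10, dn = 4 gives b = 3 blocks
	 and in = 4, so the quotient is developed as 4 + 4 + 2 limbs.  */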

      /* Some notes on allocation:

	 When in = dn, R dies when mpn_mullo returns; if in < dn, the low in
	 limbs of R die at that point.  We could save memory by letting T live
	 just under R, and letting the upper part of T expand into R.  These
	 changes should reduce itch to perhaps 3dn.
       */

      mpn_binvert (ip, dp, in, rp);

      cy = 0;

      MPN_COPY (rp, np, dn);
      np += dn;
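      /* First block: Q0 = R * D^{-1} mod B^in, the low in quotient limbs
	 (2-adic division step; D odd makes the inverse exist).  */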
      mpn_mullo_n (qp, rp, ip, in);
      qn -= in;

      while (qn > in)
	{
	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
	  else
	    {
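	      /* mpn_mulmod_bnm1 computes D*Q mod (B^tn-1), so the high
		 limbs of the full dn+in limb product wrap around into the
		 low limbs.  By construction the low limbs of D*Q equal R,
		 which lets the wrapped high limbs be reconstructed.  */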
	      tn = mpn_mulmod_bnm1_next_size (dn);
	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	      wn = dn + in - tn;		/* number of wrapped limbs */
	      if (wn > 0)
		{
		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
		  mpn_decr_u (tp + wn, c0);
		}
	    }

	  qp += in;
	  if (dn != in)
	    {
	      /* Subtract tp[dn-1...in] from partial remainder.  */
	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	      if (cy == 2)
		{
		  mpn_incr_u (tp + dn, 1);
		  cy = 1;
		}
	    }
	  /* Subtract tp[dn+in-1...dn] from dividend.  */
	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
	  np += in;
	  mpn_mullo_n (qp, rp, ip, in);
	  qn -= in;
	}

      /* Generate last qn limbs.
	 FIXME: It should be possible to limit precision here, since qn is
	 typically somewhat smaller than dn.  No big gains expected.  */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);		/* mulhi, need tp[qn+in-1...in] */
      else
	{
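	  /* Same wrap-around recovery as in the main loop above.  */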
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      qp += in;
      if (dn != in)
	{
	  cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	  if (cy == 2)
	    {
	      mpn_incr_u (tp + dn, 1);
	      cy = 1;
	    }
	}

      mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy);
      mpn_mullo_n (qp, rp, ip, qn);

#undef ip
#undef rp
#undef tp
#undef scratch_out
    }
  else
    {
      /* |_______________________|   dividend
		|________________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* qn+in or next_size(qn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(qn)) */

      /* Compute half-sized inverse.  */
      in = qn - (qn >> 1);
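      /* Example (illustrative): qn = 9 gives in = 5 low quotient limbs now
	 and qn - in = 4 high limbs in the final step below.  */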

      mpn_binvert (ip, dp, in, tp);

      mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, qn, qp, in);		/* mulhigh */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (qn);
	  mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out);
	  wn = qn + in - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
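	      /* Here only the borrow out of the wrapped subtraction is
		 needed, since tp[tn] and above are never read; the low
		 limbs of the product equal N, so a comparison suffices.  */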
	      c0 = mpn_cmp (tp, np, wn) < 0;
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      mpn_sub_n (tp, np + in, tp + in, qn - in);
      mpn_mullo_n (qp + in, tp, ip, qn - in);	/* high qn-in quotient limbs */

#undef ip
#undef tp
#undef scratch_out
    }
}

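/* Return the scratch space, in limbs, that mpn_mu_bdiv_q needs for operand
   sizes nn and dn.  The size computation mirrors the scratch layout in the
   #define blocks above.  */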
mp_size_t
mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
{
  mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
  mp_size_t b;

  qn = nn;

  if (qn > dn)
    {
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	{
	  tn = dn + in;
	  itch_out = 0;
	}
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
	}
      itch_binvert = mpn_binvert_itch (in);
      itches = dn + tn + itch_out;
      return in + MAX (itches, itch_binvert);
    }
  else
    {
      in = qn - (qn >> 1);
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	{
	  tn = qn + in;
	  itch_out = 0;
	}
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (qn);
	  itch_out = mpn_mulmod_bnm1_itch (tn, qn, in);
	}
      itch_binvert = mpn_binvert_itch (in);
      itches = tn + itch_out;
      return in + MAX (itches, itch_binvert);
    }
}