1/* mpn_mu_bdiv_qr(qp,rp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^qn,
2   where qn = nn-dn, storing the result in {qp,qn}.  Overlap allowed between Q
3   and N; all other overlap disallowed.
4
5   Contributed to the GNU project by Torbjorn Granlund.
6
7   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
8   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
9   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
10
11Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
12
13This file is part of the GNU MP Library.
14
15The GNU MP Library is free software; you can redistribute it and/or modify
16it under the terms of the GNU Lesser General Public License as published by
17the Free Software Foundation; either version 3 of the License, or (at your
18option) any later version.
19
20The GNU MP Library is distributed in the hope that it will be useful, but
21WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
22or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
23License for more details.
24
25You should have received a copy of the GNU Lesser General Public License
26along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
27
28
29/*
30   The idea of the algorithm used herein is to compute a smaller inverted value
31   than used in the standard Barrett algorithm, and thus save time in the
32   Newton iterations, and pay just a small price when using the inverted value
33   for developing quotient bits.  This algorithm was presented at ICMS 2006.
34*/
35
36#include "gmp.h"
37#include "gmp-impl.h"
38
39
40/* N = {np,nn}
41   D = {dp,dn}
42
43   Requirements: N >= D
44		 D >= 1
45		 D odd
46		 dn >= 2
47		 nn >= 2
48		 scratch space as determined by mpn_mu_bdiv_qr_itch(nn,dn).
49
50   Write quotient to Q = {qp,nn-dn}.
51
52   FIXME: When iterating, perhaps do the small step before loop, not after.
53   FIXME: Try to avoid the scalar divisions when computing inverse size.
54   FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
55	  particular, when dn==in, tp and rp could use the same space.
56*/
57mp_limb_t
58mpn_mu_bdiv_qr (mp_ptr qp,
59		mp_ptr rp,
60		mp_srcptr np, mp_size_t nn,
61		mp_srcptr dp, mp_size_t dn,
62		mp_ptr scratch)
63{
64  mp_size_t qn;
65  mp_size_t in;
66  mp_limb_t cy, c0;
67  int k;
68  mp_size_t tn, wn;
69  mp_size_t i;
70
71  qn = nn - dn;
72
73  ASSERT (dn >= 2);
74  ASSERT (qn >= 2);
75
76  if (qn > dn)
77    {
78      mp_size_t b;
79
80      /* |_______________________|   dividend
81			|________|   divisor  */
82
83#define ip           scratch		/* in */
84#define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
85#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */
86
87      /* Compute an inverse size that is a nice partition of the quotient.  */
88      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
89      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
90
91      /* Some notes on allocation:
92
93	 When in = dn, R dies when mpn_mullo returns, if in < dn the low in
94	 limbs of R dies at that point.  We could save memory by letting T live
95	 just under R, and let the upper part of T expand into R. These changes
96	 should reduce itch to perhaps 3dn.
97       */
98
99      mpn_binvert (ip, dp, in, tp);
100
101      MPN_COPY (rp, np, dn);
102      np += dn;
103      cy = 0;
104
105      while (qn > in)
106	{
107	  mpn_mullo_n (qp, rp, ip, in);
108
109	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
110	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
111	  else
112	    {
113	      tn = mpn_mulmod_bnm1_next_size (dn);
114	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
115	      wn = dn + in - tn;		/* number of wrapped limbs */
116	      if (wn > 0)
117		{
118		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
119		  mpn_decr_u (tp + wn, c0);
120		}
121	    }
122
123	  qp += in;
124	  qn -= in;
125
126	  if (dn != in)
127	    {
128	      /* Subtract tp[dn-1...in] from partial remainder.  */
129	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
130	      if (cy == 2)
131		{
132		  mpn_incr_u (tp + dn, 1);
133		  cy = 1;
134		}
135	    }
136	  /* Subtract tp[dn+in-1...dn] from dividend.  */
137	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
138	  np += in;
139	}
140
141      /* Generate last qn limbs.  */
142      mpn_mullo_n (qp, rp, ip, qn);
143
144      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
145	mpn_mul (tp, dp, dn, qp, qn);		/* mulhi, need tp[qn+in-1...in] */
146      else
147	{
148	  tn = mpn_mulmod_bnm1_next_size (dn);
149	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
150	  wn = dn + qn - tn;			/* number of wrapped limbs */
151	  if (wn > 0)
152	    {
153	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
154	      mpn_decr_u (tp + wn, c0);
155	    }
156	}
157
158      if (dn != qn)
159	{
160	  cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
161	  if (cy == 2)
162	    {
163	      mpn_incr_u (tp + dn, 1);
164	      cy = 1;
165	    }
166	}
167      return mpn_sub_nc (rp + dn - qn, np, tp + dn, qn, cy);
168
169#undef ip
170#undef tp
171#undef scratch_out
172    }
173  else
174    {
175      /* |_______________________|   dividend
176		|________________|   divisor  */
177
178#define ip           scratch		/* in */
179#define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
180#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */
181
182      /* Compute half-sized inverse.  */
183      in = qn - (qn >> 1);
184
185      mpn_binvert (ip, dp, in, tp);
186
187      mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */
188
189      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
190	mpn_mul (tp, dp, dn, qp, in);		/* mulhigh */
191      else
192	{
193	  tn = mpn_mulmod_bnm1_next_size (dn);
194	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
195	  wn = dn + in - tn;			/* number of wrapped limbs */
196	  if (wn > 0)
197	    {
198	      c0 = mpn_sub_n (tp + tn, tp, np, wn);
199	      mpn_decr_u (tp + wn, c0);
200	    }
201	}
202
203      qp += in;
204      qn -= in;
205
206      cy = mpn_sub_n (rp, np + in, tp + in, dn);
207      mpn_mullo_n (qp, rp, ip, qn);		/* high qn quotient limbs */
208
209      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
210	mpn_mul (tp, dp, dn, qp, qn);		/* mulhigh */
211      else
212	{
213	  tn = mpn_mulmod_bnm1_next_size (dn);
214	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
215	  wn = dn + qn - tn;			/* number of wrapped limbs */
216	  if (wn > 0)
217	    {
218	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
219	      mpn_decr_u (tp + wn, c0);
220	    }
221	}
222
223      cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
224      if (cy == 2)
225	{
226	  mpn_incr_u (tp + dn, 1);
227	  cy = 1;
228	}
229      return mpn_sub_nc (rp + dn - qn, np + dn + in, tp + dn, qn, cy);
230
231#undef ip
232#undef tp
233#undef scratch_out
234    }
235}
236
237mp_size_t
238mpn_mu_bdiv_qr_itch (mp_size_t nn, mp_size_t dn)
239{
240  mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
241  mp_size_t b;
242
243  qn = nn - dn;
244
245  if (qn > dn)
246    {
247      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
248      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
249      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
250	{
251	  tn = dn + in;
252	  itch_out = 0;
253	}
254      else
255	{
256	  tn = mpn_mulmod_bnm1_next_size (dn);
257	  itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
258	}
259      itch_binvert = mpn_binvert_itch (in);
260      itches = tn + itch_out;
261      return in + MAX (itches, itch_binvert);
262    }
263  else
264    {
265      in = qn - (qn >> 1);
266      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
267	{
268	  tn = dn + in;
269	  itch_out = 0;
270	}
271      else
272	{
273	  tn = mpn_mulmod_bnm1_next_size (dn);
274	  itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
275	}
276    }
277  itch_binvert = mpn_binvert_itch (in);
278  itches = tn + itch_out;
279  return in + MAX (itches, itch_binvert);
280}
281