/* mpn_mu_bdiv_qr(qp,rp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^qn,
   where qn = nn-dn, storing the result in {qp,qn}.  Overlap allowed between Q
   and N; all other overlap disallowed.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005-2007, 2009, 2010, 2012, 2017 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/
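
/*
   In outline: instead of a full (nn-dn)-limb inverse, compute I = D^{-1} mod
   B^in for a block size in <= dn (chosen below), then develop the quotient in
   ceil(qn/in) blocks of at most in limbs, each block costing one mullo_n of
   size in and one "mulhigh" of {dp,dn} by the newly produced quotient block.
*/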

#include "gmp-impl.h"


/* N = {np,nn}
   D = {dp,dn}

   Requirements: N >= D
                 D >= 1
                 D odd
                 dn >= 2
                 nn >= 2
                 scratch space as determined by mpn_mu_bdiv_qr_itch(nn,dn).

   Write quotient to Q = {qp,nn-dn}.

   FIXME: When iterating, perhaps do the small step before loop, not after.
   FIXME: Try to avoid the scalar divisions when computing inverse size.
   FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
          particular, when dn==in, tp and rp could use the same space.
*/
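
/* A minimal sketch of how a caller of the public mpn_mu_bdiv_qr (defined
   further below) might satisfy the scratch-space requirement above; the
   surrounding code is hypothetical and assumes the usual gmp-impl.h TMP
   allocation macros:

     mp_ptr scratch;
     mp_limb_t rh;
     TMP_DECL;

     TMP_MARK;
     scratch = TMP_ALLOC_LIMBS (mpn_mu_bdiv_qr_itch (nn, dn));
     rh = mpn_mu_bdiv_qr (qp, rp, np, nn, dp, dn, scratch);
     TMP_FREE;

   The quotient ends up in {qp,nn-dn}; rh together with {rp,dn} makes up the
   matching remainder term (see the identity in mpn_mu_bdiv_qr below).  */
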
static mp_limb_t
mpn_mu_bdiv_qr_old (mp_ptr qp,
                    mp_ptr rp,
                    mp_srcptr np, mp_size_t nn,
                    mp_srcptr dp, mp_size_t dn,
                    mp_ptr scratch)
{
  mp_size_t qn;
  mp_size_t in;
  mp_limb_t cy, c0;
  mp_size_t tn, wn;

  qn = nn - dn;

  ASSERT (dn >= 2);
  ASSERT (qn >= 2);

  if (qn > dn)
    {
      mp_size_t b;

      /* |_______________________|   dividend
                        |________|   divisor  */

#define ip           scratch             /* in */
#define tp           (scratch + in)      /* dn+in or next_size(dn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn) /* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute an inverse size that is a nice partition of the quotient.  */
      b = (qn - 1) / dn + 1;    /* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;    /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
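      /* For example, qn = 10 and dn = 4 give b = 3 and in = 4, so the
         quotient below is developed in blocks of 4, 4 and 2 limbs.  */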

      /* Some notes on allocation:

         When in = dn, R dies when mpn_mullo returns; if in < dn, only the
         low in limbs of R die at that point.  We could save memory by
         letting T live just under R, and letting the upper part of T expand
         into R.  These changes should reduce itch to perhaps 3dn.
       */

      mpn_binvert (ip, dp, in, tp);

      MPN_COPY (rp, np, dn);
      np += dn;
      cy = 0;

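      /* Each iteration of the loop develops in quotient limbs from the
         current residue R = {rp,dn} (pending borrow in cy):

           Q_k = R * I mod B^in,   with I = {ip,in} = D^{-1} mod B^in,
           R <- (R + n_k B^dn - Q_k D) / B^in,

         where n_k is the next block of in dividend limbs read from np.  */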
      while (qn > in)
        {
          mpn_mullo_n (qp, rp, ip, in);

          if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
            mpn_mul (tp, dp, dn, qp, in);       /* mulhi, need tp[dn+in-1...in] */
          else
            {
              tn = mpn_mulmod_bnm1_next_size (dn);
              mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
              wn = dn + in - tn;                /* number of wrapped limbs */
              if (wn > 0)
                {
                  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
                  mpn_decr_u (tp + wn, c0);
                }
            }
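          /* In the mulmod_bnm1 case the product D*Q_k is only available mod
             B^tn - 1, so its top wn = dn + in - tn limbs have wrapped around
             into the low limbs.  The low in limbs of D*Q_k are known to agree
             with {rp,in} (and wn <= in), which is what lets the subtraction
             above reconstruct the wrapped limbs tp[tn..dn+in-1].  */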

          qp += in;
          qn -= in;

          if (dn != in)
            {
              /* Subtract tp[dn-1...in] from partial remainder.  */
              cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
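              /* Here cy is the borrow carried in from the previous step plus
                 the borrow just generated, i.e. 0, 1 or 2.  A value of 2 is
                 reduced to 1 by incrementing the subtrahend at tp + dn before
                 cy is passed as borrow-in to mpn_sub_nc below.  */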
              if (cy == 2)
                {
                  mpn_incr_u (tp + dn, 1);
                  cy = 1;
                }
            }
          /* Subtract tp[dn+in-1...dn] from dividend.  */
          cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
          np += in;
        }

      /* Generate last qn limbs.  */
      mpn_mullo_n (qp, rp, ip, qn);

      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        mpn_mul (tp, dp, dn, qp, qn);           /* mulhi, need tp[qn+in-1...in] */
      else
        {
          tn = mpn_mulmod_bnm1_next_size (dn);
          mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
          wn = dn + qn - tn;                    /* number of wrapped limbs */
          if (wn > 0)
            {
              c0 = mpn_sub_n (tp + tn, tp, rp, wn);
              mpn_decr_u (tp + wn, c0);
            }
        }

      if (dn != qn)
        {
          cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
          if (cy == 2)
            {
              mpn_incr_u (tp + dn, 1);
              cy = 1;
            }
        }
      return mpn_sub_nc (rp + dn - qn, np, tp + dn, qn, cy);

#undef ip
#undef tp
#undef scratch_out
    }
  else
    {
      /* |_______________________|   dividend
                |________________|   divisor  */

#define ip           scratch             /* in */
#define tp           (scratch + in)      /* dn+in or next_size(dn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn) /* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute half-sized inverse.  */
      in = qn - (qn >> 1);
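      /* in = ceil(qn/2); e.g. qn = 7 gives in = 4, so the low 4 quotient
         limbs come from the first mpn_mullo_n below and the high 3 from the
         second.  */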

      mpn_binvert (ip, dp, in, tp);

      mpn_mullo_n (qp, np, ip, in);             /* low `in' quotient limbs */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        mpn_mul (tp, dp, dn, qp, in);           /* mulhigh */
      else
        {
          tn = mpn_mulmod_bnm1_next_size (dn);
          mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
          wn = dn + in - tn;                    /* number of wrapped limbs */
          if (wn > 0)
            {
              c0 = mpn_sub_n (tp + tn, tp, np, wn);
              mpn_decr_u (tp + wn, c0);
            }
        }

      qp += in;
      qn -= in;

      cy = mpn_sub_n (rp, np + in, tp + in, dn);
      mpn_mullo_n (qp, rp, ip, qn);             /* high qn quotient limbs */

      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        mpn_mul (tp, dp, dn, qp, qn);           /* mulhigh */
      else
        {
          tn = mpn_mulmod_bnm1_next_size (dn);
          mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
          wn = dn + qn - tn;                    /* number of wrapped limbs */
          if (wn > 0)
            {
              c0 = mpn_sub_n (tp + tn, tp, rp, wn);
              mpn_decr_u (tp + wn, c0);
            }
        }

      cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
      if (cy == 2)
        {
          mpn_incr_u (tp + dn, 1);
          cy = 1;
        }
      return mpn_sub_nc (rp + dn - qn, np + dn + in, tp + dn, qn, cy);

#undef ip
#undef tp
#undef scratch_out
    }
}

mp_limb_t
mpn_mu_bdiv_qr (mp_ptr qp,
                mp_ptr rp,
                mp_srcptr np, mp_size_t nn,
                mp_srcptr dp, mp_size_t dn,
                mp_ptr scratch)
{
  mp_limb_t cy = mpn_mu_bdiv_qr_old (qp, rp, np, nn, dp, dn, scratch);

  /* R' B^{qn} = U - Q' D
   *
   * Q = B^{qn} - Q' (assuming Q' != 0)
   *
   * R B^{qn} = U + Q D = U + B^{qn} D - Q' D
   *          = B^{qn} (D + R')
   */
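
  /* On return from mpn_mu_bdiv_qr_old, cy accounts for a borrow out of
     {rp,dn}, i.e. R' = {rp,dn} - cy B^dn.  Adding D below then gives
     R = D + R' = {rp,dn} + (cy2 - cy) B^dn, and cy2 - cy (0 or 1 by the
     assert) is returned as the high limb of R.  */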

  if (UNLIKELY (!mpn_neg (qp, qp, nn - dn)))
    {
      /* Zero quotient. */
      ASSERT (cy == 0);
      return 0;
    }
  else
    {
      mp_limb_t cy2 = mpn_add_n (rp, rp, dp, dn);
      ASSERT (cy2 >= cy);

      return cy2 - cy;
    }
}


mp_size_t
mpn_mu_bdiv_qr_itch (mp_size_t nn, mp_size_t dn)
{
  mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
  mp_size_t b;

  ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD);

  qn = nn - dn;

  if (qn > dn)
    {
      b = (qn - 1) / dn + 1;    /* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;    /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
    }
  else
    {
      in = qn - (qn >> 1);
    }

  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
    {
      tn = dn + in;
      itch_out = 0;
    }
  else
    {
      tn = mpn_mulmod_bnm1_next_size (dn);
      itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
    }

  itch_binvert = mpn_binvert_itch (in);
  itches = tn + itch_out;
  return in + MAX (itches, itch_binvert);
}
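
/* The scratch space returned above is laid out by mpn_mu_bdiv_qr_old as
   ip (in limbs), then tp (tn limbs), then scratch_out (itch_out limbs);
   the region after ip must additionally hold mpn_binvert_itch (in) limbs,
   since tp doubles as scratch for mpn_binvert, hence the MAX above.  */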