1/* mpn_mu_div_q.
2
3   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
4
5   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
6   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
8
9Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
10
11This file is part of the GNU MP Library.
12
13The GNU MP Library is free software; you can redistribute it and/or modify
14it under the terms of the GNU Lesser General Public License as published by
15the Free Software Foundation; either version 3 of the License, or (at your
16option) any later version.
17
18The GNU MP Library is distributed in the hope that it will be useful, but
19WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
21License for more details.
22
23You should have received a copy of the GNU Lesser General Public License
24along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
25
26
27/*
28   The idea of the algorithm used herein is to compute a smaller inverted value
29   than used in the standard Barrett algorithm, and thus save time in the
30   Newton iterations, and pay just a small price when using the inverted value
31   for developing quotient bits.  This algorithm was presented at ICMS 2006.
32*/
33
34/*
35  Things to work on:
36
37  1. This is a rudimentary implementation of mpn_mu_div_q.  The algorithm is
38     probably close to optimal, except when mpn_mu_divappr_q fails.
39
40     An alternative which could be considered for much simpler code for the
41     complex qn>=dn arm would be to allocate a temporary nn+1 limb buffer, then
42     simply call mpn_mu_divappr_q.  Such a temporary allocation is
43     unfortunately very large.
44
  2. We used to fall back to mpn_mu_div_qr when we detected a possible
     mpn_mu_divappr_q rounding problem; now we multiply back and compare.
     Unfortunately, since mpn_mu_divappr_q does not return the partial
     remainder, this is still not optimal.  An mpn_mu_divappr_qr function
     could solve that.
50
51  3. The allocations done here should be made from the scratch area, which
52     then would need to be amended.
53*/
54
55#include <stdlib.h>		/* for NULL */
56#include "gmp.h"
57#include "gmp-impl.h"
58
59
/* Compute the truncated quotient Q = floor(N/D) where N = {np,nn} and
   D = {dp,dn}, using a Barrett-style algorithm with a truncated inverse.
   The qn = nn-dn low quotient limbs are written to {qp,qn}; the return
   value is the most significant quotient limb (0 or 1).  The caller
   supplies {scratch, mpn_mu_div_q_itch(nn,dn,0)} limbs of working space
   for the preinv sub-calls; additional temporaries are taken from the
   TMP allocator (see FIXME item 3 in the file header).  */
mp_limb_t
mpn_mu_div_q (mp_ptr qp,
	      mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp, mp_size_t dn,
	      mp_ptr scratch)
{
  mp_ptr tp, rp, ip, this_ip;
  mp_size_t qn, in, this_in;
  mp_limb_t cy, qh;
  TMP_DECL;

  TMP_MARK;

  qn = nn - dn;

  /* tp holds a qn+1 limb approximate quotient; the extra low limb tp[0]
     is examined below to decide whether the high qn limbs are exact.  */
  tp = TMP_BALLOC_LIMBS (qn + 1);

  if (qn >= dn)			/* nn >= 2*dn + 1 */
    {
      /* Find max inverse size needed by the two preinv calls.  FIXME: This is
	 not optimal, it underestimates the invariance.  */
      if (dn != qn)
	{
	  mp_size_t in1, in2;

	  in1 = mpn_mu_div_qr_choose_in (qn - dn, dn, 0);
	  in2 = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
	  in = MAX (in1, in2);
	}
      else
	{
	  /* qn == dn: the first preinv call below gets a zero-size
	     quotient, so only the divappr call determines the size.  */
	  in = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
	}

      ip = TMP_BALLOC_LIMBS (in + 1);

      if (dn == in)
	{
	  /* The whole divisor participates in the inverse.  Invert the
	     in+1 limb value {1, dp[0..in-1]}, i.e. D*B + 1; the low guard
	     limb of 1 makes the inverted operand slightly larger than D*B
	     — presumably so the computed inverse never overestimates.
	     NOTE(review): confirm against mpn_invertappr's contract.  */
	  MPN_COPY (scratch + 1, dp, in);
	  scratch[0] = 1;
	  mpn_invertappr (ip, scratch, in + 1, NULL);
	  MPN_COPY_INCR (ip, ip + 1, in);	/* drop the low inverse limb */
	}
      else
	{
	  /* Invert the in+1 most significant divisor limbs, incremented
	     by 1 to compensate for the truncated low part.  */
	  cy = mpn_add_1 (scratch, dp + dn - (in + 1), in + 1, 1);
	  if (UNLIKELY (cy != 0))
	    /* The top in+1 limbs were all ones; the rounded-up value is
	       B^(in+1), whose in-limb approximate inverse is zero.  */
	    MPN_ZERO (ip, in);
	  else
	    {
	      mpn_invertappr (ip, scratch, in + 1, NULL);
	      MPN_COPY_INCR (ip, ip + 1, in);
	    }
	}

       /* |_______________________|   dividend
			 |________|   divisor  */
      rp = TMP_BALLOC_LIMBS (2 * dn + 1);

      /* Step 1: divide the high qn dividend limbs by D, producing the
	 high qn-dn quotient limbs at tp+dn+1 and a canonically reduced
	 dn-limb partial remainder at rp+dn+1.  this_ip points at the low
	 end of the (possibly oversized) precomputed inverse window.  */
      this_in = mpn_mu_div_qr_choose_in (qn - dn, dn, 0);
      this_ip = ip + in - this_in;
      qh = mpn_preinv_mu_div_qr (tp + dn + 1, rp + dn + 1, np + dn, qn, dp, dn,
				 this_ip, this_in, scratch);

      /* Step 2: append the low dn dividend limbs below the partial
	 remainder, plus a zero guard limb, forming a 2*dn+1 limb number;
	 develop the remaining dn+1 (approximate) quotient limbs.  */
      MPN_COPY (rp + 1, np, dn);
      rp[0] = 0;
      this_in = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
      this_ip = ip + in - this_in;
      cy = mpn_preinv_mu_divappr_q (tp, rp, 2 * dn + 1, dp, dn,
				    this_ip, this_in, scratch);

      if (UNLIKELY (cy != 0))
	{
	  /* Since the partial remainder fed to mpn_preinv_mu_divappr_q was
	     canonically reduced, replace the returned value of B^(qn-dn)+eps
	     by the largest possible value.  */
	  mp_size_t i;
	  for (i = 0; i < dn + 1; i++)
	    tp[i] = GMP_NUMB_MAX;
	}

      /* The max error of mpn_mu_divappr_q is +4.  If the low quotient limb is
	 greater than the max error, we cannot trust the quotient.  */
      if (tp[0] > 4)
	{
	  MPN_COPY (qp, tp + 1, qn);
	}
      else
	{
	  mp_limb_t cy;
	  mp_ptr pp;

	  /* Borderline: multiply the candidate quotient back and compare
	     against N to decide whether to decrement.  */
	  /* FIXME: can we use already allocated space? */
	  pp = TMP_BALLOC_LIMBS (nn);
	  mpn_mul (pp, tp + 1, qn, dp, dn);

	  /* Account for the high quotient limb qh (0 or 1) by adding D
	     shifted into the top dn limbs of the product.  */
	  cy = (qh != 0) ? mpn_add_n (pp + qn, pp + qn, dp, dn) : 0;

	  if (cy || mpn_cmp (pp, np, nn) > 0) /* At most is wrong by one, no cycle. */
	    qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
	  else /* Same as above */
	    MPN_COPY (qp, tp + 1, qn);
	}
    }
  else
    {
       /* |_______________________|   dividend
		 |________________|   divisor  */

      /* Short quotient: a single approximate division using only the
	 high 2*qn+2 dividend limbs and the high qn+1 divisor limbs.  */

      /* FIXME: When nn = 2dn-1, qn becomes dn-1, and the numerator size passed
	 here becomes 2dn, i.e., more than nn.  This shouldn't hurt, since only
	 the most significant dn-1 limbs will actually be read, but it is not
	 pretty.  */

      qh = mpn_mu_divappr_q (tp, np + nn - (2 * qn + 2), 2 * qn + 2,
			     dp + dn - (qn + 1), qn + 1, scratch);

      /* The max error of mpn_mu_divappr_q is +4, but we get an additional
         error from the divisor truncation.  */
      if (tp[0] > 6)
	{
	  MPN_COPY (qp, tp + 1, qn);
	}
      else
	{
	  mp_limb_t cy;

	  /* Borderline: multiply back (by the full divisor this time)
	     and compare against N; decrement the quotient if too large.  */
	  /* FIXME: a shorter product should be enough; we may use already
	     allocated space... */
	  rp = TMP_BALLOC_LIMBS (nn);
	  mpn_mul (rp, dp, dn, tp + 1, qn);

	  cy = (qh != 0) ? mpn_add_n (rp + qn, rp + qn, dp, dn) : 0;

	  if (cy || mpn_cmp (rp, np, nn) > 0) /* At most is wrong by one, no cycle. */
	    qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
	  else /* Same as above */
	    MPN_COPY (qp, tp + 1, qn);
	}
    }

  TMP_FREE;
  return qh;
}
204
205mp_size_t
206mpn_mu_div_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
207{
208  mp_size_t qn, itch1, itch2;
209
210  qn = nn - dn;
211  if (qn >= dn)
212    {
213      itch1 = mpn_mu_div_qr_itch (qn, dn, mua_k);
214      itch2 = mpn_mu_divappr_q_itch (2 * dn + 1, dn, mua_k);
215      return MAX (itch1, itch2);
216    }
217  else
218    {
219      itch1 = mpn_mu_divappr_q_itch (2 * qn + 2, qn + 1, mua_k);
220      return itch1;
221    }
222}
223