/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
   quotient.  The divisor is two limbs.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.


Copyright 1993-1996, 1999-2002, 2011, 2017 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#ifndef DIV_QR_2_PI2_THRESHOLD
/* Disabled unless explicitly tuned. */
#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
#endif

#ifndef SANITY_CHECK
#define SANITY_CHECK 0
#endif

/* Define some longlong.h-style macros, but for wider operations.
   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
     an additional sum operand.
   * add_csaac accepts two addends and a carry in, and generates a sum and a
     carry out.  A little like a "full adder".
*/
#if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((USItype)(s2)),					\
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((UDItype)(s2)),					\
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__aarch64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %3, xzr"\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "rZ" (s2), "%rZ"  (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0)	\
	     __CLOBBER_CC)
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"	\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0)	\
	     __CLOBBER_CC)
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    (s2) += __c1 + (__s1 < __c0);					\
  } while (0)
#endif
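
/* A minimal illustration of the add_sssaaaa contract, assuming a nail-free
   build; the function name is hypothetical and the block is not compiled.
   The macro adds the double-limb values <a1,a0> and <b1,b0>, stores the low
   two limbs of the sum in <s1,s0>, and accumulates the carry-out into s2.  */
#if 0
static void
add_sssaaaa_example (void)
{
  mp_limb_t s2, s1, s0;

  /* <0,0,0> + <MP_LIMB_T_MAX, MP_LIMB_T_MAX> + <0,1> = <1,0,0>:
     the carry from the low limb propagates through s1 into s2.  */
  s2 = 0;
  add_sssaaaa (s2, s1, s0,
	       MP_LIMB_T_MAX, MP_LIMB_T_MAX, CNST_LIMB(0), CNST_LIMB(1));
  ASSERT_ALWAYS (s2 == 1 && s1 == 0 && s0 == 0);
}
#endif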

/* Typically used with r1, r0 same as n3, n2. Other types of overlap
   between inputs and outputs are not supported. */
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)		\
  do {									\
    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;		\
    mp_limb_t _t1, _t0;							\
    mp_limb_t _mask;							\
									\
    /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */	\
    umul_ppmm (_q2,_q1, n2, di1);					\
    umul_ppmm (_q3,_q2a, n3, di1);					\
    ++_q2;	/* _q2 cannot overflow */				\
    add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a);				\
    umul_ppmm (_q2c,_q1c, n3, di0);					\
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c);			\
    umul_ppmm (_q1d,_q0, n2, di0);					\
    add_sssaaaa (_q2c,_q1,_q0, _q1,_q0, n1,n0); /* _q2c cannot overflow */ \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d);			\
									\
    umul_ppmm (_t1,_t0, _q2, d0);					\
    _t1 += _q2 * d1 + _q3 * d0;						\
									\
    sub_ddmmss (r1, r0, n1, n0, _t1, _t0);				\
									\
    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0)));  /* (r1,r0) >= (q1,q0) */  \
    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask);		\
    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask);		\
									\
    if (UNLIKELY (r1 >= d1))						\
      {									\
	if (r1 > d1 || r0 >= d0)					\
	  {								\
	    sub_ddmmss (r1, r0, r1, r0, d1, d0);			\
	    add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\
	  }								\
      }									\
    (q1) = _q3;								\
    (q0) = _q2;								\
  } while (0)
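
/* A minimal sketch of the udiv_qr_4by2 contract, assuming a nail-free build;
   the helper name is hypothetical and the block is not compiled.  For a
   normalized divisor <d1,d0> (high bit of d1 set), its inverse <di1,di0> as
   computed by invert_4by2 below, and a dividend <n3,n2,n1,n0> with
   <n3,n2> < <d1,d0>, the macro yields <q1,q0> and <r1,r0> such that
   <q1,q0> * <d1,d0> + <r1,r0> = <n3,n2,n1,n0> and <r1,r0> < <d1,d0>.  */
#if 0
static void
udiv_qr_4by2_check (mp_limb_t n3, mp_limb_t n2, mp_limb_t n1, mp_limb_t n0,
		    mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t q1, q0, r1, r0;
  mp_limb_t qp[2], dp[2], rp[2], tp[4];

  udiv_qr_4by2 (q1, q0, r1, r0, n3, n2, n1, n0, d1, d0, di1, di0);

  /* Recombine quotient * divisor + remainder and compare with the dividend.  */
  qp[0] = q0; qp[1] = q1;
  dp[0] = d0; dp[1] = d1;
  rp[0] = r0; rp[1] = r1;
  mpn_mul_n (tp, qp, dp, 2);
  ASSERT_ALWAYS (mpn_add (tp, tp, 4, rp, 2) == 0);
  ASSERT_ALWAYS (tp[0] == n0 && tp[1] == n1 && tp[2] == n2 && tp[3] == n3);
  ASSERT_ALWAYS (r1 < d1 || (r1 == d1 && r0 < d0));
}
#endif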

static void
invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
  invert_limb (v1, d1);
  p1 = d1 * v1;
  /* <1, v1> * d1 = <B-1, p1> */
  p1 += d0;
  if (p1 < d0)
    {
      v1--;
      mask = -(mp_limb_t) (p1 >= d1);
      p1 -= d1;
      v1 += mask;
      p1 -= mask & d1;
    }
  /* <1, v1> * d1 + d0 = <B-1, p1> */
  umul_ppmm (t1, p0, d0, v1);
  p1 += t1;
  if (p1 < t1)
    {
      if (UNLIKELY (p1 >= d1))
	{
	  if (p1 > d1 || p0 >= d0)
	    {
	      sub_ddmmss (p1, p0, p1, p0, d1, d0);
	      v1--;
	    }
	}
      sub_ddmmss (p1, p0, p1, p0, d1, d0);
      v1--;
    }
  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
   * with <p1, p0> + <d1, d0> >= B^2.
   *
   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>. The
   * partial remainder after <1, v1> is
   *
   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
   *                              = <~p1, ~p0, B-1>
   */
  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
  di[0] = v0;
  di[1] = v1;

#if SANITY_CHECK
  {
    mp_limb_t tp[4];
    mp_limb_t dp[2];
    dp[0] = d0;
    dp[1] = d1;
    mpn_mul_n (tp, dp, di, 2);
    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
  }
#endif
}
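
/* Illustrative only, not compiled: how the 4/2 inverse feeds udiv_qr_4by2,
   assuming a nail-free build.  The divisor below is arbitrary but normalized,
   and udiv_qr_4by2_check is the hypothetical helper sketched after the
   udiv_qr_4by2 definition above.  */
#if 0
static void
invert_4by2_example (void)
{
  mp_limb_t d1 = GMP_NUMB_HIGHBIT | CNST_LIMB(123);	/* high bit set */
  mp_limb_t d0 = CNST_LIMB(456);
  mp_limb_t di[2];

  invert_4by2 (di, d1, d0);   /* <1, di[1], di[0]> = floor ((B^4 - 1) / <d1, d0>) */
  /* The dividend's high part <1,2> is below <d1,d0>, as required.  */
  udiv_qr_4by2_check (CNST_LIMB(1), CNST_LIMB(2), CNST_LIMB(3), CNST_LIMB(4),
		      d1, d0, di[1], di[0]);
}
#endif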

static mp_limb_t
mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
		   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t r1, r0;

  ASSERT (nn >= 2);
  ASSERT (d1 & GMP_NUMB_HIGHBIT);

  r1 = np[nn-1];
  r0 = np[nn-2];

  qh = 0;
  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
    {
#if GMP_NAIL_BITS == 0
      sub_ddmmss (r1, r0, r1, r0, d1, d0);
#else
      r0 = r0 - d0;
      r1 = r1 - d1 - (r0 >> (GMP_LIMB_BITS - 1));
      r0 &= GMP_NUMB_MASK;
#endif
      qh = 1;
    }

  for (i = nn - 2; i >= 2; i -= 2)
    {
      mp_limb_t n1, n0, q1, q0;
      n1 = np[i-1];
      n0 = np[i-2];
      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
      qp[i-1] = q1;
      qp[i-2] = q0;
    }

  if (i > 0)
    {
      mp_limb_t q;
      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
      qp[0] = q;
    }
  rp[1] = r1;
  rp[0] = r0;

  return qh;
}


/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
   significant quotient limbs at qp and the 2-limb remainder at rp.
   Return the most significant limb of the quotient.

   Preconditions:
   1. qp must either not overlap with the other operands at all, or
      qp >= np + 2 must hold true.  (This means that it's possible to put
      the quotient in the high part of {np,nn}, overwriting dividend limbs
      that have already been consumed.)
   2. nn >= 2.  */

mp_limb_t
mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp)
{
  mp_limb_t d1;
  mp_limb_t d0;
  gmp_pi1_t dinv;

  ASSERT (nn >= 2);
  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, 2);

  d1 = dp[1]; d0 = dp[0];

  ASSERT (d1 > 0);

  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
    {
      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
	{
	  invert_pi1 (dinv, d1, d0);
	  return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
	}
      else
	{
	  mp_limb_t di[2];
	  invert_4by2 (di, d1, d0);
	  return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
	}
    }
  else
    {
      int shift;
      count_leading_zeros (shift, d1);
      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
      d0 <<= shift;
      invert_pi1 (dinv, d1, d0);
      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
    }
}
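
/* Illustrative only, not compiled: a minimal call sketch for mpn_div_qr_2,
   assuming a nail-free build; the function name below is hypothetical.  This
   is an internal function, normally reached through documented interfaces
   such as mpn_tdiv_qr.  The quotient of an nn-limb dividend by a 2-limb
   divisor has nn-1 limbs: the low nn-2 limbs go to qp and the top limb is
   the return value.  */
#if 0
static void
mpn_div_qr_2_example (void)
{
  mp_limb_t np[4] = { 1, 2, 3, 4 };	/* dividend, least significant limb first */
  mp_limb_t dp[2] = { 5, 6 };		/* divisor, dp[1] must be nonzero */
  mp_limb_t qp[2], rp[2], qh;

  qh = mpn_div_qr_2 (qp, rp, np, 4, dp);

  /* Now <qh, qp[1], qp[0]> * <dp[1], dp[0]> + <rp[1], rp[0]> equals
     <np[3], np[2], np[1], np[0]>, and <rp[1], rp[0]> < <dp[1], dp[0]>.  */
}
#endif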