/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
   quotient.  The divisor is two limbs.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.


Copyright 1993-1996, 1999-2002, 2011, 2017 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#ifndef DIV_QR_2_PI2_THRESHOLD
/* Disabled unless explicitly tuned. */
#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
#endif

#ifndef SANITY_CHECK
#define SANITY_CHECK 0
#endif

/* Define some longlong.h-style macros, but for wider operations.
 * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
   an additional sum operand.
 * add_csaac accepts two addends and a carry in, and generates a sum and a
   carry out.  A little like a "full adder".
*/
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"               \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "0"  ((USItype)(s2)),                                      \
             "1"  ((USItype)(a1)), "g" ((USItype)(b1)),                 \
             "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"               \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "0"  ((UDItype)(s2)),                                      \
             "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),               \
             "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__aarch64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %3, xzr"\
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "rZ" (s2), "%rZ" (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0)    \
             __CLOBBER_CC)
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"        \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0)         \
             __CLOBBER_CC)
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  do {                                                                  \
    UWtype __s0, __s1, __c0, __c1;                                      \
    __s0 = (a0) + (b0);                                                 \
    __s1 = (a1) + (b1);                                                 \
    __c0 = __s0 < (a0);                                                 \
    __c1 = __s1 < (a1);                                                 \
    (s0) = __s0;                                                        \
    __s1 = __s1 + __c0;                                                 \
    (s1) = __s1;                                                        \
    (s2) += __c1 + (__s1 < __c0);                                       \
  } while (0)
#endif
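
#if 0
/* Illustrative sketch, not compiled: a typical use of add_sssaaaa, folding a
   two-limb product into a three-limb accumulator.  The function name and the
   acc[] layout (least significant limb first) are hypothetical; the point is
   only that the carry out of the two low limbs is propagated into the third
   sum operand.  */
static void
example_accumulate (mp_limb_t acc[3], mp_limb_t u, mp_limb_t v)
{
  mp_limb_t s2 = acc[2], s1 = acc[1], s0 = acc[0];
  mp_limb_t p1, p0;

  umul_ppmm (p1, p0, u, v);                  /* <p1,p0> = u * v */
  add_sssaaaa (s2, s1, s0, s1, s0, p1, p0);  /* <s2,s1,s0> += <p1,p0> */

  acc[2] = s2;
  acc[1] = s1;
  acc[0] = s0;
}
#endif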

/* Typically used with r1, r0 same as n3, n2. Other types of overlap
   between inputs and outputs are not supported. */
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)         \
  do {                                                                  \
    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;               \
    mp_limb_t _t1, _t0;                                                 \
    mp_limb_t _mask;                                                    \
                                                                        \
    /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */ \
    umul_ppmm (_q2,_q1, n2, di1);                                       \
    umul_ppmm (_q3,_q2a, n3, di1);                                      \
    ++_q2;      /* _q2 cannot overflow */                               \
    add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a);                             \
    umul_ppmm (_q2c,_q1c, n3, di0);                                     \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c);                        \
    umul_ppmm (_q1d,_q0, n2, di0);                                      \
    add_sssaaaa (_q2c,_q1,_q0, _q1,_q0, n1,n0); /* _q2c cannot overflow */ \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d);                      \
                                                                        \
    umul_ppmm (_t1,_t0, _q2, d0);                                       \
    _t1 += _q2 * d1 + _q3 * d0;                                         \
                                                                        \
    sub_ddmmss (r1, r0, n1, n0, _t1, _t0);                              \
                                                                        \
    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0))); /* (r1,r0) >= (q1,q0) */ \
    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask);                \
    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask);              \
                                                                        \
    if (UNLIKELY (r1 >= d1))                                            \
      {                                                                 \
        if (r1 > d1 || r0 >= d0)                                        \
          {                                                             \
            sub_ddmmss (r1, r0, r1, r0, d1, d0);                        \
            add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\
          }                                                             \
      }                                                                 \
    (q1) = _q3;                                                         \
    (q0) = _q2;                                                         \
  } while (0)

static void
invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
  invert_limb (v1, d1);
  p1 = d1 * v1;
  /* <1, v1> * d1 = <B-1, p1> */
  p1 += d0;
  if (p1 < d0)
    {
      v1--;
      mask = -(mp_limb_t) (p1 >= d1);
      p1 -= d1;
      v1 += mask;
      p1 -= mask & d1;
    }
  /* <1, v1> * d1 + d0 = <B-1, p1> */
  umul_ppmm (t1, p0, d0, v1);
  p1 += t1;
  if (p1 < t1)
    {
      if (UNLIKELY (p1 >= d1))
        {
          if (p1 > d1 || p0 >= d0)
            {
              sub_ddmmss (p1, p0, p1, p0, d1, d0);
              v1--;
            }
        }
      sub_ddmmss (p1, p0, p1, p0, d1, d0);
      v1--;
    }
  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
   * with <p1, p0> + <d1, d0> >= B^2.
   *
   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>.  The
   * partial remainder after <1, v1> is
   *
   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
   *                              = <~p1, ~p0, B-1>
   */
  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
  di[0] = v0;
  di[1] = v1;

#if SANITY_CHECK
  {
    mp_limb_t tp[4];
    mp_limb_t dp[2];
    dp[0] = d0;
    dp[1] = d1;
    mpn_mul_n (tp, dp, di, 2);
    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
  }
#endif
}
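
#if 0
/* Illustrative sketch, not compiled: one 4/2 division step using the inverse
   computed by invert_4by2.  The function name and the free-standing operands
   are hypothetical; the calling pattern (r1,r0 overlapping n3,n2) mirrors the
   loop in mpn_div_qr_2n_pi2 below.  Assumes a non-nails build and a divisor
   <d1,d0> with the high bit of d1 set, with <n3,n2> < <d1,d0> so that the
   quotient fits in two limbs.  */
static void
example_udiv_qr_4by2 (mp_limb_t n3, mp_limb_t n2, mp_limb_t n1, mp_limb_t n0,
                      mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t di[2], q1, q0, r1, r0;

  ASSERT (d1 & GMP_NUMB_HIGHBIT);
  ASSERT (n3 < d1 || (n3 == d1 && n2 < d0));

  invert_4by2 (di, d1, d0);     /* <1, di[1], di[0]> = (B^4 - 1) / <d1,d0> */

  r1 = n3;
  r0 = n2;
  udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di[1], di[0]);
  /* Now <n3,n2,n1,n0> = <q1,q0> * <d1,d0> + <r1,r0>, with <r1,r0> < <d1,d0>. */
}
#endif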

static mp_limb_t
mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
                   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t r1, r0;

  ASSERT (nn >= 2);
  ASSERT (d1 & GMP_NUMB_HIGHBIT);

  r1 = np[nn-1];
  r0 = np[nn-2];

  qh = 0;
  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
    {
#if GMP_NAIL_BITS == 0
      sub_ddmmss (r1, r0, r1, r0, d1, d0);
#else
      r0 = r0 - d0;
      r1 = r1 - d1 - (r0 >> (GMP_LIMB_BITS - 1));
      r0 &= GMP_NUMB_MASK;
#endif
      qh = 1;
    }

  for (i = nn - 2; i >= 2; i -= 2)
    {
      mp_limb_t n1, n0, q1, q0;
      n1 = np[i-1];
      n0 = np[i-2];
      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
      qp[i-1] = q1;
      qp[i-2] = q0;
    }

  if (i > 0)
    {
      mp_limb_t q;
      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
      qp[0] = q;
    }
  rp[1] = r1;
  rp[0] = r0;

  return qh;
}


/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
   significant quotient limbs at qp and the 2 long remainder at rp.
   Return the most significant limb of the quotient.

   Preconditions:
   1. qp must either not overlap with the other operands at all, or
      qp >= np + 2 must hold true.  (This means that it's possible to put
      the quotient in the high part of {np,nn}, right above the remainder.)
   2. nn >= 2.  */

mp_limb_t
mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
              mp_srcptr dp)
{
  mp_limb_t d1;
  mp_limb_t d0;
  gmp_pi1_t dinv;

  ASSERT (nn >= 2);
  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, 2);

  d1 = dp[1]; d0 = dp[0];

  ASSERT (d1 > 0);

  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
    {
      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
        {
          gmp_pi1_t dinv;
          invert_pi1 (dinv, d1, d0);
          return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
        }
      else
        {
          mp_limb_t di[2];
          invert_4by2 (di, d1, d0);
          return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
        }
    }
  else
    {
      int shift;
      count_leading_zeros (shift, d1);
      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
      d0 <<= shift;
      invert_pi1 (dinv, d1, d0);
      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
    }
}
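
#if 0
/* Illustrative sketch, not compiled: dividing a 4-limb number {np,4} by the
   2-limb divisor {dp,2} with mpn_div_qr_2.  The function name and the limb
   values are hypothetical; the call follows the interface documented above.
   With nn = 4, the two low quotient limbs go to qp and the most significant
   quotient limb is the return value, so the full quotient is
   <qh, qp[1], qp[0]>; the 2-limb remainder is written to rp.  */
static void
example_div_qr_2 (void)
{
  mp_limb_t np[4] = { 1, 2, 3, 4 };
  mp_limb_t dp[2] = { 123456789, 987654321 };   /* dp[1] must be nonzero */
  mp_limb_t qp[2], rp[2], qh;

  qh = mpn_div_qr_2 (qp, rp, np, 4, dp);

  /* {np,4} = <qh, qp[1], qp[0]> * {dp,2} + {rp,2}, with {rp,2} < {dp,2}. */
}
#endif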