1/* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q. 2 3 Compute Q = floor(N / D) + e. N is nn limbs, D is dn limbs and must be 4 normalized, and Q must be nn-dn limbs, 0 <= e <= 4. The requirement that Q 5 is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us 6 to let N be unmodified during the operation. 7 8 Contributed to the GNU project by Torbjorn Granlund. 9 10 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY 11 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 12 GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. 13 14Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc. 15 16This file is part of the GNU MP Library. 17 18The GNU MP Library is free software; you can redistribute it and/or modify 19it under the terms of either: 20 21 * the GNU Lesser General Public License as published by the Free 22 Software Foundation; either version 3 of the License, or (at your 23 option) any later version. 24 25or 26 27 * the GNU General Public License as published by the Free Software 28 Foundation; either version 2 of the License, or (at your option) any 29 later version. 30 31or both in parallel, as here. 32 33The GNU MP Library is distributed in the hope that it will be useful, but 34WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 35or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 36for more details. 37 38You should have received copies of the GNU General Public License and the 39GNU Lesser General Public License along with the GNU MP Library. If not, 40see https://www.gnu.org/licenses/. */ 41 42 43/* 44 The idea of the algorithm used herein is to compute a smaller inverted value 45 than used in the standard Barrett algorithm, and thus save time in the 46 Newton iterations, and pay just a small price when using the inverted value 47 for developing quotient bits. This algorithm was presented at ICMS 2006. 
*/

/* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.

   Things to work on:

   * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
     demonstrated by the fact that the mpn_invertappr function's scratch needs
     mean that we need to keep a large allocation long after it is needed.
     Things are worse as mpn_mul_fft does not accept any scratch parameter,
     which means we'll have a large memory hole while in mpn_mul_fft.  In
     general, a peak scratch need in the beginning of a function isn't
     well-handled by the itch/scratch scheme.
*/

/* STAT(x) expands to x when statistics gathering is enabled (compile with
   -DSTAT) and to nothing otherwise; used by the adjustment-count code in
   mpn_preinv_mu_divappr_q below.  */
#ifdef STAT
#undef STAT
#define STAT(x) x
#else
#define STAT(x)
#endif

#include <stdlib.h>		/* for NULL */
#include "gmp-impl.h"

static mp_limb_t mpn_preinv_mu_divappr_q (mp_ptr, mp_srcptr, mp_size_t,
					  mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr);
static mp_size_t mpn_mu_divappr_q_choose_in (mp_size_t, mp_size_t, int);

/* Compute an approximate quotient Q = floor({np,nn} / {dp,dn}) + e with
   0 <= e <= 4 (see the file header), storing nn-dn quotient limbs at qp and
   returning the high quotient limb.  D must be normalized and dn > 1 (only
   dn > 1 is asserted here; normalization is required per the file header).
   scratch must provide mpn_mu_divappr_q_itch (nn, dn, 0) limbs.  N is left
   unmodified.  */
mp_limb_t
mpn_mu_divappr_q (mp_ptr qp,
		  mp_srcptr np,
		  mp_size_t nn,
		  mp_srcptr dp,
		  mp_size_t dn,
		  mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy, qh;
  mp_ptr ip, tp;

  ASSERT (dn > 1);

  qn = nn - dn;

  /* If Q is smaller than D, truncate operands: only the high qn+1 limbs of
     D (and the matching high limbs of N) can influence the quotient.  */
  if (qn + 1 < dn)
    {
      np += dn - (qn + 1);
      nn -= dn - (qn + 1);
      dp += dn - (qn + 1);
      dn = qn + 1;
    }

  /* Compute the inverse size.  */
  in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
     not adapted (3) mpn_invertappr scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* compute an approximate inverse on (in+1) limbs */
  if (dn == in)
    {
      /* All of D fits: prepend a low limb of 1 so the inverted operand has
	 in+1 limbs, invert, then drop the low inverse limb by shifting the
	 result down one limb with MPN_COPY_INCR.  */
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invertappr (ip, tp, in + 1, tp + in + 1);
      MPN_COPY_INCR (ip, ip + 1, in);
    }
  else
    {
      /* Invert the rounded-up high in+1 limbs of D.  A carry out of the
	 add means those limbs were all ones; use a zero inverse then.  */
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
	MPN_ZERO (ip, in);
      else
	{
	  mpn_invertappr (ip, tp, in + 1, tp + in + 1);
	  MPN_COPY_INCR (ip, ip + 1, in);
	}
    }
#else
  /* This older inverse computation method gets slightly worse results than the
     one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
     inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

  qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);

  return qh;
}

/* Barrett division kernel: develop the approximate quotient block by block
   (in limbs at a time, high block first) using the precomputed approximate
   inverse {ip,in}.  Returns the high quotient limb qh.  */
static mp_limb_t
mpn_preinv_mu_divappr_q (mp_ptr qp,
			 mp_srcptr np,
			 mp_size_t nn,
			 mp_srcptr dp,
			 mp_size_t dn,
			 mp_srcptr ip,
			 mp_size_t in,
			 mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy, cx, qh;
  mp_limb_t r;
  mp_size_t tn, wn;

/* Scratch layout: partial remainder R (dn limbs), product area T (dn+in
   limbs, or tn limbs for the wrapping product), then scratch for
   mpn_mulmod_bnm1.  */
#define rp           scratch
#define tp           (scratch + dn)
#define scratch_out  (scratch + dn + tn)

  qn = nn - dn;

  np += qn;
  qp += qn;

  /* Initial partial remainder: the high dn limbs of N reduced mod D.  At
     most one subtraction is needed; if it is, the quotient gets a high limb
     qh = 1.  */
  qh = mpn_cmp (np, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (rp, np, dp, dn);
  else
    MPN_COPY (rp, np, dn);

  if (qn == 0)
    return qh;			/* Degenerate use.  Should we allow this? */

  while (qn > 0)
    {
      /* The last block may be smaller than in limbs; use the corresponding
	 high part of the inverse for it.  */
      if (qn < in)
	{
	  ip += in - qn;
	  in = qn;
	}
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse I
	 by the upper part of the partial remainder R.  */
      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);

      qn -= in;
      if (qn == 0)
	break;

      /* Compute the product of the quotient block and the divisor D, to be
	 subtracted from the partial remainder combined with new limbs from the
	 dividend N.  We only really need the low dn limbs.  */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);		/* dn+in limbs, high 'in' cancels */
      else
	{
	  /* Product mod B^tn-1; the high dn+in-tn limbs wrapped around into
	     the low end.  Un-wrap them by subtracting the known high limbs
	     (which equal the corresponding limbs of R), adjusting for the
	     borrow/comparison as asserted below.  */
	  tn = mpn_mulmod_bnm1_next_size (dn + 1);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
	      cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
	      cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
	      ASSERT_ALWAYS (cx >= cy);
	      mpn_incr_u (tp, cx - cy);
	    }
	}

      /* High limb of the new partial remainder, before the borrow from the
	 lower-limb subtractions below is applied.  */
      r = rp[dn - in] - tp[dn];

      /* Subtract the product from the partial remainder combined with new
	 limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
	{
	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
	}
      else
	{
	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
	}

      STAT (int i; int err = 0;
	    static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
	{
	  /* We loop 0 times with about 69% probability, 1 time with about 31%
	     probability, 2 times with about 0.6% probability, if inverse is
	     computed as recommended.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  r -= cy;
	  STAT (err++);
	}
      if (mpn_cmp (rp, dp, dn) >= 0)
	{
	  /* This is executed with about 76% probability.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  STAT (err++);
	}

      STAT (
	    tot++;
	    errarr[err]++;
	    if (err > err_rec)
	      err_rec = err;
	    if (tot % 0x10000 == 0)
	      {
		for (i = 0; i <= err_rec; i++)
		  printf (" %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
		printf ("\n");
	      }
	    );
    }

  /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
     quotient.  For now, just make sure the returned quotient is >= the real
     quotient; add 3 with saturating arithmetic.  NOTE(review): cy is 0 here
     -- the loop above exits only via the break, immediately after
     ASSERT_ALWAYS checked cy == 0 -- so "cy +=" is equivalent to "cy =".  */
  qn = nn - dn;
  cy += mpn_add_1 (qp, qp, qn, 3);
  if (cy != 0)
    {
      if (qh != 0)
	{
	  /* Return a quotient of just 1-bits, with qh set.  */
	  mp_size_t i;
	  for (i = 0; i < qn; i++)
	    qp[i] = GMP_NUMB_MAX;
	}
      else
	{
	  /* Propagate carry into qh.  */
	  qh = 1;
	}
    }

  return qh;
}

/* In case k=0 (automatic choice), we distinguish 3 cases:
   (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
   (b) dn/3 < qn <= dn: in = ceil(qn / 2)
   (c) qn < dn/3:       in = qn
   In all cases we have in <= dn.
 */
static mp_size_t
mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
{
  mp_size_t in;

  if (k == 0)
    {
      mp_size_t b;
      if (qn > dn)
	{
	  /* Compute an inverse size that is a nice partition of the quotient.  */
	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
	}
      else if (3 * qn > dn)
	{
	  in = (qn - 1) / 2 + 1;	/* b = 2 */
	}
      else
	{
	  in = (qn - 1) / 1 + 1;	/* b = 1 */
	}
    }
  else
    {
      /* Explicit block count k: in = ceil(min(dn,qn) / k).  */
      mp_size_t xn;
      xn = MIN (dn, qn);
      in = (xn - 1) / k + 1;
    }

  return in;
}

/* Return the scratch (in limbs) that mpn_mu_divappr_q needs for an nn-limb
   dividend and a dn-limb divisor.  mua_k selects the inverse-size policy as
   in mpn_mu_divappr_q_choose_in (0 = automatic).  */
mp_size_t
mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
  mp_size_t qn, in, itch_local, itch_out, itch_invapp;

  qn = nn - dn;
  /* Mirror the operand truncation performed by mpn_mu_divappr_q.  */
  if (qn + 1 < dn)
    {
      dn = qn + 1;
    }
  in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k);

  /* Peak need is either the division loop (R + wrapping product + mulmod
     scratch) or the initial inverse computation, both beyond the in limbs
     that hold the inverse itself.  */
  itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
  itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
  itch_invapp = mpn_invertappr_itch (in + 1) + in + 2;	/* 3in + 4 */

  ASSERT (dn + itch_local + itch_out >= itch_invapp);
  return in + MAX (dn + itch_local + itch_out, itch_invapp);
}