1249109Sjkim/* mpn_div_q -- division for arbitrary size operands. 2249109Sjkim 3249109Sjkim Contributed to the GNU project by Torbjorn Granlund. 4249109Sjkim 5249109Sjkim THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 6249109Sjkim SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 7249109Sjkim GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. 8281075Sdim 9249109SjkimCopyright 2009, 2010 Free Software Foundation, Inc. 10249109Sjkim 11249109SjkimThis file is part of the GNU MP Library. 12249109Sjkim 13249109SjkimThe GNU MP Library is free software; you can redistribute it and/or modify 14249109Sjkimit under the terms of the GNU Lesser General Public License as published by 15249109Sjkimthe Free Software Foundation; either version 3 of the License, or (at your 16249109Sjkimoption) any later version. 17249109Sjkim 18249109SjkimThe GNU MP Library is distributed in the hope that it will be useful, but 19249109SjkimWITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 20249109Sjkimor FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 21249109SjkimLicense for more details. 22249109Sjkim 23249109SjkimYou should have received a copy of the GNU Lesser General Public License 24249109Sjkimalong with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 25249109Sjkim 26249109Sjkim#include "gmp.h" 27249109Sjkim#include "gmp-impl.h" 28249109Sjkim#include "longlong.h" 29249109Sjkim 30249109Sjkim 31249109Sjkim/* Compute Q = N/D with truncation. 32249109Sjkim N = {np,nn} 33249109Sjkim D = {dp,dn} 34249109Sjkim Q = {qp,nn-dn+1} 35249109Sjkim T = {scratch,nn+1} is scratch space 36249109Sjkim N and D are both untouched by the computation. 37249109Sjkim N and T may overlap; pass the same space if N is irrelevant after the call, 38249109Sjkim but note that tp needs an extra limb. 39249109Sjkim 40249109Sjkim Operand requirements: 41249109Sjkim N >= D > 0 42249109Sjkim dp[dn-1] != 0 43249109Sjkim No overlap between the N, D, and Q areas. 44249112Sjkim 45249109Sjkim This division function does not clobber its input operands, since it is 46249109Sjkim intended to support average-O(qn) division, and for that to be effective, it 47249109Sjkim cannot put requirements on callers to copy a O(nn) operand. 48249109Sjkim 49249109Sjkim If a caller does not care about the value of {np,nn+1} after calling this 50249109Sjkim function, it should pass np also for the scratch argument. This function 51249109Sjkim will then save some time and space by avoiding allocation and copying. 52249109Sjkim (FIXME: Is this a good design? We only really save any copying for 53249109Sjkim already-normalised divisors, which should be rare. It also prevents us from 54249109Sjkim reasonably asking for all scratch space we need.) 55249109Sjkim 56249109Sjkim We write nn-dn+1 limbs for the quotient, but return void. Why not return 57249109Sjkim the most significant quotient limb? Look at the 4 main code blocks below 58249109Sjkim (consisting of an outer if-else where each arm contains an if-else). It is 59249109Sjkim tricky for the first code block, since the mpn_*_div_q calls will typically 60249109Sjkim generate all nn-dn+1 and return 0 or 1. I don't see how to fix that unless 61249109Sjkim we generate the most significant quotient limb here, before calling 62249109Sjkim mpn_*_div_q, or put the quotient in a temporary area. Since this is a 63249109Sjkim critical division case (the SB sub-case in particular) copying is not a good 64249109Sjkim idea. 65249109Sjkim 66249109Sjkim It might make sense to split the if-else parts of the (qn + FUDGE 67249109Sjkim >= dn) blocks into separate functions, since we could promise quite 68249109Sjkim different things to callers in these two cases. The 'then' case 69249109Sjkim benefits from np=scratch, and it could perhaps even tolerate qp=np, 70249109Sjkim saving some headache for many callers. 71249109Sjkim 72249109Sjkim FIXME: Scratch allocation leaves a lot to be desired. E.g., for the MU size 73249109Sjkim operands, we do not reuse the huge scratch for adjustments. This can be a 74249109Sjkim serious waste of memory for the largest operands. 75249109Sjkim*/ 76249109Sjkim 77249109Sjkim/* FUDGE determines when to try getting an approximate quotient from the upper 78249109Sjkim parts of the dividend and divisor, then adjust. N.B. FUDGE must be >= 2 79249109Sjkim for the code to be correct. */ 80249109Sjkim#define FUDGE 5 /* FIXME: tune this */ 81249109Sjkim 82249109Sjkim#define DC_DIV_Q_THRESHOLD DC_DIVAPPR_Q_THRESHOLD 83249109Sjkim#define MU_DIV_Q_THRESHOLD MU_DIVAPPR_Q_THRESHOLD 84249109Sjkim#define MUPI_DIV_Q_THRESHOLD MUPI_DIVAPPR_Q_THRESHOLD 85249109Sjkim#ifndef MUPI_DIVAPPR_Q_THRESHOLD 86249109Sjkim#define MUPI_DIVAPPR_Q_THRESHOLD MUPI_DIV_QR_THRESHOLD 87249109Sjkim#endif 88249109Sjkim 89249109Sjkimvoid 90249109Sjkimmpn_div_q (mp_ptr qp, 91249109Sjkim mp_srcptr np, mp_size_t nn, 92249109Sjkim mp_srcptr dp, mp_size_t dn, mp_ptr scratch) 93249109Sjkim{ 94249109Sjkim mp_ptr new_dp, new_np, tp, rp; 95249109Sjkim mp_limb_t cy, dh, qh; 96249109Sjkim mp_size_t new_nn, qn; 97249109Sjkim gmp_pi1_t dinv; 98249109Sjkim int cnt; 99249109Sjkim TMP_DECL; 100249109Sjkim TMP_MARK; 101249109Sjkim 102249109Sjkim ASSERT (nn >= dn); 103249109Sjkim ASSERT (dn > 0); 104249109Sjkim ASSERT (dp[dn - 1] != 0); 105249109Sjkim ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, np, nn)); 106249109Sjkim ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, dp, dn)); 107249109Sjkim ASSERT (MPN_SAME_OR_SEPARATE_P (np, scratch, nn)); 108249109Sjkim 109249109Sjkim ASSERT_ALWAYS (FUDGE >= 2); 110249109Sjkim 111249109Sjkim if (dn == 1) 112249109Sjkim { 113249109Sjkim mpn_divrem_1 (qp, 0L, np, nn, dp[dn - 1]); 114249109Sjkim return; 115249109Sjkim } 116249109Sjkim 117249109Sjkim qn = nn - dn + 1; /* Quotient size, high limb might be zero */ 118249109Sjkim 119249109Sjkim if (qn + FUDGE >= dn) 120249109Sjkim { 121249109Sjkim /* |________________________| 122249109Sjkim |_______| */ 123249109Sjkim new_np = scratch; 124249109Sjkim 125249109Sjkim dh = dp[dn - 1]; 126249109Sjkim if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0)) 127249109Sjkim { 128249109Sjkim count_leading_zeros (cnt, dh); 129249109Sjkim 130249109Sjkim cy = mpn_lshift (new_np, np, nn, cnt); 131249109Sjkim new_np[nn] = cy; 132249109Sjkim new_nn = nn + (cy != 0); 133249109Sjkim 134249109Sjkim new_dp = TMP_ALLOC_LIMBS (dn); 135249109Sjkim mpn_lshift (new_dp, dp, dn, cnt); 136249109Sjkim 137249109Sjkim if (dn == 2) 138249109Sjkim { 139249109Sjkim qh = mpn_divrem_2 (qp, 0L, new_np, new_nn, new_dp); 140249109Sjkim } 141249109Sjkim else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) || 142249109Sjkim BELOW_THRESHOLD (new_nn - dn, DC_DIV_Q_THRESHOLD)) 143249109Sjkim { 144249109Sjkim invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]); 145249109Sjkim qh = mpn_sbpi1_div_q (qp, new_np, new_nn, new_dp, dn, dinv.inv32); 146249109Sjkim } 147249109Sjkim else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) || /* fast condition */ 148249109Sjkim BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */ 149249109Sjkim (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */ 150249109Sjkim + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn) /* ...condition */ 151249109Sjkim { 152249109Sjkim invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]); 153249109Sjkim qh = mpn_dcpi1_div_q (qp, new_np, new_nn, new_dp, dn, &dinv); 154249109Sjkim } 155249109Sjkim else 156249109Sjkim { 157249109Sjkim mp_size_t itch = mpn_mu_div_q_itch (new_nn, dn, 0); 158249109Sjkim mp_ptr scratch = TMP_ALLOC_LIMBS (itch); 159249109Sjkim qh = mpn_mu_div_q (qp, new_np, new_nn, new_dp, dn, scratch); 160249109Sjkim } 161249109Sjkim if (cy == 0) 162249109Sjkim qp[qn - 1] = qh; 163249109Sjkim else if (UNLIKELY (qh != 0)) 164249109Sjkim { 165249109Sjkim /* This happens only when the quotient is close to B^n and 166249109Sjkim mpn_*_divappr_q returned B^n. */ 167249109Sjkim mp_size_t i, n; 168249109Sjkim n = new_nn - dn; 169249109Sjkim for (i = 0; i < n; i++) 170249109Sjkim qp[i] = GMP_NUMB_MAX; 171249109Sjkim qh = 0; /* currently ignored */ 172249109Sjkim } 173249109Sjkim } 174249109Sjkim else /* divisor is already normalised */ 175249109Sjkim { 176249109Sjkim if (new_np != np) 177249109Sjkim MPN_COPY (new_np, np, nn); 178249109Sjkim 179249109Sjkim if (dn == 2) 180249109Sjkim { 181249109Sjkim qh = mpn_divrem_2 (qp, 0L, new_np, nn, dp); 182249109Sjkim } 183249109Sjkim else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) || 184249109Sjkim BELOW_THRESHOLD (nn - dn, DC_DIV_Q_THRESHOLD)) 185249109Sjkim { 186249109Sjkim invert_pi1 (dinv, dh, dp[dn - 2]); 187249109Sjkim qh = mpn_sbpi1_div_q (qp, new_np, nn, dp, dn, dinv.inv32); 188249109Sjkim } 189249109Sjkim else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) || /* fast condition */ 190249109Sjkim BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */ 191249109Sjkim (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */ 192249109Sjkim + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn) /* ...condition */ 193249109Sjkim { 194249109Sjkim invert_pi1 (dinv, dh, dp[dn - 2]); 195249109Sjkim qh = mpn_dcpi1_div_q (qp, new_np, nn, dp, dn, &dinv); 196249109Sjkim } 197249109Sjkim else 198249109Sjkim { 199249109Sjkim mp_size_t itch = mpn_mu_div_q_itch (nn, dn, 0); 200249109Sjkim mp_ptr scratch = TMP_ALLOC_LIMBS (itch); 201249109Sjkim qh = mpn_mu_div_q (qp, np, nn, dp, dn, scratch); 202249109Sjkim } 203249109Sjkim qp[nn - dn] = qh; 204249109Sjkim } 205249109Sjkim } 206249109Sjkim else 207249109Sjkim { 208249109Sjkim /* |________________________| 209249109Sjkim |_________________| */ 210249109Sjkim tp = TMP_ALLOC_LIMBS (qn + 1); 211249109Sjkim 212249109Sjkim new_np = scratch; 213249109Sjkim new_nn = 2 * qn + 1; 214249109Sjkim if (new_np == np) 215249109Sjkim /* We need {np,nn} to remain untouched until the final adjustment, so 216249109Sjkim we need to allocate separate space for new_np. */ 217249109Sjkim new_np = TMP_ALLOC_LIMBS (new_nn + 1); 218249109Sjkim 219249109Sjkim 220249109Sjkim dh = dp[dn - 1]; 221249109Sjkim if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0)) 222249109Sjkim { 223249109Sjkim count_leading_zeros (cnt, dh); 224249109Sjkim 225249109Sjkim cy = mpn_lshift (new_np, np + nn - new_nn, new_nn, cnt); 226249109Sjkim new_np[new_nn] = cy; 227249109Sjkim 228249109Sjkim new_nn += (cy != 0); 229249109Sjkim 230249109Sjkim new_dp = TMP_ALLOC_LIMBS (qn + 1); 231249109Sjkim mpn_lshift (new_dp, dp + dn - (qn + 1), qn + 1, cnt); 232249109Sjkim new_dp[0] |= dp[dn - (qn + 1) - 1] >> (GMP_NUMB_BITS - cnt); 233249109Sjkim 234249109Sjkim if (qn + 1 == 2) 235249109Sjkim { 236249109Sjkim qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp); 237249109Sjkim } 238249109Sjkim else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1)) 239249109Sjkim { 240249109Sjkim invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]); 241249109Sjkim qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32); 242249109Sjkim } 243249109Sjkim else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1)) 244249109Sjkim { 245249109Sjkim invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]); 246249109Sjkim qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv); 247249109Sjkim } 248249109Sjkim else 249249109Sjkim { 250249109Sjkim mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0); 251249109Sjkim mp_ptr scratch = TMP_ALLOC_LIMBS (itch); 252249109Sjkim qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch); 253249109Sjkim } 254250838Sjkim if (cy == 0) 255249109Sjkim tp[qn] = qh; 256250838Sjkim else if (UNLIKELY (qh != 0)) 257249109Sjkim { 258249109Sjkim /* This happens only when the quotient is close to B^n and 259249109Sjkim mpn_*_divappr_q returned B^n. */ 260249109Sjkim mp_size_t i, n; 261249109Sjkim n = new_nn - (qn + 1); 262249109Sjkim for (i = 0; i < n; i++) 263249109Sjkim tp[i] = GMP_NUMB_MAX; 264249109Sjkim qh = 0; /* currently ignored */ 265249109Sjkim } 266249109Sjkim } 267249109Sjkim else /* divisor is already normalised */ 268249109Sjkim { 269249109Sjkim MPN_COPY (new_np, np + nn - new_nn, new_nn); /* pointless of MU will be used */ 270249109Sjkim 271249109Sjkim new_dp = (mp_ptr) dp + dn - (qn + 1); 272249109Sjkim 273249109Sjkim if (qn == 2 - 1) 274249109Sjkim { 275249109Sjkim qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp); 276249109Sjkim } 277249109Sjkim else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1)) 278249109Sjkim { 279249109Sjkim invert_pi1 (dinv, dh, new_dp[qn - 1]); 280249109Sjkim qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32); 281249109Sjkim } 282249109Sjkim else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1)) 283249109Sjkim { 284249109Sjkim invert_pi1 (dinv, dh, new_dp[qn - 1]); 285249109Sjkim qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv); 286249109Sjkim } 287249109Sjkim else 288249109Sjkim { 289249109Sjkim mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0); 290249109Sjkim mp_ptr scratch = TMP_ALLOC_LIMBS (itch); 291249109Sjkim qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch); 292249109Sjkim } 293249109Sjkim tp[qn] = qh; 294249109Sjkim } 295249109Sjkim 296249109Sjkim MPN_COPY (qp, tp + 1, qn); 297249109Sjkim if (tp[0] <= 4) 298249109Sjkim { 299249109Sjkim mp_size_t rn; 300249109Sjkim 301249109Sjkim rp = TMP_ALLOC_LIMBS (dn + qn); 302249109Sjkim mpn_mul (rp, dp, dn, tp + 1, qn); 303249109Sjkim rn = dn + qn; 304249109Sjkim rn -= rp[rn - 1] == 0; 305249109Sjkim 306249109Sjkim if (rn > nn || mpn_cmp (np, rp, nn) < 0) 307249109Sjkim mpn_decr_u (qp, 1); 308249109Sjkim } 309249109Sjkim } 310249109Sjkim 311249109Sjkim TMP_FREE; 312249109Sjkim} 313249109Sjkim