1/* mpn_sqrtrem -- square root and remainder 2 3 Contributed to the GNU project by Paul Zimmermann (most code), 4 Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt). 5 6 THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE 7 INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. 8 IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A 9 FUTURE GMP RELEASE. 10 11Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software 12Foundation, Inc. 13 14This file is part of the GNU MP Library. 15 16The GNU MP Library is free software; you can redistribute it and/or modify 17it under the terms of either: 18 19 * the GNU Lesser General Public License as published by the Free 20 Software Foundation; either version 3 of the License, or (at your 21 option) any later version. 22 23or 24 25 * the GNU General Public License as published by the Free Software 26 Foundation; either version 2 of the License, or (at your option) any 27 later version. 28 29or both in parallel, as here. 30 31The GNU MP Library is distributed in the hope that it will be useful, but 32WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 33or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 34for more details. 35 36You should have received copies of the GNU General Public License and the 37GNU Lesser General Public License along with the GNU MP Library. If not, 38see https://www.gnu.org/licenses/. */ 39 40 41/* See "Karatsuba Square Root", reference in gmp.texi. 
 */


#include <stdio.h>
#include <stdlib.h>

#include "gmp-impl.h"
#include "longlong.h"

/* Use the approximate division functions (mpn_*_divappr_q) inside
   mpn_dc_sqrt instead of the exact mpn_div_q; the possibly inexact
   quotient is corrected later.  */
#define USE_DIVAPPR_Q 1
/* Debug tracing hook: expands to nothing in normal builds.  */
#define TRACE(x)

/* Table of 8-bit approximations to 1/sqrt(a), indexed by the top bits of
   the (normalized) operand; see the lookup in mpn_sqrtrem1.  Each entry
   stores only the low byte: the common leading 0x100 was removed.  */
static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
};

/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.
 */

/* MAGIC absorbs the worst-case error of the final Newton step below; the
   bracketing bounds in the comments give the valid range (NOTE(review):
   bounds taken from the original source, not re-derived here).  */
#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000)  /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000)  /* 0xfee6f < MAGIC < 0x29cbc8 */
#endif

/* Single-limb square root: return floor(sqrt(a0)) and store the remainder
   a0 - s^2 in *rp.  Requires a0 normalized, i.e. a0 >= B/4 (see ASSERT),
   and a nail-free 32- or 64-bit limb.  */
static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */

  abits = a0 >> (GMP_LIMB_BITS - 1 - 8); /* extract bits for table lookup */
  x0 = 0x100 | invsqrttab[abits - 0x80]; /* initial 1/sqrt(a), restoring the
					    implicit leading 0x100 */

  /* x0 is now an 8 bits approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  /* One Newton step on 1/sqrt using only the top 32 bits of a0.
     The casts to mp_limb_signed_t make the subsequent shifts arithmetic,
     so a negative correction term t propagates its sign.  */
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16 bits approximation of 1/sqrt(a0) */

  /* Final step converts from 1/sqrt(a0) to sqrt(a0); MAGIC soaks up the
     accumulated approximation error.  */
  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  /* 32-bit limb: the 8-bit table value is already accurate enough for a
     single combined step converting to sqrt(a0).  */
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full limb approximation of sqrt(a0) */

  /* The approximation may be one too small; correct by comparing
     (x0+1)^2 = x0^2 + 2*x0 + 1 against a0.  */
  x2 = x0 * x0;
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}


/* Prec = half a limb; the two-limb square root below works in half-limb
   halves of the result.  */
#define Prec (GMP_NUMB_BITS >> 1)
#if ! defined(SQRTREM2_INPLACE)
#define SQRTREM2_INPLACE 0
#endif

/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
   return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
#if SQRTREM2_INPLACE
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
{
  /* In-place variant: the operand is read from rp itself.  */
  mp_srcptr np = rp;
#else
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
#endif
  mp_limb_t q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  /* High half of the root comes from the high limb.  */
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  rp0 = rp[0];
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
  q = rp0 / sp0;
  /* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */
  q -= q >> Prec;
  /* now we have q < 2^Prec */
  u = rp0 - q * sp0;
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
  sp0 = (sp0 << Prec) | q;	/* assemble full root from the two halves */
  cc = u >> (Prec - 1);
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
  /* subtract q * q from rp */
  q2 = q * q;
  cc -= rp0 < q2;
  rp0 -= q2;
  if (cc < 0)
    {
      /* Root was one too large: add back 2*sp0 - 1 (as sp0 + (sp0 - 1))
	 to the remainder and decrement the root.  */
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}

/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder, returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B=2^GMP_NUMB_BITS.
   Needs a scratch of n/2+1 limbs.
 */
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c, b;			/* carry out of remainder */
  mp_size_t l, h;

  ASSERT (n > 1);
  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  /* Karatsuba square root: split the root into a high part of h limbs
     (computed recursively from the top 2h limbs of np) and a low part of
     l limbs (obtained from a division by twice the high part).  */
  l = n / 2;
  h = n - l;
  if (h == 1)
    q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
  else
    q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
  /* A remainder carry q=1 means remainder >= B^h; reduce it below the
     divisor before the division step.  */
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
  TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
  /* Divide the (reduced) remainder by the high root to get 2*(low root). */
  mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
  q += scratch[l];
  c = scratch[0] & 1;	/* bit shifted out below; folds into the remainder */
  mpn_rshift (sp, scratch, l, 1);	/* low root = quotient / 2 */
  sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
  /* Caller-supplied mask (approx): when only an approximation is wanted,
     a nonzero masked low part proves the remainder nonzero; bail out.  */
  if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
    return 1; /* Remainder is non-zero */
  q >>= 1;
  if (c != 0)
    c = mpn_add_n (np + l, np + l, sp + l, h);
  TRACE(printf("sqr(,,%u)\n", (unsigned) l));
  /* Subtract (low root)^2 from the remainder estimate.  */
  mpn_sqr (np + n, sp, l);
  b = q + mpn_sub_n (np, np, np + n, 2 * l);
  c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

  if (c < 0)
    {
      /* Root was one too large: add back 2*{sp,n}+1... wait, add back
	 2*S - 1 where S is the too-large root, then decrement S.
	 NOTE(review): the +2*q term accounts for the root's carry limb. */
      q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
      c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
      c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
      c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
      q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
    }

  return c;
}

#if USE_DIVAPPR_Q
/* Approximate quotient of {np, nn} by {dp, dn}: writes nn-dn+1 limbs at qp.
   Dispatches on dn between the schoolbook, divide-and-conquer and
   block-wise (Mulders) approximate division routines.  The dividend is
   copied into scratch first, so {np, nn} is left untouched.  */
static void
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  gmp_pi1_t inv;
  mp_limb_t qh;
  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);	/* divisor must be normalized */

  MPN_COPY (scratch, np, nn);
  invert_pi1 (inv, dp[dn-1], dp[dn-2]);
  if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
    qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
  else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
    qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
  else
    {
      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
      TMP_DECL;
      TMP_MARK;
      /* Sadly, scratch is too small. */
      qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
      TMP_FREE;
    }
  qp [nn - dn] = qh;	/* high quotient limb (0 or 1) */
}
#endif

/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
   returns zero if the operand was a perfect square, one otherwise.
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
   where B=2^GMP_NUMB_BITS.
   THINK: In the odd case, three more (dummy) limbs are taken into account,
   when nsh is maximal, two limbs are discarded from the result of the
   division. Too much? Is a single dummy limb enough?
 */
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c;			/* carry out of remainder */
  mp_size_t l, h;
  mp_ptr qp, tp, scratch;
  TMP_DECL;
  TMP_MARK;

  ASSERT (np[2 * n - 1 - odd] != 0);
  ASSERT (n > 4);
  ASSERT (nsh < GMP_NUMB_BITS / 2);

  l = (n - 1) / 2;
  h = n - l;
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
  /* Layout inside one TMP allocation: scratch (n + 2-USE_DIVAPPR_Q limbs)
     followed by tp (n + h + 1 limbs, with tp[-1] writable).  */
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
  /* Build the normalized dividend: the relevant top limbs of np shifted
     left by 2*nsh bits (or plain-copied when no shift is needed).  */
  if (nsh != 0)
    {
      /* o is used to exactly set the lowest bits of the dividend, is it needed? */
      int o = l > (1 + odd);
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
    }
  else
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
  /* High h limbs of the root, with remainder left in {tp + l + 1, h}.  */
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
  qp = tp + n + 1; /* l + 2 */
  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
  /* Divide the remainder by the high root to obtain (approximately)
     twice the low part of the root.  */
#if USE_DIVAPPR_Q
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
#else
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
#endif
  q += qp [l + 1];
  c = 1;
  if (q > 1)
    {
      /* Quotient overflow: the root is certainly not exact, saturate the
	 low part (it is only an approximation anyway).  */
      /* FIXME: if s!=0 we will shift later, a noop on this area. */
      MPN_FILL (sp, l, GMP_NUMB_MAX);
    }
  else
    {
      /* FIXME: if s!=0 we will shift again later, shift just once. */
      mpn_rshift (sp, qp + 1, l, 1);
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
      /* If the discarded quotient bits are all small/zero, the extra
	 precision cannot absorb the possible divappr error: fall through
	 to an explicit remainder check.  */
      if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
	   (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd)- nsh - 1)))) == 0)
	{
	  mp_limb_t cy;
	  /* Approximation is not good enough, the extra limb(+ nsh bits)
	     is smaller than needed to absorb the possible error. */
	  /* {qp + 1, l + 1} equals 2*{sp, l} */
	  /* FIXME: use mullo or wrap-around, or directly evaluate
	     remainder with a single sqrmod_bnm1. */
	  TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
	  ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
	  /* Compute the remainder of the previous mpn_div(appr)_q. */
	  cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
	  MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
	  if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
	    {
	      /* May happen only if div result was not exact. */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
	      cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
#else
	      cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
#endif
	      ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
	      MPN_DECR_U (sp, l, 1);
	    }
	  /* Can the root be exact when a correction was needed? We
	     did not find an example, but it depends on divappr
	     internals, and we can not assume it true in general...*/
	  /* else */
#else /* WANT_ASSERT */
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
#endif
#endif
	  if (mpn_zero_p (tp + l + 1, h - l))
	    {
	      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
	      /* Compare the full remainder against (low root)^2 to decide
		 exactness, and fix a possible off-by-one in the root.  */
	      mpn_sqr (scratch, sp, l);
	      c = mpn_cmp (tp + 1, scratch + l, l);
	      if (c == 0)
		{
		  /* Also compare the lowest limbs, normalized the same way. */
		  if (nsh != 0)
		    {
		      mpn_lshift (tp, np, l, 2 * nsh);
		      np = tp;
		    }
		  c = mpn_cmp (np, scratch + odd, l - odd);
		}
	      if (c < 0)
		{
		  MPN_DECR_U (sp, l, 1);
		  c = 1;
		}
	    }
	}
    }
  TMP_FREE;

  /* Undo the normalization: the root of N*4^nsh (with an odd dummy
     half-limb) is the root of N shifted left by nsh (+ half-limb).  */
  if ((odd | nsh) != 0)
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
  return c;	/* 0 iff the operand was a perfect square */
}


/* Public entry point: write floor(sqrt({np, nn})) to {sp, ceil(nn/2)} and,
   if rp != NULL, the remainder to {rp, rn}.  Returns rn, the limb size of
   the remainder (0 when the operand is a perfect square).  When rp == NULL
   only the root is computed; for nn > 8 this uses the faster remainder-free
   mpn_dc_sqrt path and the return value only distinguishes zero/nonzero
   remainder.  Overlap constraints are spelled out by the ASSERTs below.  */
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn > 0);
  ASSERT_MPN (np, nn);

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  /* Compute the normalization count c: {np, nn} shifted left by 2c bits
     puts the top limb in [B/4, B).  */
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
    c = 0;
  else
    {
      count_leading_zeros (c, high);
      c -= GMP_NAIL_BITS;

      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
    }
  /* One-limb operand: handled entirely by mpn_sqrtrem1.  */
  if (nn == 1) {
    if (c == 0)
      {
	sp[0] = mpn_sqrtrem1 (&rl, high);
	if (rp != NULL)
	  rp[0] = rl;
      }
    else
      {
	/* sqrt(N*4^c) = sqrt(N) << c; shift back and recompute the
	   remainder directly.  */
	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
	sp[0] = cc;
	if (rp != NULL)
	  rp[0] = rl = high - cc*cc;
      }
    return rl != 0;
  }
  /* Two-limb operand: handled by mpn_sqrtrem2.  */
  if (nn == 2) {
    mp_limb_t tp [2];
    if (rp == NULL) rp = tp;
    if (c == 0)
      {
#if SQRTREM2_INPLACE
	rp[1] = high;
	rp[0] = np[0];
	cc = CALL_SQRTREM2_INPLACE (sp, rp);
#else
	cc = mpn_sqrtrem2 (sp, rp, np);
#endif
	rp[1] = cc;
	return ((rp[0] | cc) != 0) + cc;
      }
    else
      {
	rl = np[0];
	rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
	rp[0] = rl << (2*c);
	CALL_SQRTREM2_INPLACE (sp, rp);
	cc = sp[0] >>= c;	/* c != 0, the highest bit of the root cc is 0. */
	rp[0] = rl -= cc*cc;	/* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
	return rl != 0;
      }
  }
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  /* Root-only request on a large operand: use the remainder-free path.  */
  if ((rp == NULL) && (nn > 8))
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
  TMP_MARK;
  if (((nn & 1) | c) != 0)
    {
      /* Odd size and/or unnormalized input: work on a shifted copy of
	 2tn limbs, then shift root and remainder back.  */
      mp_limb_t s0[1], mask;
      mp_ptr tp, scratch;
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
      tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
      else
	MPN_COPY (tp + (nn & 1), np, nn);
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;		/* c now represents k */
      mask = (CNST_LIMB (1) << c) - 1;
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
      s0[0] = sp[0] & mask;	/* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);		/* R = R - s0^2 */
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);	/* true root = S >> k */
      tp[tn] = rl;
      if (rp == NULL)
	rp = tp;
      /* The remainder must be shifted right by 2k bits; split that into
	 whole-limb and in-limb parts.  */
      c = c << 1;
      if (c < GMP_NUMB_BITS)
	tn++;
      else
	{
	  tp++;
	  c -= GMP_NUMB_BITS;
	}
      if (c != 0)
	mpn_rshift (rp, tp, tn, c);
      else
	MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      /* Even size, already normalized: compute in place over rp.  */
      if (rp != np)
	{
	  if (rp == NULL) /* nn <= 8 */
	    rp = TMP_SALLOC_LIMBS (nn);
	  MPN_COPY (rp, np, nn);
	}
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
    }

  MPN_NORMALIZE (rp, rn);	/* strip high zero limbs from the remainder */

  TMP_FREE;
  return rn;
}