/* mpn_sqrtrem -- square root and remainder

   Contributed to the GNU project by Paul Zimmermann (most code),
   Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).

   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
   INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
   IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
   FUTURE GMP RELEASE.

Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/* See "Karatsuba Square Root", reference in gmp.texi.  */


#include <stdio.h>
#include <stdlib.h>

#include "gmp-impl.h"
#include "longlong.h"
#define USE_DIVAPPR_Q 1
#define TRACE(x)

static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
};

/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */

#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000)	/* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000)		/* 0xfee6f < MAGIC < 0x29cbc8 */
#endif
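
/* A plausible reading of MAGIC: it biases the residual in the last Newton
   step of mpn_sqrtrem1 so that the resulting approximation errs on the low
   side, which the final conditional increment then repairs.  The bounds in
   the comments above bracket the admissible constants; 2^40 (resp. 2^20)
   is simply a convenient power of two inside that range.  */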

static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */
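
  /* Sketch of one exact-arithmetic Newton step for 1/sqrt(a): given
     y ~= 1/sqrt(a) with error e = 1 - a*y^2,
	y' = y*(3 - a*y^2)/2 = y + y*e/2
     roughly doubles the number of correct bits.  The fixed-point code
     below follows this shape; the last step forms x = a*y ~= sqrt(a)
     and refines that directly.  */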

  abits = a0 >> (GMP_LIMB_BITS - 1 - 8);	/* extract bits for table lookup */
  x0 = 0x100 | invsqrttab[abits - 0x80];	/* initial 1/sqrt(a) */

  /* x0 is now an 8-bit approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16-bit approximation of 1/sqrt(a0) */

  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full-limb approximation of sqrt(a0) */

  x2 = x0 * x0;
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}
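
/* Worked example (64-bit limbs, so the precondition is a0 >= 2^62): for
   a0 = 2^63 this returns 3037000499 and sets *rp = 5928526807, since
   3037000499^2 + 5928526807 = 2^63 while
   (3037000499 + 1)^2 = 9223372037000250000 > 2^63.  */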


#define Prec (GMP_NUMB_BITS >> 1)
#if ! defined(SQRTREM2_INPLACE)
#define SQRTREM2_INPLACE 0
#endif

/* Same as mpn_sqrtrem, but for size=2 and {np, 2} normalized;
   returns cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0].  */
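/* In outline (matching the code below): the high half of the root comes
   from mpn_sqrtrem1 on the high limb.  Writing the root as s1*2^Prec + q,
   the low half q is obtained by dividing the partial remainder (with the
   next half-limb of the operand appended) by 2*s1, the factor 2 being
   folded into the shift counts; q*q is then subtracted from the remainder,
   with a conditional correction step at the end.  */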
#if SQRTREM2_INPLACE
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
{
  mp_srcptr np = rp;
#else
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
#endif
  mp_limb_t q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  rp0 = rp[0];
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
  q = rp0 / sp0;
  /* q <= 2^Prec; if q = 2^Prec, reduce the overestimate. */
  q -= q >> Prec;
  /* now we have q < 2^Prec */
  u = rp0 - q * sp0;
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
  sp0 = (sp0 << Prec) | q;
  cc = u >> (Prec - 1);
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
  /* subtract q * q from rp */
  q2 = q * q;
  cc -= rp0 < q2;
  rp0 -= q2;
  if (cc < 0)
    {
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}

/* Writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder, and returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B=2^GMP_NUMB_BITS.
   Needs a scratch of n/2+1 limbs. */
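/* Sketch of the recursion, following the "Karatsuba Square Root" paper
   cited above.  With l = n/2 and h = n - l, split the normalized input as
   {np, 2n} = hi*B^(2l) + a1*B^l + a0, where hi has 2h limbs and a1, a0
   have l limbs each:
     (s1, r1) = sqrtrem (hi)			recursive call
     (q, u)   = divrem (r1*B^l + a1, 2*s1)
     s        = s1*B^l + q
     r        = u*B^l + a0 - q^2		may be negative
   If r < 0, set s = s - 1 and r = r + 2*s + 1 (with the new s).  The code
   below divides by s1 instead of 2*s1, folding the factor 2 into a
   one-bit right shift of the quotient.  */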
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c, b;			/* carry out of remainder */
  mp_size_t l, h;

  ASSERT (n > 1);
  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  l = n / 2;
  h = n - l;
  if (h == 1)
    q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
  else
    q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
  TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
  mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
  q += scratch[l];
  c = scratch[0] & 1;
  mpn_rshift (sp, scratch, l, 1);
  sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
  if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
    return 1; /* Remainder is non-zero */
  q >>= 1;
  if (c != 0)
    c = mpn_add_n (np + l, np + l, sp + l, h);
  TRACE(printf("sqr(,,%u)\n", (unsigned) l));
  mpn_sqr (np + n, sp, l);
  b = q + mpn_sub_n (np, np, np + n, 2 * l);
  c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

  if (c < 0)
    {
      q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
      c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
      c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
      c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
      q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
    }

  return c;
}

#if USE_DIVAPPR_Q
static void
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  gmp_pi1_t inv;
  mp_limb_t qh;
  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  MPN_COPY (scratch, np, nn);
  invert_pi1 (inv, dp[dn-1], dp[dn-2]);
  if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
    qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
  else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
    qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
  else
    {
      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
      TMP_DECL;
      TMP_MARK;
      /* Sadly, scratch is too small. */
      qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
      TMP_FREE;
    }
  qp[nn - dn] = qh;
}
#endif
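
/* The divappr_q functions return an approximation of the quotient that,
   as relied upon here, is never below the truncated quotient but may
   exceed it slightly; mpn_dc_sqrt below therefore only ever corrects the
   root downwards (see the mpn_cmp ASSERT and the MPN_DECR_U fixup
   there).  */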

/* Writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
   and returns zero if the operand was a perfect square, one otherwise.
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
   where B=2^GMP_NUMB_BITS.
   THINK: In the odd case, three more (dummy) limbs are taken into account;
   when nsh is maximal, two limbs are discarded from the result of the
   division.  Too much?  Is a single dummy limb enough? */
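/* Unlike mpn_dc_sqrtrem, this computes the root only: the high half of the
   root is exact, the low half comes from a single (possibly approximate)
   division, and the remainder is reconstructed only when the quotient's
   discarded low bits are too small to guarantee the rounding, or when a
   perfect square must be recognized.  */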
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c;			/* carry out of remainder */
  mp_size_t l, h;
  mp_ptr qp, tp, scratch;
  TMP_DECL;
  TMP_MARK;

  ASSERT (np[2 * n - 1 - odd] != 0);
  ASSERT (n > 4);
  ASSERT (nsh < GMP_NUMB_BITS / 2);

  l = (n - 1) / 2;
  h = n - l;
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
  if (nsh != 0)
    {
      /* o is used to set the lowest bits of the dividend exactly; is it needed? */
      int o = l > (1 + odd);
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
    }
  else
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
  qp = tp + n + 1; /* l + 2 */
  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
#if USE_DIVAPPR_Q
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
#else
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
#endif
  q += qp[l + 1];
  c = 1;
  if (q > 1)
    {
      /* FIXME: if nsh != 0 we will shift later, a no-op on this area. */
      MPN_FILL (sp, l, GMP_NUMB_MAX);
    }
  else
    {
      /* FIXME: if nsh != 0 we will shift again later; shift just once. */
      mpn_rshift (sp, qp + 1, l, 1);
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
      if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
	   (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd) - nsh - 1)))) == 0)
	{
	  mp_limb_t cy;
	  /* The approximation is not good enough: the extra limb (+ nsh
	     bits) is smaller than needed to absorb the possible error. */
	  /* {qp + 1, l + 1} equals 2*{sp, l} */
	  /* FIXME: use mullo or wrap-around, or directly evaluate
	     remainder with a single sqrmod_bnm1. */
	  TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
	  ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
	  /* Compute the remainder of the previous mpn_div(appr)_q. */
	  cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
	  MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
	  if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
	    {
	      /* May happen only if the div result was not exact. */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
	      cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
#else
	      cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
#endif
	      ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
	      MPN_DECR_U (sp, l, 1);
	    }
	  /* Can the root be exact when a correction was needed?  We
	     did not find an example, but it depends on divappr
	     internals, and we cannot assume it is true in general... */
	  /* else */
#else /* WANT_ASSERT */
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
#endif
#endif
	  if (mpn_zero_p (tp + l + 1, h - l))
	    {
	      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
	      mpn_sqr (scratch, sp, l);
	      c = mpn_cmp (tp + 1, scratch + l, l);
	      if (c == 0)
		{
		  if (nsh != 0)
		    {
		      mpn_lshift (tp, np, l, 2 * nsh);
		      np = tp;
		    }
		  c = mpn_cmp (np, scratch + odd, l - odd);
		}
	      if (c < 0)
		{
		  MPN_DECR_U (sp, l, 1);
		  c = 1;
		}
	    }
	}
    }
  TMP_FREE;

  if ((odd | nsh) != 0)
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
  return c;
}


mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn > 0);
  ASSERT_MPN (np, nn);

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
    c = 0;
  else
    {
      count_leading_zeros (c, high);
      c -= GMP_NAIL_BITS;

      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
    }
  if (nn == 1) {
    if (c == 0)
      {
	sp[0] = mpn_sqrtrem1 (&rl, high);
	if (rp != NULL)
	  rp[0] = rl;
      }
    else
      {
	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
	sp[0] = cc;
	if (rp != NULL)
	  rp[0] = rl = high - cc*cc;
      }
    return rl != 0;
  }
  if (nn == 2) {
    mp_limb_t tp [2];
    if (rp == NULL) rp = tp;
    if (c == 0)
      {
#if SQRTREM2_INPLACE
	rp[1] = high;
	rp[0] = np[0];
	cc = CALL_SQRTREM2_INPLACE (sp, rp);
#else
	cc = mpn_sqrtrem2 (sp, rp, np);
#endif
	rp[1] = cc;
	return ((rp[0] | cc) != 0) + cc;
      }
    else
      {
	rl = np[0];
	rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
	rp[0] = rl << (2*c);
	CALL_SQRTREM2_INPLACE (sp, rp);
	cc = sp[0] >>= c;	/* c != 0, the highest bit of the root cc is 0. */
	rp[0] = rl -= cc*cc;	/* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
	return rl != 0;
      }
  }
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  if ((rp == NULL) && (nn > 8))
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
  TMP_MARK;
  if (((nn & 1) | c) != 0)
    {
      mp_limb_t s0[1], mask;
      mp_ptr tp, scratch;
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
      tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
      else
	MPN_COPY (tp + (nn & 1), np, nn);
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;		/* c now represents k */
      mask = (CNST_LIMB (1) << c) - 1;
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
      s0[0] = sp[0] & mask;	/* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
	rp = tp;
      c = c << 1;
      if (c < GMP_NUMB_BITS)
	tn++;
      else
	{
	  tp++;
	  c -= GMP_NUMB_BITS;
	}
      if (c != 0)
	mpn_rshift (rp, tp, tn, c);
      else
	MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      if (rp != np)
	{
	  if (rp == NULL) /* nn <= 8 */
	    rp = TMP_SALLOC_LIMBS (nn);
	  MPN_COPY (rp, np, nn);
	}
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}
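/* Illustrative use of the public entry point (assuming 64-bit limbs, so
   B = 2^64; the operand and result areas here are examples only):

     mp_limb_t np[2] = { 5, 1 };	operand {np, 2} = B + 5
     mp_limb_t sp[1], rp[2];
     mp_size_t rn = mpn_sqrtrem (sp, rp, np, 2);

   afterwards sp[0] = floor(sqrt(B + 5)) = 2^32, rp[0] = 5 with rn = 1;
   rn == 0 would indicate a perfect square.  */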