/* mpn_sqrtrem -- square root and remainder

   Contributed to the GNU project by Paul Zimmermann (most code) and
   Torbjorn Granlund (mpn_sqrtrem1).

   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH A
   MUTABLE INTERFACE.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
   INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR
   DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 1999, 2000, 2001, 2002, 2004, 2005, 2008, 2010 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/* See "Karatsuba Square Root", reference in gmp.texi.  */


#include <stdio.h>
#include <stdlib.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

static const unsigned short invsqrttab[384] =
{
  0x1ff,0x1fd,0x1fb,0x1f9,0x1f7,0x1f5,0x1f3,0x1f2, /* sqrt(1/80)..sqrt(1/87) */
  0x1f0,0x1ee,0x1ec,0x1ea,0x1e9,0x1e7,0x1e5,0x1e4, /* sqrt(1/88)..sqrt(1/8f) */
  0x1e2,0x1e0,0x1df,0x1dd,0x1db,0x1da,0x1d8,0x1d7, /* sqrt(1/90)..sqrt(1/97) */
  0x1d5,0x1d4,0x1d2,0x1d1,0x1cf,0x1ce,0x1cc,0x1cb, /* sqrt(1/98)..sqrt(1/9f) */
  0x1c9,0x1c8,0x1c6,0x1c5,0x1c4,0x1c2,0x1c1,0x1c0, /* sqrt(1/a0)..sqrt(1/a7) */
  0x1be,0x1bd,0x1bc,0x1ba,0x1b9,0x1b8,0x1b7,0x1b5, /* sqrt(1/a8)..sqrt(1/af) */
  0x1b4,0x1b3,0x1b2,0x1b0,0x1af,0x1ae,0x1ad,0x1ac, /* sqrt(1/b0)..sqrt(1/b7) */
  0x1aa,0x1a9,0x1a8,0x1a7,0x1a6,0x1a5,0x1a4,0x1a3, /* sqrt(1/b8)..sqrt(1/bf) */
  0x1a2,0x1a0,0x19f,0x19e,0x19d,0x19c,0x19b,0x19a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x199,0x198,0x197,0x196,0x195,0x194,0x193,0x192, /* sqrt(1/c8)..sqrt(1/cf) */
  0x191,0x190,0x18f,0x18e,0x18d,0x18c,0x18c,0x18b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x18a,0x189,0x188,0x187,0x186,0x185,0x184,0x183, /* sqrt(1/d8)..sqrt(1/df) */
  0x183,0x182,0x181,0x180,0x17f,0x17e,0x17e,0x17d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x17c,0x17b,0x17a,0x179,0x179,0x178,0x177,0x176, /* sqrt(1/e8)..sqrt(1/ef) */
  0x176,0x175,0x174,0x173,0x172,0x172,0x171,0x170, /* sqrt(1/f0)..sqrt(1/f7) */
  0x16f,0x16f,0x16e,0x16d,0x16d,0x16c,0x16b,0x16a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x16a,0x169,0x168,0x168,0x167,0x166,0x166,0x165, /* sqrt(1/100)..sqrt(1/107) */
  0x164,0x164,0x163,0x162,0x162,0x161,0x160,0x160, /* sqrt(1/108)..sqrt(1/10f) */
  0x15f,0x15e,0x15e,0x15d,0x15c,0x15c,0x15b,0x15a, /* sqrt(1/110)..sqrt(1/117) */
  0x15a,0x159,0x159,0x158,0x157,0x157,0x156,0x156, /* sqrt(1/118)..sqrt(1/11f) */
  0x155,0x154,0x154,0x153,0x153,0x152,0x152,0x151, /* sqrt(1/120)..sqrt(1/127) */
  0x150,0x150,0x14f,0x14f,0x14e,0x14e,0x14d,0x14d, /* sqrt(1/128)..sqrt(1/12f) */
  0x14c,0x14b,0x14b,0x14a,0x14a,0x149,0x149,0x148, /* sqrt(1/130)..sqrt(1/137) */
  0x148,0x147,0x147,0x146,0x146,0x145,0x145,0x144, /* sqrt(1/138)..sqrt(1/13f) */
  0x144,0x143,0x143,0x142,0x142,0x141,0x141,0x140, /* sqrt(1/140)..sqrt(1/147) */
  0x140,0x13f,0x13f,0x13e,0x13e,0x13d,0x13d,0x13c, /* sqrt(1/148)..sqrt(1/14f) */
  0x13c,0x13b,0x13b,0x13a,0x13a,0x139,0x139,0x139, /* sqrt(1/150)..sqrt(1/157) */
  0x138,0x138,0x137,0x137,0x136,0x136,0x135,0x135, /* sqrt(1/158)..sqrt(1/15f) */
  0x135,0x134,0x134,0x133,0x133,0x132,0x132,0x132, /* sqrt(1/160)..sqrt(1/167) */
  0x131,0x131,0x130,0x130,0x12f,0x12f,0x12f,0x12e, /* sqrt(1/168)..sqrt(1/16f) */
  0x12e,0x12d,0x12d,0x12d,0x12c,0x12c,0x12b,0x12b, /* sqrt(1/170)..sqrt(1/177) */
  0x12b,0x12a,0x12a,0x129,0x129,0x129,0x128,0x128, /* sqrt(1/178)..sqrt(1/17f) */
  0x127,0x127,0x127,0x126,0x126,0x126,0x125,0x125, /* sqrt(1/180)..sqrt(1/187) */
  0x124,0x124,0x124,0x123,0x123,0x123,0x122,0x122, /* sqrt(1/188)..sqrt(1/18f) */
  0x121,0x121,0x121,0x120,0x120,0x120,0x11f,0x11f, /* sqrt(1/190)..sqrt(1/197) */
  0x11f,0x11e,0x11e,0x11e,0x11d,0x11d,0x11d,0x11c, /* sqrt(1/198)..sqrt(1/19f) */
  0x11c,0x11b,0x11b,0x11b,0x11a,0x11a,0x11a,0x119, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x119,0x119,0x118,0x118,0x118,0x118,0x117,0x117, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x117,0x116,0x116,0x116,0x115,0x115,0x115,0x114, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x114,0x114,0x113,0x113,0x113,0x112,0x112,0x112, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x112,0x111,0x111,0x111,0x110,0x110,0x110,0x10f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x10f,0x10f,0x10f,0x10e,0x10e,0x10e,0x10d,0x10d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x10d,0x10c,0x10c,0x10c,0x10c,0x10b,0x10b,0x10b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x10a,0x10a,0x10a,0x10a,0x109,0x109,0x109,0x109, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x108,0x108,0x108,0x107,0x107,0x107,0x107,0x106, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x106,0x106,0x106,0x105,0x105,0x105,0x104,0x104, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x104,0x104,0x103,0x103,0x103,0x103,0x102,0x102, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x102,0x102,0x101,0x101,0x101,0x101,0x100,0x100  /* sqrt(1/1f8)..sqrt(1/1ff) */
};

/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */

#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000)	/* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000)	/* 0xfee6f < MAGIC < 0x29cbc8 */
#endif
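
/* Roughly speaking, subtracting MAGIC biases the last fixed-point step low,
   so that the approximation x0 computed in mpn_sqrtrem1 never exceeds
   floor(sqrt(a0)); the bounds in the comments above appear to delimit the
   constants for which the single conditional increment at the end of
   mpn_sqrtrem1 then yields floor(sqrt(a0)) exactly.  */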

static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */
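  /* Newton's method for g(x) = 1/x^2 - a gives the division-free
     iteration x' = x*(3 - a*x^2)/2; the fixed-point steps below compute
     truncated variants of it, roughly doubling the number of correct
     bits of x0 at each step.  */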

  abits = a0 >> (GMP_LIMB_BITS - 1 - 8);	/* extract bits for table lookup */
  x0 = invsqrttab[abits - 0x80];		/* initial 1/sqrt(a) */

  /* x0 is now an 8-bit approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16-bit approximation of 1/sqrt(a0) */

  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full-limb approximation of sqrt(a0) */

  x2 = x0 * x0;
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}
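/* For example, with 64-bit limbs, a0 = 2^63 = 0x8000000000000000 gives
   x0 = floor(sqrt(2^63)) = 3037000499 and *rp = 2^63 - 3037000499^2
   = 5928526807.  */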


#define Prec (GMP_NUMB_BITS >> 1)

/* Same as mpn_sqrtrem, but for size=2 and {np, 2} normalized.
   Returns cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0].  */
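/* This is one step of the divide-and-conquer recurrence used in
   mpn_dc_sqrtrem below, carried out on half-limbs in scalar arithmetic:
   mpn_sqrtrem1 on the high limb supplies the high half sp0 of the root,
   and a single division by 2*sp0 supplies the low half.  */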
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
  mp_limb_t qhl, q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  qhl = 0;
  rp0 = rp[0];
  while (rp0 >= sp0)
    {
      qhl++;
      rp0 -= sp0;
    }
  /* now rp0 < sp0 < 2^Prec */
  rp0 = (rp0 << Prec) + (np0 >> Prec);
  u = 2 * sp0;
  q = rp0 / u;
  u = rp0 - q * u;
  q += (qhl & 1) << (Prec - 1);
  qhl >>= 1; /* if qhl=1, necessarily q=0 since qhl*2^Prec + q <= 2^Prec */
  /* now we have (initial rp0)<<Prec + np0>>Prec = (qhl<<Prec + q) * (2sp0) + u */
  sp0 = ((sp0 + qhl) << Prec) + q;
  cc = u >> Prec;
  rp0 = ((u << Prec) & GMP_NUMB_MASK) + (np0 & (((mp_limb_t) 1 << Prec) - 1));
  /* subtract q * q or qhl*2^(2*Prec) from rp */
  q2 = q * q;
  cc -= (rp0 < q2) + qhl;
  rp0 -= q2;
  /* now subtract 2*q*2^Prec + 2^(2*Prec) if qhl is set */
  if (cc < 0)
    {
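      /* sp0 is one too large: add 2*sp0 - 1 to the remainder (the old
	 sp0 here, the decremented sp0 below) and decrement sp0.  */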
      if (sp0 != 0)
	{
	  rp0 += sp0;
	  cc += rp0 < sp0;
	}
      else
	cc++;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}

/* Writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder; returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B = 2^GMP_NUMB_BITS.  */
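/* Write N = {np, 2n} = N1*B^(2l) + N0 with l = floor(n/2), h = n - l, and
   N1 of 2h limbs.  Following the paper cited above: recursively compute
   S1, R1 with N1 = S1^2 + R1, then divide R1*B^l plus the next l limbs of
   N by 2*S1, giving quotient Q and remainder U.  The root is
   S = S1*B^l + Q and the remainder is U*B^l + {np, l} - Q^2, which may be
   negative; in that case decrement S and add 2*S + 1.  The code divides by
   S1 and halves the quotient afterwards, with c receiving the bit shifted
   out.  */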
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c, b;			/* carry out of remainder */
  mp_size_t l, h;

  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  if (n == 1)
    c = mpn_sqrtrem2 (sp, np, np);
  else
    {
      l = n / 2;
      h = n - l;
      q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h);
      if (q != 0)
	mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h);
      q += mpn_divrem (sp, 0, np + l, n, sp + l, h);
      c = sp[0] & 1;
      mpn_rshift (sp, sp, l, 1);
      sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
      q >>= 1;
      if (c != 0)
	c = mpn_add_n (np + l, np + l, sp + l, h);
      mpn_sqr (np + n, sp, l);
      b = q + mpn_sub_n (np, np, np + n, 2 * l);
      c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);
      q = mpn_add_1 (sp + l, sp + l, h, q);

      if (c < 0)
	{
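	  /* S is one too large: set R = R + 2*S - 1 (computed with the old
	     S) and then decrement S.  */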
	  c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
	  c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
	  q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
	}
    }

  return c;
}


mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t *tp, s0[1], cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn >= 0);
  ASSERT_MPN (np, nn);

  /* If the operand {np, nn} is zero, both results are zero.  */
  if (nn == 0)
    return 0;

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (nn == 1 && (high & GMP_NUMB_HIGHBIT))
    {
      mp_limb_t r;
      sp[0] = mpn_sqrtrem1 (&r, high);
      if (rp != NULL)
	rp[0] = r;
      return r != 0;
    }
  count_leading_zeros (c, high);
  c -= GMP_NAIL_BITS;

  c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  TMP_MARK;
  if (nn % 2 != 0 || c > 0)
    {
      tp = TMP_ALLOC_LIMBS (2 * tn);
      tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
	mpn_lshift (tp + 2 * tn - nn, np, nn, 2 * c);
      else
	MPN_COPY (tp + 2 * tn - nn, np, nn);
      rl = mpn_dc_sqrtrem (sp, tp, tn);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0 = S mod 2^k.  */
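      /* Hence N = (S >> k)^2 + (R + 2*s0*S - s0^2) / 2^(2k); the code below
	 adds 2*s0*S - s0^2 to R, then shifts the root right by k bits and
	 the remainder right by 2k bits.  */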
      c += (nn % 2) * GMP_NUMB_BITS / 2;		/* c now represents k */
      s0[0] = sp[0] & (((mp_limb_t) 1 << c) - 1);	/* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
	rp = tp;
      c = c << 1;
      if (c < GMP_NUMB_BITS)
	tn++;
      else
	{
	  tp++;
	  c -= GMP_NUMB_BITS;
	}
      if (c != 0)
	mpn_rshift (rp, tp, tn, c);
      else
	MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      if (rp == NULL)
	rp = TMP_ALLOC_LIMBS (nn);
      if (rp != np)
	MPN_COPY (rp, np, nn);
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}
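/* Example use of the documented mpn interface (a minimal sketch; the limb
   values are arbitrary).  For nn input limbs, sp needs (nn + 1) / 2 limbs
   and rp, when non-NULL, nn limbs; the return value is the limb count of
   the remainder, so it is zero exactly when the input is a perfect square:

   #include <stdio.h>
   #include <gmp.h>

   int
   main (void)
   {
     mp_limb_t np[2] = { 123, 456 };	// hypothetical N = 456*B + 123
     mp_limb_t sp[1], rp[2];
     mp_size_t rn = mpn_sqrtrem (sp, rp, np, 2);	// sp[0] = floor(sqrt(N))
     printf ("root %lu, remainder limb count %ld\n",
	     (unsigned long) sp[0], (long) rn);
     return 0;
   }
*/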