/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, result may be
   non-normalized too). Temporary space needed is M->n + n.
 */
static mp_size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
                        mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t  = u00 * a
     r  = u10 * b
     r += t;

     t  = u11 * b
     b  = u01 * a
     b += t;
  */
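  /* For example, with single-limb matrix entries u00 = 2, u01 = 1,
     u10 = 1, u11 = 1 and a = 7, b = 4 (illustrative values), this
     computes r = 2*7 + 1*4 = 18 and the new b = 1*7 + 1*4 = 11. */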

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }
  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ( (ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ( (rp[n-1] | bp[n-1]) == 0)
        n--;
    }

  return n;
}

#define COMPUTE_V_ITCH(n) (2*(n))

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. max(a, b) is of size n, u and
   v at most size n, and v must have space for n+1 limbs. */
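/* For example (illustrative values), a = 6, b = 4, g = 2, u = 1 gives
   |v| = |2 - 1*6| / 4 = 1; since u > 0, v = -1, and indeed
   1*6 + (-1)*4 = 2 = g. */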
static mp_size_t
compute_v (mp_ptr vp,
           mp_srcptr ap, mp_srcptr bp, mp_size_t n,
           mp_srcptr gp, mp_size_t gn,
           mp_srcptr up, mp_size_t usize,
           mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);
  ASSERT (up[size-1] > 0);

  an = n;
  MPN_NORMALIZE (ap, an);
  ASSERT (gn <= an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
        return 0;
    }
  else
    { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a,
         (g + |u| a) always fits in (|usize| + an) limbs. */

      ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn));
      size -= (tp[size - 1] == 0);
    }

  /* Now divide t / b. There must be no remainder */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  mpn_divexact (vp, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  return vn;
}

/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
   For the lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
   for u, T+1 for v and 2T scratch space. In all, 7T + 3 is
   sufficient for both operations.

*/
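
/* (The 7T + 3 bound above is the larger of the two phases: the final
   gcdext_lehmer call uses 3T + (4T + 3) = 7T + 3 limbs, while
   compute_v uses T + (T + 1) + 2T = 4T + 1, which fits within
   that.) */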

/* Optimal choice of p seems difficult. In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of the u. It may be desirable to use a different
 * choice of p in each iteration. Also the input size seems to matter;
 * choosing p = n / 3 in the first iteration seems to improve
 * performance slightly for input size just above the threshold, but
 * degrade performance for larger inputs. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)
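
/* For instance (illustrative numbers), with n = 3000 the first call
   uses p = n/2 = 1500, running hgcd on the high 1500 limbs, while
   later iterations use p = n/3 = 1000, running hgcd on the high
   2000 limbs. */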

mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
            mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  struct gcdext_ctx ctx;
  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);
  ASSERT (bp[n-1] > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
        talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
         copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
        + 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
        talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
        {
          MPN_COPY (gp, bp, n);
          *usizep = 0;
          TMP_FREE;
          return n;
        }
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  ctx.gp = gp;
  ctx.up = up;
  ctx.usize = usizep;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
        ASSERT (M.n <= (n - p - 1)/2);
        ASSERT (M.n + p <= (p + n - 1) / 2);

        /* Temporary storage 2 (p + M->n) <= p + n - 1 */
        n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

        MPN_COPY (u0, M.p[1][0], M.n);
        MPN_COPY (u1, M.p[1][1], M.n);
        un = M.n;
        while ( (u0[un-1] | u1[un-1] ) == 0)
          un--;
      }
    else
      {
        /* mpn_hgcd has failed. Then either one of a or b is very
           small, or the difference is very small. Perform one
           subtraction followed by one division. */
        u1[0] = 1;

        ctx.u0 = u0;
        ctx.u1 = u1;
        ctx.tp = tp + n; /* ualloc */
        ctx.un = 1;

        /* Temporary storage n */
        n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
        if (n == 0)
          {
            TMP_FREE;
            return ctx.gn;
          }

        un = ctx.un;
        ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
        {
          mp_ptr t0;

          t0 = tp + matrix_scratch;
          ASSERT (M.n <= (n - p - 1)/2);
          ASSERT (M.n + p <= (p + n - 1) / 2);

          /* Temporary storage 2 (p + M->n) <= p + n - 1 */
          n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

          /* By the same analysis as for mpn_hgcd_matrix_mul */
          ASSERT (M.n + un <= ualloc);

          /* FIXME: This copying could be avoided by some swapping of
           * pointers. May need more temporary storage, though. */
          MPN_COPY (t0, u0, un);

          /* Temporary storage ualloc */
          un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

          ASSERT (un < ualloc);
          ASSERT ( (u0[un-1] | u1[un-1]) > 0);
        }
      else
        {
          /* mpn_hgcd has failed. Then either one of a or b is very
             small, or the difference is very small. Perform one
             subtraction followed by one division. */
          ctx.u0 = u0;
          ctx.u1 = u1;
          ctx.tp = tp + n; /* ualloc */
          ctx.un = un;

          /* Temporary storage n */
          n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
          if (n == 0)
            {
              TMP_FREE;
              return ctx.gn;
            }

          un = ctx.un;
          ASSERT (un < ualloc);
        }
    }
  /* We have A = ... a + ... b
             B =  u0 a +  u1 b

             a = u1  A + ... B
             b = -u0 A + ... B

     with bounds

       |u0|, |u1| <= B / min(a, b)

     We always have u1 > 0, and u0 == 0 is possible only if u1 == 1,
     in which case the only reduction done so far is a = A - k B for
     some k.

     Compute g = u a + v b = (u u1 - v u0) A + (...) B
     Here, u, v are bounded by

       |u| <= b,
       |v| <= a
  */
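
  /* (Deriving the cofactor used below: substituting a = u1 A + ... B
     and b = -u0 A + ... B into g = u a + v b gives
     g = (u u1 - v u0) A + (...) B. Since u and v have opposite signs,
     |u u1 - v u0| = |u| u1 + |v| u0, which is the sum accumulated
     below; the result takes the sign of u.) */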

  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
         this case we choose the cofactor + 1, corresponding to G = A
         - k B, rather than -1, corresponding to G = - A + (k+1) B. */
      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
      if (c < 0)
        {
          MPN_NORMALIZE (u0, un);
          MPN_COPY (up, u0, un);
          *usizep = -un;
        }
      else
        {
          MPN_NORMALIZE_NOT_ZERO (u1, un);
          MPN_COPY (up, u1, un);
          *usizep = un;
        }

      TMP_FREE;
      return n;
    }
  else if (UNLIKELY (u0[0] == 0) && un == 1)
    {
      mp_size_t gn;
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      ASSERT (u0n > 0);

      if (lehmer_un == 0)
        {
          /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
          MPN_COPY (up, u0, u0n);
          *usizep = -u0n;

          TMP_FREE;
          return gn;
        }

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
                             ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
        negate = 0;
      else
        {
          lehmer_un = -lehmer_un;
          negate = 1;
        }

      u1n = un;
      MPN_NORMALIZE (u1, u1n);
      ASSERT (u1n > 0);

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* We may still have v == 0 */
      /* Compute u u1 */
      if (lehmer_un <= u1n)
        /* Should be the common case */
        mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
        mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
        {
          mp_limb_t cy;

          /* Overwrites old u1 value */
          if (lehmer_vn <= u0n)
            /* Should be the common case */
            mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
          else
            mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

          u1n = u0n + lehmer_vn;
          u1n -= (u1[u1n - 1] == 0);

          if (u1n <= un)
            {
              cy = mpn_add (up, up, un, u1, u1n);
            }
          else
            {
              cy = mpn_add (up, u1, u1n, up, un);
              un = u1n;
            }
          up[un] = cy;
          un += (cy != 0);

          ASSERT (un < ualloc);
        }
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
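
/* Usage sketch (illustrative values, not from the original source):

     mp_limb_t a[2] = { 0, 1 };   A = one limb shifted up
     mp_limb_t b[1] = { 6 };      B = 6
     mp_limb_t g[1], u[2];        g needs n limbs, u needs n + 1
     mp_size_t usize;
     mp_size_t gn = mpn_gcdext (g, u, &usize, a, 2, b, 1);

   On return, {g, gn} holds gcd (A, B) and {u, ABS (usize)} a cofactor
   u with g = u A + v B for some v; usize < 0 means u is negative.
   The preconditions are an >= n > 0 and bp[n-1] != 0, and both input
   operands are destroyed. */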