/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, result may be
   non-normalized too). Temporary space needed is M->n + n.
 */
static mp_size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
                        mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t  = u00 * a
     r  = u10 * b
     r += t;

     t  = u11 * b
     b  = u01 * a
     b += t;
  */

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }
  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ( (ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ( (rp[n-1] | bp[n-1]) == 0)
        n--;
    }

  return n;
}
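/* Illustration only: the update above is the 2x2 matrix-vector product
   (r; b') = (a; b) M at the limb level.  The sketch below shows the same
   computation using the public mpz API; the function and variable names
   are hypothetical, and the snippet is not compiled into the library.  */
#if 0
#include <gmp.h>

static void
mul_matrix_vector_sketch (mpz_t r, mpz_t a, mpz_t b,
                          const mpz_t u00, const mpz_t u01,
                          const mpz_t u10, const mpz_t u11)
{
  mpz_t t;
  mpz_init (t);

  mpz_mul (t, u00, a);          /* t = u00 * a */
  mpz_mul (r, u10, b);          /* r = u10 * b */
  mpz_add (r, r, t);            /* r = u00 a + u10 b */

  mpz_mul (t, u11, b);          /* t = u11 * b (old b) */
  mpz_mul (b, u01, a);          /* b = u01 * a */
  mpz_add (b, b, t);            /* b = u01 a + u11 b */

  mpz_clear (t);
}
#endif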
#define COMPUTE_V_ITCH(n) (2*(n))

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. max(a, b) is of size n, u and
   v at most size n, and v must have space for n+1 limbs. */
static mp_size_t
compute_v (mp_ptr vp,
           mp_srcptr ap, mp_srcptr bp, mp_size_t n,
           mp_srcptr gp, mp_size_t gn,
           mp_srcptr up, mp_size_t usize,
           mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);
  ASSERT (up[size-1] > 0);

  an = n;
  MPN_NORMALIZE (ap, an);
  ASSERT (gn <= an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
        return 0;
    }
  else
    { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a,
         (g + |u| a) always fits in (|usize| + an) limbs. */

      ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn));
      size -= (tp[size - 1] == 0);
    }

  /* Now divide t / b. There must be no remainder */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  mpn_divexact (vp, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  return vn;
}
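/* Illustration only: a minimal mpz-level sketch of what compute_v does,
   assuming g = u*a + v*b with u and v of opposite signs, so the final
   division is exact.  The name compute_v_sketch is hypothetical and the
   snippet is not compiled into the library.  */
#if 0
#include <gmp.h>

static void
compute_v_sketch (mpz_t v, const mpz_t a, const mpz_t b,
                  const mpz_t g, const mpz_t u)
{
  mpz_t t;
  mpz_init (t);

  mpz_mul (t, u, a);            /* t = u * a */
  mpz_sub (t, g, t);            /* t = g - u a */
  mpz_divexact (v, t, b);       /* exact, since g = u a + v b */

  mpz_clear (t);
}
#endif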
/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and
   2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
   for u, T+1 for v and 2T scratch space. In all, 7T + 3 is
   sufficient for both operations. */

/* Optimal choice of p seems difficult. In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of the u. It may be desirable to use a different
 * choice of p in each iteration. Also the input size seems to matter;
 * choosing p = n / 3 in the first iteration seems to improve
 * performance slightly for input size just above the threshold, but
 * degrade performance for larger inputs. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)
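/* Illustration only: mpn_gcdext below reduces (a,b) while maintaining
   cofactors u0, u1 such that B = u0 a + u1 b, where B is the original
   second operand (see the comment after the main loop).  The following
   hypothetical mpz-level helper, not compiled into the library, shows
   how that invariant could be checked.  */
#if 0
#include <gmp.h>

static int
cofactor_invariant_holds (const mpz_t B, const mpz_t a, const mpz_t b,
                          const mpz_t u0, const mpz_t u1)
{
  mpz_t t;
  int ok;

  mpz_init (t);
  mpz_mul (t, u0, a);           /* t = u0 * a */
  mpz_addmul (t, u1, b);        /* t = u0 a + u1 b */
  ok = (mpz_cmp (t, B) == 0);
  mpz_clear (t);

  return ok;
}
#endif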
mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
            mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  struct gcdext_ctx ctx;
  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);
  ASSERT (bp[n-1] > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
        talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
         copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
        + 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
        talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
        {
          MPN_COPY (gp, bp, n);
          *usizep = 0;
          TMP_FREE;
          return n;
        }
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  ctx.gp = gp;
  ctx.up = up;
  ctx.usize = usizep;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
        ASSERT (M.n <= (n - p - 1)/2);
        ASSERT (M.n + p <= (p + n - 1) / 2);

        /* Temporary storage 2 (p + M->n) <= p + n - 1 */
        n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

        MPN_COPY (u0, M.p[1][0], M.n);
        MPN_COPY (u1, M.p[1][1], M.n);
        un = M.n;
        while ( (u0[un-1] | u1[un-1] ) == 0)
          un--;
      }
    else
      {
        /* mpn_hgcd has failed. Then either one of a or b is very
           small, or the difference is very small. Perform one
           subtraction followed by one division. */
        u1[0] = 1;

        ctx.u0 = u0;
        ctx.u1 = u1;
        ctx.tp = tp + n; /* ualloc */
        ctx.un = 1;

        /* Temporary storage n */
        n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
        if (n == 0)
          {
            TMP_FREE;
            return ctx.gn;
          }

        un = ctx.un;
        ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
        {
          mp_ptr t0;

          t0 = tp + matrix_scratch;
          ASSERT (M.n <= (n - p - 1)/2);
          ASSERT (M.n + p <= (p + n - 1) / 2);

          /* Temporary storage 2 (p + M->n) <= p + n - 1 */
          n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

          /* By the same analysis as for mpn_hgcd_matrix_mul */
          ASSERT (M.n + un <= ualloc);

          /* FIXME: This copying could be avoided by some swapping of
           * pointers. May need more temporary storage, though. */
          MPN_COPY (t0, u0, un);

          /* Temporary storage ualloc */
          un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

          ASSERT (un < ualloc);
          ASSERT ( (u0[un-1] | u1[un-1]) > 0);
        }
      else
        {
          /* mpn_hgcd has failed. Then either one of a or b is very
             small, or the difference is very small. Perform one
             subtraction followed by one division. */
          ctx.u0 = u0;
          ctx.u1 = u1;
          ctx.tp = tp + n; /* ualloc */
          ctx.un = un;

          /* Temporary storage n */
          n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
          if (n == 0)
            {
              TMP_FREE;
              return ctx.gn;
            }

          un = ctx.un;
          ASSERT (un < ualloc);
        }
    }
  /* We have    A = ... a + ... b
                B =  u0 a +  u1 b

                a =  u1 A + ... B
                b = -u0 A + ... B

     with bounds

       |u0|, |u1| <= B / min(a, b)

     We always have u1 > 0, and u0 == 0 is possible only if u1 == 1,
     in which case the only reduction done so far is a = A - k B for
     some k.

     Compute g = u a + v b = (u u1 - v u0) A + (...) B
     Here, u, v are bounded by

       |u| <= b,
       |v| <= a
  */

  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
         this case we choose the cofactor + 1, corresponding to G = A
         - k B, rather than -1, corresponding to G = - A + (k+1) B. */
      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
      if (c < 0)
        {
          MPN_NORMALIZE (u0, un);
          MPN_COPY (up, u0, un);
          *usizep = -un;
        }
      else
        {
          MPN_NORMALIZE_NOT_ZERO (u1, un);
          MPN_COPY (up, u1, un);
          *usizep = un;
        }

      TMP_FREE;
      return n;
    }
  else if (UNLIKELY (u0[0] == 0) && un == 1)
    {
      mp_size_t gn;
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      ASSERT (u0n > 0);

      if (lehmer_un == 0)
        {
          /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */
          MPN_COPY (up, u0, u0n);
          *usizep = -u0n;

          TMP_FREE;
          return gn;
        }

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
                             ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
        negate = 0;
      else
        {
          lehmer_un = -lehmer_un;
          negate = 1;
        }

      u1n = un;
      MPN_NORMALIZE (u1, u1n);
      ASSERT (u1n > 0);

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* We may still have v == 0 */

      /* Compute u u1 */
      if (lehmer_un <= u1n)
        /* Should be the common case */
        mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
        mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
        {
          mp_limb_t cy;

          /* Overwrites old u1 value */
          if (lehmer_vn <= u0n)
            /* Should be the common case */
            mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
          else
            mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

          u1n = u0n + lehmer_vn;
          u1n -= (u1[u1n - 1] == 0);

          if (u1n <= un)
            {
              cy = mpn_add (up, up, un, u1, u1n);
            }
          else
            {
              cy = mpn_add (up, u1, u1n, up, un);
              un = u1n;
            }
          up[un] = cy;
          un += (cy != 0);

          ASSERT (un < ualloc);
        }
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
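/* Illustration only: applications normally reach this code through the
   public mpz_gcdext function.  The standalone demo below, not compiled
   into the library (the operand values are arbitrary), computes
   g = gcd(a,b) together with cofactors s, t such that g = s*a + t*b,
   and verifies the identity.  Build against GMP, e.g. cc demo.c -lgmp.  */
#if 0
#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mpz_t a, b, g, s, t, chk;
  mpz_inits (a, b, g, s, t, chk, NULL);

  mpz_set_str (a, "123456789012345678901234567890", 10);
  mpz_set_str (b, "987654321098765432109876543210", 10);

  mpz_gcdext (g, s, t, a, b);   /* g = s*a + t*b */

  mpz_mul (chk, s, a);
  mpz_addmul (chk, t, b);       /* chk = s*a + t*b */

  gmp_printf ("g = %Zd\nBezout identity holds: %s\n",
              g, mpz_cmp (chk, g) == 0 ? "yes" : "no");

  mpz_clears (a, b, g, s, t, chk, NULL);
  return 0;
}
#endif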