/* mulmod_bnm1.c -- multiplication mod B^n-1.

   Contributed to the GNU project by Niels Möller, Torbjorn Granlund and
   Marco Bodrato.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Inputs are {ap,rn} and {bp,rn}; output is {rp,rn}, computation is
   mod B^rn - 1, and values are semi-normalised; zero is represented
   as either 0 or B^n - 1.  Needs a scratch of 2rn limbs at tp.
   tp==rp is allowed.
*/ 36218885Sdimvoid 37218885Sdimmpn_bc_mulmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn, 38218885Sdim mp_ptr tp) 39218885Sdim{ 40218885Sdim mp_limb_t cy; 41218885Sdim 42218885Sdim ASSERT (0 < rn); 43218885Sdim 44218885Sdim mpn_mul_n (tp, ap, bp, rn); 45218885Sdim cy = mpn_add_n (rp, tp, tp + rn, rn); 46218885Sdim /* If cy == 1, then the value of rp is at most B^rn - 2, so there can 47218885Sdim * be no overflow when adding in the carry. */ 48218885Sdim MPN_INCR_U (rp, rn, cy); 49218885Sdim} 50218885Sdim 51218885Sdim 52218885Sdim/* Inputs are {ap,rn+1} and {bp,rn+1}; output is {rp,rn+1}, in 53218885Sdim semi-normalised representation, computation is mod B^rn + 1. Needs 54218885Sdim a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed. 55218885Sdim Output is normalised. */ 56218885Sdimstatic void 57218885Sdimmpn_bc_mulmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn, 58218885Sdim mp_ptr tp) 59218885Sdim{ 60218885Sdim mp_limb_t cy; 61218885Sdim 62218885Sdim ASSERT (0 < rn); 63218885Sdim 64218885Sdim mpn_mul_n (tp, ap, bp, rn + 1); 65218885Sdim ASSERT (tp[2*rn+1] == 0); 66218885Sdim ASSERT (tp[2*rn] < GMP_NUMB_MAX); 67218885Sdim cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn); 68218885Sdim rp[rn] = 0; 69218885Sdim MPN_INCR_U (rp, rn+1, cy ); 70218885Sdim} 71218885Sdim 72218885Sdim 73218885Sdim/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1) 74218885Sdim * 75218885Sdim * The result is expected to be ZERO if and only if one of the operand 76218885Sdim * already is. Otherwise the class [0] Mod(B^rn-1) is represented by 77218885Sdim * B^rn-1. This should not be a problem if mulmod_bnm1 is used to 78218885Sdim * combine results and obtain a natural number when one knows in 79218885Sdim * advance that the final value is less than (B^rn-1). 
80218885Sdim * Moreover it should not be a problem if mulmod_bnm1 is used to 81218885Sdim * compute the full product with an+bn <= rn, because this condition 82218885Sdim * implies (B^an-1)(B^bn-1) < (B^rn-1) . 83218885Sdim * 84218885Sdim * Requires 0 < bn <= an <= rn and an + bn > rn/2 85218885Sdim * Scratch need: rn + (need for recursive call OR rn + 4). This gives 86218885Sdim * 87218885Sdim * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4 88218885Sdim */ 89218885Sdimvoid 90218885Sdimmpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp) 91218885Sdim{ 92218885Sdim ASSERT (0 < bn); 93218885Sdim ASSERT (bn <= an); 94218885Sdim ASSERT (an <= rn); 95218885Sdim 96218885Sdim if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD)) 97218885Sdim { 98218885Sdim if (UNLIKELY (bn < rn)) 99218885Sdim { 100218885Sdim if (UNLIKELY (an + bn <= rn)) 101218885Sdim { 102218885Sdim mpn_mul (rp, ap, an, bp, bn); 103218885Sdim } 104218885Sdim else 105218885Sdim { 106218885Sdim mp_limb_t cy; 107218885Sdim mpn_mul (tp, ap, an, bp, bn); 108218885Sdim cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn); 109218885Sdim MPN_INCR_U (rp, rn, cy); 110218885Sdim } 111218885Sdim } 112218885Sdim else 113218885Sdim mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp); 114218885Sdim } 115218885Sdim else 116218885Sdim { 117218885Sdim mp_size_t n; 118218885Sdim mp_limb_t cy; 119218885Sdim mp_limb_t hi; 120218885Sdim 121218885Sdim n = rn >> 1; 122218885Sdim 123218885Sdim /* We need at least an + bn >= n, to be able to fit one of the 124218885Sdim recursive products at rp. Requiring strict inequality makes 125218885Sdim the coded slightly simpler. If desired, we could avoid this 126218885Sdim restriction by initially halving rn as long as rn is even and 127218885Sdim an + bn <= rn/2. 
*/ 128218885Sdim 129218885Sdim ASSERT (an + bn > n); 130218885Sdim 131218885Sdim /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1) 132218885Sdim and crt together as 133218885Sdim 134218885Sdim x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)] 135218885Sdim */ 136218885Sdim 137218885Sdim#define a0 ap 138218885Sdim#define a1 (ap + n) 139218885Sdim#define b0 bp 140218885Sdim#define b1 (bp + n) 141218885Sdim 142218885Sdim#define xp tp /* 2n + 2 */ 143218885Sdim /* am1 maybe in {xp, n} */ 144218885Sdim /* bm1 maybe in {xp + n, n} */ 145218885Sdim#define sp1 (tp + 2*n + 2) 146218885Sdim /* ap1 maybe in {sp1, n + 1} */ 147218885Sdim /* bp1 maybe in {sp1 + n + 1, n + 1} */ 148218885Sdim 149218885Sdim { 150218885Sdim mp_srcptr am1, bm1; 151218885Sdim mp_size_t anm, bnm; 152218885Sdim mp_ptr so; 153218885Sdim 154218885Sdim if (LIKELY (an > n)) 155218885Sdim { 156218885Sdim am1 = xp; 157218885Sdim cy = mpn_add (xp, a0, n, a1, an - n); 158218885Sdim MPN_INCR_U (xp, n, cy); 159218885Sdim anm = n; 160218885Sdim if (LIKELY (bn > n)) 161218885Sdim { 162218885Sdim bm1 = xp + n; 163218885Sdim cy = mpn_add (xp + n, b0, n, b1, bn - n); 164218885Sdim MPN_INCR_U (xp + n, n, cy); 165218885Sdim bnm = n; 166218885Sdim so = xp + 2*n; 167218885Sdim } 168218885Sdim else 169218885Sdim { 170218885Sdim so = xp + n; 171218885Sdim bm1 = b0; 172218885Sdim bnm = bn; 173218885Sdim } 174218885Sdim } 175218885Sdim else 176218885Sdim { 177218885Sdim so = xp; 178218885Sdim am1 = a0; 179218885Sdim anm = an; 180218885Sdim bm1 = b0; 181218885Sdim bnm = bn; 182218885Sdim } 183218885Sdim 184218885Sdim mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so); 185218885Sdim } 186218885Sdim 187218885Sdim { 188218885Sdim int k; 189218885Sdim mp_srcptr ap1, bp1; 190218885Sdim mp_size_t anp, bnp; 191218885Sdim 192218885Sdim if (LIKELY (an > n)) { 193218885Sdim ap1 = sp1; 194218885Sdim cy = mpn_sub (sp1, a0, n, a1, an - n); 195218885Sdim sp1[n] = 0; 196218885Sdim MPN_INCR_U (sp1, n + 1, cy); 197218885Sdim anp = 
n + ap1[n]; 198218885Sdim } else { 199218885Sdim ap1 = a0; 200218885Sdim anp = an; 201218885Sdim } 202218885Sdim 203218885Sdim if (LIKELY (bn > n)) { 204218885Sdim bp1 = sp1 + n + 1; 205218885Sdim cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n); 206218885Sdim sp1[2*n+1] = 0; 207218885Sdim MPN_INCR_U (sp1 + n + 1, n + 1, cy); 208218885Sdim bnp = n + bp1[n]; 209218885Sdim } else { 210218885Sdim bp1 = b0; 211218885Sdim bnp = bn; 212218885Sdim } 213218885Sdim 214218885Sdim if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD)) 215218885Sdim k=0; 216218885Sdim else 217218885Sdim { 218218885Sdim int mask; 219218885Sdim k = mpn_fft_best_k (n, 0); 220218885Sdim mask = (1<<k) -1; 221218885Sdim while (n & mask) {k--; mask >>=1;}; 222218885Sdim } 223218885Sdim if (k >= FFT_FIRST_K) 224218885Sdim xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k); 225218885Sdim else if (UNLIKELY (bp1 == b0)) 226218885Sdim { 227218885Sdim ASSERT (anp + bnp <= 2*n+1); 228218885Sdim ASSERT (anp + bnp > n); 229218885Sdim ASSERT (anp >= bnp); 230218885Sdim mpn_mul (xp, ap1, anp, bp1, bnp); 231218885Sdim anp = anp + bnp - n; 232218885Sdim ASSERT (anp <= n || xp[2*n]==0); 233218885Sdim anp-= anp > n; 234218885Sdim cy = mpn_sub (xp, xp, n, xp + n, anp); 235218885Sdim xp[n] = 0; 236218885Sdim MPN_INCR_U (xp, n+1, cy); 237218885Sdim } 238218885Sdim else 239218885Sdim mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp); 240218885Sdim } 241218885Sdim 242218885Sdim /* Here the CRT recomposition begins. 243218885Sdim 244218885Sdim xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1) 245218885Sdim Division by 2 is a bitwise rotation. 246218885Sdim 247218885Sdim Assumes xp normalised mod (B^n+1). 248218885Sdim 249218885Sdim The residue class [0] is represented by [B^n-1]; except when 250218885Sdim both input are ZERO. 
251218885Sdim */ 252218885Sdim 253218885Sdim#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc 254218885Sdim#if HAVE_NATIVE_mpn_rsh1add_nc 255218885Sdim cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */ 256218885Sdim hi = cy << (GMP_NUMB_BITS - 1); 257218885Sdim cy = 0; 258218885Sdim /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi 259218885Sdim overflows, i.e. a further increment will not overflow again. */ 260218885Sdim#else /* ! _nc */ 261218885Sdim cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */ 262218885Sdim hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */ 263218885Sdim cy >>= 1; 264218885Sdim /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that 265218885Sdim the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */ 266218885Sdim#endif 267218885Sdim#if GMP_NAIL_BITS == 0 268218885Sdim add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi); 269218885Sdim#else 270218885Sdim cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1); 271218885Sdim rp[n-1] ^= hi; 272218885Sdim#endif 273218885Sdim#else /* ! HAVE_NATIVE_mpn_rsh1add_n */ 274218885Sdim#if HAVE_NATIVE_mpn_add_nc 275218885Sdim cy = mpn_add_nc(rp, rp, xp, n, xp[n]); 276218885Sdim#else /* ! _nc */ 277218885Sdim cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */ 278218885Sdim#endif 279218885Sdim cy += (rp[0]&1); 280218885Sdim mpn_rshift(rp, rp, n, 1); 281218885Sdim ASSERT (cy <= 2); 282218885Sdim hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */ 283218885Sdim cy >>= 1; 284218885Sdim /* We can have cy != 0 only if hi = 0... */ 285218885Sdim ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0); 286218885Sdim rp[n-1] |= hi; 287218885Sdim /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */ 288218885Sdim#endif 289218885Sdim ASSERT (cy <= 1); 290218885Sdim /* Next increment can not overflow, read the previous comments about cy. 
*/ 291218885Sdim ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0)); 292218885Sdim MPN_INCR_U(rp, n, cy); 293218885Sdim 294218885Sdim /* Compute the highest half: 295218885Sdim ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n 296218885Sdim */ 297218885Sdim if (UNLIKELY (an + bn < rn)) 298218885Sdim { 299218885Sdim /* Note that in this case, the only way the result can equal 300218885Sdim zero mod B^{rn} - 1 is if one of the inputs is zero, and 301218885Sdim then the output of both the recursive calls and this CRT 302218885Sdim reconstruction is zero, not B^{rn} - 1. Which is good, 303218885Sdim since the latter representation doesn't fit in the output 304218885Sdim area.*/ 305218885Sdim cy = mpn_sub_n (rp + n, rp, xp, an + bn - n); 306218885Sdim 307218885Sdim /* FIXME: This subtraction of the high parts is not really 308218885Sdim necessary, we do it to get the carry out, and for sanity 309218885Sdim checking. */ 310218885Sdim cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n, 311218885Sdim xp + an + bn - n, rn - (an + bn), cy); 312218885Sdim ASSERT (an + bn == rn - 1 || 313218885Sdim mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn))); 314218885Sdim cy = mpn_sub_1 (rp, rp, an + bn, cy); 315218885Sdim ASSERT (cy == (xp + an + bn - n)[0]); 316218885Sdim } 317218885Sdim else 318218885Sdim { 319218885Sdim cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n); 320218885Sdim /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO. 321218885Sdim DECR will affect _at most_ the lowest n limbs. 
*/ 322218885Sdim MPN_DECR_U (rp, 2*n, cy); 323218885Sdim } 324218885Sdim#undef a0 325218885Sdim#undef a1 326218885Sdim#undef b0 327218885Sdim#undef b1 328218885Sdim#undef xp 329218885Sdim#undef sp1 330218885Sdim } 331218885Sdim} 332218885Sdim 333218885Sdimmp_size_t 334218885Sdimmpn_mulmod_bnm1_next_size (mp_size_t n) 335218885Sdim{ 336218885Sdim mp_size_t nh; 337218885Sdim 338218885Sdim if (BELOW_THRESHOLD (n, MULMOD_BNM1_THRESHOLD)) 339218885Sdim return n; 340218885Sdim if (BELOW_THRESHOLD (n, 4 * (MULMOD_BNM1_THRESHOLD - 1) + 1)) 341218885Sdim return (n + (2-1)) & (-2); 342218885Sdim if (BELOW_THRESHOLD (n, 8 * (MULMOD_BNM1_THRESHOLD - 1) + 1)) 343218885Sdim return (n + (4-1)) & (-4); 344218885Sdim 345218885Sdim nh = (n + 1) >> 1; 346218885Sdim 347218885Sdim if (BELOW_THRESHOLD (nh, MUL_FFT_MODF_THRESHOLD)) 348218885Sdim return (n + (8-1)) & (-8); 349218885Sdim 350218885Sdim return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 0)); 351218885Sdim} 352223017Sdim