/* mulmod_bnm1.c -- multiplication mod B^n-1.

   Contributed to the GNU project by Niels Möller, Torbjorn Granlund and
   Marco Bodrato.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
26218885Sdim
27218885Sdim
28218885Sdim#include "gmp.h"
29218885Sdim#include "gmp-impl.h"
30218885Sdim#include "longlong.h"
31218885Sdim
32218885Sdim/* Inputs are {ap,rn} and {bp,rn}; output is {rp,rn}, computation is
33218885Sdim   mod B^rn - 1, and values are semi-normalised; zero is represented
34218885Sdim   as either 0 or B^n - 1.  Needs a scratch of 2rn limbs at tp.
35218885Sdim   tp==rp is allowed. */
36218885Sdimvoid
37218885Sdimmpn_bc_mulmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
38218885Sdim		    mp_ptr tp)
39218885Sdim{
40218885Sdim  mp_limb_t cy;
41218885Sdim
42218885Sdim  ASSERT (0 < rn);
43218885Sdim
44218885Sdim  mpn_mul_n (tp, ap, bp, rn);
45218885Sdim  cy = mpn_add_n (rp, tp, tp + rn, rn);
46218885Sdim  /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
47218885Sdim   * be no overflow when adding in the carry. */
48218885Sdim  MPN_INCR_U (rp, rn, cy);
49218885Sdim}
50218885Sdim
51218885Sdim
52218885Sdim/* Inputs are {ap,rn+1} and {bp,rn+1}; output is {rp,rn+1}, in
53218885Sdim   semi-normalised representation, computation is mod B^rn + 1. Needs
54218885Sdim   a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
55218885Sdim   Output is normalised. */
56218885Sdimstatic void
57218885Sdimmpn_bc_mulmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
58218885Sdim		    mp_ptr tp)
59218885Sdim{
60218885Sdim  mp_limb_t cy;
61218885Sdim
62218885Sdim  ASSERT (0 < rn);
63218885Sdim
64218885Sdim  mpn_mul_n (tp, ap, bp, rn + 1);
65218885Sdim  ASSERT (tp[2*rn+1] == 0);
66218885Sdim  ASSERT (tp[2*rn] < GMP_NUMB_MAX);
67218885Sdim  cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
68218885Sdim  rp[rn] = 0;
69218885Sdim  MPN_INCR_U (rp, rn+1, cy );
70218885Sdim}
71218885Sdim
72218885Sdim
/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
 *
 * The result is expected to be ZERO if and only if one of the operand
 * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
 * B^rn-1. This should not be a problem if mulmod_bnm1 is used to
 * combine results and obtain a natural number when one knows in
 * advance that the final value is less than (B^rn-1).
 * Moreover it should not be a problem if mulmod_bnm1 is used to
 * compute the full product with an+bn <= rn, because this condition
 * implies (B^an-1)(B^bn-1) < (B^rn-1) .
 *
 * Requires 0 < bn <= an <= rn and an + bn > rn/2
 * Scratch need: rn + (need for recursive call OR rn + 4). This gives
 *
 * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4
 */
void
mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
{
  ASSERT (0 < bn);
  ASSERT (bn <= an);
  ASSERT (an <= rn);

  /* Basecase: odd rn cannot be halved for the CRT split below, and
     for small rn the split is not worthwhile. */
  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD))
    {
      if (UNLIKELY (bn < rn))
	{
	  if (UNLIKELY (an + bn <= rn))
	    {
	      /* Plain product already fits: it is < B^rn - 1, no
		 reduction needed. */
	      mpn_mul (rp, ap, an, bp, bn);
	    }
	  else
	    {
	      /* Full product in scratch, then wrap the an+bn-rn high
		 limbs back onto the low part, using B^rn = 1. */
	      mp_limb_t cy;
	      mpn_mul (tp, ap, an, bp, bn);
	      cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn);
	      MPN_INCR_U (rp, rn, cy);
	    }
	}
      else
	mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp);
    }
  else
    {
      mp_size_t n;
      mp_limb_t cy;
      mp_limb_t hi;

      n = rn >> 1;

      /* We need at least an + bn >= n, to be able to fit one of the
	 recursive products at rp. Requiring strict inequality makes
	 the coded slightly simpler. If desired, we could avoid this
	 restriction by initially halving rn as long as rn is even and
	 an + bn <= rn/2. */

      ASSERT (an + bn > n);

      /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
	 and crt together as

	 x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
      */

#define a0 ap
#define a1 (ap + n)
#define b0 bp
#define b1 (bp + n)

#define xp  tp	/* 2n + 2 */
      /* am1  maybe in {xp, n} */
      /* bm1  maybe in {xp + n, n} */
#define sp1 (tp + 2*n + 2)
      /* ap1  maybe in {sp1, n + 1} */
      /* bp1  maybe in {sp1 + n + 1, n + 1} */

      /* First recursive product: xm = a*b mod (B^n - 1), left in
	 {rp,n}. */
      {
	mp_srcptr am1, bm1;	/* the operands reduced mod B^n - 1 */
	mp_size_t anm, bnm;	/* and their limb counts */
	mp_ptr so;		/* scratch for the recursive call */

	if (LIKELY (an > n))
	  {
	    /* am1 <- a0 + a1 (mod B^n - 1), folding the high part. */
	    am1 = xp;
	    cy = mpn_add (xp, a0, n, a1, an - n);
	    MPN_INCR_U (xp, n, cy);
	    anm = n;
	    if (LIKELY (bn > n))
	      {
		/* bm1 <- b0 + b1 (mod B^n - 1), likewise. */
		bm1 = xp + n;
		cy = mpn_add (xp + n, b0, n, b1, bn - n);
		MPN_INCR_U (xp + n, n, cy);
		bnm = n;
		so = xp + 2*n;
	      }
	    else
	      {
		/* b is already short enough; use it unreduced. */
		so = xp + n;
		bm1 = b0;
		bnm = bn;
	      }
	  }
	else
	  {
	    /* Both operands are already < B^n; no folding needed. */
	    so = xp;
	    am1 = a0;
	    anm = an;
	    bm1 = b0;
	    bnm = bn;
	  }

	mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so);
      }

      /* Second product: xp = a*b mod (B^n + 1), left in {xp,n+1}. */
      {
	int       k;
	mp_srcptr ap1, bp1;	/* the operands reduced mod B^n + 1 */
	mp_size_t anp, bnp;	/* and their limb counts */

	if (LIKELY (an > n)) {
	  /* ap1 <- a0 - a1 (mod B^n + 1); a borrow is folded back in
	     as +1 since -B^n = 1 (mod B^n + 1). */
	  ap1 = sp1;
	  cy = mpn_sub (sp1, a0, n, a1, an - n);
	  sp1[n] = 0;
	  MPN_INCR_U (sp1, n + 1, cy);
	  anp = n + ap1[n];
	} else {
	  ap1 = a0;
	  anp = an;
	}

	if (LIKELY (bn > n)) {
	  /* bp1 <- b0 - b1 (mod B^n + 1), same folding as for ap1. */
	  bp1 = sp1 + n + 1;
	  cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n);
	  sp1[2*n+1] = 0;
	  MPN_INCR_U (sp1 + n + 1, n + 1, cy);
	  bnp = n + bp1[n];
	} else {
	  bp1 = b0;
	  bnp = bn;
	}

	/* Pick an FFT splitting parameter k; reduce k until 2^k
	   divides n, as required by mpn_mul_fft.  k = 0 disables the
	   FFT path. */
	if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
	  k=0;
	else
	  {
	    int mask;
	    k = mpn_fft_best_k (n, 0);
	    mask = (1<<k) -1;
	    while (n & mask) {k--; mask >>=1;};
	  }
	if (k >= FFT_FIRST_K)
	  xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k);
	else if (UNLIKELY (bp1 == b0))
	  {
	    /* b was not reduced (bn <= n): multiply the short
	       operands directly and reduce mod B^n + 1 by hand. */
	    ASSERT (anp + bnp <= 2*n+1);
	    ASSERT (anp + bnp > n);
	    ASSERT (anp >= bnp);
	    mpn_mul (xp, ap1, anp, bp1, bnp);
	    anp = anp + bnp - n;	/* limbs in the high part */
	    ASSERT (anp <= n || xp[2*n]==0);
	    anp-= anp > n;
	    cy = mpn_sub (xp, xp, n, xp + n, anp);
	    xp[n] = 0;
	    MPN_INCR_U (xp, n+1, cy);
	  }
	else
	  mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp);
      }

      /* Here the CRT recomposition begins.

	 xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
	 Division by 2 is a bitwise rotation.

	 Assumes xp normalised mod (B^n+1).

	 The residue class [0] is represented by [B^n-1]; except when
	 both input are ZERO.
      */

#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
#if HAVE_NATIVE_mpn_rsh1add_nc
      cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
      hi = cy << (GMP_NUMB_BITS - 1);
      cy = 0;
      /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
	 overflows, i.e. a further increment will not overflow again. */
#else /* ! _nc */
      cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
	 the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
#endif
#if GMP_NAIL_BITS == 0
      add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
#else
      cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
      rp[n-1] ^= hi;
#endif
#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
#if HAVE_NATIVE_mpn_add_nc
      cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
#else /* ! _nc */
      cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
#endif
      /* Rotate right by one bit: the shifted-out low bit and the sum
	 carry re-enter at the top. */
      cy += (rp[0]&1);
      mpn_rshift(rp, rp, n, 1);
      ASSERT (cy <= 2);
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* We can have cy != 0 only if hi = 0... */
      ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
      rp[n-1] |= hi;
      /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
#endif
      ASSERT (cy <= 1);
      /* Next increment can not overflow, read the previous comments about cy. */
      ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
      MPN_INCR_U(rp, n, cy);

      /* Compute the highest half:
	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
       */
      if (UNLIKELY (an + bn < rn))
	{
	  /* Note that in this case, the only way the result can equal
	     zero mod B^{rn} - 1 is if one of the inputs is zero, and
	     then the output of both the recursive calls and this CRT
	     reconstruction is zero, not B^{rn} - 1. Which is good,
	     since the latter representation doesn't fit in the output
	     area.*/
	  cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);

	  /* FIXME: This subtraction of the high parts is not really
	     necessary, we do it to get the carry out, and for sanity
	     checking. */
	  cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n,
				   xp + an + bn - n, rn - (an + bn), cy);
	  ASSERT (an + bn == rn - 1 ||
		  mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn)));
	  cy = mpn_sub_1 (rp, rp, an + bn, cy);
	  ASSERT (cy == (xp + an + bn - n)[0]);
	}
      else
	{
	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
	     DECR will affect _at most_ the lowest n limbs. */
	  MPN_DECR_U (rp, 2*n, cy);
	}
#undef a0
#undef a1
#undef b0
#undef b1
#undef xp
#undef sp1
    }
}
332218885Sdim
333218885Sdimmp_size_t
334218885Sdimmpn_mulmod_bnm1_next_size (mp_size_t n)
335218885Sdim{
336218885Sdim  mp_size_t nh;
337218885Sdim
338218885Sdim  if (BELOW_THRESHOLD (n,     MULMOD_BNM1_THRESHOLD))
339218885Sdim    return n;
340218885Sdim  if (BELOW_THRESHOLD (n, 4 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
341218885Sdim    return (n + (2-1)) & (-2);
342218885Sdim  if (BELOW_THRESHOLD (n, 8 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
343218885Sdim    return (n + (4-1)) & (-4);
344218885Sdim
345218885Sdim  nh = (n + 1) >> 1;
346218885Sdim
347218885Sdim  if (BELOW_THRESHOLD (nh, MUL_FFT_MODF_THRESHOLD))
348218885Sdim    return (n + (8-1)) & (-8);
349218885Sdim
350218885Sdim  return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 0));
351218885Sdim}
352223017Sdim