dist/mpz/bin_ui.c

/* mpz_bin_ui(RESULT, N, K) -- Set RESULT to N over K.

Copyright 1998-2002, 2012, 2013, 2015, 2017-2018 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"

/* How many special cases? Minimum is 2: 0 and 1;
 * also 3 {0,1,2} and 5 {0,1,2,3,4} are implemented.
 */
#define APARTAJ_KALKULOJ 2

/* Whether to use (1) or not (0) the function mpz_bin_uiui whenever
 * the operands fit.
 */
#define UZU_BIN_UIUI 0

/* Whether to use a shortcut to precompute the product of four
 * elements (1), or precompute only the product of a couple (0).
 *
 * In both cases the precomputed product is then updated with some
 * linear operations to obtain the product of the next four (1)
 * [or two (0)] operands.
 */
#define KVAROPE 1

static void
posmpz_init (mpz_ptr r)
{
  mp_ptr rp;
  ASSERT (SIZ (r) > 0);
  rp = SIZ (r) + MPZ_REALLOC (r, SIZ (r) + 2);
  *rp = 0;
  *++rp = 0;
}

/* Equivalent to mpz_add_ui (r, r, in), but faster when
   0 < SIZ (r) < ALLOC (r) and limbs above SIZ (r) contain 0. */
static void
posmpz_inc_ui (mpz_ptr r, unsigned long in)
{
#if BITS_PER_ULONG > GMP_NUMB_BITS
  mpz_add_ui (r, r, in);
#else
  ASSERT (SIZ (r) > 0);
  MPN_INCR_U (PTR (r), SIZ (r) + 1, in);
  SIZ (r) += (PTR (r)[SIZ (r)] != 0);
#endif
}

/* Equivalent to mpz_sub_ui (r, r, in), but faster when
   0 < SIZ (r) and we know in advance that the result is positive. */
static void
posmpz_dec_ui (mpz_ptr r, unsigned long in)
{
#if BITS_PER_ULONG > GMP_NUMB_BITS
  mpz_sub_ui (r, r, in);
#else
  ASSERT (mpz_cmp_ui (r, in) >= 0);
  MPN_DECR_U (PTR (r), SIZ (r), in);
  SIZ (r) -= (PTR (r)[SIZ (r)-1] == 0);
#endif
}

/* Equivalent to mpz_tdiv_q_2exp (r, r, 1), but faster when
   0 < SIZ (r) and we know in advance that the result is positive. */
static void
posmpz_rsh1 (mpz_ptr r)
{
  mp_ptr rp;
  mp_size_t rn;

  rn = SIZ (r);
  rp = PTR (r);
  ASSERT (rn > 0);
  mpn_rshift (rp, rp, rn, 1);
  SIZ (r) -= rp[rn - 1] == 0;
}

/* Computes r = n(n+(2*k-1))/2
   It uses a sqare instead of a product, computing
   r = ((n+k-1)^2 + n - (k-1)^2)/2
   As a side effect, sets t = n+k-1
 */
static void
mpz_hmul_nbnpk (mpz_ptr r, mpz_srcptr n, unsigned long int k, mpz_ptr t)
{
  ASSERT (k > 0 && SIZ(n) > 0);
  --k;
  mpz_add_ui (t, n, k);
  mpz_mul (r, t, t);
  mpz_add (r, r, n);
  posmpz_rsh1 (r);
  if (LIKELY (k <= (1UL << (BITS_PER_ULONG / 2))))
    posmpz_dec_ui (r, (k + (k & 1))*(k >> 1));
  else
    {
      mpz_t tmp;
      mpz_init_set_ui (tmp, (k + (k & 1)));
      mpz_mul_ui (tmp, tmp, k >> 1);
      mpz_sub (r, r, tmp);
      mpz_clear (tmp);
    }
}

#if KVAROPE
static void
rek_raising_fac4 (mpz_ptr r, mpz_ptr p, mpz_ptr P, unsigned long int k, unsigned long int lk, mpz_ptr t)
{
  if (k - lk < 5)
    {
      do {
	posmpz_inc_ui (p, 4*k+2);
	mpz_addmul_ui (P, p, 4*k);
	posmpz_dec_ui (P, k);
	mpz_mul (r, r, P);
      } while (--k > lk);
    }
  else
    {
      mpz_t lt;
      unsigned long int m;

      m = ((k + lk) >> 1) + 1;
      rek_raising_fac4 (r, p, P, k, m, t);

      posmpz_inc_ui (p, 4*m+2);
      mpz_addmul_ui (P, p, 4*m);
      posmpz_dec_ui (P, m);
      if (t == NULL)
	{
	  mpz_init_set (lt, P);
	  t = lt;
	}
      else
	{
	  ALLOC (lt) = 0;
	  mpz_set (t, P);
	}
      rek_raising_fac4 (t, p, P, m - 1, lk, NULL);

      mpz_mul (r, r, t);
      mpz_clear (lt);
    }
}

/* Computes (n+1)(n+2)...(n+k)/2^(k/2 +k/4) using the helper function
   rek_raising_fac4, and exploiting an idea inspired by a piece of
   code that Fredrik Johansson wrote and by a comment by Niels M��ller.

   Assume k = 4i then compute:
     p  = (n+1)(n+4i)/2 - i
	  (n+1+1)(n+4i)/2 = p + i + (n+4i)/2
	  (n+1+1)(n+4i-1)/2 = p + i + ((n+4i)-(n+1+1))/2 = p + i + (n-n+4i-2)/2 = p + 3i-1
     P  = (p + i)*(p+3i-1)/2 = (n+1)(n+2)(n+4i-1)(n+4i)/8
     n' = n + 2
     i' = i - 1
	  (n'-1)(n')(n'+4i'+1)(n'+4i'+2)/8 = P
	  (n'-1)(n'+4i'+2)/2 - i' - 1 = p
	  (n'-1+2)(n'+4i'+2)/2 - i' - 1 = p + (n'+4i'+2)
	  (n'-1+2)(n'+4i'+2-2)/2 - i' - 1 = p + (n'+4i'+2) - (n'-1+2) =  p + 4i' + 1
	  (n'-1+2)(n'+4i'+2-2)/2 - i' = p + 4i' + 2
     p' = p + 4i' + 2 = (n'+1)(n'+4i')/2 - i'
	  p' - 4i' - 2 = p
	  (p' - 4i' - 2 + i)*(p' - 4i' - 2+3i-1)/2 = P
	  (p' - 4i' - 2 + i' + 1)*(p' - 4i' - 2 + 3i' + 3 - 1)/2 = P
	  (p' - 3i' - 1)*(p' - i')/2 = P
	  (p' - 3i' - 1 + 4i' + 1)*(p' - i' + 4i' - 1)/2 = P + (4i' + 1)*(p' - i')/2 + (p' - 3i' - 1 + 4i' + 1)*(4i' - 1)/2
	  (p' + i')*(p' + 3i' - 1)/2 = P + (4i')*(p' + p')/2 + (p' - i' - (p' + i'))/2
	  (p' + i')*(p' + 3i' - 1)/2 = P + 4i'p' + (p' - i' - p' - i')/2
	  (p' + i')*(p' + 3i' - 1)/2 = P + 4i'p' - i'
     P' = P + 4i'p' - i'

   And compute the product P * P' * P" ...
 */

static void
mpz_raising_fac4 (mpz_ptr r, mpz_ptr n, unsigned long int k, mpz_ptr t, mpz_ptr p)
{
  ASSERT ((k >= APARTAJ_KALKULOJ) && (APARTAJ_KALKULOJ > 0));
  posmpz_init (n);
  posmpz_inc_ui (n, 1);
  SIZ (r) = 0;
  if (k & 1)
    {
      mpz_set (r, n);
      posmpz_inc_ui (n, 1);
    }
  k >>= 1;
  if (APARTAJ_KALKULOJ < 2 && k == 0)
    return;

  mpz_hmul_nbnpk (p, n, k, t);
  posmpz_init (p);

  if (k & 1)
    {
      if (SIZ (r))
	mpz_mul (r, r, p);
      else
	mpz_set (r, p);
      posmpz_inc_ui (p, k - 1);
    }
  k >>= 1;
  if (APARTAJ_KALKULOJ < 4 && k == 0)
    return;

  mpz_hmul_nbnpk (t, p, k, n);
  if (SIZ (r))
    mpz_mul (r, r, t);
  else
    mpz_set (r, t);

  if (APARTAJ_KALKULOJ > 8 || k > 1)
    {
      posmpz_dec_ui (p, k);
      rek_raising_fac4 (r, p, t, k - 1, 0, n);
    }
}

#else /* KVAROPE */

static void
rek_raising_fac (mpz_ptr r, mpz_ptr n, unsigned long int k, unsigned long int lk, mpz_ptr t1, mpz_ptr t2)
{
  /* Should the threshold depend on SIZ (n) ? */
  if (k - lk < 10)
    {
      do {
	posmpz_inc_ui (n, k);
	mpz_mul (r, r, n);
	--k;
      } while (k > lk);
    }
  else
    {
      mpz_t t3;
      unsigned long int m;

      m = ((k + lk) >> 1) + 1;
      rek_raising_fac (r, n, k, m, t1, t2);

      posmpz_inc_ui (n, m);
      if (t1 == NULL)
	{
	  mpz_init_set (t3, n);
	  t1 = t3;
	}
      else
	{
	  ALLOC (t3) = 0;
	  mpz_set (t1, n);
	}
      rek_raising_fac (t1, n, m - 1, lk, t2, NULL);

      mpz_mul (r, r, t1);
      mpz_clear (t3);
    }
}

/* Computes (n+1)(n+2)...(n+k)/2^(k/2) using the helper function
   rek_raising_fac, and exploiting an idea inspired by a piece of
   code that Fredrik Johansson wrote.

   Force an even k = 2i then compute:
     p  = (n+1)(n+2i)/2
     i' = i - 1
     p == (n+1)(n+2i'+2)/2
     p' = p + i' == (n+2)(n+2i'+1)/2
     n' = n + 1
     p'== (n'+1)(n'+2i')/2 == (n+1 +1)(n+2i -1)/2

   And compute the product p * p' * p" ...
*/

static void
mpz_raising_fac (mpz_ptr r, mpz_ptr n, unsigned long int k, mpz_ptr t, mpz_ptr p)
{
  unsigned long int hk;
  ASSERT ((k >= APARTAJ_KALKULOJ) && (APARTAJ_KALKULOJ > 1));
  mpz_add_ui (n, n, 1);
  hk = k >> 1;
  mpz_hmul_nbnpk (p, n, hk, t);

  if ((k & 1) != 0)
    {
      mpz_add_ui (t, t, hk + 1);
      mpz_mul (r, t, p);
    }
  else
    {
      mpz_set (r, p);
    }

  if ((APARTAJ_KALKULOJ > 3) || (hk > 1))
    {
      posmpz_init (p);
      rek_raising_fac (r, p, hk - 1, 0, t, n);
    }
}
#endif /* KVAROPE */

/* This is a poor implementation.  Look at bin_uiui.c for improvement ideas.
   In fact consider calling mpz_bin_uiui() when the arguments fit, leaving
   the code here only for big n.

   The identity bin(n,k) = (-1)^k * bin(-n+k-1,k) can be found in Knuth vol
   1 section 1.2.6 part G. */

void
mpz_bin_ui (mpz_ptr r, mpz_srcptr n, unsigned long int k)
{
  mpz_t      ni;
  mp_size_t  negate;

  if (SIZ (n) < 0)
    {
      /* bin(n,k) = (-1)^k * bin(-n+k-1,k), and set ni = -n+k-1 - k = -n-1 */
      mpz_init (ni);
      mpz_add_ui (ni, n, 1L);
      mpz_neg (ni, ni);
      negate = (k & 1);   /* (-1)^k */
    }
  else
    {
      /* bin(n,k) == 0 if k>n
	 (no test for this under the n<0 case, since -n+k-1 >= k there) */
      if (mpz_cmp_ui (n, k) < 0)
	{
	  SIZ (r) = 0;
	  return;
	}

      /* set ni = n-k */
      mpz_init (ni);
      mpz_sub_ui (ni, n, k);
      negate = 0;
    }

  /* Now wanting bin(ni+k,k), with ni positive, and "negate" is the sign (0
     for positive, 1 for negative). */

  /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller.  In this case it's
     whether ni+k-k < k meaning ni<k, and if so change to denominator ni+k-k
     = ni, and new ni of ni+k-ni = k.  */
  if (mpz_cmp_ui (ni, k) < 0)
    {
      unsigned long  tmp;
      tmp = k;
      k = mpz_get_ui (ni);
      mpz_set_ui (ni, tmp);
    }

  if (k < APARTAJ_KALKULOJ)
    {
      if (k == 0)
	{
	  SIZ (r) = 1;
	  MPZ_NEWALLOC (r, 1)[0] = 1;
	}
#if APARTAJ_KALKULOJ > 2
      else if (k == 2)
	{
	  mpz_add_ui (ni, ni, 1);
	  mpz_mul (r, ni, ni);
	  mpz_add (r, r, ni);
	  posmpz_rsh1 (r);
	}
#endif
#if APARTAJ_KALKULOJ > 3
      else if (k > 2)
	{ /* k = 3, 4 */
	  mpz_add_ui (ni, ni, 2); /* n+1 */
	  mpz_mul (r, ni, ni); /* (n+1)^2 */
	  mpz_sub_ui (r, r, 1); /* (n+1)^2-1 */
	  if (k == 3)
	    {
	      mpz_mul (r, r, ni); /* ((n+1)^2-1)(n+1) = n(n+1)(n+2) */
	      /* mpz_divexact_ui (r, r, 6); /\* 6=3<<1; div_by3 ? *\/ */
	      mpn_pi1_bdiv_q_1 (PTR(r), PTR(r), SIZ(r), 3, GMP_NUMB_MASK/3*2+1, 1);
	      MPN_NORMALIZE_NOT_ZERO (PTR(r), SIZ(r));
	    }
	  else /* k = 4 */
	    {
	      mpz_add (ni, ni, r); /* (n+1)^2+n */
	      mpz_mul (r, ni, ni); /* ((n+1)^2+n)^2 */
	      mpz_sub_ui (r, r, 1); /* ((n+1)^2+n)^2-1 = n(n+1)(n+2)(n+3) */
	      /* mpz_divexact_ui (r, r, 24); /\* 24=3<<3; div_by3 ? *\/ */
	      mpn_pi1_bdiv_q_1 (PTR(r), PTR(r), SIZ(r), 3, GMP_NUMB_MASK/3*2+1, 3);
	      MPN_NORMALIZE_NOT_ZERO (PTR(r), SIZ(r));
	    }
	}
#endif
      else
	{ /* k = 1 */
	  mpz_add_ui (r, ni, 1);
	}
    }
#if UZU_BIN_UIUI
  else if (mpz_cmp_ui (ni, ULONG_MAX - k) <= 0)
    {
      mpz_bin_uiui (r, mpz_get_ui (ni) + k, k);
    }
#endif
  else
    {
      mp_limb_t count;
      mpz_t num, den;

      mpz_init (num);
      mpz_init (den);

#if KVAROPE
      mpz_raising_fac4 (num, ni, k, den, r);
      popc_limb (count, k);
      ASSERT (k - (k >> 1) - (k >> 2) - count >= 0);
      mpz_tdiv_q_2exp (num, num, k - (k >> 1) - (k >> 2) - count);
#else
      mpz_raising_fac (num, ni, k, den, r);
      popc_limb (count, k);
      ASSERT (k - (k >> 1) - count >= 0);
      mpz_tdiv_q_2exp (num, num, k - (k >> 1) - count);
#endif

      mpz_oddfac_1(den, k, 0);

      mpz_divexact(r, num, den);
      mpz_clear (num);
      mpz_clear (den);
    }
  mpz_clear (ni);

  SIZ(r) = (SIZ(r) ^ -negate) + negate;
}