/* mulmod_bnm1.c -- multiplication mod B^n-1.

   Contributed to the GNU project by Niels Möller, Torbjorn Granlund and
   Marco Bodrato.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
26218885Sdim
27218885Sdim
28218885Sdim#include "gmp.h"
29218885Sdim#include "gmp-impl.h"
30218885Sdim#include "longlong.h"
31218885Sdim
32218885Sdim/* Inputs are {ap,rn} and {bp,rn}; output is {rp,rn}, computation is
33218885Sdim   mod B^rn - 1, and values are semi-normalised; zero is represented
34218885Sdim   as either 0 or B^n - 1.  Needs a scratch of 2rn limbs at tp.
35218885Sdim   tp==rp is allowed. */
36218885Sdimvoid
37218885Sdimmpn_bc_mulmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
38218885Sdim		    mp_ptr tp)
39218885Sdim{
40218885Sdim  mp_limb_t cy;
41218885Sdim
42218885Sdim  ASSERT (0 < rn);
43218885Sdim
44218885Sdim  mpn_mul_n (tp, ap, bp, rn);
45218885Sdim  cy = mpn_add_n (rp, tp, tp + rn, rn);
46218885Sdim  /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
47218885Sdim   * be no overflow when adding in the carry. */
48218885Sdim  MPN_INCR_U (rp, rn, cy);
49218885Sdim}
50218885Sdim
51218885Sdim
52218885Sdim/* Inputs are {ap,rn+1} and {bp,rn+1}; output is {rp,rn+1}, in
53218885Sdim   semi-normalised representation, computation is mod B^rn + 1. Needs
54218885Sdim   a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
55218885Sdim   Output is normalised. */
56218885Sdimstatic void
57218885Sdimmpn_bc_mulmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
58218885Sdim		    mp_ptr tp)
59218885Sdim{
60218885Sdim  mp_limb_t cy;
61218885Sdim
62218885Sdim  ASSERT (0 < rn);
63218885Sdim
64218885Sdim  mpn_mul_n (tp, ap, bp, rn + 1);
65218885Sdim  ASSERT (tp[2*rn+1] == 0);
66218885Sdim  ASSERT (tp[2*rn] < GMP_NUMB_MAX);
67218885Sdim  cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
68218885Sdim  rp[rn] = 0;
69218885Sdim  MPN_INCR_U (rp, rn+1, cy );
70218885Sdim}
71218885Sdim
72218885Sdim
/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
 *
 * The result is expected to be ZERO if and only if one of the operand
 * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
 * B^rn-1. This should not be a problem if mulmod_bnm1 is used to
 * combine results and obtain a natural number when one knows in
 * advance that the final value is less than (B^rn-1).
 * Moreover it should not be a problem if mulmod_bnm1 is used to
 * compute the full product with an+bn <= rn, because this condition
 * implies (B^an-1)(B^bn-1) < (B^rn-1) .
 *
 * Requires 0 < bn <= an <= rn and an + bn > rn/2
 * Scratch need: rn + (need for recursive call OR rn + 4). This gives
 *
 * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4
 */
void
mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
{
  ASSERT (0 < bn);
  ASSERT (bn <= an);
  ASSERT (an <= rn);

  /* Basecase: odd rn cannot be halved for the CRT split below, and
     for small rn the split is not worthwhile. */
  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD))
    {
      if (UNLIKELY (bn < rn))
	{
	  if (UNLIKELY (an + bn <= rn))
	    {
	      /* Plain product already fits: it is < B^rn - 1, no
		 reduction needed. */
	      mpn_mul (rp, ap, an, bp, bn);
	    }
	  else
	    {
	      /* Full product in scratch, then wrap the an+bn-rn high
		 limbs back onto the low part, using B^rn = 1. */
	      mp_limb_t cy;
	      mpn_mul (tp, ap, an, bp, bn);
	      cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn);
	      MPN_INCR_U (rp, rn, cy);
	    }
	}
      else
	mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp);
    }
  else
    {
      mp_size_t n;
      mp_limb_t cy;
      mp_limb_t hi;

      n = rn >> 1;

      /* We need at least an + bn >= n, to be able to fit one of the
	 recursive products at rp. Requiring strict inequality makes
	 the coded slightly simpler. If desired, we could avoid this
	 restriction by initially halving rn as long as rn is even and
	 an + bn <= rn/2. */

      ASSERT (an + bn > n);

      /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
	 and crt together as

	 x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
      */

#define a0 ap
#define a1 (ap + n)
#define b0 bp
#define b1 (bp + n)

#define xp  tp	/* 2n + 2 */
      /* am1  maybe in {xp, n} */
      /* bm1  maybe in {xp + n, n} */
#define sp1 (tp + 2*n + 2)
      /* ap1  maybe in {sp1, n + 1} */
      /* bp1  maybe in {sp1 + n + 1, n + 1} */

      /* First recursive product: xm = a*b mod (B^n - 1), left in
	 {rp,n}. */
      {
	mp_srcptr am1, bm1;	/* the operands reduced mod B^n - 1 */
	mp_size_t anm, bnm;	/* and their limb counts */
	mp_ptr so;		/* scratch for the recursive call */

	if (LIKELY (an > n))
	  {
	    /* am1 <- a0 + a1 (mod B^n - 1), folding the high part. */
	    am1 = xp;
	    cy = mpn_add (xp, a0, n, a1, an - n);
	    MPN_INCR_U (xp, n, cy);
	    anm = n;
	    if (LIKELY (bn > n))
	      {
		/* bm1 <- b0 + b1 (mod B^n - 1), likewise. */
		bm1 = xp + n;
		cy = mpn_add (xp + n, b0, n, b1, bn - n);
		MPN_INCR_U (xp + n, n, cy);
		bnm = n;
		so = xp + 2*n;
	      }
	    else
	      {
		/* b is already short enough; use it unreduced. */
		so = xp + n;
		bm1 = b0;
		bnm = bn;
	      }
	  }
	else
	  {
	    /* Both operands are already < B^n; no folding needed. */
	    so = xp;
	    am1 = a0;
	    anm = an;
	    bm1 = b0;
	    bnm = bn;
	  }

	mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so);
      }

      /* Second product: xp = a*b mod (B^n + 1), left in {xp,n+1}. */
      {
	int       k;
	mp_srcptr ap1, bp1;	/* the operands reduced mod B^n + 1 */
	mp_size_t anp, bnp;	/* and their limb counts */

	if (LIKELY (an > n)) {
	  /* ap1 <- a0 - a1 (mod B^n + 1); a borrow is folded back in
	     as +1 since -B^n = 1 (mod B^n + 1). */
	  ap1 = sp1;
	  cy = mpn_sub (sp1, a0, n, a1, an - n);
	  sp1[n] = 0;
	  MPN_INCR_U (sp1, n + 1, cy);
	  anp = n + ap1[n];
	} else {
	  ap1 = a0;
	  anp = an;
	}

	if (LIKELY (bn > n)) {
	  /* bp1 <- b0 - b1 (mod B^n + 1), same folding as for ap1. */
	  bp1 = sp1 + n + 1;
	  cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n);
	  sp1[2*n+1] = 0;
	  MPN_INCR_U (sp1 + n + 1, n + 1, cy);
	  bnp = n + bp1[n];
	} else {
	  bp1 = b0;
	  bnp = bn;
	}

	/* Pick an FFT splitting parameter k; reduce k until 2^k
	   divides n, as required by mpn_mul_fft.  k = 0 disables the
	   FFT path. */
	if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
	  k=0;
	else
	  {
	    int mask;
	    k = mpn_fft_best_k (n, 0);
	    mask = (1<<k) -1;
	    while (n & mask) {k--; mask >>=1;};
	  }
	if (k >= FFT_FIRST_K)
	  xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k);
	else if (UNLIKELY (bp1 == b0))
	  {
	    /* b was not reduced (bn <= n): multiply the short
	       operands directly and reduce mod B^n + 1 by hand. */
	    ASSERT (anp + bnp <= 2*n+1);
	    ASSERT (anp + bnp > n);
	    ASSERT (anp >= bnp);
	    mpn_mul (xp, ap1, anp, bp1, bnp);
	    anp = anp + bnp - n;	/* limbs in the high part */
	    ASSERT (anp <= n || xp[2*n]==0);
	    anp-= anp > n;
	    cy = mpn_sub (xp, xp, n, xp + n, anp);
	    xp[n] = 0;
	    MPN_INCR_U (xp, n+1, cy);
	  }
	else
	  mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp);
      }

      /* Here the CRT recomposition begins.

	 xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
	 Division by 2 is a bitwise rotation.

	 Assumes xp normalised mod (B^n+1).

	 The residue class [0] is represented by [B^n-1]; except when
	 both input are ZERO.
      */

#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
#if HAVE_NATIVE_mpn_rsh1add_nc
      cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
      hi = cy << (GMP_NUMB_BITS - 1);
      cy = 0;
      /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
	 overflows, i.e. a further increment will not overflow again. */
#else /* ! _nc */
      cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
	 the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
#endif
#if GMP_NAIL_BITS == 0
      add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
#else
      cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
      rp[n-1] ^= hi;
#endif
#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
#if HAVE_NATIVE_mpn_add_nc
      cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
#else /* ! _nc */
      cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
#endif
      /* Rotate right by one bit: the shifted-out low bit and the sum
	 carry re-enter at the top. */
      cy += (rp[0]&1);
      mpn_rshift(rp, rp, n, 1);
      ASSERT (cy <= 2);
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* We can have cy != 0 only if hi = 0... */
      ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
      rp[n-1] |= hi;
      /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
#endif
      ASSERT (cy <= 1);
      /* Next increment can not overflow, read the previous comments about cy. */
      ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
      MPN_INCR_U(rp, n, cy);

      /* Compute the highest half:
	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
       */
      if (UNLIKELY (an + bn < rn))
	{
	  /* Note that in this case, the only way the result can equal
	     zero mod B^{rn} - 1 is if one of the inputs is zero, and
	     then the output of both the recursive calls and this CRT
	     reconstruction is zero, not B^{rn} - 1. Which is good,
	     since the latter representation doesn't fit in the output
	     area.*/
	  cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);

	  /* FIXME: This subtraction of the high parts is not really
	     necessary, we do it to get the carry out, and for sanity
	     checking. */
	  cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n,
				   xp + an + bn - n, rn - (an + bn), cy);
	  ASSERT (an + bn == rn - 1 ||
		  mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn)));
	  cy = mpn_sub_1 (rp, rp, an + bn, cy);
	  ASSERT (cy == (xp + an + bn - n)[0]);
	}
      else
	{
	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
	     DECR will affect _at most_ the lowest n limbs. */
	  MPN_DECR_U (rp, 2*n, cy);
	}
#undef a0
#undef a1
#undef b0
#undef b1
#undef xp
#undef sp1
    }
}
332218885Sdim
333218885Sdimmp_size_t
334218885Sdimmpn_mulmod_bnm1_next_size (mp_size_t n)
335218885Sdim{
336218885Sdim  mp_size_t nh;
337218885Sdim
338218885Sdim  if (BELOW_THRESHOLD (n,     MULMOD_BNM1_THRESHOLD))
339218885Sdim    return n;
340218885Sdim  if (BELOW_THRESHOLD (n, 4 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
341218885Sdim    return (n + (2-1)) & (-2);
342218885Sdim  if (BELOW_THRESHOLD (n, 8 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
343218885Sdim    return (n + (4-1)) & (-4);
344218885Sdim
345218885Sdim  nh = (n + 1) >> 1;
346218885Sdim
347218885Sdim  if (BELOW_THRESHOLD (nh, MUL_FFT_MODF_THRESHOLD))
348218885Sdim    return (n + (8-1)) & (-8);
349218885Sdim
350218885Sdim  return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 0));
351218885Sdim}
352223017Sdim