1/* mpn_powm -- Compute R = U^E mod M.
2
3   Contributed to the GNU project by Torbjorn Granlund.
4
5   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
6   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8
9Copyright 2007, 2008, 2009 Free Software Foundation, Inc.
10
11This file is part of the GNU MP Library.
12
13The GNU MP Library is free software; you can redistribute it and/or modify
14it under the terms of the GNU Lesser General Public License as published by
15the Free Software Foundation; either version 3 of the License, or (at your
16option) any later version.
17
18The GNU MP Library is distributed in the hope that it will be useful, but
19WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
21License for more details.
22
23You should have received a copy of the GNU Lesser General Public License
24along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
25
26
27/*
28  BASIC ALGORITHM, Compute U^E mod M, where M < B^n is odd.
29
30  1. W <- U
31
32  2. T <- (B^n * U) mod M                Convert to REDC form
33
34  3. Compute table U^1, U^3, U^5... of E-dependent size
35
36  4. While there are more bits in E
37       W <- power left-to-right base-k
38
39
40  TODO:
41
42   * Make getbits a macro, thereby allowing it to update the index operand.
43     That will simplify the code using getbits.  (Perhaps make getbits' sibling
44     getbit then have similar form, for symmetry.)
45
46   * Write an itch function.  Or perhaps get rid of tp parameter since the huge
47     pp area is allocated locally anyway?
48
49   * Choose window size without looping.  (Superoptimize or think(tm).)
50
51   * Handle small bases with initial, reduction-free exponentiation.
52
53   * Call new division functions, not mpn_tdiv_qr.
54
55   * Consider special code for one-limb M.
56
57   * How should we handle the redc1/redc2/redc_n choice?
58     - redc1:  T(binvert_1limb)  + e * (n)   * (T(mullo-1x1) + n*T(addmul_1))
59     - redc2:  T(binvert_2limbs) + e * (n/2) * (T(mullo-2x2) + n*T(addmul_2))
60     - redc_n: T(binvert_nlimbs) + e * (T(mullo-nxn) + T(M(n)))
61     This disregards the addmul_N constant term, but we could think of
62     that as part of the respective mullo.
63
64   * When U (the base) is small, we should start the exponentiation with plain
65     operations, then convert that partial result to REDC form.
66
67   * When U is just one limb, should it be handled without the k-ary tricks?
68     We could keep a factor of B^n in W, but use U' = BU as base.  After
69     multiplying by this (pseudo two-limb) number, we need to multiply by 1/B
70     mod M.
71*/
72
73#include "gmp.h"
74#include "gmp-impl.h"
75#include "longlong.h"
76
/* Enable the two-limb REDC code paths (the mpn_redc_2 branches of mpn_powm)
   when a native mpn_addmul_2 or a native mpn_redc_2 is available.  */
#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
#define WANT_REDC_2 1
#endif
80
/* Return bit number BI of the limb array P, with bits numbered from 1
   (BI == 1 is the least significant bit of P[0]).  BI must be at least 1.
   Both arguments are parenthesized so that arbitrary expressions, such as
   `ptr + 1' or `a | b', may be passed safely.  Note that BI is evaluated
   twice, so it must not have side effects.  */
#define getbit(p,bi) \
  (((p)[((bi) - 1) / GMP_LIMB_BITS] >> (((bi) - 1) % GMP_LIMB_BITS)) & 1)
83
84static inline mp_limb_t
85getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
86{
87  int nbits_in_r;
88  mp_limb_t r;
89  mp_size_t i;
90
91  if (bi < nbits)
92    {
93      return p[0] & (((mp_limb_t) 1 << bi) - 1);
94    }
95  else
96    {
97      bi -= nbits;			/* bit index of low bit to extract */
98      i = bi / GMP_NUMB_BITS;		/* word index of low bit to extract */
99      bi %= GMP_NUMB_BITS;		/* bit index in low word */
100      r = p[i] >> bi;			/* extract (low) bits */
101      nbits_in_r = GMP_NUMB_BITS - bi;	/* number of bits now in r */
102      if (nbits_in_r < nbits)		/* did we get enough bits? */
103	r += p[i + 1] << nbits_in_r;	/* prepend bits from higher word */
104      return r & (((mp_limb_t ) 1 << nbits) - 1);
105    }
106}
107
108static inline int
109win_size (mp_bitcnt_t eb)
110{
111  int k;
112  static mp_bitcnt_t x[] = {0,7,25,81,241,673,1793,4609,11521,28161,~(mp_bitcnt_t)0};
113  for (k = 1; eb > x[k]; k++)
114    ;
115  return k;
116}
117
118/* Convert U to REDC form, U_r = B^n * U mod M */
119static void
120redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n)
121{
122  mp_ptr tp, qp;
123  TMP_DECL;
124  TMP_MARK;
125
126  tp = TMP_ALLOC_LIMBS (un + n);
127  qp = TMP_ALLOC_LIMBS (un + 1);	/* FIXME: Put at tp+? */
128
129  MPN_ZERO (tp, n);
130  MPN_COPY (tp + n, up, un);
131  mpn_tdiv_qr (qp, rp, 0L, tp, un + n, mp, n);
132  TMP_FREE;
133}
134
/* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0]
   Requires that mp[n-1..0] is odd.
   Requires that ep[en-1..0] is > 1.
   Uses scratch space at tp of MAX(mpn_binvert_itch(n),2n) limbs.

   Outline of the code below:
     1. Count the significant bits of the exponent and choose a window size
	from that count.
     2. Compute the inverse of M needed by the REDC variant that the
	thresholds select for this n (one limb, two limbs, or n limbs).
     3. Build, at pp, a table of the odd powers of b in REDC form.
     4. Run a left-to-right sliding-window exponentiation over the exponent,
	squaring and multiplying in REDC form.
     5. Convert the result out of REDC form and subtract M once if the result
	is not below M.  */
void
mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
	  mp_srcptr ep, mp_size_t en,
	  mp_srcptr mp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ip[2], *mip;	/* mip points at ip or at an n-limb TMP area */
  int cnt;
  mp_bitcnt_t ebi;		/* number of exponent bits not yet consumed */
  int windowsize, this_windowsize;
  mp_limb_t expbits;		/* bits of the current exponent window */
  mp_ptr pp, this_pp;		/* table of odd powers, and a cursor into it */
  long i;
  TMP_DECL;

  ASSERT (en > 1 || (en == 1 && ep[0] > 1));
  ASSERT (n >= 1 && ((mp[0] & 1) != 0));

  TMP_MARK;

  /* ebi = bit length of the exponent: all limb bits minus the leading zeros
     of the most significant limb.  */
  count_leading_zeros (cnt, ep[en - 1]);
  ebi = (mp_bitcnt_t) en * GMP_LIMB_BITS - cnt;

  /* Disabled sketch of a reduction-free start for small bases; it is not
     compiled.  */
#if 0
  if (bn < n)
    {
      /* Do the first few exponent bits without mod reductions,
	 until the result is greater than the mod argument.  */
      for (;;)
	{
	  mpn_sqr (tp, this_pp, tn);
	  tn = tn * 2 - 1,  tn += tp[tn] != 0;
	  if (getbit (ep, ebi) != 0)
	    mpn_mul (..., tp, tn, bp, bn);
	  ebi--;
	}
    }
#endif

  windowsize = win_size (ebi);

  /* Set up mip, the inverse of M consumed by the reduction routines.  The
     one-limb and two-limb variants store a negated inverse in the local ip
     array; the n-limb variant stores the un-negated result of mpn_binvert in
     a freshly allocated n-limb area.  Note that the trailing `else' below is
     shared by both preprocessor alternatives.  */
#if WANT_REDC_2
  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
    {
      mip = ip;
      binvert_limb (mip[0], mp[0]);
      mip[0] = -mip[0];
    }
  else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
    {
      mip = ip;
      mpn_binvert (mip, mp, 2, tp);
      /* Two-limb negation written as -low, ~high.  This is exact only when
	 the low limb is nonzero; presumably that always holds here because M
	 is odd.  */
      mip[0] = -mip[0]; mip[1] = ~mip[1];
    }
#else
  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
    {
      mip = ip;
      binvert_limb (mip[0], mp[0]);
      mip[0] = -mip[0];
    }
#endif
  else
    {
      mip = TMP_ALLOC_LIMBS (n);
      mpn_binvert (mip, mp, n, tp);
    }

  /* Room for 2^(windowsize-1) table entries of n limbs each.  */
  pp = TMP_ALLOC_LIMBS (n << (windowsize - 1));

  /* First table entry: b converted to REDC form.  */
  this_pp = pp;
  redcify (this_pp, bp, bn, mp, n);

  /* Store b^2 at rp.  It is the factor that steps from one odd power to the
     next while the table is built; rp is free for this until the main loop
     starts.  */
  mpn_sqr (tp, this_pp, n);
#if WANT_REDC_2
  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
    mpn_redc_1 (rp, tp, mp, n, mip[0]);
  else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
    mpn_redc_2 (rp, tp, mp, n, mip);
#else
  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
    mpn_redc_1 (rp, tp, mp, n, mip[0]);
#endif
  else
    mpn_redc_n (rp, tp, mp, n, mip);

  /* Precompute odd powers of b and put them in the temporary area at pp.
     Each new entry is the previous entry times b^2, so the entry at
     pp + n*j holds b^(2j+1).  */
  for (i = (1 << (windowsize - 1)) - 1; i > 0; i--)
    {
      mpn_mul_n (tp, this_pp, rp, n);
      this_pp += n;
#if WANT_REDC_2
      if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
	mpn_redc_1 (this_pp, tp, mp, n, mip[0]);
      else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
	mpn_redc_2 (this_pp, tp, mp, n, mip);
#else
      if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
	mpn_redc_1 (this_pp, tp, mp, n, mip[0]);
#endif
      else
	mpn_redc_n (this_pp, tp, mp, n, mip);
    }

  /* Take the first window from the top of the exponent.  */
  expbits = getbits (ep, ebi, windowsize);
  if (ebi < windowsize)
    ebi = 0;
  else
    ebi -= windowsize;

  /* Drop the window's trailing zero bits and hand them back to ebi, so that
     expbits is odd and can index the table of odd powers.  */
  count_trailing_zeros (cnt, expbits);
  ebi += cnt;
  expbits >>= cnt;

  /* Start the accumulator from the table entry for the first window.  */
  MPN_COPY (rp, pp + n * (expbits >> 1), n);

  /* INNERLOOP is the sliding-window main loop.  It is written in terms of
     MPN_MUL_N, MPN_SQR and MPN_REDUCE, which are redefined before each
     expansion further down so that every size class gets its own copy of the
     loop with the multiply, square and reduce routines chosen for it.  The
     loop leaves through `goto done' when the exponent bits run out while
     squaring for zero bits, or by falling out of the outer `while'.  */
#define INNERLOOP							\
  while (ebi != 0)							\
    {									\
      while (getbit (ep, ebi) == 0)					\
	{								\
	  MPN_SQR (tp, rp, n);						\
	  MPN_REDUCE (rp, tp, mp, n, mip);				\
	  ebi--;							\
	  if (ebi == 0)							\
	    goto done;							\
	}								\
									\
      /* The next bit of the exponent is 1.  Now extract the largest	\
	 block of bits <= windowsize, and such that the least		\
	 significant bit is 1.  */					\
									\
      expbits = getbits (ep, ebi, windowsize);				\
      this_windowsize = windowsize;					\
      if (ebi < windowsize)						\
	{								\
	  this_windowsize -= windowsize - ebi;				\
	  ebi = 0;							\
	}								\
      else								\
        ebi -= windowsize;						\
									\
      count_trailing_zeros (cnt, expbits);				\
      this_windowsize -= cnt;						\
      ebi += cnt;							\
      expbits >>= cnt;							\
									\
      do								\
	{								\
	  MPN_SQR (tp, rp, n);						\
	  MPN_REDUCE (rp, tp, mp, n, mip);				\
	  this_windowsize--;						\
	}								\
      while (this_windowsize != 0);					\
									\
      MPN_MUL_N (tp, rp, pp + n * (expbits >> 1), n);			\
      MPN_REDUCE (rp, tp, mp, n, mip);					\
    }


  /* Expand INNERLOOP once per size class.  The outer test compares two
     thresholds with each other, and each branch then tests n against the
     thresholds in ascending order.  The reduction routine chosen in each
     class is the one whose inverse was prepared in mip above.  */
#if WANT_REDC_2
  if (REDC_1_TO_REDC_2_THRESHOLD < MUL_TOOM22_THRESHOLD)
    {
      if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
	{
	  /* basecase multiply/square, redc_1 */
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
	  INNERLOOP;
	}
      else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
	{
	  /* basecase multiply/square, redc_2 */
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_2 (rp, tp, mp, n, mip)
	  INNERLOOP;
	}
      else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
	{
	  /* general multiply/square, redc_2 */
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_2 (rp, tp, mp, n, mip)
	  INNERLOOP;
	}
      else
	{
	  /* general multiply/square, redc_n */
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
	  INNERLOOP;
	}
    }
  else
    {
      if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
	{
	  /* basecase multiply/square, redc_1 */
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
	  INNERLOOP;
	}
      else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
	{
	  /* general multiply/square, redc_1 */
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
	  INNERLOOP;
	}
      else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
	{
	  /* general multiply/square, redc_2 */
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_2 (rp, tp, mp, n, mip)
	  INNERLOOP;
	}
      else
	{
	  /* general multiply/square, redc_n */
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
	  INNERLOOP;
	}
    }

#else  /* WANT_REDC_2 */

  if (REDC_1_TO_REDC_N_THRESHOLD < MUL_TOOM22_THRESHOLD)
    {
      if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
	{
	  /* basecase multiply/square, redc_1 */
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
	  INNERLOOP;
	}
      else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
	{
	  /* basecase multiply/square, redc_n */
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
	  INNERLOOP;
	}
      else
	{
	  /* general multiply/square, redc_n */
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
	  INNERLOOP;
	}
    }
  else
    {
      if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
	{
	  /* basecase multiply/square, redc_1 */
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
	  INNERLOOP;
	}
      else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
	{
	  /* general multiply/square, redc_1 */
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_1 (rp, tp, mp, n, mip[0])
	  INNERLOOP;
	}
      else
	{
	  /* general multiply/square, redc_n */
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
	  INNERLOOP;
	}
    }
#endif  /* WANT_REDC_2 */

 done:

  /* Leave REDC form: zero-extend rp to 2n limbs in tp and run it through one
     more reduction.  */
  MPN_COPY (tp, rp, n);
  MPN_ZERO (tp + n, n);

#if WANT_REDC_2
  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
    mpn_redc_1 (rp, tp, mp, n, mip[0]);
  else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
    mpn_redc_2 (rp, tp, mp, n, mip);
#else
  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
    mpn_redc_1 (rp, tp, mp, n, mip[0]);
#endif
  else
    mpn_redc_n (rp, tp, mp, n, mip);

  /* Subtract M once if the reduced value is not already below M.  */
  if (mpn_cmp (rp, mp, n) >= 0)
    mpn_sub_n (rp, rp, mp, n);

  TMP_FREE;
}
479