/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005, 2012, 2015, 2020 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include <stdlib.h>
#include <stdio.h>
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif


/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest power
   with a power-of-2 exponent, ie. b^(2^n).  It'd be possible to choose a
   bigger power, perhaps using mp_bases[b].big_base for small b, and
   thereby get better value from mpn_mul_1 or mpn_mul_2 in the bignum
   powering.  It's felt that doing so would be more complicated than it's
   worth, and could well end up being a slowdown for small e.  For big e on
   the other hand the algorithm is dominated by mpn_sqr so there wouldn't
   be much of a saving.  The current code can be viewed as simply doing the
   first few steps of the powering in a single or double limb where
   possible.

   If r==b, and btwos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more trouble
   than it's worth.  */
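
/* Purely illustrative sketch (hypothetical helper, not part of GMP and not
   compiled): the powering scheme used throughout this file is plain
   left-to-right binary exponentiation, squaring once per exponent bit and
   multiplying in the base wherever the bit is 1.  Shown here on ordinary
   unsigned longs, ignoring overflow, just to make the loop structure below
   easier to follow.  */
#if 0
static unsigned long
ref_pow (unsigned long b, unsigned long e)
{
  unsigned long  r = 1;
  int            i;
  for (i = sizeof (e) * 8 - 1; i >= 0; i--)
    {
      r = r * r;                 /* square for every bit position */
      if ((e >> i) & 1)
        r = r * b;               /* multiply in b where the bit is 1 */
    }
  return r;
}
#endif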


/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
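
/* Illustrative sanity check (sketch only, not compiled): squaring anything
   up to GMP_NUMB_HALFMAX stays within a limb.  With 32 non-nail bits, for
   instance, the value is 0xFFFF and 0xFFFF*0xFFFF = 0xFFFE0001 still fits,
   whereas 0x10000 squared would not.  */
#if 0
static void
check_gmp_numb_halfmax (void)
{
  mp_limb_t  h = GMP_NUMB_HALFMAX;
  ASSERT_ALWAYS (h * h <= GMP_NUMB_MAX);   /* squaring h never overflows */
}
#endif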


/* The following are for convenience, they update the size and check the
   alloc.  */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
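
/* A minimal usage sketch (not compiled) of the size-tracking convention
   above: each macro leaves "size" as the normalized size of the in-place
   result, so callers never re-count the top limb.  The values and the
   4-limb allocation are made up purely for illustration.  */
#if 0
static void
example_size_tracking (void)
{
  mp_limb_t  x[4];
  mp_size_t  xsize, xalloc = 4;

  x[0] = GMP_NUMB_MAX;
  xsize = 1;
  MPN_MUL_1 (x, xsize, xalloc, CNST_LIMB (2));  /* carry out, xsize becomes 2 */
  MPN_MUL_1 (x, xsize, xalloc, CNST_LIMB (1));  /* no carry, xsize stays 2 */
}
#endif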


/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)


void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr         rp;
  mp_size_t      rtwos_limbs, ralloc, rsize;
  int            rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t      blimb, rl;
  mp_bitcnt_t    rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t      blimb_low, rl_high;
#else
  mp_limb_t      b_twolimbs[2];
#endif
  mp_limb_t ovfl;
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      MPZ_NEWALLOC (r, 1)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;

  umul_ppmm (ovfl, rtwos_bits, e, btwos);
  if (ovfl)
    {
      fprintf (stderr, "gmp: overflow in mpz type\n");
      abort ();
    }

  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));

  TMP_MARK;

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 with the whole b^e in
         rl.  Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }
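
      /* Worked illustration (comment only, values chosen for the example):
         for b=3, e=5 (binary 101) the loop above does  rl=3, e=2, blimb=9;
         then e=1, blimb=81;  then rl=3*81=243 and e=0,  giving b^e=243
         entirely in rl via the got_rl exit.  */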

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
         probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
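
      /* For instance (illustrative values only): rl=7 with rtwos_bits=3
         becomes rl=56 and rtwos_bits=0, so the single final mpn_mul_1
         below applies both the power and the shift in one pass.  */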
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*GMP_NUMB_BITS-cnt) bits and there's e of
     them; +1 limb to round up the division; +1 for multiplies all using an
     extra limb over the true size; +2 for rl at the end; +1 for lshift at
     the end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */
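
  /* Worked example (illustrative numbers only): with 64-bit limbs, no
     nails, a 3-limb odd base whose top limb has cnt=10 leading zeros, and
     e=50, each factor takes 3*64-10 = 182 bits, so the product below is
     182*50 = 9100 and ralloc becomes 9100/64 + 5 = 147 limbs, comfortably
     above the at most ceil(9100/64) = 143 limbs of the true result.  */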

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);

  umul_ppmm (ovfl, ralloc, (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS), e);
  if (ovfl)
    {
      fprintf (stderr, "gmp: overflow in mpz type\n");
      abort ();
    }
  ralloc = ralloc / GMP_NUMB_BITS + 5;

  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  rp = MPZ_NEWALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr     tp;
      mp_size_t  talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second-last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which is not needed by tp.  In the mpn_mul loop when the low bit of
         e is 1, tp must hold nearly the full result, so just size it the
         same as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;
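
      /* For example (comment only): e=13 is binary 1101, so i starts at 2
         and the loops below scan bits 1, 0, 1, performing square+multiply,
         square, square+multiply on a value initialised to the base, i.e.
         (((b^2*b)^2)^2)*b = b^13.  */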

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t  mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange that the final result ends up in r, not in the temp
             space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange that the final result ends up in r, not in the temp
             space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int  parity;

          /* Arrange that the final result ends up in r, not in the temp
             space */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}