1/* Shared speed subroutines.
2
3Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2008, 2009, 2010
4Free Software Foundation, Inc.
5
6This file is part of the GNU MP Library.
7
8The GNU MP Library is free software; you can redistribute it and/or modify
9it under the terms of the GNU Lesser General Public License as published by
10the Free Software Foundation; either version 3 of the License, or (at your
11option) any later version.
12
13The GNU MP Library is distributed in the hope that it will be useful, but
14WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16License for more details.
17
18You should have received a copy of the GNU Lesser General Public License
19along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
20
21#define __GMP_NO_ATTRIBUTE_CONST_PURE
22
23#include <errno.h>
24#include <fcntl.h>
25#include <math.h>
26#include <stdio.h>
27#include <stdlib.h> /* for qsort */
28#include <string.h>
29#include <unistd.h>
30#if 0
31#include <sys/ioctl.h>
32#endif
33
34#include "gmp.h"
35#include "gmp-impl.h"
36#include "longlong.h"
37
38#include "tests.h"
39#include "speed.h"
40
41
42int   speed_option_addrs = 0;
43int   speed_option_verbose = 0;
44
45
46/* Provide __clz_tab even if it's not required, for the benefit of new code
47   being tested with many.pl. */
48#ifndef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
49#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
50#include "mp_clz_tab.c"
51#undef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
52#endif
53
54
55void
56pentium_wbinvd(void)
57{
58#if 0
59  {
60    static int  fd = -2;
61
62    if (fd == -2)
63      {
64	fd = open ("/dev/wbinvd", O_RDWR);
65	if (fd == -1)
66	  perror ("open /dev/wbinvd");
67      }
68
69    if (fd != -1)
70      ioctl (fd, 0, 0);
71  }
72#endif
73
74#if 0
75#define WBINVDSIZE  1024*1024*2
76  {
77    static char  *p = NULL;
78    int   i, sum;
79
80    if (p == NULL)
81      p = malloc (WBINVDSIZE);
82
83#if 0
84    for (i = 0; i < WBINVDSIZE; i++)
85      p[i] = i & 0xFF;
86#endif
87
88    sum = 0;
89    for (i = 0; i < WBINVDSIZE; i++)
90      sum += p[i];
91
92    mpn_cache_fill_dummy (sum);
93  }
94#endif
95}
96
97
98int
99double_cmp_ptr (const double *p, const double *q)
100{
101  if (*p > *q)  return 1;
102  if (*p < *q)  return -1;
103  return 0;
104}
105
106
107/* Measure the speed of a given routine.
108
109   The routine is run with enough repetitions to make it take at least
110   speed_precision * speed_unittime.  This aims to minimize the effects of a
111   limited accuracy time base and the overhead of the measuring itself.
112
113   Measurements are made looking for 4 results within TOLERANCE of each
114   other (or 3 for routines taking longer than 2 seconds).  This aims to get
115   an accurate reading even if some runs are bloated by interrupts or task
116   switches or whatever.
117
118   The given (*fun)() is expected to run its function "s->reps" many times
119   and return the total elapsed time measured using speed_starttime() and
120   speed_endtime().  If the function doesn't support the given s->size or
121   s->r, -1.0 should be returned.  See the various base routines below.  */
122
123double
124speed_measure (double (*fun) __GMP_PROTO ((struct speed_params *s)),
125	       struct speed_params *s)
126{
127#define TOLERANCE    1.005  /* 0.5% */
128  const int max_zeros = 10;
129
130  struct speed_params  s_dummy;
131  int     i, j, e;
132  double  t[30];
133  double  t_unsorted[30];
134  double  reps_d;
135  int     zeros = 0;
136
137  /* Use dummy parameters if caller doesn't provide any.  Only a few special
138     "fun"s will cope with this, speed_noop() is one.  */
139  if (s == NULL)
140    {
141      memset (&s_dummy, '\0', sizeof (s_dummy));
142      s = &s_dummy;
143    }
144
145  s->reps = 1;
146  s->time_divisor = 1.0;
147  for (i = 0; i < numberof (t); i++)
148    {
149      for (;;)
150	{
151	  s->src_num = 0;
152	  s->dst_num = 0;
153
154	  t[i] = (*fun) (s);
155
156	  if (speed_option_verbose >= 3)
157	    gmp_printf("size=%ld reps=%u r=%Md attempt=%d  %.9f\n",
158		       (long) s->size, s->reps, s->r, i, t[i]);
159
160	  if (t[i] == 0.0)
161	    {
162	      zeros++;
163	      if (zeros > max_zeros)
164		{
165		  fprintf (stderr, "Fatal error: too many (%d) failed measurements (0.0)\n", zeros);
166		  abort ();
167		}
168	      continue;
169	    }
170
171	  if (t[i] == -1.0)
172	    return -1.0;
173
174	  if (t[i] >= speed_unittime * speed_precision)
175	    break;
176
177	  /* go to a value of reps to make t[i] >= precision */
178	  reps_d = ceil (1.1 * s->reps
179			 * speed_unittime * speed_precision
180			 / MAX (t[i], speed_unittime));
181	  if (reps_d > 2e9 || reps_d < 1.0)
182	    {
183	      fprintf (stderr, "Fatal error: new reps bad: %.2f\n", reps_d);
184	      fprintf (stderr, "  (old reps %u, unittime %.4g, precision %d, t[i] %.4g)\n",
185		       s->reps, speed_unittime, speed_precision, t[i]);
186	      abort ();
187	    }
188	  s->reps = (unsigned) reps_d;
189	}
190      t[i] /= s->reps;
191      t_unsorted[i] = t[i];
192
193      if (speed_precision == 0)
194	return t[i];
195
196      /* require 3 values within TOLERANCE when >= 2 secs, 4 when below */
197      if (t[0] >= 2.0)
198	e = 3;
199      else
200	e = 4;
201
202      /* Look for e many t[]'s within TOLERANCE of each other to consider a
203	 valid measurement.  Return smallest among them.  */
204      if (i >= e)
205	{
206	  qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
207	  for (j = e-1; j < i; j++)
208	    if (t[j] <= t[j-e+1] * TOLERANCE)
209	      return t[j-e+1] / s->time_divisor;
210	}
211    }
212
213  fprintf (stderr, "speed_measure() could not get %d results within %.1f%%\n",
214	   e, (TOLERANCE-1.0)*100.0);
215  fprintf (stderr, "    unsorted         sorted\n");
216  fprintf (stderr, "  %.12f    %.12f    is about 0.5%%\n",
217	   t_unsorted[0]*(TOLERANCE-1.0), t[0]*(TOLERANCE-1.0));
218  for (i = 0; i < numberof (t); i++)
219    fprintf (stderr, "  %.09f       %.09f\n", t_unsorted[i], t[i]);
220
221  return -1.0;
222}
223
224
225/* Read all of ptr,size to get it into the CPU memory cache.
226
227   A call to mpn_cache_fill_dummy() is used to make sure the compiler
228   doesn't optimize away the whole loop.  Using "volatile mp_limb_t sum"
229   would work too, but the function call means we don't rely on every
230   compiler actually implementing volatile properly.
231
232   mpn_cache_fill_dummy() is in a separate source file to stop gcc thinking
233   it can inline it.  */
234
235void
236mpn_cache_fill (mp_srcptr ptr, mp_size_t size)
237{
238  mp_limb_t  sum = 0;
239  mp_size_t  i;
240
241  for (i = 0; i < size; i++)
242    sum += ptr[i];
243
244  mpn_cache_fill_dummy(sum);
245}
246
247
248void
249mpn_cache_fill_write (mp_ptr ptr, mp_size_t size)
250{
251  mpn_cache_fill (ptr, size);
252
253#if 0
254  mpn_random (ptr, size);
255#endif
256
257#if 0
258  mp_size_t  i;
259
260  for (i = 0; i < size; i++)
261    ptr[i] = i;
262#endif
263}
264
265
266void
267speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size)
268{
269  if (s->src_num >= numberof (s->src))
270    {
271      fprintf (stderr, "speed_operand_src: no room left in s->src[]\n");
272      abort ();
273    }
274  s->src[s->src_num].ptr = ptr;
275  s->src[s->src_num].size = size;
276  s->src_num++;
277}
278
279
280void
281speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size)
282{
283  if (s->dst_num >= numberof (s->dst))
284    {
285      fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n");
286      abort ();
287    }
288  s->dst[s->dst_num].ptr = ptr;
289  s->dst[s->dst_num].size = size;
290  s->dst_num++;
291}
292
293
294void
295speed_cache_fill (struct speed_params *s)
296{
297  static struct speed_params  prev;
298  int  i;
299
300  /* FIXME: need a better way to get the format string for a pointer */
301
302  if (speed_option_addrs)
303    {
304      int  different;
305
306      different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num);
307      for (i = 0; i < s->dst_num; i++)
308	different |= (s->dst[i].ptr != prev.dst[i].ptr);
309      for (i = 0; i < s->src_num; i++)
310	different |= (s->src[i].ptr != prev.src[i].ptr);
311
312      if (different)
313	{
314	  if (s->dst_num != 0)
315	    {
316	      printf ("dst");
317	      for (i = 0; i < s->dst_num; i++)
318		printf (" %08lX", (unsigned long) s->dst[i].ptr);
319	      printf (" ");
320	    }
321
322	  if (s->src_num != 0)
323	    {
324	      printf ("src");
325	      for (i = 0; i < s->src_num; i++)
326		printf (" %08lX", (unsigned long) s->src[i].ptr);
327	      printf (" ");
328	    }
329	  printf ("  (cf sp approx %08lX)\n", (unsigned long) &different);
330
331	}
332
333      memcpy (&prev, s, sizeof(prev));
334    }
335
336  switch (s->cache) {
337  case 0:
338    for (i = 0; i < s->dst_num; i++)
339      mpn_cache_fill_write (s->dst[i].ptr, s->dst[i].size);
340    for (i = 0; i < s->src_num; i++)
341      mpn_cache_fill (s->src[i].ptr, s->src[i].size);
342    break;
343  case 1:
344    pentium_wbinvd();
345    break;
346  }
347}
348
349
350/* Miscellanous options accepted by tune and speed programs under -o. */
351
352void
353speed_option_set (const char *s)
354{
355  int  n;
356
357  if (strcmp (s, "addrs") == 0)
358    {
359      speed_option_addrs = 1;
360    }
361  else if (strcmp (s, "verbose") == 0)
362    {
363      speed_option_verbose++;
364    }
365  else if (sscanf (s, "verbose=%d", &n) == 1)
366    {
367      speed_option_verbose = n;
368    }
369  else
370    {
371      printf ("Unrecognised -o option: %s\n", s);
372      exit (1);
373    }
374}
375
376
377/* The following are basic speed running routines for various gmp functions.
378   Many are very similar and use speed.h macros.
379
380   Each routine allocates it's own destination space for the result of the
381   function, because only it can know what the function needs.
382
383   speed_starttime() and speed_endtime() are put tight around the code to be
384   measured.  Any setups are done outside the timed portion.
385
386   Each routine is responsible for its own cache priming.
387   speed_cache_fill() is a good way to do this, see examples in speed.h.
388   One cache priming possibility, for CPUs with write-allocate cache, and
389   functions that don't take too long, is to do one dummy call before timing
390   so as to cache everything that gets used.  But speed_measure() runs a
391   routine at least twice and will take the smaller time, so this might not
392   be necessary.
393
394   Data alignment will be important, for source, destination and temporary
395   workspace.  A routine can align its destination and workspace.  Programs
396   using the routines will ensure s->xp and s->yp are aligned.  Aligning
397   onto a CACHE_LINE_SIZE boundary is suggested.  s->align_wp and
398   s->align_wp2 should be respected where it makes sense to do so.
399   SPEED_TMP_ALLOC_LIMBS is a good way to do this.
400
401   A loop of the following form can be expected to turn into good assembler
402   code on most CPUs, thereby minimizing overhead in the measurement.  It
403   can always be assumed s->reps >= 1.
404
405	  i = s->reps
406	  do
407	    foo();
408	  while (--i != 0);
409
410   Additional parameters might be added to "struct speed_params" in the
411   future.  Routines should ignore anything they don't use.
412
413   s->size can be used creatively, and s->xp and s->yp can be ignored.  For
414   example, speed_mpz_fac_ui() uses s->size as n for the factorial.  s->r is
415   just a user-supplied parameter.  speed_mpn_lshift() uses it as a shift,
416   speed_mpn_mul_1() uses it as a multiplier.  */
417
418
419/* MPN_COPY etc can be macros, so the _CALL forms are necessary */
420double
421speed_MPN_COPY (struct speed_params *s)
422{
423  SPEED_ROUTINE_MPN_COPY (MPN_COPY);
424}
425double
426speed_MPN_COPY_INCR (struct speed_params *s)
427{
428  SPEED_ROUTINE_MPN_COPY (MPN_COPY_INCR);
429}
430double
431speed_MPN_COPY_DECR (struct speed_params *s)
432{
433  SPEED_ROUTINE_MPN_COPY (MPN_COPY_DECR);
434}
435#if HAVE_NATIVE_mpn_copyi
436double
437speed_mpn_copyi (struct speed_params *s)
438{
439  SPEED_ROUTINE_MPN_COPY (mpn_copyi);
440}
441#endif
442#if HAVE_NATIVE_mpn_copyd
443double
444speed_mpn_copyd (struct speed_params *s)
445{
446  SPEED_ROUTINE_MPN_COPY (mpn_copyd);
447}
448#endif
449double
450speed_memcpy (struct speed_params *s)
451{
452  SPEED_ROUTINE_MPN_COPY_BYTES (memcpy);
453}
454double
455speed_mpn_com (struct speed_params *s)
456{
457  SPEED_ROUTINE_MPN_COPY (mpn_com);
458}
459
460
461double
462speed_mpn_addmul_1 (struct speed_params *s)
463{
464  SPEED_ROUTINE_MPN_UNARY_1 (mpn_addmul_1);
465}
466double
467speed_mpn_submul_1 (struct speed_params *s)
468{
469  SPEED_ROUTINE_MPN_UNARY_1 (mpn_submul_1);
470}
471
472#if HAVE_NATIVE_mpn_addmul_2
473double
474speed_mpn_addmul_2 (struct speed_params *s)
475{
476  SPEED_ROUTINE_MPN_UNARY_2 (mpn_addmul_2);
477}
478#endif
479#if HAVE_NATIVE_mpn_addmul_3
480double
481speed_mpn_addmul_3 (struct speed_params *s)
482{
483  SPEED_ROUTINE_MPN_UNARY_3 (mpn_addmul_3);
484}
485#endif
486#if HAVE_NATIVE_mpn_addmul_4
487double
488speed_mpn_addmul_4 (struct speed_params *s)
489{
490  SPEED_ROUTINE_MPN_UNARY_4 (mpn_addmul_4);
491}
492#endif
493#if HAVE_NATIVE_mpn_addmul_5
494double
495speed_mpn_addmul_5 (struct speed_params *s)
496{
497  SPEED_ROUTINE_MPN_UNARY_5 (mpn_addmul_5);
498}
499#endif
500#if HAVE_NATIVE_mpn_addmul_6
501double
502speed_mpn_addmul_6 (struct speed_params *s)
503{
504  SPEED_ROUTINE_MPN_UNARY_6 (mpn_addmul_6);
505}
506#endif
507#if HAVE_NATIVE_mpn_addmul_7
508double
509speed_mpn_addmul_7 (struct speed_params *s)
510{
511  SPEED_ROUTINE_MPN_UNARY_7 (mpn_addmul_7);
512}
513#endif
514#if HAVE_NATIVE_mpn_addmul_8
515double
516speed_mpn_addmul_8 (struct speed_params *s)
517{
518  SPEED_ROUTINE_MPN_UNARY_8 (mpn_addmul_8);
519}
520#endif
521
522double
523speed_mpn_mul_1 (struct speed_params *s)
524{
525  SPEED_ROUTINE_MPN_UNARY_1 (mpn_mul_1);
526}
527double
528speed_mpn_mul_1_inplace (struct speed_params *s)
529{
530  SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_mul_1);
531}
532
533#if HAVE_NATIVE_mpn_mul_2
534double
535speed_mpn_mul_2 (struct speed_params *s)
536{
537  SPEED_ROUTINE_MPN_UNARY_2 (mpn_mul_2);
538}
539#endif
540#if HAVE_NATIVE_mpn_mul_3
541double
542speed_mpn_mul_3 (struct speed_params *s)
543{
544  SPEED_ROUTINE_MPN_UNARY_3 (mpn_mul_3);
545}
546#endif
547#if HAVE_NATIVE_mpn_mul_4
548double
549speed_mpn_mul_4 (struct speed_params *s)
550{
551  SPEED_ROUTINE_MPN_UNARY_4 (mpn_mul_4);
552}
553#endif
554
555
556double
557speed_mpn_lshift (struct speed_params *s)
558{
559  SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshift);
560}
561double
562speed_mpn_lshiftc (struct speed_params *s)
563{
564  SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshiftc);
565}
566double
567speed_mpn_rshift (struct speed_params *s)
568{
569  SPEED_ROUTINE_MPN_UNARY_1 (mpn_rshift);
570}
571
572
573/* The carry-in variants (if available) are good for measuring because they
574   won't skip a division if high<divisor.  Alternately, use -1 as a divisor
575   with the plain _1 forms. */
576double
577speed_mpn_divrem_1 (struct speed_params *s)
578{
579  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1);
580}
581double
582speed_mpn_divrem_1f (struct speed_params *s)
583{
584  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1);
585}
586#if HAVE_NATIVE_mpn_divrem_1c
587double
588speed_mpn_divrem_1c (struct speed_params *s)
589{
590  SPEED_ROUTINE_MPN_DIVREM_1C (mpn_divrem_1c);
591}
592double
593speed_mpn_divrem_1cf (struct speed_params *s)
594{
595  SPEED_ROUTINE_MPN_DIVREM_1CF (mpn_divrem_1c);
596}
597#endif
598
599double
600speed_mpn_divrem_1_div (struct speed_params *s)
601{
602  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_div);
603}
604double
605speed_mpn_divrem_1f_div (struct speed_params *s)
606{
607  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_div);
608}
609double
610speed_mpn_divrem_1_inv (struct speed_params *s)
611{
612  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_inv);
613}
614double
615speed_mpn_divrem_1f_inv (struct speed_params *s)
616{
617  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_inv);
618}
619double
620speed_mpn_mod_1_div (struct speed_params *s)
621{
622  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_div);
623}
624double
625speed_mpn_mod_1_inv (struct speed_params *s)
626{
627  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_inv);
628}
629
630double
631speed_mpn_preinv_divrem_1 (struct speed_params *s)
632{
633  SPEED_ROUTINE_MPN_PREINV_DIVREM_1 (mpn_preinv_divrem_1);
634}
635double
636speed_mpn_preinv_divrem_1f (struct speed_params *s)
637{
638  SPEED_ROUTINE_MPN_PREINV_DIVREM_1F (mpn_preinv_divrem_1);
639}
640
641#if GMP_NUMB_BITS % 4 == 0
642double
643speed_mpn_mod_34lsub1 (struct speed_params *s)
644{
645  SPEED_ROUTINE_MPN_MOD_34LSUB1 (mpn_mod_34lsub1);
646}
647#endif
648
649double
650speed_mpn_divrem_2 (struct speed_params *s)
651{
652  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2);
653}
654double
655speed_mpn_divrem_2_div (struct speed_params *s)
656{
657  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_div);
658}
659double
660speed_mpn_divrem_2_inv (struct speed_params *s)
661{
662  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_inv);
663}
664
665double
666speed_mpn_mod_1 (struct speed_params *s)
667{
668  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1);
669}
670#if HAVE_NATIVE_mpn_mod_1c
671double
672speed_mpn_mod_1c (struct speed_params *s)
673{
674  SPEED_ROUTINE_MPN_MOD_1C (mpn_mod_1c);
675}
676#endif
677double
678speed_mpn_preinv_mod_1 (struct speed_params *s)
679{
680  SPEED_ROUTINE_MPN_PREINV_MOD_1 (mpn_preinv_mod_1);
681}
682double
683speed_mpn_mod_1_1 (struct speed_params *s)
684{
685  SPEED_ROUTINE_MPN_MOD_1_1 (mpn_mod_1_1p,mpn_mod_1_1p_cps);
686}
687double
688speed_mpn_mod_1_2 (struct speed_params *s)
689{
690  SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_2p,mpn_mod_1s_2p_cps,2);
691}
692double
693speed_mpn_mod_1_3 (struct speed_params *s)
694{
695  SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_3p,mpn_mod_1s_3p_cps,3);
696}
697double
698speed_mpn_mod_1_4 (struct speed_params *s)
699{
700  SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_4p,mpn_mod_1s_4p_cps,4);
701}
702
703double
704speed_mpn_divexact_1 (struct speed_params *s)
705{
706  SPEED_ROUTINE_MPN_DIVEXACT_1 (mpn_divexact_1);
707}
708
709double
710speed_mpn_divexact_by3 (struct speed_params *s)
711{
712  SPEED_ROUTINE_MPN_COPY (mpn_divexact_by3);
713}
714
715double
716speed_mpn_bdiv_dbm1c (struct speed_params *s)
717{
718  SPEED_ROUTINE_MPN_BDIV_DBM1C (mpn_bdiv_dbm1c);
719}
720
721double
722speed_mpn_bdiv_q_1 (struct speed_params *s)
723{
724  SPEED_ROUTINE_MPN_BDIV_Q_1 (mpn_bdiv_q_1);
725}
726
727double
728speed_mpn_pi1_bdiv_q_1 (struct speed_params *s)
729{
730  SPEED_ROUTINE_MPN_PI1_BDIV_Q_1 (mpn_pi1_bdiv_q_1);
731}
732
733#if HAVE_NATIVE_mpn_modexact_1_odd
734double
735speed_mpn_modexact_1_odd (struct speed_params *s)
736{
737  SPEED_ROUTINE_MPN_MODEXACT_1_ODD (mpn_modexact_1_odd);
738}
739#endif
740
741double
742speed_mpn_modexact_1c_odd (struct speed_params *s)
743{
744  SPEED_ROUTINE_MPN_MODEXACT_1C_ODD (mpn_modexact_1c_odd);
745}
746
747double
748speed_mpz_mod (struct speed_params *s)
749{
750  SPEED_ROUTINE_MPZ_MOD (mpz_mod);
751}
752
753double
754speed_mpn_sbpi1_div_qr (struct speed_params *s)
755{
756  SPEED_ROUTINE_MPN_PI1_DIV (mpn_sbpi1_div_qr, inv.inv32, 2,0);
757}
758double
759speed_mpn_dcpi1_div_qr (struct speed_params *s)
760{
761  SPEED_ROUTINE_MPN_PI1_DIV (mpn_dcpi1_div_qr, &inv, 6,3);
762}
763double
764speed_mpn_sbpi1_divappr_q (struct speed_params *s)
765{
766  SPEED_ROUTINE_MPN_PI1_DIV (mpn_sbpi1_divappr_q, inv.inv32, 2,0);
767}
768double
769speed_mpn_dcpi1_divappr_q (struct speed_params *s)
770{
771  SPEED_ROUTINE_MPN_PI1_DIV (mpn_dcpi1_divappr_q, &inv, 6,3);
772}
773double
774speed_mpn_mu_div_qr (struct speed_params *s)
775{
776  SPEED_ROUTINE_MPN_MU_DIV_QR (mpn_mu_div_qr, mpn_mu_div_qr_itch);
777}
778double
779speed_mpn_mu_divappr_q (struct speed_params *s)
780{
781  SPEED_ROUTINE_MPN_MU_DIV_Q (mpn_mu_divappr_q, mpn_mu_divappr_q_itch);
782}
783double
784speed_mpn_mu_div_q (struct speed_params *s)
785{
786  SPEED_ROUTINE_MPN_MU_DIV_Q (mpn_mu_div_q, mpn_mu_div_q_itch);
787}
788double
789speed_mpn_mupi_div_qr (struct speed_params *s)
790{
791  SPEED_ROUTINE_MPN_MUPI_DIV_QR (mpn_preinv_mu_div_qr, mpn_preinv_mu_div_qr_itch);
792}
793
794double
795speed_mpn_sbpi1_bdiv_qr (struct speed_params *s)
796{
797  SPEED_ROUTINE_MPN_PI1_BDIV_QR (mpn_sbpi1_bdiv_qr);
798}
799double
800speed_mpn_dcpi1_bdiv_qr (struct speed_params *s)
801{
802  SPEED_ROUTINE_MPN_PI1_BDIV_QR (mpn_dcpi1_bdiv_qr);
803}
804double
805speed_mpn_sbpi1_bdiv_q (struct speed_params *s)
806{
807  SPEED_ROUTINE_MPN_PI1_BDIV_Q (mpn_sbpi1_bdiv_q);
808}
809double
810speed_mpn_dcpi1_bdiv_q (struct speed_params *s)
811{
812  SPEED_ROUTINE_MPN_PI1_BDIV_Q (mpn_dcpi1_bdiv_q);
813}
814double
815speed_mpn_mu_bdiv_q (struct speed_params *s)
816{
817  SPEED_ROUTINE_MPN_MU_BDIV_Q (mpn_mu_bdiv_q, mpn_mu_bdiv_q_itch);
818}
819double
820speed_mpn_mu_bdiv_qr (struct speed_params *s)
821{
822  SPEED_ROUTINE_MPN_MU_BDIV_QR (mpn_mu_bdiv_qr, mpn_mu_bdiv_qr_itch);
823}
824
825double
826speed_mpn_binvert (struct speed_params *s)
827{
828  SPEED_ROUTINE_MPN_BINVERT (mpn_binvert, mpn_binvert_itch);
829}
830
831double
832speed_mpn_invert (struct speed_params *s)
833{
834  SPEED_ROUTINE_MPN_INVERT (mpn_invert, mpn_invert_itch);
835}
836
837double
838speed_mpn_invertappr (struct speed_params *s)
839{
840  SPEED_ROUTINE_MPN_INVERTAPPR (mpn_invertappr, mpn_invertappr_itch);
841}
842
843double
844speed_mpn_ni_invertappr (struct speed_params *s)
845{
846  SPEED_ROUTINE_MPN_INVERTAPPR (mpn_ni_invertappr, mpn_invertappr_itch);
847}
848
849double
850speed_mpn_redc_1 (struct speed_params *s)
851{
852  SPEED_ROUTINE_REDC_1 (mpn_redc_1);
853}
854double
855speed_mpn_redc_2 (struct speed_params *s)
856{
857  SPEED_ROUTINE_REDC_2 (mpn_redc_2);
858}
859double
860speed_mpn_redc_n (struct speed_params *s)
861{
862  SPEED_ROUTINE_REDC_N (mpn_redc_n);
863}
864
865
866double
867speed_mpn_popcount (struct speed_params *s)
868{
869  SPEED_ROUTINE_MPN_POPCOUNT (mpn_popcount);
870}
871double
872speed_mpn_hamdist (struct speed_params *s)
873{
874  SPEED_ROUTINE_MPN_HAMDIST (mpn_hamdist);
875}
876
877
878double
879speed_mpn_add_n (struct speed_params *s)
880{
881  SPEED_ROUTINE_MPN_BINARY_N (mpn_add_n);
882}
883double
884speed_mpn_sub_n (struct speed_params *s)
885{
886SPEED_ROUTINE_MPN_BINARY_N (mpn_sub_n);
887}
888
889#if HAVE_NATIVE_mpn_add_n_sub_n
890double
891speed_mpn_add_n_sub_n (struct speed_params *s)
892{
893  SPEED_ROUTINE_MPN_ADDSUB_N_CALL (mpn_add_n_sub_n (ap, sp, s->xp, s->yp, s->size));
894}
895#endif
896
897#if HAVE_NATIVE_mpn_addlsh1_n
898double
899speed_mpn_addlsh1_n (struct speed_params *s)
900{
901  SPEED_ROUTINE_MPN_BINARY_N (mpn_addlsh1_n);
902}
903#endif
904#if HAVE_NATIVE_mpn_sublsh1_n
905double
906speed_mpn_sublsh1_n (struct speed_params *s)
907{
908  SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh1_n);
909}
910#endif
911#if HAVE_NATIVE_mpn_rsblsh1_n
912double
913speed_mpn_rsblsh1_n (struct speed_params *s)
914{
915  SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh1_n);
916}
917#endif
918#if HAVE_NATIVE_mpn_addlsh2_n
919double
920speed_mpn_addlsh2_n (struct speed_params *s)
921{
922  SPEED_ROUTINE_MPN_BINARY_N (mpn_addlsh2_n);
923}
924#endif
925#if HAVE_NATIVE_mpn_sublsh2_n
926double
927speed_mpn_sublsh2_n (struct speed_params *s)
928{
929  SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh2_n);
930}
931#endif
932#if HAVE_NATIVE_mpn_rsblsh2_n
933double
934speed_mpn_rsblsh2_n (struct speed_params *s)
935{
936  SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh2_n);
937}
938#endif
939#if HAVE_NATIVE_mpn_rsh1add_n
940double
941speed_mpn_rsh1add_n (struct speed_params *s)
942{
943  SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1add_n);
944}
945#endif
946#if HAVE_NATIVE_mpn_rsh1sub_n
947double
948speed_mpn_rsh1sub_n (struct speed_params *s)
949{
950  SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1sub_n);
951}
952#endif
953
954/* mpn_and_n etc can be macros and so have to be handled with
955   SPEED_ROUTINE_MPN_BINARY_N_CALL forms */
956double
957speed_mpn_and_n (struct speed_params *s)
958{
959  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_and_n (wp, s->xp, s->yp, s->size));
960}
961double
962speed_mpn_andn_n (struct speed_params *s)
963{
964SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_andn_n (wp, s->xp, s->yp, s->size));
965}
966double
967speed_mpn_nand_n (struct speed_params *s)
968{
969  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nand_n (wp, s->xp, s->yp, s->size));
970}
971double
972speed_mpn_ior_n (struct speed_params *s)
973{
974SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_ior_n (wp, s->xp, s->yp, s->size));
975}
976double
977speed_mpn_iorn_n (struct speed_params *s)
978{
979  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_iorn_n (wp, s->xp, s->yp, s->size));
980}
981double
982speed_mpn_nior_n (struct speed_params *s)
983{
984  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nior_n (wp, s->xp, s->yp, s->size));
985}
986double
987speed_mpn_xor_n (struct speed_params *s)
988{
989  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xor_n (wp, s->xp, s->yp, s->size));
990}
991double
992speed_mpn_xnor_n (struct speed_params *s)
993{
994  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xnor_n (wp, s->xp, s->yp, s->size));
995}
996
997
998double
999speed_mpn_mul_n (struct speed_params *s)
1000{
1001  SPEED_ROUTINE_MPN_MUL_N (mpn_mul_n);
1002}
1003double
1004speed_mpn_sqr (struct speed_params *s)
1005{
1006  SPEED_ROUTINE_MPN_SQR (mpn_sqr);
1007}
1008double
1009speed_mpn_mul_n_sqr (struct speed_params *s)
1010{
1011  SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size));
1012}
1013
1014double
1015speed_mpn_mul_basecase (struct speed_params *s)
1016{
1017  SPEED_ROUTINE_MPN_MUL(mpn_mul_basecase);
1018}
1019double
1020speed_mpn_mul (struct speed_params *s)
1021{
1022  SPEED_ROUTINE_MPN_MUL(mpn_mul);
1023}
1024double
1025speed_mpn_sqr_basecase (struct speed_params *s)
1026{
1027  /* FIXME: size restrictions on some versions of sqr_basecase */
1028  SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase);
1029}
1030
1031#if HAVE_NATIVE_mpn_sqr_diagonal
1032double
1033speed_mpn_sqr_diagonal (struct speed_params *s)
1034{
1035  SPEED_ROUTINE_MPN_SQR (mpn_sqr_diagonal);
1036}
1037#endif
1038
1039double
1040speed_mpn_toom2_sqr (struct speed_params *s)
1041{
1042  SPEED_ROUTINE_MPN_TOOM2_SQR (mpn_toom2_sqr);
1043}
1044double
1045speed_mpn_toom3_sqr (struct speed_params *s)
1046{
1047  SPEED_ROUTINE_MPN_TOOM3_SQR (mpn_toom3_sqr);
1048}
1049double
1050speed_mpn_toom4_sqr (struct speed_params *s)
1051{
1052  SPEED_ROUTINE_MPN_TOOM4_SQR (mpn_toom4_sqr);
1053}
1054double
1055speed_mpn_toom6_sqr (struct speed_params *s)
1056{
1057  SPEED_ROUTINE_MPN_TOOM6_SQR (mpn_toom6_sqr);
1058}
1059double
1060speed_mpn_toom8_sqr (struct speed_params *s)
1061{
1062  SPEED_ROUTINE_MPN_TOOM8_SQR (mpn_toom8_sqr);
1063}
1064double
1065speed_mpn_toom22_mul (struct speed_params *s)
1066{
1067  SPEED_ROUTINE_MPN_TOOM22_MUL_N (mpn_toom22_mul);
1068}
1069double
1070speed_mpn_toom33_mul (struct speed_params *s)
1071{
1072  SPEED_ROUTINE_MPN_TOOM33_MUL_N (mpn_toom33_mul);
1073}
1074double
1075speed_mpn_toom44_mul (struct speed_params *s)
1076{
1077  SPEED_ROUTINE_MPN_TOOM44_MUL_N (mpn_toom44_mul);
1078}
1079double
1080speed_mpn_toom6h_mul (struct speed_params *s)
1081{
1082  SPEED_ROUTINE_MPN_TOOM6H_MUL_N (mpn_toom6h_mul);
1083}
1084double
1085speed_mpn_toom8h_mul (struct speed_params *s)
1086{
1087  SPEED_ROUTINE_MPN_TOOM8H_MUL_N (mpn_toom8h_mul);
1088}
1089
1090double
1091speed_mpn_toom32_mul (struct speed_params *s)
1092{
1093  SPEED_ROUTINE_MPN_TOOM32_MUL (mpn_toom32_mul);
1094}
1095double
1096speed_mpn_toom42_mul (struct speed_params *s)
1097{
1098  SPEED_ROUTINE_MPN_TOOM42_MUL (mpn_toom42_mul);
1099}
1100double
1101speed_mpn_toom43_mul (struct speed_params *s)
1102{
1103  SPEED_ROUTINE_MPN_TOOM43_MUL (mpn_toom43_mul);
1104}
1105double
1106speed_mpn_toom63_mul (struct speed_params *s)
1107{
1108  SPEED_ROUTINE_MPN_TOOM63_MUL (mpn_toom63_mul);
1109}
1110double
1111speed_mpn_toom32_for_toom43_mul (struct speed_params *s)
1112{
1113  SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL (mpn_toom32_mul);
1114}
1115double
1116speed_mpn_toom43_for_toom32_mul (struct speed_params *s)
1117{
1118  SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL (mpn_toom43_mul);
1119}
1120double
1121speed_mpn_toom32_for_toom53_mul (struct speed_params *s)
1122{
1123  SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL (mpn_toom32_mul);
1124}
1125double
1126speed_mpn_toom53_for_toom32_mul (struct speed_params *s)
1127{
1128  SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL (mpn_toom53_mul);
1129}
1130double
1131speed_mpn_toom42_for_toom53_mul (struct speed_params *s)
1132{
1133  SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL (mpn_toom42_mul);
1134}
1135double
1136speed_mpn_toom53_for_toom42_mul (struct speed_params *s)
1137{
1138  SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL (mpn_toom53_mul);
1139}
1140
1141double
1142speed_mpn_nussbaumer_mul (struct speed_params *s)
1143{
1144  SPEED_ROUTINE_MPN_MUL_N_CALL
1145    (mpn_nussbaumer_mul (wp, s->xp, s->size, s->yp, s->size));
1146}
1147double
1148speed_mpn_nussbaumer_mul_sqr (struct speed_params *s)
1149{
1150  SPEED_ROUTINE_MPN_SQR_CALL
1151    (mpn_nussbaumer_mul (wp, s->xp, s->size, s->xp, s->size));
1152}
1153
1154#if WANT_OLD_FFT_FULL
1155double
1156speed_mpn_mul_fft_full (struct speed_params *s)
1157{
1158  SPEED_ROUTINE_MPN_MUL_N_CALL
1159    (mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size));
1160}
1161double
1162speed_mpn_mul_fft_full_sqr (struct speed_params *s)
1163{
1164  SPEED_ROUTINE_MPN_SQR_CALL
1165    (mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size));
1166}
1167#endif
1168
1169/* These are mod 2^N+1 multiplies and squares.  If s->r is supplied it's
1170   used as k, otherwise the best k for the size is used.  If s->size isn't a
1171   multiple of 2^k it's rounded up to make the effective operation size.  */
1172
1173#define SPEED_ROUTINE_MPN_MUL_FFT_CALL(call, sqr)       \
1174  {                                                     \
1175    mp_ptr     wp;                                      \
1176    mp_size_t  pl;                                      \
1177    int        k;                                       \
1178    unsigned   i;                                       \
1179    double     t;                                       \
1180    TMP_DECL;                                           \
1181							\
1182    SPEED_RESTRICT_COND (s->size >= 1);                 \
1183							\
1184    if (s->r != 0)                                      \
1185      k = s->r;                                         \
1186    else                                                \
1187      k = mpn_fft_best_k (s->size, sqr);                \
1188							\
1189    TMP_MARK;                                           \
1190    pl = mpn_fft_next_size (s->size, k);                \
1191    SPEED_TMP_ALLOC_LIMBS (wp, pl+1, s->align_wp);      \
1192							\
1193    speed_operand_src (s, s->xp, s->size);              \
1194    if (!sqr)                                           \
1195      speed_operand_src (s, s->yp, s->size);            \
1196    speed_operand_dst (s, wp, pl+1);                    \
1197    speed_cache_fill (s);                               \
1198							\
1199    speed_starttime ();                                 \
1200    i = s->reps;                                        \
1201    do                                                  \
1202      call;                                             \
1203    while (--i != 0);                                   \
1204    t = speed_endtime ();                               \
1205							\
1206    TMP_FREE;                                           \
1207    return t;                                           \
1208  }
1209
1210double
1211speed_mpn_mul_fft (struct speed_params *s)
1212{
1213  SPEED_ROUTINE_MPN_MUL_FFT_CALL
1214    (mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0);
1215}
1216
1217double
1218speed_mpn_mul_fft_sqr (struct speed_params *s)
1219{
1220  SPEED_ROUTINE_MPN_MUL_FFT_CALL
1221    (mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1);
1222}
1223
1224double
1225speed_mpn_fft_mul (struct speed_params *s)
1226{
1227  SPEED_ROUTINE_MPN_MUL_N_CALL (mpn_fft_mul (wp, s->xp, s->size, s->yp, s->size));
1228}
1229
1230double
1231speed_mpn_fft_sqr (struct speed_params *s)
1232{
1233  SPEED_ROUTINE_MPN_SQR_CALL (mpn_fft_mul (wp, s->xp, s->size, s->xp, s->size));
1234}
1235
1236double
1237speed_mpn_mullo_n (struct speed_params *s)
1238{
1239  SPEED_ROUTINE_MPN_MULLO_N (mpn_mullo_n);
1240}
1241double
1242speed_mpn_mullo_basecase (struct speed_params *s)
1243{
1244  SPEED_ROUTINE_MPN_MULLO_BASECASE (mpn_mullo_basecase);
1245}
1246
1247double
1248speed_mpn_mulmod_bnm1 (struct speed_params *s)
1249{
1250  SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_mulmod_bnm1 (wp, s->size, s->xp, s->size, s->yp, s->size, tp));
1251}
1252
1253double
1254speed_mpn_bc_mulmod_bnm1 (struct speed_params *s)
1255{
1256  SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_bc_mulmod_bnm1 (wp, s->xp, s->yp, s->size, tp));
1257}
1258
1259double
1260speed_mpn_mulmod_bnm1_rounded (struct speed_params *s)
1261{
1262  SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED (mpn_mulmod_bnm1);
1263}
1264
1265double
1266speed_mpn_sqrmod_bnm1 (struct speed_params *s)
1267{
1268  SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_sqrmod_bnm1 (wp, s->size, s->xp, s->size, tp));
1269}
1270
1271double
1272speed_mpn_matrix22_mul (struct speed_params *s)
1273{
1274  /* Speed params only includes 2 inputs, so we have to invent the
1275     other 6. */
1276
1277  mp_ptr a;
1278  mp_ptr r;
1279  mp_ptr b;
1280  mp_ptr tp;
1281  mp_size_t itch;
1282  unsigned i;
1283  double t;
1284  TMP_DECL;
1285
1286  TMP_MARK;
1287  SPEED_TMP_ALLOC_LIMBS (a, 4 * s->size, s->align_xp);
1288  SPEED_TMP_ALLOC_LIMBS (b, 4 * s->size, s->align_yp);
1289  SPEED_TMP_ALLOC_LIMBS (r, 8 * s->size + 4, s->align_wp);
1290
1291  MPN_COPY (a, s->xp, s->size);
1292  mpn_random (a + s->size, 3 * s->size);
1293  MPN_COPY (b, s->yp, s->size);
1294  mpn_random (b + s->size, 3 * s->size);
1295
1296  itch = mpn_matrix22_mul_itch (s->size, s->size);
1297  SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);
1298
1299  speed_operand_src (s, a, 4 * s->size);
1300  speed_operand_src (s, b, 4 * s->size);
1301  speed_operand_dst (s, r, 8 * s->size + 4);
1302  speed_operand_dst (s, tp, itch);
1303  speed_cache_fill (s);
1304
1305  speed_starttime ();
1306  i = s->reps;
1307  do
1308    {
1309      mp_size_t sz = s->size;
1310      MPN_COPY (r + 0 * sz + 0, a + 0 * sz, sz);
1311      MPN_COPY (r + 2 * sz + 1, a + 1 * sz, sz);
1312      MPN_COPY (r + 4 * sz + 2, a + 2 * sz, sz);
1313      MPN_COPY (r + 6 * sz + 3, a + 3 * sz, sz);
1314      mpn_matrix22_mul (r, r + 2 * sz + 1, r + 4 * sz + 2, r + 6 * sz + 3, sz,
1315			b, b + 1 * sz,     b + 2 * sz,     b + 3 * sz,     sz,
1316			tp);
1317    }
1318  while (--i != 0);
1319  t = speed_endtime();
1320  TMP_FREE;
1321  return t;
1322}
1323
1324double
1325speed_mpn_hgcd (struct speed_params *s)
1326{
1327  mp_ptr wp;
1328  mp_size_t hgcd_init_scratch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);
1329  mp_size_t hgcd_scratch = mpn_hgcd_itch (s->size);
1330  mp_ptr ap;
1331  mp_ptr bp;
1332  mp_ptr tmp1;
1333
1334  struct hgcd_matrix hgcd;
1335  int res;
1336  unsigned i;
1337  double t;
1338  TMP_DECL;
1339
1340  if (s->size < 2)
1341    return -1;
1342
1343  TMP_MARK;
1344
1345  SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);
1346  SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);
1347
1348  s->xp[s->size - 1] |= 1;
1349  s->yp[s->size - 1] |= 1;
1350
1351  SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_scratch, s->align_wp);
1352  SPEED_TMP_ALLOC_LIMBS (wp, hgcd_scratch, s->align_wp);
1353
1354  speed_starttime ();
1355  i = s->reps;
1356  do
1357    {
1358      MPN_COPY (ap, s->xp, s->size);
1359      MPN_COPY (bp, s->yp, s->size);
1360      mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);
1361      res = mpn_hgcd (ap, bp, s->size, &hgcd, wp);
1362    }
1363  while (--i != 0);
1364  t = speed_endtime ();
1365  TMP_FREE;
1366  return t;
1367}
1368
1369double
1370speed_mpn_hgcd_lehmer (struct speed_params *s)
1371{
1372  mp_ptr wp;
1373  mp_size_t hgcd_init_scratch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);
1374  mp_size_t hgcd_scratch = MPN_HGCD_LEHMER_ITCH (s->size);
1375  mp_ptr ap;
1376  mp_ptr bp;
1377  mp_ptr tmp1;
1378
1379  struct hgcd_matrix hgcd;
1380  int res;
1381  unsigned i;
1382  double t;
1383  TMP_DECL;
1384
1385  if (s->size < 2)
1386    return -1;
1387
1388  TMP_MARK;
1389
1390  SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);
1391  SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);
1392
1393  s->xp[s->size - 1] |= 1;
1394  s->yp[s->size - 1] |= 1;
1395
1396  SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_scratch, s->align_wp);
1397  SPEED_TMP_ALLOC_LIMBS (wp, hgcd_scratch, s->align_wp);
1398
1399  speed_starttime ();
1400  i = s->reps;
1401  do
1402    {
1403      MPN_COPY (ap, s->xp, s->size);
1404      MPN_COPY (bp, s->yp, s->size);
1405      mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);
1406      res = mpn_hgcd_lehmer (ap, bp, s->size, &hgcd, wp);
1407    }
1408  while (--i != 0);
1409  t = speed_endtime ();
1410  TMP_FREE;
1411  return t;
1412}
1413
1414double
1415speed_mpn_gcd (struct speed_params *s)
1416{
1417  SPEED_ROUTINE_MPN_GCD (mpn_gcd);
1418}
1419
1420double
1421speed_mpn_gcdext (struct speed_params *s)
1422{
1423  SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext);
1424}
1425#if 0
1426double
1427speed_mpn_gcdext_lehmer (struct speed_params *s)
1428{
1429  SPEED_ROUTINE_MPN_GCDEXT (__gmpn_gcdext_lehmer);
1430}
1431#endif
1432double
1433speed_mpn_gcdext_single (struct speed_params *s)
1434{
1435  SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_single);
1436}
1437double
1438speed_mpn_gcdext_double (struct speed_params *s)
1439{
1440  SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_double);
1441}
1442double
1443speed_mpn_gcdext_one_single (struct speed_params *s)
1444{
1445  SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_single);
1446}
1447double
1448speed_mpn_gcdext_one_double (struct speed_params *s)
1449{
1450  SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_double);
1451}
1452double
1453speed_mpn_gcd_1 (struct speed_params *s)
1454{
1455  SPEED_ROUTINE_MPN_GCD_1 (mpn_gcd_1);
1456}
1457double
1458speed_mpn_gcd_1N (struct speed_params *s)
1459{
1460  SPEED_ROUTINE_MPN_GCD_1N (mpn_gcd_1);
1461}
1462
1463
1464double
1465speed_mpz_jacobi (struct speed_params *s)
1466{
1467  SPEED_ROUTINE_MPZ_JACOBI (mpz_jacobi);
1468}
1469double
1470speed_mpn_jacobi_base (struct speed_params *s)
1471{
1472  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base);
1473}
1474double
1475speed_mpn_jacobi_base_1 (struct speed_params *s)
1476{
1477  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_1);
1478}
1479double
1480speed_mpn_jacobi_base_2 (struct speed_params *s)
1481{
1482  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_2);
1483}
1484double
1485speed_mpn_jacobi_base_3 (struct speed_params *s)
1486{
1487  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_3);
1488}
1489
1490
1491double
1492speed_mpn_sqrtrem (struct speed_params *s)
1493{
1494  SPEED_ROUTINE_MPN_SQRTREM (mpn_sqrtrem);
1495}
1496
1497double
1498speed_mpn_rootrem (struct speed_params *s)
1499{
1500  SPEED_ROUTINE_MPN_ROOTREM (mpn_rootrem);
1501}
1502
1503
1504double
1505speed_mpz_fac_ui (struct speed_params *s)
1506{
1507  SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui);
1508}
1509
1510
1511double
1512speed_mpn_fib2_ui (struct speed_params *s)
1513{
1514  SPEED_ROUTINE_MPN_FIB2_UI (mpn_fib2_ui);
1515}
1516double
1517speed_mpz_fib_ui (struct speed_params *s)
1518{
1519  SPEED_ROUTINE_MPZ_FIB_UI (mpz_fib_ui);
1520}
1521double
1522speed_mpz_fib2_ui (struct speed_params *s)
1523{
1524  SPEED_ROUTINE_MPZ_FIB2_UI (mpz_fib2_ui);
1525}
1526double
1527speed_mpz_lucnum_ui (struct speed_params *s)
1528{
1529  SPEED_ROUTINE_MPZ_LUCNUM_UI (mpz_lucnum_ui);
1530}
1531double
1532speed_mpz_lucnum2_ui (struct speed_params *s)
1533{
1534  SPEED_ROUTINE_MPZ_LUCNUM2_UI (mpz_lucnum2_ui);
1535}
1536
1537
1538double
1539speed_mpz_powm (struct speed_params *s)
1540{
1541  SPEED_ROUTINE_MPZ_POWM (mpz_powm);
1542}
1543double
1544speed_mpz_powm_mod (struct speed_params *s)
1545{
1546  SPEED_ROUTINE_MPZ_POWM (mpz_powm_mod);
1547}
1548double
1549speed_mpz_powm_redc (struct speed_params *s)
1550{
1551  SPEED_ROUTINE_MPZ_POWM (mpz_powm_redc);
1552}
1553double
1554speed_mpz_powm_ui (struct speed_params *s)
1555{
1556  SPEED_ROUTINE_MPZ_POWM_UI (mpz_powm_ui);
1557}
1558
1559
1560double
1561speed_binvert_limb (struct speed_params *s)
1562{
1563  SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb);
1564}
1565
1566
1567double
1568speed_noop (struct speed_params *s)
1569{
1570  unsigned  i;
1571
1572  speed_starttime ();
1573  i = s->reps;
1574  do
1575    noop ();
1576  while (--i != 0);
1577  return speed_endtime ();
1578}
1579
1580double
1581speed_noop_wxs (struct speed_params *s)
1582{
1583  mp_ptr   wp;
1584  unsigned i;
1585  double   t;
1586  TMP_DECL;
1587
1588  TMP_MARK;
1589  wp = TMP_ALLOC_LIMBS (1);
1590
1591  speed_starttime ();
1592  i = s->reps;
1593  do
1594    noop_wxs (wp, s->xp, s->size);
1595  while (--i != 0);
1596  t = speed_endtime ();
1597
1598  TMP_FREE;
1599  return t;
1600}
1601
1602double
1603speed_noop_wxys (struct speed_params *s)
1604{
1605  mp_ptr   wp;
1606  unsigned i;
1607  double   t;
1608  TMP_DECL;
1609
1610  TMP_MARK;
1611  wp = TMP_ALLOC_LIMBS (1);
1612
1613  speed_starttime ();
1614  i = s->reps;
1615  do
1616    noop_wxys (wp, s->xp, s->yp, s->size);
1617  while (--i != 0);
1618  t = speed_endtime ();
1619
1620  TMP_FREE;
1621  return t;
1622}
1623
1624
1625#define SPEED_ROUTINE_ALLOC_FREE(variables, calls)      \
1626  {                                                     \
1627    unsigned  i;                                        \
1628    variables;                                          \
1629							\
1630    speed_starttime ();                                 \
1631    i = s->reps;                                        \
1632    do                                                  \
1633      {                                                 \
1634	calls;                                          \
1635      }                                                 \
1636    while (--i != 0);                                   \
1637    return speed_endtime ();                            \
1638  }
1639
1640
1641/* Compare these to see how much malloc/free costs and then how much
1642   __gmp_default_allocate/free and mpz_init/clear add.  mpz_init/clear or
1643   mpq_init/clear will be doing a 1 limb allocate, so use that as the size
1644   when including them in comparisons.  */
1645
1646double
1647speed_malloc_free (struct speed_params *s)
1648{
1649  size_t  bytes = s->size * BYTES_PER_MP_LIMB;
1650  SPEED_ROUTINE_ALLOC_FREE (void *p,
1651			    p = malloc (bytes);
1652			    free (p));
1653}
1654
1655double
1656speed_malloc_realloc_free (struct speed_params *s)
1657{
1658  size_t  bytes = s->size * BYTES_PER_MP_LIMB;
1659  SPEED_ROUTINE_ALLOC_FREE (void *p,
1660			    p = malloc (BYTES_PER_MP_LIMB);
1661			    p = realloc (p, bytes);
1662			    free (p));
1663}
1664
1665double
1666speed_gmp_allocate_free (struct speed_params *s)
1667{
1668  size_t  bytes = s->size * BYTES_PER_MP_LIMB;
1669  SPEED_ROUTINE_ALLOC_FREE (void *p,
1670			    p = (*__gmp_allocate_func) (bytes);
1671			    (*__gmp_free_func) (p, bytes));
1672}
1673
1674double
1675speed_gmp_allocate_reallocate_free (struct speed_params *s)
1676{
1677  size_t  bytes = s->size * BYTES_PER_MP_LIMB;
1678  SPEED_ROUTINE_ALLOC_FREE
1679    (void *p,
1680     p = (*__gmp_allocate_func) (BYTES_PER_MP_LIMB);
1681     p = (*__gmp_reallocate_func) (p, bytes, BYTES_PER_MP_LIMB);
1682     (*__gmp_free_func) (p, bytes));
1683}
1684
1685double
1686speed_mpz_init_clear (struct speed_params *s)
1687{
1688  SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
1689			    mpz_init (z);
1690			    mpz_clear (z));
1691}
1692
1693double
1694speed_mpz_init_realloc_clear (struct speed_params *s)
1695{
1696  SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
1697			    mpz_init (z);
1698			    _mpz_realloc (z, s->size);
1699			    mpz_clear (z));
1700}
1701
1702double
1703speed_mpq_init_clear (struct speed_params *s)
1704{
1705  SPEED_ROUTINE_ALLOC_FREE (mpq_t q,
1706			    mpq_init (q);
1707			    mpq_clear (q));
1708}
1709
1710double
1711speed_mpf_init_clear (struct speed_params *s)
1712{
1713  SPEED_ROUTINE_ALLOC_FREE (mpf_t f,
1714			    mpf_init (f);
1715			    mpf_clear (f));
1716}
1717
1718
1719/* Compare this to mpn_add_n to see how much overhead mpz_add adds.  Note
1720   that repeatedly calling mpz_add with the same data gives branch prediction
1721   in it an advantage.  */
1722
1723double
1724speed_mpz_add (struct speed_params *s)
1725{
1726  mpz_t     w, x, y;
1727  unsigned  i;
1728  double    t;
1729
1730  mpz_init (w);
1731  mpz_init (x);
1732  mpz_init (y);
1733
1734  mpz_set_n (x, s->xp, s->size);
1735  mpz_set_n (y, s->yp, s->size);
1736  mpz_add (w, x, y);
1737
1738  speed_starttime ();
1739  i = s->reps;
1740  do
1741    {
1742      mpz_add (w, x, y);
1743    }
1744  while (--i != 0);
1745  t = speed_endtime ();
1746
1747  mpz_clear (w);
1748  mpz_clear (x);
1749  mpz_clear (y);
1750  return t;
1751}
1752
1753
1754/* If r==0, calculate (size,size/2),
1755   otherwise calculate (size,r). */
1756
1757double
1758speed_mpz_bin_uiui (struct speed_params *s)
1759{
1760  mpz_t          w;
1761  unsigned long  k;
1762  unsigned  i;
1763  double    t;
1764
1765  mpz_init (w);
1766  if (s->r != 0)
1767    k = s->r;
1768  else
1769    k = s->size/2;
1770
1771  speed_starttime ();
1772  i = s->reps;
1773  do
1774    {
1775      mpz_bin_uiui (w, s->size, k);
1776    }
1777  while (--i != 0);
1778  t = speed_endtime ();
1779
1780  mpz_clear (w);
1781  return t;
1782}
1783
1784
1785/* The multiplies are successively dependent so the latency is measured, not
1786   the issue rate.  There's only 10 per loop so the code doesn't get too big
1787   since umul_ppmm is several instructions on some cpus.
1788
1789   Putting the arguments as "h,l,l,h" gets slightly better code from gcc
1790   2.95.2 on x86, it puts only one mov between each mul, not two.  That mov
1791   though will probably show up as a bogus extra cycle though.
1792
1793   The measuring function macros are into three parts to avoid overflowing
1794   preprocessor expansion space if umul_ppmm is big.
1795
1796   Limitations:
1797
1798   Don't blindly use this to set UMUL_TIME in gmp-mparam.h, check the code
1799   generated first, especially on CPUs with low latency multipliers.
1800
1801   The default umul_ppmm doing h*l will be getting increasing numbers of
1802   high zero bits in the calculation.  CPUs with data-dependent multipliers
1803   will want to use umul_ppmm.1 to get some randomization into the
1804   calculation.  The extra xors and fetches will be a slowdown of course.  */
1805
1806#define SPEED_MACRO_UMUL_PPMM_A \
1807  {                             \
1808    mp_limb_t  h, l;            \
1809    unsigned   i;               \
1810    double     t;               \
1811				\
1812    s->time_divisor = 10;       \
1813				\
1814    h = s->xp[0];               \
1815    l = s->yp[0];               \
1816				\
1817    if (s->r == 1)              \
1818      {                         \
1819	speed_starttime ();     \
1820	i = s->reps;            \
1821	do                      \
1822	  {
1823
1824#define SPEED_MACRO_UMUL_PPMM_B \
1825	  }                     \
1826	while (--i != 0);       \
1827	t = speed_endtime ();   \
1828      }                         \
1829    else                        \
1830      {                         \
1831	speed_starttime ();     \
1832	i = s->reps;            \
1833	do                      \
1834	  {
1835
1836#define SPEED_MACRO_UMUL_PPMM_C                                         \
1837	  }                                                             \
1838	while (--i != 0);                                               \
1839	t = speed_endtime ();                                           \
1840      }                                                                 \
1841									\
1842    /* stop the compiler optimizing away the whole calculation! */      \
1843    noop_1 (h);                                                         \
1844    noop_1 (l);                                                         \
1845									\
1846    return t;                                                           \
1847  }
1848
1849
1850double
1851speed_umul_ppmm (struct speed_params *s)
1852{
1853  SPEED_MACRO_UMUL_PPMM_A;
1854  {
1855    umul_ppmm (h, l, l, h);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
1856     umul_ppmm (h, l, l, h); h ^= s->xp_block[1]; l ^= s->yp_block[1];
1857     umul_ppmm (h, l, l, h); h ^= s->xp_block[2]; l ^= s->yp_block[2];
1858    umul_ppmm (h, l, l, h);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
1859     umul_ppmm (h, l, l, h); h ^= s->xp_block[4]; l ^= s->yp_block[4];
1860     umul_ppmm (h, l, l, h); h ^= s->xp_block[5]; l ^= s->yp_block[5];
1861    umul_ppmm (h, l, l, h);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
1862     umul_ppmm (h, l, l, h); h ^= s->xp_block[7]; l ^= s->yp_block[7];
1863     umul_ppmm (h, l, l, h); h ^= s->xp_block[8]; l ^= s->yp_block[8];
1864    umul_ppmm (h, l, l, h);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
1865  }
1866  SPEED_MACRO_UMUL_PPMM_B;
1867  {
1868    umul_ppmm (h, l, l, h);
1869     umul_ppmm (h, l, l, h);
1870     umul_ppmm (h, l, l, h);
1871    umul_ppmm (h, l, l, h);
1872     umul_ppmm (h, l, l, h);
1873     umul_ppmm (h, l, l, h);
1874    umul_ppmm (h, l, l, h);
1875     umul_ppmm (h, l, l, h);
1876     umul_ppmm (h, l, l, h);
1877    umul_ppmm (h, l, l, h);
1878  }
1879  SPEED_MACRO_UMUL_PPMM_C;
1880}
1881
1882
1883#if HAVE_NATIVE_mpn_umul_ppmm
1884double
1885speed_mpn_umul_ppmm (struct speed_params *s)
1886{
1887  SPEED_MACRO_UMUL_PPMM_A;
1888  {
1889    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
1890     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[1]; l ^= s->yp_block[1];
1891     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[2]; l ^= s->yp_block[2];
1892    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
1893     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[4]; l ^= s->yp_block[4];
1894     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[5]; l ^= s->yp_block[5];
1895    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
1896     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[7]; l ^= s->yp_block[7];
1897     h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[8]; l ^= s->yp_block[8];
1898    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
1899  }
1900  SPEED_MACRO_UMUL_PPMM_B;
1901  {
1902    h = mpn_umul_ppmm (&l, h, l);
1903     h = mpn_umul_ppmm (&l, h, l);
1904     h = mpn_umul_ppmm (&l, h, l);
1905    h = mpn_umul_ppmm (&l, h, l);
1906     h = mpn_umul_ppmm (&l, h, l);
1907     h = mpn_umul_ppmm (&l, h, l);
1908    h = mpn_umul_ppmm (&l, h, l);
1909     h = mpn_umul_ppmm (&l, h, l);
1910     h = mpn_umul_ppmm (&l, h, l);
1911    h = mpn_umul_ppmm (&l, h, l);
1912  }
1913  SPEED_MACRO_UMUL_PPMM_C;
1914}
1915#endif
1916
1917#if HAVE_NATIVE_mpn_umul_ppmm_r
1918double
1919speed_mpn_umul_ppmm_r (struct speed_params *s)
1920{
1921  SPEED_MACRO_UMUL_PPMM_A;
1922  {
1923    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
1924     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[1]; l ^= s->yp_block[1];
1925     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[2]; l ^= s->yp_block[2];
1926    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
1927     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[4]; l ^= s->yp_block[4];
1928     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[5]; l ^= s->yp_block[5];
1929    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
1930     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[7]; l ^= s->yp_block[7];
1931     h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[8]; l ^= s->yp_block[8];
1932    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
1933  }
1934  SPEED_MACRO_UMUL_PPMM_B;
1935  {
1936    h = mpn_umul_ppmm_r (h, l, &l);
1937     h = mpn_umul_ppmm_r (h, l, &l);
1938     h = mpn_umul_ppmm_r (h, l, &l);
1939    h = mpn_umul_ppmm_r (h, l, &l);
1940     h = mpn_umul_ppmm_r (h, l, &l);
1941     h = mpn_umul_ppmm_r (h, l, &l);
1942    h = mpn_umul_ppmm_r (h, l, &l);
1943     h = mpn_umul_ppmm_r (h, l, &l);
1944     h = mpn_umul_ppmm_r (h, l, &l);
1945    h = mpn_umul_ppmm_r (h, l, &l);
1946  }
1947  SPEED_MACRO_UMUL_PPMM_C;
1948}
1949#endif
1950
1951
1952/* The divisions are successively dependent so latency is measured, not
1953   issue rate.  There's only 10 per loop so the code doesn't get too big,
1954   especially for udiv_qrnnd_preinv and preinv2norm, which are several
1955   instructions each.
1956
1957   Note that it's only the division which is measured here, there's no data
1958   fetching and no shifting if the divisor gets normalized.
1959
1960   In speed_udiv_qrnnd with gcc 2.95.2 on x86 the parameters "q,r,r,q,d"
1961   generate x86 div instructions with nothing in between.
1962
1963   The measuring function macros are in two parts to avoid overflowing
1964   preprocessor expansion space if udiv_qrnnd etc are big.
1965
1966   Limitations:
1967
1968   Don't blindly use this to set UDIV_TIME in gmp-mparam.h, check the code
1969   generated first.
1970
1971   CPUs with data-dependent divisions may want more attention paid to the
1972   randomness of the data used.  Probably the measurement wanted is over
1973   uniformly distributed numbers, but what's here might not be giving that.  */
1974
1975#define SPEED_ROUTINE_UDIV_QRNND_A(normalize)           \
1976  {                                                     \
1977    double     t;                                       \
1978    unsigned   i;                                       \
1979    mp_limb_t  q, r, d;                                 \
1980    mp_limb_t  dinv;                                    \
1981							\
1982    s->time_divisor = 10;                               \
1983							\
1984    /* divisor from "r" parameter, or a default */      \
1985    d = s->r;                                           \
1986    if (d == 0)                                         \
1987      d = mp_bases[10].big_base;                        \
1988							\
1989    if (normalize)                                      \
1990      {                                                 \
1991	unsigned  norm;                                 \
1992	count_leading_zeros (norm, d);                  \
1993	d <<= norm;                                     \
1994	invert_limb (dinv, d);                          \
1995      }                                                 \
1996							\
1997    q = s->xp[0];                                       \
1998    r = s->yp[0] % d;                                   \
1999							\
2000    speed_starttime ();                                 \
2001    i = s->reps;                                        \
2002    do                                                  \
2003      {
2004
2005#define SPEED_ROUTINE_UDIV_QRNND_B                                      \
2006      }                                                                 \
2007    while (--i != 0);                                                   \
2008    t = speed_endtime ();                                               \
2009									\
2010    /* stop the compiler optimizing away the whole calculation! */      \
2011    noop_1 (q);                                                         \
2012    noop_1 (r);                                                         \
2013									\
2014    return t;                                                           \
2015  }
2016
2017double
2018speed_udiv_qrnnd (struct speed_params *s)
2019{
2020  SPEED_ROUTINE_UDIV_QRNND_A (UDIV_NEEDS_NORMALIZATION);
2021  {
2022    udiv_qrnnd (q, r, r, q, d);
2023     udiv_qrnnd (q, r, r, q, d);
2024     udiv_qrnnd (q, r, r, q, d);
2025    udiv_qrnnd (q, r, r, q, d);
2026     udiv_qrnnd (q, r, r, q, d);
2027     udiv_qrnnd (q, r, r, q, d);
2028    udiv_qrnnd (q, r, r, q, d);
2029     udiv_qrnnd (q, r, r, q, d);
2030     udiv_qrnnd (q, r, r, q, d);
2031    udiv_qrnnd (q, r, r, q, d);
2032  }
2033  SPEED_ROUTINE_UDIV_QRNND_B;
2034}
2035
2036double
2037speed_udiv_qrnnd_preinv1 (struct speed_params *s)
2038{
2039  SPEED_ROUTINE_UDIV_QRNND_A (1);
2040  {
2041    udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2042     udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2043     udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2044    udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2045     udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2046     udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2047    udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2048     udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2049     udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2050    udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2051  }
2052  SPEED_ROUTINE_UDIV_QRNND_B;
2053}
2054
2055double
2056speed_udiv_qrnnd_preinv2 (struct speed_params *s)
2057{
2058  SPEED_ROUTINE_UDIV_QRNND_A (1);
2059  {
2060    udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2061     udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2062     udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2063    udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2064     udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2065     udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2066    udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2067     udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2068     udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2069    udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2070  }
2071  SPEED_ROUTINE_UDIV_QRNND_B;
2072}
2073
2074double
2075speed_udiv_qrnnd_c (struct speed_params *s)
2076{
2077  SPEED_ROUTINE_UDIV_QRNND_A (1);
2078  {
2079    __udiv_qrnnd_c (q, r, r, q, d);
2080     __udiv_qrnnd_c (q, r, r, q, d);
2081     __udiv_qrnnd_c (q, r, r, q, d);
2082    __udiv_qrnnd_c (q, r, r, q, d);
2083     __udiv_qrnnd_c (q, r, r, q, d);
2084     __udiv_qrnnd_c (q, r, r, q, d);
2085    __udiv_qrnnd_c (q, r, r, q, d);
2086     __udiv_qrnnd_c (q, r, r, q, d);
2087     __udiv_qrnnd_c (q, r, r, q, d);
2088    __udiv_qrnnd_c (q, r, r, q, d);
2089  }
2090  SPEED_ROUTINE_UDIV_QRNND_B;
2091}
2092
2093#if HAVE_NATIVE_mpn_udiv_qrnnd
2094double
2095speed_mpn_udiv_qrnnd (struct speed_params *s)
2096{
2097  SPEED_ROUTINE_UDIV_QRNND_A (1);
2098  {
2099    q = mpn_udiv_qrnnd (&r, r, q, d);
2100     q = mpn_udiv_qrnnd (&r, r, q, d);
2101     q = mpn_udiv_qrnnd (&r, r, q, d);
2102    q = mpn_udiv_qrnnd (&r, r, q, d);
2103     q = mpn_udiv_qrnnd (&r, r, q, d);
2104     q = mpn_udiv_qrnnd (&r, r, q, d);
2105    q = mpn_udiv_qrnnd (&r, r, q, d);
2106     q = mpn_udiv_qrnnd (&r, r, q, d);
2107     q = mpn_udiv_qrnnd (&r, r, q, d);
2108    q = mpn_udiv_qrnnd (&r, r, q, d);
2109  }
2110  SPEED_ROUTINE_UDIV_QRNND_B;
2111}
2112#endif
2113
2114#if HAVE_NATIVE_mpn_udiv_qrnnd_r
2115double
2116speed_mpn_udiv_qrnnd_r (struct speed_params *s)
2117{
2118  SPEED_ROUTINE_UDIV_QRNND_A (1);
2119  {
2120    q = mpn_udiv_qrnnd_r (r, q, d, &r);
2121     q = mpn_udiv_qrnnd_r (r, q, d, &r);
2122     q = mpn_udiv_qrnnd_r (r, q, d, &r);
2123    q = mpn_udiv_qrnnd_r (r, q, d, &r);
2124     q = mpn_udiv_qrnnd_r (r, q, d, &r);
2125     q = mpn_udiv_qrnnd_r (r, q, d, &r);
2126    q = mpn_udiv_qrnnd_r (r, q, d, &r);
2127     q = mpn_udiv_qrnnd_r (r, q, d, &r);
2128     q = mpn_udiv_qrnnd_r (r, q, d, &r);
2129    q = mpn_udiv_qrnnd_r (r, q, d, &r);
2130  }
2131  SPEED_ROUTINE_UDIV_QRNND_B;
2132}
2133#endif
2134
2135
2136double
2137speed_invert_limb (struct speed_params *s)
2138{
2139  SPEED_ROUTINE_INVERT_LIMB_CALL (invert_limb (dinv, d));
2140}
2141
2142
2143/* xp[0] might not be particularly random, but should give an indication how
2144   "/" runs.  Same for speed_operator_mod below.  */
2145double
2146speed_operator_div (struct speed_params *s)
2147{
2148  double     t;
2149  unsigned   i;
2150  mp_limb_t  x, q, d;
2151
2152  s->time_divisor = 10;
2153
2154  /* divisor from "r" parameter, or a default */
2155  d = s->r;
2156  if (d == 0)
2157    d = mp_bases[10].big_base;
2158
2159  x = s->xp[0];
2160  q = 0;
2161
2162  speed_starttime ();
2163  i = s->reps;
2164  do
2165    {
2166      q ^= x; q /= d;
2167       q ^= x; q /= d;
2168       q ^= x; q /= d;
2169      q ^= x; q /= d;
2170       q ^= x; q /= d;
2171       q ^= x; q /= d;
2172      q ^= x; q /= d;
2173       q ^= x; q /= d;
2174       q ^= x; q /= d;
2175      q ^= x; q /= d;
2176    }
2177  while (--i != 0);
2178  t = speed_endtime ();
2179
2180  /* stop the compiler optimizing away the whole calculation! */
2181  noop_1 (q);
2182
2183  return t;
2184}
2185
2186double
2187speed_operator_mod (struct speed_params *s)
2188{
2189  double     t;
2190  unsigned   i;
2191  mp_limb_t  x, r, d;
2192
2193  s->time_divisor = 10;
2194
2195  /* divisor from "r" parameter, or a default */
2196  d = s->r;
2197  if (d == 0)
2198    d = mp_bases[10].big_base;
2199
2200  x = s->xp[0];
2201  r = 0;
2202
2203  speed_starttime ();
2204  i = s->reps;
2205  do
2206    {
2207      r ^= x; r %= d;
2208       r ^= x; r %= d;
2209       r ^= x; r %= d;
2210      r ^= x; r %= d;
2211       r ^= x; r %= d;
2212       r ^= x; r %= d;
2213      r ^= x; r %= d;
2214       r ^= x; r %= d;
2215       r ^= x; r %= d;
2216      r ^= x; r %= d;
2217    }
2218  while (--i != 0);
2219  t = speed_endtime ();
2220
2221  /* stop the compiler optimizing away the whole calculation! */
2222  noop_1 (r);
2223
2224  return t;
2225}
2226
2227
2228/* r==0 measures on data with the values uniformly distributed.  This will
2229   be typical for count_trailing_zeros in a GCD etc.
2230
2231   r==1 measures on data with the resultant count uniformly distributed
2232   between 0 and GMP_LIMB_BITS-1.  This is probably sensible for
2233   count_leading_zeros on the high limbs of divisors.  */
2234
2235int
2236speed_routine_count_zeros_setup (struct speed_params *s,
2237				 mp_ptr xp, int leading, int zero)
2238{
2239  int        i, c;
2240  mp_limb_t  n;
2241
2242  if (s->r == 0)
2243    {
2244      /* Make uniformly distributed data.  If zero isn't allowed then change
2245	 it to 1 for leading, or 0x800..00 for trailing.  */
2246      MPN_COPY (xp, s->xp_block, SPEED_BLOCK_SIZE);
2247      if (! zero)
2248	for (i = 0; i < SPEED_BLOCK_SIZE; i++)
2249	  if (xp[i] == 0)
2250	    xp[i] = leading ? 1 : GMP_LIMB_HIGHBIT;
2251    }
2252  else if (s->r == 1)
2253    {
2254      /* Make counts uniformly distributed.  A randomly chosen bit is set, and
2255	 for leading the rest above it are cleared, or for trailing then the
2256	 rest below.  */
2257      for (i = 0; i < SPEED_BLOCK_SIZE; i++)
2258	{
2259	  mp_limb_t  set = CNST_LIMB(1) << (s->yp_block[i] % GMP_LIMB_BITS);
2260	  mp_limb_t  keep_below = set-1;
2261	  mp_limb_t  keep_above = MP_LIMB_T_MAX ^ keep_below;
2262	  mp_limb_t  keep = (leading ? keep_below : keep_above);
2263	  xp[i] = (s->xp_block[i] & keep) | set;
2264	}
2265    }
2266  else
2267    {
2268      return 0;
2269    }
2270
2271  /* Account for the effect of n^=c. */
2272  c = 0;
2273  for (i = 0; i < SPEED_BLOCK_SIZE; i++)
2274    {
2275      n = xp[i];
2276      xp[i] ^= c;
2277
2278      if (leading)
2279	count_leading_zeros (c, n);
2280      else
2281	count_trailing_zeros (c, n);
2282    }
2283
2284  return 1;
2285}
2286
2287double
2288speed_count_leading_zeros (struct speed_params *s)
2289{
2290#ifdef COUNT_LEADING_ZEROS_0
2291#define COUNT_LEADING_ZEROS_0_ALLOWED   1
2292#else
2293#define COUNT_LEADING_ZEROS_0_ALLOWED   0
2294#endif
2295
2296  SPEED_ROUTINE_COUNT_ZEROS_A (1, COUNT_LEADING_ZEROS_0_ALLOWED);
2297  count_leading_zeros (c, n);
2298  SPEED_ROUTINE_COUNT_ZEROS_B ();
2299}
2300double
2301speed_count_trailing_zeros (struct speed_params *s)
2302{
2303  SPEED_ROUTINE_COUNT_ZEROS_A (0, 0);
2304  count_trailing_zeros (c, n);
2305  SPEED_ROUTINE_COUNT_ZEROS_B ();
2306}
2307
2308
2309double
2310speed_mpn_get_str (struct speed_params *s)
2311{
2312  SPEED_ROUTINE_MPN_GET_STR (mpn_get_str);
2313}
2314
2315double
2316speed_mpn_set_str (struct speed_params *s)
2317{
2318  SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_set_str (wp, xp, s->size, base));
2319}
2320double
2321speed_mpn_bc_set_str (struct speed_params *s)
2322{
2323  SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_bc_set_str (wp, xp, s->size, base));
2324}
2325
2326double
2327speed_MPN_ZERO (struct speed_params *s)
2328{
2329  SPEED_ROUTINE_MPN_ZERO_CALL (MPN_ZERO (wp, s->size));
2330}
2331
2332
2333int
2334speed_randinit (struct speed_params *s, gmp_randstate_ptr rstate)
2335{
2336  if (s->r == 0)
2337    gmp_randinit_default (rstate);
2338  else if (s->r == 1)
2339    gmp_randinit_mt (rstate);
2340  else
2341    {
2342      return gmp_randinit_lc_2exp_size (rstate, s->r);
2343    }
2344  return 1;
2345}
2346
2347double
2348speed_gmp_randseed (struct speed_params *s)
2349{
2350  gmp_randstate_t  rstate;
2351  unsigned  i;
2352  double    t;
2353  mpz_t     x;
2354
2355  SPEED_RESTRICT_COND (s->size >= 1);
2356  SPEED_RESTRICT_COND (speed_randinit (s, rstate));
2357
2358  /* s->size bits of seed */
2359  mpz_init_set_n (x, s->xp, s->size);
2360  mpz_fdiv_r_2exp (x, x, (unsigned long) s->size);
2361
2362  /* cache priming */
2363  gmp_randseed (rstate, x);
2364
2365  speed_starttime ();
2366  i = s->reps;
2367  do
2368    gmp_randseed (rstate, x);
2369  while (--i != 0);
2370  t = speed_endtime ();
2371
2372  gmp_randclear (rstate);
2373  mpz_clear (x);
2374  return t;
2375}
2376
2377double
2378speed_gmp_randseed_ui (struct speed_params *s)
2379{
2380  gmp_randstate_t  rstate;
2381  unsigned  i, j;
2382  double    t;
2383
2384  SPEED_RESTRICT_COND (speed_randinit (s, rstate));
2385
2386  /* cache priming */
2387  gmp_randseed_ui (rstate, 123L);
2388
2389  speed_starttime ();
2390  i = s->reps;
2391  j = 0;
2392  do
2393    {
2394      gmp_randseed_ui (rstate, (unsigned long) s->xp_block[j]);
2395      j++;
2396      if (j >= SPEED_BLOCK_SIZE)
2397	j = 0;
2398    }
2399  while (--i != 0);
2400  t = speed_endtime ();
2401
2402  gmp_randclear (rstate);
2403  return t;
2404}
2405
2406double
2407speed_mpz_urandomb (struct speed_params *s)
2408{
2409  gmp_randstate_t  rstate;
2410  mpz_t     z;
2411  unsigned  i;
2412  double    t;
2413
2414  SPEED_RESTRICT_COND (s->size >= 0);
2415  SPEED_RESTRICT_COND (speed_randinit (s, rstate));
2416
2417  mpz_init (z);
2418
2419  /* cache priming */
2420  mpz_urandomb (z, rstate, (unsigned long) s->size);
2421  mpz_urandomb (z, rstate, (unsigned long) s->size);
2422
2423  speed_starttime ();
2424  i = s->reps;
2425  do
2426    mpz_urandomb (z, rstate, (unsigned long) s->size);
2427  while (--i != 0);
2428  t = speed_endtime ();
2429
2430  mpz_clear (z);
2431  gmp_randclear (rstate);
2432  return t;
2433}
2434