1/* Speed measuring program.
2
3Copyright 1999-2003, 2005, 2006, 2008-2019 Free Software Foundation, Inc.
4
5This file is part of the GNU MP Library.
6
7The GNU MP Library is free software; you can redistribute it and/or modify
8it under the terms of either:
9
10  * the GNU Lesser General Public License as published by the Free
11    Software Foundation; either version 3 of the License, or (at your
12    option) any later version.
13
14or
15
16  * the GNU General Public License as published by the Free Software
17    Foundation; either version 2 of the License, or (at your option) any
18    later version.
19
20or both in parallel, as here.
21
22The GNU MP Library is distributed in the hope that it will be useful, but
23WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25for more details.
26
27You should have received copies of the GNU General Public License and the
28GNU Lesser General Public License along with the GNU MP Library.  If not,
29see https://www.gnu.org/licenses/.  */
30
31/* Usage message is in the code below, run with no arguments to print it.
32   See README for interesting applications.
33
34   To add a new routine foo(), create a speed_foo() function in the style of
35   the existing ones and add an entry in the routine[] array.  Put FLAG_R if
36   speed_foo() wants an "r" parameter.
37
38   The routines don't have help messages or descriptions, but most have
39   suggestive names.  See the source code for full details. */
40
41#include "config.h"
42
43#include <limits.h>
44#include <stdio.h>
45#include <stdlib.h>
46#include <string.h>
47
48#if HAVE_UNISTD_H
49#include <unistd.h>  /* for getpid, R_OK */
50#endif
51
52#if TIME_WITH_SYS_TIME
53# include <sys/time.h>  /* for struct timeval */
54# include <time.h>
55#else
56# if HAVE_SYS_TIME_H
57#  include <sys/time.h>
58# else
59#  include <time.h>
60# endif
61#endif
62
63#if HAVE_SYS_RESOURCE_H
64#include <sys/resource.h>  /* for getrusage() */
65#endif
66
67
68#include "gmp-impl.h"
69#include "longlong.h"  /* for the benefit of speed-many.c */
70#include "tests.h"
71#include "speed.h"
72
73
74#if !HAVE_DECL_OPTARG
75extern char *optarg;
76extern int optind, opterr;
77#endif
78
79#if !HAVE_STRTOUL
80#define strtoul(p,e,b)  (unsigned long) strtol(p,e,b)
81#endif
82
83#ifdef SPEED_EXTRA_PROTOS
84SPEED_EXTRA_PROTOS
85#endif
86#ifdef SPEED_EXTRA_PROTOS2
87SPEED_EXTRA_PROTOS2
88#endif
89
90
/* A limb with the repeating bit pattern 1010...1010, masked with
   GMP_NUMB_MASK.  data_fill uses it as the fill value for DATA_AAS.  It is
   only defined when GMP_LIMB_BITS is 32 or 64.  */
#if GMP_LIMB_BITS == 32
#define GMP_NUMB_0xAA  (CNST_LIMB(0xAAAAAAAA) & GMP_NUMB_MASK)
#endif
#if GMP_LIMB_BITS == 64
#define GMP_NUMB_0xAA  (CNST_LIMB(0xAAAAAAAAAAAAAAAA) & GMP_NUMB_MASK)
#endif
97
98
/* How the columns are reported by run_one:
     CMP_ABSOLUTE    each column is the measured time itself
     CMP_RATIO       columns after the first are divided by the first
     CMP_DIFFERENCE  columns after the first have the first subtracted
     CMP_DIFFPREV    each column has its own time for the previous size
                     subtracted  */
#define CMP_ABSOLUTE     1
#define CMP_RATIO        2
#define CMP_DIFFERENCE   3
#define CMP_DIFFPREV     4
int  option_cmp = CMP_ABSOLUTE;

/* Units for the reported times.  run_one divides by speed_cycletime for
   UNIT_CYCLES, and additionally by SIZE_TO_DIVISOR of the size for
   UNIT_CYCLESPERLIMB.  */
#define UNIT_SECONDS        1
#define UNIT_CYCLES         2
#define UNIT_CYCLESPERLIMB  3
int  option_unit = UNIT_SECONDS;

/* Operand data generated by data_fill:
     DATA_RANDOM   mpn_random
     DATA_RANDOM2  mpn_random2
     DATA_ZEROS    all limbs zero
     DATA_AAS      all limbs GMP_NUMB_0xAA
     DATA_FFS      all limbs GMP_NUMB_MAX
     DATA_2FD      all limbs GMP_NUMB_MAX, then 2 subtracted from the low
                   limb  */
#define DATA_RANDOM   1
#define DATA_RANDOM2  2
#define DATA_ZEROS    3
#define DATA_AAS      4
#define DATA_FFS      5
#define DATA_2FD      6
int  option_data = DATA_RANDOM;

/* Selects the divisor used by SIZE_TO_DIVISOR: 1 for n*n, 2 for n*(n+1)/2,
   anything else for n.  */
int        option_square = 0;
/* When nonzero, run_all advances the size by size*option_factor - size
   (truncated, and at least 1) instead of by 1.  */
double     option_factor = 0.0;
/* Smallest step run_all takes between sizes.  */
mp_size_t  option_step = 1;
/* Nonzero makes run_one write plain numeric rows for gnuplot instead of the
   free-form table; option_gnuplot_basename is the stem run_gnuplot uses for
   its ".gnuplot" and ".data" file names.  */
int        option_gnuplot = 0;
char      *option_gnuplot_basename;
/* The size ranges run_all walks through, each from start to end inclusive.
   size_num is the number of ranges in use.  size_allocnum is presumably the
   allocated capacity of size_array -- TODO confirm where the array is
   grown.  */
struct size_array_t {
  mp_size_t start, end;
} *size_array = NULL;
mp_size_t  size_num = 0;
mp_size_t  size_allocnum = 0;
int        option_resource_usage = 0;
long       option_seed = 123456789;

/* The measurement parameters run_all hands to run_one.  */
struct speed_params  sp;

#define COLUMN_WIDTH  13  /* for the free-form output */
134
/* Bits for the flag field of routine[] entries.  */
#define FLAG_R            (1<<0)  /* require ".r" */
#define FLAG_R_OPTIONAL   (1<<1)  /* optional ".r" */
#define FLAG_RSIZE        (1<<2)
#define FLAG_NODATA       (1<<3)  /* don't alloc xp, yp */

/* Table of the routines that can be measured.  An entry that gives no third
   initializer gets flag 0.  Entries inside #if are only present when the
   condition holds for this build.  SPEED_EXTRA_ROUTINES and
   SPEED_EXTRA_ROUTINES2, when defined, are expanded at the end of the
   table.  */
const struct routine_t {
  /* constants */
  const char        *name;  /* name of the routine */
  speed_function_t  fun;    /* measuring function, passed to speed_measure */
  int               flag;   /* FLAG_* bits ORed together, or 0 */
} routine[] = {

  { "noop",              speed_noop                 },
  { "noop_wxs",          speed_noop_wxs             },
  { "noop_wxys",         speed_noop_wxys            },

  { "mpn_add_n",         speed_mpn_add_n,     FLAG_R_OPTIONAL },
  { "mpn_sub_n",         speed_mpn_sub_n,     FLAG_R_OPTIONAL },
  { "mpn_add_1",         speed_mpn_add_1,     FLAG_R },
  { "mpn_add_1_inplace", speed_mpn_add_1_inplace, FLAG_R },
  { "mpn_sub_1",         speed_mpn_sub_1,     FLAG_R },
  { "mpn_sub_1_inplace", speed_mpn_sub_1_inplace, FLAG_R },

  { "mpn_add_err1_n",    speed_mpn_add_err1_n    },
  { "mpn_add_err2_n",    speed_mpn_add_err2_n    },
  { "mpn_add_err3_n",    speed_mpn_add_err3_n    },
  { "mpn_sub_err1_n",    speed_mpn_sub_err1_n    },
  { "mpn_sub_err2_n",    speed_mpn_sub_err2_n    },
  { "mpn_sub_err3_n",    speed_mpn_sub_err3_n    },

#if HAVE_NATIVE_mpn_add_n_sub_n
  { "mpn_add_n_sub_n",      speed_mpn_add_n_sub_n,     FLAG_R_OPTIONAL },
#endif

  { "mpn_addmul_1",      speed_mpn_addmul_1,  FLAG_R },
  { "mpn_submul_1",      speed_mpn_submul_1,  FLAG_R },
#if HAVE_NATIVE_mpn_addmul_2
  { "mpn_addmul_2",      speed_mpn_addmul_2,  FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_3
  { "mpn_addmul_3",      speed_mpn_addmul_3,  FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_4
  { "mpn_addmul_4",      speed_mpn_addmul_4,  FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_5
  { "mpn_addmul_5",      speed_mpn_addmul_5,  FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_6
  { "mpn_addmul_6",      speed_mpn_addmul_6,  FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_7
  { "mpn_addmul_7",      speed_mpn_addmul_7,  FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_8
  { "mpn_addmul_8",      speed_mpn_addmul_8,  FLAG_R_OPTIONAL },
#endif
  { "mpn_mul_1",         speed_mpn_mul_1,     FLAG_R },
  { "mpn_mul_1_inplace", speed_mpn_mul_1_inplace, FLAG_R },
#if HAVE_NATIVE_mpn_mul_2
  { "mpn_mul_2",         speed_mpn_mul_2,     FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_mul_3
  { "mpn_mul_3",         speed_mpn_mul_3,     FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_mul_4
  { "mpn_mul_4",         speed_mpn_mul_4,     FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_mul_5
  { "mpn_mul_5",         speed_mpn_mul_5,     FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_mul_6
  { "mpn_mul_6",         speed_mpn_mul_6,     FLAG_R_OPTIONAL },
#endif

  { "mpn_divrem_1",      speed_mpn_divrem_1,  FLAG_R },
  { "mpn_divrem_1f",     speed_mpn_divrem_1f, FLAG_R },
#if HAVE_NATIVE_mpn_divrem_1c
  { "mpn_divrem_1c",     speed_mpn_divrem_1c, FLAG_R },
  { "mpn_divrem_1cf",    speed_mpn_divrem_1cf,FLAG_R },
#endif
  { "mpn_mod_1",         speed_mpn_mod_1,     FLAG_R },
#if HAVE_NATIVE_mpn_mod_1c
  { "mpn_mod_1c",        speed_mpn_mod_1c,    FLAG_R },
#endif
  { "mpn_preinv_divrem_1",  speed_mpn_preinv_divrem_1,  FLAG_R },
  { "mpn_preinv_divrem_1f", speed_mpn_preinv_divrem_1f, FLAG_R },
  { "mpn_preinv_mod_1",  speed_mpn_preinv_mod_1, FLAG_R },

  /* NOTE(review): the "mpn_mod_1s_N" names below map to functions named
     speed_mpn_mod_1_N, without the "s".  */
  { "mpn_mod_1_1",       speed_mpn_mod_1_1,       FLAG_R },
  { "mpn_mod_1_1_1",     speed_mpn_mod_1_1_1,     FLAG_R },
  { "mpn_mod_1_1_2",     speed_mpn_mod_1_1_2,     FLAG_R },
  { "mpn_mod_1s_2",      speed_mpn_mod_1_2,       FLAG_R },
  { "mpn_mod_1s_3",      speed_mpn_mod_1_3,       FLAG_R },
  { "mpn_mod_1s_4",      speed_mpn_mod_1_4,       FLAG_R },

  { "mpn_divrem_1_div",  speed_mpn_divrem_1_div,  FLAG_R },
  { "mpn_divrem_1_inv",  speed_mpn_divrem_1_inv,  FLAG_R },
  { "mpn_divrem_1f_div", speed_mpn_divrem_1f_div, FLAG_R },
  { "mpn_divrem_1f_inv", speed_mpn_divrem_1f_inv, FLAG_R },
  { "mpn_mod_1_div",     speed_mpn_mod_1_div,     FLAG_R },
  { "mpn_mod_1_inv",     speed_mpn_mod_1_inv,     FLAG_R },

  { "mpn_divrem_2",      speed_mpn_divrem_2,        },
  { "mpn_divrem_2_div",  speed_mpn_divrem_2_div,    },
  { "mpn_divrem_2_inv",  speed_mpn_divrem_2_inv,    },

  { "mpn_div_qr_1n_pi1", speed_mpn_div_qr_1n_pi1, FLAG_R  },
  { "mpn_div_qr_1n_pi1_1",speed_mpn_div_qr_1n_pi1_1, FLAG_R  },
  { "mpn_div_qr_1n_pi1_2",speed_mpn_div_qr_1n_pi1_2, FLAG_R  },
  { "mpn_div_qr_1",      speed_mpn_div_qr_1,      FLAG_R },

  { "mpn_div_qr_2n",     speed_mpn_div_qr_2n,       },
  { "mpn_div_qr_2u",     speed_mpn_div_qr_2u,       },

  { "mpn_divexact_1",    speed_mpn_divexact_1,    FLAG_R },
  { "mpn_divexact_by3",  speed_mpn_divexact_by3          },

  { "mpn_bdiv_q_1",      speed_mpn_bdiv_q_1,      FLAG_R },
  { "mpn_pi1_bdiv_q_1",  speed_mpn_pi1_bdiv_q_1,  FLAG_R_OPTIONAL },
  { "mpn_bdiv_dbm1c",    speed_mpn_bdiv_dbm1c,    FLAG_R_OPTIONAL },

#if HAVE_NATIVE_mpn_modexact_1_odd
  { "mpn_modexact_1_odd",  speed_mpn_modexact_1_odd,  FLAG_R },
#endif
  { "mpn_modexact_1c_odd", speed_mpn_modexact_1c_odd, FLAG_R },

#if GMP_NUMB_BITS % 4 == 0
  { "mpn_mod_34lsub1",   speed_mpn_mod_34lsub1 },
#endif

  { "mpn_lshift",        speed_mpn_lshift, FLAG_R   },
  { "mpn_lshiftc",       speed_mpn_lshiftc, FLAG_R   },
  { "mpn_rshift",        speed_mpn_rshift, FLAG_R   },

  { "mpn_and_n",         speed_mpn_and_n,  FLAG_R_OPTIONAL },
  { "mpn_andn_n",        speed_mpn_andn_n, FLAG_R_OPTIONAL },
  { "mpn_nand_n",        speed_mpn_nand_n, FLAG_R_OPTIONAL },
  { "mpn_ior_n",         speed_mpn_ior_n,  FLAG_R_OPTIONAL },
  { "mpn_iorn_n",        speed_mpn_iorn_n, FLAG_R_OPTIONAL },
  { "mpn_nior_n",        speed_mpn_nior_n, FLAG_R_OPTIONAL },
  { "mpn_xor_n",         speed_mpn_xor_n,  FLAG_R_OPTIONAL },
  { "mpn_xnor_n",        speed_mpn_xnor_n, FLAG_R_OPTIONAL },
  { "mpn_com",           speed_mpn_com              },
  { "mpn_neg",           speed_mpn_neg              },

  { "mpn_popcount",      speed_mpn_popcount         },
  { "mpn_hamdist",       speed_mpn_hamdist          },

  { "mpn_matrix22_mul",  speed_mpn_matrix22_mul     },

  { "mpn_hgcd2",         speed_mpn_hgcd2, FLAG_NODATA },
  { "mpn_hgcd2_1",       speed_mpn_hgcd2_1, FLAG_NODATA },
  { "mpn_hgcd2_2",       speed_mpn_hgcd2_2, FLAG_NODATA },
  { "mpn_hgcd2_3",       speed_mpn_hgcd2_3, FLAG_NODATA },
  { "mpn_hgcd2_4",       speed_mpn_hgcd2_4, FLAG_NODATA },
  { "mpn_hgcd2_5",       speed_mpn_hgcd2_5, FLAG_NODATA },
  { "mpn_hgcd",          speed_mpn_hgcd             },
  { "mpn_hgcd_lehmer",   speed_mpn_hgcd_lehmer      },
  { "mpn_hgcd_appr",     speed_mpn_hgcd_appr        },
  { "mpn_hgcd_appr_lehmer", speed_mpn_hgcd_appr_lehmer },

  { "mpn_hgcd_reduce",   speed_mpn_hgcd_reduce      },
  { "mpn_hgcd_reduce_1", speed_mpn_hgcd_reduce_1    },
  { "mpn_hgcd_reduce_2", speed_mpn_hgcd_reduce_2    },

  { "mpn_gcd_1",         speed_mpn_gcd_1,  FLAG_R_OPTIONAL },
  { "mpn_gcd_11",        speed_mpn_gcd_11, FLAG_R_OPTIONAL },
  { "mpn_gcd_1N",        speed_mpn_gcd_1N, FLAG_R_OPTIONAL },
  { "mpn_gcd_22",        speed_mpn_gcd_22, FLAG_R_OPTIONAL },

  { "mpn_gcd",           speed_mpn_gcd                    },

  { "mpn_gcdext",            speed_mpn_gcdext            },
  { "mpn_gcdext_single",     speed_mpn_gcdext_single     },
  { "mpn_gcdext_double",     speed_mpn_gcdext_double     },
  { "mpn_gcdext_one_single", speed_mpn_gcdext_one_single },
  { "mpn_gcdext_one_double", speed_mpn_gcdext_one_double },
#if 0
  { "mpn_gcdext_lehmer",     speed_mpn_gcdext_lehmer     },
#endif

  { "mpz_nextprime",     speed_mpz_nextprime        },

  { "mpz_jacobi",        speed_mpz_jacobi           },
  { "mpn_jacobi_base",   speed_mpn_jacobi_base      },
  { "mpn_jacobi_base_1", speed_mpn_jacobi_base_1    },
  { "mpn_jacobi_base_2", speed_mpn_jacobi_base_2    },
  { "mpn_jacobi_base_3", speed_mpn_jacobi_base_3    },
  { "mpn_jacobi_base_4", speed_mpn_jacobi_base_4    },

  { "mpn_mul",           speed_mpn_mul,         FLAG_R_OPTIONAL },
  { "mpn_mul_basecase",  speed_mpn_mul_basecase,FLAG_R_OPTIONAL },
  { "mpn_sqr_basecase",  speed_mpn_sqr_basecase     },
#if HAVE_NATIVE_mpn_sqr_diagonal
  { "mpn_sqr_diagonal",  speed_mpn_sqr_diagonal     },
#endif
#if HAVE_NATIVE_mpn_sqr_diag_addlsh1
  { "mpn_sqr_diag_addlsh1", speed_mpn_sqr_diag_addlsh1 },
#endif

  { "mpn_mul_n",         speed_mpn_mul_n            },
  { "mpn_sqr",           speed_mpn_sqr              },

  { "mpn_toom2_sqr",     speed_mpn_toom2_sqr        },
  { "mpn_toom3_sqr",     speed_mpn_toom3_sqr        },
  { "mpn_toom4_sqr",     speed_mpn_toom4_sqr        },
  { "mpn_toom6_sqr",     speed_mpn_toom6_sqr        },
  { "mpn_toom8_sqr",     speed_mpn_toom8_sqr        },
  { "mpn_toom22_mul",    speed_mpn_toom22_mul       },
  { "mpn_toom33_mul",    speed_mpn_toom33_mul       },
  { "mpn_toom44_mul",    speed_mpn_toom44_mul       },
  { "mpn_toom6h_mul",    speed_mpn_toom6h_mul       },
  { "mpn_toom8h_mul",    speed_mpn_toom8h_mul       },
  { "mpn_toom32_mul",    speed_mpn_toom32_mul       },
  { "mpn_toom42_mul",    speed_mpn_toom42_mul       },
  { "mpn_toom43_mul",    speed_mpn_toom43_mul       },
  { "mpn_toom63_mul",    speed_mpn_toom63_mul       },
  { "mpn_nussbaumer_mul",    speed_mpn_nussbaumer_mul    },
  { "mpn_nussbaumer_mul_sqr",speed_mpn_nussbaumer_mul_sqr},
#if WANT_OLD_FFT_FULL
  { "mpn_mul_fft_full",      speed_mpn_mul_fft_full      },
  { "mpn_mul_fft_full_sqr",  speed_mpn_mul_fft_full_sqr  },
#endif
  { "mpn_mul_fft",       speed_mpn_mul_fft,     FLAG_R_OPTIONAL },
  { "mpn_mul_fft_sqr",   speed_mpn_mul_fft_sqr, FLAG_R_OPTIONAL },

  { "mpn_sqrlo",          speed_mpn_sqrlo           },
  { "mpn_sqrlo_basecase", speed_mpn_sqrlo_basecase  },
  { "mpn_mullo_n",        speed_mpn_mullo_n         },
  { "mpn_mullo_basecase", speed_mpn_mullo_basecase  },

  { "mpn_mulmid_basecase",  speed_mpn_mulmid_basecase, FLAG_R_OPTIONAL },
  { "mpn_toom42_mulmid",    speed_mpn_toom42_mulmid },
  { "mpn_mulmid_n",         speed_mpn_mulmid_n },
  { "mpn_mulmid",           speed_mpn_mulmid, FLAG_R_OPTIONAL },

  { "mpn_bc_mulmod_bnm1",      speed_mpn_bc_mulmod_bnm1      },
  { "mpn_mulmod_bnm1",         speed_mpn_mulmod_bnm1         },
  { "mpn_mulmod_bnm1_rounded", speed_mpn_mulmod_bnm1_rounded },
  { "mpn_sqrmod_bnm1",         speed_mpn_sqrmod_bnm1         },

  { "mpn_invert",              speed_mpn_invert              },
  { "mpn_invertappr",          speed_mpn_invertappr          },
  { "mpn_ni_invertappr",       speed_mpn_ni_invertappr       },
  { "mpn_binvert",             speed_mpn_binvert             },
  { "mpn_sec_invert",          speed_mpn_sec_invert          },

  { "mpn_sbpi1_div_qr",        speed_mpn_sbpi1_div_qr,    FLAG_R_OPTIONAL},
  { "mpn_dcpi1_div_qr",        speed_mpn_dcpi1_div_qr,    FLAG_R_OPTIONAL},
  { "mpn_mu_div_qr",           speed_mpn_mu_div_qr,       FLAG_R_OPTIONAL},
  { "mpn_mupi_div_qr",         speed_mpn_mupi_div_qr,     FLAG_R_OPTIONAL},
  { "mpn_sbpi1_divappr_q",     speed_mpn_sbpi1_divappr_q, FLAG_R_OPTIONAL},
  { "mpn_dcpi1_divappr_q",     speed_mpn_dcpi1_divappr_q, FLAG_R_OPTIONAL},

  { "mpn_sbpi1_bdiv_qr",       speed_mpn_sbpi1_bdiv_qr       },
  { "mpn_dcpi1_bdiv_qr",       speed_mpn_dcpi1_bdiv_qr       },
  { "mpn_sbpi1_bdiv_q",        speed_mpn_sbpi1_bdiv_q        },
  { "mpn_dcpi1_bdiv_q",        speed_mpn_dcpi1_bdiv_q        },
  { "mpn_sbpi1_bdiv_r",        speed_mpn_sbpi1_bdiv_r        },

  { "mpn_broot",               speed_mpn_broot,    FLAG_R },
  { "mpn_broot_invm1",         speed_mpn_broot_invm1, FLAG_R },
  { "mpn_brootinv",            speed_mpn_brootinv, FLAG_R },

  /* NOTE(review): "mpn_set_str_basecase" maps to speed_mpn_bc_set_str.  */
  { "mpn_get_str",          speed_mpn_get_str,     FLAG_R_OPTIONAL },
  { "mpn_set_str",          speed_mpn_set_str,     FLAG_R_OPTIONAL },
  { "mpn_set_str_basecase", speed_mpn_bc_set_str,  FLAG_R_OPTIONAL },

  { "mpn_sqrtrem",       speed_mpn_sqrtrem          },
  { "mpn_rootrem",       speed_mpn_rootrem, FLAG_R  },
  { "mpn_sqrt",          speed_mpn_sqrt             },
  { "mpn_root",          speed_mpn_root, FLAG_R     },

  { "mpn_perfect_power_p",  speed_mpn_perfect_power_p,       },
  { "mpn_perfect_square_p", speed_mpn_perfect_square_p,      },

  { "mpn_fib2_ui",       speed_mpn_fib2_ui,    FLAG_NODATA },
  { "mpz_fib_ui",        speed_mpz_fib_ui,     FLAG_NODATA },
  { "mpz_fib2_ui",       speed_mpz_fib2_ui,    FLAG_NODATA },
  { "mpz_lucnum_ui",     speed_mpz_lucnum_ui,  FLAG_NODATA },
  { "mpz_lucnum2_ui",    speed_mpz_lucnum2_ui, FLAG_NODATA },

  { "mpz_add",           speed_mpz_add              },
  { "mpz_invert",        speed_mpz_invert,   FLAG_R_OPTIONAL },
  { "mpz_bin_uiui",      speed_mpz_bin_uiui, FLAG_NODATA | FLAG_R_OPTIONAL },
  { "mpz_bin_ui",        speed_mpz_bin_ui,   FLAG_NODATA | FLAG_R_OPTIONAL },
  { "mpz_fac_ui",        speed_mpz_fac_ui,   FLAG_NODATA   },
  { "mpz_2fac_ui",       speed_mpz_2fac_ui,  FLAG_NODATA   },
  { "mpz_mfac_uiui",     speed_mpz_mfac_uiui,  FLAG_NODATA | FLAG_R_OPTIONAL },
  { "mpz_primorial_ui",  speed_mpz_primorial_ui, FLAG_NODATA },
  { "mpz_powm",          speed_mpz_powm,     FLAG_R_OPTIONAL },
  { "mpz_powm_mod",      speed_mpz_powm_mod         },
  { "mpz_powm_redc",     speed_mpz_powm_redc        },
  { "mpz_powm_sec",      speed_mpz_powm_sec        },
  { "mpz_powm_ui",       speed_mpz_powm_ui,  FLAG_R_OPTIONAL },

  { "mpz_mod",           speed_mpz_mod              },
  { "mpn_redc_1",        speed_mpn_redc_1           },
  { "mpn_redc_2",        speed_mpn_redc_2           },
  { "mpn_redc_n",        speed_mpn_redc_n           },

  { "MPN_COPY",          speed_MPN_COPY             },
  { "MPN_COPY_INCR",     speed_MPN_COPY_INCR        },
  { "MPN_COPY_DECR",     speed_MPN_COPY_DECR        },
  { "memcpy",            speed_memcpy               },
#if HAVE_NATIVE_mpn_copyi
  { "mpn_copyi",         speed_mpn_copyi            },
#endif
#if HAVE_NATIVE_mpn_copyd
  { "mpn_copyd",         speed_mpn_copyd            },
#endif
  { "mpn_sec_tabselect", speed_mpn_sec_tabselect, FLAG_R_OPTIONAL },
#if HAVE_NATIVE_mpn_addlsh1_n == 1
  { "mpn_addlsh1_n",     speed_mpn_addlsh1_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_sublsh1_n == 1
  { "mpn_sublsh1_n",     speed_mpn_sublsh1_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
  { "mpn_addlsh1_n_ip1", speed_mpn_addlsh1_n_ip1    },
#endif
#if HAVE_NATIVE_mpn_addlsh1_n_ip2
  { "mpn_addlsh1_n_ip2", speed_mpn_addlsh1_n_ip2    },
#endif
#if HAVE_NATIVE_mpn_sublsh1_n_ip1
  { "mpn_sublsh1_n_ip1", speed_mpn_sublsh1_n_ip1    },
#endif
#if HAVE_NATIVE_mpn_rsblsh1_n == 1
  { "mpn_rsblsh1_n",     speed_mpn_rsblsh1_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh2_n == 1
  { "mpn_addlsh2_n",     speed_mpn_addlsh2_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_sublsh2_n == 1
  { "mpn_sublsh2_n",     speed_mpn_sublsh2_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh2_n_ip1
  { "mpn_addlsh2_n_ip1", speed_mpn_addlsh2_n_ip1    },
#endif
#if HAVE_NATIVE_mpn_addlsh2_n_ip2
  { "mpn_addlsh2_n_ip2", speed_mpn_addlsh2_n_ip2    },
#endif
#if HAVE_NATIVE_mpn_sublsh2_n_ip1
  { "mpn_sublsh2_n_ip1", speed_mpn_sublsh2_n_ip1    },
#endif
#if HAVE_NATIVE_mpn_rsblsh2_n == 1
  { "mpn_rsblsh2_n",     speed_mpn_rsblsh2_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh_n
  { "mpn_addlsh_n",     speed_mpn_addlsh_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_sublsh_n
  { "mpn_sublsh_n",     speed_mpn_sublsh_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh_n_ip1
  { "mpn_addlsh_n_ip1", speed_mpn_addlsh_n_ip1    },
#endif
#if HAVE_NATIVE_mpn_addlsh_n_ip2
  { "mpn_addlsh_n_ip2", speed_mpn_addlsh_n_ip2    },
#endif
#if HAVE_NATIVE_mpn_sublsh_n_ip1
  { "mpn_sublsh_n_ip1", speed_mpn_sublsh_n_ip1    },
#endif
#if HAVE_NATIVE_mpn_rsblsh_n
  { "mpn_rsblsh_n",     speed_mpn_rsblsh_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_rsh1add_n
  { "mpn_rsh1add_n",     speed_mpn_rsh1add_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_rsh1sub_n
  { "mpn_rsh1sub_n",     speed_mpn_rsh1sub_n, FLAG_R_OPTIONAL },
#endif

  { "mpn_cnd_add_n",     speed_mpn_cnd_add_n, FLAG_R_OPTIONAL },
  { "mpn_cnd_sub_n",     speed_mpn_cnd_sub_n, FLAG_R_OPTIONAL },

  { "MPN_ZERO",          speed_MPN_ZERO             },

  { "binvert_limb",       speed_binvert_limb,       FLAG_NODATA },
  { "binvert_limb_mul1",  speed_binvert_limb_mul1,  FLAG_NODATA },
  { "binvert_limb_loop",  speed_binvert_limb_loop,  FLAG_NODATA },
  { "binvert_limb_cond",  speed_binvert_limb_cond,  FLAG_NODATA },
  { "binvert_limb_arith", speed_binvert_limb_arith, FLAG_NODATA },

  { "malloc_free",                  speed_malloc_free                  },
  { "malloc_realloc_free",          speed_malloc_realloc_free          },
  { "gmp_allocate_free",            speed_gmp_allocate_free            },
  { "gmp_allocate_reallocate_free", speed_gmp_allocate_reallocate_free },
  { "mpz_init_clear",               speed_mpz_init_clear               },
  { "mpq_init_clear",               speed_mpq_init_clear               },
  { "mpf_init_clear",               speed_mpf_init_clear               },
  { "mpz_init_realloc_clear",       speed_mpz_init_realloc_clear       },

  { "umul_ppmm",         speed_umul_ppmm,     FLAG_R_OPTIONAL },
#if HAVE_NATIVE_mpn_umul_ppmm
  { "mpn_umul_ppmm",     speed_mpn_umul_ppmm, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_umul_ppmm_r
  { "mpn_umul_ppmm_r",   speed_mpn_umul_ppmm_r, FLAG_R_OPTIONAL },
#endif

  { "count_leading_zeros",  speed_count_leading_zeros,  FLAG_NODATA | FLAG_R_OPTIONAL },
  { "count_trailing_zeros", speed_count_trailing_zeros, FLAG_NODATA | FLAG_R_OPTIONAL },

  { "udiv_qrnnd",             speed_udiv_qrnnd,             FLAG_R_OPTIONAL },
  { "udiv_qrnnd_c",           speed_udiv_qrnnd_c,           FLAG_R_OPTIONAL },
#if HAVE_NATIVE_mpn_udiv_qrnnd
  { "mpn_udiv_qrnnd",         speed_mpn_udiv_qrnnd,         FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_udiv_qrnnd_r
  { "mpn_udiv_qrnnd_r",       speed_mpn_udiv_qrnnd_r,       FLAG_R_OPTIONAL },
#endif
  { "invert_limb",            speed_invert_limb,            FLAG_R_OPTIONAL },

  { "operator_div",           speed_operator_div,           FLAG_R_OPTIONAL },
  { "operator_mod",           speed_operator_mod,           FLAG_R_OPTIONAL },

  { "gmp_randseed",    speed_gmp_randseed,    FLAG_R_OPTIONAL               },
  { "gmp_randseed_ui", speed_gmp_randseed_ui, FLAG_R_OPTIONAL | FLAG_NODATA },
  { "mpz_urandomb",    speed_mpz_urandomb,    FLAG_R_OPTIONAL | FLAG_NODATA },

#ifdef SPEED_EXTRA_ROUTINES
  SPEED_EXTRA_ROUTINES
#endif
#ifdef SPEED_EXTRA_ROUTINES2
  SPEED_EXTRA_ROUTINES2
#endif
};
564
565
/* One routine selected for measurement, together with its latest result.
   run_one fills in time, no_time and prev_time on every call.  */
struct choice_t {
  const struct routine_t  *p;         /* routine[] entry being measured */
  mp_limb_t               r;          /* copied to s->r before measuring */
  double                  scale;      /* factor applied to the measured time */
  double                  time;       /* latest result, -1.0 from
                                         speed_measure meaning none */
  int                     no_time;    /* nonzero if there's no result to show */
  double                  prev_time;  /* scaled time from the previous size,
                                         for CMP_DIFFPREV */
  const char              *name;      /* title for this column in the gnuplot
                                         file */
};
/* The selected routines, num_choices of them.  */
struct choice_t  *choice;
int  num_choices = 0;
577
578
579void
580data_fill (mp_ptr ptr, mp_size_t size)
581{
582  switch (option_data) {
583  case DATA_RANDOM:
584    mpn_random (ptr, size);
585    break;
586  case DATA_RANDOM2:
587    mpn_random2 (ptr, size);
588    break;
589  case DATA_ZEROS:
590    MPN_ZERO (ptr, size);
591    break;
592  case DATA_AAS:
593    MPN_FILL (ptr, size, GMP_NUMB_0xAA);
594    break;
595  case DATA_FFS:
596    MPN_FILL (ptr, size, GMP_NUMB_MAX);
597    break;
598  case DATA_2FD:
599    MPN_FILL (ptr, size, GMP_NUMB_MAX);
600    ptr[0] -= 2;
601    break;
602  default:
603    abort();
604    /*NOTREACHED*/
605  }
606}
607
608/* The code here handling the various combinations of output options isn't
609   too attractive, but it works and is fairly clean.  */
610
/* The amount of work a size n is taken to represent, which run_one uses as
   the divisor for UNIT_CYCLESPERLIMB: n*n when option_square is 1,
   n*(n+1)/2 when option_square is 2, and plain n otherwise.

   The argument is evaluated more than once, so it must have no side
   effects.

   NOTE(review): the products are formed in the type of n, so with an
   mp_size_t argument n*n can overflow for large sizes where mp_size_t is
   32 bits -- confirm whether such sizes are expected.  */
#define SIZE_TO_DIVISOR(n)              \
  (option_square == 1 ? (n)*(n)         \
  : option_square == 2 ? (n)*((n)+1)/2  \
  : (n))
615
616void
617run_one (FILE *fp, struct speed_params *s, mp_size_t prev_size)
618{
619  const char  *first_open_fastest, *first_open_notfastest, *first_close;
620  int         i, fastest, want_data;
621  double      fastest_time;
622  TMP_DECL;
623
624  TMP_MARK;
625
626  /* allocate data, unless all routines are NODATA */
627  want_data = 0;
628  for (i = 0; i < num_choices; i++)
629    want_data |= ((choice[i].p->flag & FLAG_NODATA) == 0);
630
631  if (want_data)
632    {
633      SPEED_TMP_ALLOC_LIMBS (sp.xp, s->size, s->align_xp);
634      SPEED_TMP_ALLOC_LIMBS (sp.yp, s->size, s->align_yp);
635
636      data_fill (s->xp, s->size);
637      data_fill (s->yp, s->size);
638    }
639  else
640    {
641      sp.xp = NULL;
642      sp.yp = NULL;
643    }
644
645  if (prev_size == -1 && option_cmp == CMP_DIFFPREV)
646    {
647      first_open_fastest = "(#";
648      first_open_notfastest = " (";
649      first_close = ")";
650    }
651  else
652    {
653      first_open_fastest = "#";
654      first_open_notfastest = " ";
655      first_close = "";
656    }
657
658  fastest = -1;
659  fastest_time = -1.0;
660  for (i = 0; i < num_choices; i++)
661    {
662      s->r = choice[i].r;
663      choice[i].time = speed_measure (choice[i].p->fun, s);
664      choice[i].no_time = (choice[i].time == -1.0);
665      if (! choice[i].no_time)
666        choice[i].time *= choice[i].scale;
667
668      /* Apply the effect of CMP_DIFFPREV, but the new choice[i].prev_time
669         is before any differences.  */
670      {
671        double     t;
672        t = choice[i].time;
673        if (t != -1.0 && option_cmp == CMP_DIFFPREV && prev_size != -1)
674          {
675            if (choice[i].prev_time == -1.0)
676              choice[i].no_time = 1;
677            else
678              choice[i].time = choice[i].time - choice[i].prev_time;
679          }
680        choice[i].prev_time = t;
681      }
682
683      if (choice[i].no_time)
684        continue;
685
686      /* Look for the fastest after CMP_DIFFPREV has been applied, but
687         before CMP_RATIO or CMP_DIFFERENCE.  There's only a fastest shown
688         if there's more than one routine.  */
689      if (num_choices > 1 && (fastest == -1 || choice[i].time < fastest_time))
690        {
691          fastest = i;
692          fastest_time = choice[i].time;
693        }
694
695      if (option_cmp == CMP_DIFFPREV)
696        {
697          /* Conversion for UNIT_CYCLESPERLIMB differs in CMP_DIFFPREV. */
698          if (option_unit == UNIT_CYCLES)
699            choice[i].time /= speed_cycletime;
700          else if (option_unit == UNIT_CYCLESPERLIMB)
701            {
702              if (prev_size == -1)
703                choice[i].time /= speed_cycletime;
704              else
705                choice[i].time /=  (speed_cycletime
706                                    * (SIZE_TO_DIVISOR(s->size)
707                                       - SIZE_TO_DIVISOR(prev_size)));
708            }
709        }
710      else
711        {
712          if (option_unit == UNIT_CYCLES)
713            choice[i].time /= speed_cycletime;
714          else if (option_unit == UNIT_CYCLESPERLIMB)
715            choice[i].time /= (speed_cycletime * SIZE_TO_DIVISOR(s->size));
716
717          if (option_cmp == CMP_RATIO && i > 0)
718            {
719              /* A ratio isn't affected by the units chosen. */
720              if (choice[0].no_time || choice[0].time == 0.0)
721                choice[i].no_time = 1;
722              else
723                choice[i].time /= choice[0].time;
724            }
725          else if (option_cmp == CMP_DIFFERENCE && i > 0)
726            {
727              if (choice[0].no_time)
728                {
729                  choice[i].no_time = 1;
730                  continue;
731                }
732              choice[i].time -= choice[0].time;
733            }
734        }
735    }
736
737  if (option_gnuplot)
738    {
739      /* In CMP_DIFFPREV, don't print anything for the first size, start
740         with the second where an actual difference is available.
741
742         In CMP_RATIO, print the first column as 1.0.
743
744         The 9 decimals printed is much more than the expected precision of
745         the measurements actually. */
746
747      if (! (option_cmp == CMP_DIFFPREV && prev_size == -1))
748        {
749          fprintf (fp, "%-6ld ", s->size);
750          for (i = 0; i < num_choices; i++)
751            fprintf (fp, "  %.9e",
752                     choice[i].no_time ? 0.0
753                     : (option_cmp == CMP_RATIO && i == 0) ? 1.0
754                     : choice[i].time);
755          fprintf (fp, "\n");
756        }
757    }
758  else
759    {
760      fprintf (fp, "%-6ld ", s->size);
761      for (i = 0; i < num_choices; i++)
762        {
763          char  buf[128];
764          int   decimals;
765
766          if (choice[i].no_time)
767            {
768              fprintf (fp, " %*s", COLUMN_WIDTH, "n/a");
769            }
770          else
771            {if (option_unit == UNIT_CYCLESPERLIMB
772                 || (option_cmp == CMP_RATIO && i > 0))
773                decimals = 4;
774              else if (option_unit == UNIT_CYCLES)
775                decimals = 2;
776              else
777                decimals = 9;
778
779              sprintf (buf, "%s%.*f%s",
780                       i == fastest ? first_open_fastest : first_open_notfastest,
781                       decimals, choice[i].time, first_close);
782              fprintf (fp, " %*s", COLUMN_WIDTH, buf);
783            }
784        }
785      fprintf (fp, "\n");
786    }
787
788  TMP_FREE;
789}
790
791void
792run_all (FILE *fp)
793{
794  mp_size_t  prev_size;
795  int        i;
796  TMP_DECL;
797
798  TMP_MARK;
799  SPEED_TMP_ALLOC_LIMBS (sp.xp_block, SPEED_BLOCK_SIZE, sp.align_xp);
800  SPEED_TMP_ALLOC_LIMBS (sp.yp_block, SPEED_BLOCK_SIZE, sp.align_yp);
801
802  data_fill (sp.xp_block, SPEED_BLOCK_SIZE);
803  data_fill (sp.yp_block, SPEED_BLOCK_SIZE);
804
805  for (i = 0; i < size_num; i++)
806    {
807      sp.size = size_array[i].start;
808      prev_size = -1;
809      for (;;)
810        {
811          mp_size_t  step;
812
813          if (option_data == DATA_2FD && sp.size >= 2)
814            sp.xp[sp.size-1] = 2;
815
816          run_one (fp, &sp, prev_size);
817          prev_size = sp.size;
818
819          if (option_data == DATA_2FD && sp.size >= 2)
820            sp.xp[sp.size-1] = MP_LIMB_T_MAX;
821
822          if (option_factor != 0.0)
823            {
824              step = (mp_size_t) (sp.size * option_factor - sp.size);
825              if (step < 1)
826                step = 1;
827            }
828          else
829            step = 1;
830          if (step < option_step)
831            step = option_step;
832
833          sp.size += step;
834          if (sp.size > size_array[i].end)
835            break;
836        }
837    }
838
839  TMP_FREE;
840}
841
842
/* Open filename for writing, truncating or creating it as for fopen mode
   "w", and return the stream.  If the file can't be opened a message is
   printed to stderr and the program exits with status 1, so the return
   value is never NULL.  */
FILE *
fopen_for_write (const char *filename)
{
  FILE  *stream = fopen (filename, "w");

  if (stream == NULL)
    {
      fprintf (stderr, "Cannot create %s\n", filename);
      exit(1);
    }

  return stream;
}
854
/* Close fp, a stream that was being written to the file called filename.
   If the stream's error indicator was set, or if the close itself fails, a
   message naming the file is printed to stderr and the program exits with
   status 1.  */
void
fclose_written (FILE *fp, const char *filename)
{
  /* The error indicator has to be read before the stream is closed. */
  int  write_failed = ferror (fp);
  int  close_failed = fclose (fp);

  if (write_failed == 0 && close_failed == 0)
    return;

  fprintf (stderr, "Error writing %s\n", filename);
  exit(1);
}
869
870
871void
872run_gnuplot (int argc, char *argv[])
873{
874  char  *plot_filename;
875  char  *data_filename;
876  FILE  *fp;
877  int   i;
878
879  plot_filename = (char *) (*__gmp_allocate_func)
880    (strlen (option_gnuplot_basename) + 20);
881  data_filename = (char *) (*__gmp_allocate_func)
882    (strlen (option_gnuplot_basename) + 20);
883
884  sprintf (plot_filename, "%s.gnuplot", option_gnuplot_basename);
885  sprintf (data_filename, "%s.data",    option_gnuplot_basename);
886
887  fp = fopen_for_write (plot_filename);
888
889  fprintf (fp, "# Generated with:\n");
890  fprintf (fp, "#");
891  for (i = 0; i < argc; i++)
892    fprintf (fp, " %s", argv[i]);
893  fprintf (fp, "\n");
894  fprintf (fp, "\n");
895
896  fprintf (fp, "reset\n");
897
898  /* Putting the key at the top left is usually good, and you can change it
899     interactively if it's not. */
900  fprintf (fp, "set key left\n");
901
902  /* write underscores, not subscripts */
903  fprintf (fp, "set termoption noenhanced\n");
904
905  /* designed to make it possible to see crossovers easily */
906  fprintf (fp, "set style data lines\n");
907
908  fprintf (fp, "plot ");
909  for (i = 0; i < num_choices; i++)
910    {
911      fprintf (fp, " \"%s\" using 1:%d", data_filename, i+2);
912      fprintf (fp, " title \"%s\"", choice[i].name);
913
914      if (i != num_choices-1)
915        fprintf (fp, ", \\");
916      fprintf (fp, "\n");
917    }
918
919  fprintf (fp, "load \"-\"\n");
920  fclose_written (fp, plot_filename);
921
922  fp = fopen_for_write (data_filename);
923
924  /* Unbuffered so you can see where the program was up to if it crashes or
925     you kill it. */
926  setbuf (fp, NULL);
927
928  run_all (fp);
929  fclose_written (fp, data_filename);
930}
931
932
/* Give a limb whose n least significant bits are ones and whose remaining
   bits are zeros.  n == 0 gives zero and n == GMP_LIMB_BITS gives
   MP_LIMB_T_MAX; neither of those is produced by shifting.  n is not range
   checked, and being a macro argument it is evaluated more than once.  */

#define LIMB_ONES(n)                                    \
  ((n) == 0 ? CNST_LIMB(0)                              \
   : (n) != GMP_LIMB_BITS ? (CNST_LIMB(1) << (n)) - 1   \
   : MP_LIMB_T_MAX)
939
940mp_limb_t
941r_string (const char *s)
942{
943  const char  *s_orig = s;
944  long        n;
945
946  if (strcmp (s, "aas") == 0)
947    return GMP_NUMB_0xAA;
948
949  {
950    mpz_t      z;
951    mp_limb_t  l;
952    int        set, siz;
953
954    mpz_init (z);
955    set = mpz_set_str (z, s, 0);
956    siz = SIZ(z);
957    l = (siz == 0 ? 0 : siz > 0 ? PTR(z)[0] : -PTR(z)[0]);
958    mpz_clear (z);
959    if (set == 0)
960      {
961        if (siz > 1 || siz < -1)
962          printf ("Warning, r parameter %s truncated to %d bits\n",
963                  s_orig, GMP_LIMB_BITS);
964        return l;
965      }
966  }
967
968  if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X'))
969    n = strtoul (s+2, (char **) &s, 16);
970  else
971    n = strtol (s, (char **) &s, 10);
972
973  if (strcmp (s, "bits") == 0)
974    {
975      mp_limb_t  l;
976      if (n > GMP_LIMB_BITS)
977        {
978          fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
979                   n, GMP_LIMB_BITS);
980          exit (1);
981        }
982      mpn_random (&l, 1);
983      return (l | (CNST_LIMB(1) << (n-1))) & LIMB_ONES(n);
984    }
985  else  if (strcmp (s, "ones") == 0)
986    {
987      if (n > GMP_LIMB_BITS)
988        {
989          fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
990                   n, GMP_LIMB_BITS);
991          exit (1);
992        }
993      return LIMB_ONES (n);
994    }
995  else if (*s != '\0')
996    {
997      fprintf (stderr, "invalid r parameter: %s\n", s_orig);
998      exit (1);
999    }
1000
1001  return n;
1002}
1003
1004
1005void
1006routine_find (struct choice_t *c, const char *s_orig)
1007{
1008  const char  *s;
1009  int     i;
1010  size_t  nlen;
1011
1012  c->name = s_orig;
1013  s = strchr (s_orig, '*');
1014  if (s != NULL)
1015    {
1016      c->scale = atof(s_orig);
1017      s++;
1018    }
1019  else
1020    {
1021      c->scale = 1.0;
1022      s = s_orig;
1023    }
1024
1025  for (i = 0; i < numberof (routine); i++)
1026    {
1027      nlen = strlen (routine[i].name);
1028      if (memcmp (s, routine[i].name, nlen) != 0)
1029        continue;
1030
1031      if (s[nlen] == '.')
1032        {
1033          /* match, with a .r parameter */
1034
1035          if (! (routine[i].flag & (FLAG_R|FLAG_R_OPTIONAL)))
1036            {
1037              fprintf (stderr,
1038                       "Choice %s bad: doesn't take a \".<r>\" parameter\n",
1039                       s_orig);
1040              exit (1);
1041            }
1042
1043          c->p = &routine[i];
1044          c->r = r_string (s + nlen + 1);
1045          return;
1046        }
1047
1048      if (s[nlen] == '\0')
1049        {
1050          /* match, with no parameter */
1051
1052          if (routine[i].flag & FLAG_R)
1053            {
1054              fprintf (stderr,
1055                       "Choice %s bad: needs a \".<r>\" parameter\n",
1056                       s_orig);
1057              exit (1);
1058            }
1059
1060          c->p = &routine[i];
1061          c->r = 0;
1062          return;
1063        }
1064    }
1065
1066  fprintf (stderr, "Choice %s unrecognised\n", s_orig);
1067  exit (1);
1068}
1069
1070
1071void
1072usage (void)
1073{
1074  int  i;
1075
1076  speed_time_init ();
1077
1078  printf ("Usage: speed [-options] -s size <routine>...\n");
1079  printf ("Measure the speed of some routines.\n");
1080  printf ("Times are in seconds, accuracy is shown.\n");
1081  printf ("\n");
1082  printf ("   -p num     set precision as number of time units each routine must run\n");
1083  printf ("   -s size[-end][,size[-end]]...   sizes to measure\n");
1084  printf ("              single sizes or ranges, sep with comma or use multiple -s\n");
1085  printf ("   -t step    step through sizes by given amount\n");
1086  printf ("   -f factor  step through sizes by given factor (eg. 1.05)\n");
1087  printf ("   -r         show times as ratios of the first routine\n");
1088  printf ("   -d         show times as difference from the first routine\n");
1089  printf ("   -D         show times as difference from previous size shown\n");
1090  printf ("   -c         show times in CPU cycles\n");
1091  printf ("   -C         show times in cycles per limb\n");
1092  printf ("   -u         print resource usage (memory) at end\n");
1093  printf ("   -P name    output plot files \"name.gnuplot\" and \"name.data\"\n");
1094  printf ("   -a <type>  use given data: random(default), random2, zeros, aas, ffs, 2fd\n");
1095  printf ("   -x, -y, -w, -W <align>  specify data alignments, sources and dests\n");
1096  printf ("   -o addrs   print addresses of data blocks\n");
1097  printf ("\n");
1098  printf ("If both -t and -f are used, it means step by the factor or the step, whichever\n");
1099  printf ("is greater.\n");
1100  printf ("If both -C and -D are used, it means cycles per however many limbs between a\n");
1101  printf ("size and the previous size.\n");
1102  printf ("\n");
1103  printf ("After running with -P, plots can be viewed with Gnuplot or Quickplot.\n");
1104  printf ("\"gnuplot name.gnuplot\" (use \"set logscale xy; replot\" at the prompt for\n");
1105  printf ("a log/log plot).\n");
1106  printf ("\"quickplot -s name.data\" (has interactive zooming, and note -s is important\n");
1107  printf ("when viewing more than one routine, it means same axis scales for all data).\n");
1108  printf ("\n");
1109  printf ("The available routines are as follows.\n");
1110  printf ("\n");
1111
1112  for (i = 0; i < numberof (routine); i++)
1113    {
1114      if (routine[i].flag & FLAG_R)
1115        printf ("\t%s.r\n", routine[i].name);
1116      else if (routine[i].flag & FLAG_R_OPTIONAL)
1117        printf ("\t%s (optional .r)\n", routine[i].name);
1118      else
1119        printf ("\t%s\n", routine[i].name);
1120    }
1121  printf ("\n");
1122  printf ("Routines with a \".r\" need an extra parameter, for example mpn_lshift.6\n");
1123  printf ("r should be in decimal, or use 0xN for hexadecimal.\n");
1124  printf ("\n");
1125  printf ("Special forms for r are \"<N>bits\" for a random N bit number, \"<N>ones\" for\n");
1126  printf ("N one bits, or \"aas\" for 0xAA..AA.\n");
1127  printf ("\n");
1128  printf ("Times for sizes out of the range accepted by a routine are shown as 0.\n");
1129  printf ("The fastest routine at each size is marked with a # (free form output only).\n");
1130  printf ("\n");
1131  printf ("%s", speed_time_string);
1132  printf ("\n");
1133  printf ("Gnuplot home page http://www.gnuplot.info/\n");
1134  printf ("Quickplot home page http://quickplot.sourceforge.net/\n");
1135}
1136
1137void
1138check_align_option (const char *name, mp_size_t align)
1139{
1140  if (align < 0 || align > SPEED_TMP_ALLOC_ADJUST_MASK)
1141    {
1142      fprintf (stderr, "Alignment request out of range: %s %ld\n",
1143               name, (long) align);
1144      fprintf (stderr, "  should be 0 to %d (limbs), inclusive\n",
1145               SPEED_TMP_ALLOC_ADJUST_MASK);
1146      exit (1);
1147    }
1148}
1149
1150int
1151main (int argc, char *argv[])
1152{
1153  int  i;
1154  int  opt;
1155
1156  /* Unbuffered so output goes straight out when directed to a pipe or file
1157     and isn't lost on killing the program half way.  */
1158  setbuf (stdout, NULL);
1159
1160  for (;;)
1161    {
1162      opt = getopt(argc, argv, "a:CcDdEFf:o:p:P:rRs:t:ux:y:w:W:z");
1163      if (opt == EOF)
1164        break;
1165
1166      switch (opt) {
1167      case 'a':
1168        if (strcmp (optarg, "random") == 0)       option_data = DATA_RANDOM;
1169        else if (strcmp (optarg, "random2") == 0) option_data = DATA_RANDOM2;
1170        else if (strcmp (optarg, "zeros") == 0)   option_data = DATA_ZEROS;
1171        else if (strcmp (optarg, "aas") == 0)     option_data = DATA_AAS;
1172        else if (strcmp (optarg, "ffs") == 0)     option_data = DATA_FFS;
1173        else if (strcmp (optarg, "2fd") == 0)     option_data = DATA_2FD;
1174        else
1175          {
1176            fprintf (stderr, "unrecognised data option: %s\n", optarg);
1177            exit (1);
1178          }
1179        break;
1180      case 'C':
1181        if (option_unit  != UNIT_SECONDS) goto bad_unit;
1182        option_unit = UNIT_CYCLESPERLIMB;
1183        break;
1184      case 'c':
1185        if (option_unit != UNIT_SECONDS)
1186          {
1187          bad_unit:
1188            fprintf (stderr, "cannot use more than one of -c, -C\n");
1189            exit (1);
1190          }
1191        option_unit = UNIT_CYCLES;
1192        break;
1193      case 'D':
1194        if (option_cmp != CMP_ABSOLUTE) goto bad_cmp;
1195        option_cmp = CMP_DIFFPREV;
1196        break;
1197      case 'd':
1198        if (option_cmp != CMP_ABSOLUTE)
1199          {
1200          bad_cmp:
1201            fprintf (stderr, "cannot use more than one of -d, -D, -r\n");
1202            exit (1);
1203          }
1204        option_cmp = CMP_DIFFERENCE;
1205        break;
1206      case 'E':
1207        option_square = 1;
1208        break;
1209      case 'F':
1210        option_square = 2;
1211        break;
1212      case 'f':
1213        option_factor = atof (optarg);
1214        if (option_factor <= 1.0)
1215          {
1216            fprintf (stderr, "-f factor must be > 1.0\n");
1217            exit (1);
1218          }
1219        break;
1220      case 'o':
1221        speed_option_set (optarg);
1222        break;
1223      case 'P':
1224        option_gnuplot = 1;
1225        option_gnuplot_basename = optarg;
1226        break;
1227      case 'p':
1228        speed_precision = atoi (optarg);
1229        break;
1230      case 'R':
1231        option_seed = time (NULL);
1232        break;
1233      case 'r':
1234        if (option_cmp != CMP_ABSOLUTE)
1235          goto bad_cmp;
1236        option_cmp = CMP_RATIO;
1237        break;
1238      case 's':
1239        {
1240          char  *s;
1241          for (s = strtok (optarg, ","); s != NULL; s = strtok (NULL, ","))
1242            {
1243              if (size_num == size_allocnum)
1244                {
1245                  size_array = (struct size_array_t *)
1246                    __gmp_allocate_or_reallocate
1247                    (size_array,
1248                     size_allocnum * sizeof(size_array[0]),
1249                     (size_allocnum+10) * sizeof(size_array[0]));
1250                  size_allocnum += 10;
1251                }
1252              if (sscanf (s, "%ld-%ld",
1253                          &size_array[size_num].start,
1254                          &size_array[size_num].end) != 2)
1255                {
1256                  size_array[size_num].start = size_array[size_num].end
1257                    = atol (s);
1258                }
1259
1260              if (size_array[size_num].start < 0
1261                  || size_array[size_num].end < 0
1262                  || size_array[size_num].start > size_array[size_num].end)
1263                {
1264                  fprintf (stderr, "invalid size parameter: %s\n", s);
1265                  exit (1);
1266                }
1267
1268              size_num++;
1269            }
1270        }
1271        break;
1272      case 't':
1273        option_step = atol (optarg);
1274        if (option_step < 1)
1275          {
1276            fprintf (stderr, "-t step must be >= 1\n");
1277            exit (1);
1278          }
1279        break;
1280      case 'u':
1281        option_resource_usage = 1;
1282        break;
1283      case 'z':
1284        sp.cache = 1;
1285        break;
1286      case 'x':
1287        sp.align_xp = atol (optarg);
1288        check_align_option ("-x", sp.align_xp);
1289        break;
1290      case 'y':
1291        sp.align_yp = atol (optarg);
1292        check_align_option ("-y", sp.align_yp);
1293        break;
1294      case 'w':
1295        sp.align_wp = atol (optarg);
1296        check_align_option ("-w", sp.align_wp);
1297        break;
1298      case 'W':
1299        sp.align_wp2 = atol (optarg);
1300        check_align_option ("-W", sp.align_wp2);
1301        break;
1302      case '?':
1303        exit(1);
1304      }
1305    }
1306
1307  if (optind >= argc)
1308    {
1309      usage ();
1310      exit (1);
1311    }
1312
1313  if (size_num == 0)
1314    {
1315      fprintf (stderr, "-s <size> must be specified\n");
1316      exit (1);
1317    }
1318
1319  gmp_randinit_default (__gmp_rands);
1320  __gmp_rands_initialized = 1;
1321  gmp_randseed_ui (__gmp_rands, option_seed);
1322
1323  choice = (struct choice_t *) (*__gmp_allocate_func)
1324    ((argc - optind) * sizeof(choice[0]));
1325  for ( ; optind < argc; optind++)
1326    {
1327      struct choice_t  c;
1328      routine_find (&c, argv[optind]);
1329      choice[num_choices] = c;
1330      num_choices++;
1331    }
1332
1333  if ((option_cmp == CMP_RATIO || option_cmp == CMP_DIFFERENCE) &&
1334      num_choices < 2)
1335    {
1336      fprintf (stderr, "WARNING, -d or -r does nothing when only one routine requested\n");
1337    }
1338
1339  speed_time_init ();
1340  if (option_unit == UNIT_CYCLES || option_unit == UNIT_CYCLESPERLIMB)
1341    speed_cycletime_need_cycles ();
1342  else
1343    speed_cycletime_need_seconds ();
1344
1345  if (option_gnuplot)
1346    {
1347      run_gnuplot (argc, argv);
1348    }
1349  else
1350    {
1351      if (option_unit == UNIT_SECONDS)
1352        printf ("overhead %.9f secs", speed_measure (speed_noop, NULL));
1353      else
1354        printf ("overhead %.2f cycles",
1355                speed_measure (speed_noop, NULL) / speed_cycletime);
1356      printf (", precision %d units of %.2e secs",
1357              speed_precision, speed_unittime);
1358
1359      if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
1360        printf (", CPU freq unknown\n");
1361      else
1362        printf (", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
1363
1364      printf ("       ");
1365      for (i = 0; i < num_choices; i++)
1366        printf (" %*s", COLUMN_WIDTH, choice[i].name);
1367      printf ("\n");
1368
1369      run_all (stdout);
1370    }
1371
1372  if (option_resource_usage)
1373    {
1374#if HAVE_GETRUSAGE
1375      {
1376        /* This doesn't give data sizes on linux 2.0.x, only utime. */
1377        struct rusage  r;
1378        if (getrusage (RUSAGE_SELF, &r) != 0)
1379          perror ("getrusage");
1380        else
1381          printf ("getrusage(): utime %ld.%06ld data %ld stack %ld maxresident %ld\n",
1382                  (long) r.ru_utime.tv_sec, (long) r.ru_utime.tv_usec,
1383                  r.ru_idrss, r.ru_isrss, r.ru_ixrss);
1384      }
1385#else
1386      printf ("getrusage() not available\n");
1387#endif
1388
1389      /* Linux kernel. */
1390      {
1391        char  buf[128];
1392        sprintf (buf, "/proc/%d/status", getpid());
1393        if (access (buf, R_OK) == 0)
1394          {
1395            sprintf (buf, "cat /proc/%d/status", getpid());
1396            system (buf);
1397          }
1398
1399      }
1400    }
1401
1402  return 0;
1403}
1404