speed.h revision 1.1.1.4
1/* Header for speed and threshold things.
2
3Copyright 1999-2003, 2005, 2006, 2008-2017, 2019 Free Software
4Foundation, Inc.
5
6This file is part of the GNU MP Library.
7
8The GNU MP Library is free software; you can redistribute it and/or modify
9it under the terms of either:
10
11  * the GNU Lesser General Public License as published by the Free
12    Software Foundation; either version 3 of the License, or (at your
13    option) any later version.
14
15or
16
17  * the GNU General Public License as published by the Free Software
18    Foundation; either version 2 of the License, or (at your option) any
19    later version.
20
21or both in parallel, as here.
22
23The GNU MP Library is distributed in the hope that it will be useful, but
24WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26for more details.
27
28You should have received copies of the GNU General Public License and the
29GNU Lesser General Public License along with the GNU MP Library.  If not,
30see https://www.gnu.org/licenses/.  */
31
32#ifndef __SPEED_H__
33#define __SPEED_H__
34
35
36/* Pad ptr,oldsize with zero limbs (at the most significant end) to make it
37   newsize long. */
38#define MPN_ZERO_EXTEND(ptr, oldsize, newsize)		\
39  do {							\
40    ASSERT ((newsize) >= (oldsize));			\
41    MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize));	\
42  } while (0)
43
44/* A mask of the least significant n bits.  Note 1<<32 doesn't give zero on
45   x86 family CPUs, hence the separate case for GMP_LIMB_BITS. */
46#define MP_LIMB_T_LOWBITMASK(n)	\
47  ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1)
48
49
50/* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */
51
52#define TMP_ALLOC_ALIGNED(bytes, align)	\
53  align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align))
54#define TMP_ALLOC_LIMBS_ALIGNED(limbs, align)	\
55  ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align))
56
57/* CACHE_LINE_SIZE is our default alignment for speed operands, and the
58   limit on what s->align_xp etc and then request for off-alignment.  Maybe
59   this should be an option of some sort, but in any case here are some line
60   sizes,
61
62       bytes
63	 32   pentium
64	 64   athlon
65	 64   itanium-2 L1
66	128   itanium-2 L2
67*/
68#define CACHE_LINE_SIZE   64 /* bytes */
69
70#define SPEED_TMP_ALLOC_ADJUST_MASK  (CACHE_LINE_SIZE/GMP_LIMB_BYTES - 1)
71
72/* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb
73   alignment.  */
74#define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align)			\
75  do {									\
76    mp_ptr     __ptr;							\
77    mp_size_t  __ptr_align, __ptr_add;					\
78									\
79    ASSERT ((CACHE_LINE_SIZE % GMP_LIMB_BYTES) == 0);		\
80    __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK);	\
81    __ptr_align = (__ptr - (mp_ptr) NULL);				\
82    __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK;	\
83    (ptr) = __ptr + __ptr_add;						\
84  } while (0)
85
86
87/* This is the size for s->xp_block and s->yp_block, used in certain
88   routines that want to run across many different data values and use
89   s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.
90
91   512 means 2kbytes of data for each of xp_block and yp_block, making 4k
92   total, which should fit easily in any L1 data cache. */
93
94#define SPEED_BLOCK_SIZE   512 /* limbs */
95
96
97extern double  speed_unittime;
98extern double  speed_cycletime;
99extern int     speed_precision;
100extern char    speed_time_string[];
101void speed_time_init (void);
102void speed_cycletime_fail (const char *str);
103void speed_cycletime_init (void);
104void speed_cycletime_need_cycles (void);
105void speed_cycletime_need_seconds (void);
106void speed_starttime (void);
107double speed_endtime (void);
108
109
110struct speed_params {
111  unsigned   reps;	/* how many times to run the routine */
112  mp_ptr     xp;	/* first argument */
113  mp_ptr     yp;	/* second argument */
114  mp_size_t  size;	/* size of both arguments */
115  mp_limb_t  r;		/* user supplied parameter */
116  mp_size_t  align_xp;	/* alignment of xp */
117  mp_size_t  align_yp;	/* alignment of yp */
118  mp_size_t  align_wp;	/* intended alignment of wp */
119  mp_size_t  align_wp2; /* intended alignment of wp2 */
120  mp_ptr     xp_block;	/* first special SPEED_BLOCK_SIZE block */
121  mp_ptr     yp_block;	/* second special SPEED_BLOCK_SIZE block */
122
123  double     time_divisor; /* optionally set by the speed routine */
124
125  /* used by the cache priming things */
126  int	     cache;
127  unsigned   src_num, dst_num;
128  struct {
129    mp_ptr    ptr;
130    mp_size_t size;
131  } src[5], dst[4];
132};
133
134typedef double (*speed_function_t) (struct speed_params *);
135
136double speed_measure (speed_function_t fun, struct speed_params *);
137
138/* Prototypes for speed measuring routines */
139
140double speed_back_to_back (struct speed_params *);
141double speed_count_leading_zeros (struct speed_params *);
142double speed_count_trailing_zeros (struct speed_params *);
143double speed_find_a (struct speed_params *);
144double speed_gmp_allocate_free (struct speed_params *);
145double speed_gmp_allocate_reallocate_free (struct speed_params *);
146double speed_invert_limb (struct speed_params *);
147double speed_malloc_free (struct speed_params *);
148double speed_malloc_realloc_free (struct speed_params *);
149double speed_memcpy (struct speed_params *);
150double speed_binvert_limb (struct speed_params *);
151double speed_binvert_limb_mul1 (struct speed_params *);
152double speed_binvert_limb_loop (struct speed_params *);
153double speed_binvert_limb_cond (struct speed_params *);
154double speed_binvert_limb_arith (struct speed_params *);
155
156double speed_mpf_init_clear (struct speed_params *);
157
158double speed_mpn_add_n (struct speed_params *);
159double speed_mpn_add_1 (struct speed_params *);
160double speed_mpn_add_1_inplace (struct speed_params *);
161double speed_mpn_add_err1_n (struct speed_params *);
162double speed_mpn_add_err2_n (struct speed_params *);
163double speed_mpn_add_err3_n (struct speed_params *);
164double speed_mpn_addlsh_n (struct speed_params *);
165double speed_mpn_addlsh1_n (struct speed_params *);
166double speed_mpn_addlsh2_n (struct speed_params *);
167double speed_mpn_addlsh_n_ip1 (struct speed_params *);
168double speed_mpn_addlsh1_n_ip1 (struct speed_params *);
169double speed_mpn_addlsh2_n_ip1 (struct speed_params *);
170double speed_mpn_addlsh_n_ip2 (struct speed_params *);
171double speed_mpn_addlsh1_n_ip2 (struct speed_params *);
172double speed_mpn_addlsh2_n_ip2 (struct speed_params *);
173double speed_mpn_add_n_sub_n (struct speed_params *);
174double speed_mpn_and_n (struct speed_params *);
175double speed_mpn_andn_n (struct speed_params *);
176double speed_mpn_addmul_1 (struct speed_params *);
177double speed_mpn_addmul_2 (struct speed_params *);
178double speed_mpn_addmul_3 (struct speed_params *);
179double speed_mpn_addmul_4 (struct speed_params *);
180double speed_mpn_addmul_5 (struct speed_params *);
181double speed_mpn_addmul_6 (struct speed_params *);
182double speed_mpn_addmul_7 (struct speed_params *);
183double speed_mpn_addmul_8 (struct speed_params *);
184double speed_mpn_cnd_add_n (struct speed_params *);
185double speed_mpn_cnd_sub_n (struct speed_params *);
186double speed_mpn_com (struct speed_params *);
187double speed_mpn_neg (struct speed_params *);
188double speed_mpn_copyd (struct speed_params *);
189double speed_mpn_copyi (struct speed_params *);
190double speed_MPN_COPY (struct speed_params *);
191double speed_MPN_COPY_DECR (struct speed_params *);
192double speed_MPN_COPY_INCR (struct speed_params *);
193double speed_mpn_sec_tabselect (struct speed_params *);
194double speed_mpn_divexact_1 (struct speed_params *);
195double speed_mpn_divexact_by3 (struct speed_params *);
196double speed_mpn_bdiv_q_1 (struct speed_params *);
197double speed_mpn_pi1_bdiv_q_1 (struct speed_params *);
198double speed_mpn_bdiv_dbm1c (struct speed_params *);
199double speed_mpn_divrem_1 (struct speed_params *);
200double speed_mpn_divrem_1f (struct speed_params *);
201double speed_mpn_divrem_1c (struct speed_params *);
202double speed_mpn_divrem_1cf (struct speed_params *);
203double speed_mpn_divrem_1_div (struct speed_params *);
204double speed_mpn_divrem_1f_div (struct speed_params *);
205double speed_mpn_divrem_1_inv (struct speed_params *);
206double speed_mpn_divrem_1f_inv (struct speed_params *);
207double speed_mpn_divrem_2 (struct speed_params *);
208double speed_mpn_divrem_2_div (struct speed_params *);
209double speed_mpn_divrem_2_inv (struct speed_params *);
210double speed_mpn_div_qr_1n_pi1 (struct speed_params *);
211double speed_mpn_div_qr_1n_pi1_1 (struct speed_params *);
212double speed_mpn_div_qr_1n_pi1_2 (struct speed_params *);
213double speed_mpn_div_qr_1 (struct speed_params *);
214double speed_mpn_div_qr_2n (struct speed_params *);
215double speed_mpn_div_qr_2u (struct speed_params *);
216double speed_mpn_fib2_ui (struct speed_params *);
217double speed_mpn_matrix22_mul (struct speed_params *);
218double speed_mpn_hgcd2 (struct speed_params *);
219double speed_mpn_hgcd2_1 (struct speed_params *);
220double speed_mpn_hgcd2_2 (struct speed_params *);
221double speed_mpn_hgcd2_3 (struct speed_params *);
222double speed_mpn_hgcd2_4 (struct speed_params *);
223double speed_mpn_hgcd2_5 (struct speed_params *);
224double speed_mpn_hgcd (struct speed_params *);
225double speed_mpn_hgcd_lehmer (struct speed_params *);
226double speed_mpn_hgcd_appr (struct speed_params *);
227double speed_mpn_hgcd_appr_lehmer (struct speed_params *);
228double speed_mpn_hgcd_reduce (struct speed_params *);
229double speed_mpn_hgcd_reduce_1 (struct speed_params *);
230double speed_mpn_hgcd_reduce_2 (struct speed_params *);
231double speed_mpn_gcd (struct speed_params *);
232double speed_mpn_gcd_1 (struct speed_params *);
233double speed_mpn_gcd_11 (struct speed_params *);
234double speed_mpn_gcd_1N (struct speed_params *);
235double speed_mpn_gcd_22 (struct speed_params *);
236double speed_mpn_gcdext (struct speed_params *);
237double speed_mpn_gcdext_double (struct speed_params *);
238double speed_mpn_gcdext_one_double (struct speed_params *);
239double speed_mpn_gcdext_one_single (struct speed_params *);
240double speed_mpn_gcdext_single (struct speed_params *);
241double speed_mpn_get_str (struct speed_params *);
242double speed_mpn_hamdist (struct speed_params *);
243double speed_mpn_ior_n (struct speed_params *);
244double speed_mpn_iorn_n (struct speed_params *);
245double speed_mpn_jacobi_base (struct speed_params *);
246double speed_mpn_jacobi_base_1 (struct speed_params *);
247double speed_mpn_jacobi_base_2 (struct speed_params *);
248double speed_mpn_jacobi_base_3 (struct speed_params *);
249double speed_mpn_jacobi_base_4 (struct speed_params *);
250double speed_mpn_lshift (struct speed_params *);
251double speed_mpn_lshiftc (struct speed_params *);
252double speed_mpn_mod_1 (struct speed_params *);
253double speed_mpn_mod_1c (struct speed_params *);
254double speed_mpn_mod_1_div (struct speed_params *);
255double speed_mpn_mod_1_inv (struct speed_params *);
256double speed_mpn_mod_1_1 (struct speed_params *);
257double speed_mpn_mod_1_1_1 (struct speed_params *);
258double speed_mpn_mod_1_1_2 (struct speed_params *);
259double speed_mpn_mod_1_2 (struct speed_params *);
260double speed_mpn_mod_1_3 (struct speed_params *);
261double speed_mpn_mod_1_4 (struct speed_params *);
262double speed_mpn_mod_34lsub1 (struct speed_params *);
263double speed_mpn_modexact_1_odd (struct speed_params *);
264double speed_mpn_modexact_1c_odd (struct speed_params *);
265double speed_mpn_mul_1 (struct speed_params *);
266double speed_mpn_mul_1_inplace (struct speed_params *);
267double speed_mpn_mul_2 (struct speed_params *);
268double speed_mpn_mul_3 (struct speed_params *);
269double speed_mpn_mul_4 (struct speed_params *);
270double speed_mpn_mul_5 (struct speed_params *);
271double speed_mpn_mul_6 (struct speed_params *);
272double speed_mpn_mul (struct speed_params *);
273double speed_mpn_mul_basecase (struct speed_params *);
274double speed_mpn_mulmid (struct speed_params *);
275double speed_mpn_mulmid_basecase (struct speed_params *);
276double speed_mpn_mul_fft (struct speed_params *);
277double speed_mpn_mul_fft_sqr (struct speed_params *);
278double speed_mpn_fft_mul (struct speed_params *);
279double speed_mpn_fft_sqr (struct speed_params *);
280#if WANT_OLD_FFT_FULL
281double speed_mpn_mul_fft_full (struct speed_params *);
282double speed_mpn_mul_fft_full_sqr (struct speed_params *);
283#endif
284double speed_mpn_nussbaumer_mul (struct speed_params *);
285double speed_mpn_nussbaumer_mul_sqr (struct speed_params *);
286double speed_mpn_mul_n (struct speed_params *);
287double speed_mpn_mul_n_sqr (struct speed_params *);
288double speed_mpn_mulmid_n (struct speed_params *);
289double speed_mpn_sqrlo (struct speed_params *);
290double speed_mpn_sqrlo_basecase (struct speed_params *);
291double speed_mpn_mullo_n (struct speed_params *);
292double speed_mpn_mullo_basecase (struct speed_params *);
293double speed_mpn_nand_n (struct speed_params *);
294double speed_mpn_nior_n (struct speed_params *);
295double speed_mpn_popcount (struct speed_params *);
296double speed_mpn_preinv_divrem_1 (struct speed_params *);
297double speed_mpn_preinv_divrem_1f (struct speed_params *);
298double speed_mpn_preinv_mod_1 (struct speed_params *);
299double speed_mpn_sbpi1_div_qr (struct speed_params *);
300double speed_mpn_dcpi1_div_qr (struct speed_params *);
301double speed_mpn_sbpi1_divappr_q (struct speed_params *);
302double speed_mpn_dcpi1_divappr_q (struct speed_params *);
303double speed_mpn_mu_div_qr (struct speed_params *);
304double speed_mpn_mu_divappr_q (struct speed_params *);
305double speed_mpn_mupi_div_qr (struct speed_params *);
306double speed_mpn_mu_div_q (struct speed_params *);
307double speed_mpn_sbpi1_bdiv_qr (struct speed_params *);
308double speed_mpn_dcpi1_bdiv_qr (struct speed_params *);
309double speed_mpn_sbpi1_bdiv_q (struct speed_params *);
310double speed_mpn_dcpi1_bdiv_q (struct speed_params *);
311double speed_mpn_sbpi1_bdiv_r (struct speed_params *);
312double speed_mpn_mu_bdiv_q (struct speed_params *);
313double speed_mpn_mu_bdiv_qr (struct speed_params *);
314double speed_mpn_broot (struct speed_params *);
315double speed_mpn_broot_invm1 (struct speed_params *);
316double speed_mpn_brootinv (struct speed_params *);
317double speed_mpn_invert (struct speed_params *);
318double speed_mpn_invertappr (struct speed_params *);
319double speed_mpn_ni_invertappr (struct speed_params *);
320double speed_mpn_sec_invert (struct speed_params *s);
321double speed_mpn_binvert (struct speed_params *);
322double speed_mpn_redc_1 (struct speed_params *);
323double speed_mpn_redc_2 (struct speed_params *);
324double speed_mpn_redc_n (struct speed_params *);
325double speed_mpn_rsblsh_n (struct speed_params *);
326double speed_mpn_rsblsh1_n (struct speed_params *);
327double speed_mpn_rsblsh2_n (struct speed_params *);
328double speed_mpn_rsh1add_n (struct speed_params *);
329double speed_mpn_rsh1sub_n (struct speed_params *);
330double speed_mpn_rshift (struct speed_params *);
331double speed_mpn_sb_divrem_m3 (struct speed_params *);
332double speed_mpn_sb_divrem_m3_div (struct speed_params *);
333double speed_mpn_sb_divrem_m3_inv (struct speed_params *);
334double speed_mpn_set_str (struct speed_params *);
335double speed_mpn_bc_set_str (struct speed_params *);
336double speed_mpn_dc_set_str (struct speed_params *);
337double speed_mpn_set_str_pre (struct speed_params *);
338double speed_mpn_sqr_basecase (struct speed_params *);
339double speed_mpn_sqr_diag_addlsh1 (struct speed_params *);
340double speed_mpn_sqr_diagonal (struct speed_params *);
341double speed_mpn_sqr (struct speed_params *);
342double speed_mpn_sqrtrem (struct speed_params *);
343double speed_mpn_rootrem (struct speed_params *);
344double speed_mpn_sqrt (struct speed_params *);
345double speed_mpn_root (struct speed_params *);
346double speed_mpn_perfect_power_p (struct speed_params *);
347double speed_mpn_perfect_square_p (struct speed_params *);
348double speed_mpn_sub_n (struct speed_params *);
349double speed_mpn_sub_1 (struct speed_params *);
350double speed_mpn_sub_1_inplace (struct speed_params *);
351double speed_mpn_sub_err1_n (struct speed_params *);
352double speed_mpn_sub_err2_n (struct speed_params *);
353double speed_mpn_sub_err3_n (struct speed_params *);
354double speed_mpn_sublsh_n (struct speed_params *);
355double speed_mpn_sublsh1_n (struct speed_params *);
356double speed_mpn_sublsh2_n (struct speed_params *);
357double speed_mpn_sublsh_n_ip1 (struct speed_params *);
358double speed_mpn_sublsh1_n_ip1 (struct speed_params *);
359double speed_mpn_sublsh2_n_ip1 (struct speed_params *);
360double speed_mpn_submul_1 (struct speed_params *);
361double speed_mpn_toom2_sqr (struct speed_params *);
362double speed_mpn_toom3_sqr (struct speed_params *);
363double speed_mpn_toom4_sqr (struct speed_params *);
364double speed_mpn_toom6_sqr (struct speed_params *);
365double speed_mpn_toom8_sqr (struct speed_params *);
366double speed_mpn_toom22_mul (struct speed_params *);
367double speed_mpn_toom33_mul (struct speed_params *);
368double speed_mpn_toom44_mul (struct speed_params *);
369double speed_mpn_toom6h_mul (struct speed_params *);
370double speed_mpn_toom8h_mul (struct speed_params *);
371double speed_mpn_toom32_mul (struct speed_params *);
372double speed_mpn_toom42_mul (struct speed_params *);
373double speed_mpn_toom43_mul (struct speed_params *);
374double speed_mpn_toom63_mul (struct speed_params *);
375double speed_mpn_toom32_for_toom43_mul (struct speed_params *);
376double speed_mpn_toom43_for_toom32_mul (struct speed_params *);
377double speed_mpn_toom32_for_toom53_mul (struct speed_params *);
378double speed_mpn_toom53_for_toom32_mul (struct speed_params *);
379double speed_mpn_toom42_for_toom53_mul (struct speed_params *);
380double speed_mpn_toom53_for_toom42_mul (struct speed_params *);
381double speed_mpn_toom43_for_toom54_mul (struct speed_params *);
382double speed_mpn_toom54_for_toom43_mul (struct speed_params *);
383double speed_mpn_toom42_mulmid (struct speed_params *);
384double speed_mpn_mulmod_bnm1 (struct speed_params *);
385double speed_mpn_bc_mulmod_bnm1 (struct speed_params *);
386double speed_mpn_mulmod_bnm1_rounded (struct speed_params *);
387double speed_mpn_sqrmod_bnm1 (struct speed_params *);
388double speed_mpn_udiv_qrnnd (struct speed_params *);
389double speed_mpn_udiv_qrnnd_r (struct speed_params *);
390double speed_mpn_umul_ppmm (struct speed_params *);
391double speed_mpn_umul_ppmm_r (struct speed_params *);
392double speed_mpn_xnor_n (struct speed_params *);
393double speed_mpn_xor_n (struct speed_params *);
394double speed_MPN_ZERO (struct speed_params *);
395
396double speed_mpq_init_clear (struct speed_params *);
397
398double speed_mpz_add (struct speed_params *);
399double speed_mpz_invert (struct speed_params *);
400double speed_mpz_bin_uiui (struct speed_params *);
401double speed_mpz_bin_ui (struct speed_params *);
402double speed_mpz_fac_ui (struct speed_params *);
403double speed_mpz_2fac_ui (struct speed_params *);
404double speed_mpz_mfac_uiui (struct speed_params *);
405double speed_mpz_primorial_ui (struct speed_params *);
406double speed_mpz_fib_ui (struct speed_params *);
407double speed_mpz_fib2_ui (struct speed_params *);
408double speed_mpz_init_clear (struct speed_params *);
409double speed_mpz_init_realloc_clear (struct speed_params *);
410double speed_mpz_nextprime (struct speed_params *);
411double speed_mpz_jacobi (struct speed_params *);
412double speed_mpz_lucnum_ui (struct speed_params *);
413double speed_mpz_lucnum2_ui (struct speed_params *);
414double speed_mpz_mod (struct speed_params *);
415double speed_mpz_powm (struct speed_params *);
416double speed_mpz_powm_mod (struct speed_params *);
417double speed_mpz_powm_redc (struct speed_params *);
418double speed_mpz_powm_sec (struct speed_params *);
419double speed_mpz_powm_ui (struct speed_params *);
420double speed_mpz_urandomb (struct speed_params *);
421
422double speed_gmp_randseed (struct speed_params *);
423double speed_gmp_randseed_ui (struct speed_params *);
424
425double speed_noop (struct speed_params *);
426double speed_noop_wxs (struct speed_params *);
427double speed_noop_wxys (struct speed_params *);
428
429double speed_operator_div (struct speed_params *);
430double speed_operator_mod (struct speed_params *);
431
432double speed_udiv_qrnnd (struct speed_params *);
433double speed_udiv_qrnnd_preinv1 (struct speed_params *);
434double speed_udiv_qrnnd_preinv2 (struct speed_params *);
435double speed_udiv_qrnnd_preinv3 (struct speed_params *);
436double speed_udiv_qrnnd_c (struct speed_params *);
437double speed_umul_ppmm (struct speed_params *);
438
439/* Prototypes for other routines */
440
441#if defined (__cplusplus)
442extern "C" {
443#endif
444
445/* low 32-bits in p[0], high 32-bits in p[1] */
446void speed_cyclecounter (unsigned p[2]);
447
448#if defined (__cplusplus)
449}
450#endif
451
452void mftb_function (unsigned p[2]);
453
454double speed_cyclecounter_diff (const unsigned [2], const unsigned [2]);
455int gettimeofday_microseconds_p (void);
456int getrusage_microseconds_p (void);
457int cycles_works_p (void);
458long clk_tck (void);
459double freq_measure (const char *, double (*)(void));
460
461int double_cmp_ptr (const double *, const double *);
462void pentium_wbinvd (void);
463typedef int (*qsort_function_t) (const void *, const void *);
464
465void noop (void);
466void noop_1 (mp_limb_t);
467void noop_wxs (mp_ptr, mp_srcptr, mp_size_t);
468void noop_wxys (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
469void mpn_cache_fill (mp_srcptr, mp_size_t);
470void mpn_cache_fill_dummy (mp_limb_t);
471void speed_cache_fill (struct speed_params *);
472void speed_operand_src (struct speed_params *, mp_ptr, mp_size_t);
473void speed_operand_dst (struct speed_params *, mp_ptr, mp_size_t);
474
475extern int  speed_option_addrs;
476extern int  speed_option_verbose;
477extern int  speed_option_cycles_broken;
478void speed_option_set (const char *);
479
480mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
481mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
482
483mp_limb_t mpn_divrem_1_div (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
484mp_limb_t mpn_divrem_1_inv (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
485mp_limb_t mpn_divrem_2_div (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
486mp_limb_t mpn_divrem_2_inv (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
487
488int mpn_jacobi_base_1 (mp_limb_t, mp_limb_t, int);
489int mpn_jacobi_base_2 (mp_limb_t, mp_limb_t, int);
490int mpn_jacobi_base_3 (mp_limb_t, mp_limb_t, int);
491int mpn_jacobi_base_4 (mp_limb_t, mp_limb_t, int);
492
493int mpn_hgcd2_1 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
494int mpn_hgcd2_2 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
495int mpn_hgcd2_3 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
496int mpn_hgcd2_4 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
497int mpn_hgcd2_5 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
498
499mp_limb_t mpn_mod_1_div (mp_srcptr, mp_size_t, mp_limb_t);
500mp_limb_t mpn_mod_1_inv (mp_srcptr, mp_size_t, mp_limb_t);
501
502mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
503mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
504
505void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t);
506void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t);
507
508mp_size_t mpn_gcdext_one_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
509mp_size_t mpn_gcdext_one_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
510mp_size_t mpn_gcdext_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
511mp_size_t mpn_gcdext_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
512mp_size_t mpn_hgcd_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
513mp_size_t mpn_hgcd_lehmer_itch (mp_size_t);
514
515mp_size_t mpn_hgcd_appr_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
516mp_size_t mpn_hgcd_appr_lehmer_itch (mp_size_t);
517
518mp_size_t mpn_hgcd_reduce_1 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
519mp_size_t mpn_hgcd_reduce_1_itch (mp_size_t, mp_size_t);
520
521mp_size_t mpn_hgcd_reduce_2 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
522mp_size_t mpn_hgcd_reduce_2_itch (mp_size_t, mp_size_t);
523
524mp_limb_t mpn_sb_divrem_mn_div (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
525mp_limb_t mpn_sb_divrem_mn_inv (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
526
527mp_size_t mpn_set_str_basecase (mp_ptr, const unsigned char *, size_t, int);
528void mpn_pre_set_str (mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr);
529
530void mpz_powm_mod (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
531void mpz_powm_redc (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
532
533int speed_routine_count_zeros_setup (struct speed_params *, mp_ptr, int, int);
534
535
536/* "get" is called repeatedly until it ticks over, just in case on a fast
537   processor it takes less than a microsecond, though this is probably
538   unlikely if it's a system call.
539
540   speed_cyclecounter is called on the same side of the "get" for the start
541   and end measurements.  It doesn't matter how long it takes from the "get"
542   sample to the cycles sample, since that period will cancel out in the
543   difference calculation (assuming it's the same each time).
544
545   Letting the test run for more than a process time slice is probably only
546   going to reduce accuracy, especially for getrusage when the cycle counter
547   is real time, or for gettimeofday if the cycle counter is in fact process
548   time.  Use CLK_TCK/2 as a reasonable stop.
549
550   It'd be desirable to be quite accurate here.  The default speed_precision
551   for a cycle counter is 10000 cycles, so to mix that with getrusage or
552   gettimeofday the frequency should be at least that accurate.  But running
553   measurements for 10000 microseconds (or more) is too long.  Be satisfied
554   with just a half clock tick (5000 microseconds usually).  */
555
556#define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec)		\
557  do {									\
558    type      st1, st, et1, et;						\
559    unsigned  sc[2], ec[2];						\
560    long      dt, half_tick;						\
561    double    dc, cyc;							\
562									\
563    half_tick = (1000000L / clk_tck()) / 2;				\
564									\
565    get (st1);								\
566    do {								\
567      get (st);								\
568    } while (usec(st) == usec(st1) && sec(st) == sec(st1));		\
569									\
570    getc (sc);								\
571									\
572    for (;;)								\
573      {									\
574	get (et1);							\
575	do {								\
576	  get (et);							\
577	} while (usec(et) == usec(et1) && sec(et) == sec(et1));		\
578									\
579	getc (ec);							\
580									\
581	dc = speed_cyclecounter_diff (ec, sc);				\
582									\
583	/* allow secs to cancel before multiplying */			\
584	dt = sec(et) - sec(st);						\
585	dt = dt * 1000000L + (usec(et) - usec(st));			\
586									\
587	if (dt >= half_tick)						\
588	  break;							\
589      }									\
590									\
591    cyc = dt * 1e-6 / dc;						\
592									\
593    if (speed_option_verbose >= 2)					\
594      printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n",	\
595	      name, dc, dt, cyc);					\
596									\
597    return dt * 1e-6 / dc;						\
598									\
599  } while (0)
600
601
602
603
604/* The measuring routines use these big macros to save duplication for
605   similar forms.  They also get used for some automatically generated
606   measuring of new implementations of functions.
607
608   Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
609   function pointer is considered undesirable since it's not the way a
610   normal application will be calling, and some processors might do
611   different things with an indirect call, like not branch predicting, or
612   doing a full pipe flush.  At least some of the "functions" measured are
613   actually macros too.
614
615   The net effect is to bloat the object code, possibly in a big way, but
616   only what's being measured is being run, so that doesn't matter.
617
618   The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or
619   ATTRIBUTE_CONST on the called functions.  Adding a cast to a non-pure
620   function pointer doesn't work in gcc 3.2.  Using an actual non-pure
621   function pointer variable works, but stands a real risk of a
622   non-optimizing compiler generating unnecessary overheads in the call.
623   Currently the best idea is not to use those attributes for a timing
624   program build.  __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and
625   gmp-impl.h to omit them from routines there.  */
626
627#define SPEED_RESTRICT_COND(cond)   if (!(cond)) return -1.0;
628
629/* For mpn_copy or similar. */
630#define SPEED_ROUTINE_MPN_COPY_CALL(call)				\
631  {									\
632    mp_ptr    wp;							\
633    unsigned  i;							\
634    double    t;							\
635    TMP_DECL;								\
636									\
637    SPEED_RESTRICT_COND (s->size >= 0);					\
638									\
639    TMP_MARK;								\
640    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
641									\
642    speed_operand_src (s, s->xp, s->size);				\
643    speed_operand_dst (s, wp, s->size);					\
644    speed_cache_fill (s);						\
645									\
646    speed_starttime ();							\
647    i = s->reps;							\
648    do									\
649      call;								\
650    while (--i != 0);							\
651    t = speed_endtime ();						\
652									\
653    TMP_FREE;								\
654    return t;								\
655  }
656#define SPEED_ROUTINE_MPN_COPY(function)				\
657  SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size))
658
659#define SPEED_ROUTINE_MPN_TABSELECT(function)				\
660  {									\
661    mp_ptr    xp, wp;							\
662    unsigned  i;							\
663    double    t;							\
664    TMP_DECL;								\
665									\
666    SPEED_RESTRICT_COND (s->size >= 0);					\
667									\
668    if (s->r == 0)							\
669      s->r = s->size;	/* default to a quadratic shape */		\
670									\
671    TMP_MARK;								\
672    SPEED_TMP_ALLOC_LIMBS (xp, s->size * s->r, s->align_xp);		\
673    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
674									\
675    speed_operand_src (s, xp, s->size * s->r);				\
676    speed_operand_dst (s, wp, s->size);					\
677    speed_cache_fill (s);						\
678									\
679    speed_starttime ();							\
680    i = s->reps;							\
681    do									\
682      function (wp, xp, s->size, s->r, (s->r) / 2);			\
683    while (--i != 0);							\
684    t = speed_endtime () / s->r;					\
685									\
686    TMP_FREE;								\
687    return t;								\
688  }
689
690
691#define SPEED_ROUTINE_MPN_COPYC(function)				\
692  {									\
693    mp_ptr    wp;							\
694    unsigned  i;							\
695    double    t;							\
696    TMP_DECL;								\
697									\
698    SPEED_RESTRICT_COND (s->size >= 0);					\
699									\
700    TMP_MARK;								\
701    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
702									\
703    speed_operand_src (s, s->xp, s->size);				\
704    speed_operand_dst (s, wp, s->size);					\
705    speed_cache_fill (s);						\
706									\
707    speed_starttime ();							\
708    i = s->reps;							\
709    do									\
710      function (wp, s->xp, s->size, 0);					\
711    while (--i != 0);							\
712    t = speed_endtime ();						\
713									\
714    TMP_FREE;								\
715    return t;								\
716  }
717
718/* s->size is still in limbs, and it's limbs which are copied, but
719   "function" takes a size in bytes not limbs.  */
720#define SPEED_ROUTINE_MPN_COPY_BYTES(function)				\
721  {									\
722    mp_ptr    wp;							\
723    unsigned  i;							\
724    double    t;							\
725    TMP_DECL;								\
726									\
727    SPEED_RESTRICT_COND (s->size >= 0);					\
728									\
729    TMP_MARK;								\
730    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
731									\
732    speed_operand_src (s, s->xp, s->size);				\
733    speed_operand_dst (s, wp, s->size);					\
734    speed_cache_fill (s);						\
735									\
736    speed_starttime ();							\
737    i = s->reps;							\
738    do									\
739      function (wp, s->xp, s->size * GMP_LIMB_BYTES);		\
740    while (--i != 0);							\
741    t = speed_endtime ();						\
742									\
743    TMP_FREE;								\
744    return t;								\
745  }
746
747
748/* For mpn_add_n, mpn_sub_n, or similar. */
749#define SPEED_ROUTINE_MPN_BINARY_N_CALL(call)				\
750  {									\
751    mp_ptr     wp;							\
752    mp_ptr     xp, yp;							\
753    unsigned   i;							\
754    double     t;							\
755    TMP_DECL;								\
756									\
757    SPEED_RESTRICT_COND (s->size >= 1);					\
758									\
759    TMP_MARK;								\
760    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
761									\
762    xp = s->xp;								\
763    yp = s->yp;								\
764									\
765    if (s->r == 0)	;						\
766    else if (s->r == 1) { xp = wp;	    }				\
767    else if (s->r == 2) {	   yp = wp; }				\
768    else if (s->r == 3) { xp = wp; yp = wp; }				\
769    else if (s->r == 4) {     yp = xp;	    }				\
770    else		{						\
771      TMP_FREE;								\
772      return -1.0;							\
773    }									\
774									\
775    /* initialize wp if operand overlap */				\
776    if (xp == wp || yp == wp)						\
777      MPN_COPY (wp, s->xp, s->size);					\
778									\
779    speed_operand_src (s, xp, s->size);					\
780    speed_operand_src (s, yp, s->size);					\
781    speed_operand_dst (s, wp, s->size);					\
782    speed_cache_fill (s);						\
783									\
784    speed_starttime ();							\
785    i = s->reps;							\
786    do									\
787      call;								\
788    while (--i != 0);							\
789    t = speed_endtime ();						\
790									\
791    TMP_FREE;								\
792    return t;								\
793  }
794
795
796/* For mpn_aors_errK_n, where 1 <= K <= 3. */
797#define SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL(call, K)			\
798  {									\
799    mp_ptr     wp;							\
800    mp_ptr     xp, yp;							\
801    mp_ptr     zp[K];							\
802    mp_limb_t  ep[2*K];							\
803    unsigned   i;							\
804    double     t;							\
805    TMP_DECL;								\
806									\
807    SPEED_RESTRICT_COND (s->size >= 1);					\
808									\
809    TMP_MARK;								\
810    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
811									\
812    /* (don't have a mechanism to specify zp alignments) */		\
813    for (i = 0; i < K; i++)						\
814      SPEED_TMP_ALLOC_LIMBS (zp[i], s->size, 0);			\
815									\
816    xp = s->xp;								\
817    yp = s->yp;								\
818									\
819    if (s->r == 0)	;						\
820    else if (s->r == 1) { xp = wp;	    }				\
821    else if (s->r == 2) {	   yp = wp; }				\
822    else if (s->r == 3) { xp = wp; yp = wp; }				\
823    else if (s->r == 4) {     yp = xp;	    }				\
824    else		{						\
825      TMP_FREE;								\
826      return -1.0;							\
827    }									\
828									\
829    /* initialize wp if operand overlap */				\
830    if (xp == wp || yp == wp)						\
831      MPN_COPY (wp, s->xp, s->size);					\
832									\
833    speed_operand_src (s, xp, s->size);					\
834    speed_operand_src (s, yp, s->size);					\
835    for (i = 0; i < K; i++)						\
836      speed_operand_src (s, zp[i], s->size);				\
837    speed_operand_dst (s, wp, s->size);					\
838    speed_cache_fill (s);						\
839									\
840    speed_starttime ();							\
841    i = s->reps;							\
842    do									\
843      call;								\
844    while (--i != 0);							\
845    t = speed_endtime ();						\
846									\
847    TMP_FREE;								\
848    return t;								\
849  }
850
851#define SPEED_ROUTINE_MPN_BINARY_ERR1_N(function)			\
852  SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], s->size, 0), 1)
853
854#define SPEED_ROUTINE_MPN_BINARY_ERR2_N(function)			\
855  SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], s->size, 0), 2)
856
857#define SPEED_ROUTINE_MPN_BINARY_ERR3_N(function)			\
858  SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], zp[2], s->size, 0), 3)
859
860
861/* For mpn_add_n, mpn_sub_n, or similar. */
862#define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call)				\
863  {									\
864    mp_ptr     ap, sp;							\
865    mp_ptr     xp, yp;							\
866    unsigned   i;							\
867    double     t;							\
868    TMP_DECL;								\
869									\
870    SPEED_RESTRICT_COND (s->size >= 1);					\
871									\
872    TMP_MARK;								\
873    SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp);			\
874    SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp);			\
875									\
876    xp = s->xp;								\
877    yp = s->yp;								\
878									\
879    if ((s->r & 1) != 0) { xp = ap; }					\
880    if ((s->r & 2) != 0) { yp = ap; }					\
881    if ((s->r & 4) != 0) { xp = sp; }					\
882    if ((s->r & 8) != 0) { yp = sp; }					\
883    if ((s->r & 3) == 3  ||  (s->r & 12) == 12)				\
884      {									\
885	TMP_FREE;							\
886	return -1.0;							\
887      }									\
888									\
889    /* initialize ap if operand overlap */				\
890    if (xp == ap || yp == ap)						\
891      MPN_COPY (ap, s->xp, s->size);					\
892    /* initialize sp if operand overlap */				\
893    if (xp == sp || yp == sp)						\
894      MPN_COPY (sp, s->xp, s->size);					\
895									\
896    speed_operand_src (s, xp, s->size);					\
897    speed_operand_src (s, yp, s->size);					\
898    speed_operand_dst (s, ap, s->size);					\
899    speed_operand_dst (s, sp, s->size);					\
900    speed_cache_fill (s);						\
901									\
902    speed_starttime ();							\
903    i = s->reps;							\
904    do									\
905      call;								\
906    while (--i != 0);							\
907    t = speed_endtime ();						\
908									\
909    TMP_FREE;								\
910    return t;								\
911  }
912
913#define SPEED_ROUTINE_MPN_BINARY_N(function)				\
914   SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size))
915
916#define SPEED_ROUTINE_MPN_BINARY_NC(function)				\
917   SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0))
918
919
920/* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar. */
921#define SPEED_ROUTINE_MPN_UNARY_1_CALL(call)				\
922  {									\
923    mp_ptr    wp;							\
924    unsigned  i;							\
925    double    t;							\
926    TMP_DECL;								\
927									\
928    SPEED_RESTRICT_COND (s->size >= 1);					\
929									\
930    TMP_MARK;								\
931    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
932									\
933    speed_operand_src (s, s->xp, s->size);				\
934    speed_operand_dst (s, wp, s->size);					\
935    speed_cache_fill (s);						\
936									\
937    speed_starttime ();							\
938    i = s->reps;							\
939    do									\
940      call;								\
941    while (--i != 0);							\
942    t = speed_endtime ();						\
943									\
944    TMP_FREE;								\
945    return t;								\
946  }
947
948#define SPEED_ROUTINE_MPN_UNARY_1(function)				\
949  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
950
951#define SPEED_ROUTINE_MPN_UNARY_1C(function)				\
952  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
953
954/* FIXME: wp is uninitialized here, should start it off from xp */
955#define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function)			\
956  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, wp, s->size, s->r))
957
958#define SPEED_ROUTINE_MPN_DIVEXACT_1(function)				\
959  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
960
961#define SPEED_ROUTINE_MPN_BDIV_Q_1(function)				\
962    SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
963
964#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call)			\
965  {									\
966    unsigned   shift;							\
967    mp_limb_t  dinv;							\
968									\
969    SPEED_RESTRICT_COND (s->size > 0);					\
970    SPEED_RESTRICT_COND (s->r != 0);					\
971									\
972    count_trailing_zeros (shift, s->r);					\
973    binvert_limb (dinv, s->r >> shift);					\
974									\
975    SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
976  }
977#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function)			\
978  SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL					\
979  ((*function) (wp, s->xp, s->size, s->r, dinv, shift))
980
981#define SPEED_ROUTINE_MPN_BDIV_DBM1C(function)				\
982  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
983
984#define SPEED_ROUTINE_MPN_DIVREM_1(function)				\
985  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))
986
987#define SPEED_ROUTINE_MPN_DIVREM_1C(function)				\
988  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))
989
990#define SPEED_ROUTINE_MPN_DIVREM_1F(function)				\
991  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))
992
993#define SPEED_ROUTINE_MPN_DIVREM_1CF(function)				\
994  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))
995
996
997#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call)			\
998  {									\
999    unsigned   shift;							\
1000    mp_limb_t  dinv;							\
1001									\
1002    SPEED_RESTRICT_COND (s->size >= 0);					\
1003    SPEED_RESTRICT_COND (s->r != 0);					\
1004									\
1005    count_leading_zeros (shift, s->r);					\
1006    invert_limb (dinv, s->r << shift);					\
1007									\
1008    SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
1009  }									\
1010
1011#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function)			\
1012  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL				\
1013  ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))
1014
1015/* s->size limbs worth of fraction part */
1016#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function)			\
1017  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL				\
1018  ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))
1019
1020
1021/* s->r is duplicated to form the multiplier, defaulting to
1022   MP_BASES_BIG_BASE_10.  Not sure if that's particularly useful, but at
1023   least it provides some control.  */
1024#define SPEED_ROUTINE_MPN_UNARY_N(function,N)				\
1025  {									\
1026    mp_ptr     wp;							\
1027    mp_size_t  wn;							\
1028    unsigned   i;							\
1029    double     t;							\
1030    mp_limb_t  yp[N];							\
1031    TMP_DECL;								\
1032									\
1033    SPEED_RESTRICT_COND (s->size >= N);					\
1034									\
1035    TMP_MARK;								\
1036    wn = s->size + N-1;							\
1037    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);			\
1038    for (i = 0; i < N; i++)						\
1039      yp[i] = (s->r != 0 ? s->r : MP_BASES_BIG_BASE_10);		\
1040									\
1041    speed_operand_src (s, s->xp, s->size);				\
1042    speed_operand_src (s, yp, (mp_size_t) N);				\
1043    speed_operand_dst (s, wp, wn);					\
1044    speed_cache_fill (s);						\
1045									\
1046    speed_starttime ();							\
1047    i = s->reps;							\
1048    do									\
1049      function (wp, s->xp, s->size, yp);				\
1050    while (--i != 0);							\
1051    t = speed_endtime ();						\
1052									\
1053    TMP_FREE;								\
1054    return t;								\
1055  }
1056
1057#define SPEED_ROUTINE_MPN_UNARY_2(function)				\
1058  SPEED_ROUTINE_MPN_UNARY_N (function, 2)
1059#define SPEED_ROUTINE_MPN_UNARY_3(function)				\
1060  SPEED_ROUTINE_MPN_UNARY_N (function, 3)
1061#define SPEED_ROUTINE_MPN_UNARY_4(function)				\
1062  SPEED_ROUTINE_MPN_UNARY_N (function, 4)
1063#define SPEED_ROUTINE_MPN_UNARY_5(function)				\
1064  SPEED_ROUTINE_MPN_UNARY_N (function, 5)
1065#define SPEED_ROUTINE_MPN_UNARY_6(function)				\
1066  SPEED_ROUTINE_MPN_UNARY_N (function, 6)
1067#define SPEED_ROUTINE_MPN_UNARY_7(function)				\
1068  SPEED_ROUTINE_MPN_UNARY_N (function, 7)
1069#define SPEED_ROUTINE_MPN_UNARY_8(function)				\
1070  SPEED_ROUTINE_MPN_UNARY_N (function, 8)
1071
1072
1073/* For mpn_mul, mpn_mul_basecase, xsize=r, ysize=s->size. */
1074#define SPEED_ROUTINE_MPN_MUL(function)					\
1075  {									\
1076    mp_ptr    wp;							\
1077    mp_size_t size1;							\
1078    unsigned  i;							\
1079    double    t;							\
1080    TMP_DECL;								\
1081									\
1082    size1 = (s->r == 0 ? s->size : s->r);				\
1083    if (size1 < 0) size1 = -size1 - s->size;				\
1084									\
1085    SPEED_RESTRICT_COND (size1 >= 1);					\
1086    SPEED_RESTRICT_COND (s->size >= size1);				\
1087									\
1088    TMP_MARK;								\
1089    SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp);		\
1090									\
1091    speed_operand_src (s, s->xp, s->size);				\
1092    speed_operand_src (s, s->yp, size1);				\
1093    speed_operand_dst (s, wp, size1 + s->size);				\
1094    speed_cache_fill (s);						\
1095									\
1096    speed_starttime ();							\
1097    i = s->reps;							\
1098    do									\
1099      function (wp, s->xp, s->size, s->yp, size1);			\
1100    while (--i != 0);							\
1101    t = speed_endtime ();						\
1102									\
1103    TMP_FREE;								\
1104    return t;								\
1105  }
1106
1107
1108#define SPEED_ROUTINE_MPN_MUL_N_CALL(call)				\
1109  {									\
1110    mp_ptr    wp;							\
1111    unsigned  i;							\
1112    double    t;							\
1113    TMP_DECL;								\
1114									\
1115    SPEED_RESTRICT_COND (s->size >= 1);					\
1116									\
1117    TMP_MARK;								\
1118    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1119									\
1120    speed_operand_src (s, s->xp, s->size);				\
1121    speed_operand_src (s, s->yp, s->size);				\
1122    speed_operand_dst (s, wp, 2*s->size);				\
1123    speed_cache_fill (s);						\
1124									\
1125    speed_starttime ();							\
1126    i = s->reps;							\
1127    do									\
1128      call;								\
1129    while (--i != 0);							\
1130    t = speed_endtime ();						\
1131									\
1132    TMP_FREE;								\
1133    return t;								\
1134  }
1135
1136#define SPEED_ROUTINE_MPN_MUL_N(function)				\
1137  SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size));
1138
1139#define SPEED_ROUTINE_MPN_MULLO_N_CALL(call)				\
1140  {									\
1141    mp_ptr    wp;							\
1142    unsigned  i;							\
1143    double    t;							\
1144    TMP_DECL;								\
1145									\
1146    SPEED_RESTRICT_COND (s->size >= 1);					\
1147									\
1148    TMP_MARK;								\
1149    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
1150									\
1151    speed_operand_src (s, s->xp, s->size);				\
1152    speed_operand_src (s, s->yp, s->size);				\
1153    speed_operand_dst (s, wp, s->size);					\
1154    speed_cache_fill (s);						\
1155									\
1156    speed_starttime ();							\
1157    i = s->reps;							\
1158    do									\
1159      call;								\
1160    while (--i != 0);							\
1161    t = speed_endtime ();						\
1162									\
1163    TMP_FREE;								\
1164    return t;								\
1165  }
1166
1167#define SPEED_ROUTINE_MPN_MULLO_N(function)				\
1168  SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size));
1169
1170#define SPEED_ROUTINE_MPN_MULLO_BASECASE(function)			\
1171  SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size));
1172
1173#define SPEED_ROUTINE_MPN_SQRLO(function)				\
1174  {									\
1175    mp_ptr    wp;							\
1176    unsigned  i;							\
1177    double    t;							\
1178    TMP_DECL;								\
1179									\
1180    SPEED_RESTRICT_COND (s->size >= 1);					\
1181									\
1182    TMP_MARK;								\
1183    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
1184									\
1185    speed_operand_src (s, s->xp, s->size);				\
1186    speed_operand_dst (s, wp, s->size);					\
1187    speed_cache_fill (s);						\
1188									\
1189    speed_starttime ();							\
1190    i = s->reps;							\
1191    do									\
1192      function (wp, s->xp, s->size);					\
1193    while (--i != 0);							\
1194    t = speed_endtime ();						\
1195									\
1196    TMP_FREE;								\
1197    return t;								\
1198  }
1199
1200/* For mpn_mulmid, mpn_mulmid_basecase, xsize=r, ysize=s->size. */
1201#define SPEED_ROUTINE_MPN_MULMID(function)				\
1202  {									\
1203    mp_ptr    wp, xp;							\
1204    mp_size_t size1;							\
1205    unsigned  i;							\
1206    double    t;							\
1207    TMP_DECL;								\
1208									\
1209    size1 = (s->r == 0 ? (2 * s->size - 1) : s->r);			\
1210									\
1211    SPEED_RESTRICT_COND (s->size >= 1);					\
1212    SPEED_RESTRICT_COND (size1 >= s->size);				\
1213									\
1214    TMP_MARK;								\
1215    SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
1216    SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
1217									\
1218    speed_operand_src (s, xp, size1);					\
1219    speed_operand_src (s, s->yp, s->size);				\
1220    speed_operand_dst (s, wp, size1 - s->size + 3);			\
1221    speed_cache_fill (s);						\
1222									\
1223    speed_starttime ();							\
1224    i = s->reps;							\
1225    do									\
1226      function (wp, xp, size1, s->yp, s->size);				\
1227    while (--i != 0);							\
1228    t = speed_endtime ();						\
1229									\
1230    TMP_FREE;								\
1231    return t;								\
1232  }
1233
1234#define SPEED_ROUTINE_MPN_MULMID_N(function)				\
1235  {									\
1236    mp_ptr    wp, xp;							\
1237    mp_size_t size1;							\
1238    unsigned  i;							\
1239    double    t;							\
1240    TMP_DECL;								\
1241									\
1242    size1 = 2 * s->size - 1;						\
1243									\
1244    SPEED_RESTRICT_COND (s->size >= 1);					\
1245									\
1246    TMP_MARK;								\
1247    SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
1248    SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
1249									\
1250    speed_operand_src (s, xp, size1);					\
1251    speed_operand_src (s, s->yp, s->size);				\
1252    speed_operand_dst (s, wp, size1 - s->size + 3);			\
1253    speed_cache_fill (s);						\
1254									\
1255    speed_starttime ();							\
1256    i = s->reps;							\
1257    do									\
1258      function (wp, xp, s->yp, s->size);				\
1259    while (--i != 0);							\
1260    t = speed_endtime ();						\
1261									\
1262    TMP_FREE;								\
1263    return t;								\
1264  }
1265
1266#define SPEED_ROUTINE_MPN_TOOM42_MULMID(function)			\
1267  {									\
1268    mp_ptr    wp, xp, scratch;						\
1269    mp_size_t size1, scratch_size;					\
1270    unsigned  i;							\
1271    double    t;							\
1272    TMP_DECL;								\
1273									\
1274    size1 = 2 * s->size - 1;						\
1275									\
1276    SPEED_RESTRICT_COND (s->size >= 1);					\
1277									\
1278    TMP_MARK;								\
1279    SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
1280    SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
1281    scratch_size = mpn_toom42_mulmid_itch (s->size);			\
1282    SPEED_TMP_ALLOC_LIMBS (scratch, scratch_size, 0);			\
1283									\
1284    speed_operand_src (s, xp, size1);					\
1285    speed_operand_src (s, s->yp, s->size);				\
1286    speed_operand_dst (s, wp, size1 - s->size + 3);			\
1287    speed_cache_fill (s);						\
1288									\
1289    speed_starttime ();							\
1290    i = s->reps;							\
1291    do									\
1292      function (wp, xp, s->yp, s->size, scratch);			\
1293    while (--i != 0);							\
1294    t = speed_endtime ();						\
1295									\
1296    TMP_FREE;								\
1297    return t;								\
1298  }
1299
1300#define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call)			\
1301  {									\
1302    mp_ptr    wp, tp;							\
1303    unsigned  i;							\
1304    double    t;							\
1305    mp_size_t itch;							\
1306    TMP_DECL;								\
1307									\
1308    SPEED_RESTRICT_COND (s->size >= 1);					\
1309									\
1310    itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size);		\
1311									\
1312    TMP_MARK;								\
1313    SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);		\
1314    SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);			\
1315									\
1316    speed_operand_src (s, s->xp, s->size);				\
1317    speed_operand_src (s, s->yp, s->size);				\
1318    speed_operand_dst (s, wp, 2 * s->size);				\
1319    speed_operand_dst (s, tp, itch);					\
1320    speed_cache_fill (s);						\
1321									\
1322    speed_starttime ();							\
1323    i = s->reps;							\
1324    do									\
1325      call;								\
1326    while (--i != 0);							\
1327    t = speed_endtime ();						\
1328									\
1329    TMP_FREE;								\
1330    return t;								\
1331  }
1332#define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function)			\
1333  {									\
1334    mp_ptr    wp, tp;							\
1335    unsigned  i;							\
1336    double    t;							\
1337    mp_size_t size, itch;						\
1338    TMP_DECL;								\
1339									\
1340    SPEED_RESTRICT_COND (s->size >= 1);					\
1341									\
1342    size = mpn_mulmod_bnm1_next_size (s->size);				\
1343    itch = mpn_mulmod_bnm1_itch (size, size, size);			\
1344									\
1345    TMP_MARK;								\
1346    SPEED_TMP_ALLOC_LIMBS (wp, size, s->align_wp);			\
1347    SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);			\
1348									\
1349    speed_operand_src (s, s->xp, s->size);				\
1350    speed_operand_src (s, s->yp, s->size);				\
1351    speed_operand_dst (s, wp, size);					\
1352    speed_operand_dst (s, tp, itch);					\
1353    speed_cache_fill (s);						\
1354									\
1355    speed_starttime ();							\
1356    i = s->reps;							\
1357    do									\
1358      function (wp, size, s->xp, s->size, s->yp, s->size, tp);		\
1359    while (--i != 0);							\
1360    t = speed_endtime ();						\
1361									\
1362    TMP_FREE;								\
1363    return t;								\
1364  }
1365
1366#define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize)		\
1367  {									\
1368    mp_ptr    wp, tspace;						\
1369    unsigned  i;							\
1370    double    t;							\
1371    TMP_DECL;								\
1372									\
1373    SPEED_RESTRICT_COND (s->size >= minsize);				\
1374									\
1375    TMP_MARK;								\
1376    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1377    SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
1378									\
1379    speed_operand_src (s, s->xp, s->size);				\
1380    speed_operand_src (s, s->yp, s->size);				\
1381    speed_operand_dst (s, wp, 2*s->size);				\
1382    speed_operand_dst (s, tspace, tsize);				\
1383    speed_cache_fill (s);						\
1384									\
1385    speed_starttime ();							\
1386    i = s->reps;							\
1387    do									\
1388      call;								\
1389    while (--i != 0);							\
1390    t = speed_endtime ();						\
1391									\
1392    TMP_FREE;								\
1393    return t;								\
1394  }
1395
1396#define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function)			\
1397  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1398    (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1399     mpn_toom22_mul_itch (s->size, s->size),				\
1400     MPN_TOOM22_MUL_MINSIZE)
1401
1402#define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function)			\
1403  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1404    (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1405     mpn_toom33_mul_itch (s->size, s->size),				\
1406     MPN_TOOM33_MUL_MINSIZE)
1407
1408#define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function)			\
1409  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1410    (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1411     mpn_toom44_mul_itch (s->size, s->size),				\
1412     MPN_TOOM44_MUL_MINSIZE)
1413
1414#define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function)			\
1415  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1416    (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1417     mpn_toom6h_mul_itch (s->size, s->size),				\
1418     MPN_TOOM6H_MUL_MINSIZE)
1419
1420#define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function)			\
1421  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1422    (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1423     mpn_toom8h_mul_itch (s->size, s->size),				\
1424     MPN_TOOM8H_MUL_MINSIZE)
1425
1426#define SPEED_ROUTINE_MPN_TOOM32_MUL(function)				\
1427  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1428    (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace),		\
1429     mpn_toom32_mul_itch (s->size, 2*s->size/3),			\
1430     MPN_TOOM32_MUL_MINSIZE)
1431
1432#define SPEED_ROUTINE_MPN_TOOM42_MUL(function)				\
1433  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1434    (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
1435     mpn_toom42_mul_itch (s->size, s->size/2),				\
1436     MPN_TOOM42_MUL_MINSIZE)
1437
1438#define SPEED_ROUTINE_MPN_TOOM43_MUL(function)				\
1439  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1440    (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace),		\
1441     mpn_toom43_mul_itch (s->size, s->size*3/4),			\
1442     MPN_TOOM43_MUL_MINSIZE)
1443
1444#define SPEED_ROUTINE_MPN_TOOM63_MUL(function)				\
1445  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1446    (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
1447     mpn_toom63_mul_itch (s->size, s->size/2),				\
1448     MPN_TOOM63_MUL_MINSIZE)
1449
1450#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function)		\
1451  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1452    (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
1453     mpn_toom32_mul_itch (s->size, 17*s->size/24),			\
1454     MPN_TOOM32_MUL_MINSIZE)
1455#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function)		\
1456  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1457    (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
1458     mpn_toom43_mul_itch (s->size, 17*s->size/24),			\
1459     MPN_TOOM43_MUL_MINSIZE)
1460
1461#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function)		\
1462  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1463    (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
1464     mpn_toom32_mul_itch (s->size, 19*s->size/30),			\
1465     MPN_TOOM32_MUL_MINSIZE)
1466#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function)		\
1467  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1468    (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
1469     mpn_toom53_mul_itch (s->size, 19*s->size/30),			\
1470     MPN_TOOM53_MUL_MINSIZE)
1471
1472#define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function)		\
1473  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1474    (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
1475     mpn_toom42_mul_itch (s->size, 11*s->size/20),			\
1476     MPN_TOOM42_MUL_MINSIZE)
1477#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function)		\
1478  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1479    (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
1480     mpn_toom53_mul_itch (s->size, 11*s->size/20),			\
1481     MPN_TOOM53_MUL_MINSIZE)
1482
1483#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL(function)		\
1484  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1485    (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),	\
1486     mpn_toom42_mul_itch (s->size, 5*s->size/6),			\
1487     MPN_TOOM54_MUL_MINSIZE)
1488#define SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL(function)		\
1489  SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1490    (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),	\
1491     mpn_toom54_mul_itch (s->size, 5*s->size/6),			\
1492     MPN_TOOM54_MUL_MINSIZE)
1493
1494
1495
1496#define SPEED_ROUTINE_MPN_SQR_CALL(call)				\
1497  {									\
1498    mp_ptr    wp;							\
1499    unsigned  i;							\
1500    double    t;							\
1501    TMP_DECL;								\
1502									\
1503    SPEED_RESTRICT_COND (s->size >= 1);					\
1504									\
1505    TMP_MARK;								\
1506    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1507									\
1508    speed_operand_src (s, s->xp, s->size);				\
1509    speed_operand_dst (s, wp, 2*s->size);				\
1510    speed_cache_fill (s);						\
1511									\
1512    speed_starttime ();							\
1513    i = s->reps;							\
1514    do									\
1515      call;								\
1516    while (--i != 0);							\
1517    t = speed_endtime ();						\
1518									\
1519    TMP_FREE;								\
1520    return t;								\
1521  }
1522
1523#define SPEED_ROUTINE_MPN_SQR(function)					\
1524  SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size))
1525
1526#define SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL(call)			\
1527  {									\
1528    mp_ptr    wp, tp;							\
1529    unsigned  i;							\
1530    double    t;							\
1531    TMP_DECL;								\
1532									\
1533    SPEED_RESTRICT_COND (s->size >= 2);					\
1534									\
1535    TMP_MARK;								\
1536    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_wp);		\
1537    SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);		\
1538									\
1539    speed_operand_src (s, s->xp, s->size);				\
1540    speed_operand_src (s, tp, 2 * s->size);				\
1541    speed_operand_dst (s, wp, 2 * s->size);				\
1542    speed_cache_fill (s);						\
1543									\
1544    speed_starttime ();							\
1545    i = s->reps;							\
1546    do									\
1547      call;								\
1548    while (--i != 0);							\
1549    t = speed_endtime () / 2;						\
1550									\
1551    TMP_FREE;								\
1552    return t;								\
1553  }
1554
1555#define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize)		\
1556  {									\
1557    mp_ptr    wp, tspace;						\
1558    unsigned  i;							\
1559    double    t;							\
1560    TMP_DECL;								\
1561									\
1562    SPEED_RESTRICT_COND (s->size >= minsize);				\
1563									\
1564    TMP_MARK;								\
1565    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1566    SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
1567									\
1568    speed_operand_src (s, s->xp, s->size);				\
1569    speed_operand_dst (s, wp, 2*s->size);				\
1570    speed_operand_dst (s, tspace, tsize);				\
1571    speed_cache_fill (s);						\
1572									\
1573    speed_starttime ();							\
1574    i = s->reps;							\
1575    do									\
1576      call;								\
1577    while (--i != 0);							\
1578    t = speed_endtime ();						\
1579									\
1580    TMP_FREE;								\
1581    return t;								\
1582  }
1583
1584#define SPEED_ROUTINE_MPN_TOOM2_SQR(function)				\
1585  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1586				mpn_toom2_sqr_itch (s->size),		\
1587				MPN_TOOM2_SQR_MINSIZE)
1588
1589#define SPEED_ROUTINE_MPN_TOOM3_SQR(function)				\
1590  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1591				mpn_toom3_sqr_itch (s->size),		\
1592				MPN_TOOM3_SQR_MINSIZE)
1593
1594
1595#define SPEED_ROUTINE_MPN_TOOM4_SQR(function)				\
1596  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1597				mpn_toom4_sqr_itch (s->size),		\
1598				MPN_TOOM4_SQR_MINSIZE)
1599
1600#define SPEED_ROUTINE_MPN_TOOM6_SQR(function)				\
1601  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1602				mpn_toom6_sqr_itch (s->size),		\
1603				MPN_TOOM6_SQR_MINSIZE)
1604
1605#define SPEED_ROUTINE_MPN_TOOM8_SQR(function)				\
1606  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1607				mpn_toom8_sqr_itch (s->size),		\
1608				MPN_TOOM8_SQR_MINSIZE)
1609
1610#define SPEED_ROUTINE_MPN_MOD_CALL(call)				\
1611  {									\
1612    unsigned   i;							\
1613									\
1614    SPEED_RESTRICT_COND (s->size >= 0);					\
1615									\
1616    speed_operand_src (s, s->xp, s->size);				\
1617    speed_cache_fill (s);						\
1618									\
1619    speed_starttime ();							\
1620    i = s->reps;							\
1621    do									\
1622      call;								\
1623    while (--i != 0);							\
1624									\
1625    return speed_endtime ();						\
1626  }
1627
1628#define SPEED_ROUTINE_MPN_MOD_1(function)				\
1629   SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))
1630
1631#define SPEED_ROUTINE_MPN_MOD_1C(function)				\
1632   SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0)))
1633
1634#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function)			\
1635  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r));
1636
1637#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function)			\
1638  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)));
1639
1640#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function)				\
1641   SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size))
1642
1643#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function)			\
1644  {									\
1645    unsigned   i;							\
1646    mp_limb_t  inv;							\
1647									\
1648    SPEED_RESTRICT_COND (s->size >= 0);					\
1649    SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT);			\
1650									\
1651    invert_limb (inv, s->r);						\
1652    speed_operand_src (s, s->xp, s->size);				\
1653    speed_cache_fill (s);						\
1654									\
1655    speed_starttime ();							\
1656    i = s->reps;							\
1657    do									\
1658      (*function) (s->xp, s->size, s->r, inv);				\
1659    while (--i != 0);							\
1660									\
1661    return speed_endtime ();						\
1662  }
1663
1664#define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc)			\
1665  {									\
1666    unsigned   i;							\
1667    mp_limb_t  inv[4];							\
1668									\
1669    SPEED_RESTRICT_COND (s->size >= 2);					\
1670									\
1671    mpn_mod_1_1p_cps (inv, s->r);					\
1672    speed_operand_src (s, s->xp, s->size);				\
1673    speed_cache_fill (s);						\
1674									\
1675    speed_starttime ();							\
1676    i = s->reps;							\
1677    do {								\
1678      pfunc (inv, s->r);						\
1679      function (s->xp, s->size, s->r << inv[1], inv);				\
1680    } while (--i != 0);							\
1681									\
1682    return speed_endtime ();						\
1683  }
1684#define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N)			\
1685  {									\
1686    unsigned   i;							\
1687    mp_limb_t  inv[N+3];						\
1688									\
1689    SPEED_RESTRICT_COND (s->size >= 1);					\
1690    SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / N);			\
1691									\
1692    speed_operand_src (s, s->xp, s->size);				\
1693    speed_cache_fill (s);						\
1694									\
1695    speed_starttime ();							\
1696    i = s->reps;							\
1697    do {								\
1698      pfunc (inv, s->r);						\
1699      function (s->xp, s->size, s->r, inv);				\
1700    } while (--i != 0);							\
1701									\
1702    return speed_endtime ();						\
1703  }
1704
1705
1706/* A division of 2*s->size by s->size limbs */
1707
1708#define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call)				\
1709  {									\
1710    unsigned  i;							\
1711    mp_ptr    a, d, q, r;						\
1712    double    t;							\
1713    gmp_pi1_t dinv;							\
1714    TMP_DECL;								\
1715									\
1716    SPEED_RESTRICT_COND (s->size >= 1);					\
1717									\
1718    TMP_MARK;								\
1719    SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp);			\
1720    SPEED_TMP_ALLOC_LIMBS (d, s->size,   s->align_yp);			\
1721    SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp);			\
1722    SPEED_TMP_ALLOC_LIMBS (r, s->size,   s->align_wp2);			\
1723									\
1724    MPN_COPY (a, s->xp, s->size);					\
1725    MPN_COPY (a+s->size, s->xp, s->size);				\
1726									\
1727    MPN_COPY (d, s->yp, s->size);					\
1728									\
1729    /* normalize the data */						\
1730    d[s->size-1] |= GMP_NUMB_HIGHBIT;					\
1731    a[2*s->size-1] = d[s->size-1] - 1;					\
1732									\
1733    invert_pi1 (dinv, d[s->size-1], d[s->size-2]);			\
1734									\
1735    speed_operand_src (s, a, 2*s->size);				\
1736    speed_operand_src (s, d, s->size);					\
1737    speed_operand_dst (s, q, s->size+1);				\
1738    speed_operand_dst (s, r, s->size);					\
1739    speed_cache_fill (s);						\
1740									\
1741    speed_starttime ();							\
1742    i = s->reps;							\
1743    do									\
1744      call;								\
1745    while (--i != 0);							\
1746    t = speed_endtime ();						\
1747									\
1748    TMP_FREE;								\
1749    return t;								\
1750  }
1751
1752
1753/* A remainder 2*s->size by s->size limbs */
1754
1755#define SPEED_ROUTINE_MPZ_MOD(function)					\
1756  {									\
1757    unsigned   i;							\
1758    mpz_t      a, d, r;							\
1759									\
1760    SPEED_RESTRICT_COND (s->size >= 1);					\
1761									\
1762    mpz_init_set_n (d, s->yp, s->size);					\
1763									\
1764    /* high part less than d, low part a duplicate copied in */		\
1765    mpz_init_set_n (a, s->xp, s->size);					\
1766    mpz_mod (a, a, d);							\
1767    mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size);			\
1768    MPN_COPY (PTR(a), s->xp, s->size);					\
1769									\
1770    mpz_init (r);							\
1771									\
1772    speed_operand_src (s, PTR(a), SIZ(a));				\
1773    speed_operand_src (s, PTR(d), SIZ(d));				\
1774    speed_cache_fill (s);						\
1775									\
1776    speed_starttime ();							\
1777    i = s->reps;							\
1778    do									\
1779      function (r, a, d);						\
1780    while (--i != 0);							\
1781    return speed_endtime ();						\
1782  }
1783
1784#define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN)		\
1785  {									\
1786    unsigned   i;							\
1787    mp_ptr     dp, tp, ap, qp;						\
1788    gmp_pi1_t  inv;							\
1789    double     t;							\
1790    mp_size_t size1;							\
1791    TMP_DECL;								\
1792									\
1793    size1 = (s->r == 0 ? 2 * s->size : s->r);				\
1794									\
1795    SPEED_RESTRICT_COND (s->size >= DMIN);				\
1796    SPEED_RESTRICT_COND (size1 - s->size >= QMIN);			\
1797									\
1798    TMP_MARK;								\
1799    SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp);			\
1800    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
1801    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
1802    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2);			\
1803									\
1804    /* we don't fill in dividend completely when size1 > s->size */	\
1805    MPN_COPY (ap,         s->xp, s->size);				\
1806    MPN_COPY (ap + size1 - s->size, s->xp, s->size);			\
1807									\
1808    MPN_COPY (dp,         s->yp, s->size);				\
1809									\
1810    /* normalize the data */						\
1811    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
1812    ap[size1 - 1] = dp[s->size - 1] - 1;				\
1813									\
1814    invert_pi1 (inv, dp[s->size-1], dp[s->size-2]);			\
1815									\
1816    speed_operand_src (s, ap, size1);					\
1817    speed_operand_dst (s, tp, size1);					\
1818    speed_operand_src (s, dp, s->size);					\
1819    speed_operand_dst (s, qp, size1 - s->size);				\
1820    speed_cache_fill (s);						\
1821									\
1822    speed_starttime ();							\
1823    i = s->reps;							\
1824    do {								\
1825      MPN_COPY (tp, ap, size1);						\
1826      function (qp, tp, size1, dp, s->size, INV);			\
1827    } while (--i != 0);							\
1828    t = speed_endtime ();						\
1829									\
1830    TMP_FREE;								\
1831    return t;								\
1832  }
1833#define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn)			\
1834  {									\
1835    unsigned   i;							\
1836    mp_ptr     dp, tp, qp, scratch;					\
1837    double     t;							\
1838    mp_size_t itch;							\
1839    TMP_DECL;								\
1840									\
1841    SPEED_RESTRICT_COND (s->size >= 2);					\
1842									\
1843    itch = itchfn (2 * s->size, s->size, 0);				\
1844    TMP_MARK;								\
1845    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
1846    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
1847    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);		\
1848    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
1849									\
1850    MPN_COPY (tp,         s->xp, s->size);				\
1851    MPN_COPY (tp+s->size, s->xp, s->size);				\
1852									\
1853    /* normalize the data */						\
1854    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
1855    tp[2*s->size-1] = dp[s->size-1] - 1;				\
1856									\
1857    speed_operand_dst (s, qp, s->size);					\
1858    speed_operand_src (s, tp, 2 * s->size);				\
1859    speed_operand_src (s, dp, s->size);					\
1860    speed_operand_dst (s, scratch, itch);				\
1861    speed_cache_fill (s);						\
1862									\
1863    speed_starttime ();							\
1864    i = s->reps;							\
1865    do {								\
1866      function (qp, tp, 2 * s->size, dp, s->size, scratch);		\
1867    } while (--i != 0);							\
1868    t = speed_endtime ();						\
1869									\
1870    TMP_FREE;								\
1871    return t;								\
1872  }
1873#define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn)			\
1874  {									\
1875    unsigned   i;							\
1876    mp_ptr     dp, tp, qp, rp, scratch;					\
1877    double     t;							\
1878    mp_size_t size1, itch;						\
1879    TMP_DECL;								\
1880									\
1881    size1 = (s->r == 0 ? 2 * s->size : s->r);				\
1882									\
1883    SPEED_RESTRICT_COND (s->size >= 2);					\
1884    SPEED_RESTRICT_COND (size1 >= s->size);				\
1885									\
1886    itch = itchfn (size1, s->size, 0);					\
1887    TMP_MARK;								\
1888    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
1889    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
1890    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp);			\
1891    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
1892    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
1893									\
1894    /* we don't fill in dividend completely when size1 > s->size */	\
1895    MPN_COPY (tp,         s->xp, s->size);				\
1896    MPN_COPY (tp + size1 - s->size, s->xp, s->size);			\
1897									\
1898    MPN_COPY (dp,         s->yp, s->size);				\
1899									\
1900    /* normalize the data */						\
1901    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
1902    tp[size1 - 1] = dp[s->size - 1] - 1;				\
1903									\
1904    speed_operand_dst (s, qp, size1 - s->size);				\
1905    speed_operand_dst (s, rp, s->size);					\
1906    speed_operand_src (s, tp, size1);					\
1907    speed_operand_src (s, dp, s->size);					\
1908    speed_operand_dst (s, scratch, itch);				\
1909    speed_cache_fill (s);						\
1910									\
1911    speed_starttime ();							\
1912    i = s->reps;							\
1913    do {								\
1914      function (qp, rp, tp, size1, dp, s->size, scratch);		\
1915    } while (--i != 0);							\
1916    t = speed_endtime ();						\
1917									\
1918    TMP_FREE;								\
1919    return t;								\
1920  }
1921#define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn)			\
1922  {									\
1923    unsigned   i;							\
1924    mp_ptr     dp, tp, qp, rp, ip, scratch, tmp;			\
1925    double     t;							\
1926    mp_size_t  size1, itch;						\
1927    TMP_DECL;								\
1928									\
1929    size1 = (s->r == 0 ? 2 * s->size : s->r);				\
1930									\
1931    SPEED_RESTRICT_COND (s->size >= 2);					\
1932    SPEED_RESTRICT_COND (size1 >= s->size);				\
1933									\
1934    itch = itchfn (size1, s->size, s->size);				\
1935    TMP_MARK;								\
1936    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
1937    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
1938    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp);			\
1939    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
1940    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
1941    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); /* alignment? */	\
1942									\
1943    /* we don't fill in dividend completely when size1 > s->size */	\
1944    MPN_COPY (tp,         s->xp, s->size);				\
1945    MPN_COPY (tp + size1 - s->size, s->xp, s->size);			\
1946									\
1947    MPN_COPY (dp,         s->yp, s->size);				\
1948									\
1949    /* normalize the data */						\
1950    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
1951    tp[size1 - 1] = dp[s->size-1] - 1;					\
1952									\
1953    tmp = TMP_ALLOC_LIMBS (mpn_invert_itch (s->size));			\
1954    mpn_invert (ip, dp, s->size, tmp);					\
1955									\
1956    speed_operand_dst (s, qp, size1 - s->size);				\
1957    speed_operand_dst (s, rp, s->size);					\
1958    speed_operand_src (s, tp, size1);					\
1959    speed_operand_src (s, dp, s->size);					\
1960    speed_operand_src (s, ip, s->size);					\
1961    speed_operand_dst (s, scratch, itch);				\
1962    speed_cache_fill (s);						\
1963									\
1964    speed_starttime ();							\
1965    i = s->reps;							\
1966    do {								\
1967      function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch);	\
1968    } while (--i != 0);							\
1969    t = speed_endtime ();						\
1970									\
1971    TMP_FREE;								\
1972    return t;								\
1973  }
1974
1975#define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function)				\
1976  {									\
1977    unsigned   i;							\
1978    mp_ptr     dp, tp, ap, qp;						\
1979    mp_limb_t  inv;							\
1980    double     t;							\
1981    TMP_DECL;								\
1982									\
1983    SPEED_RESTRICT_COND (s->size >= 1);					\
1984									\
1985    TMP_MARK;								\
1986    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp);			\
1987    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
1988    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
1989    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2);		\
1990									\
1991    MPN_COPY (ap,         s->xp, s->size);				\
1992    MPN_COPY (ap+s->size, s->xp, s->size);				\
1993									\
1994    /* divisor must be odd */						\
1995    MPN_COPY (dp, s->yp, s->size);					\
1996    dp[0] |= 1;								\
1997    binvert_limb (inv, dp[0]);						\
1998    inv = -inv;								\
1999									\
2000    speed_operand_src (s, ap, 2*s->size);				\
2001    speed_operand_dst (s, tp, 2*s->size);				\
2002    speed_operand_src (s, dp, s->size);					\
2003    speed_operand_dst (s, qp, s->size);					\
2004    speed_cache_fill (s);						\
2005									\
2006    speed_starttime ();							\
2007    i = s->reps;							\
2008    do {								\
2009      MPN_COPY (tp, ap, 2*s->size);					\
2010      function (qp, tp, 2*s->size, dp, s->size, inv);			\
2011    } while (--i != 0);							\
2012    t = speed_endtime ();						\
2013									\
2014    TMP_FREE;								\
2015    return t;								\
2016  }
2017#define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function)				\
2018  {									\
2019    unsigned   i;							\
2020    mp_ptr     dp, tp, qp;						\
2021    mp_limb_t  inv;							\
2022    double     t;							\
2023    TMP_DECL;								\
2024									\
2025    SPEED_RESTRICT_COND (s->size >= 1);					\
2026									\
2027    TMP_MARK;								\
2028    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
2029    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
2030    SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2);			\
2031									\
2032    /* divisor must be odd */						\
2033    MPN_COPY (dp, s->yp, s->size);					\
2034    dp[0] |= 1;								\
2035    binvert_limb (inv, dp[0]);						\
2036    inv = -inv;								\
2037									\
2038    speed_operand_src (s, s->xp, s->size);				\
2039    speed_operand_dst (s, tp, s->size);					\
2040    speed_operand_src (s, dp, s->size);					\
2041    speed_operand_dst (s, qp, s->size);					\
2042    speed_cache_fill (s);						\
2043									\
2044    speed_starttime ();							\
2045    i = s->reps;							\
2046    do {								\
2047      MPN_COPY (tp, s->xp, s->size);					\
2048      function (qp, tp, s->size, dp, s->size, inv);			\
2049    } while (--i != 0);							\
2050    t = speed_endtime ();						\
2051									\
2052    TMP_FREE;								\
2053    return t;								\
2054  }
2055#define SPEED_ROUTINE_MPN_PI1_BDIV_R(function)				\
2056  {									\
2057    unsigned   i;							\
2058    mp_ptr     dp, tp, ap;						\
2059    mp_limb_t  inv;							\
2060    double     t;							\
2061    TMP_DECL;								\
2062									\
2063    SPEED_RESTRICT_COND (s->size >= 1);					\
2064									\
2065    TMP_MARK;								\
2066    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp);			\
2067    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
2068    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2);		\
2069									\
2070    MPN_COPY (ap,         s->xp, s->size);				\
2071    MPN_COPY (ap+s->size, s->xp, s->size);				\
2072									\
2073    /* divisor must be odd */						\
2074    MPN_COPY (dp, s->yp, s->size);					\
2075    dp[0] |= 1;								\
2076    binvert_limb (inv, dp[0]);						\
2077    inv = -inv;								\
2078									\
2079    speed_operand_src (s, ap, 2*s->size);				\
2080    speed_operand_dst (s, tp, 2*s->size);				\
2081    speed_operand_src (s, dp, s->size);					\
2082    speed_cache_fill (s);						\
2083									\
2084    speed_starttime ();							\
2085    i = s->reps;							\
2086    do {								\
2087      MPN_COPY (tp, ap, 2*s->size);					\
2088      function (tp, 2*s->size, dp, s->size, inv);			\
2089    } while (--i != 0);							\
2090    t = speed_endtime ();						\
2091									\
2092    TMP_FREE;								\
2093    return t;								\
2094  }
2095#define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn)			\
2096  {									\
2097    unsigned   i;							\
2098    mp_ptr     dp, qp, scratch;						\
2099    double     t;							\
2100    mp_size_t itch;							\
2101    TMP_DECL;								\
2102									\
2103    SPEED_RESTRICT_COND (s->size >= 2);					\
2104									\
2105    itch = itchfn (s->size, s->size);					\
2106    TMP_MARK;								\
2107    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
2108    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
2109    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
2110									\
2111    /* divisor must be odd */						\
2112    MPN_COPY (dp, s->yp, s->size);					\
2113    dp[0] |= 1;								\
2114									\
2115    speed_operand_dst (s, qp, s->size);					\
2116    speed_operand_src (s, s->xp, s->size);				\
2117    speed_operand_src (s, dp, s->size);					\
2118    speed_operand_dst (s, scratch, itch);				\
2119    speed_cache_fill (s);						\
2120									\
2121    speed_starttime ();							\
2122    i = s->reps;							\
2123    do {								\
2124      function (qp, s->xp, s->size, dp, s->size, scratch);		\
2125    } while (--i != 0);							\
2126    t = speed_endtime ();						\
2127									\
2128    TMP_FREE;								\
2129    return t;								\
2130  }
2131#define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn)			\
2132  {									\
2133    unsigned   i;							\
2134    mp_ptr     dp, tp, qp, rp, scratch;					\
2135    double     t;							\
2136    mp_size_t itch;							\
2137    TMP_DECL;								\
2138									\
2139    SPEED_RESTRICT_COND (s->size >= 2);					\
2140									\
2141    itch = itchfn (2 * s->size, s->size);				\
2142    TMP_MARK;								\
2143    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
2144    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
2145    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);		\
2146    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
2147    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
2148									\
2149    MPN_COPY (tp,         s->xp, s->size);				\
2150    MPN_COPY (tp+s->size, s->xp, s->size);				\
2151									\
2152    /* divisor must be odd */						\
2153    MPN_COPY (dp, s->yp, s->size);					\
2154    dp[0] |= 1;								\
2155									\
2156    speed_operand_dst (s, qp, s->size);					\
2157    speed_operand_dst (s, rp, s->size);					\
2158    speed_operand_src (s, tp, 2 * s->size);				\
2159    speed_operand_src (s, dp, s->size);					\
2160    speed_operand_dst (s, scratch, itch);				\
2161    speed_cache_fill (s);						\
2162									\
2163    speed_starttime ();							\
2164    i = s->reps;							\
2165    do {								\
2166      function (qp, rp, tp, 2 * s->size, dp, s->size, scratch);		\
2167    } while (--i != 0);							\
2168    t = speed_endtime ();						\
2169									\
2170    TMP_FREE;								\
2171    return t;								\
2172  }
2173
2174#define SPEED_ROUTINE_MPN_BROOT(function)	\
2175  {						\
2176    SPEED_RESTRICT_COND (s->r & 1);		\
2177    s->xp[0] |= 1;				\
2178    SPEED_ROUTINE_MPN_UNARY_1_CALL		\
2179      ((*function) (wp, s->xp, s->size, s->r));	\
2180  }
2181
2182#define SPEED_ROUTINE_MPN_BROOTINV(function, itch)	\
2183  {							\
2184    mp_ptr    wp, tp;					\
2185    unsigned  i;					\
2186    double    t;					\
2187    TMP_DECL;						\
2188    TMP_MARK;						\
2189    SPEED_RESTRICT_COND (s->size >= 1);			\
2190    SPEED_RESTRICT_COND (s->r & 1);			\
2191    wp = TMP_ALLOC_LIMBS (s->size);			\
2192    tp = TMP_ALLOC_LIMBS ( (itch));			\
2193    s->xp[0] |= 1;					\
2194							\
2195    speed_operand_src (s, s->xp, s->size);		\
2196    speed_operand_dst (s, wp, s->size);			\
2197    speed_cache_fill (s);				\
2198							\
2199    speed_starttime ();					\
2200    i = s->reps;					\
2201    do							\
2202      (*function) (wp, s->xp, s->size, s->r, tp);	\
2203    while (--i != 0);					\
2204    t = speed_endtime ();				\
2205							\
2206    TMP_FREE;						\
2207    return t;						\
2208  }
2209
2210#define SPEED_ROUTINE_MPN_INVERT(function,itchfn)			\
2211  {									\
2212    long  i;								\
2213    mp_ptr    up, tp, ip;						\
2214    double    t;							\
2215    TMP_DECL;								\
2216									\
2217    SPEED_RESTRICT_COND (s->size >= 1);					\
2218									\
2219    TMP_MARK;								\
2220    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
2221    SPEED_TMP_ALLOC_LIMBS (up, s->size,   s->align_yp);			\
2222    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
2223									\
2224    MPN_COPY (up, s->xp, s->size);					\
2225									\
2226    /* normalize the data */						\
2227    up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
2228									\
2229    speed_operand_src (s, up, s->size);					\
2230    speed_operand_dst (s, tp, s->size);					\
2231    speed_operand_dst (s, ip, s->size);					\
2232    speed_cache_fill (s);						\
2233									\
2234    speed_starttime ();							\
2235    i = s->reps;							\
2236    do									\
2237      function (ip, up, s->size, tp);					\
2238    while (--i != 0);							\
2239    t = speed_endtime ();						\
2240									\
2241    TMP_FREE;								\
2242    return t;								\
2243  }
2244
2245#define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn)			\
2246  {									\
2247    long  i;								\
2248    mp_ptr    up, tp, ip;						\
2249    double    t;							\
2250    TMP_DECL;								\
2251									\
2252    SPEED_RESTRICT_COND (s->size >= 1);					\
2253									\
2254    TMP_MARK;								\
2255    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
2256    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
2257    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
2258									\
2259    MPN_COPY (up, s->xp, s->size);					\
2260									\
2261    /* normalize the data */						\
2262    up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
2263									\
2264    speed_operand_src (s, up, s->size);					\
2265    speed_operand_dst (s, tp, s->size);					\
2266    speed_operand_dst (s, ip, s->size);					\
2267    speed_cache_fill (s);						\
2268									\
2269    speed_starttime ();							\
2270    i = s->reps;							\
2271    do									\
2272      function (ip, up, s->size, tp);					\
2273    while (--i != 0);							\
2274    t = speed_endtime ();						\
2275									\
2276    TMP_FREE;								\
2277    return t;								\
2278  }
2279
2280#define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn)		\
2281  {									\
2282    long  i;								\
2283    mp_ptr    up, tp, ip;						\
2284    double    t;							\
2285    TMP_DECL;								\
2286									\
2287    SPEED_RESTRICT_COND (s->size >= 3);					\
2288									\
2289    TMP_MARK;								\
2290    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
2291    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
2292    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
2293									\
2294    MPN_COPY (up, s->xp, s->size);					\
2295									\
2296    /* normalize the data */						\
2297    up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
2298									\
2299    speed_operand_src (s, up, s->size);					\
2300    speed_operand_dst (s, tp, s->size);					\
2301    speed_operand_dst (s, ip, s->size);					\
2302    speed_cache_fill (s);						\
2303									\
2304    speed_starttime ();							\
2305    i = s->reps;							\
2306    do									\
2307      function (ip, up, s->size, tp);					\
2308    while (--i != 0);							\
2309    t = speed_endtime ();						\
2310									\
2311    TMP_FREE;								\
2312    return t;								\
2313  }
2314
2315#define SPEED_ROUTINE_MPN_BINVERT(function,itchfn)			\
2316  {									\
2317    long  i;								\
2318    mp_ptr    up, tp, ip;						\
2319    double    t;							\
2320    TMP_DECL;								\
2321									\
2322    SPEED_RESTRICT_COND (s->size >= 1);					\
2323									\
2324    TMP_MARK;								\
2325    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
2326    SPEED_TMP_ALLOC_LIMBS (up, s->size,   s->align_yp);			\
2327    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
2328									\
2329    MPN_COPY (up, s->xp, s->size);					\
2330									\
2331    /* normalize the data */						\
2332    up[0] |= 1;								\
2333									\
2334    speed_operand_src (s, up, s->size);					\
2335    speed_operand_dst (s, tp, s->size);					\
2336    speed_operand_dst (s, ip, s->size);					\
2337    speed_cache_fill (s);						\
2338									\
2339    speed_starttime ();							\
2340    i = s->reps;							\
2341    do									\
2342      function (ip, up, s->size, tp);					\
2343    while (--i != 0);							\
2344    t = speed_endtime ();						\
2345									\
2346    TMP_FREE;								\
2347    return t;								\
2348  }
2349
2350#define SPEED_ROUTINE_MPN_SEC_INVERT(function,itchfn)			\
2351  {									\
2352    long  i;								\
2353    mp_ptr    up, mp, tp, ip;						\
2354    double    t;							\
2355    TMP_DECL;								\
2356									\
2357    SPEED_RESTRICT_COND (s->size >= 1);					\
2358									\
2359    TMP_MARK;								\
2360    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
2361    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
2362    SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp);			\
2363    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
2364									\
2365    speed_operand_src (s, up, s->size);					\
2366    speed_operand_dst (s, tp, s->size);					\
2367    speed_operand_dst (s, ip, s->size);					\
2368    speed_cache_fill (s);						\
2369									\
2370    MPN_COPY (mp, s->yp, s->size);					\
2371    /* Must be odd */							\
2372    mp[0] |= 1;								\
2373    speed_starttime ();							\
2374    i = s->reps;							\
2375    do									\
2376      {									\
2377	MPN_COPY (up, s->xp, s->size);					\
2378	function (ip, up, mp, s->size, 2*s->size*GMP_NUMB_BITS, tp);	\
2379      }									\
2380    while (--i != 0);							\
2381    t = speed_endtime ();						\
2382									\
2383    TMP_FREE;								\
2384    return t;								\
2385  }
2386
2387#define SPEED_ROUTINE_REDC_1(function)					\
2388  {									\
2389    unsigned   i;							\
2390    mp_ptr     cp, mp, tp, ap;						\
2391    mp_limb_t  inv;							\
2392    double     t;							\
2393    TMP_DECL;								\
2394									\
2395    SPEED_RESTRICT_COND (s->size >= 1);					\
2396									\
2397    TMP_MARK;								\
2398    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
2399    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
2400    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
2401    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
2402									\
2403    MPN_COPY (ap,         s->xp, s->size);				\
2404    MPN_COPY (ap+s->size, s->xp, s->size);				\
2405									\
2406    /* modulus must be odd */						\
2407    MPN_COPY (mp, s->yp, s->size);					\
2408    mp[0] |= 1;								\
2409    binvert_limb (inv, mp[0]);						\
2410    inv = -inv;								\
2411									\
2412    speed_operand_src (s, ap, 2*s->size+1);				\
2413    speed_operand_dst (s, tp, 2*s->size+1);				\
2414    speed_operand_src (s, mp, s->size);					\
2415    speed_operand_dst (s, cp, s->size);					\
2416    speed_cache_fill (s);						\
2417									\
2418    speed_starttime ();							\
2419    i = s->reps;							\
2420    do {								\
2421      MPN_COPY (tp, ap, 2*s->size);					\
2422      function (cp, tp, mp, s->size, inv);				\
2423    } while (--i != 0);							\
2424    t = speed_endtime ();						\
2425									\
2426    TMP_FREE;								\
2427    return t;								\
2428  }
2429#define SPEED_ROUTINE_REDC_2(function)					\
2430  {									\
2431    unsigned   i;							\
2432    mp_ptr     cp, mp, tp, ap;						\
2433    mp_limb_t  invp[2];							\
2434    double     t;							\
2435    TMP_DECL;								\
2436									\
2437    SPEED_RESTRICT_COND (s->size >= 1);					\
2438									\
2439    TMP_MARK;								\
2440    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
2441    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
2442    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
2443    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
2444									\
2445    MPN_COPY (ap,         s->xp, s->size);				\
2446    MPN_COPY (ap+s->size, s->xp, s->size);				\
2447									\
2448    /* modulus must be odd */						\
2449    MPN_COPY (mp, s->yp, s->size);					\
2450    mp[0] |= 1;								\
2451    mpn_binvert (invp, mp, 2, tp);					\
2452    invp[0] = -invp[0]; invp[1] = ~invp[1];				\
2453									\
2454    speed_operand_src (s, ap, 2*s->size+1);				\
2455    speed_operand_dst (s, tp, 2*s->size+1);				\
2456    speed_operand_src (s, mp, s->size);					\
2457    speed_operand_dst (s, cp, s->size);					\
2458    speed_cache_fill (s);						\
2459									\
2460    speed_starttime ();							\
2461    i = s->reps;							\
2462    do {								\
2463      MPN_COPY (tp, ap, 2*s->size);					\
2464      function (cp, tp, mp, s->size, invp);				\
2465    } while (--i != 0);							\
2466    t = speed_endtime ();						\
2467									\
2468    TMP_FREE;								\
2469    return t;								\
2470  }
2471#define SPEED_ROUTINE_REDC_N(function)					\
2472  {									\
2473    unsigned   i;							\
2474    mp_ptr     cp, mp, tp, ap, invp;					\
2475    double     t;							\
2476    TMP_DECL;								\
2477									\
2478    SPEED_RESTRICT_COND (s->size > 8);					\
2479									\
2480    TMP_MARK;								\
2481    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
2482    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
2483    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
2484    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
2485    SPEED_TMP_ALLOC_LIMBS (invp, s->size,   s->align_wp2); /* align? */	\
2486									\
2487    MPN_COPY (ap,         s->xp, s->size);				\
2488    MPN_COPY (ap+s->size, s->xp, s->size);				\
2489									\
2490    /* modulus must be odd */						\
2491    MPN_COPY (mp, s->yp, s->size);					\
2492    mp[0] |= 1;								\
2493    mpn_binvert (invp, mp, s->size, tp);				\
2494									\
2495    speed_operand_src (s, ap, 2*s->size+1);				\
2496    speed_operand_dst (s, tp, 2*s->size+1);				\
2497    speed_operand_src (s, mp, s->size);					\
2498    speed_operand_dst (s, cp, s->size);					\
2499    speed_cache_fill (s);						\
2500									\
2501    speed_starttime ();							\
2502    i = s->reps;							\
2503    do {								\
2504      MPN_COPY (tp, ap, 2*s->size);					\
2505      function (cp, tp, mp, s->size, invp);				\
2506    } while (--i != 0);							\
2507    t = speed_endtime ();						\
2508									\
2509    TMP_FREE;								\
2510    return t;								\
2511  }
2512
2513
2514#define SPEED_ROUTINE_MPN_POPCOUNT(function)				\
2515  {									\
2516    unsigned i;								\
2517									\
2518    SPEED_RESTRICT_COND (s->size >= 1);					\
2519									\
2520    speed_operand_src (s, s->xp, s->size);				\
2521    speed_cache_fill (s);						\
2522									\
2523    speed_starttime ();							\
2524    i = s->reps;							\
2525    do									\
2526      function (s->xp, s->size);					\
2527    while (--i != 0);							\
2528									\
2529    return speed_endtime ();						\
2530  }
2531
2532#define SPEED_ROUTINE_MPN_HAMDIST(function)				\
2533  {									\
2534    unsigned i;								\
2535									\
2536    SPEED_RESTRICT_COND (s->size >= 1);					\
2537									\
2538    speed_operand_src (s, s->xp, s->size);				\
2539    speed_operand_src (s, s->yp, s->size);				\
2540    speed_cache_fill (s);						\
2541									\
2542    speed_starttime ();							\
2543    i = s->reps;							\
2544    do									\
2545      function (s->xp, s->yp, s->size);					\
2546    while (--i != 0);							\
2547									\
2548    return speed_endtime ();						\
2549  }
2550
2551
2552#define SPEED_ROUTINE_MPZ_UI(function)					\
2553  {									\
2554    mpz_t     z;							\
2555    unsigned  i;							\
2556    double    t;							\
2557									\
2558    SPEED_RESTRICT_COND (s->size >= 0);					\
2559									\
2560    mpz_init (z);							\
2561									\
2562    speed_starttime ();							\
2563    i = s->reps;							\
2564    do									\
2565      function (z, s->size);						\
2566    while (--i != 0);							\
2567    t = speed_endtime ();						\
2568									\
2569    mpz_clear (z);							\
2570    return t;								\
2571  }
2572
2573#define SPEED_ROUTINE_MPZ_FAC_UI(function)    SPEED_ROUTINE_MPZ_UI(function)
2574#define SPEED_ROUTINE_MPZ_FIB_UI(function)    SPEED_ROUTINE_MPZ_UI(function)
2575#define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) SPEED_ROUTINE_MPZ_UI(function)
2576
2577
2578#define SPEED_ROUTINE_MPZ_2_UI(function)				\
2579  {									\
2580    mpz_t     z, z2;							\
2581    unsigned  i;							\
2582    double    t;							\
2583									\
2584    SPEED_RESTRICT_COND (s->size >= 0);					\
2585									\
2586    mpz_init (z);							\
2587    mpz_init (z2);							\
2588									\
2589    speed_starttime ();							\
2590    i = s->reps;							\
2591    do									\
2592      function (z, z2, s->size);					\
2593    while (--i != 0);							\
2594    t = speed_endtime ();						\
2595									\
2596    mpz_clear (z);							\
2597    mpz_clear (z2);							\
2598    return t;								\
2599  }
2600
2601#define SPEED_ROUTINE_MPZ_FIB2_UI(function)    SPEED_ROUTINE_MPZ_2_UI(function)
2602#define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function)
2603
2604
2605#define SPEED_ROUTINE_MPN_FIB2_UI(function)				\
2606  {									\
2607    mp_ptr     fp, f1p;							\
2608    mp_size_t  alloc;							\
2609    unsigned   i;							\
2610    double     t;							\
2611    TMP_DECL;								\
2612									\
2613    SPEED_RESTRICT_COND (s->size >= 0);					\
2614									\
2615    TMP_MARK;								\
2616    alloc = MPN_FIB2_SIZE (s->size);					\
2617    SPEED_TMP_ALLOC_LIMBS (fp,	alloc, s->align_xp);			\
2618    SPEED_TMP_ALLOC_LIMBS (f1p, alloc, s->align_yp);			\
2619									\
2620    speed_starttime ();							\
2621    i = s->reps;							\
2622    do									\
2623      function (fp, f1p, s->size);					\
2624    while (--i != 0);							\
2625    t = speed_endtime ();						\
2626									\
2627    TMP_FREE;								\
2628    return t;								\
2629  }
2630
2631
2632
2633/* Calculate b^e mod m for random b and m of s->size limbs and random e of 6
2634   limbs.  m is forced to odd so that redc can be used.  e is limited in
2635   size so the calculation doesn't take too long. */
2636#define SPEED_ROUTINE_MPZ_POWM(function)				\
2637  {									\
2638    mpz_t     r, b, e, m;						\
2639    unsigned  i;							\
2640    double    t;							\
2641									\
2642    SPEED_RESTRICT_COND (s->size >= 1);					\
2643									\
2644    mpz_init (r);							\
2645    if (s->r < 2)							\
2646      mpz_init_set_n (b, s->xp, s->size);				\
2647    else								\
2648      mpz_init_set_ui (b, s->r);					\
2649    mpz_init_set_n (m, s->yp, s->size);					\
2650    mpz_setbit (m, 0);	/* force m to odd */				\
2651    mpz_init_set_n (e, s->xp_block, 6);					\
2652									\
2653    speed_starttime ();							\
2654    i = s->reps;							\
2655    do									\
2656      function (r, b, e, m);						\
2657    while (--i != 0);							\
2658    t = speed_endtime ();						\
2659									\
2660    mpz_clear (r);							\
2661    mpz_clear (b);							\
2662    mpz_clear (e);							\
2663    mpz_clear (m);							\
2664    return t;								\
2665  }
2666
2667/* (m-2)^0xAAAAAAAA mod m */
2668#define SPEED_ROUTINE_MPZ_POWM_UI(function)				\
2669  {									\
2670    mpz_t     r, b, m;							\
2671    unsigned  long  e;							\
2672    unsigned  i;							\
2673    double    t;							\
2674									\
2675    SPEED_RESTRICT_COND (s->size >= 1);					\
2676									\
2677    mpz_init (r);							\
2678									\
2679    /* force m to odd */						\
2680    mpz_init (m);							\
2681    mpz_set_n (m, s->xp, s->size);					\
2682    PTR(m)[0] |= 1;							\
2683									\
2684    e = (~ (unsigned long) 0) / 3;					\
2685    if (s->r != 0)							\
2686      e = s->r;								\
2687									\
2688    mpz_init_set (b, m);						\
2689    mpz_sub_ui (b, b, 2);						\
2690/* printf ("%X\n", mpz_get_ui(m)); */					\
2691    i = s->reps;							\
2692    speed_starttime ();							\
2693    do									\
2694      function (r, b, e, m);						\
2695    while (--i != 0);							\
2696    t = speed_endtime ();						\
2697									\
2698    mpz_clear (r);							\
2699    mpz_clear (b);							\
2700    mpz_clear (m);							\
2701    return t;								\
2702  }
2703
2704
2705#define SPEED_ROUTINE_MPN_ADDSUB_CALL(call)				\
2706  {									\
2707    mp_ptr    wp, wp2, xp, yp;						\
2708    unsigned  i;							\
2709    double    t;							\
2710    TMP_DECL;								\
2711									\
2712    SPEED_RESTRICT_COND (s->size >= 0);					\
2713									\
2714    TMP_MARK;								\
2715    SPEED_TMP_ALLOC_LIMBS (wp,	s->size, s->align_wp);			\
2716    SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2);			\
2717    xp = s->xp;								\
2718    yp = s->yp;								\
2719									\
2720    if (s->r == 0)	;						\
2721    else if (s->r == 1) { xp = wp;	      }				\
2722    else if (s->r == 2) {	    yp = wp2; }				\
2723    else if (s->r == 3) { xp = wp;  yp = wp2; }				\
2724    else if (s->r == 4) { xp = wp2; yp = wp;  }				\
2725    else {								\
2726      TMP_FREE;								\
2727      return -1.0;							\
2728    }									\
2729    if (xp != s->xp) MPN_COPY (xp, s->xp, s->size);			\
2730    if (yp != s->yp) MPN_COPY (yp, s->yp, s->size);			\
2731									\
2732    speed_operand_src (s, xp, s->size);					\
2733    speed_operand_src (s, yp, s->size);					\
2734    speed_operand_dst (s, wp, s->size);					\
2735    speed_operand_dst (s, wp2, s->size);				\
2736    speed_cache_fill (s);						\
2737									\
2738    speed_starttime ();							\
2739    i = s->reps;							\
2740    do									\
2741      call;								\
2742    while (--i != 0);							\
2743    t = speed_endtime ();						\
2744									\
2745    TMP_FREE;								\
2746    return t;								\
2747  }
2748
2749#define SPEED_ROUTINE_MPN_ADDSUB_N(function)				\
2750  SPEED_ROUTINE_MPN_ADDSUB_CALL						\
2751    (function (wp, wp2, xp, yp, s->size));
2752
2753#define SPEED_ROUTINE_MPN_ADDSUB_NC(function)				\
2754  SPEED_ROUTINE_MPN_ADDSUB_CALL						\
2755    (function (wp, wp2, xp, yp, s->size, 0));
2756
2757
2758/* Doing an Nx1 gcd with the given r. */
2759#define SPEED_ROUTINE_MPN_GCD_1N(function)				\
2760  {									\
2761    mp_ptr    xp;							\
2762    unsigned  i;							\
2763    double    t;							\
2764    TMP_DECL;								\
2765									\
2766    SPEED_RESTRICT_COND (s->size >= 1);					\
2767    SPEED_RESTRICT_COND (s->r != 0);					\
2768									\
2769    TMP_MARK;								\
2770    SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp);			\
2771    MPN_COPY (xp, s->xp, s->size);					\
2772    xp[0] |= refmpn_zero_p (xp, s->size);				\
2773									\
2774    speed_operand_src (s, s->xp, s->size);				\
2775    speed_cache_fill (s);						\
2776									\
2777    speed_starttime ();							\
2778    i = s->reps;							\
2779    do									\
2780      function (xp, s->size, s->r);					\
2781    while (--i != 0);							\
2782    t = speed_endtime ();						\
2783									\
2784    TMP_FREE;								\
2785    return t;								\
2786  }
2787
2788
2789/* SPEED_BLOCK_SIZE many one GCDs of s->size bits each. */
2790
2791#define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call)			\
2792  {									\
2793    unsigned  i, j;							\
2794    mp_ptr    px, py;							\
2795    mp_limb_t x_mask, y_mask;						\
2796    double    t;							\
2797    TMP_DECL;								\
2798									\
2799    SPEED_RESTRICT_COND (s->size >= 1);					\
2800    SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb);			\
2801									\
2802    TMP_MARK;								\
2803    SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp);		\
2804    SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp);		\
2805    MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE);			\
2806    MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE);			\
2807									\
2808    x_mask = MP_LIMB_T_LOWBITMASK (s->size);				\
2809    y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size);		\
2810    for (i = 0; i < SPEED_BLOCK_SIZE; i++)				\
2811      {									\
2812	px[i] &= x_mask; px[i] += (px[i] == 0);				\
2813	py[i] &= y_mask; py[i] += (py[i] == 0);				\
2814	setup;								\
2815      }									\
2816									\
2817    speed_operand_src (s, px, SPEED_BLOCK_SIZE);			\
2818    speed_operand_src (s, py, SPEED_BLOCK_SIZE);			\
2819    speed_cache_fill (s);						\
2820									\
2821    speed_starttime ();							\
2822    i = s->reps;							\
2823    do									\
2824      {									\
2825	j = SPEED_BLOCK_SIZE;						\
2826	do								\
2827	  {								\
2828	    call;							\
2829	  }								\
2830	while (--j != 0);						\
2831      }									\
2832    while (--i != 0);							\
2833    t = speed_endtime ();						\
2834									\
2835    TMP_FREE;								\
2836									\
2837    s->time_divisor = SPEED_BLOCK_SIZE;					\
2838    return t;								\
2839  }
2840
2841#define SPEED_ROUTINE_MPN_GCD_1(function)				\
2842  SPEED_ROUTINE_MPN_GCD_1_CALL( , function (&px[j-1], 1, py[j-1]))
2843
2844#define SPEED_ROUTINE_MPN_GCD_11(function)				\
2845  SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1),		\
2846			       function (px[j-1], py[j-1]))
2847
2848/* Multiply limbs by (B+1). Then we get a gcd exceeding one limb, so
2849   we can measure gcd_22 loop only, without gcd_11. */
2850#define SPEED_ROUTINE_MPN_GCD_22(function)				\
2851  SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1),		\
2852			       function (px[j-1], px[j-1], py[j-1], py[j-1]))
2853
2854#define SPEED_ROUTINE_MPN_JACBASE(function)				\
2855  SPEED_ROUTINE_MPN_GCD_1_CALL						\
2856    ({									\
2857       /* require x<y, y odd, y!=1 */					\
2858       px[i] %= py[i];							\
2859       px[i] |= 1;							\
2860       py[i] |= 1;							\
2861       if (py[i]==1) py[i]=3;						\
2862     },									\
2863     function (px[j-1], py[j-1], 0))
2864
2865#define SPEED_ROUTINE_MPN_HGCD2(function)				\
2866  {									\
2867    unsigned   i, j;							\
2868    struct hgcd_matrix1 m = {{{0,0},{0,0}}};				\
2869    double     t;							\
2870									\
2871    speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE);		\
2872    speed_operand_src (s, s->yp_block, SPEED_BLOCK_SIZE);		\
2873    speed_cache_fill (s);						\
2874									\
2875    speed_starttime ();							\
2876    i = s->reps;							\
2877    mp_limb_t chain = 0;						\
2878    do									\
2879      {									\
2880	for (j = 0; j < SPEED_BLOCK_SIZE; j+= 2)			\
2881	  {								\
2882	    /* randomized but successively dependent */			\
2883	    function (s->xp_block[j] | GMP_NUMB_HIGHBIT,		\
2884		      s->xp_block[j+1] + chain,				\
2885		      s->yp_block[j] | GMP_NUMB_HIGHBIT,		\
2886		      s->yp_block[j+1], &m);				\
2887	    chain += m.u[0][0];						\
2888	  }								\
2889      }									\
2890    while (--i != 0);							\
2891    t = speed_endtime ();						\
2892									\
2893    /* make sure the compiler won't optimize away chain */		\
2894    noop_1 (chain);							\
2895									\
2896    s->time_divisor = SPEED_BLOCK_SIZE / 2;				\
2897    return t;								\
2898  }
2899
2900#define SPEED_ROUTINE_MPN_HGCD_CALL(func, itchfunc)			\
2901  {									\
2902    mp_size_t hgcd_init_itch, hgcd_itch;				\
2903    mp_ptr ap, bp, wp, tmp1;						\
2904    struct hgcd_matrix hgcd;						\
2905    int res;								\
2906    unsigned i;								\
2907    double t;								\
2908    TMP_DECL;								\
2909									\
2910    if (s->size < 2)							\
2911      return -1;							\
2912									\
2913    TMP_MARK;								\
2914									\
2915    SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);		\
2916    SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);		\
2917									\
2918    s->xp[s->size - 1] |= 1;						\
2919    s->yp[s->size - 1] |= 1;						\
2920									\
2921    hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);		\
2922    hgcd_itch = itchfunc (s->size);					\
2923									\
2924    SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp);		\
2925    SPEED_TMP_ALLOC_LIMBS (wp, hgcd_itch, s->align_wp);			\
2926									\
2927    speed_operand_src (s, s->xp, s->size);				\
2928    speed_operand_src (s, s->yp, s->size);				\
2929    speed_operand_dst (s, ap, s->size + 1);				\
2930    speed_operand_dst (s, bp, s->size + 1);				\
2931    speed_operand_dst (s, wp, hgcd_itch);				\
2932    speed_operand_dst (s, tmp1, hgcd_init_itch);			\
2933    speed_cache_fill (s);						\
2934									\
2935    speed_starttime ();							\
2936    i = s->reps;							\
2937    do									\
2938      {									\
2939	MPN_COPY (ap, s->xp, s->size);					\
2940	MPN_COPY (bp, s->yp, s->size);					\
2941	mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);			\
2942	res = func (ap, bp, s->size, &hgcd, wp);			\
2943      }									\
2944    while (--i != 0);							\
2945    t = speed_endtime ();						\
2946    TMP_FREE;								\
2947    return t;								\
2948  }
2949
2950#define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc)		\
2951  {									\
2952    mp_size_t hgcd_init_itch, hgcd_step_itch;				\
2953    mp_ptr ap, bp, wp, tmp1;						\
2954    struct hgcd_matrix hgcd;						\
2955    mp_size_t p = s->size/2;						\
2956    int res;								\
2957    unsigned i;								\
2958    double t;								\
2959    TMP_DECL;								\
2960									\
2961    if (s->size < 2)							\
2962      return -1;							\
2963									\
2964    TMP_MARK;								\
2965									\
2966    SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);		\
2967    SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);		\
2968									\
2969    s->xp[s->size - 1] |= 1;						\
2970    s->yp[s->size - 1] |= 1;						\
2971									\
2972    hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);		\
2973    hgcd_step_itch = itchfunc (s->size, p);				\
2974									\
2975    SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp);		\
2976    SPEED_TMP_ALLOC_LIMBS (wp, hgcd_step_itch, s->align_wp);			\
2977									\
2978    speed_operand_src (s, s->xp, s->size);				\
2979    speed_operand_src (s, s->yp, s->size);				\
2980    speed_operand_dst (s, ap, s->size + 1);				\
2981    speed_operand_dst (s, bp, s->size + 1);				\
2982    speed_operand_dst (s, wp, hgcd_step_itch);				\
2983    speed_operand_dst (s, tmp1, hgcd_init_itch);			\
2984    speed_cache_fill (s);						\
2985									\
2986    speed_starttime ();							\
2987    i = s->reps;							\
2988    do									\
2989      {									\
2990	MPN_COPY (ap, s->xp, s->size);					\
2991	MPN_COPY (bp, s->yp, s->size);					\
2992	mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);			\
2993	res = func (&hgcd, ap, bp, s->size, p, wp);			\
2994      }									\
2995    while (--i != 0);							\
2996    t = speed_endtime ();						\
2997    TMP_FREE;								\
2998    return t;								\
2999  }
3000
3001/* Run some GCDs of s->size limbs each.  The number of different data values
3002   is decreased as s->size**2, since GCD is a quadratic algorithm.
3003   SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
3004   though, because the plain gcd is about twice as fast as gcdext.  */
3005
3006#define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call)			\
3007  {									\
3008    unsigned  i;							\
3009    mp_size_t j, pieces, psize;						\
3010    mp_ptr    wp, wp2, xtmp, ytmp, px, py;				\
3011    double    t;							\
3012    TMP_DECL;								\
3013									\
3014    SPEED_RESTRICT_COND (s->size >= 1);					\
3015									\
3016    TMP_MARK;								\
3017    SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);		\
3018    SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);		\
3019    SPEED_TMP_ALLOC_LIMBS (wp,   s->size+1, s->align_wp);		\
3020    SPEED_TMP_ALLOC_LIMBS (wp2,  s->size+1, s->align_wp2);		\
3021									\
3022    pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size;		\
3023    pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size);			\
3024    pieces = MAX (pieces, 1);						\
3025									\
3026    psize = pieces * s->size;						\
3027    px = TMP_ALLOC_LIMBS (psize);					\
3028    py = TMP_ALLOC_LIMBS (psize);					\
3029    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);		\
3030    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);		\
3031									\
3032    /* Requirements: x >= y, y must be odd, high limbs != 0.		\
3033       No need to ensure random numbers are really great.  */		\
3034    for (j = 0; j < pieces; j++)					\
3035      {									\
3036	mp_ptr	x = px + j * s->size;					\
3037	mp_ptr	y = py + j * s->size;					\
3038	if (x[s->size - 1] == 0) x[s->size - 1] = 1;			\
3039	if (y[s->size - 1] == 0) y[s->size - 1] = 1;			\
3040									\
3041	if (x[s->size - 1] < y[s->size - 1])				\
3042	  MP_LIMB_T_SWAP (x[s->size - 1], y[s->size - 1]);		\
3043	else if (x[s->size - 1] == y[s->size - 1])			\
3044	  {								\
3045	    x[s->size - 1] = 2;						\
3046	    y[s->size - 1] = 1;						\
3047	  }								\
3048	y[0] |= 1;							\
3049      }									\
3050									\
3051    speed_operand_src (s, px, psize);					\
3052    speed_operand_src (s, py, psize);					\
3053    speed_operand_dst (s, xtmp, s->size);				\
3054    speed_operand_dst (s, ytmp, s->size);				\
3055    speed_operand_dst (s, wp, s->size);					\
3056    speed_cache_fill (s);						\
3057									\
3058    speed_starttime ();							\
3059    i = s->reps;							\
3060    do									\
3061      {									\
3062	j = pieces;							\
3063	do								\
3064	  {								\
3065	    MPN_COPY (xtmp, px+(j - 1)*s->size, s->size);		\
3066	    MPN_COPY (ytmp, py+(j - 1)*s->size, s->size);		\
3067	    call;							\
3068	  }								\
3069	while (--j != 0);						\
3070      }									\
3071    while (--i != 0);							\
3072    t = speed_endtime ();						\
3073									\
3074    TMP_FREE;								\
3075									\
3076    s->time_divisor = pieces;						\
3077    return t;								\
3078  }
3079
3080#define SPEED_ROUTINE_MPN_GCD(function)	\
3081  SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size))
3082
3083#define SPEED_ROUTINE_MPN_GCDEXT(function)				\
3084  SPEED_ROUTINE_MPN_GCD_CALL						\
3085    (4, { mp_size_t  wp2size;						\
3086	  function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); })
3087
3088
3089#define SPEED_ROUTINE_MPN_GCDEXT_ONE(function)				\
3090  {									\
3091    unsigned  i;							\
3092    mp_size_t j, pieces, psize, wp2size;				\
3093    mp_ptr    wp, wp2, xtmp, ytmp, px, py;				\
3094    double    t;							\
3095    TMP_DECL;								\
3096									\
3097    SPEED_RESTRICT_COND (s->size >= 1);					\
3098									\
3099    TMP_MARK;								\
3100									\
3101    SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);		\
3102    SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);		\
3103    MPN_COPY (xtmp, s->xp, s->size);					\
3104    MPN_COPY (ytmp, s->yp, s->size);					\
3105									\
3106    SPEED_TMP_ALLOC_LIMBS (wp,	s->size+1, s->align_wp);		\
3107    SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2);		\
3108									\
3109    pieces = SPEED_BLOCK_SIZE / 3;					\
3110    psize = 3 * pieces;							\
3111    px = TMP_ALLOC_LIMBS (psize);					\
3112    py = TMP_ALLOC_LIMBS (psize);					\
3113    MPN_COPY (px, s->xp_block, psize);					\
3114    MPN_COPY (py, s->yp_block, psize);					\
3115									\
3116    /* x must have at least as many bits as y,				\
3117       high limbs must be non-zero */					\
3118    for (j = 0; j < pieces; j++)					\
3119      {									\
3120	mp_ptr	x = px+3*j;						\
3121	mp_ptr	y = py+3*j;						\
3122	x[2] += (x[2] == 0);						\
3123	y[2] += (y[2] == 0);						\
3124	if (x[2] < y[2])						\
3125	  MP_LIMB_T_SWAP (x[2], y[2]);					\
3126      }									\
3127									\
3128    speed_operand_src (s, px, psize);					\
3129    speed_operand_src (s, py, psize);					\
3130    speed_operand_dst (s, xtmp, s->size);				\
3131    speed_operand_dst (s, ytmp, s->size);				\
3132    speed_operand_dst (s, wp, s->size);					\
3133    speed_cache_fill (s);						\
3134									\
3135    speed_starttime ();							\
3136    i = s->reps;							\
3137    do									\
3138      {									\
3139	mp_ptr	x = px;							\
3140	mp_ptr	y = py;							\
3141	mp_ptr	xth = &xtmp[s->size-3];					\
3142	mp_ptr	yth = &ytmp[s->size-3];					\
3143	j = pieces;							\
3144	do								\
3145	  {								\
3146	    xth[0] = x[0], xth[1] = x[1], xth[2] = x[2];		\
3147	    yth[0] = y[0], yth[1] = y[1], yth[2] = y[2];		\
3148									\
3149	    ytmp[0] |= 1; /* y must be odd, */				\
3150									\
3151	    function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size);	\
3152									\
3153	    x += 3;							\
3154	    y += 3;							\
3155	  }								\
3156	while (--j != 0);						\
3157      }									\
3158    while (--i != 0);							\
3159    t = speed_endtime ();						\
3160									\
3161    TMP_FREE;								\
3162									\
3163    s->time_divisor = pieces;						\
3164    return t;								\
3165  }
3166
3167/* Calculate nextprime(n) for random n of s->size bits (not limbs). */
3168#define SPEED_ROUTINE_MPZ_NEXTPRIME(function)				\
3169  {									\
3170    unsigned  i, j;							\
3171    mpz_t     wp, n;							\
3172    double    t;							\
3173									\
3174    SPEED_RESTRICT_COND (s->size >= 10);				\
3175									\
3176    mpz_init (wp);							\
3177    mpz_init_set_n (n, s->xp, s->size);					\
3178    /* limit to s->size bits, as this function is very slow */		\
3179    mpz_tdiv_r_2exp (n, n, s->size);					\
3180    /* set high bits so operand and result are genaral s->size bits */	\
3181    mpz_setbit (n, s->size - 1);					\
3182    mpz_clrbit (n, s->size - 2);					\
3183									\
3184    speed_starttime ();							\
3185    i = s->reps;							\
3186    do									\
3187      {									\
3188        /* nextprime timing is variable, so average over many calls */	\
3189        j = SPEED_BLOCK_SIZE - 1;					\
3190        /* starts on random, after measures prime to next prime */	\
3191        function (wp, n);						\
3192        do								\
3193          {								\
3194            function (wp, wp);						\
3195          }								\
3196        while (--j != 0);						\
3197      }									\
3198    while (--i != 0);							\
3199    t = speed_endtime ();						\
3200									\
3201    mpz_clear (wp);							\
3202    mpz_clear (n);							\
3203									\
3204    s->time_divisor = SPEED_BLOCK_SIZE;					\
3205    return t;								\
3206  }
3207
3208#define SPEED_ROUTINE_MPZ_JACOBI(function)				\
3209  {									\
3210    mpz_t     a, b;							\
3211    unsigned  i;							\
3212    mp_size_t j, pieces, psize;						\
3213    mp_ptr    px, py;							\
3214    double    t;							\
3215    TMP_DECL;								\
3216									\
3217    TMP_MARK;								\
3218    pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1);			\
3219    pieces = MAX (pieces, 1);						\
3220    s->time_divisor = pieces;						\
3221									\
3222    psize = pieces * s->size;						\
3223    px = TMP_ALLOC_LIMBS (psize);					\
3224    py = TMP_ALLOC_LIMBS (psize);					\
3225    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);		\
3226    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);		\
3227									\
3228    for (j = 0; j < pieces; j++)					\
3229      {									\
3230	mp_ptr	x = px+j*s->size;					\
3231	mp_ptr	y = py+j*s->size;					\
3232									\
3233	/* y odd */							\
3234	y[0] |= 1;							\
3235									\
3236	/* high limbs non-zero */					\
3237	if (x[s->size-1] == 0) x[s->size-1] = 1;			\
3238	if (y[s->size-1] == 0) y[s->size-1] = 1;			\
3239      }									\
3240									\
3241    SIZ(a) = s->size;							\
3242    SIZ(b) = s->size;							\
3243									\
3244    speed_operand_src (s, px, psize);					\
3245    speed_operand_src (s, py, psize);					\
3246    speed_cache_fill (s);						\
3247									\
3248    speed_starttime ();							\
3249    i = s->reps;							\
3250    do									\
3251      {									\
3252	j = pieces;							\
3253	do								\
3254	  {								\
3255	    PTR(a) = px+(j-1)*s->size;					\
3256	    PTR(b) = py+(j-1)*s->size;					\
3257	    function (a, b);						\
3258	  }								\
3259	while (--j != 0);						\
3260      }									\
3261    while (--i != 0);							\
3262    t = speed_endtime ();						\
3263									\
3264    TMP_FREE;								\
3265    return t;								\
3266  }
3267
3268#define SPEED_ROUTINE_MPN_DIVREM_2(function)				\
3269  {									\
3270    mp_ptr    wp, xp;							\
3271    mp_limb_t yp[2];							\
3272    unsigned  i;							\
3273    double    t;							\
3274    TMP_DECL;								\
3275									\
3276    SPEED_RESTRICT_COND (s->size >= 2);					\
3277									\
3278    TMP_MARK;								\
3279    SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp);			\
3280    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
3281									\
3282    /* source is destroyed */						\
3283    MPN_COPY (xp, s->xp, s->size);					\
3284									\
3285    /* divisor must be normalized */					\
3286    MPN_COPY (yp, s->yp_block, 2);					\
3287    yp[1] |= GMP_NUMB_HIGHBIT;						\
3288									\
3289    speed_operand_src (s, xp, s->size);					\
3290    speed_operand_src (s, yp, 2);					\
3291    speed_operand_dst (s, wp, s->size);					\
3292    speed_cache_fill (s);						\
3293									\
3294    speed_starttime ();							\
3295    i = s->reps;							\
3296    do									\
3297      function (wp, 0, xp, s->size, yp);				\
3298    while (--i != 0);							\
3299    t = speed_endtime ();						\
3300									\
3301    TMP_FREE;								\
3302    return t;								\
3303  }
3304
3305#define SPEED_ROUTINE_MPN_DIV_QR_1(function)				\
3306  {									\
3307    mp_ptr    wp, xp;							\
3308    mp_limb_t d;							\
3309    mp_limb_t r;							\
3310    unsigned  i;							\
3311    double    t;							\
3312    TMP_DECL;								\
3313									\
3314    SPEED_RESTRICT_COND (s->size >= 1);					\
3315									\
3316    TMP_MARK;								\
3317    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
3318									\
3319    d = s->r;								\
3320    if (d == 0)								\
3321      d = 1;								\
3322    speed_operand_src (s, s->xp, s->size);				\
3323    speed_operand_dst (s, wp, s->size);					\
3324    speed_cache_fill (s);						\
3325									\
3326    speed_starttime ();							\
3327    i = s->reps;							\
3328    do									\
3329      r = function (wp, wp+s->size-1, s->xp, s->size, d);		\
3330    while (--i != 0);							\
3331    t = speed_endtime ();						\
3332									\
3333    TMP_FREE;								\
3334    return t;								\
3335  }
3336
3337#define SPEED_ROUTINE_MPN_DIV_QR_1N_PI1(function)			\
3338  {									\
3339    mp_ptr    wp, xp;							\
3340    mp_limb_t d, dinv;							\
3341    mp_limb_t r;							\
3342    unsigned  i;							\
3343    double    t;							\
3344    TMP_DECL;								\
3345									\
3346    SPEED_RESTRICT_COND (s->size >= 1);					\
3347									\
3348    TMP_MARK;								\
3349    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
3350									\
3351    d = s->r;								\
3352    /* divisor must be normalized */					\
3353    SPEED_RESTRICT_COND (d & GMP_NUMB_HIGHBIT);				\
3354    invert_limb (dinv, d);						\
3355    speed_operand_src (s, s->xp, s->size);				\
3356    speed_operand_dst (s, wp, s->size);					\
3357    speed_cache_fill (s);						\
3358									\
3359    speed_starttime ();							\
3360    i = s->reps;							\
3361    do									\
3362      r = function (wp, s->xp, s->size, 0, d, dinv);			\
3363    while (--i != 0);							\
3364    t = speed_endtime ();						\
3365									\
3366    TMP_FREE;								\
3367    return t;								\
3368  }
3369
3370#define SPEED_ROUTINE_MPN_DIV_QR_2(function, norm)			\
3371  {									\
3372    mp_ptr    wp, xp;							\
3373    mp_limb_t yp[2];							\
3374    mp_limb_t rp[2];							\
3375    unsigned  i;							\
3376    double    t;							\
3377    TMP_DECL;								\
3378									\
3379    SPEED_RESTRICT_COND (s->size >= 2);					\
3380									\
3381    TMP_MARK;								\
3382    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
3383									\
3384    /* divisor must be normalized */					\
3385    MPN_COPY (yp, s->yp_block, 2);					\
3386    if (norm)								\
3387      yp[1] |= GMP_NUMB_HIGHBIT;					\
3388    else								\
3389      {									\
3390	yp[1] &= ~GMP_NUMB_HIGHBIT;					\
3391	if (yp[1] == 0)							\
3392	  yp[1] = 1;							\
3393      }									\
3394    speed_operand_src (s, s->xp, s->size);				\
3395    speed_operand_src (s, yp, 2);					\
3396    speed_operand_dst (s, wp, s->size);					\
3397    speed_operand_dst (s, rp, 2);					\
3398    speed_cache_fill (s);						\
3399									\
3400    speed_starttime ();							\
3401    i = s->reps;							\
3402    do									\
3403      function (wp, rp, s->xp, s->size, yp);				\
3404    while (--i != 0);							\
3405    t = speed_endtime ();						\
3406									\
3407    TMP_FREE;								\
3408    return t;								\
3409  }
3410
3411#define SPEED_ROUTINE_MODLIMB_INVERT(function)				\
3412  {									\
3413    unsigned   i, j;							\
3414    mp_ptr     xp;							\
3415    mp_limb_t  n = 1;							\
3416    double     t;							\
3417									\
3418    xp = s->xp_block-1;							\
3419									\
3420    speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE);		\
3421    speed_cache_fill (s);						\
3422									\
3423    speed_starttime ();							\
3424    i = s->reps;							\
3425    do									\
3426      {									\
3427	j = SPEED_BLOCK_SIZE;						\
3428	do								\
3429	  {								\
3430	    /* randomized but successively dependent */			\
3431	    n += (xp[j] << 1);						\
3432									\
3433	    function (n, n);						\
3434	  }								\
3435	while (--j != 0);						\
3436      }									\
3437    while (--i != 0);							\
3438    t = speed_endtime ();						\
3439									\
3440    /* make sure the compiler won't optimize away n */			\
3441    noop_1 (n);								\
3442									\
3443    s->time_divisor = SPEED_BLOCK_SIZE;					\
3444    return t;								\
3445  }
3446
3447
3448#define SPEED_ROUTINE_MPN_SQRTROOT_CALL(call)				\
3449  {									\
3450    mp_ptr    wp, wp2;							\
3451    unsigned  i;							\
3452    double    t;							\
3453    TMP_DECL;								\
3454									\
3455    SPEED_RESTRICT_COND (s->size >= 1);					\
3456									\
3457    TMP_MARK;								\
3458    SPEED_TMP_ALLOC_LIMBS (wp,	s->size, s->align_wp);			\
3459    SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2);			\
3460									\
3461    speed_operand_src (s, s->xp, s->size);				\
3462    speed_operand_dst (s, wp, s->size);					\
3463    speed_operand_dst (s, wp2, s->size);				\
3464    speed_cache_fill (s);						\
3465									\
3466    speed_starttime ();							\
3467    i = s->reps;							\
3468    do									\
3469      call;								\
3470    while (--i != 0);							\
3471    t = speed_endtime ();						\
3472									\
3473    TMP_FREE;								\
3474    return t;								\
3475  }
3476
3477
3478/* Calculate worst case for perfect_power
3479   Worst case is multiple prime factors larger than trial div limit. */
3480#define SPEED_ROUTINE_MPN_PERFECT_POWER(function)		 	\
3481  {									\
3482    mpz_t     r;							\
3483    unsigned  i, power;							\
3484    double    t;							\
3485									\
3486    SPEED_RESTRICT_COND (s->size >= 10);				\
3487									\
3488    mpz_init (r);							\
3489    power = s->size * GMP_NUMB_BITS / 17;				\
3490    mpz_ui_pow_ui(r, (1 << 17) - 1, power - 1);				\
3491    mpz_mul_ui(r, r, (1 << 16) + 1);	/* larger than 1000th prime */	\
3492									\
3493    speed_starttime ();							\
3494    i = s->reps;							\
3495    do									\
3496      function (PTR(r), SIZ(r));					\
3497    while (--i != 0);							\
3498    t = speed_endtime ();						\
3499									\
3500    mpz_clear (r);							\
3501    return t;								\
3502  }
3503
3504/* Calculate worst case (larger prime) for perfect_square */
3505#define SPEED_ROUTINE_MPN_PERFECT_SQUARE(function)			\
3506  {									\
3507    mpz_t     r;							\
3508    unsigned  i;							\
3509    double    t;							\
3510									\
3511    SPEED_RESTRICT_COND (s->size >= 2);					\
3512    mpz_init_set_n (r, s->xp, s->size / 2);				\
3513    mpz_setbit (r, s->size * GMP_NUMB_BITS / 2 - 1);			\
3514    mpz_mul (r, r, r);							\
3515									\
3516    speed_starttime ();							\
3517    i = s->reps;							\
3518    do									\
3519      function (PTR(r), SIZ(r));					\
3520    while (--i != 0);							\
3521    t = speed_endtime ();						\
3522									\
3523    mpz_clear (r);							\
3524    return t;								\
3525  }
3526
3527
3528/* s->size controls the number of limbs in the input, s->r is the base, or
3529   decimal by default. */
3530#define SPEED_ROUTINE_MPN_GET_STR(function)				\
3531  {									\
3532    unsigned char *wp;							\
3533    mp_size_t wn;							\
3534    mp_ptr xp;								\
3535    int base;								\
3536    unsigned i;								\
3537    double t;								\
3538    TMP_DECL;								\
3539									\
3540    SPEED_RESTRICT_COND (s->size >= 1);					\
3541									\
3542    base = s->r == 0 ? 10 : s->r;					\
3543    SPEED_RESTRICT_COND (base >= 2 && base <= 256);			\
3544									\
3545    TMP_MARK;								\
3546    SPEED_TMP_ALLOC_LIMBS (xp, s->size + 1, s->align_xp);		\
3547									\
3548    MPN_SIZEINBASE (wn, s->xp, s->size, base);				\
3549    wp = (unsigned char *) TMP_ALLOC (wn);				\
3550									\
3551    /* use this during development to guard against overflowing wp */	\
3552    /*									\
3553    MPN_COPY (xp, s->xp, s->size);					\
3554    ASSERT_ALWAYS (mpn_get_str (wp, base, xp, s->size) <= wn);		\
3555    */									\
3556									\
3557    speed_operand_src (s, s->xp, s->size);				\
3558    speed_operand_dst (s, xp, s->size);					\
3559    speed_operand_dst (s, (mp_ptr) wp, wn/GMP_LIMB_BYTES);		\
3560    speed_cache_fill (s);						\
3561									\
3562    speed_starttime ();							\
3563    i = s->reps;							\
3564    do									\
3565      {									\
3566	MPN_COPY (xp, s->xp, s->size);					\
3567	function (wp, base, xp, s->size);				\
3568      }									\
3569    while (--i != 0);							\
3570    t = speed_endtime ();						\
3571									\
3572    TMP_FREE;								\
3573    return t;								\
3574  }
3575
3576/* s->size controls the number of digits in the input, s->r is the base, or
3577   decimal by default. */
3578#define SPEED_ROUTINE_MPN_SET_STR_CALL(call)				\
3579  {									\
3580    unsigned char *xp;							\
3581    mp_ptr     wp;							\
3582    mp_size_t  wn;							\
3583    unsigned   i;							\
3584    int        base;							\
3585    double     t;							\
3586    TMP_DECL;								\
3587									\
3588    SPEED_RESTRICT_COND (s->size >= 1);					\
3589									\
3590    base = s->r == 0 ? 10 : s->r;					\
3591    SPEED_RESTRICT_COND (base >= 2 && base <= 256);			\
3592									\
3593    TMP_MARK;								\
3594									\
3595    xp = (unsigned char *) TMP_ALLOC (s->size);				\
3596    for (i = 0; i < s->size; i++)					\
3597      xp[i] = s->xp[i] % base;						\
3598									\
3599    LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);			\
3600    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);			\
3601									\
3602    /* use this during development to check wn is big enough */		\
3603    /*									\
3604    ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn);		\
3605    */									\
3606									\
3607    speed_operand_src (s, (mp_ptr) xp, s->size/GMP_LIMB_BYTES);	\
3608    speed_operand_dst (s, wp, wn);					\
3609    speed_cache_fill (s);						\
3610									\
3611    speed_starttime ();							\
3612    i = s->reps;							\
3613    do									\
3614      call;								\
3615    while (--i != 0);							\
3616    t = speed_endtime ();						\
3617									\
3618    TMP_FREE;								\
3619    return t;								\
3620  }
3621
3622
3623/* Run an accel gcd find_a() function over various data values.  A set of
3624   values is used in case some run particularly fast or slow.  The size
3625   parameter is ignored, the amount of data tested is fixed.  */
3626
3627#define SPEED_ROUTINE_MPN_GCD_FINDA(function)				\
3628  {									\
3629    unsigned  i, j;							\
3630    mp_limb_t cp[SPEED_BLOCK_SIZE][2];					\
3631    double    t;							\
3632    TMP_DECL;								\
3633									\
3634    TMP_MARK;								\
3635									\
3636    /* low must be odd, high must be non-zero */			\
3637    for (i = 0; i < SPEED_BLOCK_SIZE; i++)				\
3638      {									\
3639	cp[i][0] = s->xp_block[i] | 1;					\
3640	cp[i][1] = s->yp_block[i] + (s->yp_block[i] == 0);		\
3641      }									\
3642									\
3643    speed_operand_src (s, &cp[0][0], 2*SPEED_BLOCK_SIZE);		\
3644    speed_cache_fill (s);						\
3645									\
3646    speed_starttime ();							\
3647    i = s->reps;							\
3648    do									\
3649      {									\
3650	j = SPEED_BLOCK_SIZE;						\
3651	do								\
3652	  {								\
3653	    function (cp[j-1]);						\
3654	  }								\
3655	while (--j != 0);						\
3656      }									\
3657    while (--i != 0);							\
3658    t = speed_endtime ();						\
3659									\
3660    TMP_FREE;								\
3661									\
3662    s->time_divisor = SPEED_BLOCK_SIZE;					\
3663    return t;								\
3664  }
3665
3666
3667/* "call" should do "count_foo_zeros(c,n)".
3668   Give leading=1 if foo is leading zeros, leading=0 for trailing.
3669   Give zero=1 if n=0 is allowed in the call, zero=0 if not.  */
3670
3671#define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero)			\
3672  {									\
3673    mp_ptr     xp;							\
3674    int        i, c;							\
3675    unsigned   j;							\
3676    mp_limb_t  n;							\
3677    double     t;							\
3678    TMP_DECL;								\
3679									\
3680    TMP_MARK;								\
3681    SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp);		\
3682									\
3683    if (! speed_routine_count_zeros_setup (s, xp, leading, zero))	\
3684      return -1.0;							\
3685    speed_operand_src (s, xp, SPEED_BLOCK_SIZE);			\
3686    speed_cache_fill (s);						\
3687									\
3688    c = 0;								\
3689    speed_starttime ();							\
3690    j = s->reps;							\
3691    do {								\
3692      for (i = 0; i < SPEED_BLOCK_SIZE; i++)				\
3693	{								\
3694	  n = xp[i];							\
3695	  n ^= c;							\
3696
3697#define SPEED_ROUTINE_COUNT_ZEROS_B()					\
3698	}								\
3699    } while (--j != 0);							\
3700    t = speed_endtime ();						\
3701									\
3702    /* don't let c go dead */						\
3703    noop_1 (c);								\
3704									\
3705    s->time_divisor = SPEED_BLOCK_SIZE;					\
3706									\
3707    TMP_FREE;								\
3708    return t;								\
3709  }									\
3710
3711#define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero)		\
3712  do {									\
3713    SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero);			\
3714    call;								\
3715    SPEED_ROUTINE_COUNT_ZEROS_B ();					\
3716  } while (0)								\
3717
3718#define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call,zero)			\
3719  SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero)
3720#define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun)				\
3721  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0)
3722
3723#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero)			\
3724  SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero)
3725#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(call)			\
3726  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0)
3727
3728
3729#define SPEED_ROUTINE_INVERT_LIMB_CALL(call)				\
3730  {									\
3731    unsigned   i, j;							\
3732    mp_limb_t  d, dinv=0;						\
3733    mp_ptr     xp = s->xp_block - 1;					\
3734									\
3735    s->time_divisor = SPEED_BLOCK_SIZE;					\
3736									\
3737    speed_starttime ();							\
3738    i = s->reps;							\
3739    do									\
3740      {									\
3741	j = SPEED_BLOCK_SIZE;						\
3742	do								\
3743	  {								\
3744	    d = dinv ^ xp[j];						\
3745	    d |= GMP_LIMB_HIGHBIT;					\
3746	    do { call; } while (0);					\
3747	  }								\
3748	while (--j != 0);						\
3749      }									\
3750    while (--i != 0);							\
3751									\
3752    /* don't let the compiler optimize everything away */		\
3753    noop_1 (dinv);							\
3754									\
3755    return speed_endtime();						\
3756  }
3757
3758
3759#define SPEED_ROUTINE_MPN_BACK_TO_BACK(function)			\
3760  {									\
3761    unsigned  i;							\
3762    speed_starttime ();							\
3763    i = s->reps;							\
3764    do									\
3765      function ();							\
3766    while (--i != 0);							\
3767    return speed_endtime ();						\
3768  }
3769
3770
3771#define SPEED_ROUTINE_MPN_ZERO_CALL(call)				\
3772  {									\
3773    mp_ptr    wp;							\
3774    unsigned  i;							\
3775    double    t;							\
3776    TMP_DECL;								\
3777									\
3778    SPEED_RESTRICT_COND (s->size >= 0);					\
3779									\
3780    TMP_MARK;								\
3781    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
3782    speed_operand_dst (s, wp, s->size);					\
3783    speed_cache_fill (s);						\
3784									\
3785    speed_starttime ();							\
3786    i = s->reps;							\
3787    do									\
3788      call;								\
3789    while (--i != 0);							\
3790    t = speed_endtime ();						\
3791									\
3792    TMP_FREE;								\
3793    return t;								\
3794  }
3795
3796#define SPEED_ROUTINE_MPN_ZERO(function)				\
3797  SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size))
3798
3799
3800#endif
3801