1/* Header for speed and threshold things.
2
3Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010 Free
4Software Foundation, Inc.
5
6This file is part of the GNU MP Library.
7
8The GNU MP Library is free software; you can redistribute it and/or modify
9it under the terms of the GNU Lesser General Public License as published by
10the Free Software Foundation; either version 3 of the License, or (at your
11option) any later version.
12
13The GNU MP Library is distributed in the hope that it will be useful, but
14WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16License for more details.
17
18You should have received a copy of the GNU Lesser General Public License
19along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
20
21#ifndef __SPEED_H__
22#define __SPEED_H__
23
24
/* Grow the limb vector {ptr,oldsize} to newsize limbs by storing zeros in
   the new high (most significant) limbs ptr[oldsize] .. ptr[newsize-1].
   The existing low limbs are left untouched.  newsize must not be smaller
   than oldsize.  Arguments may be evaluated more than once.  */
#define MPN_ZERO_EXTEND(ptr, oldsize, newsize)				\
  do {									\
    ASSERT ((oldsize) <= (newsize));					\
    MPN_ZERO ((ptr) + (oldsize), (newsize) - (oldsize));		\
  } while (0)
32
/* A limb with the n least significant bits set, for 0 <= n <= GMP_LIMB_BITS.
   A shift by the full limb width can't be relied on to give zero (1<<32
   doesn't on x86 family CPUs, for instance), so n == GMP_LIMB_BITS is
   handled as a separate case.  n is evaluated more than once.  */
#define MP_LIMB_T_LOWBITMASK(n)					\
  ((n) != GMP_LIMB_BITS						\
   ? ((mp_limb_t) 1 << (n)) - 1					\
   : MP_LIMB_T_MAX)
37
38
/* TMP_ALLOC variants returning an aligned block.  align-1 extra bytes are
   requested and the result is passed through align_pointer.  align must be
   a power of 2; CACHE_LINE_SIZE is usually a good choice.  */

#define TMP_ALLOC_ALIGNED(bytes, align)					\
  align_pointer (TMP_ALLOC ((bytes) + (align) - 1), (align))

/* The same, but sized in limbs and yielding an mp_ptr.  */
#define TMP_ALLOC_LIMBS_ALIGNED(limbs, align)				\
  ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs) * sizeof (mp_limb_t), align))
45
/* CACHE_LINE_SIZE is the default alignment for speed operands.  It also
   bounds the off-alignment that s->align_xp etc can request, since
   SPEED_TMP_ALLOC_LIMBS reduces the requested alignment modulo one cache
   line.  Maybe this should be an option of some sort, but in any case here
   are some line sizes,

       bytes
	 32   pentium
	 64   athlon
	 64   itanium-2 L1
	128   itanium-2 L2
*/
#define CACHE_LINE_SIZE   64 /* bytes */

/* Mask selecting a limb offset within one cache line; a proper bit mask
   only when CACHE_LINE_SIZE/BYTES_PER_MP_LIMB is a power of 2.  */
#define SPEED_TMP_ALLOC_ADJUST_MASK  (CACHE_LINE_SIZE / BYTES_PER_MP_LIMB - 1)
60
/* Set ptr to a TMP_ALLOC block of the given number of limbs, positioned so
   that its address, counted in limbs, is congruent to align modulo the
   number of limbs in a cache line.

   SPEED_TMP_ALLOC_ADJUST_MASK extra limbs are allocated so the start can be
   advanced by up to that many limbs.  The limb index of the allocated
   address is obtained by converting the pointer to an integer; subtracting
   a null pointer to get the same value would be undefined behaviour.

   Must be used where TMP_ALLOC_LIMBS is usable (ie. after TMP_MARK).  ptr,
   limbs and align are each evaluated once.  */
#define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align)			\
  do {									\
    mp_ptr     __ptr;							\
    mp_size_t  __ptr_align, __ptr_add;					\
									\
    ASSERT ((CACHE_LINE_SIZE % BYTES_PER_MP_LIMB) == 0);		\
    __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK);	\
    /* address of the raw block, measured in limbs */			\
    __ptr_align = (mp_size_t) ((size_t) __ptr / BYTES_PER_MP_LIMB);	\
    /* limbs to skip to reach the requested alignment */		\
    __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK;	\
    (ptr) = __ptr + __ptr_add;						\
  } while (0)
74
75
/* Size, in limbs, of each of s->xp_block and s->yp_block.  These blocks are
   for routines that want to run across many different data values and use
   s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.

   With 4-byte limbs 512 means 2kbytes of data for each of xp_block and
   yp_block, making 4k total (twice that with 8-byte limbs), which should
   fit easily in any L1 data cache. */

#define SPEED_BLOCK_SIZE   512 /* limbs */
84
85
/* Timing state and primitives shared by the measuring routines; all are
   defined elsewhere.  The SPEED_ROUTINE_* macros in this file bracket the
   measured loop with speed_starttime() and speed_endtime(), and return the
   double given by speed_endtime() as the measurement.

   NOTE(review): judging by the names, speed_unittime and speed_cycletime
   are the timer resolution and the duration of one CPU cycle, and
   speed_time_string describes the timing method in use -- confirm against
   the definitions.  */
extern double  speed_unittime;
extern double  speed_cycletime;
extern int     speed_precision;
extern char    speed_time_string[];
void speed_time_init __GMP_PROTO ((void));
void speed_cycletime_fail __GMP_PROTO ((const char *str));
void speed_cycletime_init __GMP_PROTO ((void));
void speed_cycletime_need_cycles __GMP_PROTO ((void));
void speed_cycletime_need_seconds __GMP_PROTO ((void));
void speed_starttime __GMP_PROTO ((void));
double speed_endtime __GMP_PROTO ((void));
97
98
/* Parameters for one measurement, passed to every speed_* routine as "s".
   The SPEED_ROUTINE_* macros read the operands and sizes from here, run the
   routine s->reps times, and register the operands they touch in src[] and
   dst[] via speed_operand_src and speed_operand_dst before calling
   speed_cache_fill.  */
struct speed_params {
  unsigned   reps;	/* how many times to run the routine */
  mp_ptr     xp;	/* first argument */
  mp_ptr     yp;	/* second argument */
  mp_size_t  size;	/* size of both arguments */
  mp_limb_t  r;		/* user supplied parameter */
  mp_size_t  align_xp;	/* alignment of xp */
  mp_size_t  align_yp;	/* alignment of yp */
  mp_size_t  align_wp;	/* intended alignment of wp */
  mp_size_t  align_wp2; /* intended alignment of wp2 */
  mp_ptr     xp_block;	/* first special SPEED_BLOCK_SIZE block */
  mp_ptr     yp_block;	/* second special SPEED_BLOCK_SIZE block */

  double     time_divisor; /* optionally set by the speed routine */

  /* used by the cache priming things */
  int	     cache;
  unsigned   src_num, dst_num;	/* presumably the entries used in src[], dst[] */
  struct {
    mp_ptr    ptr;
    mp_size_t size;
  } src[3], dst[3];		/* at most 3 source and 3 destination operands */
};
122
/* Type of a measuring routine: takes the parameters in s and returns a
   double.  The routines built from the SPEED_ROUTINE_* macros in this file
   return the value of speed_endtime(), or -1.0 when the parameters in s
   are not acceptable to the routine.  */
typedef double (*speed_function_t) __GMP_PROTO ((struct speed_params *s));

/* Measure fun using the parameters in s; defined elsewhere.  */
double speed_measure __GMP_PROTO ((speed_function_t fun, struct speed_params *s));
126
/* Prototypes for speed measuring routines.  Each matches speed_function_t;
   the definitions are elsewhere.  */

/* Miscellaneous: limb-level primitives, allocation and memcpy.  */
double speed_back_to_back __GMP_PROTO ((struct speed_params *s));
double speed_count_leading_zeros __GMP_PROTO ((struct speed_params *s));
double speed_count_trailing_zeros __GMP_PROTO ((struct speed_params *s));
double speed_find_a __GMP_PROTO ((struct speed_params *s));
double speed_gmp_allocate_free __GMP_PROTO ((struct speed_params *s));
double speed_gmp_allocate_reallocate_free __GMP_PROTO ((struct speed_params *s));
double speed_invert_limb __GMP_PROTO ((struct speed_params *s));
double speed_malloc_free __GMP_PROTO ((struct speed_params *s));
double speed_malloc_realloc_free __GMP_PROTO ((struct speed_params *s));
double speed_memcpy __GMP_PROTO ((struct speed_params *s));
double speed_binvert_limb __GMP_PROTO ((struct speed_params *s));
double speed_binvert_limb_mul1 __GMP_PROTO ((struct speed_params *s));
double speed_binvert_limb_loop __GMP_PROTO ((struct speed_params *s));
double speed_binvert_limb_cond __GMP_PROTO ((struct speed_params *s));
double speed_binvert_limb_arith __GMP_PROTO ((struct speed_params *s));

/* mpf layer.  */
double speed_mpf_init_clear __GMP_PROTO ((struct speed_params *s));
146
/* mpn layer measuring routines.  Each matches speed_function_t; the
   definitions are elsewhere.  The two mpn_mul_fft_full routines are only
   declared when WANT_OLD_FFT_FULL is non-zero.  */
double speed_mpn_add_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_addlsh1_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_addlsh2_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_add_n_sub_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_and_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_andn_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_addmul_1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_addmul_2 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_addmul_3 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_addmul_4 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_addmul_5 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_addmul_6 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_addmul_7 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_addmul_8 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_com __GMP_PROTO ((struct speed_params *s));
double speed_mpn_copyd __GMP_PROTO ((struct speed_params *s));
double speed_mpn_copyi __GMP_PROTO ((struct speed_params *s));
double speed_MPN_COPY __GMP_PROTO ((struct speed_params *s));
double speed_MPN_COPY_DECR __GMP_PROTO ((struct speed_params *s));
double speed_MPN_COPY_INCR __GMP_PROTO ((struct speed_params *s));
double speed_mpn_divexact_1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_divexact_by3 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_bdiv_q_1 __GMP_PROTO ((struct speed_params *));
double speed_mpn_pi1_bdiv_q_1 __GMP_PROTO ((struct speed_params *));
double speed_mpn_bdiv_dbm1c __GMP_PROTO ((struct speed_params *s));
double speed_mpn_divrem_1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_divrem_1f __GMP_PROTO ((struct speed_params *s));
double speed_mpn_divrem_1c __GMP_PROTO ((struct speed_params *s));
double speed_mpn_divrem_1cf __GMP_PROTO ((struct speed_params *s));
double speed_mpn_divrem_1_div __GMP_PROTO ((struct speed_params *s));
double speed_mpn_divrem_1f_div __GMP_PROTO ((struct speed_params *s));
double speed_mpn_divrem_1_inv __GMP_PROTO ((struct speed_params *s));
double speed_mpn_divrem_1f_inv __GMP_PROTO ((struct speed_params *s));
double speed_mpn_divrem_2 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_divrem_2_div __GMP_PROTO ((struct speed_params *s));
double speed_mpn_divrem_2_inv __GMP_PROTO ((struct speed_params *s));
double speed_mpn_fib2_ui __GMP_PROTO ((struct speed_params *s));
double speed_mpn_matrix22_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_hgcd __GMP_PROTO ((struct speed_params *s));
double speed_mpn_hgcd_lehmer __GMP_PROTO ((struct speed_params *s));
double speed_mpn_gcd __GMP_PROTO ((struct speed_params *s));
double speed_mpn_gcd_1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_gcd_1N __GMP_PROTO ((struct speed_params *s));
double speed_mpn_gcdext __GMP_PROTO ((struct speed_params *s));
double speed_mpn_gcdext_double __GMP_PROTO ((struct speed_params *s));
double speed_mpn_gcdext_one_double __GMP_PROTO ((struct speed_params *s));
double speed_mpn_gcdext_one_single __GMP_PROTO ((struct speed_params *s));
double speed_mpn_gcdext_single __GMP_PROTO ((struct speed_params *s));
double speed_mpn_get_str __GMP_PROTO ((struct speed_params *s));
double speed_mpn_hamdist __GMP_PROTO ((struct speed_params *s));
double speed_mpn_ior_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_iorn_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_jacobi_base __GMP_PROTO ((struct speed_params *s));
double speed_mpn_jacobi_base_1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_jacobi_base_2 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_jacobi_base_3 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_lshift __GMP_PROTO ((struct speed_params *s));
double speed_mpn_lshiftc __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mod_1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mod_1c __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mod_1_div __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mod_1_inv __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mod_1_1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mod_1_2 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mod_1_3 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mod_1_4 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mod_34lsub1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_modexact_1_odd __GMP_PROTO ((struct speed_params *s));
double speed_mpn_modexact_1c_odd __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mul_1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mul_1_inplace __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mul_2 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mul_3 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mul_4 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mul_basecase __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mul_fft __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mul_fft_sqr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_fft_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_fft_sqr __GMP_PROTO ((struct speed_params *s));
#if WANT_OLD_FFT_FULL
double speed_mpn_mul_fft_full __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mul_fft_full_sqr __GMP_PROTO ((struct speed_params *s));
#endif
double speed_mpn_nussbaumer_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_nussbaumer_mul_sqr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mul_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mul_n_sqr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mullo_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mullo_basecase __GMP_PROTO ((struct speed_params *s));
double speed_mpn_nand_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_nior_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_popcount __GMP_PROTO ((struct speed_params *s));
double speed_mpn_preinv_divrem_1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_preinv_divrem_1f __GMP_PROTO ((struct speed_params *s));
double speed_mpn_preinv_mod_1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sbpi1_div_qr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_dcpi1_div_qr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sbpi1_divappr_q __GMP_PROTO ((struct speed_params *s));
double speed_mpn_dcpi1_divappr_q __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mu_div_qr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mu_divappr_q __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mupi_div_qr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mu_div_q __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sbpi1_bdiv_qr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_dcpi1_bdiv_qr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sbpi1_bdiv_q __GMP_PROTO ((struct speed_params *s));
double speed_mpn_dcpi1_bdiv_q __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mu_bdiv_q __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mu_bdiv_qr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_invert __GMP_PROTO ((struct speed_params *s));
double speed_mpn_invertappr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_ni_invertappr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_binvert __GMP_PROTO ((struct speed_params *s));
double speed_mpn_redc_1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_redc_2 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_redc_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_rsblsh1_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_rsblsh2_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_rsh1add_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_rsh1sub_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_rshift __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sb_divrem_m3 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sb_divrem_m3_div __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sb_divrem_m3_inv __GMP_PROTO ((struct speed_params *s));
double speed_mpn_set_str __GMP_PROTO ((struct speed_params *s));
double speed_mpn_bc_set_str __GMP_PROTO ((struct speed_params *s));
double speed_mpn_dc_set_str __GMP_PROTO ((struct speed_params *s));
double speed_mpn_set_str_pre __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sqr_basecase __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sqr_diagonal __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sqr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sqrtrem __GMP_PROTO ((struct speed_params *s));
double speed_mpn_rootrem __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sub_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sublsh1_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sublsh2_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_submul_1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom2_sqr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom3_sqr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom4_sqr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom6_sqr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom8_sqr __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom22_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom33_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom44_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom6h_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom8h_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom32_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom42_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom43_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom63_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom32_for_toom43_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom43_for_toom32_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom32_for_toom53_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom53_for_toom32_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom42_for_toom53_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_toom53_for_toom42_mul __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mulmod_bnm1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_bc_mulmod_bnm1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_mulmod_bnm1_rounded __GMP_PROTO ((struct speed_params *s));
double speed_mpn_sqrmod_bnm1 __GMP_PROTO ((struct speed_params *s));
double speed_mpn_udiv_qrnnd __GMP_PROTO ((struct speed_params *s));
double speed_mpn_udiv_qrnnd_r __GMP_PROTO ((struct speed_params *s));
double speed_mpn_umul_ppmm __GMP_PROTO ((struct speed_params *s));
double speed_mpn_umul_ppmm_r __GMP_PROTO ((struct speed_params *s));
double speed_mpn_xnor_n __GMP_PROTO ((struct speed_params *s));
double speed_mpn_xor_n __GMP_PROTO ((struct speed_params *s));
double speed_MPN_ZERO __GMP_PROTO ((struct speed_params *s));
316
/* Measuring routines for the mpq and mpz layers, the random number
   functions, the no-op baselines, the C division operators and the
   longlong.h style limb primitives.  Each matches speed_function_t; the
   definitions are elsewhere.  */

/* mpq layer.  */
double speed_mpq_init_clear __GMP_PROTO ((struct speed_params *s));

/* mpz layer.  */
double speed_mpz_add __GMP_PROTO ((struct speed_params *s));
double speed_mpz_bin_uiui __GMP_PROTO ((struct speed_params *s));
double speed_mpz_fac_ui __GMP_PROTO ((struct speed_params *s));
double speed_mpz_fib_ui __GMP_PROTO ((struct speed_params *s));
double speed_mpz_fib2_ui __GMP_PROTO ((struct speed_params *s));
double speed_mpz_init_clear __GMP_PROTO ((struct speed_params *s));
double speed_mpz_init_realloc_clear __GMP_PROTO ((struct speed_params *s));
double speed_mpz_jacobi __GMP_PROTO ((struct speed_params *s));
double speed_mpz_lucnum_ui __GMP_PROTO ((struct speed_params *s));
double speed_mpz_lucnum2_ui __GMP_PROTO ((struct speed_params *s));
double speed_mpz_mod __GMP_PROTO ((struct speed_params *s));
double speed_mpz_powm __GMP_PROTO ((struct speed_params *s));
double speed_mpz_powm_mod __GMP_PROTO ((struct speed_params *s));
double speed_mpz_powm_redc __GMP_PROTO ((struct speed_params *s));
double speed_mpz_powm_ui __GMP_PROTO ((struct speed_params *s));
double speed_mpz_urandomb __GMP_PROTO ((struct speed_params *s));

/* Random state seeding.  */
double speed_gmp_randseed __GMP_PROTO ((struct speed_params *s));
double speed_gmp_randseed_ui __GMP_PROTO ((struct speed_params *s));

/* No-op routines; presumably these measure bare call overhead for the
   noop functions declared further down -- confirm.  */
double speed_noop __GMP_PROTO ((struct speed_params *s));
double speed_noop_wxs __GMP_PROTO ((struct speed_params *s));
double speed_noop_wxys __GMP_PROTO ((struct speed_params *s));

/* C "/" and "%" operators.  */
double speed_operator_div __GMP_PROTO ((struct speed_params *s));
double speed_operator_mod __GMP_PROTO ((struct speed_params *s));

/* Limb division and multiplication primitives.  */
double speed_udiv_qrnnd __GMP_PROTO ((struct speed_params *s));
double speed_udiv_qrnnd_preinv1 __GMP_PROTO ((struct speed_params *s));
double speed_udiv_qrnnd_preinv2 __GMP_PROTO ((struct speed_params *s));
double speed_udiv_qrnnd_c __GMP_PROTO ((struct speed_params *s));
double speed_umul_ppmm __GMP_PROTO ((struct speed_params *s));
351
/* Prototypes for other routines */

/* Read the CPU cycle counter into p: low 32-bits in p[0], high 32-bits in
   p[1].  On i386 gcc builds this function is hidden by a macro of the same
   name, defined just below.  */
void speed_cyclecounter __GMP_PROTO ((unsigned p[2]));

/* Same calling convention as speed_cyclecounter.  NOTE(review): presumably
   reads the PowerPC time base (mftb) -- confirm against the definition.  */
void mftb_function __GMP_PROTO ((unsigned p[2]));
358
/* Inline version of speed_cyclecounter for gcc on i386 (unless NO_ASM is
   defined).  Both variants execute cpuid and then rdtsc, storing eax into
   (p)[0] and edx into (p)[1], ie. low half first, as documented for the
   function prototype.  cpuid also writes ebx and ecx, which is why those
   registers appear as dummy outputs or are saved.

   In i386 gcc -fPIC, ebx is a fixed register and can't be declared a dummy
   output or a clobber for the cpuid, hence an explicit save and restore.  A
   clobber as such doesn't provoke an error unfortunately (gcc 3.0), so use
   the dummy output style in non-PIC, so there's an error if somehow -fPIC
   is used without a -DPIC to tell us about it.	 */
#if defined(__GNUC__) && ! defined (NO_ASM)	\
  && (defined (__i386__) || defined (__i486__))
#if defined (PIC) || defined (__APPLE_CC__)
/* PIC variant: ebx is copied to operand %1 (an early-clobber register or
   memory location) before cpuid and copied back before rdtsc.  */
#define speed_cyclecounter(p)						\
  do {									\
    int	 __speed_cyclecounter__save_ebx;				\
    int	 __speed_cyclecounter__dummy;					\
    __asm__ __volatile__ ("movl %%ebx, %1\n"				\
			  "cpuid\n"					\
			  "movl %1, %%ebx\n"				\
			  "rdtsc"					\
			  : "=a"   ((p)[0]),				\
			    "=&rm" (__speed_cyclecounter__save_ebx),	\
			    "=c"   (__speed_cyclecounter__dummy),	\
			    "=d"   ((p)[1]));				\
  } while (0)
#else
/* Non-PIC variant: ebx and ecx are declared as dummy outputs.  */
#define speed_cyclecounter(p)						\
  do {									\
    int	 __speed_cyclecounter__dummy1;					\
    int	 __speed_cyclecounter__dummy2;					\
    __asm__ __volatile__ ("cpuid\n"					\
			  "rdtsc"					\
			  : "=a" ((p)[0]),				\
			    "=b" (__speed_cyclecounter__dummy1),	\
			    "=c" (__speed_cyclecounter__dummy2),	\
			    "=d" ((p)[1]));				\
  } while (0)
#endif
#endif
394
/* Timing support.  speed_cyclecounter_diff takes two counter samples in the
   two-word format filled in by speed_cyclecounter; FREQ_MEASURE_ONE calls
   it as (end, start) and uses the result as the elapsed cycle count.
   clk_tck is used by FREQ_MEASURE_ONE as the number of clock ticks per
   second.  NOTE(review): the *_p functions presumably report whether the
   named timing method is usable -- confirm.  */
double speed_cyclecounter_diff __GMP_PROTO ((const unsigned [2], const unsigned [2]));
int gettimeofday_microseconds_p __GMP_PROTO ((void));
int getrusage_microseconds_p __GMP_PROTO ((void));
int cycles_works_p __GMP_PROTO ((void));
long clk_tck __GMP_PROTO ((void));
double freq_measure __GMP_PROTO ((const char *, double (*)(void)));

/* double_cmp_ptr has a comparison-function shape; qsort_function_t is the
   comparison function type taking const void pointers.  */
int double_cmp_ptr __GMP_PROTO ((const double *, const double *));
void pentium_wbinvd __GMP_PROTO ((void));
typedef int (*qsort_function_t) __GMP_PROTO ((const void *, const void *));

/* No-op functions with assorted argument lists, and the cache priming
   helpers.  The SPEED_ROUTINE_* macros call speed_operand_src and
   speed_operand_dst for each operand and then speed_cache_fill, before
   starting the timer.  */
void noop __GMP_PROTO ((void));
void noop_1 __GMP_PROTO ((mp_limb_t));
void noop_wxs __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
void noop_wxys __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
void mpn_cache_fill __GMP_PROTO ((mp_srcptr, mp_size_t));
void mpn_cache_fill_dummy __GMP_PROTO ((mp_limb_t));
void speed_cache_fill __GMP_PROTO ((struct speed_params *));
void speed_operand_src __GMP_PROTO ((struct speed_params *, mp_ptr, mp_size_t));
void speed_operand_dst __GMP_PROTO ((struct speed_params *, mp_ptr, mp_size_t));

/* Run-time options.  FREQ_MEASURE_ONE prints its diagnostics when
   speed_option_verbose is 2 or more.  */
extern int  speed_option_addrs;
extern int  speed_option_verbose;
void speed_option_set __GMP_PROTO((const char *));
419
/* Alternative implementations of library routines, declared here so they
   can be measured; the definitions are elsewhere.  Several of the names
   correspond to speed_* routines declared above (eg. mpn_divrem_1_div and
   speed_mpn_divrem_1_div).  */

/* mpn_divrem_1 and mpn_divrem_2 variants.  */
mp_limb_t mpn_divrem_1_div __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t));
mp_limb_t mpn_divrem_1_inv __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t));
mp_limb_t mpn_divrem_2_div __GMP_PROTO ((mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr));
mp_limb_t mpn_divrem_2_inv __GMP_PROTO ((mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr));

/* mpn_jacobi_base variants.  */
int mpn_jacobi_base_1 __GMP_PROTO ((mp_limb_t, mp_limb_t, int));
int mpn_jacobi_base_2 __GMP_PROTO ((mp_limb_t, mp_limb_t, int));
int mpn_jacobi_base_3 __GMP_PROTO ((mp_limb_t, mp_limb_t, int));

/* mpn_mod_1 variants.  */
mp_limb_t mpn_mod_1_div __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t));
mp_limb_t mpn_mod_1_inv __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t));

/* mpn_gcd and mpn_gcdext variants.  */
mp_size_t mpn_gcd_binary
  __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
mp_size_t mpn_gcd_accel
  __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
mp_size_t mpn_gcdext_one_double
  __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
mp_size_t mpn_gcdext_one_single
  __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
mp_size_t mpn_gcdext_single
  __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
mp_size_t mpn_gcdext_double
  __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));

/* mpn_sb_divrem_mn variants.  */
mp_limb_t mpn_sb_divrem_mn_div __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t));
mp_limb_t mpn_sb_divrem_mn_inv __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t));

/* mpn_set_str variants.  */
mp_size_t mpn_set_str_basecase __GMP_PROTO ((mp_ptr, const unsigned char *, size_t, int));
void mpn_pre_set_str __GMP_PROTO ((mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr));

/* mpz_powm variants.  */
void mpz_powm_mod __GMP_PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr));
void mpz_powm_redc __GMP_PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr));

/* Setup helper; presumably shared by the count_leading_zeros and
   count_trailing_zeros measuring routines -- confirm.  */
int speed_routine_count_zeros_setup
  __GMP_PROTO ((struct speed_params *, mp_ptr, int, int));
456
457
/* Body of a function returning the measured length of one cycle in seconds,
   found by comparing a clock, read with "get", against the cycle counter,
   read with "getc".  "type" is the type of a clock sample, and "sec" and
   "usec" extract the seconds and microseconds from one.  "name" is a
   string used only in the verbose diagnostic.  The expansion ends with a
   "return", so it must make up (the tail of) a function returning double.

   "get" is called repeatedly until it ticks over, just in case on a fast
   processor it takes less than a microsecond, though this is probably
   unlikely if it's a system call.

   The cycle counter is sampled on the same side of the "get" for the start
   and end measurements.  It doesn't matter how long it takes from the "get"
   sample to the cycles sample, since that period will cancel out in the
   difference calculation (assuming it's the same each time).

   Letting the test run for more than a process time slice is probably only
   going to reduce accuracy, especially for getrusage when the cycle counter
   is real time, or for gettimeofday if the cycle counter is in fact process
   time.  Half a clock tick, as given by clk_tck(), is used as a reasonable
   stop.

   It'd be desirable to be quite accurate here.  The default speed_precision
   for a cycle counter is 10000 cycles, so to mix that with getrusage or
   gettimeofday the frequency should be at least that accurate.  But running
   measurements for 10000 microseconds (or more) is too long.  Be satisfied
   with just a half clock tick (5000 microseconds usually).  */

#define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec)		\
  do {									\
    type      start_prev, start, end_prev, end;				\
    unsigned  start_cyc[2], end_cyc[2];					\
    long      elapsed_us;						\
    double    cycles, sec_per_cycle;					\
    long      half_tick = (1000000L / clk_tck()) / 2;			\
									\
    /* wait for the clock to tick over, then take the start sample */	\
    get (start_prev);							\
    do {								\
      get (start);							\
    } while (! (usec(start) != usec(start_prev)				\
		|| sec(start) != sec(start_prev)));			\
									\
    getc (start_cyc);							\
									\
    /* keep taking end samples until half a clock tick has passed */	\
    do {								\
      get (end_prev);							\
      do {								\
	get (end);							\
      } while (! (usec(end) != usec(end_prev)				\
		  || sec(end) != sec(end_prev)));			\
									\
      getc (end_cyc);							\
									\
      cycles = speed_cyclecounter_diff (end_cyc, start_cyc);		\
									\
      /* allow secs to cancel before multiplying */			\
      elapsed_us = sec(end) - sec(start);				\
      elapsed_us = elapsed_us * 1000000L + (usec(end) - usec(start));	\
    } while (elapsed_us < half_tick);					\
									\
    sec_per_cycle = elapsed_us * 1e-6 / cycles;				\
									\
    if (speed_option_verbose >= 2)					\
      printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n",	\
	      name, cycles, elapsed_us, sec_per_cycle);			\
									\
    return sec_per_cycle;						\
									\
  } while (0)
522
523
524
525
526/* The measuring routines use these big macros to save duplication for
527   similar forms.  They also get used for some automatically generated
528   measuring of new implementations of functions.
529
530   Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
531   function pointer is considered undesirable since it's not the way a
532   normal application will be calling, and some processors might do
533   different things with an indirect call, like not branch predicting, or
534   doing a full pipe flush.  At least some of the "functions" measured are
535   actually macros too.
536
537   The net effect is to bloat the object code, possibly in a big way, but
538   only what's being measured is being run, so that doesn't matter.
539
540   The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or
541   ATTRIBUTE_CONST on the called functions.  Adding a cast to a non-pure
542   function pointer doesn't work in gcc 3.2.  Using an actual non-pure
543   function pointer variable works, but stands a real risk of a
544   non-optimizing compiler generating unnecessary overheads in the call.
545   Currently the best idea is not to use those attributes for a timing
546   program build.  __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and
547   gmp-impl.h to omit them from routines there.  */
548
/* Give up on a measurement, by returning -1.0 from the enclosing speed
   routine, unless cond holds.  The expansion already ends in a semicolon,
   and being a bare "if" it should only be used as a statement of its own,
   not as the body of another "if".  */
#define SPEED_RESTRICT_COND(cond)		\
  if (!(cond))					\
    return -1.0;
550
/* For mpn_copy or similar: function (dst, src, size) with sizes in limbs.
   Expands to the body of a speed routine whose parameter is called s.
   s->xp is the source, and the destination is a temporary block of s->size
   limbs aligned per s->align_wp.  Returns the time for s->reps calls, or
   -1.0 if s->size is negative.  */
#define SPEED_ROUTINE_MPN_COPY(function)				\
  {									\
    mp_ptr    wp;							\
    unsigned  reps_left;						\
    double    elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 0);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    /* register the operands, then prime the cache */			\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	function (wp, s->xp, s->size);					\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
578
/* As SPEED_ROUTINE_MPN_COPY, but for a function taking a fourth argument,
   which is always passed as 0: function (dst, src, size, 0).  Returns the
   time for s->reps calls, or -1.0 if s->size is negative.  */
#define SPEED_ROUTINE_MPN_COPYC(function)				\
  {									\
    mp_ptr    wp;							\
    unsigned  reps_left;						\
    double    elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 0);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    /* register the operands, then prime the cache */			\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	function (wp, s->xp, s->size, 0);				\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
605
/* As SPEED_ROUTINE_MPN_COPY, but for a function (dst, src, bytes) whose
   size argument is a byte count.  s->size is still in limbs, and it's
   limbs which are allocated and copied; the count is converted to bytes
   only in the call.  Returns the time for s->reps calls, or -1.0 if
   s->size is negative.  */
#define SPEED_ROUTINE_MPN_COPY_BYTES(function)				\
  {									\
    mp_ptr    wp;							\
    unsigned  reps_left;						\
    double    elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 0);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    /* register the operands, then prime the cache */			\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	function (wp, s->xp, s->size * BYTES_PER_MP_LIMB);		\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
634
635
/* For mpn_add_n, mpn_sub_n, or similar.  "call" is the complete call to be
   timed, and may refer to the destination wp, the sources xp and yp, and
   s->size.  Expands to the body of a speed routine with parameter s.

   s->r selects the operand overlap:
       0   wp, s->xp and s->yp all separate
       1   xp == wp
       2   yp == wp
       3   xp == yp == wp
       4   yp == xp
   Any other s->r, or an s->size below 1, gives a return of -1.0.  */
#define SPEED_ROUTINE_MPN_BINARY_N_CALL(call)				\
  {									\
    mp_ptr     wp;							\
    mp_ptr     xp, yp;							\
    unsigned   i;							\
    double     t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    xp = s->xp;								\
    yp = s->yp;								\
									\
    switch (s->r)							\
      {									\
      case 0:				break;				\
      case 1:	xp = wp;		break;				\
      case 2:	yp = wp;		break;				\
      case 3:	xp = wp; yp = wp;	break;				\
      case 4:	yp = xp;		break;				\
      default:								\
	TMP_FREE;							\
	return -1.0;							\
      }									\
									\
    /* a source aliased to wp needs defined data, take it from s->xp */ \
    if (xp == wp || yp == wp)						\
      MPN_COPY (wp, s->xp, s->size);					\
									\
    speed_operand_src (s, xp, s->size);					\
    speed_operand_src (s, yp, s->size);					\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	call;								\
      }									\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
682
/* For a routine with two s->size limb destinations, ap and sp, and two
   s->size limb sources, xp and yp (going by the macro name, a combined add
   and subtract -- confirm against the callers).

   "call" is an expression that may use ap, sp, xp, yp and s->size.  It is
   evaluated s->reps times between speed_starttime and speed_endtime.

   Bits of s->r select the operand overlap:
     1  xp is ap
     2  yp is ap
     4  xp is sp
     8  yp is sp
   Having bits 1 and 2 both set, or bits 4 and 8 both set, makes the routine
   return -1.0.  A destination that is also a source is first initialized
   with a copy of s->xp.  Requires s->size >= 1.  */
#define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call)				\
  {									\
    mp_ptr     ap, sp;							\
    mp_ptr     xp, yp;							\
    unsigned   i;							\
    double     t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp);			\
    SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp);			\
									\
    xp = s->xp;								\
    yp = s->yp;								\
									\
    if ((s->r & 1) != 0) { xp = ap; }					\
    if ((s->r & 2) != 0) { yp = ap; }					\
    if ((s->r & 4) != 0) { xp = sp; }					\
    if ((s->r & 8) != 0) { yp = sp; }					\
    if ((s->r & 3) == 3  ||  (s->r & 12) == 12)				\
      {									\
	TMP_FREE;							\
	return -1.0;							\
      }									\
									\
    /* initialize ap if operand overlap */				\
    if (xp == ap || yp == ap)						\
      MPN_COPY (ap, s->xp, s->size);					\
    /* initialize sp if operand overlap */				\
    if (xp == sp || yp == sp)						\
      MPN_COPY (sp, s->xp, s->size);					\
									\
    speed_operand_src (s, xp, s->size);					\
    speed_operand_src (s, yp, s->size);					\
    speed_operand_dst (s, ap, s->size);					\
    speed_operand_dst (s, sp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      call;								\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
734
/* Times function (wp, xp, yp, size) through
   SPEED_ROUTINE_MPN_BINARY_N_CALL, which supplies wp, xp and yp.  */
#define SPEED_ROUTINE_MPN_BINARY_N(function)				\
  SPEED_ROUTINE_MPN_BINARY_N_CALL					\
    ((*function) (wp, xp, yp, s->size))

/* As SPEED_ROUTINE_MPN_BINARY_N, with a constant 0 passed as the extra
   fifth argument.  */
#define SPEED_ROUTINE_MPN_BINARY_NC(function)				\
  SPEED_ROUTINE_MPN_BINARY_N_CALL					\
    ((*function) (wp, xp, yp, s->size, 0))
740
741
/* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar.

   "call" is an expression that may use wp, an s->size limb destination
   allocated here; s->xp is registered as the s->size limb source.  It is
   evaluated s->reps times between speed_starttime and speed_endtime.
   Requires s->size >= 1.  */
#define SPEED_ROUTINE_MPN_UNARY_1_CALL(call)				\
  {									\
    mp_ptr    wp;							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      call;								\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
769
/* Times function (wp, s->xp, s->size, s->r) through
   SPEED_ROUTINE_MPN_UNARY_1_CALL.  */
#define SPEED_ROUTINE_MPN_UNARY_1(function)				\
  SPEED_ROUTINE_MPN_UNARY_1_CALL					\
    ((*function) (wp, s->xp, s->size, s->r))

/* As SPEED_ROUTINE_MPN_UNARY_1, with a constant 0 passed as the extra
   fifth argument.  */
#define SPEED_ROUTINE_MPN_UNARY_1C(function)				\
  SPEED_ROUTINE_MPN_UNARY_1_CALL					\
    ((*function) (wp, s->xp, s->size, s->r, 0))
775
/* In-place form: function (wp, wp, s->size, s->r), with the same wp used as
   source and destination.  wp is seeded with a copy of s->xp before the
   timed region, so the routine never reads uninitialized limbs.  Apart from
   that copy the body matches SPEED_ROUTINE_MPN_UNARY_1_CALL.  Requires
   s->size >= 1.  */
#define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function)			\
  {									\
    mp_ptr    wp;							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    /* start the in-place operand off from xp */			\
    MPN_COPY (wp, s->xp, s->size);					\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      (*function) (wp, wp, s->size, s->r);				\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
779
/* Both of these time function (wp, s->xp, s->size, s->r) through
   SPEED_ROUTINE_MPN_UNARY_1_CALL.  */
#define SPEED_ROUTINE_MPN_DIVEXACT_1(function)				\
  SPEED_ROUTINE_MPN_UNARY_1_CALL					\
    ((*function) (wp, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_BDIV_Q_1(function)				\
  SPEED_ROUTINE_MPN_UNARY_1_CALL					\
    ((*function) (wp, s->xp, s->size, s->r))
785
/* Wraps SPEED_ROUTINE_MPN_UNARY_1_CALL with a precomputed inverse for the
   divisor s->r.  shift is the count of trailing zero bits of s->r and dinv
   is binvert_limb of s->r with those bits shifted out; both are computed
   before the timed region and may be used by "call" together with wp.
   Requires s->size > 0 and s->r != 0.  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call)			\
  {									\
    unsigned   shift;							\
    mp_limb_t  dinv;							\
									\
    SPEED_RESTRICT_COND (s->size > 0);					\
    SPEED_RESTRICT_COND (s->r != 0);					\
									\
    count_trailing_zeros (shift, s->r);					\
    binvert_limb (dinv, s->r >> shift);					\
									\
    SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
  }
/* Times function (wp, s->xp, s->size, s->r, dinv, shift).  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function)			\
  SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL					\
  ((*function) (wp, s->xp, s->size, s->r, dinv, shift))
802
/* Times function (wp, s->xp, s->size, s->r, 0) through
   SPEED_ROUTINE_MPN_UNARY_1_CALL.  */
#define SPEED_ROUTINE_MPN_BDIV_DBM1C(function)				\
  SPEED_ROUTINE_MPN_UNARY_1_CALL					\
    ((*function) (wp, s->xp, s->size, s->r, 0))

/* Division of the s->size limbs at s->xp by s->r, with 0 passed as the
   second argument.  The "C" form passes a further trailing 0.  */
#define SPEED_ROUTINE_MPN_DIVREM_1(function)				\
  SPEED_ROUTINE_MPN_UNARY_1_CALL					\
    ((*function) (wp, 0, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_DIVREM_1C(function)				\
  SPEED_ROUTINE_MPN_UNARY_1_CALL					\
    ((*function) (wp, 0, s->xp, s->size, s->r, 0))

/* The "F" forms swap the roles: s->size is passed as the second argument
   and 0 as the length that goes with s->xp.  */
#define SPEED_ROUTINE_MPN_DIVREM_1F(function)				\
  SPEED_ROUTINE_MPN_UNARY_1_CALL					\
    ((*function) (wp, s->size, s->xp, 0, s->r))

#define SPEED_ROUTINE_MPN_DIVREM_1CF(function)				\
  SPEED_ROUTINE_MPN_UNARY_1_CALL					\
    ((*function) (wp, s->size, s->xp, 0, s->r, 0))
817
818
/* Wraps SPEED_ROUTINE_MPN_UNARY_1_CALL with a precomputed inverse for the
   divisor s->r.  shift is the count of leading zero bits of s->r and dinv
   is invert_limb of s->r shifted left by that amount; both are computed
   before the timed region and may be used by "call" together with wp.
   Requires s->r != 0; the nested template further requires s->size >= 1.

   The closing brace deliberately has no line continuation, so the macro
   ends there and does not depend on a blank line following it.  */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call)			\
  {									\
    unsigned   shift;							\
    mp_limb_t  dinv;							\
									\
    SPEED_RESTRICT_COND (s->size >= 0);					\
    SPEED_RESTRICT_COND (s->r != 0);					\
									\
    count_leading_zeros (shift, s->r);					\
    invert_limb (dinv, s->r << shift);					\
									\
    SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
  }
832
/* s->size limbs of integer part, no fraction part */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function)			\
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL (				\
    (*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))

/* s->size limbs worth of fraction part */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function)			\
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL (				\
    (*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))
841
842
843/* s->r is duplicated to form the multiplier, defaulting to
844   MP_BASES_BIG_BASE_10.  Not sure if that's particularly useful, but at
845   least it provides some control.  */
#define SPEED_ROUTINE_MPN_UNARY_N(function,N)				\
  {									\
    mp_limb_t  mult[N];							\
    mp_ptr     prod;							\
    mp_size_t  prod_size;						\
    unsigned   k, rep;							\
    double     elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= N);					\
									\
    TMP_MARK;								\
    prod_size = s->size + N-1;						\
    SPEED_TMP_ALLOC_LIMBS (prod, prod_size, s->align_wp);		\
									\
    /* every multiplier limb is s->r, or the default when s->r is 0 */	\
    for (k = 0; k < N; k++)						\
      mult[k] = (s->r != 0 ? s->r : MP_BASES_BIG_BASE_10);		\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_src (s, mult, (mp_size_t) N);				\
    speed_operand_dst (s, prod, prod_size);				\
    speed_cache_fill (s);						\
									\
    /* timed region */							\
    speed_starttime ();							\
    rep = s->reps;							\
    do									\
      function (prod, s->xp, s->size, mult);				\
    while (--rep != 0);							\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
878
/* Fixed multiplier lengths for SPEED_ROUTINE_MPN_UNARY_N, 2 to 8 limbs.  */
#define SPEED_ROUTINE_MPN_UNARY_2(function)				\
  SPEED_ROUTINE_MPN_UNARY_N (function, 2)

#define SPEED_ROUTINE_MPN_UNARY_3(function)				\
  SPEED_ROUTINE_MPN_UNARY_N (function, 3)

#define SPEED_ROUTINE_MPN_UNARY_4(function)				\
  SPEED_ROUTINE_MPN_UNARY_N (function, 4)

#define SPEED_ROUTINE_MPN_UNARY_5(function)				\
  SPEED_ROUTINE_MPN_UNARY_N (function, 5)

#define SPEED_ROUTINE_MPN_UNARY_6(function)				\
  SPEED_ROUTINE_MPN_UNARY_N (function, 6)

#define SPEED_ROUTINE_MPN_UNARY_7(function)				\
  SPEED_ROUTINE_MPN_UNARY_N (function, 7)

#define SPEED_ROUTINE_MPN_UNARY_8(function)				\
  SPEED_ROUTINE_MPN_UNARY_N (function, 8)
893
894
895/* For mpn_mul, mpn_mul_basecase, xsize=r, ysize=s->size. */
/* Times function (wp, xp, size1, s->yp, s->size), where size1 is s->r, or
   s->size when s->r is 0.  Requires s->size >= 1 and size1 >= s->size.

   xp is a size1 limb operand allocated here.  s->xp is only used as
   s->size limbs, so xp is filled with as many repeated copies of s->xp as
   are needed; this makes the multiplicand defined data rather than whatever
   the allocator returned.  */
#define SPEED_ROUTINE_MPN_MUL(function)					\
  {									\
    mp_ptr    wp, xp;							\
    mp_size_t size1, filled, chunk;					\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    size1 = (s->r == 0 ? s->size : s->r);				\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
    SPEED_RESTRICT_COND (size1 >= s->size);				\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp);		\
    SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
									\
    /* tile s->xp over xp, the last piece may be short */		\
    for (filled = 0; filled < size1; filled += chunk)			\
      {									\
	chunk = size1 - filled;						\
	if (chunk > s->size)						\
	  chunk = s->size;						\
	MPN_COPY (xp + filled, s->xp, chunk);				\
      }									\
									\
    speed_operand_src (s, xp, size1);					\
    speed_operand_src (s, s->yp, s->size);				\
    speed_operand_dst (s, wp, size1 + s->size);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      function (wp, xp, size1, s->yp, s->size);				\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
928
929
/* For an s->size by s->size limb multiply with a 2*s->size limb result.

   "call" is an expression that may use wp, the 2*s->size limb destination
   allocated here; s->xp and s->yp are registered as the s->size limb
   sources.  It is evaluated s->reps times between speed_starttime and
   speed_endtime.  Requires s->size >= 1.  */
#define SPEED_ROUTINE_MPN_MUL_N_CALL(call)				\
  {									\
    mp_ptr    wp;							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_src (s, s->yp, s->size);				\
    speed_operand_dst (s, wp, 2*s->size);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      call;								\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
957
/* Times function (wp, s->xp, s->yp, s->size) through
   SPEED_ROUTINE_MPN_MUL_N_CALL.  No trailing semicolon in the expansion,
   the same as the other wrappers; the caller supplies its own.  */
#define SPEED_ROUTINE_MPN_MUL_N(function)				\
  SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size))
960
/* For an s->size by s->size limb multiply keeping an s->size limb result.

   "call" is an expression that may use wp, the s->size limb destination
   allocated here; s->xp and s->yp are registered as the s->size limb
   sources.  It is evaluated s->reps times between speed_starttime and
   speed_endtime.  Requires s->size >= 1.  */
#define SPEED_ROUTINE_MPN_MULLO_N_CALL(call)				\
  {									\
    mp_ptr    wp;							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_src (s, s->yp, s->size);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      call;								\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
988
/* Times function (wp, s->xp, s->yp, s->size) through
   SPEED_ROUTINE_MPN_MULLO_N_CALL.  No trailing semicolon in the expansion,
   the same as the other wrappers; the caller supplies its own.  */
#define SPEED_ROUTINE_MPN_MULLO_N(function)				\
  SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size))
991
/* For mpn_mullo_basecase or similar: both operands and the result are
   s->size limbs, and s->r is not used.  The body is exactly what
   SPEED_ROUTINE_MPN_MULLO_N_CALL provides, so it is reused here.  */
#define SPEED_ROUTINE_MPN_MULLO_BASECASE(function)			\
  SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size))
1020
/* For mpn_mulmod_bnm1 style routines on s->size limb operands.

   "call" is an expression that may use wp, a 2*s->size limb destination,
   and tp, a scratch area of mpn_mulmod_bnm1_itch (s->size, s->size,
   s->size) limbs, both allocated here.  It is evaluated s->reps times
   between speed_starttime and speed_endtime.  Requires s->size >= 1.  */
#define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call)			\
  {									\
    mp_ptr    wp, tp;							\
    unsigned  i;							\
    double    t;							\
    mp_size_t itch;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size);		\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);		\
    SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);			\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_src (s, s->yp, s->size);				\
    speed_operand_dst (s, wp, 2 * s->size);				\
    speed_operand_dst (s, tp, itch);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      call;								\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
/* As SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL, but the modulus size is s->size
   rounded by mpn_mulmod_bnm1_next_size, and the result and scratch areas
   are sized from that rounded value.  The operands stay s->size limbs.  */
#define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function)			\
  {									\
    mp_size_t rn, scratch_limbs;					\
    mp_ptr    rp, scratch;						\
    unsigned  rep;							\
    double    elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    rn = mpn_mulmod_bnm1_next_size (s->size);				\
    scratch_limbs = mpn_mulmod_bnm1_itch (rn, rn, rn);			\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (rp, rn, s->align_wp);			\
    SPEED_TMP_ALLOC_LIMBS (scratch, scratch_limbs, s->align_wp2);	\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_src (s, s->yp, s->size);				\
    speed_operand_dst (s, rp, rn);					\
    speed_operand_dst (s, scratch, scratch_limbs);			\
    speed_cache_fill (s);						\
									\
    /* timed region */							\
    speed_starttime ();							\
    rep = s->reps;							\
    do									\
      function (rp, rn, s->xp, s->size, s->yp, s->size, scratch);	\
    while (--rep != 0);							\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
1086
/* For multiply routines needing caller-supplied scratch space.

   "call" is an expression that may use wp, a 2*s->size limb destination,
   and tspace, a scratch area of tsize limbs, both allocated here.  It is
   evaluated s->reps times between speed_starttime and speed_endtime.

   tsize is expanded twice (for the allocation and for speed_operand_dst),
   so it should be free of side effects.  Requires s->size >= minsize.  */
#define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize)		\
  {									\
    mp_ptr    wp, tspace;						\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= minsize);				\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
    SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_src (s, s->yp, s->size);				\
    speed_operand_dst (s, wp, 2*s->size);				\
    speed_operand_dst (s, tspace, tsize);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      call;								\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
1116
/* Balanced Toom multiplies: both operands are s->size limbs.  Each wrapper
   gives SPEED_ROUTINE_MPN_MUL_N_TSPACE the call, the scratch size from the
   matching itch function, and the matching minimum size.  */
#define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function)			\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (					\
    function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
    mpn_toom22_mul_itch (s->size, s->size),				\
    MPN_TOOM22_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function)			\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (					\
    function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
    mpn_toom33_mul_itch (s->size, s->size),				\
    MPN_TOOM33_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function)			\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (					\
    function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
    mpn_toom44_mul_itch (s->size, s->size),				\
    MPN_TOOM44_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function)			\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (					\
    function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
    mpn_toom6h_mul_itch (s->size, s->size),				\
    MPN_TOOM6H_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function)			\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (					\
    function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
    mpn_toom8h_mul_itch (s->size, s->size),				\
    MPN_TOOM8H_MUL_MINSIZE)
1146
/* Unbalanced Toom multiplies: the first operand is s->size limbs and the
   second is a fixed fraction of that (2/3, 1/2, 3/4 and 1/2 respectively,
   using integer division).  */
#define SPEED_ROUTINE_MPN_TOOM32_MUL(function)				\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (					\
    function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace),		\
    mpn_toom32_mul_itch (s->size, 2*s->size/3),				\
    MPN_TOOM32_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM42_MUL(function)				\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (					\
    function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
    mpn_toom42_mul_itch (s->size, s->size/2),				\
    MPN_TOOM42_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM43_MUL(function)				\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (					\
    function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace),		\
    mpn_toom43_mul_itch (s->size, s->size*3/4),				\
    MPN_TOOM43_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM63_MUL(function)				\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (					\
    function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
    mpn_toom63_mul_itch (s->size, s->size/2),				\
    MPN_TOOM63_MUL_MINSIZE)
1170
/* Pairs of Toom routines timed on the same operand shape: within a pair
   both wrappers use the same second operand size (17/24, 19/30 or 11/20 of
   s->size), each with its own itch function and minimum size.  */
#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function)		\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (					\
    function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
    mpn_toom32_mul_itch (s->size, 17*s->size/24),			\
    MPN_TOOM32_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function)		\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (					\
    function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
    mpn_toom43_mul_itch (s->size, 17*s->size/24),			\
    MPN_TOOM43_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function)		\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (					\
    function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
    mpn_toom32_mul_itch (s->size, 19*s->size/30),			\
    MPN_TOOM32_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function)		\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (					\
    function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
    mpn_toom53_mul_itch (s->size, 19*s->size/30),			\
    MPN_TOOM53_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function)		\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (					\
    function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
    mpn_toom42_mul_itch (s->size, 11*s->size/20),			\
    MPN_TOOM42_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function)		\
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (					\
    function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
    mpn_toom53_mul_itch (s->size, 11*s->size/20),			\
    MPN_TOOM53_MUL_MINSIZE)
1203
1204
1205
/* For squaring an s->size limb operand into a 2*s->size limb result.

   "call" is an expression that may use wp, the 2*s->size limb destination
   allocated here; s->xp is registered as the s->size limb source.  It is
   evaluated s->reps times between speed_starttime and speed_endtime.
   Requires s->size >= 1.  */
#define SPEED_ROUTINE_MPN_SQR_CALL(call)				\
  {									\
    mp_ptr    wp;							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, wp, 2*s->size);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      call;								\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
1232
/* Both of these time function (wp, s->xp, s->size) through
   SPEED_ROUTINE_MPN_SQR_CALL.  */
#define SPEED_ROUTINE_MPN_SQR(function)					\
  SPEED_ROUTINE_MPN_SQR_CALL						\
    (function (wp, s->xp, s->size))

#define SPEED_ROUTINE_MPN_SQR_DIAGONAL(function)			\
  SPEED_ROUTINE_MPN_SQR_CALL						\
    (function (wp, s->xp, s->size))
1238
1239
/* For squaring routines needing caller-supplied scratch space.

   "call" is an expression that may use wp, a 2*s->size limb destination,
   and tspace, a scratch area of tsize limbs, both allocated here.  It is
   evaluated s->reps times between speed_starttime and speed_endtime.

   tsize is expanded twice (for the allocation and for speed_operand_dst),
   so it should be free of side effects.  Requires s->size >= minsize.  */
#define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize)		\
  {									\
    mp_ptr    wp, tspace;						\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= minsize);				\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
    SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, wp, 2*s->size);				\
    speed_operand_dst (s, tspace, tsize);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      call;								\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
1268
/* Toom squarings of an s->size limb operand.  Each wrapper gives
   SPEED_ROUTINE_MPN_SQR_TSPACE the call, the scratch size from the matching
   itch function, and the matching minimum size.  */
#define SPEED_ROUTINE_MPN_TOOM2_SQR(function)				\
  SPEED_ROUTINE_MPN_SQR_TSPACE (					\
    function (wp, s->xp, s->size, tspace),				\
    mpn_toom2_sqr_itch (s->size),					\
    MPN_TOOM2_SQR_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM3_SQR(function)				\
  SPEED_ROUTINE_MPN_SQR_TSPACE (					\
    function (wp, s->xp, s->size, tspace),				\
    mpn_toom3_sqr_itch (s->size),					\
    MPN_TOOM3_SQR_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM4_SQR(function)				\
  SPEED_ROUTINE_MPN_SQR_TSPACE (					\
    function (wp, s->xp, s->size, tspace),				\
    mpn_toom4_sqr_itch (s->size),					\
    MPN_TOOM4_SQR_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM6_SQR(function)				\
  SPEED_ROUTINE_MPN_SQR_TSPACE (					\
    function (wp, s->xp, s->size, tspace),				\
    mpn_toom6_sqr_itch (s->size),					\
    MPN_TOOM6_SQR_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM8_SQR(function)				\
  SPEED_ROUTINE_MPN_SQR_TSPACE (					\
    function (wp, s->xp, s->size, tspace),				\
    mpn_toom8_sqr_itch (s->size),					\
    MPN_TOOM8_SQR_MINSIZE)
1294
/* For routines that only read the s->size limb operand s->xp, such as
   remainders by a single limb.

   "call" is an expression evaluated s->reps times between speed_starttime
   and speed_endtime; its value is discarded.  Nothing is allocated here,
   so the speed_endtime result is returned directly.  s->size may be 0.  */
#define SPEED_ROUTINE_MPN_MOD_CALL(call)				\
  {									\
    unsigned   i;							\
									\
    SPEED_RESTRICT_COND (s->size >= 0);					\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      call;								\
    while (--i != 0);							\
									\
    return speed_endtime ();						\
  }
1312
/* Wrappers around SPEED_ROUTINE_MPN_MOD_CALL.  s->r is the single limb
   divisor where one is passed; the "C" forms pass CNST_LIMB(0) as a further
   argument.  None of the expansions ends in a semicolon, the caller
   supplies its own.  */
#define SPEED_ROUTINE_MPN_MOD_1(function)				\
   SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_MOD_1C(function)				\
   SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0)))

#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function)			\
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function)			\
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)))

#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function)				\
   SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size))
1327
/* Times (*function) (s->xp, s->size, s->r, inverse), where the inverse is
   invert_limb of s->r, computed once before the timed region.  s->r must
   have GMP_LIMB_HIGHBIT set.  */
#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function)			\
  {									\
    mp_limb_t  divisor_inverse;						\
    unsigned   rep;							\
									\
    SPEED_RESTRICT_COND (s->size >= 0);					\
    SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT);			\
									\
    invert_limb (divisor_inverse, s->r);				\
    speed_operand_src (s, s->xp, s->size);				\
    speed_cache_fill (s);						\
									\
    /* timed region */							\
    speed_starttime ();							\
    rep = s->reps;							\
    do									\
      (*function) (s->xp, s->size, s->r, divisor_inverse);		\
    while (--rep != 0);							\
									\
    return speed_endtime ();						\
  }
1348
/* Times pfunc (pre, s->r) followed by function (s->xp, s->size, s->r, pre),
   so the precomputation is part of every timed iteration.  The 4 limb
   table is also filled once with mpn_mod_1_1p_cps before timing starts.
   Requires s->size >= 2.  */
#define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc)			\
  {									\
    mp_limb_t  pre[4];							\
    unsigned   rep;							\
									\
    SPEED_RESTRICT_COND (s->size >= 2);					\
									\
    mpn_mod_1_1p_cps (pre, s->r);					\
    speed_operand_src (s, s->xp, s->size);				\
    speed_cache_fill (s);						\
									\
    /* timed region */							\
    speed_starttime ();							\
    rep = s->reps;							\
    do {								\
      pfunc (pre, s->r);						\
      function (s->xp, s->size, s->r, pre);				\
    } while (--rep != 0);						\
									\
    return speed_endtime ();						\
  }
/* Times pfunc (pre, s->r) followed by function (s->xp, s->size, s->r, pre),
   so the precomputation is part of every timed iteration.  The table has
   N+3 limbs.  Requires s->size >= 1 and s->r no bigger than the maximum
   limb value divided by N.  */
#define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N)			\
  {									\
    mp_limb_t  pre[N+3];						\
    unsigned   rep;							\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
    SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / N);			\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_cache_fill (s);						\
									\
    /* timed region */							\
    speed_starttime ();							\
    rep = s->reps;							\
    do {								\
      pfunc (pre, s->r);						\
      function (s->xp, s->size, s->r, pre);				\
    } while (--rep != 0);						\
									\
    return speed_endtime ();						\
  }
1389
1390
1391/* A division of 2*s->size by s->size limbs */
1392
/* "call" is an expression that may use the locals set up here:
     a     2*s->size limb dividend, two copies of s->xp with the top limb
	   replaced so that it is below the top divisor limb
     d     s->size limb divisor copied from s->yp, high bit forced on
     q     s->size+1 limb quotient space
     r     s->size limb remainder space
     dinv  gmp_pi1_t inverse from invert_pi1 on the two high limbs of d
   It is evaluated s->reps times between speed_starttime and speed_endtime.

   Requires s->size >= 1.  With a one limb divisor there is no second limb,
   so zero is given to invert_pi1 as the low limb instead of reading in
   front of d.  */
#define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call)				\
  {									\
    unsigned  i;							\
    mp_ptr    a, d, q, r;						\
    double    t;							\
    gmp_pi1_t dinv;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp);			\
    SPEED_TMP_ALLOC_LIMBS (d, s->size,   s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp);			\
    SPEED_TMP_ALLOC_LIMBS (r, s->size,   s->align_wp2);			\
									\
    MPN_COPY (a, s->xp, s->size);					\
    MPN_COPY (a+s->size, s->xp, s->size);				\
									\
    MPN_COPY (d, s->yp, s->size);					\
									\
    /* normalize the data */						\
    d[s->size-1] |= GMP_NUMB_HIGHBIT;					\
    a[2*s->size-1] = d[s->size-1] - 1;					\
									\
    /* d[s->size-2] only exists for two or more limbs */		\
    invert_pi1 (dinv, d[s->size-1],					\
		(s->size >= 2 ? d[s->size-2] : CNST_LIMB(0)));		\
									\
    speed_operand_src (s, a, 2*s->size);				\
    speed_operand_src (s, d, s->size);					\
    speed_operand_dst (s, q, s->size+1);				\
    speed_operand_dst (s, r, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      call;								\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
1436
1437
1438/* A remainder 2*s->size by s->size limbs */
1439
/* Times function (r, a, d) on mpz_t operands: d is s->size limbs from
   s->yp, and a is built as 2*s->size limbs with a high part less than d and
   a low part copied from s->xp.  Requires s->size >= 1.

   The three mpz_t variables are cleared again after the timed region, so
   repeated measurements do not accumulate allocations.  */
#define SPEED_ROUTINE_MPZ_MOD(function)					\
  {									\
    unsigned   i;							\
    double     t;							\
    mpz_t      a, d, r;							\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    mpz_init_set_n (d, s->yp, s->size);					\
									\
    /* high part less than d, low part a duplicate copied in */		\
    mpz_init_set_n (a, s->xp, s->size);					\
    mpz_mod (a, a, d);							\
    mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size);			\
    MPN_COPY (PTR(a), s->xp, s->size);					\
									\
    mpz_init (r);							\
									\
    speed_operand_src (s, PTR(a), SIZ(a));				\
    speed_operand_src (s, PTR(d), SIZ(d));				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      function (r, a, d);						\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    /* release the operands, outside the timed region */		\
    mpz_clear (r);							\
    mpz_clear (a);							\
    mpz_clear (d);							\
    return t;								\
  }
1468
/* For division routines taking a precomputed gmp_pi1_t inverse.

   The dividend is size1 limbs (s->r if non-zero, otherwise 2*s->size) and
   the divisor is s->size limbs copied from s->yp with its high bit forced
   on.  The inverse is built from the two high divisor limbs,
   dp[s->size-1] and dp[s->size-2], so DMIN needs to be at least 2.

   INV is the expression passed to "function" as its last argument; it can
   refer to the local inv.  The dividend is copied afresh from ap into tp
   before every call, and that copy is inside the timed region.

   Requires s->size >= DMIN and size1 - s->size >= QMIN.  */
#define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN)		\
  {									\
    unsigned   i;							\
    mp_ptr     dp, tp, ap, qp;						\
    gmp_pi1_t  inv;							\
    double     t;							\
    mp_size_t size1;							\
    TMP_DECL;								\
									\
    size1 = (s->r == 0 ? 2 * s->size : s->r);				\
									\
    SPEED_RESTRICT_COND (s->size >= DMIN);				\
    SPEED_RESTRICT_COND (size1 - s->size >= QMIN);			\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp);			\
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2);			\
									\
    /* we don't fill in dividend completely when size1 > s->size */	\
    MPN_COPY (ap,         s->xp, s->size);				\
    MPN_COPY (ap + size1 - s->size, s->xp, s->size);			\
									\
    MPN_COPY (dp,         s->yp, s->size);				\
									\
    /* normalize the data */						\
    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
    ap[size1 - 1] = dp[s->size - 1] - 1;				\
									\
    invert_pi1 (inv, dp[s->size-1], dp[s->size-2]);			\
									\
    speed_operand_src (s, ap, size1);					\
    speed_operand_dst (s, tp, size1);					\
    speed_operand_src (s, dp, s->size);					\
    speed_operand_dst (s, qp, size1 - s->size);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do {								\
      MPN_COPY (tp, ap, size1);						\
      function (qp, tp, size1, dp, s->size, INV);			\
    } while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
/* Times function (qp, tp, 2*s->size, dp, s->size, scratch): a quotient-only
   division of a 2*s->size limb dividend by an s->size limb divisor.

   tp is two copies of s->xp and dp is a copy of s->yp; dp then has its high
   bit forced on and the top limb of tp is set below the top limb of dp.
   scratch is itchfn (2*s->size, s->size, 0) limbs.  Requires
   s->size >= 2.  */
#define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn)			\
  {									\
    unsigned   i;							\
    mp_ptr     dp, tp, qp, scratch;					\
    double     t;							\
    mp_size_t itch;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 2);					\
									\
    itch = itchfn (2 * s->size, s->size, 0);				\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);		\
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
									\
    MPN_COPY (tp,         s->xp, s->size);				\
    MPN_COPY (tp+s->size, s->xp, s->size);				\
									\
    /* the divisor has to be filled in before it is normalized */	\
    MPN_COPY (dp,         s->yp, s->size);				\
									\
    /* normalize the data */						\
    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
    tp[2*s->size-1] = dp[s->size-1] - 1;				\
									\
    speed_operand_dst (s, qp, s->size);					\
    speed_operand_src (s, tp, 2 * s->size);				\
    speed_operand_src (s, dp, s->size);					\
    speed_operand_dst (s, scratch, itch);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do {								\
      function (qp, tp, 2 * s->size, dp, s->size, scratch);		\
    } while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
/* Times function (quo, rem, num, nn, den, s->size, work): a division with
   quotient and remainder of an nn limb dividend by an s->size limb divisor,
   where nn is s->r, or 2*s->size when s->r is 0.  work is
   itchfn (nn, s->size, 0) limbs.  Requires s->size >= 2 and
   nn >= s->size.  */
#define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn)			\
  {									\
    mp_size_t  nn, work_limbs;						\
    mp_ptr     den, num, quo, rem, work;				\
    unsigned   rep;							\
    double     elapsed;							\
    TMP_DECL;								\
									\
    nn = (s->r == 0 ? 2 * s->size : s->r);				\
									\
    SPEED_RESTRICT_COND (s->size >= 2);					\
    SPEED_RESTRICT_COND (nn >= s->size);				\
									\
    work_limbs = itchfn (nn, s->size, 0);				\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (den, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (quo, nn - s->size, s->align_wp);		\
    SPEED_TMP_ALLOC_LIMBS (num, nn, s->align_xp);			\
    SPEED_TMP_ALLOC_LIMBS (work, work_limbs, s->align_wp2);		\
    SPEED_TMP_ALLOC_LIMBS (rem, s->size, s->align_wp2); /* alignment? */ \
									\
    /* s->xp goes at both ends of the dividend, any limbs in	*/	\
    /* between are left as allocated when nn > 2*s->size	*/	\
    MPN_COPY (num,                s->xp, s->size);			\
    MPN_COPY (num + nn - s->size, s->xp, s->size);			\
									\
    MPN_COPY (den,                s->yp, s->size);			\
									\
    /* high bit on in the divisor, top dividend limb below it */	\
    den[s->size-1] |= GMP_NUMB_HIGHBIT;					\
    num[nn - 1] = den[s->size - 1] - 1;					\
									\
    speed_operand_dst (s, quo, nn - s->size);				\
    speed_operand_dst (s, rem, s->size);				\
    speed_operand_src (s, num, nn);					\
    speed_operand_src (s, den, s->size);				\
    speed_operand_dst (s, work, work_limbs);				\
    speed_cache_fill (s);						\
									\
    /* timed region */							\
    speed_starttime ();							\
    rep = s->reps;							\
    do {								\
      function (quo, rem, num, nn, den, s->size, work);			\
    } while (--rep != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
/* Body of a speed routine measuring
     function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch)
   where ip is filled by mpn_invert (ip, dp, s->size, NULL) before timing
   starts, size1 is s->r (or 2*s->size when s->r is zero), and scratch
   holds itchfn (size1, s->size, s->size) limbs.  */
#define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn) \
  { \
    mp_ptr     dp, tp, qp, rp, ip, scratch; \
    mp_size_t  size1, itch; \
    unsigned   rep; \
    double     elapsed; \
    TMP_DECL; \
    \
    /* dividend length: explicit s->r, else twice the divisor length */ \
    if (s->r == 0) \
      size1 = 2 * s->size; \
    else \
      size1 = s->r; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    SPEED_RESTRICT_COND (size1 >= s->size); \
    \
    itch = itchfn (size1, s->size, s->size); \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); /* alignment? */ \
    \
    /* divisor: s->yp with the top bit forced on */ \
    MPN_COPY (dp, s->yp, s->size); \
    dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
    \
    /* dividend: s->xp at the low end and again at the high end; the \
       limbs in between are left unset when size1 > 2*s->size */ \
    MPN_COPY (tp, s->xp, s->size); \
    MPN_COPY (tp + size1 - s->size, s->xp, s->size); \
    \
    /* top dividend limb is one less than the top divisor limb */ \
    tp[size1 - 1] = dp[s->size-1] - 1; \
    \
    /* the inverse is prepared outside the timed region */ \
    mpn_invert (ip, dp, s->size, NULL); \
    \
    speed_operand_dst (s, qp, size1 - s->size); \
    speed_operand_dst (s, rp, s->size); \
    speed_operand_src (s, tp, size1); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_src (s, ip, s->size); \
    speed_operand_dst (s, scratch, itch); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
1658
/* Body of a speed routine measuring
     function (qp, tp, 2*s->size, dp, s->size, inv)
   where inv is the negated binvert_limb of the low divisor limb.  tp is
   reloaded from a saved dividend inside the timed loop, so that copy is
   part of the measured time.  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function) \
  { \
    mp_ptr     dp, tp, ap, qp; \
    mp_limb_t  inv; \
    unsigned   rep; \
    double     elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2); \
    \
    /* saved dividend: s->xp twice over */ \
    MPN_COPY (ap,           s->xp, s->size); \
    MPN_COPY (ap + s->size, s->xp, s->size); \
    \
    /* divisor: s->yp with the low bit forced on (must be odd) */ \
    MPN_COPY (dp, s->yp, s->size); \
    dp[0] |= 1; \
    binvert_limb (inv, dp[0]); \
    inv = -inv; \
    \
    speed_operand_src (s, ap, 2*s->size); \
    speed_operand_dst (s, tp, 2*s->size); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, qp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, 2*s->size); \
        function (qp, tp, 2*s->size, dp, s->size, inv); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
/* Body of a speed routine measuring
     function (qp, tp, s->size, dp, s->size, inv)
   where inv is the negated binvert_limb of the low divisor limb.  tp is
   reloaded from s->xp inside the timed loop, so that copy is part of the
   measured time.  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function) \
  { \
    mp_ptr     dp, tp, qp; \
    mp_limb_t  inv; \
    unsigned   rep; \
    double     elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2); \
    \
    /* divisor: s->yp with the low bit forced on (must be odd) */ \
    MPN_COPY (dp, s->yp, s->size); \
    dp[0] |= 1; \
    binvert_limb (inv, dp[0]); \
    inv = -inv; \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, qp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        MPN_COPY (tp, s->xp, s->size); \
        function (qp, tp, s->size, dp, s->size, inv); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
/* Body of a speed routine measuring
     function (qp, s->xp, s->size, dp, s->size, scratch)
   with itchfn (s->size, s->size) limbs of scratch.  The divisor dp is
   s->yp with its low bit forced on.  */
#define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn) \
  { \
    mp_ptr     dp, qp, scratch; \
    mp_size_t  itch; \
    unsigned   rep; \
    double     elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    itch = itchfn (s->size, s->size); \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
    \
    /* divisor must be odd */ \
    MPN_COPY (dp, s->yp, s->size); \
    dp[0] |= 1; \
    \
    speed_operand_dst (s, qp, s->size); \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, scratch, itch); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (qp, s->xp, s->size, dp, s->size, scratch); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
/* Body of a speed routine measuring
     function (qp, rp, tp, 2*s->size, dp, s->size, scratch)
   with itchfn (2*s->size, s->size) limbs of scratch.  The dividend tp is
   s->xp twice over and the divisor dp is s->yp with its low bit forced
   on.  */
#define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn) \
  { \
    mp_ptr     dp, tp, qp, rp, scratch; \
    mp_size_t  itch; \
    unsigned   rep; \
    double     elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    itch = itchfn (2 * s->size, s->size); \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \
    \
    /* divisor must be odd */ \
    MPN_COPY (dp, s->yp, s->size); \
    dp[0] |= 1; \
    \
    /* dividend: both halves taken from s->xp */ \
    MPN_COPY (tp,           s->xp, s->size); \
    MPN_COPY (tp + s->size, s->xp, s->size); \
    \
    speed_operand_dst (s, qp, s->size); \
    speed_operand_dst (s, rp, s->size); \
    speed_operand_src (s, tp, 2 * s->size); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, scratch, itch); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (qp, rp, tp, 2 * s->size, dp, s->size, scratch); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
1817
/* Body of a speed routine measuring  function (ip, up, s->size, tp).
   up is s->xp with its top bit forced on; tp is allocated with
   itchfn (s->size) limbs, of which s->size are registered as a
   destination operand.  */
#define SPEED_ROUTINE_MPN_INVERT(function,itchfn) \
  { \
    mp_ptr  up, tp, ip; \
    long    rep; \
    double  elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
    \
    /* operand: s->xp, normalized by setting the high bit */ \
    MPN_COPY (up, s->xp, s->size); \
    up[s->size-1] |= GMP_NUMB_HIGHBIT; \
    \
    speed_operand_src (s, up, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_dst (s, ip, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (ip, up, s->size, tp); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
1852
/* Body of a speed routine measuring  function (ip, up, s->size, tp).
   up is s->xp with its top bit forced on; tp is allocated with
   itchfn (s->size) limbs, of which s->size are registered as a
   destination operand.  */
#define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn) \
  { \
    mp_ptr  up, tp, ip; \
    long    rep; \
    double  elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
    \
    /* operand: s->xp, normalized by setting the high bit */ \
    MPN_COPY (up, s->xp, s->size); \
    up[s->size-1] |= GMP_NUMB_HIGHBIT; \
    \
    speed_operand_src (s, up, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_dst (s, ip, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (ip, up, s->size, tp); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
1887
/* Body of a speed routine measuring  function (ip, up, s->size, tp),
   restricted to s->size >= 3.  up is s->xp with its top bit forced on; tp
   is allocated with itchfn (s->size) limbs, of which s->size are
   registered as a destination operand.  */
#define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn) \
  { \
    mp_ptr  up, tp, ip; \
    long    rep; \
    double  elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 3); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
    \
    /* operand: s->xp, normalized by setting the high bit */ \
    MPN_COPY (up, s->xp, s->size); \
    up[s->size-1] |= GMP_NUMB_HIGHBIT; \
    \
    speed_operand_src (s, up, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_dst (s, ip, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (ip, up, s->size, tp); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
1922
/* Body of a speed routine measuring  function (ip, up, s->size, tp).
   up is s->xp with its low bit forced on; tp is allocated with
   itchfn (s->size) limbs, of which s->size are registered as a
   destination operand.  */
#define SPEED_ROUTINE_MPN_BINVERT(function,itchfn) \
  { \
    mp_ptr  up, tp, ip; \
    long    rep; \
    double  elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
    \
    /* operand: s->xp made odd by setting the low bit */ \
    MPN_COPY (up, s->xp, s->size); \
    up[0] |= 1; \
    \
    speed_operand_src (s, up, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_dst (s, ip, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (ip, up, s->size, tp); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
1957
/* Body of a speed routine measuring  function (cp, tp, mp, s->size, inv).
   mp is s->yp with its low bit forced on and inv is the negated
   binvert_limb of mp[0].  tp is reloaded with 2*s->size limbs from a
   saved operand inside the timed loop, so that copy is part of the
   measured time.  */
#define SPEED_ROUTINE_REDC_1(function) \
  { \
    mp_ptr     cp, mp, tp, ap; \
    mp_limb_t  inv; \
    unsigned   rep; \
    double     elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
    \
    /* saved operand: s->xp twice over */ \
    MPN_COPY (ap,           s->xp, s->size); \
    MPN_COPY (ap + s->size, s->xp, s->size); \
    \
    /* modulus must be odd */ \
    MPN_COPY (mp, s->yp, s->size); \
    mp[0] |= 1; \
    binvert_limb (inv, mp[0]); \
    inv = -inv; \
    \
    speed_operand_src (s, ap, 2*s->size+1); \
    speed_operand_dst (s, tp, 2*s->size+1); \
    speed_operand_src (s, mp, s->size); \
    speed_operand_dst (s, cp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, 2*s->size); \
        function (cp, tp, mp, s->size, inv); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
/* Body of a speed routine measuring  function (cp, tp, mp, s->size, invp).
   mp is s->yp with its low bit forced on.  invp is a two-limb value
   obtained from mpn_binvert (invp, mp, 2, tp), then adjusted by negating
   its low limb and complementing its high limb.  tp is reloaded with
   2*s->size limbs inside the timed loop.

   NOTE(review): mpn_binvert is asked for 2 limbs of mp while mp holds
   s->size limbs and s->size == 1 is accepted -- confirm this is intended.  */
#define SPEED_ROUTINE_REDC_2(function) \
  { \
    mp_ptr     cp, mp, tp, ap; \
    mp_limb_t  invp[2]; \
    unsigned   rep; \
    double     elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
    \
    /* saved operand: s->xp twice over */ \
    MPN_COPY (ap,           s->xp, s->size); \
    MPN_COPY (ap + s->size, s->xp, s->size); \
    \
    /* modulus must be odd */ \
    MPN_COPY (mp, s->yp, s->size); \
    mp[0] |= 1; \
    \
    /* tp doubles as scratch for the setup inversion */ \
    mpn_binvert (invp, mp, 2, tp); \
    invp[0] = -invp[0]; \
    invp[1] = ~invp[1]; \
    \
    speed_operand_src (s, ap, 2*s->size+1); \
    speed_operand_dst (s, tp, 2*s->size+1); \
    speed_operand_src (s, mp, s->size); \
    speed_operand_dst (s, cp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, 2*s->size); \
        function (cp, tp, mp, s->size, invp); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
/* Body of a speed routine measuring  function (cp, tp, mp, s->size, invp),
   restricted to s->size > 8.  mp is s->yp with its low bit forced on and
   invp is filled by mpn_binvert (invp, mp, s->size, tp) before timing.
   tp is reloaded with 2*s->size limbs inside the timed loop.  */
#define SPEED_ROUTINE_REDC_N(function) \
  { \
    mp_ptr    cp, mp, tp, ap, invp; \
    unsigned  rep; \
    double    elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size > 8); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
    SPEED_TMP_ALLOC_LIMBS (invp, s->size,   s->align_wp2); /* align? */ \
    \
    /* saved operand: s->xp twice over */ \
    MPN_COPY (ap,           s->xp, s->size); \
    MPN_COPY (ap + s->size, s->xp, s->size); \
    \
    /* modulus must be odd */ \
    MPN_COPY (mp, s->yp, s->size); \
    mp[0] |= 1; \
    \
    /* tp doubles as scratch for the setup inversion */ \
    mpn_binvert (invp, mp, s->size, tp); \
    \
    speed_operand_src (s, ap, 2*s->size+1); \
    speed_operand_dst (s, tp, 2*s->size+1); \
    speed_operand_src (s, mp, s->size); \
    speed_operand_dst (s, cp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, 2*s->size); \
        function (cp, tp, mp, s->size, invp); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
2083
2084
/* Body of a speed routine measuring  function (s->xp, s->size).  The
   value returned by `function' is discarded.  */
#define SPEED_ROUTINE_MPN_POPCOUNT(function) \
  { \
    unsigned  rep; \
    double    elapsed; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (s->xp, s->size); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    return elapsed; \
  }
2102
/* Body of a speed routine measuring  function (s->xp, s->yp, s->size).
   The value returned by `function' is discarded.  */
#define SPEED_ROUTINE_MPN_HAMDIST(function) \
  { \
    unsigned  rep; \
    double    elapsed; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (s->xp, s->yp, s->size); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    return elapsed; \
  }
2121
2122
/* Body of a speed routine measuring  function (z, s->size)  for a
   freshly initialized mpz_t z.  Here s->size is passed as the integer
   argument, not used as a limb count.  */
#define SPEED_ROUTINE_MPZ_UI(function) \
  { \
    mpz_t     z; \
    unsigned  rep; \
    double    elapsed; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    mpz_init (z); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (z, s->size); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    mpz_clear (z); \
    return elapsed; \
  }
2143
/* These three are plain aliases for SPEED_ROUTINE_MPZ_UI.  */
#define SPEED_ROUTINE_MPZ_FAC_UI(fn)     SPEED_ROUTINE_MPZ_UI (fn)
#define SPEED_ROUTINE_MPZ_FIB_UI(fn)     SPEED_ROUTINE_MPZ_UI (fn)
#define SPEED_ROUTINE_MPZ_LUCNUM_UI(fn)  SPEED_ROUTINE_MPZ_UI (fn)
2147
2148
/* Body of a speed routine measuring  function (z, z2, s->size)  for two
   freshly initialized mpz_t variables.  Here s->size is passed as the
   integer argument, not used as a limb count.  */
#define SPEED_ROUTINE_MPZ_2_UI(function) \
  { \
    mpz_t     z, z2; \
    unsigned  rep; \
    double    elapsed; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    mpz_init (z); \
    mpz_init (z2); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (z, z2, s->size); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    mpz_clear (z); \
    mpz_clear (z2); \
    return elapsed; \
  }
2171
/* These two are plain aliases for SPEED_ROUTINE_MPZ_2_UI.  */
#define SPEED_ROUTINE_MPZ_FIB2_UI(fn)     SPEED_ROUTINE_MPZ_2_UI (fn)
#define SPEED_ROUTINE_MPZ_LUCNUM2_UI(fn)  SPEED_ROUTINE_MPZ_2_UI (fn)
2174
2175
/* Body of a speed routine measuring  function (fp, f1p, s->size).  Both
   result areas get MPN_FIB2_SIZE (s->size) limbs; s->size is the integer
   argument here, not a limb count.  */
#define SPEED_ROUTINE_MPN_FIB2_UI(function) \
  { \
    mp_ptr     fp, f1p; \
    mp_size_t  alloc; \
    unsigned   rep; \
    double     elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    TMP_MARK; \
    alloc = MPN_FIB2_SIZE (s->size); \
    SPEED_TMP_ALLOC_LIMBS (fp,  alloc, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (f1p, alloc, s->align_yp); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (fp, f1p, s->size); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
2201
2202
2203
/* Body of a speed routine measuring  function (r, b, e, m),  i.e. b^e mod m.
   b is s->size limbs from s->xp, m is s->size limbs from s->yp with bit 0
   set (odd, so that redc can be used), and e is 6 limbs from s->xp_block.
   e is kept that small so the calculation doesn't take too long.  */
#define SPEED_ROUTINE_MPZ_POWM(function) \
  { \
    mpz_t     r, b, e, m; \
    unsigned  rep; \
    double    elapsed; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    mpz_init (r); \
    mpz_init_set_n (b, s->xp, s->size); \
    mpz_init_set_n (m, s->yp, s->size); \
    mpz_setbit (m, 0);  /* force m to odd */ \
    mpz_init_set_n (e, s->xp_block, 6); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (r, b, e, m); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    mpz_clear (r); \
    mpz_clear (b); \
    mpz_clear (e); \
    mpz_clear (m); \
    return elapsed; \
  }
2234
/* Body of a speed routine measuring  function (r, b, e, m),  i.e.
   (m-2)^e mod m.  m is s->size limbs from s->xp with its low bit forced
   on.  e is s->r when that is non-zero, otherwise the largest unsigned
   long divided by 3 (0x5555...).  */
#define SPEED_ROUTINE_MPZ_POWM_UI(function) \
  { \
    mpz_t          r, b, m; \
    unsigned long  e; \
    unsigned       rep; \
    double         elapsed; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    mpz_init (r); \
    \
    /* force m to odd */ \
    mpz_init (m); \
    mpz_set_n (m, s->xp, s->size); \
    PTR(m)[0] |= 1; \
    \
    /* default exponent, overridden by a non-zero s->r */ \
    e = (~ (unsigned long) 0) / 3; \
    if (s->r != 0) \
      e = s->r; \
    \
    /* base b = m - 2 */ \
    mpz_init_set (b, m); \
    mpz_sub_ui (b, b, 2); \
    \
    rep = s->reps; \
    speed_starttime (); \
    do \
      { \
        function (r, b, e, m); \
      } \
    while (--rep != 0); \
    elapsed = speed_endtime (); \
    \
    mpz_clear (r); \
    mpz_clear (b); \
    mpz_clear (m); \
    return elapsed; \
  }
2271
2272
/* Body of a speed routine timing the statement `call', which may refer to
   the destinations wp and wp2 and the sources xp and yp, each of s->size
   limbs.  s->r selects the operand overlap:
     0  xp = s->xp, yp = s->yp (no overlap)
     1  xp = wp
     2  yp = wp2
     3  xp = wp,  yp = wp2
     4  xp = wp2, yp = wp
   Any other s->r makes the routine return -1.0.  A source that aliases a
   destination is first loaded from the matching s->xp or s->yp.  */
#define SPEED_ROUTINE_MPN_ADDSUB_CALL(call) \
  { \
    mp_ptr    wp, wp2, xp, yp; \
    unsigned  i; \
    double    elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp,  s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \
    xp = s->xp; \
    yp = s->yp; \
    \
    switch (s->r) \
      { \
      case 0:                       break; \
      case 1:  xp = wp;             break; \
      case 2:            yp = wp2;  break; \
      case 3:  xp = wp;  yp = wp2;  break; \
      case 4:  xp = wp2; yp = wp;   break; \
      default: \
        TMP_FREE; \
        return -1.0; \
      } \
    \
    /* give aliased sources the standard operand data */ \
    if (xp != s->xp) \
      MPN_COPY (xp, s->xp, s->size); \
    if (yp != s->yp) \
      MPN_COPY (yp, s->yp, s->size); \
    \
    speed_operand_src (s, xp, s->size); \
    speed_operand_src (s, yp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_operand_dst (s, wp2, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
2316
/* SPEED_ROUTINE_MPN_ADDSUB_CALL applied to
   fn (wp, wp2, xp, yp, s->size).  */
#define SPEED_ROUTINE_MPN_ADDSUB_N(fn) \
  SPEED_ROUTINE_MPN_ADDSUB_CALL (fn (wp, wp2, xp, yp, s->size));

/* As above, with a trailing argument of 0:
   fn (wp, wp2, xp, yp, s->size, 0).  */
#define SPEED_ROUTINE_MPN_ADDSUB_NC(fn) \
  SPEED_ROUTINE_MPN_ADDSUB_CALL (fn (wp, wp2, xp, yp, s->size, 0));
2324
2325
/* Doing an Nx1 gcd with the given r.

   Body of a speed routine measuring  function (xp, s->size, s->r),
   restricted to s->r != 0.  xp is a private copy of s->xp whose low limb
   is OR-ed with the result of refmpn_zero_p (presumably so the operand is
   never all zeros -- confirm against refmpn_zero_p).  */
#define SPEED_ROUTINE_MPN_GCD_1N(function) \
  { \
    mp_ptr    xp; \
    unsigned  i; \
    double    t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->r != 0); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \
    MPN_COPY (xp, s->xp, s->size); \
    xp[0] |= refmpn_zero_p (xp, s->size); \
    \
    /* register the buffer the timed call actually reads, i.e. the \
       private copy rather than s->xp */ \
    speed_operand_src (s, xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      function (xp, s->size, s->r); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
2355
2356
/* SPEED_BLOCK_SIZE single-limb operations per repetition.  The x operands
   are masked to s->size bits and the y operands to s->r bits (or s->size
   bits when s->r is zero); an operand that masks to zero is replaced by 1.

   `setup' is run once per element with index i in scope and may adjust
   px[i] and py[i].  `call' is the timed statement and is run with j going
   from SPEED_BLOCK_SIZE down to 1, so it should use element j-1.
   s->time_divisor is set to SPEED_BLOCK_SIZE.  */

#define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call) \
  { \
    mp_ptr     px, py; \
    mp_limb_t  x_mask, y_mask; \
    unsigned   i, j; \
    double     elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp); \
    MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE); \
    MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE); \
    \
    x_mask = MP_LIMB_T_LOWBITMASK (s->size); \
    if (s->r != 0) \
      y_mask = MP_LIMB_T_LOWBITMASK (s->r); \
    else \
      y_mask = MP_LIMB_T_LOWBITMASK (s->size); \
    \
    for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
      { \
        /* trim to the requested bit size, avoiding zero */ \
        px[i] &= x_mask; \
        if (px[i] == 0) \
          px[i] = 1; \
        py[i] &= y_mask; \
        if (py[i] == 0) \
          py[i] = 1; \
        setup; \
      } \
    \
    speed_operand_src (s, px, SPEED_BLOCK_SIZE); \
    speed_operand_src (s, py, SPEED_BLOCK_SIZE); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = SPEED_BLOCK_SIZE; \
        do \
          { \
            call; \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    \
    s->time_divisor = SPEED_BLOCK_SIZE; \
    return elapsed; \
  }
2408
/* SPEED_ROUTINE_MPN_GCD_1_CALL with no extra setup, timing
   fn (&px[j-1], 1, py[j-1]).  */
#define SPEED_ROUTINE_MPN_GCD_1(fn) \
  SPEED_ROUTINE_MPN_GCD_1_CALL( , fn (px + (j-1), 1, py[j-1]))
2411
/* SPEED_ROUTINE_MPN_GCD_1_CALL timing  fn (px[j-1], py[j-1], 0).  The
   per-element setup reduces x modulo y, sets the low bit of both, and
   replaces y == 1 by 3.  */
#define SPEED_ROUTINE_MPN_JACBASE(fn) \
  SPEED_ROUTINE_MPN_GCD_1_CALL \
    ({ \
       /* require x<y, y odd, y!=1 */ \
       px[i] = px[i] % py[i]; \
       px[i] |= 1; \
       py[i] |= 1; \
       if (py[i] == 1) \
         py[i] = 3; \
     }, \
     fn (px[j-1], py[j-1], 0))
2422
2423
/* Run some GCDs of s->size limbs each.  The number of different data
   values ("pieces") shrinks as s->size**2, since GCD is a quadratic
   algorithm: it is SPEED_BLOCK_SIZE*datafactor/s->size^2, capped at
   SPEED_BLOCK_SIZE/s->size and never less than 1.
   SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
   though, because the plain gcd is about twice as fast as gcdext.

   `call' is the timed statement.  It sees xtmp and ytmp, which are
   reloaded from the current piece before every call (inside the timed
   region), plus the result areas wp and wp2 of s->size+1 limbs each.
   s->time_divisor is set to the number of pieces.  */

#define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call) \
  { \
    mp_ptr     wp, wp2, xtmp, ytmp, px, py; \
    mp_size_t  j, pieces, psize; \
    unsigned   i; \
    double     elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (wp,   s->size+1, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (wp2,  s->size+1, s->align_wp2); \
    \
    pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size; \
    pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size); \
    pieces = MAX (pieces, 1); \
    \
    psize = pieces * s->size; \
    px = TMP_ALLOC_LIMBS (psize); \
    py = TMP_ALLOC_LIMBS (psize); \
    /* a single piece uses the plain operands, several use the blocks */ \
    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \
    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \
    \
    /* Requirements: x >= y, y must be odd, high limbs != 0. \
       No need to ensure random numbers are really great.  */ \
    for (j = 0; j < pieces; j++) \
      { \
        mp_ptr  xtop = px + j * s->size + (s->size - 1); \
        mp_ptr  ytop = py + j * s->size + (s->size - 1); \
        \
        if (*xtop == 0) \
          *xtop = 1; \
        if (*ytop == 0) \
          *ytop = 1; \
        \
        if (*xtop < *ytop) \
          MP_LIMB_T_SWAP (*xtop, *ytop); \
        else if (*xtop == *ytop) \
          { \
            *xtop = 2; \
            *ytop = 1; \
          } \
        \
        py[j * s->size] |= 1; \
      } \
    \
    speed_operand_src (s, px, psize); \
    speed_operand_src (s, py, psize); \
    speed_operand_dst (s, xtmp, s->size); \
    speed_operand_dst (s, ytmp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = pieces; \
        do \
          { \
            MPN_COPY (xtmp, px+(j - 1)*s->size, s->size); \
            MPN_COPY (ytmp, py+(j - 1)*s->size, s->size); \
            call; \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    \
    s->time_divisor = pieces; \
    return elapsed; \
  }
2502
/* Measure "function" through SPEED_ROUTINE_MPN_GCD_CALL with a datafactor
   of 8.  The call made is function (wp, xtmp, s->size, ytmp, s->size); the
   names used in it are locals of SPEED_ROUTINE_MPN_GCD_CALL.  */
#define SPEED_ROUTINE_MPN_GCD(function)	\
  SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size))
2505
/* Measure "function" through SPEED_ROUTINE_MPN_GCD_CALL with a datafactor
   of 4.  The braces give each call its own mp_size_t wp2size, whose address
   is passed as the third argument; the value stored there is not looked at
   afterwards.  */
#define SPEED_ROUTINE_MPN_GCDEXT(function)				\
  SPEED_ROUTINE_MPN_GCD_CALL						\
    (4, { mp_size_t  wp2size;						\
	  function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); })
2510
2511
/* Measure "function", called as
   function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size).

   xtmp and ytmp start out as copies of s->xp and s->yp.  Before every call
   only their three most significant limbs are overwritten, from one of
   SPEED_BLOCK_SIZE/3 three-limb pieces copied out of s->xp_block and
   s->yp_block, and the low limb of ytmp is forced odd.  s->time_divisor is
   set to the number of pieces, so the result is a time per call.

   s->size must be at least 3: the three limbs are stored starting at
   xtmp[s->size-3] and ytmp[s->size-3], so a smaller size would write in
   front of the operands.  Smaller sizes are rejected through
   SPEED_RESTRICT_COND.  */
#define SPEED_ROUTINE_MPN_GCDEXT_ONE(function)				\
  {									\
    unsigned  i;							\
    mp_size_t j, pieces, psize, wp2size;				\
    mp_ptr    wp, wp2, xtmp, ytmp, px, py;				\
    double    t;							\
    TMP_DECL;								\
									\
    /* room is needed for the three high limbs stored below */		\
    SPEED_RESTRICT_COND (s->size >= 3);					\
									\
    TMP_MARK;								\
									\
    SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);		\
    SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);		\
    MPN_COPY (xtmp, s->xp, s->size);					\
    MPN_COPY (ytmp, s->yp, s->size);					\
									\
    SPEED_TMP_ALLOC_LIMBS (wp,	s->size+1, s->align_wp);		\
    SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2);		\
									\
    /* as many whole three-limb pieces as fit in a block */		\
    pieces = SPEED_BLOCK_SIZE / 3;					\
    psize = 3 * pieces;							\
    px = TMP_ALLOC_LIMBS (psize);					\
    py = TMP_ALLOC_LIMBS (psize);					\
    MPN_COPY (px, s->xp_block, psize);					\
    MPN_COPY (py, s->yp_block, psize);					\
									\
    /* x must have at least as many bits as y,				\
       high limbs must be non-zero */					\
    for (j = 0; j < pieces; j++)					\
      {									\
	mp_ptr	x = px+3*j;						\
	mp_ptr	y = py+3*j;						\
	x[2] += (x[2] == 0);						\
	y[2] += (y[2] == 0);						\
	if (x[2] < y[2])						\
	  MP_LIMB_T_SWAP (x[2], y[2]);					\
      }									\
									\
    speed_operand_src (s, px, psize);					\
    speed_operand_src (s, py, psize);					\
    speed_operand_dst (s, xtmp, s->size);				\
    speed_operand_dst (s, ytmp, s->size);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	mp_ptr	x = px;							\
	mp_ptr	y = py;							\
	/* where the three high limbs of each operand live */		\
	mp_ptr	xth = &xtmp[s->size-3];					\
	mp_ptr	yth = &ytmp[s->size-3];					\
	j = pieces;							\
	do								\
	  {								\
	    xth[0] = x[0], xth[1] = x[1], xth[2] = x[2];		\
	    yth[0] = y[0], yth[1] = y[1], yth[2] = y[2];		\
									\
	    ytmp[0] |= 1; /* y must be odd, */				\
									\
	    function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size);	\
									\
	    x += 3;							\
	    y += 3;							\
	  }								\
	while (--j != 0);						\
      }									\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
									\
    s->time_divisor = pieces;						\
    return t;								\
  }
2589
/* Measure "function" (a, b) on mpz_t operands of s->size limbs each.

   The operand data is SPEED_BLOCK_SIZE/s->size pieces (at least one) of
   s->size limbs, copied from s->xp_block and s->yp_block, or from s->xp and
   s->yp when there is only a single piece.  Every y piece is made odd and
   every piece gets a non-zero high limb.  a and b are never initialized as
   full mpz_t's: only their size and limb pointer fields are set, the
   pointers being aimed at one piece per call.  s->time_divisor is set to
   the number of pieces.

   s->size must be at least 1, since the data preparation stores to limbs 0
   and s->size-1 of each piece.  Other sizes are rejected through
   SPEED_RESTRICT_COND.  */
#define SPEED_ROUTINE_MPZ_JACOBI(function)				\
  {									\
    mpz_t     a, b;							\
    unsigned  i;							\
    mp_size_t j, pieces, psize;						\
    mp_ptr    px, py;							\
    double    t;							\
    TMP_DECL;								\
									\
    /* an empty operand has no low or high limb to adjust below */	\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    pieces = SPEED_BLOCK_SIZE / s->size;				\
    pieces = MAX (pieces, 1);						\
    s->time_divisor = pieces;						\
									\
    psize = pieces * s->size;						\
    px = TMP_ALLOC_LIMBS (psize);					\
    py = TMP_ALLOC_LIMBS (psize);					\
    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);		\
    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);		\
									\
    for (j = 0; j < pieces; j++)					\
      {									\
	mp_ptr	x = px+j*s->size;					\
	mp_ptr	y = py+j*s->size;					\
									\
	/* y odd */							\
	y[0] |= 1;							\
									\
	/* high limbs non-zero */					\
	if (x[s->size-1] == 0) x[s->size-1] = 1;			\
	if (y[s->size-1] == 0) y[s->size-1] = 1;			\
      }									\
									\
    /* sizes are the same for every piece, only PTR changes per call */	\
    SIZ(a) = s->size;							\
    SIZ(b) = s->size;							\
									\
    speed_operand_src (s, px, psize);					\
    speed_operand_src (s, py, psize);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	j = pieces;							\
	do								\
	  {								\
	    PTR(a) = px+(j-1)*s->size;					\
	    PTR(b) = py+(j-1)*s->size;					\
	    function (a, b);						\
	  }								\
	while (--j != 0);						\
      }									\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
2649
/* Measure "function" (quot, 0, num, s->size, divisor), where num is an
   s->size limb copy of s->xp and divisor is two limbs taken from
   s->yp_block with GMP_NUMB_HIGHBIT set in the upper one.  num is copied
   once, before timing starts, and is then reused by every call.
   s->size must be at least 2.  */
#define SPEED_ROUTINE_MPN_DIVREM_2(function)				\
  {									\
    mp_ptr    quot, num;						\
    mp_limb_t divisor[2];						\
    unsigned  count;							\
    double    elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 2);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (num,  s->size, s->align_xp);		\
    SPEED_TMP_ALLOC_LIMBS (quot, s->size, s->align_wp);		\
									\
    /* work on a private copy, the source operand gets destroyed */	\
    MPN_COPY (num, s->xp, s->size);					\
									\
    /* a normalized divisor is required, so set the top bit */		\
    MPN_COPY (divisor, s->yp_block, 2);					\
    divisor[1] |= GMP_NUMB_HIGHBIT;					\
									\
    speed_operand_src (s, num, s->size);				\
    speed_operand_src (s, divisor, 2);					\
    speed_operand_dst (s, quot, s->size);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    count = s->reps;							\
    do									\
      {									\
	function (quot, 0, num, s->size, divisor);			\
	count--;							\
      }									\
    while (count != 0);							\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
2686
2687
/* Measure "function" (val, val) over SPEED_BLOCK_SIZE values per rep.
   Each value is the previous val plus twice a limb of s->xp_block; val
   starts at 1, so the additions alone keep it odd.  The limbs are taken
   from the last one in the block down to the first.  s->time_divisor is
   set to SPEED_BLOCK_SIZE.  */
#define SPEED_ROUTINE_MODLIMB_INVERT(function)				\
  {									\
    mp_limb_t  val = 1;							\
    mp_ptr     block;							\
    unsigned   outer, inner;						\
    double     elapsed;							\
									\
    block = s->xp_block;						\
									\
    speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE);		\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    outer = s->reps;							\
    do									\
      {									\
	inner = SPEED_BLOCK_SIZE;					\
	do								\
	  {								\
	    /* each input depends on the previous one, but is still	\
	       randomized by the block data */				\
	    val += (block[inner - 1] << 1);				\
									\
	    function (val, val);					\
									\
	    inner--;							\
	  }								\
	while (inner != 0);						\
									\
	outer--;							\
      }									\
    while (outer != 0);							\
    elapsed = speed_endtime ();						\
									\
    /* keep val live so the loop cannot be optimized away */		\
    noop_1 (val);							\
									\
    s->time_divisor = SPEED_BLOCK_SIZE;					\
    return elapsed;							\
  }
2723
2724
/* Measure "function" (dst1, dst2, s->xp, s->size), with dst1 and dst2 two
   freshly allocated areas of s->size limbs each.  s->size must be at
   least 1.  */
#define SPEED_ROUTINE_MPN_SQRTREM(function)				\
  {									\
    mp_ptr    dst1, dst2;						\
    unsigned  count;							\
    double    elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (dst1, s->size, s->align_wp);		\
    SPEED_TMP_ALLOC_LIMBS (dst2, s->size, s->align_wp2);		\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, dst1, s->size);				\
    speed_operand_dst (s, dst2, s->size);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    count = s->reps;							\
    do									\
      {									\
	function (dst1, dst2, s->xp, s->size);				\
	count--;							\
      }									\
    while (count != 0);							\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
2753
/* Measure "function" (dst1, dst2, s->xp, s->size, s->r), with dst1 and
   dst2 two freshly allocated areas of s->size limbs each.  s->r is passed
   through unchanged as the last argument.  s->size must be at least 1.  */
#define SPEED_ROUTINE_MPN_ROOTREM(function)				\
  {									\
    mp_ptr    dst1, dst2;						\
    unsigned  count;							\
    double    elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (dst1, s->size, s->align_wp);		\
    SPEED_TMP_ALLOC_LIMBS (dst2, s->size, s->align_wp2);		\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, dst1, s->size);				\
    speed_operand_dst (s, dst2, s->size);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    count = s->reps;							\
    do									\
      {									\
	function (dst1, dst2, s->xp, s->size, s->r);			\
	count--;							\
      }									\
    while (count != 0);							\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
2782
2783
/* Measure "function" (wp, base, xp, s->size).  The input has s->size limbs
   and is re-copied from s->xp into xp before every call, inside the timed
   region.  The base is s->r, or 10 when s->r is zero, and must be in the
   range 2 to 256.  The output area wp is sized with MPN_SIZEINBASE.  */
#define SPEED_ROUTINE_MPN_GET_STR(function)				\
  {									\
    unsigned char *wp;							\
    mp_ptr xp;								\
    mp_size_t wn;							\
    int base;								\
    unsigned count;							\
    double elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    /* decimal unless a base was asked for */				\
    if (s->r == 0)							\
      base = 10;							\
    else								\
      base = s->r;							\
    SPEED_RESTRICT_COND (base >= 2 && base <= 256);			\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (xp, s->size + 1, s->align_xp);		\
									\
    MPN_SIZEINBASE (wn, s->xp, s->size, base);				\
    wp = TMP_ALLOC (wn);						\
									\
    /* A check that wp is not overflowed, for use during development:	\
									\
    MPN_COPY (xp, s->xp, s->size);					\
    ASSERT_ALWAYS (mpn_get_str (wp, base, xp, s->size) <= wn);		\
    */									\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, xp, s->size);					\
    speed_operand_dst (s, (mp_ptr) wp, wn/BYTES_PER_MP_LIMB);		\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    count = s->reps;							\
    do									\
      {									\
	/* fresh input each time round */				\
	MPN_COPY (xp, s->xp, s->size);					\
	function (wp, base, xp, s->size);				\
	count--;							\
      }									\
    while (count != 0);							\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
2831
/* s->size controls the number of digits in the input, s->r is the base, or
   decimal by default.

   Measure "call", executed s->reps times.  The locals set up for it are:
   xp, s->size bytes each holding a limb of s->xp reduced modulo base; wp, a
   destination of wn limbs; and base itself.  "call" is expected to refer to
   these by name (compare the commented-out mpn_set_str check below).  The
   base must be in the range 2 to 256 and s->size at least 1.  */
#define SPEED_ROUTINE_MPN_SET_STR_CALL(call)				\
  {									\
    unsigned char *xp;							\
    mp_ptr     wp;							\
    mp_size_t  wn;							\
    unsigned   i;							\
    int        base;							\
    double     t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    base = s->r == 0 ? 10 : s->r;					\
    SPEED_RESTRICT_COND (base >= 2 && base <= 256);			\
									\
    TMP_MARK;								\
									\
    /* one digit per byte, each in the range 0 to base-1 */		\
    xp = TMP_ALLOC (s->size);						\
    for (i = 0; i < s->size; i++)					\
      xp[i] = s->xp[i] % base;						\
									\
    /* limbs for s->size digits: digits converted to bits through	\
       chars_per_bit_exactly, then to limbs, plus 2 extra limbs */	\
    wn = ((mp_size_t) (s->size / mp_bases[base].chars_per_bit_exactly)) \
      / GMP_LIMB_BITS + 2;						\
    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);			\
									\
    /* use this during development to check wn is big enough */		\
    /*									\
    ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn);		\
    */									\
									\
    speed_operand_src (s, (mp_ptr) xp, s->size/BYTES_PER_MP_LIMB);	\
    speed_operand_dst (s, wp, wn);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      call;								\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
2878
2879
/* Measure "function" on SPEED_BLOCK_SIZE different two-limb values per
   rep, so that values which happen to run particularly fast or slow get
   averaged out.  s->size plays no part; the amount of data is fixed.  Each
   value has an odd low limb from s->xp_block and a non-zero high limb from
   s->yp_block.  s->time_divisor is set to SPEED_BLOCK_SIZE.  */

#define SPEED_ROUTINE_MPN_GCD_FINDA(function)				\
  {									\
    mp_limb_t pairs[SPEED_BLOCK_SIZE][2];				\
    unsigned  outer, k;							\
    double    elapsed;							\
    TMP_DECL;								\
									\
    TMP_MARK;								\
									\
    for (outer = 0; outer < SPEED_BLOCK_SIZE; outer++)			\
      {									\
	mp_limb_t  high = s->yp_block[outer];				\
									\
	/* the high limb is not allowed to be zero */			\
	if (high == 0)							\
	  high = 1;							\
									\
	/* and the low limb has to be odd */				\
	pairs[outer][0] = s->xp_block[outer] | 1;			\
	pairs[outer][1] = high;						\
      }									\
									\
    speed_operand_src (s, &pairs[0][0], 2*SPEED_BLOCK_SIZE);		\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    outer = s->reps;							\
    do									\
      {									\
	k = SPEED_BLOCK_SIZE;						\
	do								\
	  {								\
	    function (pairs[k-1]);					\
	    k--;							\
	  }								\
	while (k != 0);							\
									\
	outer--;							\
      }									\
    while (outer != 0);							\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
									\
    s->time_divisor = SPEED_BLOCK_SIZE;					\
    return elapsed;							\
  }
2922
2923
/* "call" should do "count_foo_zeros(c,n)".
   Give leading=1 if foo is leading zeros, leading=0 for trailing.
   Give zero=1 if n=0 is allowed in the call, zero=0 if not.

   SPEED_ROUTINE_COUNT_ZEROS_A is the first half of a function body: it
   prepares SPEED_BLOCK_SIZE limbs of data in xp through
   speed_routine_count_zeros_setup, starts the timer and opens the two
   loops, leaving n loaded with the next value (xor'ed with the previous
   count c) for the call.  The braces it opens are closed by
   SPEED_ROUTINE_COUNT_ZEROS_B, so the two are only usable as a pair with
   the call in between.

   If the setup fails the routine returns -1.0, releasing the temporary
   allocation first so that TMP_MARK and TMP_FREE stay balanced.  */

#define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero)			\
  {									\
    mp_ptr     xp;							\
    int        i, c;							\
    unsigned   j;							\
    mp_limb_t  n;							\
    double     t;							\
    TMP_DECL;								\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp);		\
									\
    if (! speed_routine_count_zeros_setup (s, xp, leading, zero))	\
      {									\
	TMP_FREE;							\
	return -1.0;							\
      }									\
    speed_operand_src (s, xp, SPEED_BLOCK_SIZE);			\
    speed_cache_fill (s);						\
									\
    c = 0;								\
    speed_starttime ();							\
    j = s->reps;							\
    do {								\
      for (i = 0; i < SPEED_BLOCK_SIZE; i++)				\
	{								\
	  n = xp[i];							\
	  /* make each input depend on the previous result */		\
	  n ^= c;							\
2953
/* Second half of the function body begun by SPEED_ROUTINE_COUNT_ZEROS_A:
   closes the two loops, stops the timer, sets s->time_divisor to
   SPEED_BLOCK_SIZE and returns the measured time.  Uses the locals j, t
   and c declared by SPEED_ROUTINE_COUNT_ZEROS_A.  */
#define SPEED_ROUTINE_COUNT_ZEROS_B()					\
	}								\
    } while (--j != 0);							\
    t = speed_endtime ();						\
									\
    /* don't let c go dead */						\
    noop_1 (c);								\
									\
    s->time_divisor = SPEED_BLOCK_SIZE;					\
									\
    TMP_FREE;								\
    return t;								\
  }									\

2967
/* Complete count-zeros measuring routine: "call" is placed between
   SPEED_ROUTINE_COUNT_ZEROS_A and SPEED_ROUTINE_COUNT_ZEROS_B, so it runs
   once per data limb inside the timed loops.  "leading" and "zero" are
   handed on to SPEED_ROUTINE_COUNT_ZEROS_A.  The expansion always ends in
   the "return t" of SPEED_ROUTINE_COUNT_ZEROS_B.  */
#define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero)		\
  do {									\
    SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero);			\
    call;								\
    SPEED_ROUTINE_COUNT_ZEROS_B ();					\
  } while (0)								\

2974
/* Leading zeros variants of SPEED_ROUTINE_COUNT_ZEROS_C (leading=1).  The
   _C form takes a complete call and the "zero" flag; the plain form takes
   just a function or macro name, calls it as fun (c, n) and passes
   zero=0.  */
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call,zero)			\
  SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero)
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun)				\
  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0)
2979
/* Trailing zeros variants of SPEED_ROUTINE_COUNT_ZEROS_C (leading=0).  The
   _C form takes a complete call and the "zero" flag; the plain form takes
   just a function or macro name, calls it as fun (c, n) and passes
   zero=0.  */
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero)			\
  SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero)
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(fun)				\
  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0)
2984
2985
/* Measure "call" over SPEED_BLOCK_SIZE divisors per rep.  Before each call
   d is set to the current dinv xor'ed with a limb of s->xp_block, with
   GMP_LIMB_HIGHBIT forced on.  "call" is expected to refer to d and dinv by
   name; presumably it stores its result in dinv, which is what makes
   successive calls dependent -- confirm against the callers.
   s->time_divisor is set to SPEED_BLOCK_SIZE.  */
#define SPEED_ROUTINE_INVERT_LIMB_CALL(call)				\
  {									\
    unsigned   i, j;							\
    mp_limb_t  d, dinv=0;						\
    /* offset by one so that j = SPEED_BLOCK_SIZE ... 1 indexes	\
       s->xp_block[SPEED_BLOCK_SIZE-1] ... s->xp_block[0] */		\
    mp_ptr     xp = s->xp_block - 1;					\
									\
    s->time_divisor = SPEED_BLOCK_SIZE;					\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	j = SPEED_BLOCK_SIZE;						\
	do								\
	  {								\
	    d = dinv ^ xp[j];						\
	    d |= GMP_LIMB_HIGHBIT;					\
	    do { call; } while (0);					\
	  }								\
	while (--j != 0);						\
      }									\
    while (--i != 0);							\
									\
    /* don't let the compiler optimize everything away */		\
    noop_1 (dinv);							\
									\
    /* note the timer is only stopped here, after noop_1 */		\
    return speed_endtime();						\
  }
3014
3015
3016#endif
3017
3018
/* Measure s->reps consecutive calls to "function" (), which takes no
   arguments.  No operands are set up and nothing is allocated.  */
#define SPEED_ROUTINE_MPN_BACK_TO_BACK(function)			\
  {									\
    unsigned  count;							\
									\
    speed_starttime ();							\
    count = s->reps;							\
    do									\
      {									\
	function ();							\
	count--;							\
      }									\
    while (count != 0);							\
									\
    return speed_endtime ();						\
  }
3029
3030
/* Measure "call", executed s->reps times, with wp set up as a destination
   of s->size limbs.  "call" is expected to refer to wp and s->size by name,
   as SPEED_ROUTINE_MPN_ZERO does.  Any non-negative s->size is accepted.  */
#define SPEED_ROUTINE_MPN_ZERO_CALL(call)				\
  {									\
    mp_ptr    wp;							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 0);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      call;								\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
3055
/* Measure "function" (wp, s->size) through SPEED_ROUTINE_MPN_ZERO_CALL,
   wp being the s->size limb destination that macro allocates.  */
#define SPEED_ROUTINE_MPN_ZERO(function)				\
  SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size))
3058