1/* Header for speed and threshold things. 2 3Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010 Free 4Software Foundation, Inc. 5 6This file is part of the GNU MP Library. 7 8The GNU MP Library is free software; you can redistribute it and/or modify 9it under the terms of the GNU Lesser General Public License as published by 10the Free Software Foundation; either version 3 of the License, or (at your 11option) any later version. 12 13The GNU MP Library is distributed in the hope that it will be useful, but 14WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 15or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 16License for more details. 17 18You should have received a copy of the GNU Lesser General Public License 19along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 20 21#ifndef __SPEED_H__ 22#define __SPEED_H__ 23 24 25/* Pad ptr,oldsize with zero limbs (at the most significant end) to make it 26 newsize long. */ 27#define MPN_ZERO_EXTEND(ptr, oldsize, newsize) \ 28 do { \ 29 ASSERT ((newsize) >= (oldsize)); \ 30 MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize)); \ 31 } while (0) 32 33/* A mask of the least significant n bits. Note 1<<32 doesn't give zero on 34 x86 family CPUs, hence the separate case for GMP_LIMB_BITS. */ 35#define MP_LIMB_T_LOWBITMASK(n) \ 36 ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1) 37 38 39/* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */ 40 41#define TMP_ALLOC_ALIGNED(bytes, align) \ 42 align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align)) 43#define TMP_ALLOC_LIMBS_ALIGNED(limbs, align) \ 44 ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align)) 45 46/* CACHE_LINE_SIZE is our default alignment for speed operands, and the 47 limit on what s->align_xp etc and then request for off-alignment. 
Maybe 48 this should be an option of some sort, but in any case here are some line 49 sizes, 50 51 bytes 52 32 pentium 53 64 athlon 54 64 itanium-2 L1 55 128 itanium-2 L2 56*/ 57#define CACHE_LINE_SIZE 64 /* bytes */ 58 59#define SPEED_TMP_ALLOC_ADJUST_MASK (CACHE_LINE_SIZE/BYTES_PER_MP_LIMB - 1) 60 61/* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb 62 alignment. */ 63#define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align) \ 64 do { \ 65 mp_ptr __ptr; \ 66 mp_size_t __ptr_align, __ptr_add; \ 67 \ 68 ASSERT ((CACHE_LINE_SIZE % BYTES_PER_MP_LIMB) == 0); \ 69 __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK); \ 70 __ptr_align = (__ptr - (mp_ptr) NULL); \ 71 __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK; \ 72 (ptr) = __ptr + __ptr_add; \ 73 } while (0) 74 75 76/* This is the size for s->xp_block and s->yp_block, used in certain 77 routines that want to run across many different data values and use 78 s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1. 79 80 512 means 2kbytes of data for each of xp_block and yp_block, making 4k 81 total, which should fit easily in any L1 data cache. 
*/ 82 83#define SPEED_BLOCK_SIZE 512 /* limbs */ 84 85 86extern double speed_unittime; 87extern double speed_cycletime; 88extern int speed_precision; 89extern char speed_time_string[]; 90void speed_time_init __GMP_PROTO ((void)); 91void speed_cycletime_fail __GMP_PROTO ((const char *str)); 92void speed_cycletime_init __GMP_PROTO ((void)); 93void speed_cycletime_need_cycles __GMP_PROTO ((void)); 94void speed_cycletime_need_seconds __GMP_PROTO ((void)); 95void speed_starttime __GMP_PROTO ((void)); 96double speed_endtime __GMP_PROTO ((void)); 97 98 99struct speed_params { 100 unsigned reps; /* how many times to run the routine */ 101 mp_ptr xp; /* first argument */ 102 mp_ptr yp; /* second argument */ 103 mp_size_t size; /* size of both arguments */ 104 mp_limb_t r; /* user supplied parameter */ 105 mp_size_t align_xp; /* alignment of xp */ 106 mp_size_t align_yp; /* alignment of yp */ 107 mp_size_t align_wp; /* intended alignment of wp */ 108 mp_size_t align_wp2; /* intended alignment of wp2 */ 109 mp_ptr xp_block; /* first special SPEED_BLOCK_SIZE block */ 110 mp_ptr yp_block; /* second special SPEED_BLOCK_SIZE block */ 111 112 double time_divisor; /* optionally set by the speed routine */ 113 114 /* used by the cache priming things */ 115 int cache; 116 unsigned src_num, dst_num; 117 struct { 118 mp_ptr ptr; 119 mp_size_t size; 120 } src[3], dst[3]; 121}; 122 123typedef double (*speed_function_t) __GMP_PROTO ((struct speed_params *s)); 124 125double speed_measure __GMP_PROTO ((speed_function_t fun, struct speed_params *s)); 126 127/* Prototypes for speed measuring routines */ 128 129double speed_back_to_back __GMP_PROTO ((struct speed_params *s)); 130double speed_count_leading_zeros __GMP_PROTO ((struct speed_params *s)); 131double speed_count_trailing_zeros __GMP_PROTO ((struct speed_params *s)); 132double speed_find_a __GMP_PROTO ((struct speed_params *s)); 133double speed_gmp_allocate_free __GMP_PROTO ((struct speed_params *s)); 134double 
speed_gmp_allocate_reallocate_free __GMP_PROTO ((struct speed_params *s)); 135double speed_invert_limb __GMP_PROTO ((struct speed_params *s)); 136double speed_malloc_free __GMP_PROTO ((struct speed_params *s)); 137double speed_malloc_realloc_free __GMP_PROTO ((struct speed_params *s)); 138double speed_memcpy __GMP_PROTO ((struct speed_params *s)); 139double speed_binvert_limb __GMP_PROTO ((struct speed_params *s)); 140double speed_binvert_limb_mul1 __GMP_PROTO ((struct speed_params *s)); 141double speed_binvert_limb_loop __GMP_PROTO ((struct speed_params *s)); 142double speed_binvert_limb_cond __GMP_PROTO ((struct speed_params *s)); 143double speed_binvert_limb_arith __GMP_PROTO ((struct speed_params *s)); 144 145double speed_mpf_init_clear __GMP_PROTO ((struct speed_params *s)); 146 147double speed_mpn_add_n __GMP_PROTO ((struct speed_params *s)); 148double speed_mpn_addlsh1_n __GMP_PROTO ((struct speed_params *s)); 149double speed_mpn_addlsh2_n __GMP_PROTO ((struct speed_params *s)); 150double speed_mpn_add_n_sub_n __GMP_PROTO ((struct speed_params *s)); 151double speed_mpn_and_n __GMP_PROTO ((struct speed_params *s)); 152double speed_mpn_andn_n __GMP_PROTO ((struct speed_params *s)); 153double speed_mpn_addmul_1 __GMP_PROTO ((struct speed_params *s)); 154double speed_mpn_addmul_2 __GMP_PROTO ((struct speed_params *s)); 155double speed_mpn_addmul_3 __GMP_PROTO ((struct speed_params *s)); 156double speed_mpn_addmul_4 __GMP_PROTO ((struct speed_params *s)); 157double speed_mpn_addmul_5 __GMP_PROTO ((struct speed_params *s)); 158double speed_mpn_addmul_6 __GMP_PROTO ((struct speed_params *s)); 159double speed_mpn_addmul_7 __GMP_PROTO ((struct speed_params *s)); 160double speed_mpn_addmul_8 __GMP_PROTO ((struct speed_params *s)); 161double speed_mpn_com __GMP_PROTO ((struct speed_params *s)); 162double speed_mpn_copyd __GMP_PROTO ((struct speed_params *s)); 163double speed_mpn_copyi __GMP_PROTO ((struct speed_params *s)); 164double speed_MPN_COPY __GMP_PROTO ((struct 
speed_params *s)); 165double speed_MPN_COPY_DECR __GMP_PROTO ((struct speed_params *s)); 166double speed_MPN_COPY_INCR __GMP_PROTO ((struct speed_params *s)); 167double speed_mpn_divexact_1 __GMP_PROTO ((struct speed_params *s)); 168double speed_mpn_divexact_by3 __GMP_PROTO ((struct speed_params *s)); 169double speed_mpn_bdiv_q_1 __GMP_PROTO ((struct speed_params *)); 170double speed_mpn_pi1_bdiv_q_1 __GMP_PROTO ((struct speed_params *)); 171double speed_mpn_bdiv_dbm1c __GMP_PROTO ((struct speed_params *s)); 172double speed_mpn_divrem_1 __GMP_PROTO ((struct speed_params *s)); 173double speed_mpn_divrem_1f __GMP_PROTO ((struct speed_params *s)); 174double speed_mpn_divrem_1c __GMP_PROTO ((struct speed_params *s)); 175double speed_mpn_divrem_1cf __GMP_PROTO ((struct speed_params *s)); 176double speed_mpn_divrem_1_div __GMP_PROTO ((struct speed_params *s)); 177double speed_mpn_divrem_1f_div __GMP_PROTO ((struct speed_params *s)); 178double speed_mpn_divrem_1_inv __GMP_PROTO ((struct speed_params *s)); 179double speed_mpn_divrem_1f_inv __GMP_PROTO ((struct speed_params *s)); 180double speed_mpn_divrem_2 __GMP_PROTO ((struct speed_params *s)); 181double speed_mpn_divrem_2_div __GMP_PROTO ((struct speed_params *s)); 182double speed_mpn_divrem_2_inv __GMP_PROTO ((struct speed_params *s)); 183double speed_mpn_fib2_ui __GMP_PROTO ((struct speed_params *s)); 184double speed_mpn_matrix22_mul __GMP_PROTO ((struct speed_params *s)); 185double speed_mpn_hgcd __GMP_PROTO ((struct speed_params *s)); 186double speed_mpn_hgcd_lehmer __GMP_PROTO ((struct speed_params *s)); 187double speed_mpn_gcd __GMP_PROTO ((struct speed_params *s)); 188double speed_mpn_gcd_1 __GMP_PROTO ((struct speed_params *s)); 189double speed_mpn_gcd_1N __GMP_PROTO ((struct speed_params *s)); 190double speed_mpn_gcdext __GMP_PROTO ((struct speed_params *s)); 191double speed_mpn_gcdext_double __GMP_PROTO ((struct speed_params *s)); 192double speed_mpn_gcdext_one_double __GMP_PROTO ((struct speed_params *s)); 
193double speed_mpn_gcdext_one_single __GMP_PROTO ((struct speed_params *s)); 194double speed_mpn_gcdext_single __GMP_PROTO ((struct speed_params *s)); 195double speed_mpn_get_str __GMP_PROTO ((struct speed_params *s)); 196double speed_mpn_hamdist __GMP_PROTO ((struct speed_params *s)); 197double speed_mpn_ior_n __GMP_PROTO ((struct speed_params *s)); 198double speed_mpn_iorn_n __GMP_PROTO ((struct speed_params *s)); 199double speed_mpn_jacobi_base __GMP_PROTO ((struct speed_params *s)); 200double speed_mpn_jacobi_base_1 __GMP_PROTO ((struct speed_params *s)); 201double speed_mpn_jacobi_base_2 __GMP_PROTO ((struct speed_params *s)); 202double speed_mpn_jacobi_base_3 __GMP_PROTO ((struct speed_params *s)); 203double speed_mpn_lshift __GMP_PROTO ((struct speed_params *s)); 204double speed_mpn_lshiftc __GMP_PROTO ((struct speed_params *s)); 205double speed_mpn_mod_1 __GMP_PROTO ((struct speed_params *s)); 206double speed_mpn_mod_1c __GMP_PROTO ((struct speed_params *s)); 207double speed_mpn_mod_1_div __GMP_PROTO ((struct speed_params *s)); 208double speed_mpn_mod_1_inv __GMP_PROTO ((struct speed_params *s)); 209double speed_mpn_mod_1_1 __GMP_PROTO ((struct speed_params *s)); 210double speed_mpn_mod_1_2 __GMP_PROTO ((struct speed_params *s)); 211double speed_mpn_mod_1_3 __GMP_PROTO ((struct speed_params *s)); 212double speed_mpn_mod_1_4 __GMP_PROTO ((struct speed_params *s)); 213double speed_mpn_mod_34lsub1 __GMP_PROTO ((struct speed_params *s)); 214double speed_mpn_modexact_1_odd __GMP_PROTO ((struct speed_params *s)); 215double speed_mpn_modexact_1c_odd __GMP_PROTO ((struct speed_params *s)); 216double speed_mpn_mul_1 __GMP_PROTO ((struct speed_params *s)); 217double speed_mpn_mul_1_inplace __GMP_PROTO ((struct speed_params *s)); 218double speed_mpn_mul_2 __GMP_PROTO ((struct speed_params *s)); 219double speed_mpn_mul_3 __GMP_PROTO ((struct speed_params *s)); 220double speed_mpn_mul_4 __GMP_PROTO ((struct speed_params *s)); 221double speed_mpn_mul __GMP_PROTO 
((struct speed_params *s)); 222double speed_mpn_mul_basecase __GMP_PROTO ((struct speed_params *s)); 223double speed_mpn_mul_fft __GMP_PROTO ((struct speed_params *s)); 224double speed_mpn_mul_fft_sqr __GMP_PROTO ((struct speed_params *s)); 225double speed_mpn_fft_mul __GMP_PROTO ((struct speed_params *s)); 226double speed_mpn_fft_sqr __GMP_PROTO ((struct speed_params *s)); 227#if WANT_OLD_FFT_FULL 228double speed_mpn_mul_fft_full __GMP_PROTO ((struct speed_params *s)); 229double speed_mpn_mul_fft_full_sqr __GMP_PROTO ((struct speed_params *s)); 230#endif 231double speed_mpn_nussbaumer_mul __GMP_PROTO ((struct speed_params *s)); 232double speed_mpn_nussbaumer_mul_sqr __GMP_PROTO ((struct speed_params *s)); 233double speed_mpn_mul_n __GMP_PROTO ((struct speed_params *s)); 234double speed_mpn_mul_n_sqr __GMP_PROTO ((struct speed_params *s)); 235double speed_mpn_mullo_n __GMP_PROTO ((struct speed_params *s)); 236double speed_mpn_mullo_basecase __GMP_PROTO ((struct speed_params *s)); 237double speed_mpn_nand_n __GMP_PROTO ((struct speed_params *s)); 238double speed_mpn_nior_n __GMP_PROTO ((struct speed_params *s)); 239double speed_mpn_popcount __GMP_PROTO ((struct speed_params *s)); 240double speed_mpn_preinv_divrem_1 __GMP_PROTO ((struct speed_params *s)); 241double speed_mpn_preinv_divrem_1f __GMP_PROTO ((struct speed_params *s)); 242double speed_mpn_preinv_mod_1 __GMP_PROTO ((struct speed_params *s)); 243double speed_mpn_sbpi1_div_qr __GMP_PROTO ((struct speed_params *s)); 244double speed_mpn_dcpi1_div_qr __GMP_PROTO ((struct speed_params *s)); 245double speed_mpn_sbpi1_divappr_q __GMP_PROTO ((struct speed_params *s)); 246double speed_mpn_dcpi1_divappr_q __GMP_PROTO ((struct speed_params *s)); 247double speed_mpn_mu_div_qr __GMP_PROTO ((struct speed_params *s)); 248double speed_mpn_mu_divappr_q __GMP_PROTO ((struct speed_params *s)); 249double speed_mpn_mupi_div_qr __GMP_PROTO ((struct speed_params *s)); 250double speed_mpn_mu_div_q __GMP_PROTO ((struct speed_params 
*s)); 251double speed_mpn_sbpi1_bdiv_qr __GMP_PROTO ((struct speed_params *s)); 252double speed_mpn_dcpi1_bdiv_qr __GMP_PROTO ((struct speed_params *s)); 253double speed_mpn_sbpi1_bdiv_q __GMP_PROTO ((struct speed_params *s)); 254double speed_mpn_dcpi1_bdiv_q __GMP_PROTO ((struct speed_params *s)); 255double speed_mpn_mu_bdiv_q __GMP_PROTO ((struct speed_params *s)); 256double speed_mpn_mu_bdiv_qr __GMP_PROTO ((struct speed_params *s)); 257double speed_mpn_invert __GMP_PROTO ((struct speed_params *s)); 258double speed_mpn_invertappr __GMP_PROTO ((struct speed_params *s)); 259double speed_mpn_ni_invertappr __GMP_PROTO ((struct speed_params *s)); 260double speed_mpn_binvert __GMP_PROTO ((struct speed_params *s)); 261double speed_mpn_redc_1 __GMP_PROTO ((struct speed_params *s)); 262double speed_mpn_redc_2 __GMP_PROTO ((struct speed_params *s)); 263double speed_mpn_redc_n __GMP_PROTO ((struct speed_params *s)); 264double speed_mpn_rsblsh1_n __GMP_PROTO ((struct speed_params *s)); 265double speed_mpn_rsblsh2_n __GMP_PROTO ((struct speed_params *s)); 266double speed_mpn_rsh1add_n __GMP_PROTO ((struct speed_params *s)); 267double speed_mpn_rsh1sub_n __GMP_PROTO ((struct speed_params *s)); 268double speed_mpn_rshift __GMP_PROTO ((struct speed_params *s)); 269double speed_mpn_sb_divrem_m3 __GMP_PROTO ((struct speed_params *s)); 270double speed_mpn_sb_divrem_m3_div __GMP_PROTO ((struct speed_params *s)); 271double speed_mpn_sb_divrem_m3_inv __GMP_PROTO ((struct speed_params *s)); 272double speed_mpn_set_str __GMP_PROTO ((struct speed_params *s)); 273double speed_mpn_bc_set_str __GMP_PROTO ((struct speed_params *s)); 274double speed_mpn_dc_set_str __GMP_PROTO ((struct speed_params *s)); 275double speed_mpn_set_str_pre __GMP_PROTO ((struct speed_params *s)); 276double speed_mpn_sqr_basecase __GMP_PROTO ((struct speed_params *s)); 277double speed_mpn_sqr_diagonal __GMP_PROTO ((struct speed_params *s)); 278double speed_mpn_sqr __GMP_PROTO ((struct speed_params *s)); 279double 
speed_mpn_sqrtrem __GMP_PROTO ((struct speed_params *s)); 280double speed_mpn_rootrem __GMP_PROTO ((struct speed_params *s)); 281double speed_mpn_sub_n __GMP_PROTO ((struct speed_params *s)); 282double speed_mpn_sublsh1_n __GMP_PROTO ((struct speed_params *s)); 283double speed_mpn_sublsh2_n __GMP_PROTO ((struct speed_params *s)); 284double speed_mpn_submul_1 __GMP_PROTO ((struct speed_params *s)); 285double speed_mpn_toom2_sqr __GMP_PROTO ((struct speed_params *s)); 286double speed_mpn_toom3_sqr __GMP_PROTO ((struct speed_params *s)); 287double speed_mpn_toom4_sqr __GMP_PROTO ((struct speed_params *s)); 288double speed_mpn_toom6_sqr __GMP_PROTO ((struct speed_params *s)); 289double speed_mpn_toom8_sqr __GMP_PROTO ((struct speed_params *s)); 290double speed_mpn_toom22_mul __GMP_PROTO ((struct speed_params *s)); 291double speed_mpn_toom33_mul __GMP_PROTO ((struct speed_params *s)); 292double speed_mpn_toom44_mul __GMP_PROTO ((struct speed_params *s)); 293double speed_mpn_toom6h_mul __GMP_PROTO ((struct speed_params *s)); 294double speed_mpn_toom8h_mul __GMP_PROTO ((struct speed_params *s)); 295double speed_mpn_toom32_mul __GMP_PROTO ((struct speed_params *s)); 296double speed_mpn_toom42_mul __GMP_PROTO ((struct speed_params *s)); 297double speed_mpn_toom43_mul __GMP_PROTO ((struct speed_params *s)); 298double speed_mpn_toom63_mul __GMP_PROTO ((struct speed_params *s)); 299double speed_mpn_toom32_for_toom43_mul __GMP_PROTO ((struct speed_params *s)); 300double speed_mpn_toom43_for_toom32_mul __GMP_PROTO ((struct speed_params *s)); 301double speed_mpn_toom32_for_toom53_mul __GMP_PROTO ((struct speed_params *s)); 302double speed_mpn_toom53_for_toom32_mul __GMP_PROTO ((struct speed_params *s)); 303double speed_mpn_toom42_for_toom53_mul __GMP_PROTO ((struct speed_params *s)); 304double speed_mpn_toom53_for_toom42_mul __GMP_PROTO ((struct speed_params *s)); 305double speed_mpn_mulmod_bnm1 __GMP_PROTO ((struct speed_params *s)); 306double speed_mpn_bc_mulmod_bnm1 
__GMP_PROTO ((struct speed_params *s)); 307double speed_mpn_mulmod_bnm1_rounded __GMP_PROTO ((struct speed_params *s)); 308double speed_mpn_sqrmod_bnm1 __GMP_PROTO ((struct speed_params *s)); 309double speed_mpn_udiv_qrnnd __GMP_PROTO ((struct speed_params *s)); 310double speed_mpn_udiv_qrnnd_r __GMP_PROTO ((struct speed_params *s)); 311double speed_mpn_umul_ppmm __GMP_PROTO ((struct speed_params *s)); 312double speed_mpn_umul_ppmm_r __GMP_PROTO ((struct speed_params *s)); 313double speed_mpn_xnor_n __GMP_PROTO ((struct speed_params *s)); 314double speed_mpn_xor_n __GMP_PROTO ((struct speed_params *s)); 315double speed_MPN_ZERO __GMP_PROTO ((struct speed_params *s)); 316 317double speed_mpq_init_clear __GMP_PROTO ((struct speed_params *s)); 318 319double speed_mpz_add __GMP_PROTO ((struct speed_params *s)); 320double speed_mpz_bin_uiui __GMP_PROTO ((struct speed_params *s)); 321double speed_mpz_fac_ui __GMP_PROTO ((struct speed_params *s)); 322double speed_mpz_fib_ui __GMP_PROTO ((struct speed_params *s)); 323double speed_mpz_fib2_ui __GMP_PROTO ((struct speed_params *s)); 324double speed_mpz_init_clear __GMP_PROTO ((struct speed_params *s)); 325double speed_mpz_init_realloc_clear __GMP_PROTO ((struct speed_params *s)); 326double speed_mpz_jacobi __GMP_PROTO ((struct speed_params *s)); 327double speed_mpz_lucnum_ui __GMP_PROTO ((struct speed_params *s)); 328double speed_mpz_lucnum2_ui __GMP_PROTO ((struct speed_params *s)); 329double speed_mpz_mod __GMP_PROTO ((struct speed_params *s)); 330double speed_mpz_powm __GMP_PROTO ((struct speed_params *s)); 331double speed_mpz_powm_mod __GMP_PROTO ((struct speed_params *s)); 332double speed_mpz_powm_redc __GMP_PROTO ((struct speed_params *s)); 333double speed_mpz_powm_ui __GMP_PROTO ((struct speed_params *s)); 334double speed_mpz_urandomb __GMP_PROTO ((struct speed_params *s)); 335 336double speed_gmp_randseed __GMP_PROTO ((struct speed_params *s)); 337double speed_gmp_randseed_ui __GMP_PROTO ((struct speed_params *s)); 
338 339double speed_noop __GMP_PROTO ((struct speed_params *s)); 340double speed_noop_wxs __GMP_PROTO ((struct speed_params *s)); 341double speed_noop_wxys __GMP_PROTO ((struct speed_params *s)); 342 343double speed_operator_div __GMP_PROTO ((struct speed_params *s)); 344double speed_operator_mod __GMP_PROTO ((struct speed_params *s)); 345 346double speed_udiv_qrnnd __GMP_PROTO ((struct speed_params *s)); 347double speed_udiv_qrnnd_preinv1 __GMP_PROTO ((struct speed_params *s)); 348double speed_udiv_qrnnd_preinv2 __GMP_PROTO ((struct speed_params *s)); 349double speed_udiv_qrnnd_c __GMP_PROTO ((struct speed_params *s)); 350double speed_umul_ppmm __GMP_PROTO ((struct speed_params *s)); 351 352/* Prototypes for other routines */ 353 354/* low 32-bits in p[0], high 32-bits in p[1] */ 355void speed_cyclecounter __GMP_PROTO ((unsigned p[2])); 356 357void mftb_function __GMP_PROTO ((unsigned p[2])); 358 359/* In i386 gcc -fPIC, ebx is a fixed register and can't be declared a dummy 360 output or a clobber for the cpuid, hence an explicit save and restore. A 361 clobber as such doesn't provoke an error unfortunately (gcc 3.0), so use 362 the dummy output style in non-PIC, so there's an error if somehow -fPIC 363 is used without a -DPIC to tell us about it. */ 364#if defined(__GNUC__) && ! 
defined (NO_ASM) \ 365 && (defined (__i386__) || defined (__i486__)) 366#if defined (PIC) || defined (__APPLE_CC__) 367#define speed_cyclecounter(p) \ 368 do { \ 369 int __speed_cyclecounter__save_ebx; \ 370 int __speed_cyclecounter__dummy; \ 371 __asm__ __volatile__ ("movl %%ebx, %1\n" \ 372 "cpuid\n" \ 373 "movl %1, %%ebx\n" \ 374 "rdtsc" \ 375 : "=a" ((p)[0]), \ 376 "=&rm" (__speed_cyclecounter__save_ebx), \ 377 "=c" (__speed_cyclecounter__dummy), \ 378 "=d" ((p)[1])); \ 379 } while (0) 380#else 381#define speed_cyclecounter(p) \ 382 do { \ 383 int __speed_cyclecounter__dummy1; \ 384 int __speed_cyclecounter__dummy2; \ 385 __asm__ __volatile__ ("cpuid\n" \ 386 "rdtsc" \ 387 : "=a" ((p)[0]), \ 388 "=b" (__speed_cyclecounter__dummy1), \ 389 "=c" (__speed_cyclecounter__dummy2), \ 390 "=d" ((p)[1])); \ 391 } while (0) 392#endif 393#endif 394 395double speed_cyclecounter_diff __GMP_PROTO ((const unsigned [2], const unsigned [2])); 396int gettimeofday_microseconds_p __GMP_PROTO ((void)); 397int getrusage_microseconds_p __GMP_PROTO ((void)); 398int cycles_works_p __GMP_PROTO ((void)); 399long clk_tck __GMP_PROTO ((void)); 400double freq_measure __GMP_PROTO ((const char *, double (*)(void))); 401 402int double_cmp_ptr __GMP_PROTO ((const double *, const double *)); 403void pentium_wbinvd __GMP_PROTO ((void)); 404typedef int (*qsort_function_t) __GMP_PROTO ((const void *, const void *)); 405 406void noop __GMP_PROTO ((void)); 407void noop_1 __GMP_PROTO ((mp_limb_t)); 408void noop_wxs __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t)); 409void noop_wxys __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t)); 410void mpn_cache_fill __GMP_PROTO ((mp_srcptr, mp_size_t)); 411void mpn_cache_fill_dummy __GMP_PROTO ((mp_limb_t)); 412void speed_cache_fill __GMP_PROTO ((struct speed_params *)); 413void speed_operand_src __GMP_PROTO ((struct speed_params *, mp_ptr, mp_size_t)); 414void speed_operand_dst __GMP_PROTO ((struct speed_params *, mp_ptr, mp_size_t)); 415 416extern int 
speed_option_addrs; 417extern int speed_option_verbose; 418void speed_option_set __GMP_PROTO((const char *)); 419 420mp_limb_t mpn_divrem_1_div __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t)); 421mp_limb_t mpn_divrem_1_inv __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t)); 422mp_limb_t mpn_divrem_2_div __GMP_PROTO ((mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr)); 423mp_limb_t mpn_divrem_2_inv __GMP_PROTO ((mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr)); 424 425int mpn_jacobi_base_1 __GMP_PROTO ((mp_limb_t, mp_limb_t, int)); 426int mpn_jacobi_base_2 __GMP_PROTO ((mp_limb_t, mp_limb_t, int)); 427int mpn_jacobi_base_3 __GMP_PROTO ((mp_limb_t, mp_limb_t, int)); 428 429mp_limb_t mpn_mod_1_div __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t)); 430mp_limb_t mpn_mod_1_inv __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t)); 431 432mp_size_t mpn_gcd_binary 433 __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t)); 434mp_size_t mpn_gcd_accel 435 __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t)); 436mp_size_t mpn_gcdext_one_double 437 __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t)); 438mp_size_t mpn_gcdext_one_single 439 __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t)); 440mp_size_t mpn_gcdext_single 441 __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t)); 442mp_size_t mpn_gcdext_double 443 __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t)); 444 445mp_limb_t mpn_sb_divrem_mn_div __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t)); 446mp_limb_t mpn_sb_divrem_mn_inv __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t)); 447 448mp_size_t mpn_set_str_basecase __GMP_PROTO ((mp_ptr, const unsigned char *, size_t, int)); 449void mpn_pre_set_str __GMP_PROTO ((mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr)); 450 451void mpz_powm_mod __GMP_PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr, 
mpz_srcptr)); 452void mpz_powm_redc __GMP_PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr)); 453 454int speed_routine_count_zeros_setup 455 __GMP_PROTO ((struct speed_params *, mp_ptr, int, int)); 456 457 458/* "get" is called repeatedly until it ticks over, just in case on a fast 459 processor it takes less than a microsecond, though this is probably 460 unlikely if it's a system call. 461 462 speed_cyclecounter is called on the same side of the "get" for the start 463 and end measurements. It doesn't matter how long it takes from the "get" 464 sample to the cycles sample, since that period will cancel out in the 465 difference calculation (assuming it's the same each time). 466 467 Letting the test run for more than a process time slice is probably only 468 going to reduce accuracy, especially for getrusage when the cycle counter 469 is real time, or for gettimeofday if the cycle counter is in fact process 470 time. Use CLK_TCK/2 as a reasonable stop. 471 472 It'd be desirable to be quite accurate here. The default speed_precision 473 for a cycle counter is 10000 cycles, so to mix that with getrusage or 474 gettimeofday the frequency should be at least that accurate. But running 475 measurements for 10000 microseconds (or more) is too long. Be satisfied 476 with just a half clock tick (5000 microseconds usually). 
*/ 477 478#define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec) \ 479 do { \ 480 type st1, st, et1, et; \ 481 unsigned sc[2], ec[2]; \ 482 long dt, half_tick; \ 483 double dc, cyc; \ 484 \ 485 half_tick = (1000000L / clk_tck()) / 2; \ 486 \ 487 get (st1); \ 488 do { \ 489 get (st); \ 490 } while (usec(st) == usec(st1) && sec(st) == sec(st1)); \ 491 \ 492 getc (sc); \ 493 \ 494 for (;;) \ 495 { \ 496 get (et1); \ 497 do { \ 498 get (et); \ 499 } while (usec(et) == usec(et1) && sec(et) == sec(et1)); \ 500 \ 501 getc (ec); \ 502 \ 503 dc = speed_cyclecounter_diff (ec, sc); \ 504 \ 505 /* allow secs to cancel before multiplying */ \ 506 dt = sec(et) - sec(st); \ 507 dt = dt * 1000000L + (usec(et) - usec(st)); \ 508 \ 509 if (dt >= half_tick) \ 510 break; \ 511 } \ 512 \ 513 cyc = dt * 1e-6 / dc; \ 514 \ 515 if (speed_option_verbose >= 2) \ 516 printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n", \ 517 name, dc, dt, cyc); \ 518 \ 519 return dt * 1e-6 / dc; \ 520 \ 521 } while (0) 522 523 524 525 526/* The measuring routines use these big macros to save duplication for 527 similar forms. They also get used for some automatically generated 528 measuring of new implementations of functions. 529 530 Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a 531 function pointer is considered undesirable since it's not the way a 532 normal application will be calling, and some processors might do 533 different things with an indirect call, like not branch predicting, or 534 doing a full pipe flush. At least some of the "functions" measured are 535 actually macros too. 536 537 The net effect is to bloat the object code, possibly in a big way, but 538 only what's being measured is being run, so that doesn't matter. 539 540 The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or 541 ATTRIBUTE_CONST on the called functions. Adding a cast to a non-pure 542 function pointer doesn't work in gcc 3.2. 
Using an actual non-pure 543 function pointer variable works, but stands a real risk of a 544 non-optimizing compiler generating unnecessary overheads in the call. 545 Currently the best idea is not to use those attributes for a timing 546 program build. __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and 547 gmp-impl.h to omit them from routines there. */ 548 549#define SPEED_RESTRICT_COND(cond) if (!(cond)) return -1.0; 550 551/* For mpn_copy or similar. */ 552#define SPEED_ROUTINE_MPN_COPY(function) \ 553 { \ 554 mp_ptr wp; \ 555 unsigned i; \ 556 double t; \ 557 TMP_DECL; \ 558 \ 559 SPEED_RESTRICT_COND (s->size >= 0); \ 560 \ 561 TMP_MARK; \ 562 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 563 \ 564 speed_operand_src (s, s->xp, s->size); \ 565 speed_operand_dst (s, wp, s->size); \ 566 speed_cache_fill (s); \ 567 \ 568 speed_starttime (); \ 569 i = s->reps; \ 570 do \ 571 function (wp, s->xp, s->size); \ 572 while (--i != 0); \ 573 t = speed_endtime (); \ 574 \ 575 TMP_FREE; \ 576 return t; \ 577 } 578 579#define SPEED_ROUTINE_MPN_COPYC(function) \ 580 { \ 581 mp_ptr wp; \ 582 unsigned i; \ 583 double t; \ 584 TMP_DECL; \ 585 \ 586 SPEED_RESTRICT_COND (s->size >= 0); \ 587 \ 588 TMP_MARK; \ 589 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 590 \ 591 speed_operand_src (s, s->xp, s->size); \ 592 speed_operand_dst (s, wp, s->size); \ 593 speed_cache_fill (s); \ 594 \ 595 speed_starttime (); \ 596 i = s->reps; \ 597 do \ 598 function (wp, s->xp, s->size, 0); \ 599 while (--i != 0); \ 600 t = speed_endtime (); \ 601 \ 602 TMP_FREE; \ 603 return t; \ 604 } 605 606/* s->size is still in limbs, and it's limbs which are copied, but 607 "function" takes a size in bytes not limbs. 
*/ 608#define SPEED_ROUTINE_MPN_COPY_BYTES(function) \ 609 { \ 610 mp_ptr wp; \ 611 unsigned i; \ 612 double t; \ 613 TMP_DECL; \ 614 \ 615 SPEED_RESTRICT_COND (s->size >= 0); \ 616 \ 617 TMP_MARK; \ 618 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 619 \ 620 speed_operand_src (s, s->xp, s->size); \ 621 speed_operand_dst (s, wp, s->size); \ 622 speed_cache_fill (s); \ 623 \ 624 speed_starttime (); \ 625 i = s->reps; \ 626 do \ 627 function (wp, s->xp, s->size * BYTES_PER_MP_LIMB); \ 628 while (--i != 0); \ 629 t = speed_endtime (); \ 630 \ 631 TMP_FREE; \ 632 return t; \ 633 } 634 635 636/* For mpn_add_n, mpn_sub_n, or similar. */ 637#define SPEED_ROUTINE_MPN_BINARY_N_CALL(call) \ 638 { \ 639 mp_ptr wp; \ 640 mp_ptr xp, yp; \ 641 unsigned i; \ 642 double t; \ 643 TMP_DECL; \ 644 \ 645 SPEED_RESTRICT_COND (s->size >= 1); \ 646 \ 647 TMP_MARK; \ 648 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 649 \ 650 xp = s->xp; \ 651 yp = s->yp; \ 652 \ 653 if (s->r == 0) ; \ 654 else if (s->r == 1) { xp = wp; } \ 655 else if (s->r == 2) { yp = wp; } \ 656 else if (s->r == 3) { xp = wp; yp = wp; } \ 657 else if (s->r == 4) { yp = xp; } \ 658 else { \ 659 TMP_FREE; \ 660 return -1.0; \ 661 } \ 662 \ 663 /* initialize wp if operand overlap */ \ 664 if (xp == wp || yp == wp) \ 665 MPN_COPY (wp, s->xp, s->size); \ 666 \ 667 speed_operand_src (s, xp, s->size); \ 668 speed_operand_src (s, yp, s->size); \ 669 speed_operand_dst (s, wp, s->size); \ 670 speed_cache_fill (s); \ 671 \ 672 speed_starttime (); \ 673 i = s->reps; \ 674 do \ 675 call; \ 676 while (--i != 0); \ 677 t = speed_endtime (); \ 678 \ 679 TMP_FREE; \ 680 return t; \ 681 } 682 683/* For mpn_add_n, mpn_sub_n, or similar. 
*/ 684#define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call) \ 685 { \ 686 mp_ptr ap, sp; \ 687 mp_ptr xp, yp; \ 688 unsigned i; \ 689 double t; \ 690 TMP_DECL; \ 691 \ 692 SPEED_RESTRICT_COND (s->size >= 1); \ 693 \ 694 TMP_MARK; \ 695 SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp); \ 696 SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp); \ 697 \ 698 xp = s->xp; \ 699 yp = s->yp; \ 700 \ 701 if ((s->r & 1) != 0) { xp = ap; } \ 702 if ((s->r & 2) != 0) { yp = ap; } \ 703 if ((s->r & 4) != 0) { xp = sp; } \ 704 if ((s->r & 8) != 0) { yp = sp; } \ 705 if ((s->r & 3) == 3 || (s->r & 12) == 12) \ 706 { \ 707 TMP_FREE; \ 708 return -1.0; \ 709 } \ 710 \ 711 /* initialize ap if operand overlap */ \ 712 if (xp == ap || yp == ap) \ 713 MPN_COPY (ap, s->xp, s->size); \ 714 /* initialize sp if operand overlap */ \ 715 if (xp == sp || yp == sp) \ 716 MPN_COPY (sp, s->xp, s->size); \ 717 \ 718 speed_operand_src (s, xp, s->size); \ 719 speed_operand_src (s, yp, s->size); \ 720 speed_operand_dst (s, ap, s->size); \ 721 speed_operand_dst (s, sp, s->size); \ 722 speed_cache_fill (s); \ 723 \ 724 speed_starttime (); \ 725 i = s->reps; \ 726 do \ 727 call; \ 728 while (--i != 0); \ 729 t = speed_endtime (); \ 730 \ 731 TMP_FREE; \ 732 return t; \ 733 } 734 735#define SPEED_ROUTINE_MPN_BINARY_N(function) \ 736 SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size)) 737 738#define SPEED_ROUTINE_MPN_BINARY_NC(function) \ 739 SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0)) 740 741 742/* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar. 
*/
/* `call' may refer to wp (a fresh s->size limb destination) and s.  */
#define SPEED_ROUTINE_MPN_UNARY_1_CALL(call) \
  { \
    mp_ptr wp; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call; \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }

#define SPEED_ROUTINE_MPN_UNARY_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_UNARY_1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))

/* In-place variant: times (*function) (wp, wp, s->size, s->r).  This is
   spelled out rather than built on SPEED_ROUTINE_MPN_UNARY_1_CALL so that
   wp can be seeded from s->xp before timing; otherwise the routine would
   read limbs that were never written.  The registered operands are the
   same ones SPEED_ROUTINE_MPN_UNARY_1_CALL registers.  */
#define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function) \
  { \
    mp_ptr wp; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    MPN_COPY (wp, s->xp, s->size); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      (*function) (wp, wp, s->size, s->r); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }

#define SPEED_ROUTINE_MPN_DIVEXACT_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_BDIV_Q_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))

/* Makes `shift' (trailing zero count of s->r) and `dinv' (binvert_limb of
   s->r >> shift) available to `call', then times it like
   SPEED_ROUTINE_MPN_UNARY_1_CALL.  s->r must be non-zero.  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call) \
  { \
    unsigned shift; \
    mp_limb_t dinv; \
    \
    SPEED_RESTRICT_COND (s->size > 0); \
    SPEED_RESTRICT_COND (s->r != 0); \
    \
    count_trailing_zeros (shift, s->r); \
    binvert_limb (dinv, s->r >> shift); \
    \
    SPEED_ROUTINE_MPN_UNARY_1_CALL (call); \
  }
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function) \
  SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL \
    ((*function) (wp, s->xp, s->size, s->r, dinv, shift))

#define SPEED_ROUTINE_MPN_BDIV_DBM1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))

#define SPEED_ROUTINE_MPN_DIVREM_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL \
((*function) (wp, 0, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_DIVREM_1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))

#define SPEED_ROUTINE_MPN_DIVREM_1F(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))

#define SPEED_ROUTINE_MPN_DIVREM_1CF(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))


/* Makes `shift' (leading zero count of s->r) and `dinv' (invert_limb of
   s->r << shift) available to `call', then times it like
   SPEED_ROUTINE_MPN_UNARY_1_CALL.  s->r must be non-zero.  The definition
   ends at the closing brace; there is deliberately no continuation after
   it, so it cannot swallow whatever line follows.  */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call) \
  { \
    unsigned shift; \
    mp_limb_t dinv; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    SPEED_RESTRICT_COND (s->r != 0); \
    \
    count_leading_zeros (shift, s->r); \
    invert_limb (dinv, s->r << shift); \
    \
    SPEED_ROUTINE_MPN_UNARY_1_CALL (call); \
  }

#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function) \
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \
    ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))

/* s->size limbs worth of fraction part */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function) \
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \
    ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))


/* s->r is duplicated to form the N limb multiplier yp, defaulting to
   MP_BASES_BIG_BASE_10 when s->r is 0.  Not sure if that's particularly
   useful, but at least it provides some control.  The destination is
   s->size + N - 1 limbs.  N is parenthesized at each use so an expression
   argument expands safely.  */
#define SPEED_ROUTINE_MPN_UNARY_N(function,N) \
  { \
    mp_ptr wp; \
    mp_size_t wn; \
    unsigned i; \
    double t; \
    mp_limb_t yp[(N)]; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= (N)); \
    \
    TMP_MARK; \
    wn = s->size + (N) - 1; \
    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \
    for (i = 0; i < (N); i++) \
      yp[i] = (s->r != 0 ? s->r : MP_BASES_BIG_BASE_10); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, yp, (mp_size_t) (N)); \
    speed_operand_dst (s, wp, wn); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      function (wp, s->xp, s->size, yp); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }

#define SPEED_ROUTINE_MPN_UNARY_2(function) \
  SPEED_ROUTINE_MPN_UNARY_N (function, 2)
#define SPEED_ROUTINE_MPN_UNARY_3(function) \
  SPEED_ROUTINE_MPN_UNARY_N (function, 3)
#define SPEED_ROUTINE_MPN_UNARY_4(function) \
  SPEED_ROUTINE_MPN_UNARY_N (function, 4)
#define SPEED_ROUTINE_MPN_UNARY_5(function) \
  SPEED_ROUTINE_MPN_UNARY_N (function, 5)
#define SPEED_ROUTINE_MPN_UNARY_6(function) \
  SPEED_ROUTINE_MPN_UNARY_N (function, 6)
#define SPEED_ROUTINE_MPN_UNARY_7(function) \
  SPEED_ROUTINE_MPN_UNARY_N (function, 7)
#define SPEED_ROUTINE_MPN_UNARY_8(function) \
  SPEED_ROUTINE_MPN_UNARY_N (function, 8)


/* For mpn_mul, mpn_mul_basecase, xsize=r, ysize=s->size.  The first
   operand xp is size1 limbs (s->r, or s->size when s->r is 0) and must be
   at least as long as the second operand s->yp.  */
#define SPEED_ROUTINE_MPN_MUL(function) \
  { \
    mp_ptr wp, xp; \
    mp_size_t size1; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    size1 = (s->r == 0 ? s->size : s->r); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (size1 >= s->size); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \
    \
    /* xp is a fresh allocation: fill all size1 limbs by repeating the \
       s->size limbs at s->xp, so the multiply never reads limbs that \
       were not written.  s->size >= 1 guarantees progress.  */ \
    { \
      mp_size_t xdone, xchunk; \
      for (xdone = 0; xdone < size1; xdone += xchunk) \
        { \
          xchunk = (size1 - xdone < s->size ? size1 - xdone : s->size); \
          MPN_COPY (xp + xdone, s->xp, xchunk); \
        } \
    } \
    \
    speed_operand_src (s, xp, size1); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, size1 + s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      function (wp, xp, size1, s->yp, s->size); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }


/* `call' may refer to wp (2*s->size limbs) and s.  */
#define SPEED_ROUTINE_MPN_MUL_N_CALL(call) \
  { \
    mp_ptr wp; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call; \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }

#define SPEED_ROUTINE_MPN_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size));

/* `call' may refer to wp (s->size limbs) and s.  */
#define SPEED_ROUTINE_MPN_MULLO_N_CALL(call) \
  { \
    mp_ptr wp; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call; \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }

#define SPEED_ROUTINE_MPN_MULLO_N(function) \
SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size));

/* For a mullo basecase style routine: both operands are s->size limbs and
   the destination wp is s->size limbs; s->r is not used.  */
#define SPEED_ROUTINE_MPN_MULLO_BASECASE(function) \
  { \
    mp_ptr wp; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      function (wp, s->xp, s->yp, s->size); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }

/* `call' may refer to wp (2*s->size limbs), tp (scratch sized by
   mpn_mulmod_bnm1_itch for three s->size arguments) and s.  */
#define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call) \
  { \
    mp_ptr wp, tp; \
    mp_size_t itch; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, 2 * s->size); \
    speed_operand_dst (s, tp, itch); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call; \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
/* As above, but the modulus size passed to `function' is s->size rounded
   by mpn_mulmod_bnm1_next_size; wp and the scratch are sized from that
   rounded value while the operands stay s->size limbs.  */
#define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function) \
  { \
    mp_ptr wp, tp; \
    mp_size_t size, itch; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    size = mpn_mulmod_bnm1_next_size (s->size); \
    itch = mpn_mulmod_bnm1_itch (size, size, size); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, size); \
    speed_operand_dst (s, tp, itch); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      function (wp, size, s->xp, s->size, s->yp, s->size, tp); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }

/* `call' may refer to wp (2*s->size limbs), tspace (tsize limbs) and s.
   Sizes below minsize are refused via SPEED_RESTRICT_COND.  Note tsize is
   expanded more than once.  */
#define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize) \
  { \
    mp_ptr wp, tspace; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= minsize); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_operand_dst (s, tspace, tsize); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call; \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }

/* Balanced Toom multiplies: both operands s->size limbs.  */
#define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, s->size, tspace), \
     mpn_toom22_mul_itch (s->size, s->size), \
     MPN_TOOM22_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, s->size, tspace), \
     mpn_toom33_mul_itch (s->size, s->size), \
     MPN_TOOM33_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, s->size, tspace), \
     mpn_toom44_mul_itch (s->size, s->size), \
     MPN_TOOM44_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, s->size, tspace), \
     mpn_toom6h_mul_itch (s->size, s->size), \
     MPN_TOOM6H_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, s->size, tspace), \
     mpn_toom8h_mul_itch (s->size, s->size), \
     MPN_TOOM8H_MUL_MINSIZE)

/* Unbalanced Toom multiplies: the second operand is a fixed fraction of
   s->size limbs.  */
#define SPEED_ROUTINE_MPN_TOOM32_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace), \
     mpn_toom32_mul_itch (s->size, 2*s->size/3), \
     MPN_TOOM32_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM42_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \
     mpn_toom42_mul_itch (s->size, s->size/2), \
     MPN_TOOM42_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM43_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace), \
     mpn_toom43_mul_itch (s->size, s->size*3/4), \
     MPN_TOOM43_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM63_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \
     mpn_toom63_mul_itch (s->size, s->size/2), \
     MPN_TOOM63_MUL_MINSIZE)

/* Pairs of routines timed at the same operand ratio (17/24 here).  */
#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \
     mpn_toom32_mul_itch (s->size, 17*s->size/24), \
     MPN_TOOM32_MUL_MINSIZE)
#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \
     mpn_toom43_mul_itch (s->size, 17*s->size/24), \
     MPN_TOOM43_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function) \
\ 1183 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1184 (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \ 1185 mpn_toom32_mul_itch (s->size, 19*s->size/30), \ 1186 MPN_TOOM32_MUL_MINSIZE) 1187#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function) \ 1188 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1189 (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \ 1190 mpn_toom53_mul_itch (s->size, 19*s->size/30), \ 1191 MPN_TOOM53_MUL_MINSIZE) 1192 1193#define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function) \ 1194 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1195 (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \ 1196 mpn_toom42_mul_itch (s->size, 11*s->size/20), \ 1197 MPN_TOOM42_MUL_MINSIZE) 1198#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function) \ 1199 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1200 (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \ 1201 mpn_toom53_mul_itch (s->size, 11*s->size/20), \ 1202 MPN_TOOM53_MUL_MINSIZE) 1203 1204 1205 1206#define SPEED_ROUTINE_MPN_SQR_CALL(call) \ 1207 { \ 1208 mp_ptr wp; \ 1209 unsigned i; \ 1210 double t; \ 1211 TMP_DECL; \ 1212 \ 1213 SPEED_RESTRICT_COND (s->size >= 1); \ 1214 \ 1215 TMP_MARK; \ 1216 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ 1217 \ 1218 speed_operand_src (s, s->xp, s->size); \ 1219 speed_operand_dst (s, wp, 2*s->size); \ 1220 speed_cache_fill (s); \ 1221 \ 1222 speed_starttime (); \ 1223 i = s->reps; \ 1224 do \ 1225 call; \ 1226 while (--i != 0); \ 1227 t = speed_endtime (); \ 1228 \ 1229 TMP_FREE; \ 1230 return t; \ 1231 } 1232 1233#define SPEED_ROUTINE_MPN_SQR(function) \ 1234 SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size)) 1235 1236#define SPEED_ROUTINE_MPN_SQR_DIAGONAL(function) \ 1237 SPEED_ROUTINE_MPN_SQR (function) 1238 1239 1240#define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize) \ 1241 { \ 1242 mp_ptr wp, tspace; \ 1243 unsigned i; \ 1244 double t; \ 1245 TMP_DECL; \ 1246 \ 1247 SPEED_RESTRICT_COND (s->size >= minsize); \ 1248 \ 1249 TMP_MARK; \ 1250 
SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ 1251 SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \ 1252 \ 1253 speed_operand_src (s, s->xp, s->size); \ 1254 speed_operand_dst (s, wp, 2*s->size); \ 1255 speed_operand_dst (s, tspace, tsize); \ 1256 speed_cache_fill (s); \ 1257 \ 1258 speed_starttime (); \ 1259 i = s->reps; \ 1260 do \ 1261 call; \ 1262 while (--i != 0); \ 1263 t = speed_endtime (); \ 1264 \ 1265 TMP_FREE; \ 1266 return t; \ 1267 } 1268 1269#define SPEED_ROUTINE_MPN_TOOM2_SQR(function) \ 1270 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1271 mpn_toom2_sqr_itch (s->size), \ 1272 MPN_TOOM2_SQR_MINSIZE) 1273 1274#define SPEED_ROUTINE_MPN_TOOM3_SQR(function) \ 1275 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1276 mpn_toom3_sqr_itch (s->size), \ 1277 MPN_TOOM3_SQR_MINSIZE) 1278 1279 1280#define SPEED_ROUTINE_MPN_TOOM4_SQR(function) \ 1281 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1282 mpn_toom4_sqr_itch (s->size), \ 1283 MPN_TOOM4_SQR_MINSIZE) 1284 1285#define SPEED_ROUTINE_MPN_TOOM6_SQR(function) \ 1286 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1287 mpn_toom6_sqr_itch (s->size), \ 1288 MPN_TOOM6_SQR_MINSIZE) 1289 1290#define SPEED_ROUTINE_MPN_TOOM8_SQR(function) \ 1291 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1292 mpn_toom8_sqr_itch (s->size), \ 1293 MPN_TOOM8_SQR_MINSIZE) 1294 1295#define SPEED_ROUTINE_MPN_MOD_CALL(call) \ 1296 { \ 1297 unsigned i; \ 1298 \ 1299 SPEED_RESTRICT_COND (s->size >= 0); \ 1300 \ 1301 speed_operand_src (s, s->xp, s->size); \ 1302 speed_cache_fill (s); \ 1303 \ 1304 speed_starttime (); \ 1305 i = s->reps; \ 1306 do \ 1307 call; \ 1308 while (--i != 0); \ 1309 \ 1310 return speed_endtime (); \ 1311 } 1312 1313#define SPEED_ROUTINE_MPN_MOD_1(function) \ 1314 SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r)) 1315 1316#define SPEED_ROUTINE_MPN_MOD_1C(function) \ 
1317 SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0))) 1318 1319#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function) \ 1320 SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r)); 1321 1322#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function) \ 1323 SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0))); 1324 1325#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function) \ 1326 SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size)) 1327 1328#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function) \ 1329 { \ 1330 unsigned i; \ 1331 mp_limb_t inv; \ 1332 \ 1333 SPEED_RESTRICT_COND (s->size >= 0); \ 1334 SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT); \ 1335 \ 1336 invert_limb (inv, s->r); \ 1337 speed_operand_src (s, s->xp, s->size); \ 1338 speed_cache_fill (s); \ 1339 \ 1340 speed_starttime (); \ 1341 i = s->reps; \ 1342 do \ 1343 (*function) (s->xp, s->size, s->r, inv); \ 1344 while (--i != 0); \ 1345 \ 1346 return speed_endtime (); \ 1347 } 1348 1349#define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc) \ 1350 { \ 1351 unsigned i; \ 1352 mp_limb_t inv[4]; \ 1353 \ 1354 SPEED_RESTRICT_COND (s->size >= 2); \ 1355 \ 1356 mpn_mod_1_1p_cps (inv, s->r); \ 1357 speed_operand_src (s, s->xp, s->size); \ 1358 speed_cache_fill (s); \ 1359 \ 1360 speed_starttime (); \ 1361 i = s->reps; \ 1362 do { \ 1363 pfunc (inv, s->r); \ 1364 function (s->xp, s->size, s->r, inv); \ 1365 } while (--i != 0); \ 1366 \ 1367 return speed_endtime (); \ 1368 } 1369#define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N) \ 1370 { \ 1371 unsigned i; \ 1372 mp_limb_t inv[N+3]; \ 1373 \ 1374 SPEED_RESTRICT_COND (s->size >= 1); \ 1375 SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / N); \ 1376 \ 1377 speed_operand_src (s, s->xp, s->size); \ 1378 speed_cache_fill (s); \ 1379 \ 1380 speed_starttime (); \ 1381 i = s->reps; \ 1382 do { \ 1383 pfunc (inv, s->r); \ 1384 function (s->xp, s->size, s->r, inv); \ 1385 } while (--i != 0); \ 1386 \ 1387 return speed_endtime (); \ 1388 } 1389 
1390 1391/* A division of 2*s->size by s->size limbs */ 1392 1393#define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call) \ 1394 { \ 1395 unsigned i; \ 1396 mp_ptr a, d, q, r; \ 1397 double t; \ 1398 gmp_pi1_t dinv; \ 1399 TMP_DECL; \ 1400 \ 1401 SPEED_RESTRICT_COND (s->size >= 1); \ 1402 \ 1403 TMP_MARK; \ 1404 SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \ 1405 SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \ 1406 SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \ 1407 SPEED_TMP_ALLOC_LIMBS (r, s->size, s->align_wp2); \ 1408 \ 1409 MPN_COPY (a, s->xp, s->size); \ 1410 MPN_COPY (a+s->size, s->xp, s->size); \ 1411 \ 1412 MPN_COPY (d, s->yp, s->size); \ 1413 \ 1414 /* normalize the data */ \ 1415 d[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1416 a[2*s->size-1] = d[s->size-1] - 1; \ 1417 \ 1418 invert_pi1 (dinv, d[s->size-1], d[s->size-2]); \ 1419 \ 1420 speed_operand_src (s, a, 2*s->size); \ 1421 speed_operand_src (s, d, s->size); \ 1422 speed_operand_dst (s, q, s->size+1); \ 1423 speed_operand_dst (s, r, s->size); \ 1424 speed_cache_fill (s); \ 1425 \ 1426 speed_starttime (); \ 1427 i = s->reps; \ 1428 do \ 1429 call; \ 1430 while (--i != 0); \ 1431 t = speed_endtime (); \ 1432 \ 1433 TMP_FREE; \ 1434 return t; \ 1435 } 1436 1437 1438/* A remainder 2*s->size by s->size limbs */ 1439 1440#define SPEED_ROUTINE_MPZ_MOD(function) \ 1441 { \ 1442 unsigned i; \ 1443 mpz_t a, d, r; \ 1444 \ 1445 SPEED_RESTRICT_COND (s->size >= 1); \ 1446 \ 1447 mpz_init_set_n (d, s->yp, s->size); \ 1448 \ 1449 /* high part less than d, low part a duplicate copied in */ \ 1450 mpz_init_set_n (a, s->xp, s->size); \ 1451 mpz_mod (a, a, d); \ 1452 mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size); \ 1453 MPN_COPY (PTR(a), s->xp, s->size); \ 1454 \ 1455 mpz_init (r); \ 1456 \ 1457 speed_operand_src (s, PTR(a), SIZ(a)); \ 1458 speed_operand_src (s, PTR(d), SIZ(d)); \ 1459 speed_cache_fill (s); \ 1460 \ 1461 speed_starttime (); \ 1462 i = s->reps; \ 1463 do \ 1464 function (r, a, d); \ 1465 while (--i != 0); 
\ 1466 return speed_endtime (); \ 1467 } 1468 1469#define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN) \ 1470 { \ 1471 unsigned i; \ 1472 mp_ptr dp, tp, ap, qp; \ 1473 gmp_pi1_t inv; \ 1474 double t; \ 1475 mp_size_t size1; \ 1476 TMP_DECL; \ 1477 \ 1478 size1 = (s->r == 0 ? 2 * s->size : s->r); \ 1479 \ 1480 SPEED_RESTRICT_COND (s->size >= DMIN); \ 1481 SPEED_RESTRICT_COND (size1 - s->size >= QMIN); \ 1482 \ 1483 TMP_MARK; \ 1484 SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp); \ 1485 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1486 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \ 1487 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2); \ 1488 \ 1489 /* we don't fill in dividend completely when size1 > s->size */ \ 1490 MPN_COPY (ap, s->xp, s->size); \ 1491 MPN_COPY (ap + size1 - s->size, s->xp, s->size); \ 1492 \ 1493 MPN_COPY (dp, s->yp, s->size); \ 1494 \ 1495 /* normalize the data */ \ 1496 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1497 ap[size1 - 1] = dp[s->size - 1] - 1; \ 1498 \ 1499 invert_pi1 (inv, dp[s->size-1], dp[s->size-2]); \ 1500 \ 1501 speed_operand_src (s, ap, size1); \ 1502 speed_operand_dst (s, tp, size1); \ 1503 speed_operand_src (s, dp, s->size); \ 1504 speed_operand_dst (s, qp, size1 - s->size); \ 1505 speed_cache_fill (s); \ 1506 \ 1507 speed_starttime (); \ 1508 i = s->reps; \ 1509 do { \ 1510 MPN_COPY (tp, ap, size1); \ 1511 function (qp, tp, size1, dp, s->size, INV); \ 1512 } while (--i != 0); \ 1513 t = speed_endtime (); \ 1514 \ 1515 TMP_FREE; \ 1516 return t; \ 1517 } 1518#define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn) \ 1519 { \ 1520 unsigned i; \ 1521 mp_ptr dp, tp, qp, scratch; \ 1522 double t; \ 1523 mp_size_t itch; \ 1524 TMP_DECL; \ 1525 \ 1526 SPEED_RESTRICT_COND (s->size >= 2); \ 1527 \ 1528 itch = itchfn (2 * s->size, s->size, 0); \ 1529 TMP_MARK; \ 1530 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1531 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 1532 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, 
s->align_xp); \ 1533 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 1534 \ 1535 MPN_COPY (tp, s->xp, s->size); \ 1536 MPN_COPY (tp+s->size, s->xp, s->size); \ 1537 \ 1538 /* normalize the data */ \ 1539 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1540 tp[2*s->size-1] = dp[s->size-1] - 1; \ 1541 \ 1542 speed_operand_dst (s, qp, s->size); \ 1543 speed_operand_src (s, tp, 2 * s->size); \ 1544 speed_operand_src (s, dp, s->size); \ 1545 speed_operand_dst (s, scratch, itch); \ 1546 speed_cache_fill (s); \ 1547 \ 1548 speed_starttime (); \ 1549 i = s->reps; \ 1550 do { \ 1551 function (qp, tp, 2 * s->size, dp, s->size, scratch); \ 1552 } while (--i != 0); \ 1553 t = speed_endtime (); \ 1554 \ 1555 TMP_FREE; \ 1556 return t; \ 1557 } 1558#define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn) \ 1559 { \ 1560 unsigned i; \ 1561 mp_ptr dp, tp, qp, rp, scratch; \ 1562 double t; \ 1563 mp_size_t size1, itch; \ 1564 TMP_DECL; \ 1565 \ 1566 size1 = (s->r == 0 ? 2 * s->size : s->r); \ 1567 \ 1568 SPEED_RESTRICT_COND (s->size >= 2); \ 1569 SPEED_RESTRICT_COND (size1 >= s->size); \ 1570 \ 1571 itch = itchfn (size1, s->size, 0); \ 1572 TMP_MARK; \ 1573 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1574 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \ 1575 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \ 1576 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 1577 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? 
*/ \ 1578 \ 1579 /* we don't fill in dividend completely when size1 > s->size */ \ 1580 MPN_COPY (tp, s->xp, s->size); \ 1581 MPN_COPY (tp + size1 - s->size, s->xp, s->size); \ 1582 \ 1583 MPN_COPY (dp, s->yp, s->size); \ 1584 \ 1585 /* normalize the data */ \ 1586 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1587 tp[size1 - 1] = dp[s->size - 1] - 1; \ 1588 \ 1589 speed_operand_dst (s, qp, size1 - s->size); \ 1590 speed_operand_dst (s, rp, s->size); \ 1591 speed_operand_src (s, tp, size1); \ 1592 speed_operand_src (s, dp, s->size); \ 1593 speed_operand_dst (s, scratch, itch); \ 1594 speed_cache_fill (s); \ 1595 \ 1596 speed_starttime (); \ 1597 i = s->reps; \ 1598 do { \ 1599 function (qp, rp, tp, size1, dp, s->size, scratch); \ 1600 } while (--i != 0); \ 1601 t = speed_endtime (); \ 1602 \ 1603 TMP_FREE; \ 1604 return t; \ 1605 } 1606#define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn) \ 1607 { \ 1608 unsigned i; \ 1609 mp_ptr dp, tp, qp, rp, ip, scratch; \ 1610 double t; \ 1611 mp_size_t size1, itch; \ 1612 TMP_DECL; \ 1613 \ 1614 size1 = (s->r == 0 ? 2 * s->size : s->r); \ 1615 \ 1616 SPEED_RESTRICT_COND (s->size >= 2); \ 1617 SPEED_RESTRICT_COND (size1 >= s->size); \ 1618 \ 1619 itch = itchfn (size1, s->size, s->size); \ 1620 TMP_MARK; \ 1621 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1622 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \ 1623 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \ 1624 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 1625 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \ 1626 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); /* alignment? 
*/ \ 1627 \ 1628 /* we don't fill in dividend completely when size1 > s->size */ \ 1629 MPN_COPY (tp, s->xp, s->size); \ 1630 MPN_COPY (tp + size1 - s->size, s->xp, s->size); \ 1631 \ 1632 MPN_COPY (dp, s->yp, s->size); \ 1633 \ 1634 /* normalize the data */ \ 1635 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1636 tp[size1 - 1] = dp[s->size-1] - 1; \ 1637 \ 1638 mpn_invert (ip, dp, s->size, NULL); \ 1639 \ 1640 speed_operand_dst (s, qp, size1 - s->size); \ 1641 speed_operand_dst (s, rp, s->size); \ 1642 speed_operand_src (s, tp, size1); \ 1643 speed_operand_src (s, dp, s->size); \ 1644 speed_operand_src (s, ip, s->size); \ 1645 speed_operand_dst (s, scratch, itch); \ 1646 speed_cache_fill (s); \ 1647 \ 1648 speed_starttime (); \ 1649 i = s->reps; \ 1650 do { \ 1651 function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch); \ 1652 } while (--i != 0); \ 1653 t = speed_endtime (); \ 1654 \ 1655 TMP_FREE; \ 1656 return t; \ 1657 } 1658 1659#define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function) \ 1660 { \ 1661 unsigned i; \ 1662 mp_ptr dp, tp, ap, qp; \ 1663 mp_limb_t inv; \ 1664 double t; \ 1665 TMP_DECL; \ 1666 \ 1667 SPEED_RESTRICT_COND (s->size >= 1); \ 1668 \ 1669 TMP_MARK; \ 1670 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp); \ 1671 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1672 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 1673 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2); \ 1674 \ 1675 MPN_COPY (ap, s->xp, s->size); \ 1676 MPN_COPY (ap+s->size, s->xp, s->size); \ 1677 \ 1678 /* divisor must be odd */ \ 1679 MPN_COPY (dp, s->yp, s->size); \ 1680 dp[0] |= 1; \ 1681 binvert_limb (inv, dp[0]); \ 1682 inv = -inv; \ 1683 \ 1684 speed_operand_src (s, ap, 2*s->size); \ 1685 speed_operand_dst (s, tp, 2*s->size); \ 1686 speed_operand_src (s, dp, s->size); \ 1687 speed_operand_dst (s, qp, s->size); \ 1688 speed_cache_fill (s); \ 1689 \ 1690 speed_starttime (); \ 1691 i = s->reps; \ 1692 do { \ 1693 MPN_COPY (tp, ap, 2*s->size); \ 1694 function (qp, tp, 
2*s->size, dp, s->size, inv); \ 1695 } while (--i != 0); \ 1696 t = speed_endtime (); \ 1697 \ 1698 TMP_FREE; \ 1699 return t; \ 1700 } 1701#define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function) \ 1702 { \ 1703 unsigned i; \ 1704 mp_ptr dp, tp, qp; \ 1705 mp_limb_t inv; \ 1706 double t; \ 1707 TMP_DECL; \ 1708 \ 1709 SPEED_RESTRICT_COND (s->size >= 1); \ 1710 \ 1711 TMP_MARK; \ 1712 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1713 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 1714 SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2); \ 1715 \ 1716 /* divisor must be odd */ \ 1717 MPN_COPY (dp, s->yp, s->size); \ 1718 dp[0] |= 1; \ 1719 binvert_limb (inv, dp[0]); \ 1720 inv = -inv; \ 1721 \ 1722 speed_operand_src (s, s->xp, s->size); \ 1723 speed_operand_dst (s, tp, s->size); \ 1724 speed_operand_src (s, dp, s->size); \ 1725 speed_operand_dst (s, qp, s->size); \ 1726 speed_cache_fill (s); \ 1727 \ 1728 speed_starttime (); \ 1729 i = s->reps; \ 1730 do { \ 1731 MPN_COPY (tp, s->xp, s->size); \ 1732 function (qp, tp, s->size, dp, s->size, inv); \ 1733 } while (--i != 0); \ 1734 t = speed_endtime (); \ 1735 \ 1736 TMP_FREE; \ 1737 return t; \ 1738 } 1739#define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn) \ 1740 { \ 1741 unsigned i; \ 1742 mp_ptr dp, qp, scratch; \ 1743 double t; \ 1744 mp_size_t itch; \ 1745 TMP_DECL; \ 1746 \ 1747 SPEED_RESTRICT_COND (s->size >= 2); \ 1748 \ 1749 itch = itchfn (s->size, s->size); \ 1750 TMP_MARK; \ 1751 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1752 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 1753 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 1754 \ 1755 /* divisor must be odd */ \ 1756 MPN_COPY (dp, s->yp, s->size); \ 1757 dp[0] |= 1; \ 1758 \ 1759 speed_operand_dst (s, qp, s->size); \ 1760 speed_operand_src (s, s->xp, s->size); \ 1761 speed_operand_src (s, dp, s->size); \ 1762 speed_operand_dst (s, scratch, itch); \ 1763 speed_cache_fill (s); \ 1764 \ 1765 speed_starttime (); \ 1766 i = s->reps; \ 1767 
do { \ 1768 function (qp, s->xp, s->size, dp, s->size, scratch); \ 1769 } while (--i != 0); \ 1770 t = speed_endtime (); \ 1771 \ 1772 TMP_FREE; \ 1773 return t; \ 1774 } 1775#define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn) \ 1776 { \ 1777 unsigned i; \ 1778 mp_ptr dp, tp, qp, rp, scratch; \ 1779 double t; \ 1780 mp_size_t itch; \ 1781 TMP_DECL; \ 1782 \ 1783 SPEED_RESTRICT_COND (s->size >= 2); \ 1784 \ 1785 itch = itchfn (2 * s->size, s->size); \ 1786 TMP_MARK; \ 1787 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1788 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 1789 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \ 1790 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 1791 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \ 1792 \ 1793 MPN_COPY (tp, s->xp, s->size); \ 1794 MPN_COPY (tp+s->size, s->xp, s->size); \ 1795 \ 1796 /* divisor must be odd */ \ 1797 MPN_COPY (dp, s->yp, s->size); \ 1798 dp[0] |= 1; \ 1799 \ 1800 speed_operand_dst (s, qp, s->size); \ 1801 speed_operand_dst (s, rp, s->size); \ 1802 speed_operand_src (s, tp, 2 * s->size); \ 1803 speed_operand_src (s, dp, s->size); \ 1804 speed_operand_dst (s, scratch, itch); \ 1805 speed_cache_fill (s); \ 1806 \ 1807 speed_starttime (); \ 1808 i = s->reps; \ 1809 do { \ 1810 function (qp, rp, tp, 2 * s->size, dp, s->size, scratch); \ 1811 } while (--i != 0); \ 1812 t = speed_endtime (); \ 1813 \ 1814 TMP_FREE; \ 1815 return t; \ 1816 } 1817 1818#define SPEED_ROUTINE_MPN_INVERT(function,itchfn) \ 1819 { \ 1820 long i; \ 1821 mp_ptr up, tp, ip; \ 1822 double t; \ 1823 TMP_DECL; \ 1824 \ 1825 SPEED_RESTRICT_COND (s->size >= 1); \ 1826 \ 1827 TMP_MARK; \ 1828 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 1829 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 1830 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 1831 \ 1832 MPN_COPY (up, s->xp, s->size); \ 1833 \ 1834 /* normalize the data */ \ 1835 up[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1836 \ 
1837 speed_operand_src (s, up, s->size); \ 1838 speed_operand_dst (s, tp, s->size); \ 1839 speed_operand_dst (s, ip, s->size); \ 1840 speed_cache_fill (s); \ 1841 \ 1842 speed_starttime (); \ 1843 i = s->reps; \ 1844 do \ 1845 function (ip, up, s->size, tp); \ 1846 while (--i != 0); \ 1847 t = speed_endtime (); \ 1848 \ 1849 TMP_FREE; \ 1850 return t; \ 1851 } 1852 1853#define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn) \ 1854 { \ 1855 long i; \ 1856 mp_ptr up, tp, ip; \ 1857 double t; \ 1858 TMP_DECL; \ 1859 \ 1860 SPEED_RESTRICT_COND (s->size >= 1); \ 1861 \ 1862 TMP_MARK; \ 1863 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 1864 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 1865 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 1866 \ 1867 MPN_COPY (up, s->xp, s->size); \ 1868 \ 1869 /* normalize the data */ \ 1870 up[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1871 \ 1872 speed_operand_src (s, up, s->size); \ 1873 speed_operand_dst (s, tp, s->size); \ 1874 speed_operand_dst (s, ip, s->size); \ 1875 speed_cache_fill (s); \ 1876 \ 1877 speed_starttime (); \ 1878 i = s->reps; \ 1879 do \ 1880 function (ip, up, s->size, tp); \ 1881 while (--i != 0); \ 1882 t = speed_endtime (); \ 1883 \ 1884 TMP_FREE; \ 1885 return t; \ 1886 } 1887 1888#define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn) \ 1889 { \ 1890 long i; \ 1891 mp_ptr up, tp, ip; \ 1892 double t; \ 1893 TMP_DECL; \ 1894 \ 1895 SPEED_RESTRICT_COND (s->size >= 3); \ 1896 \ 1897 TMP_MARK; \ 1898 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 1899 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 1900 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 1901 \ 1902 MPN_COPY (up, s->xp, s->size); \ 1903 \ 1904 /* normalize the data */ \ 1905 up[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1906 \ 1907 speed_operand_src (s, up, s->size); \ 1908 speed_operand_dst (s, tp, s->size); \ 1909 speed_operand_dst (s, ip, s->size); \ 1910 speed_cache_fill (s); \ 1911 \ 1912 speed_starttime (); \ 1913 i 
= s->reps; \ 1914 do \ 1915 function (ip, up, s->size, tp); \ 1916 while (--i != 0); \ 1917 t = speed_endtime (); \ 1918 \ 1919 TMP_FREE; \ 1920 return t; \ 1921 } 1922 1923#define SPEED_ROUTINE_MPN_BINVERT(function,itchfn) \ 1924 { \ 1925 long i; \ 1926 mp_ptr up, tp, ip; \ 1927 double t; \ 1928 TMP_DECL; \ 1929 \ 1930 SPEED_RESTRICT_COND (s->size >= 1); \ 1931 \ 1932 TMP_MARK; \ 1933 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 1934 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 1935 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 1936 \ 1937 MPN_COPY (up, s->xp, s->size); \ 1938 \ 1939 /* normalize the data */ \ 1940 up[0] |= 1; \ 1941 \ 1942 speed_operand_src (s, up, s->size); \ 1943 speed_operand_dst (s, tp, s->size); \ 1944 speed_operand_dst (s, ip, s->size); \ 1945 speed_cache_fill (s); \ 1946 \ 1947 speed_starttime (); \ 1948 i = s->reps; \ 1949 do \ 1950 function (ip, up, s->size, tp); \ 1951 while (--i != 0); \ 1952 t = speed_endtime (); \ 1953 \ 1954 TMP_FREE; \ 1955 return t; \ 1956 } 1957 1958#define SPEED_ROUTINE_REDC_1(function) \ 1959 { \ 1960 unsigned i; \ 1961 mp_ptr cp, mp, tp, ap; \ 1962 mp_limb_t inv; \ 1963 double t; \ 1964 TMP_DECL; \ 1965 \ 1966 SPEED_RESTRICT_COND (s->size >= 1); \ 1967 \ 1968 TMP_MARK; \ 1969 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ 1970 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ 1971 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ 1972 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \ 1973 \ 1974 MPN_COPY (ap, s->xp, s->size); \ 1975 MPN_COPY (ap+s->size, s->xp, s->size); \ 1976 \ 1977 /* modulus must be odd */ \ 1978 MPN_COPY (mp, s->yp, s->size); \ 1979 mp[0] |= 1; \ 1980 binvert_limb (inv, mp[0]); \ 1981 inv = -inv; \ 1982 \ 1983 speed_operand_src (s, ap, 2*s->size+1); \ 1984 speed_operand_dst (s, tp, 2*s->size+1); \ 1985 speed_operand_src (s, mp, s->size); \ 1986 speed_operand_dst (s, cp, s->size); \ 1987 speed_cache_fill (s); \ 1988 \ 1989 speed_starttime 
(); \ 1990 i = s->reps; \ 1991 do { \ 1992 MPN_COPY (tp, ap, 2*s->size); \ 1993 function (cp, tp, mp, s->size, inv); \ 1994 } while (--i != 0); \ 1995 t = speed_endtime (); \ 1996 \ 1997 TMP_FREE; \ 1998 return t; \ 1999 } 2000#define SPEED_ROUTINE_REDC_2(function) \ 2001 { \ 2002 unsigned i; \ 2003 mp_ptr cp, mp, tp, ap; \ 2004 mp_limb_t invp[2]; \ 2005 double t; \ 2006 TMP_DECL; \ 2007 \ 2008 SPEED_RESTRICT_COND (s->size >= 1); \ 2009 \ 2010 TMP_MARK; \ 2011 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ 2012 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ 2013 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ 2014 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \ 2015 \ 2016 MPN_COPY (ap, s->xp, s->size); \ 2017 MPN_COPY (ap+s->size, s->xp, s->size); \ 2018 \ 2019 /* modulus must be odd */ \ 2020 MPN_COPY (mp, s->yp, s->size); \ 2021 mp[0] |= 1; \ 2022 mpn_binvert (invp, mp, 2, tp); \ 2023 invp[0] = -invp[0]; invp[1] = ~invp[1]; \ 2024 \ 2025 speed_operand_src (s, ap, 2*s->size+1); \ 2026 speed_operand_dst (s, tp, 2*s->size+1); \ 2027 speed_operand_src (s, mp, s->size); \ 2028 speed_operand_dst (s, cp, s->size); \ 2029 speed_cache_fill (s); \ 2030 \ 2031 speed_starttime (); \ 2032 i = s->reps; \ 2033 do { \ 2034 MPN_COPY (tp, ap, 2*s->size); \ 2035 function (cp, tp, mp, s->size, invp); \ 2036 } while (--i != 0); \ 2037 t = speed_endtime (); \ 2038 \ 2039 TMP_FREE; \ 2040 return t; \ 2041 } 2042#define SPEED_ROUTINE_REDC_N(function) \ 2043 { \ 2044 unsigned i; \ 2045 mp_ptr cp, mp, tp, ap, invp; \ 2046 double t; \ 2047 TMP_DECL; \ 2048 \ 2049 SPEED_RESTRICT_COND (s->size > 8); \ 2050 \ 2051 TMP_MARK; \ 2052 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ 2053 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ 2054 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ 2055 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \ 2056 SPEED_TMP_ALLOC_LIMBS (invp, s->size, s->align_wp2); /* align? 
*/ \ 2057 \ 2058 MPN_COPY (ap, s->xp, s->size); \ 2059 MPN_COPY (ap+s->size, s->xp, s->size); \ 2060 \ 2061 /* modulus must be odd */ \ 2062 MPN_COPY (mp, s->yp, s->size); \ 2063 mp[0] |= 1; \ 2064 mpn_binvert (invp, mp, s->size, tp); \ 2065 \ 2066 speed_operand_src (s, ap, 2*s->size+1); \ 2067 speed_operand_dst (s, tp, 2*s->size+1); \ 2068 speed_operand_src (s, mp, s->size); \ 2069 speed_operand_dst (s, cp, s->size); \ 2070 speed_cache_fill (s); \ 2071 \ 2072 speed_starttime (); \ 2073 i = s->reps; \ 2074 do { \ 2075 MPN_COPY (tp, ap, 2*s->size); \ 2076 function (cp, tp, mp, s->size, invp); \ 2077 } while (--i != 0); \ 2078 t = speed_endtime (); \ 2079 \ 2080 TMP_FREE; \ 2081 return t; \ 2082 } 2083 2084 2085#define SPEED_ROUTINE_MPN_POPCOUNT(function) \ 2086 { \ 2087 unsigned i; \ 2088 \ 2089 SPEED_RESTRICT_COND (s->size >= 1); \ 2090 \ 2091 speed_operand_src (s, s->xp, s->size); \ 2092 speed_cache_fill (s); \ 2093 \ 2094 speed_starttime (); \ 2095 i = s->reps; \ 2096 do \ 2097 function (s->xp, s->size); \ 2098 while (--i != 0); \ 2099 \ 2100 return speed_endtime (); \ 2101 } 2102 2103#define SPEED_ROUTINE_MPN_HAMDIST(function) \ 2104 { \ 2105 unsigned i; \ 2106 \ 2107 SPEED_RESTRICT_COND (s->size >= 1); \ 2108 \ 2109 speed_operand_src (s, s->xp, s->size); \ 2110 speed_operand_src (s, s->yp, s->size); \ 2111 speed_cache_fill (s); \ 2112 \ 2113 speed_starttime (); \ 2114 i = s->reps; \ 2115 do \ 2116 function (s->xp, s->yp, s->size); \ 2117 while (--i != 0); \ 2118 \ 2119 return speed_endtime (); \ 2120 } 2121 2122 2123#define SPEED_ROUTINE_MPZ_UI(function) \ 2124 { \ 2125 mpz_t z; \ 2126 unsigned i; \ 2127 double t; \ 2128 \ 2129 SPEED_RESTRICT_COND (s->size >= 0); \ 2130 \ 2131 mpz_init (z); \ 2132 \ 2133 speed_starttime (); \ 2134 i = s->reps; \ 2135 do \ 2136 function (z, s->size); \ 2137 while (--i != 0); \ 2138 t = speed_endtime (); \ 2139 \ 2140 mpz_clear (z); \ 2141 return t; \ 2142 } 2143 2144#define SPEED_ROUTINE_MPZ_FAC_UI(function) 
SPEED_ROUTINE_MPZ_UI(function) 2145#define SPEED_ROUTINE_MPZ_FIB_UI(function) SPEED_ROUTINE_MPZ_UI(function) 2146#define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) SPEED_ROUTINE_MPZ_UI(function) 2147 2148 2149#define SPEED_ROUTINE_MPZ_2_UI(function) \ 2150 { \ 2151 mpz_t z, z2; \ 2152 unsigned i; \ 2153 double t; \ 2154 \ 2155 SPEED_RESTRICT_COND (s->size >= 0); \ 2156 \ 2157 mpz_init (z); \ 2158 mpz_init (z2); \ 2159 \ 2160 speed_starttime (); \ 2161 i = s->reps; \ 2162 do \ 2163 function (z, z2, s->size); \ 2164 while (--i != 0); \ 2165 t = speed_endtime (); \ 2166 \ 2167 mpz_clear (z); \ 2168 mpz_clear (z2); \ 2169 return t; \ 2170 } 2171 2172#define SPEED_ROUTINE_MPZ_FIB2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function) 2173#define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function) 2174 2175 2176#define SPEED_ROUTINE_MPN_FIB2_UI(function) \ 2177 { \ 2178 mp_ptr fp, f1p; \ 2179 mp_size_t alloc; \ 2180 unsigned i; \ 2181 double t; \ 2182 TMP_DECL; \ 2183 \ 2184 SPEED_RESTRICT_COND (s->size >= 0); \ 2185 \ 2186 TMP_MARK; \ 2187 alloc = MPN_FIB2_SIZE (s->size); \ 2188 SPEED_TMP_ALLOC_LIMBS (fp, alloc, s->align_xp); \ 2189 SPEED_TMP_ALLOC_LIMBS (f1p, alloc, s->align_yp); \ 2190 \ 2191 speed_starttime (); \ 2192 i = s->reps; \ 2193 do \ 2194 function (fp, f1p, s->size); \ 2195 while (--i != 0); \ 2196 t = speed_endtime (); \ 2197 \ 2198 TMP_FREE; \ 2199 return t; \ 2200 } 2201 2202 2203 2204/* Calculate b^e mod m for random b and m of s->size limbs and random e of 6 2205 limbs. m is forced to odd so that redc can be used. e is limited in 2206 size so the calculation doesn't take too long. 
*/ 2207#define SPEED_ROUTINE_MPZ_POWM(function) \ 2208 { \ 2209 mpz_t r, b, e, m; \ 2210 unsigned i; \ 2211 double t; \ 2212 \ 2213 SPEED_RESTRICT_COND (s->size >= 1); \ 2214 \ 2215 mpz_init (r); \ 2216 mpz_init_set_n (b, s->xp, s->size); \ 2217 mpz_init_set_n (m, s->yp, s->size); \ 2218 mpz_setbit (m, 0); /* force m to odd */ \ 2219 mpz_init_set_n (e, s->xp_block, 6); \ 2220 \ 2221 speed_starttime (); \ 2222 i = s->reps; \ 2223 do \ 2224 function (r, b, e, m); \ 2225 while (--i != 0); \ 2226 t = speed_endtime (); \ 2227 \ 2228 mpz_clear (r); \ 2229 mpz_clear (b); \ 2230 mpz_clear (e); \ 2231 mpz_clear (m); \ 2232 return t; \ 2233 } 2234 2235/* (m-2)^0xAAAAAAAA mod m */ 2236#define SPEED_ROUTINE_MPZ_POWM_UI(function) \ 2237 { \ 2238 mpz_t r, b, m; \ 2239 unsigned long e; \ 2240 unsigned i; \ 2241 double t; \ 2242 \ 2243 SPEED_RESTRICT_COND (s->size >= 1); \ 2244 \ 2245 mpz_init (r); \ 2246 \ 2247 /* force m to odd */ \ 2248 mpz_init (m); \ 2249 mpz_set_n (m, s->xp, s->size); \ 2250 PTR(m)[0] |= 1; \ 2251 \ 2252 e = (~ (unsigned long) 0) / 3; \ 2253 if (s->r != 0) \ 2254 e = s->r; \ 2255 \ 2256 mpz_init_set (b, m); \ 2257 mpz_sub_ui (b, b, 2); \ 2258/* printf ("%X\n", mpz_get_ui(m)); */ \ 2259 i = s->reps; \ 2260 speed_starttime (); \ 2261 do \ 2262 function (r, b, e, m); \ 2263 while (--i != 0); \ 2264 t = speed_endtime (); \ 2265 \ 2266 mpz_clear (r); \ 2267 mpz_clear (b); \ 2268 mpz_clear (m); \ 2269 return t; \ 2270 } 2271 2272 2273#define SPEED_ROUTINE_MPN_ADDSUB_CALL(call) \ 2274 { \ 2275 mp_ptr wp, wp2, xp, yp; \ 2276 unsigned i; \ 2277 double t; \ 2278 TMP_DECL; \ 2279 \ 2280 SPEED_RESTRICT_COND (s->size >= 0); \ 2281 \ 2282 TMP_MARK; \ 2283 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 2284 SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \ 2285 xp = s->xp; \ 2286 yp = s->yp; \ 2287 \ 2288 if (s->r == 0) ; \ 2289 else if (s->r == 1) { xp = wp; } \ 2290 else if (s->r == 2) { yp = wp2; } \ 2291 else if (s->r == 3) { xp = wp; yp = wp2; } \ 2292 else 
if (s->r == 4) { xp = wp2; yp = wp; } \ 2293 else { \ 2294 TMP_FREE; \ 2295 return -1.0; \ 2296 } \ 2297 if (xp != s->xp) MPN_COPY (xp, s->xp, s->size); \ 2298 if (yp != s->yp) MPN_COPY (yp, s->yp, s->size); \ 2299 \ 2300 speed_operand_src (s, xp, s->size); \ 2301 speed_operand_src (s, yp, s->size); \ 2302 speed_operand_dst (s, wp, s->size); \ 2303 speed_operand_dst (s, wp2, s->size); \ 2304 speed_cache_fill (s); \ 2305 \ 2306 speed_starttime (); \ 2307 i = s->reps; \ 2308 do \ 2309 call; \ 2310 while (--i != 0); \ 2311 t = speed_endtime (); \ 2312 \ 2313 TMP_FREE; \ 2314 return t; \ 2315 } 2316 2317#define SPEED_ROUTINE_MPN_ADDSUB_N(function) \ 2318 SPEED_ROUTINE_MPN_ADDSUB_CALL \ 2319 (function (wp, wp2, xp, yp, s->size)); 2320 2321#define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \ 2322 SPEED_ROUTINE_MPN_ADDSUB_CALL \ 2323 (function (wp, wp2, xp, yp, s->size, 0)); 2324 2325 2326/* Doing an Nx1 gcd with the given r. */ 2327#define SPEED_ROUTINE_MPN_GCD_1N(function) \ 2328 { \ 2329 mp_ptr xp; \ 2330 unsigned i; \ 2331 double t; \ 2332 TMP_DECL; \ 2333 \ 2334 SPEED_RESTRICT_COND (s->size >= 1); \ 2335 SPEED_RESTRICT_COND (s->r != 0); \ 2336 \ 2337 TMP_MARK; \ 2338 SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \ 2339 MPN_COPY (xp, s->xp, s->size); \ 2340 xp[0] |= refmpn_zero_p (xp, s->size); \ 2341 \ 2342 speed_operand_src (s, s->xp, s->size); \ 2343 speed_cache_fill (s); \ 2344 \ 2345 speed_starttime (); \ 2346 i = s->reps; \ 2347 do \ 2348 function (xp, s->size, s->r); \ 2349 while (--i != 0); \ 2350 t = speed_endtime (); \ 2351 \ 2352 TMP_FREE; \ 2353 return t; \ 2354 } 2355 2356 2357/* SPEED_BLOCK_SIZE many one GCDs of s->size bits each. 
*/ 2358 2359#define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call) \ 2360 { \ 2361 unsigned i, j; \ 2362 mp_ptr px, py; \ 2363 mp_limb_t x_mask, y_mask; \ 2364 double t; \ 2365 TMP_DECL; \ 2366 \ 2367 SPEED_RESTRICT_COND (s->size >= 1); \ 2368 SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb); \ 2369 \ 2370 TMP_MARK; \ 2371 SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp); \ 2372 SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp); \ 2373 MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE); \ 2374 MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE); \ 2375 \ 2376 x_mask = MP_LIMB_T_LOWBITMASK (s->size); \ 2377 y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size); \ 2378 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \ 2379 { \ 2380 px[i] &= x_mask; px[i] += (px[i] == 0); \ 2381 py[i] &= y_mask; py[i] += (py[i] == 0); \ 2382 setup; \ 2383 } \ 2384 \ 2385 speed_operand_src (s, px, SPEED_BLOCK_SIZE); \ 2386 speed_operand_src (s, py, SPEED_BLOCK_SIZE); \ 2387 speed_cache_fill (s); \ 2388 \ 2389 speed_starttime (); \ 2390 i = s->reps; \ 2391 do \ 2392 { \ 2393 j = SPEED_BLOCK_SIZE; \ 2394 do \ 2395 { \ 2396 call; \ 2397 } \ 2398 while (--j != 0); \ 2399 } \ 2400 while (--i != 0); \ 2401 t = speed_endtime (); \ 2402 \ 2403 TMP_FREE; \ 2404 \ 2405 s->time_divisor = SPEED_BLOCK_SIZE; \ 2406 return t; \ 2407 } 2408 2409#define SPEED_ROUTINE_MPN_GCD_1(function) \ 2410 SPEED_ROUTINE_MPN_GCD_1_CALL( , function (&px[j-1], 1, py[j-1])) 2411 2412#define SPEED_ROUTINE_MPN_JACBASE(function) \ 2413 SPEED_ROUTINE_MPN_GCD_1_CALL \ 2414 ({ \ 2415 /* require x<y, y odd, y!=1 */ \ 2416 px[i] %= py[i]; \ 2417 px[i] |= 1; \ 2418 py[i] |= 1; \ 2419 if (py[i]==1) py[i]=3; \ 2420 }, \ 2421 function (px[j-1], py[j-1], 0)) 2422 2423 2424/* Run some GCDs of s->size limbs each. The number of different data values 2425 is decreased as s->size**2, since GCD is a quadratic algorithm. 
2426 SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT 2427 though, because the plain gcd is about twice as fast as gcdext. */ 2428 2429#define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call) \ 2430 { \ 2431 unsigned i; \ 2432 mp_size_t j, pieces, psize; \ 2433 mp_ptr wp, wp2, xtmp, ytmp, px, py; \ 2434 double t; \ 2435 TMP_DECL; \ 2436 \ 2437 SPEED_RESTRICT_COND (s->size >= 1); \ 2438 \ 2439 TMP_MARK; \ 2440 SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \ 2441 SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \ 2442 SPEED_TMP_ALLOC_LIMBS (wp, s->size+1, s->align_wp); \ 2443 SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2); \ 2444 \ 2445 pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size; \ 2446 pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size); \ 2447 pieces = MAX (pieces, 1); \ 2448 \ 2449 psize = pieces * s->size; \ 2450 px = TMP_ALLOC_LIMBS (psize); \ 2451 py = TMP_ALLOC_LIMBS (psize); \ 2452 MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \ 2453 MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \ 2454 \ 2455 /* Requirements: x >= y, y must be odd, high limbs != 0. \ 2456 No need to ensure random numbers are really great. 
*/ \ 2457 for (j = 0; j < pieces; j++) \ 2458 { \ 2459 mp_ptr x = px + j * s->size; \ 2460 mp_ptr y = py + j * s->size; \ 2461 if (x[s->size - 1] == 0) x[s->size - 1] = 1; \ 2462 if (y[s->size - 1] == 0) y[s->size - 1] = 1; \ 2463 \ 2464 if (x[s->size - 1] < y[s->size - 1]) \ 2465 MP_LIMB_T_SWAP (x[s->size - 1], y[s->size - 1]); \ 2466 else if (x[s->size - 1] == y[s->size - 1]) \ 2467 { \ 2468 x[s->size - 1] = 2; \ 2469 y[s->size - 1] = 1; \ 2470 } \ 2471 y[0] |= 1; \ 2472 } \ 2473 \ 2474 speed_operand_src (s, px, psize); \ 2475 speed_operand_src (s, py, psize); \ 2476 speed_operand_dst (s, xtmp, s->size); \ 2477 speed_operand_dst (s, ytmp, s->size); \ 2478 speed_operand_dst (s, wp, s->size); \ 2479 speed_cache_fill (s); \ 2480 \ 2481 speed_starttime (); \ 2482 i = s->reps; \ 2483 do \ 2484 { \ 2485 j = pieces; \ 2486 do \ 2487 { \ 2488 MPN_COPY (xtmp, px+(j - 1)*s->size, s->size); \ 2489 MPN_COPY (ytmp, py+(j - 1)*s->size, s->size); \ 2490 call; \ 2491 } \ 2492 while (--j != 0); \ 2493 } \ 2494 while (--i != 0); \ 2495 t = speed_endtime (); \ 2496 \ 2497 TMP_FREE; \ 2498 \ 2499 s->time_divisor = pieces; \ 2500 return t; \ 2501 } 2502 2503#define SPEED_ROUTINE_MPN_GCD(function) \ 2504 SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size)) 2505 2506#define SPEED_ROUTINE_MPN_GCDEXT(function) \ 2507 SPEED_ROUTINE_MPN_GCD_CALL \ 2508 (4, { mp_size_t wp2size; \ 2509 function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); }) 2510 2511 2512#define SPEED_ROUTINE_MPN_GCDEXT_ONE(function) \ 2513 { \ 2514 unsigned i; \ 2515 mp_size_t j, pieces, psize, wp2size; \ 2516 mp_ptr wp, wp2, xtmp, ytmp, px, py; \ 2517 double t; \ 2518 TMP_DECL; \ 2519 \ 2520 SPEED_RESTRICT_COND (s->size >= 1); \ 2521 \ 2522 TMP_MARK; \ 2523 \ 2524 SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \ 2525 SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \ 2526 MPN_COPY (xtmp, s->xp, s->size); \ 2527 MPN_COPY (ytmp, s->yp, s->size); \ 2528 \ 2529 SPEED_TMP_ALLOC_LIMBS 
(wp, s->size+1, s->align_wp); \ 2530 SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2); \ 2531 \ 2532 pieces = SPEED_BLOCK_SIZE / 3; \ 2533 psize = 3 * pieces; \ 2534 px = TMP_ALLOC_LIMBS (psize); \ 2535 py = TMP_ALLOC_LIMBS (psize); \ 2536 MPN_COPY (px, s->xp_block, psize); \ 2537 MPN_COPY (py, s->yp_block, psize); \ 2538 \ 2539 /* x must have at least as many bits as y, \ 2540 high limbs must be non-zero */ \ 2541 for (j = 0; j < pieces; j++) \ 2542 { \ 2543 mp_ptr x = px+3*j; \ 2544 mp_ptr y = py+3*j; \ 2545 x[2] += (x[2] == 0); \ 2546 y[2] += (y[2] == 0); \ 2547 if (x[2] < y[2]) \ 2548 MP_LIMB_T_SWAP (x[2], y[2]); \ 2549 } \ 2550 \ 2551 speed_operand_src (s, px, psize); \ 2552 speed_operand_src (s, py, psize); \ 2553 speed_operand_dst (s, xtmp, s->size); \ 2554 speed_operand_dst (s, ytmp, s->size); \ 2555 speed_operand_dst (s, wp, s->size); \ 2556 speed_cache_fill (s); \ 2557 \ 2558 speed_starttime (); \ 2559 i = s->reps; \ 2560 do \ 2561 { \ 2562 mp_ptr x = px; \ 2563 mp_ptr y = py; \ 2564 mp_ptr xth = &xtmp[s->size-3]; \ 2565 mp_ptr yth = &ytmp[s->size-3]; \ 2566 j = pieces; \ 2567 do \ 2568 { \ 2569 xth[0] = x[0], xth[1] = x[1], xth[2] = x[2]; \ 2570 yth[0] = y[0], yth[1] = y[1], yth[2] = y[2]; \ 2571 \ 2572 ytmp[0] |= 1; /* y must be odd, */ \ 2573 \ 2574 function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); \ 2575 \ 2576 x += 3; \ 2577 y += 3; \ 2578 } \ 2579 while (--j != 0); \ 2580 } \ 2581 while (--i != 0); \ 2582 t = speed_endtime (); \ 2583 \ 2584 TMP_FREE; \ 2585 \ 2586 s->time_divisor = pieces; \ 2587 return t; \ 2588 } 2589 2590#define SPEED_ROUTINE_MPZ_JACOBI(function) \ 2591 { \ 2592 mpz_t a, b; \ 2593 unsigned i; \ 2594 mp_size_t j, pieces, psize; \ 2595 mp_ptr px, py; \ 2596 double t; \ 2597 TMP_DECL; \ 2598 \ 2599 TMP_MARK; \ 2600 pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1); \ 2601 pieces = MAX (pieces, 1); \ 2602 s->time_divisor = pieces; \ 2603 \ 2604 psize = pieces * s->size; \ 2605 px = TMP_ALLOC_LIMBS (psize); \ 2606 py = 
TMP_ALLOC_LIMBS (psize); \ 2607 MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \ 2608 MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \ 2609 \ 2610 for (j = 0; j < pieces; j++) \ 2611 { \ 2612 mp_ptr x = px+j*s->size; \ 2613 mp_ptr y = py+j*s->size; \ 2614 \ 2615 /* y odd */ \ 2616 y[0] |= 1; \ 2617 \ 2618 /* high limbs non-zero */ \ 2619 if (x[s->size-1] == 0) x[s->size-1] = 1; \ 2620 if (y[s->size-1] == 0) y[s->size-1] = 1; \ 2621 } \ 2622 \ 2623 SIZ(a) = s->size; \ 2624 SIZ(b) = s->size; \ 2625 \ 2626 speed_operand_src (s, px, psize); \ 2627 speed_operand_src (s, py, psize); \ 2628 speed_cache_fill (s); \ 2629 \ 2630 speed_starttime (); \ 2631 i = s->reps; \ 2632 do \ 2633 { \ 2634 j = pieces; \ 2635 do \ 2636 { \ 2637 PTR(a) = px+(j-1)*s->size; \ 2638 PTR(b) = py+(j-1)*s->size; \ 2639 function (a, b); \ 2640 } \ 2641 while (--j != 0); \ 2642 } \ 2643 while (--i != 0); \ 2644 t = speed_endtime (); \ 2645 \ 2646 TMP_FREE; \ 2647 return t; \ 2648 } 2649 2650#define SPEED_ROUTINE_MPN_DIVREM_2(function) \ 2651 { \ 2652 mp_ptr wp, xp; \ 2653 mp_limb_t yp[2]; \ 2654 unsigned i; \ 2655 double t; \ 2656 TMP_DECL; \ 2657 \ 2658 SPEED_RESTRICT_COND (s->size >= 2); \ 2659 \ 2660 TMP_MARK; \ 2661 SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \ 2662 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 2663 \ 2664 /* source is destroyed */ \ 2665 MPN_COPY (xp, s->xp, s->size); \ 2666 \ 2667 /* divisor must be normalized */ \ 2668 MPN_COPY (yp, s->yp_block, 2); \ 2669 yp[1] |= GMP_NUMB_HIGHBIT; \ 2670 \ 2671 speed_operand_src (s, xp, s->size); \ 2672 speed_operand_src (s, yp, 2); \ 2673 speed_operand_dst (s, wp, s->size); \ 2674 speed_cache_fill (s); \ 2675 \ 2676 speed_starttime (); \ 2677 i = s->reps; \ 2678 do \ 2679 function (wp, 0, xp, s->size, yp); \ 2680 while (--i != 0); \ 2681 t = speed_endtime (); \ 2682 \ 2683 TMP_FREE; \ 2684 return t; \ 2685 } 2686 2687 2688#define SPEED_ROUTINE_MODLIMB_INVERT(function) \ 2689 { \ 2690 unsigned i, j; \ 2691 
mp_ptr xp; \ 2692 mp_limb_t n = 1; \ 2693 double t; \ 2694 \ 2695 xp = s->xp_block-1; \ 2696 \ 2697 speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \ 2698 speed_cache_fill (s); \ 2699 \ 2700 speed_starttime (); \ 2701 i = s->reps; \ 2702 do \ 2703 { \ 2704 j = SPEED_BLOCK_SIZE; \ 2705 do \ 2706 { \ 2707 /* randomized but successively dependent */ \ 2708 n += (xp[j] << 1); \ 2709 \ 2710 function (n, n); \ 2711 } \ 2712 while (--j != 0); \ 2713 } \ 2714 while (--i != 0); \ 2715 t = speed_endtime (); \ 2716 \ 2717 /* make sure the compiler won't optimize away n */ \ 2718 noop_1 (n); \ 2719 \ 2720 s->time_divisor = SPEED_BLOCK_SIZE; \ 2721 return t; \ 2722 } 2723 2724 2725#define SPEED_ROUTINE_MPN_SQRTREM(function) \ 2726 { \ 2727 mp_ptr wp, wp2; \ 2728 unsigned i; \ 2729 double t; \ 2730 TMP_DECL; \ 2731 \ 2732 SPEED_RESTRICT_COND (s->size >= 1); \ 2733 \ 2734 TMP_MARK; \ 2735 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 2736 SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \ 2737 \ 2738 speed_operand_src (s, s->xp, s->size); \ 2739 speed_operand_dst (s, wp, s->size); \ 2740 speed_operand_dst (s, wp2, s->size); \ 2741 speed_cache_fill (s); \ 2742 \ 2743 speed_starttime (); \ 2744 i = s->reps; \ 2745 do \ 2746 function (wp, wp2, s->xp, s->size); \ 2747 while (--i != 0); \ 2748 t = speed_endtime (); \ 2749 \ 2750 TMP_FREE; \ 2751 return t; \ 2752 } 2753 2754#define SPEED_ROUTINE_MPN_ROOTREM(function) \ 2755 { \ 2756 mp_ptr wp, wp2; \ 2757 unsigned i; \ 2758 double t; \ 2759 TMP_DECL; \ 2760 \ 2761 SPEED_RESTRICT_COND (s->size >= 1); \ 2762 \ 2763 TMP_MARK; \ 2764 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 2765 SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \ 2766 \ 2767 speed_operand_src (s, s->xp, s->size); \ 2768 speed_operand_dst (s, wp, s->size); \ 2769 speed_operand_dst (s, wp2, s->size); \ 2770 speed_cache_fill (s); \ 2771 \ 2772 speed_starttime (); \ 2773 i = s->reps; \ 2774 do \ 2775 function (wp, wp2, s->xp, s->size, s->r); \ 2776 
while (--i != 0); \ 2777 t = speed_endtime (); \ 2778 \ 2779 TMP_FREE; \ 2780 return t; \ 2781 } 2782 2783 2784/* s->size controls the number of limbs in the input, s->r is the base, or 2785 decimal by default. */ 2786#define SPEED_ROUTINE_MPN_GET_STR(function) \ 2787 { \ 2788 unsigned char *wp; \ 2789 mp_size_t wn; \ 2790 mp_ptr xp; \ 2791 int base; \ 2792 unsigned i; \ 2793 double t; \ 2794 TMP_DECL; \ 2795 \ 2796 SPEED_RESTRICT_COND (s->size >= 1); \ 2797 \ 2798 base = s->r == 0 ? 10 : s->r; \ 2799 SPEED_RESTRICT_COND (base >= 2 && base <= 256); \ 2800 \ 2801 TMP_MARK; \ 2802 SPEED_TMP_ALLOC_LIMBS (xp, s->size + 1, s->align_xp); \ 2803 \ 2804 MPN_SIZEINBASE (wn, s->xp, s->size, base); \ 2805 wp = TMP_ALLOC (wn); \ 2806 \ 2807 /* use this during development to guard against overflowing wp */ \ 2808 /* \ 2809 MPN_COPY (xp, s->xp, s->size); \ 2810 ASSERT_ALWAYS (mpn_get_str (wp, base, xp, s->size) <= wn); \ 2811 */ \ 2812 \ 2813 speed_operand_src (s, s->xp, s->size); \ 2814 speed_operand_dst (s, xp, s->size); \ 2815 speed_operand_dst (s, (mp_ptr) wp, wn/BYTES_PER_MP_LIMB); \ 2816 speed_cache_fill (s); \ 2817 \ 2818 speed_starttime (); \ 2819 i = s->reps; \ 2820 do \ 2821 { \ 2822 MPN_COPY (xp, s->xp, s->size); \ 2823 function (wp, base, xp, s->size); \ 2824 } \ 2825 while (--i != 0); \ 2826 t = speed_endtime (); \ 2827 \ 2828 TMP_FREE; \ 2829 return t; \ 2830 } 2831 2832/* s->size controls the number of digits in the input, s->r is the base, or 2833 decimal by default. */ 2834#define SPEED_ROUTINE_MPN_SET_STR_CALL(call) \ 2835 { \ 2836 unsigned char *xp; \ 2837 mp_ptr wp; \ 2838 mp_size_t wn; \ 2839 unsigned i; \ 2840 int base; \ 2841 double t; \ 2842 TMP_DECL; \ 2843 \ 2844 SPEED_RESTRICT_COND (s->size >= 1); \ 2845 \ 2846 base = s->r == 0 ? 
10 : s->r; \ 2847 SPEED_RESTRICT_COND (base >= 2 && base <= 256); \ 2848 \ 2849 TMP_MARK; \ 2850 \ 2851 xp = TMP_ALLOC (s->size); \ 2852 for (i = 0; i < s->size; i++) \ 2853 xp[i] = s->xp[i] % base; \ 2854 \ 2855 wn = ((mp_size_t) (s->size / mp_bases[base].chars_per_bit_exactly)) \ 2856 / GMP_LIMB_BITS + 2; \ 2857 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \ 2858 \ 2859 /* use this during development to check wn is big enough */ \ 2860 /* \ 2861 ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn); \ 2862 */ \ 2863 \ 2864 speed_operand_src (s, (mp_ptr) xp, s->size/BYTES_PER_MP_LIMB); \ 2865 speed_operand_dst (s, wp, wn); \ 2866 speed_cache_fill (s); \ 2867 \ 2868 speed_starttime (); \ 2869 i = s->reps; \ 2870 do \ 2871 call; \ 2872 while (--i != 0); \ 2873 t = speed_endtime (); \ 2874 \ 2875 TMP_FREE; \ 2876 return t; \ 2877 } 2878 2879 2880/* Run an accel gcd find_a() function over various data values. A set of 2881 values is used in case some run particularly fast or slow. The size 2882 parameter is ignored, the amount of data tested is fixed. */ 2883 2884#define SPEED_ROUTINE_MPN_GCD_FINDA(function) \ 2885 { \ 2886 unsigned i, j; \ 2887 mp_limb_t cp[SPEED_BLOCK_SIZE][2]; \ 2888 double t; \ 2889 TMP_DECL; \ 2890 \ 2891 TMP_MARK; \ 2892 \ 2893 /* low must be odd, high must be non-zero */ \ 2894 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \ 2895 { \ 2896 cp[i][0] = s->xp_block[i] | 1; \ 2897 cp[i][1] = s->yp_block[i] + (s->yp_block[i] == 0); \ 2898 } \ 2899 \ 2900 speed_operand_src (s, &cp[0][0], 2*SPEED_BLOCK_SIZE); \ 2901 speed_cache_fill (s); \ 2902 \ 2903 speed_starttime (); \ 2904 i = s->reps; \ 2905 do \ 2906 { \ 2907 j = SPEED_BLOCK_SIZE; \ 2908 do \ 2909 { \ 2910 function (cp[j-1]); \ 2911 } \ 2912 while (--j != 0); \ 2913 } \ 2914 while (--i != 0); \ 2915 t = speed_endtime (); \ 2916 \ 2917 TMP_FREE; \ 2918 \ 2919 s->time_divisor = SPEED_BLOCK_SIZE; \ 2920 return t; \ 2921 } 2922 2923 2924/* "call" should do "count_foo_zeros(c,n)". 
2925 Give leading=1 if foo is leading zeros, leading=0 for trailing. 2926 Give zero=1 if n=0 is allowed in the call, zero=0 if not. */ 2927 2928#define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero) \ 2929 { \ 2930 mp_ptr xp; \ 2931 int i, c; \ 2932 unsigned j; \ 2933 mp_limb_t n; \ 2934 double t; \ 2935 TMP_DECL; \ 2936 \ 2937 TMP_MARK; \ 2938 SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp); \ 2939 \ 2940 if (! speed_routine_count_zeros_setup (s, xp, leading, zero)) \ 2941 return -1.0; \ 2942 speed_operand_src (s, xp, SPEED_BLOCK_SIZE); \ 2943 speed_cache_fill (s); \ 2944 \ 2945 c = 0; \ 2946 speed_starttime (); \ 2947 j = s->reps; \ 2948 do { \ 2949 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \ 2950 { \ 2951 n = xp[i]; \ 2952 n ^= c; \ 2953 2954#define SPEED_ROUTINE_COUNT_ZEROS_B() \ 2955 } \ 2956 } while (--j != 0); \ 2957 t = speed_endtime (); \ 2958 \ 2959 /* don't let c go dead */ \ 2960 noop_1 (c); \ 2961 \ 2962 s->time_divisor = SPEED_BLOCK_SIZE; \ 2963 \ 2964 TMP_FREE; \ 2965 return t; \ 2966 } \ 2967 2968#define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero) \ 2969 do { \ 2970 SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero); \ 2971 call; \ 2972 SPEED_ROUTINE_COUNT_ZEROS_B (); \ 2973 } while (0) \ 2974 2975#define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call,zero) \ 2976 SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero) 2977#define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun) \ 2978 SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0) 2979 2980#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero) \ 2981 SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero) 2982#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(call) \ 2983 SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0) 2984 2985 2986#define SPEED_ROUTINE_INVERT_LIMB_CALL(call) \ 2987 { \ 2988 unsigned i, j; \ 2989 mp_limb_t d, dinv=0; \ 2990 mp_ptr xp = s->xp_block - 1; \ 2991 \ 2992 s->time_divisor = SPEED_BLOCK_SIZE; \ 2993 \ 2994 speed_starttime (); \ 2995 i = s->reps; \ 2996 do \ 2997 { \ 2998 j = SPEED_BLOCK_SIZE; \ 2999 do \ 3000 { \ 3001 
d = dinv ^ xp[j]; \ 3002 d |= GMP_LIMB_HIGHBIT; \ 3003 do { call; } while (0); \ 3004 } \ 3005 while (--j != 0); \ 3006 } \ 3007 while (--i != 0); \ 3008 \ 3009 /* don't let the compiler optimize everything away */ \ 3010 noop_1 (dinv); \ 3011 \ 3012 return speed_endtime(); \ 3013 } 3014 3015 3016#endif 3017 3018 3019#define SPEED_ROUTINE_MPN_BACK_TO_BACK(function) \ 3020 { \ 3021 unsigned i; \ 3022 speed_starttime (); \ 3023 i = s->reps; \ 3024 do \ 3025 function (); \ 3026 while (--i != 0); \ 3027 return speed_endtime (); \ 3028 } 3029 3030 3031#define SPEED_ROUTINE_MPN_ZERO_CALL(call) \ 3032 { \ 3033 mp_ptr wp; \ 3034 unsigned i; \ 3035 double t; \ 3036 TMP_DECL; \ 3037 \ 3038 SPEED_RESTRICT_COND (s->size >= 0); \ 3039 \ 3040 TMP_MARK; \ 3041 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3042 speed_operand_dst (s, wp, s->size); \ 3043 speed_cache_fill (s); \ 3044 \ 3045 speed_starttime (); \ 3046 i = s->reps; \ 3047 do \ 3048 call; \ 3049 while (--i != 0); \ 3050 t = speed_endtime (); \ 3051 \ 3052 TMP_FREE; \ 3053 return t; \ 3054 } 3055 3056#define SPEED_ROUTINE_MPN_ZERO(function) \ 3057 SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size)) 3058