1/* Shared speed subroutines. 2 3Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2008, 2009, 2010 4Free Software Foundation, Inc. 5 6This file is part of the GNU MP Library. 7 8The GNU MP Library is free software; you can redistribute it and/or modify 9it under the terms of the GNU Lesser General Public License as published by 10the Free Software Foundation; either version 3 of the License, or (at your 11option) any later version. 12 13The GNU MP Library is distributed in the hope that it will be useful, but 14WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 15or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 16License for more details. 17 18You should have received a copy of the GNU Lesser General Public License 19along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 20 21#define __GMP_NO_ATTRIBUTE_CONST_PURE 22 23#include <errno.h> 24#include <fcntl.h> 25#include <math.h> 26#include <stdio.h> 27#include <stdlib.h> /* for qsort */ 28#include <string.h> 29#include <unistd.h> 30#if 0 31#include <sys/ioctl.h> 32#endif 33 34#include "gmp.h" 35#include "gmp-impl.h" 36#include "longlong.h" 37 38#include "tests.h" 39#include "speed.h" 40 41 42int speed_option_addrs = 0; 43int speed_option_verbose = 0; 44 45 46/* Provide __clz_tab even if it's not required, for the benefit of new code 47 being tested with many.pl. 
*/ 48#ifndef COUNT_LEADING_ZEROS_NEED_CLZ_TAB 49#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 50#include "mp_clz_tab.c" 51#undef COUNT_LEADING_ZEROS_NEED_CLZ_TAB 52#endif 53 54 55void 56pentium_wbinvd(void) 57{ 58#if 0 59 { 60 static int fd = -2; 61 62 if (fd == -2) 63 { 64 fd = open ("/dev/wbinvd", O_RDWR); 65 if (fd == -1) 66 perror ("open /dev/wbinvd"); 67 } 68 69 if (fd != -1) 70 ioctl (fd, 0, 0); 71 } 72#endif 73 74#if 0 75#define WBINVDSIZE 1024*1024*2 76 { 77 static char *p = NULL; 78 int i, sum; 79 80 if (p == NULL) 81 p = malloc (WBINVDSIZE); 82 83#if 0 84 for (i = 0; i < WBINVDSIZE; i++) 85 p[i] = i & 0xFF; 86#endif 87 88 sum = 0; 89 for (i = 0; i < WBINVDSIZE; i++) 90 sum += p[i]; 91 92 mpn_cache_fill_dummy (sum); 93 } 94#endif 95} 96 97 98int 99double_cmp_ptr (const double *p, const double *q) 100{ 101 if (*p > *q) return 1; 102 if (*p < *q) return -1; 103 return 0; 104} 105 106 107/* Measure the speed of a given routine. 108 109 The routine is run with enough repetitions to make it take at least 110 speed_precision * speed_unittime. This aims to minimize the effects of a 111 limited accuracy time base and the overhead of the measuring itself. 112 113 Measurements are made looking for 4 results within TOLERANCE of each 114 other (or 3 for routines taking longer than 2 seconds). This aims to get 115 an accurate reading even if some runs are bloated by interrupts or task 116 switches or whatever. 117 118 The given (*fun)() is expected to run its function "s->reps" many times 119 and return the total elapsed time measured using speed_starttime() and 120 speed_endtime(). If the function doesn't support the given s->size or 121 s->r, -1.0 should be returned. See the various base routines below. 
*/ 122 123double 124speed_measure (double (*fun) __GMP_PROTO ((struct speed_params *s)), 125 struct speed_params *s) 126{ 127#define TOLERANCE 1.005 /* 0.5% */ 128 const int max_zeros = 10; 129 130 struct speed_params s_dummy; 131 int i, j, e; 132 double t[30]; 133 double t_unsorted[30]; 134 double reps_d; 135 int zeros = 0; 136 137 /* Use dummy parameters if caller doesn't provide any. Only a few special 138 "fun"s will cope with this, speed_noop() is one. */ 139 if (s == NULL) 140 { 141 memset (&s_dummy, '\0', sizeof (s_dummy)); 142 s = &s_dummy; 143 } 144 145 s->reps = 1; 146 s->time_divisor = 1.0; 147 for (i = 0; i < numberof (t); i++) 148 { 149 for (;;) 150 { 151 s->src_num = 0; 152 s->dst_num = 0; 153 154 t[i] = (*fun) (s); 155 156 if (speed_option_verbose >= 3) 157 gmp_printf("size=%ld reps=%u r=%Md attempt=%d %.9f\n", 158 (long) s->size, s->reps, s->r, i, t[i]); 159 160 if (t[i] == 0.0) 161 { 162 zeros++; 163 if (zeros > max_zeros) 164 { 165 fprintf (stderr, "Fatal error: too many (%d) failed measurements (0.0)\n", zeros); 166 abort (); 167 } 168 continue; 169 } 170 171 if (t[i] == -1.0) 172 return -1.0; 173 174 if (t[i] >= speed_unittime * speed_precision) 175 break; 176 177 /* go to a value of reps to make t[i] >= precision */ 178 reps_d = ceil (1.1 * s->reps 179 * speed_unittime * speed_precision 180 / MAX (t[i], speed_unittime)); 181 if (reps_d > 2e9 || reps_d < 1.0) 182 { 183 fprintf (stderr, "Fatal error: new reps bad: %.2f\n", reps_d); 184 fprintf (stderr, " (old reps %u, unittime %.4g, precision %d, t[i] %.4g)\n", 185 s->reps, speed_unittime, speed_precision, t[i]); 186 abort (); 187 } 188 s->reps = (unsigned) reps_d; 189 } 190 t[i] /= s->reps; 191 t_unsorted[i] = t[i]; 192 193 if (speed_precision == 0) 194 return t[i]; 195 196 /* require 3 values within TOLERANCE when >= 2 secs, 4 when below */ 197 if (t[0] >= 2.0) 198 e = 3; 199 else 200 e = 4; 201 202 /* Look for e many t[]'s within TOLERANCE of each other to consider a 203 valid measurement. 
Return smallest among them. */ 204 if (i >= e) 205 { 206 qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr); 207 for (j = e-1; j < i; j++) 208 if (t[j] <= t[j-e+1] * TOLERANCE) 209 return t[j-e+1] / s->time_divisor; 210 } 211 } 212 213 fprintf (stderr, "speed_measure() could not get %d results within %.1f%%\n", 214 e, (TOLERANCE-1.0)*100.0); 215 fprintf (stderr, " unsorted sorted\n"); 216 fprintf (stderr, " %.12f %.12f is about 0.5%%\n", 217 t_unsorted[0]*(TOLERANCE-1.0), t[0]*(TOLERANCE-1.0)); 218 for (i = 0; i < numberof (t); i++) 219 fprintf (stderr, " %.09f %.09f\n", t_unsorted[i], t[i]); 220 221 return -1.0; 222} 223 224 225/* Read all of ptr,size to get it into the CPU memory cache. 226 227 A call to mpn_cache_fill_dummy() is used to make sure the compiler 228 doesn't optimize away the whole loop. Using "volatile mp_limb_t sum" 229 would work too, but the function call means we don't rely on every 230 compiler actually implementing volatile properly. 231 232 mpn_cache_fill_dummy() is in a separate source file to stop gcc thinking 233 it can inline it. 
*/ 234 235void 236mpn_cache_fill (mp_srcptr ptr, mp_size_t size) 237{ 238 mp_limb_t sum = 0; 239 mp_size_t i; 240 241 for (i = 0; i < size; i++) 242 sum += ptr[i]; 243 244 mpn_cache_fill_dummy(sum); 245} 246 247 248void 249mpn_cache_fill_write (mp_ptr ptr, mp_size_t size) 250{ 251 mpn_cache_fill (ptr, size); 252 253#if 0 254 mpn_random (ptr, size); 255#endif 256 257#if 0 258 mp_size_t i; 259 260 for (i = 0; i < size; i++) 261 ptr[i] = i; 262#endif 263} 264 265 266void 267speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size) 268{ 269 if (s->src_num >= numberof (s->src)) 270 { 271 fprintf (stderr, "speed_operand_src: no room left in s->src[]\n"); 272 abort (); 273 } 274 s->src[s->src_num].ptr = ptr; 275 s->src[s->src_num].size = size; 276 s->src_num++; 277} 278 279 280void 281speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size) 282{ 283 if (s->dst_num >= numberof (s->dst)) 284 { 285 fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n"); 286 abort (); 287 } 288 s->dst[s->dst_num].ptr = ptr; 289 s->dst[s->dst_num].size = size; 290 s->dst_num++; 291} 292 293 294void 295speed_cache_fill (struct speed_params *s) 296{ 297 static struct speed_params prev; 298 int i; 299 300 /* FIXME: need a better way to get the format string for a pointer */ 301 302 if (speed_option_addrs) 303 { 304 int different; 305 306 different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num); 307 for (i = 0; i < s->dst_num; i++) 308 different |= (s->dst[i].ptr != prev.dst[i].ptr); 309 for (i = 0; i < s->src_num; i++) 310 different |= (s->src[i].ptr != prev.src[i].ptr); 311 312 if (different) 313 { 314 if (s->dst_num != 0) 315 { 316 printf ("dst"); 317 for (i = 0; i < s->dst_num; i++) 318 printf (" %08lX", (unsigned long) s->dst[i].ptr); 319 printf (" "); 320 } 321 322 if (s->src_num != 0) 323 { 324 printf ("src"); 325 for (i = 0; i < s->src_num; i++) 326 printf (" %08lX", (unsigned long) s->src[i].ptr); 327 printf (" "); 328 } 329 printf (" 
(cf sp approx %08lX)\n", (unsigned long) &different); 330 331 } 332 333 memcpy (&prev, s, sizeof(prev)); 334 } 335 336 switch (s->cache) { 337 case 0: 338 for (i = 0; i < s->dst_num; i++) 339 mpn_cache_fill_write (s->dst[i].ptr, s->dst[i].size); 340 for (i = 0; i < s->src_num; i++) 341 mpn_cache_fill (s->src[i].ptr, s->src[i].size); 342 break; 343 case 1: 344 pentium_wbinvd(); 345 break; 346 } 347} 348 349 350/* Miscellanous options accepted by tune and speed programs under -o. */ 351 352void 353speed_option_set (const char *s) 354{ 355 int n; 356 357 if (strcmp (s, "addrs") == 0) 358 { 359 speed_option_addrs = 1; 360 } 361 else if (strcmp (s, "verbose") == 0) 362 { 363 speed_option_verbose++; 364 } 365 else if (sscanf (s, "verbose=%d", &n) == 1) 366 { 367 speed_option_verbose = n; 368 } 369 else 370 { 371 printf ("Unrecognised -o option: %s\n", s); 372 exit (1); 373 } 374} 375 376 377/* The following are basic speed running routines for various gmp functions. 378 Many are very similar and use speed.h macros. 379 380 Each routine allocates it's own destination space for the result of the 381 function, because only it can know what the function needs. 382 383 speed_starttime() and speed_endtime() are put tight around the code to be 384 measured. Any setups are done outside the timed portion. 385 386 Each routine is responsible for its own cache priming. 387 speed_cache_fill() is a good way to do this, see examples in speed.h. 388 One cache priming possibility, for CPUs with write-allocate cache, and 389 functions that don't take too long, is to do one dummy call before timing 390 so as to cache everything that gets used. But speed_measure() runs a 391 routine at least twice and will take the smaller time, so this might not 392 be necessary. 393 394 Data alignment will be important, for source, destination and temporary 395 workspace. A routine can align its destination and workspace. Programs 396 using the routines will ensure s->xp and s->yp are aligned. 
Aligning 397 onto a CACHE_LINE_SIZE boundary is suggested. s->align_wp and 398 s->align_wp2 should be respected where it makes sense to do so. 399 SPEED_TMP_ALLOC_LIMBS is a good way to do this. 400 401 A loop of the following form can be expected to turn into good assembler 402 code on most CPUs, thereby minimizing overhead in the measurement. It 403 can always be assumed s->reps >= 1. 404 405 i = s->reps 406 do 407 foo(); 408 while (--i != 0); 409 410 Additional parameters might be added to "struct speed_params" in the 411 future. Routines should ignore anything they don't use. 412 413 s->size can be used creatively, and s->xp and s->yp can be ignored. For 414 example, speed_mpz_fac_ui() uses s->size as n for the factorial. s->r is 415 just a user-supplied parameter. speed_mpn_lshift() uses it as a shift, 416 speed_mpn_mul_1() uses it as a multiplier. */ 417 418 419/* MPN_COPY etc can be macros, so the _CALL forms are necessary */ 420double 421speed_MPN_COPY (struct speed_params *s) 422{ 423 SPEED_ROUTINE_MPN_COPY (MPN_COPY); 424} 425double 426speed_MPN_COPY_INCR (struct speed_params *s) 427{ 428 SPEED_ROUTINE_MPN_COPY (MPN_COPY_INCR); 429} 430double 431speed_MPN_COPY_DECR (struct speed_params *s) 432{ 433 SPEED_ROUTINE_MPN_COPY (MPN_COPY_DECR); 434} 435#if HAVE_NATIVE_mpn_copyi 436double 437speed_mpn_copyi (struct speed_params *s) 438{ 439 SPEED_ROUTINE_MPN_COPY (mpn_copyi); 440} 441#endif 442#if HAVE_NATIVE_mpn_copyd 443double 444speed_mpn_copyd (struct speed_params *s) 445{ 446 SPEED_ROUTINE_MPN_COPY (mpn_copyd); 447} 448#endif 449double 450speed_memcpy (struct speed_params *s) 451{ 452 SPEED_ROUTINE_MPN_COPY_BYTES (memcpy); 453} 454double 455speed_mpn_com (struct speed_params *s) 456{ 457 SPEED_ROUTINE_MPN_COPY (mpn_com); 458} 459 460 461double 462speed_mpn_addmul_1 (struct speed_params *s) 463{ 464 SPEED_ROUTINE_MPN_UNARY_1 (mpn_addmul_1); 465} 466double 467speed_mpn_submul_1 (struct speed_params *s) 468{ 469 SPEED_ROUTINE_MPN_UNARY_1 (mpn_submul_1); 470} 
471 472#if HAVE_NATIVE_mpn_addmul_2 473double 474speed_mpn_addmul_2 (struct speed_params *s) 475{ 476 SPEED_ROUTINE_MPN_UNARY_2 (mpn_addmul_2); 477} 478#endif 479#if HAVE_NATIVE_mpn_addmul_3 480double 481speed_mpn_addmul_3 (struct speed_params *s) 482{ 483 SPEED_ROUTINE_MPN_UNARY_3 (mpn_addmul_3); 484} 485#endif 486#if HAVE_NATIVE_mpn_addmul_4 487double 488speed_mpn_addmul_4 (struct speed_params *s) 489{ 490 SPEED_ROUTINE_MPN_UNARY_4 (mpn_addmul_4); 491} 492#endif 493#if HAVE_NATIVE_mpn_addmul_5 494double 495speed_mpn_addmul_5 (struct speed_params *s) 496{ 497 SPEED_ROUTINE_MPN_UNARY_5 (mpn_addmul_5); 498} 499#endif 500#if HAVE_NATIVE_mpn_addmul_6 501double 502speed_mpn_addmul_6 (struct speed_params *s) 503{ 504 SPEED_ROUTINE_MPN_UNARY_6 (mpn_addmul_6); 505} 506#endif 507#if HAVE_NATIVE_mpn_addmul_7 508double 509speed_mpn_addmul_7 (struct speed_params *s) 510{ 511 SPEED_ROUTINE_MPN_UNARY_7 (mpn_addmul_7); 512} 513#endif 514#if HAVE_NATIVE_mpn_addmul_8 515double 516speed_mpn_addmul_8 (struct speed_params *s) 517{ 518 SPEED_ROUTINE_MPN_UNARY_8 (mpn_addmul_8); 519} 520#endif 521 522double 523speed_mpn_mul_1 (struct speed_params *s) 524{ 525 SPEED_ROUTINE_MPN_UNARY_1 (mpn_mul_1); 526} 527double 528speed_mpn_mul_1_inplace (struct speed_params *s) 529{ 530 SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_mul_1); 531} 532 533#if HAVE_NATIVE_mpn_mul_2 534double 535speed_mpn_mul_2 (struct speed_params *s) 536{ 537 SPEED_ROUTINE_MPN_UNARY_2 (mpn_mul_2); 538} 539#endif 540#if HAVE_NATIVE_mpn_mul_3 541double 542speed_mpn_mul_3 (struct speed_params *s) 543{ 544 SPEED_ROUTINE_MPN_UNARY_3 (mpn_mul_3); 545} 546#endif 547#if HAVE_NATIVE_mpn_mul_4 548double 549speed_mpn_mul_4 (struct speed_params *s) 550{ 551 SPEED_ROUTINE_MPN_UNARY_4 (mpn_mul_4); 552} 553#endif 554 555 556double 557speed_mpn_lshift (struct speed_params *s) 558{ 559 SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshift); 560} 561double 562speed_mpn_lshiftc (struct speed_params *s) 563{ 564 SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshiftc); 565} 
566double 567speed_mpn_rshift (struct speed_params *s) 568{ 569 SPEED_ROUTINE_MPN_UNARY_1 (mpn_rshift); 570} 571 572 573/* The carry-in variants (if available) are good for measuring because they 574 won't skip a division if high<divisor. Alternately, use -1 as a divisor 575 with the plain _1 forms. */ 576double 577speed_mpn_divrem_1 (struct speed_params *s) 578{ 579 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1); 580} 581double 582speed_mpn_divrem_1f (struct speed_params *s) 583{ 584 SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1); 585} 586#if HAVE_NATIVE_mpn_divrem_1c 587double 588speed_mpn_divrem_1c (struct speed_params *s) 589{ 590 SPEED_ROUTINE_MPN_DIVREM_1C (mpn_divrem_1c); 591} 592double 593speed_mpn_divrem_1cf (struct speed_params *s) 594{ 595 SPEED_ROUTINE_MPN_DIVREM_1CF (mpn_divrem_1c); 596} 597#endif 598 599double 600speed_mpn_divrem_1_div (struct speed_params *s) 601{ 602 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_div); 603} 604double 605speed_mpn_divrem_1f_div (struct speed_params *s) 606{ 607 SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_div); 608} 609double 610speed_mpn_divrem_1_inv (struct speed_params *s) 611{ 612 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_inv); 613} 614double 615speed_mpn_divrem_1f_inv (struct speed_params *s) 616{ 617 SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_inv); 618} 619double 620speed_mpn_mod_1_div (struct speed_params *s) 621{ 622 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_div); 623} 624double 625speed_mpn_mod_1_inv (struct speed_params *s) 626{ 627 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_inv); 628} 629 630double 631speed_mpn_preinv_divrem_1 (struct speed_params *s) 632{ 633 SPEED_ROUTINE_MPN_PREINV_DIVREM_1 (mpn_preinv_divrem_1); 634} 635double 636speed_mpn_preinv_divrem_1f (struct speed_params *s) 637{ 638 SPEED_ROUTINE_MPN_PREINV_DIVREM_1F (mpn_preinv_divrem_1); 639} 640 641#if GMP_NUMB_BITS % 4 == 0 642double 643speed_mpn_mod_34lsub1 (struct speed_params *s) 644{ 645 SPEED_ROUTINE_MPN_MOD_34LSUB1 (mpn_mod_34lsub1); 646} 647#endif 648 649double 
650speed_mpn_divrem_2 (struct speed_params *s) 651{ 652 SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2); 653} 654double 655speed_mpn_divrem_2_div (struct speed_params *s) 656{ 657 SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_div); 658} 659double 660speed_mpn_divrem_2_inv (struct speed_params *s) 661{ 662 SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_inv); 663} 664 665double 666speed_mpn_mod_1 (struct speed_params *s) 667{ 668 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1); 669} 670#if HAVE_NATIVE_mpn_mod_1c 671double 672speed_mpn_mod_1c (struct speed_params *s) 673{ 674 SPEED_ROUTINE_MPN_MOD_1C (mpn_mod_1c); 675} 676#endif 677double 678speed_mpn_preinv_mod_1 (struct speed_params *s) 679{ 680 SPEED_ROUTINE_MPN_PREINV_MOD_1 (mpn_preinv_mod_1); 681} 682double 683speed_mpn_mod_1_1 (struct speed_params *s) 684{ 685 SPEED_ROUTINE_MPN_MOD_1_1 (mpn_mod_1_1p,mpn_mod_1_1p_cps); 686} 687double 688speed_mpn_mod_1_2 (struct speed_params *s) 689{ 690 SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_2p,mpn_mod_1s_2p_cps,2); 691} 692double 693speed_mpn_mod_1_3 (struct speed_params *s) 694{ 695 SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_3p,mpn_mod_1s_3p_cps,3); 696} 697double 698speed_mpn_mod_1_4 (struct speed_params *s) 699{ 700 SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_4p,mpn_mod_1s_4p_cps,4); 701} 702 703double 704speed_mpn_divexact_1 (struct speed_params *s) 705{ 706 SPEED_ROUTINE_MPN_DIVEXACT_1 (mpn_divexact_1); 707} 708 709double 710speed_mpn_divexact_by3 (struct speed_params *s) 711{ 712 SPEED_ROUTINE_MPN_COPY (mpn_divexact_by3); 713} 714 715double 716speed_mpn_bdiv_dbm1c (struct speed_params *s) 717{ 718 SPEED_ROUTINE_MPN_BDIV_DBM1C (mpn_bdiv_dbm1c); 719} 720 721double 722speed_mpn_bdiv_q_1 (struct speed_params *s) 723{ 724 SPEED_ROUTINE_MPN_BDIV_Q_1 (mpn_bdiv_q_1); 725} 726 727double 728speed_mpn_pi1_bdiv_q_1 (struct speed_params *s) 729{ 730 SPEED_ROUTINE_MPN_PI1_BDIV_Q_1 (mpn_pi1_bdiv_q_1); 731} 732 733#if HAVE_NATIVE_mpn_modexact_1_odd 734double 735speed_mpn_modexact_1_odd (struct speed_params *s) 736{ 737 
SPEED_ROUTINE_MPN_MODEXACT_1_ODD (mpn_modexact_1_odd); 738} 739#endif 740 741double 742speed_mpn_modexact_1c_odd (struct speed_params *s) 743{ 744 SPEED_ROUTINE_MPN_MODEXACT_1C_ODD (mpn_modexact_1c_odd); 745} 746 747double 748speed_mpz_mod (struct speed_params *s) 749{ 750 SPEED_ROUTINE_MPZ_MOD (mpz_mod); 751} 752 753double 754speed_mpn_sbpi1_div_qr (struct speed_params *s) 755{ 756 SPEED_ROUTINE_MPN_PI1_DIV (mpn_sbpi1_div_qr, inv.inv32, 2,0); 757} 758double 759speed_mpn_dcpi1_div_qr (struct speed_params *s) 760{ 761 SPEED_ROUTINE_MPN_PI1_DIV (mpn_dcpi1_div_qr, &inv, 6,3); 762} 763double 764speed_mpn_sbpi1_divappr_q (struct speed_params *s) 765{ 766 SPEED_ROUTINE_MPN_PI1_DIV (mpn_sbpi1_divappr_q, inv.inv32, 2,0); 767} 768double 769speed_mpn_dcpi1_divappr_q (struct speed_params *s) 770{ 771 SPEED_ROUTINE_MPN_PI1_DIV (mpn_dcpi1_divappr_q, &inv, 6,3); 772} 773double 774speed_mpn_mu_div_qr (struct speed_params *s) 775{ 776 SPEED_ROUTINE_MPN_MU_DIV_QR (mpn_mu_div_qr, mpn_mu_div_qr_itch); 777} 778double 779speed_mpn_mu_divappr_q (struct speed_params *s) 780{ 781 SPEED_ROUTINE_MPN_MU_DIV_Q (mpn_mu_divappr_q, mpn_mu_divappr_q_itch); 782} 783double 784speed_mpn_mu_div_q (struct speed_params *s) 785{ 786 SPEED_ROUTINE_MPN_MU_DIV_Q (mpn_mu_div_q, mpn_mu_div_q_itch); 787} 788double 789speed_mpn_mupi_div_qr (struct speed_params *s) 790{ 791 SPEED_ROUTINE_MPN_MUPI_DIV_QR (mpn_preinv_mu_div_qr, mpn_preinv_mu_div_qr_itch); 792} 793 794double 795speed_mpn_sbpi1_bdiv_qr (struct speed_params *s) 796{ 797 SPEED_ROUTINE_MPN_PI1_BDIV_QR (mpn_sbpi1_bdiv_qr); 798} 799double 800speed_mpn_dcpi1_bdiv_qr (struct speed_params *s) 801{ 802 SPEED_ROUTINE_MPN_PI1_BDIV_QR (mpn_dcpi1_bdiv_qr); 803} 804double 805speed_mpn_sbpi1_bdiv_q (struct speed_params *s) 806{ 807 SPEED_ROUTINE_MPN_PI1_BDIV_Q (mpn_sbpi1_bdiv_q); 808} 809double 810speed_mpn_dcpi1_bdiv_q (struct speed_params *s) 811{ 812 SPEED_ROUTINE_MPN_PI1_BDIV_Q (mpn_dcpi1_bdiv_q); 813} 814double 815speed_mpn_mu_bdiv_q (struct speed_params *s) 
816{ 817 SPEED_ROUTINE_MPN_MU_BDIV_Q (mpn_mu_bdiv_q, mpn_mu_bdiv_q_itch); 818} 819double 820speed_mpn_mu_bdiv_qr (struct speed_params *s) 821{ 822 SPEED_ROUTINE_MPN_MU_BDIV_QR (mpn_mu_bdiv_qr, mpn_mu_bdiv_qr_itch); 823} 824 825double 826speed_mpn_binvert (struct speed_params *s) 827{ 828 SPEED_ROUTINE_MPN_BINVERT (mpn_binvert, mpn_binvert_itch); 829} 830 831double 832speed_mpn_invert (struct speed_params *s) 833{ 834 SPEED_ROUTINE_MPN_INVERT (mpn_invert, mpn_invert_itch); 835} 836 837double 838speed_mpn_invertappr (struct speed_params *s) 839{ 840 SPEED_ROUTINE_MPN_INVERTAPPR (mpn_invertappr, mpn_invertappr_itch); 841} 842 843double 844speed_mpn_ni_invertappr (struct speed_params *s) 845{ 846 SPEED_ROUTINE_MPN_INVERTAPPR (mpn_ni_invertappr, mpn_invertappr_itch); 847} 848 849double 850speed_mpn_redc_1 (struct speed_params *s) 851{ 852 SPEED_ROUTINE_REDC_1 (mpn_redc_1); 853} 854double 855speed_mpn_redc_2 (struct speed_params *s) 856{ 857 SPEED_ROUTINE_REDC_2 (mpn_redc_2); 858} 859double 860speed_mpn_redc_n (struct speed_params *s) 861{ 862 SPEED_ROUTINE_REDC_N (mpn_redc_n); 863} 864 865 866double 867speed_mpn_popcount (struct speed_params *s) 868{ 869 SPEED_ROUTINE_MPN_POPCOUNT (mpn_popcount); 870} 871double 872speed_mpn_hamdist (struct speed_params *s) 873{ 874 SPEED_ROUTINE_MPN_HAMDIST (mpn_hamdist); 875} 876 877 878double 879speed_mpn_add_n (struct speed_params *s) 880{ 881 SPEED_ROUTINE_MPN_BINARY_N (mpn_add_n); 882} 883double 884speed_mpn_sub_n (struct speed_params *s) 885{ 886SPEED_ROUTINE_MPN_BINARY_N (mpn_sub_n); 887} 888 889#if HAVE_NATIVE_mpn_add_n_sub_n 890double 891speed_mpn_add_n_sub_n (struct speed_params *s) 892{ 893 SPEED_ROUTINE_MPN_ADDSUB_N_CALL (mpn_add_n_sub_n (ap, sp, s->xp, s->yp, s->size)); 894} 895#endif 896 897#if HAVE_NATIVE_mpn_addlsh1_n 898double 899speed_mpn_addlsh1_n (struct speed_params *s) 900{ 901 SPEED_ROUTINE_MPN_BINARY_N (mpn_addlsh1_n); 902} 903#endif 904#if HAVE_NATIVE_mpn_sublsh1_n 905double 906speed_mpn_sublsh1_n (struct 
speed_params *s) 907{ 908 SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh1_n); 909} 910#endif 911#if HAVE_NATIVE_mpn_rsblsh1_n 912double 913speed_mpn_rsblsh1_n (struct speed_params *s) 914{ 915 SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh1_n); 916} 917#endif 918#if HAVE_NATIVE_mpn_addlsh2_n 919double 920speed_mpn_addlsh2_n (struct speed_params *s) 921{ 922 SPEED_ROUTINE_MPN_BINARY_N (mpn_addlsh2_n); 923} 924#endif 925#if HAVE_NATIVE_mpn_sublsh2_n 926double 927speed_mpn_sublsh2_n (struct speed_params *s) 928{ 929 SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh2_n); 930} 931#endif 932#if HAVE_NATIVE_mpn_rsblsh2_n 933double 934speed_mpn_rsblsh2_n (struct speed_params *s) 935{ 936 SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh2_n); 937} 938#endif 939#if HAVE_NATIVE_mpn_rsh1add_n 940double 941speed_mpn_rsh1add_n (struct speed_params *s) 942{ 943 SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1add_n); 944} 945#endif 946#if HAVE_NATIVE_mpn_rsh1sub_n 947double 948speed_mpn_rsh1sub_n (struct speed_params *s) 949{ 950 SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1sub_n); 951} 952#endif 953 954/* mpn_and_n etc can be macros and so have to be handled with 955 SPEED_ROUTINE_MPN_BINARY_N_CALL forms */ 956double 957speed_mpn_and_n (struct speed_params *s) 958{ 959 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_and_n (wp, s->xp, s->yp, s->size)); 960} 961double 962speed_mpn_andn_n (struct speed_params *s) 963{ 964SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_andn_n (wp, s->xp, s->yp, s->size)); 965} 966double 967speed_mpn_nand_n (struct speed_params *s) 968{ 969 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nand_n (wp, s->xp, s->yp, s->size)); 970} 971double 972speed_mpn_ior_n (struct speed_params *s) 973{ 974SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_ior_n (wp, s->xp, s->yp, s->size)); 975} 976double 977speed_mpn_iorn_n (struct speed_params *s) 978{ 979 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_iorn_n (wp, s->xp, s->yp, s->size)); 980} 981double 982speed_mpn_nior_n (struct speed_params *s) 983{ 984 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nior_n (wp, s->xp, s->yp, s->size)); 
985} 986double 987speed_mpn_xor_n (struct speed_params *s) 988{ 989 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xor_n (wp, s->xp, s->yp, s->size)); 990} 991double 992speed_mpn_xnor_n (struct speed_params *s) 993{ 994 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xnor_n (wp, s->xp, s->yp, s->size)); 995} 996 997 998double 999speed_mpn_mul_n (struct speed_params *s) 1000{ 1001 SPEED_ROUTINE_MPN_MUL_N (mpn_mul_n); 1002} 1003double 1004speed_mpn_sqr (struct speed_params *s) 1005{ 1006 SPEED_ROUTINE_MPN_SQR (mpn_sqr); 1007} 1008double 1009speed_mpn_mul_n_sqr (struct speed_params *s) 1010{ 1011 SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size)); 1012} 1013 1014double 1015speed_mpn_mul_basecase (struct speed_params *s) 1016{ 1017 SPEED_ROUTINE_MPN_MUL(mpn_mul_basecase); 1018} 1019double 1020speed_mpn_mul (struct speed_params *s) 1021{ 1022 SPEED_ROUTINE_MPN_MUL(mpn_mul); 1023} 1024double 1025speed_mpn_sqr_basecase (struct speed_params *s) 1026{ 1027 /* FIXME: size restrictions on some versions of sqr_basecase */ 1028 SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase); 1029} 1030 1031#if HAVE_NATIVE_mpn_sqr_diagonal 1032double 1033speed_mpn_sqr_diagonal (struct speed_params *s) 1034{ 1035 SPEED_ROUTINE_MPN_SQR (mpn_sqr_diagonal); 1036} 1037#endif 1038 1039double 1040speed_mpn_toom2_sqr (struct speed_params *s) 1041{ 1042 SPEED_ROUTINE_MPN_TOOM2_SQR (mpn_toom2_sqr); 1043} 1044double 1045speed_mpn_toom3_sqr (struct speed_params *s) 1046{ 1047 SPEED_ROUTINE_MPN_TOOM3_SQR (mpn_toom3_sqr); 1048} 1049double 1050speed_mpn_toom4_sqr (struct speed_params *s) 1051{ 1052 SPEED_ROUTINE_MPN_TOOM4_SQR (mpn_toom4_sqr); 1053} 1054double 1055speed_mpn_toom6_sqr (struct speed_params *s) 1056{ 1057 SPEED_ROUTINE_MPN_TOOM6_SQR (mpn_toom6_sqr); 1058} 1059double 1060speed_mpn_toom8_sqr (struct speed_params *s) 1061{ 1062 SPEED_ROUTINE_MPN_TOOM8_SQR (mpn_toom8_sqr); 1063} 1064double 1065speed_mpn_toom22_mul (struct speed_params *s) 1066{ 1067 SPEED_ROUTINE_MPN_TOOM22_MUL_N (mpn_toom22_mul); 1068} 
1069double 1070speed_mpn_toom33_mul (struct speed_params *s) 1071{ 1072 SPEED_ROUTINE_MPN_TOOM33_MUL_N (mpn_toom33_mul); 1073} 1074double 1075speed_mpn_toom44_mul (struct speed_params *s) 1076{ 1077 SPEED_ROUTINE_MPN_TOOM44_MUL_N (mpn_toom44_mul); 1078} 1079double 1080speed_mpn_toom6h_mul (struct speed_params *s) 1081{ 1082 SPEED_ROUTINE_MPN_TOOM6H_MUL_N (mpn_toom6h_mul); 1083} 1084double 1085speed_mpn_toom8h_mul (struct speed_params *s) 1086{ 1087 SPEED_ROUTINE_MPN_TOOM8H_MUL_N (mpn_toom8h_mul); 1088} 1089 1090double 1091speed_mpn_toom32_mul (struct speed_params *s) 1092{ 1093 SPEED_ROUTINE_MPN_TOOM32_MUL (mpn_toom32_mul); 1094} 1095double 1096speed_mpn_toom42_mul (struct speed_params *s) 1097{ 1098 SPEED_ROUTINE_MPN_TOOM42_MUL (mpn_toom42_mul); 1099} 1100double 1101speed_mpn_toom43_mul (struct speed_params *s) 1102{ 1103 SPEED_ROUTINE_MPN_TOOM43_MUL (mpn_toom43_mul); 1104} 1105double 1106speed_mpn_toom63_mul (struct speed_params *s) 1107{ 1108 SPEED_ROUTINE_MPN_TOOM63_MUL (mpn_toom63_mul); 1109} 1110double 1111speed_mpn_toom32_for_toom43_mul (struct speed_params *s) 1112{ 1113 SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL (mpn_toom32_mul); 1114} 1115double 1116speed_mpn_toom43_for_toom32_mul (struct speed_params *s) 1117{ 1118 SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL (mpn_toom43_mul); 1119} 1120double 1121speed_mpn_toom32_for_toom53_mul (struct speed_params *s) 1122{ 1123 SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL (mpn_toom32_mul); 1124} 1125double 1126speed_mpn_toom53_for_toom32_mul (struct speed_params *s) 1127{ 1128 SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL (mpn_toom53_mul); 1129} 1130double 1131speed_mpn_toom42_for_toom53_mul (struct speed_params *s) 1132{ 1133 SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL (mpn_toom42_mul); 1134} 1135double 1136speed_mpn_toom53_for_toom42_mul (struct speed_params *s) 1137{ 1138 SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL (mpn_toom53_mul); 1139} 1140 1141double 1142speed_mpn_nussbaumer_mul (struct speed_params *s) 1143{ 1144 
SPEED_ROUTINE_MPN_MUL_N_CALL 1145 (mpn_nussbaumer_mul (wp, s->xp, s->size, s->yp, s->size)); 1146} 1147double 1148speed_mpn_nussbaumer_mul_sqr (struct speed_params *s) 1149{ 1150 SPEED_ROUTINE_MPN_SQR_CALL 1151 (mpn_nussbaumer_mul (wp, s->xp, s->size, s->xp, s->size)); 1152} 1153 1154#if WANT_OLD_FFT_FULL 1155double 1156speed_mpn_mul_fft_full (struct speed_params *s) 1157{ 1158 SPEED_ROUTINE_MPN_MUL_N_CALL 1159 (mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size)); 1160} 1161double 1162speed_mpn_mul_fft_full_sqr (struct speed_params *s) 1163{ 1164 SPEED_ROUTINE_MPN_SQR_CALL 1165 (mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size)); 1166} 1167#endif 1168 1169/* These are mod 2^N+1 multiplies and squares. If s->r is supplied it's 1170 used as k, otherwise the best k for the size is used. If s->size isn't a 1171 multiple of 2^k it's rounded up to make the effective operation size. */ 1172 1173#define SPEED_ROUTINE_MPN_MUL_FFT_CALL(call, sqr) \ 1174 { \ 1175 mp_ptr wp; \ 1176 mp_size_t pl; \ 1177 int k; \ 1178 unsigned i; \ 1179 double t; \ 1180 TMP_DECL; \ 1181 \ 1182 SPEED_RESTRICT_COND (s->size >= 1); \ 1183 \ 1184 if (s->r != 0) \ 1185 k = s->r; \ 1186 else \ 1187 k = mpn_fft_best_k (s->size, sqr); \ 1188 \ 1189 TMP_MARK; \ 1190 pl = mpn_fft_next_size (s->size, k); \ 1191 SPEED_TMP_ALLOC_LIMBS (wp, pl+1, s->align_wp); \ 1192 \ 1193 speed_operand_src (s, s->xp, s->size); \ 1194 if (!sqr) \ 1195 speed_operand_src (s, s->yp, s->size); \ 1196 speed_operand_dst (s, wp, pl+1); \ 1197 speed_cache_fill (s); \ 1198 \ 1199 speed_starttime (); \ 1200 i = s->reps; \ 1201 do \ 1202 call; \ 1203 while (--i != 0); \ 1204 t = speed_endtime (); \ 1205 \ 1206 TMP_FREE; \ 1207 return t; \ 1208 } 1209 1210double 1211speed_mpn_mul_fft (struct speed_params *s) 1212{ 1213 SPEED_ROUTINE_MPN_MUL_FFT_CALL 1214 (mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0); 1215} 1216 1217double 1218speed_mpn_mul_fft_sqr (struct speed_params *s) 1219{ 1220 SPEED_ROUTINE_MPN_MUL_FFT_CALL 
1221 (mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1); 1222} 1223 1224double 1225speed_mpn_fft_mul (struct speed_params *s) 1226{ 1227 SPEED_ROUTINE_MPN_MUL_N_CALL (mpn_fft_mul (wp, s->xp, s->size, s->yp, s->size)); 1228} 1229 1230double 1231speed_mpn_fft_sqr (struct speed_params *s) 1232{ 1233 SPEED_ROUTINE_MPN_SQR_CALL (mpn_fft_mul (wp, s->xp, s->size, s->xp, s->size)); 1234} 1235 1236double 1237speed_mpn_mullo_n (struct speed_params *s) 1238{ 1239 SPEED_ROUTINE_MPN_MULLO_N (mpn_mullo_n); 1240} 1241double 1242speed_mpn_mullo_basecase (struct speed_params *s) 1243{ 1244 SPEED_ROUTINE_MPN_MULLO_BASECASE (mpn_mullo_basecase); 1245} 1246 1247double 1248speed_mpn_mulmod_bnm1 (struct speed_params *s) 1249{ 1250 SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_mulmod_bnm1 (wp, s->size, s->xp, s->size, s->yp, s->size, tp)); 1251} 1252 1253double 1254speed_mpn_bc_mulmod_bnm1 (struct speed_params *s) 1255{ 1256 SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_bc_mulmod_bnm1 (wp, s->xp, s->yp, s->size, tp)); 1257} 1258 1259double 1260speed_mpn_mulmod_bnm1_rounded (struct speed_params *s) 1261{ 1262 SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED (mpn_mulmod_bnm1); 1263} 1264 1265double 1266speed_mpn_sqrmod_bnm1 (struct speed_params *s) 1267{ 1268 SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_sqrmod_bnm1 (wp, s->size, s->xp, s->size, tp)); 1269} 1270 1271double 1272speed_mpn_matrix22_mul (struct speed_params *s) 1273{ 1274 /* Speed params only includes 2 inputs, so we have to invent the 1275 other 6. 
*/ 1276 1277 mp_ptr a; 1278 mp_ptr r; 1279 mp_ptr b; 1280 mp_ptr tp; 1281 mp_size_t itch; 1282 unsigned i; 1283 double t; 1284 TMP_DECL; 1285 1286 TMP_MARK; 1287 SPEED_TMP_ALLOC_LIMBS (a, 4 * s->size, s->align_xp); 1288 SPEED_TMP_ALLOC_LIMBS (b, 4 * s->size, s->align_yp); 1289 SPEED_TMP_ALLOC_LIMBS (r, 8 * s->size + 4, s->align_wp); 1290 1291 MPN_COPY (a, s->xp, s->size); 1292 mpn_random (a + s->size, 3 * s->size); 1293 MPN_COPY (b, s->yp, s->size); 1294 mpn_random (b + s->size, 3 * s->size); 1295 1296 itch = mpn_matrix22_mul_itch (s->size, s->size); 1297 SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); 1298 1299 speed_operand_src (s, a, 4 * s->size); 1300 speed_operand_src (s, b, 4 * s->size); 1301 speed_operand_dst (s, r, 8 * s->size + 4); 1302 speed_operand_dst (s, tp, itch); 1303 speed_cache_fill (s); 1304 1305 speed_starttime (); 1306 i = s->reps; 1307 do 1308 { 1309 mp_size_t sz = s->size; 1310 MPN_COPY (r + 0 * sz + 0, a + 0 * sz, sz); 1311 MPN_COPY (r + 2 * sz + 1, a + 1 * sz, sz); 1312 MPN_COPY (r + 4 * sz + 2, a + 2 * sz, sz); 1313 MPN_COPY (r + 6 * sz + 3, a + 3 * sz, sz); 1314 mpn_matrix22_mul (r, r + 2 * sz + 1, r + 4 * sz + 2, r + 6 * sz + 3, sz, 1315 b, b + 1 * sz, b + 2 * sz, b + 3 * sz, sz, 1316 tp); 1317 } 1318 while (--i != 0); 1319 t = speed_endtime(); 1320 TMP_FREE; 1321 return t; 1322} 1323 1324double 1325speed_mpn_hgcd (struct speed_params *s) 1326{ 1327 mp_ptr wp; 1328 mp_size_t hgcd_init_scratch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); 1329 mp_size_t hgcd_scratch = mpn_hgcd_itch (s->size); 1330 mp_ptr ap; 1331 mp_ptr bp; 1332 mp_ptr tmp1; 1333 1334 struct hgcd_matrix hgcd; 1335 int res; 1336 unsigned i; 1337 double t; 1338 TMP_DECL; 1339 1340 if (s->size < 2) 1341 return -1; 1342 1343 TMP_MARK; 1344 1345 SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); 1346 SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); 1347 1348 s->xp[s->size - 1] |= 1; 1349 s->yp[s->size - 1] |= 1; 1350 1351 SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_scratch, 
			 s->align_wp);
  SPEED_TMP_ALLOC_LIMBS (wp, hgcd_scratch, s->align_wp);

  speed_starttime ();
  i = s->reps;
  do
    {
      /* operand copies are inside the timed region */
      MPN_COPY (ap, s->xp, s->size);
      MPN_COPY (bp, s->yp, s->size);
      mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);
      res = mpn_hgcd (ap, bp, s->size, &hgcd, wp);
    }
  while (--i != 0);
  t = speed_endtime ();
  TMP_FREE;
  return t;
}

/* Same measurement as speed_mpn_hgcd, but for the Lehmer variant, using
   MPN_HGCD_LEHMER_ITCH scratch instead of mpn_hgcd_itch.  */
double
speed_mpn_hgcd_lehmer (struct speed_params *s)
{
  mp_ptr wp;
  mp_size_t hgcd_init_scratch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);
  mp_size_t hgcd_scratch = MPN_HGCD_LEHMER_ITCH (s->size);
  mp_ptr ap;
  mp_ptr bp;
  mp_ptr tmp1;

  struct hgcd_matrix hgcd;
  int res;
  unsigned i;
  double t;
  TMP_DECL;

  if (s->size < 2)
    return -1;

  TMP_MARK;

  SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);
  SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);

  /* set the low bit of each high limb, ensuring full-length operands */
  s->xp[s->size - 1] |= 1;
  s->yp[s->size - 1] |= 1;

  SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_scratch, s->align_wp);
  SPEED_TMP_ALLOC_LIMBS (wp, hgcd_scratch, s->align_wp);

  speed_starttime ();
  i = s->reps;
  do
    {
      MPN_COPY (ap, s->xp, s->size);
      MPN_COPY (bp, s->yp, s->size);
      mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);
      res = mpn_hgcd_lehmer (ap, bp, s->size, &hgcd, wp);
    }
  while (--i != 0);
  t = speed_endtime ();
  TMP_FREE;
  return t;
}

/* Full gcd and extended gcd entry points.  */
double
speed_mpn_gcd (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_GCD (mpn_gcd);
}

double
speed_mpn_gcdext (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext);
}
#if 0
double
speed_mpn_gcdext_lehmer (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_GCDEXT (__gmpn_gcdext_lehmer);
}
#endif
double
speed_mpn_gcdext_single (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_single);
}
double
speed_mpn_gcdext_double (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_double);
}
/* Single-limb extended gcd variants.  */
double
speed_mpn_gcdext_one_single (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_single);
}
double
speed_mpn_gcdext_one_double (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_double);
}
/* mpn_gcd_1 measured two ways: the plain wrapper and the "N" form.  */
double
speed_mpn_gcd_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_GCD_1 (mpn_gcd_1);
}
double
speed_mpn_gcd_1N (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_GCD_1N (mpn_gcd_1);
}


/* Jacobi symbol: the mpz entry point and the limb-level base routines.  */
double
speed_mpz_jacobi (struct speed_params *s)
{
  SPEED_ROUTINE_MPZ_JACOBI (mpz_jacobi);
}
double
speed_mpn_jacobi_base (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base);
}
double
speed_mpn_jacobi_base_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_1);
}
double
speed_mpn_jacobi_base_2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_2);
}
double
speed_mpn_jacobi_base_3 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_3);
}


/* Square root and nth root with remainder.  */
double
speed_mpn_sqrtrem (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_SQRTREM (mpn_sqrtrem);
}

double
speed_mpn_rootrem (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_ROOTREM (mpn_rootrem);
}


/* Factorial.  */
double
speed_mpz_fac_ui (struct speed_params *s)
{
  SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui);
}


/* Fibonacci numbers, mpn and mpz flavours.  */
double
speed_mpn_fib2_ui (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_FIB2_UI (mpn_fib2_ui);
}
double
speed_mpz_fib_ui (struct speed_params *s)
{
  SPEED_ROUTINE_MPZ_FIB_UI (mpz_fib_ui);
}
double
speed_mpz_fib2_ui (struct speed_params *s)
{
  SPEED_ROUTINE_MPZ_FIB2_UI (mpz_fib2_ui);
}
double
speed_mpz_lucnum_ui (struct speed_params *s)
{
  SPEED_ROUTINE_MPZ_LUCNUM_UI (mpz_lucnum_ui);
}
double
speed_mpz_lucnum2_ui (struct speed_params *s)
{
  SPEED_ROUTINE_MPZ_LUCNUM2_UI (mpz_lucnum2_ui);
}


/* Modular exponentiation entry points.  */
double
speed_mpz_powm (struct speed_params *s)
{
  SPEED_ROUTINE_MPZ_POWM (mpz_powm);
}
double
speed_mpz_powm_mod (struct speed_params *s)
{
  SPEED_ROUTINE_MPZ_POWM (mpz_powm_mod);
}
double
speed_mpz_powm_redc (struct speed_params *s)
{
  SPEED_ROUTINE_MPZ_POWM (mpz_powm_redc);
}
double
speed_mpz_powm_ui (struct speed_params *s)
{
  SPEED_ROUTINE_MPZ_POWM_UI (mpz_powm_ui);
}


/* Binary inverse of a limb.  */
double
speed_binvert_limb (struct speed_params *s)
{
  SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb);
}


/* Time calls to the do-nothing routines (defined elsewhere), giving an
   idea of the loop and call overhead of the measuring scheme itself.  */
double
speed_noop (struct speed_params *s)
{
  unsigned i;

  speed_starttime ();
  i = s->reps;
  do
    noop ();
  while (--i != 0);
  return speed_endtime ();
}

double
speed_noop_wxs (struct speed_params *s)
{
  mp_ptr wp;
  unsigned i;
  double t;
  TMP_DECL;

  TMP_MARK;
  wp = TMP_ALLOC_LIMBS (1);

  speed_starttime ();
  i = s->reps;
  do
    noop_wxs (wp, s->xp, s->size);
  while (--i != 0);
  t = speed_endtime ();

  TMP_FREE;
  return t;
}

double
speed_noop_wxys (struct speed_params *s)
{
  mp_ptr wp;
  unsigned i;
  double t;
  TMP_DECL;

  TMP_MARK;
  wp = TMP_ALLOC_LIMBS (1);

  speed_starttime ();
  i = s->reps;
  do
    noop_wxys (wp, s->xp, s->yp, s->size);
  while (--i != 0);
  t = speed_endtime ();

  TMP_FREE;
  return t;
}


/* Shared loop shape for the allocate/free timers below: "variables"
   declares locals, "calls" is the allocate/free sequence timed.  */
#define SPEED_ROUTINE_ALLOC_FREE(variables, calls)	\
  {							\
    unsigned  i;					\
    variables;						\
							\
    speed_starttime ();					\
    i = s->reps;					\
    do							\
      {							\
	calls;						\
      }							\
    while (--i != 0);					\
    return speed_endtime ();				\
  }


/* Compare these to see how much malloc/free costs and then how much
   __gmp_default_allocate/free and mpz_init/clear add.  mpz_init/clear or
   mpq_init/clear will be doing a 1 limb allocate, so use that as the size
   when including them in comparisons. */

double
speed_malloc_free (struct speed_params *s)
{
  size_t  bytes = s->size * BYTES_PER_MP_LIMB;
  SPEED_ROUTINE_ALLOC_FREE (void *p,
			    p = malloc (bytes);
			    free (p));
}

double
speed_malloc_realloc_free (struct speed_params *s)
{
  size_t  bytes = s->size * BYTES_PER_MP_LIMB;
  SPEED_ROUTINE_ALLOC_FREE (void *p,
			    p = malloc (BYTES_PER_MP_LIMB);
			    p = realloc (p, bytes);
			    free (p));
}

double
speed_gmp_allocate_free (struct speed_params *s)
{
  size_t  bytes = s->size * BYTES_PER_MP_LIMB;
  SPEED_ROUTINE_ALLOC_FREE (void *p,
			    p = (*__gmp_allocate_func) (bytes);
			    (*__gmp_free_func) (p, bytes));
}

double
speed_gmp_allocate_reallocate_free (struct speed_params *s)
{
  size_t  bytes = s->size * BYTES_PER_MP_LIMB;
  /* NOTE(review): the reallocate call passes (p, bytes, BYTES_PER_MP_LIMB)
     although the block was allocated with BYTES_PER_MP_LIMB and is freed
     with bytes — the old/new size arguments look swapped; confirm against
     the __gmp_reallocate_func (ptr, old_size, new_size) convention.  */
  SPEED_ROUTINE_ALLOC_FREE
    (void *p,
     p = (*__gmp_allocate_func) (BYTES_PER_MP_LIMB);
     p = (*__gmp_reallocate_func) (p, bytes, BYTES_PER_MP_LIMB);
     (*__gmp_free_func) (p, bytes));
}

double
speed_mpz_init_clear (struct speed_params *s)
{
  SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
			    mpz_init (z);
			    mpz_clear (z));
}

double
speed_mpz_init_realloc_clear (struct speed_params *s)
{
  SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
			    mpz_init (z);
			    _mpz_realloc (z, s->size);
			    mpz_clear (z));
}

double
speed_mpq_init_clear (struct speed_params *s)
{
  SPEED_ROUTINE_ALLOC_FREE (mpq_t q,
			    mpq_init (q);
			    mpq_clear (q));
}

double
speed_mpf_init_clear (struct speed_params *s)
{
  SPEED_ROUTINE_ALLOC_FREE
    (mpf_t f,
     mpf_init (f);
     mpf_clear (f));
}


/* Compare this to mpn_add_n to see how much overhead mpz_add adds.  Note
   that repeatedly calling mpz_add with the same data gives branch prediction
   in it an advantage. */

double
speed_mpz_add (struct speed_params *s)
{
  mpz_t  w, x, y;
  unsigned  i;
  double  t;

  mpz_init (w);
  mpz_init (x);
  mpz_init (y);

  mpz_set_n (x, s->xp, s->size);
  mpz_set_n (y, s->yp, s->size);
  /* one call outside the timed loop, priming w's allocation */
  mpz_add (w, x, y);

  speed_starttime ();
  i = s->reps;
  do
    {
      mpz_add (w, x, y);
    }
  while (--i != 0);
  t = speed_endtime ();

  mpz_clear (w);
  mpz_clear (x);
  mpz_clear (y);
  return t;
}


/* If r==0, calculate (size,size/2),
   otherwise calculate (size,r). */

double
speed_mpz_bin_uiui (struct speed_params *s)
{
  mpz_t  w;
  unsigned long  k;
  unsigned  i;
  double  t;

  mpz_init (w);
  /* second binomial argument from s->r, defaulting to size/2 */
  if (s->r != 0)
    k = s->r;
  else
    k = s->size/2;

  speed_starttime ();
  i = s->reps;
  do
    {
      mpz_bin_uiui (w, s->size, k);
    }
  while (--i != 0);
  t = speed_endtime ();

  mpz_clear (w);
  return t;
}


/* The multiplies are successively dependent so the latency is measured, not
   the issue rate.  There's only 10 per loop so the code doesn't get too big
   since umul_ppmm is several instructions on some cpus.

   Putting the arguments as "h,l,l,h" gets slightly better code from gcc
   2.95.2 on x86, it puts only one mov between each mul, not two.  That mov
   though will probably show up as a bogus extra cycle though.

   The measuring function macros are into three parts to avoid overflowing
   preprocessor expansion space if umul_ppmm is big.

   Limitations:

   Don't blindly use this to set UMUL_TIME in gmp-mparam.h, check the code
   generated first, especially on CPUs with low latency multipliers.

   The default umul_ppmm doing h*l will be getting increasing numbers of
   high zero bits in the calculation.  CPUs with data-dependent multipliers
   will want to use umul_ppmm.1 to get some randomization into the
   calculation.  The extra xors and fetches will be a slowdown of course. */

/* Part A: locals, operand setup, and the start of the r==1 (randomized)
   timed loop.  The caller's unrolled multiply chain follows.  */
#define SPEED_MACRO_UMUL_PPMM_A		\
  {					\
    mp_limb_t  h, l;			\
    unsigned   i;			\
    double     t;			\
					\
    s->time_divisor = 10;		\
					\
    h = s->xp[0];			\
    l = s->yp[0];			\
					\
    if (s->r == 1)			\
      {					\
	speed_starttime ();		\
	i = s->reps;			\
	do				\
	  {

/* Part B: close the r==1 loop and open the plain (non-randomized) one.  */
#define SPEED_MACRO_UMUL_PPMM_B		\
	  }				\
	while (--i != 0);		\
	t = speed_endtime ();		\
      }					\
    else				\
      {					\
	speed_starttime ();		\
	i = s->reps;			\
	do				\
	  {

/* Part C: close the plain loop and return, keeping h and l live.  */
#define SPEED_MACRO_UMUL_PPMM_C						\
	  }								\
	while (--i != 0);						\
	t = speed_endtime ();						\
      }									\
									\
    /* stop the compiler optimizing away the whole calculation! */	\
    noop_1 (h);								\
    noop_1 (l);								\
									\
    return t;								\
  }


/* Latency of the generic umul_ppmm: 10 dependent multiplies per loop,
   with xors from the random blocks in the r==1 variant.  */
double
speed_umul_ppmm (struct speed_params *s)
{
  SPEED_MACRO_UMUL_PPMM_A;
  {
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[1]; l ^= s->yp_block[1];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[2]; l ^= s->yp_block[2];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[4]; l ^= s->yp_block[4];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[5]; l ^= s->yp_block[5];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[7]; l ^= s->yp_block[7];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[8]; l ^= s->yp_block[8];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
  }
  SPEED_MACRO_UMUL_PPMM_B;
  {
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
  }
  SPEED_MACRO_UMUL_PPMM_C;
}


/* Same measurement for the native mpn_umul_ppmm, when available.  */
#if HAVE_NATIVE_mpn_umul_ppmm
double
speed_mpn_umul_ppmm (struct speed_params *s)
{
  SPEED_MACRO_UMUL_PPMM_A;
  {
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[1]; l ^= s->yp_block[1];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[2]; l ^= s->yp_block[2];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[4]; l ^= s->yp_block[4];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[5]; l ^= s->yp_block[5];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
    h = mpn_umul_ppmm (&l, h, l);  h ^=
				   s->xp_block[7]; l ^= s->yp_block[7];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[8]; l ^= s->yp_block[8];
    h = mpn_umul_ppmm (&l, h, l);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
  }
  SPEED_MACRO_UMUL_PPMM_B;
  {
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
    h = mpn_umul_ppmm (&l, h, l);
  }
  SPEED_MACRO_UMUL_PPMM_C;
}
#endif

/* As above, for the "_r" native variant (remainder pointer last).  */
#if HAVE_NATIVE_mpn_umul_ppmm_r
double
speed_mpn_umul_ppmm_r (struct speed_params *s)
{
  SPEED_MACRO_UMUL_PPMM_A;
  {
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[1]; l ^= s->yp_block[1];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[2]; l ^= s->yp_block[2];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[4]; l ^= s->yp_block[4];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[5]; l ^= s->yp_block[5];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[7]; l ^= s->yp_block[7];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[8]; l ^= s->yp_block[8];
    h = mpn_umul_ppmm_r (h, l, &l);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
  }
  SPEED_MACRO_UMUL_PPMM_B;
  {
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
    h = mpn_umul_ppmm_r (h, l, &l);
  }
  SPEED_MACRO_UMUL_PPMM_C;
}
#endif


/* The divisions are successively dependent so latency is measured, not
   issue rate.  There's only 10 per loop so the code doesn't get too big,
   especially for udiv_qrnnd_preinv and preinv2norm, which are several
   instructions each.

   Note that it's only the division which is measured here, there's no data
   fetching and no shifting if the divisor gets normalized.

   In speed_udiv_qrnnd with gcc 2.95.2 on x86 the parameters "q,r,r,q,d"
   generate x86 div instructions with nothing in between.

   The measuring function macros are in two parts to avoid overflowing
   preprocessor expansion space if udiv_qrnnd etc are big.

   Limitations:

   Don't blindly use this to set UDIV_TIME in gmp-mparam.h, check the code
   generated first.

   CPUs with data-dependent divisions may want more attention paid to the
   randomness of the data used.  Probably the measurement wanted is over
   uniformly distributed numbers, but what's here might not be giving that. */

/* Part A: locals, divisor setup (normalized and inverted when the
   "normalize" argument is non-zero), and the opening of the timed loop.
   The caller's unrolled division chain follows.  */
#define SPEED_ROUTINE_UDIV_QRNND_A(normalize)		\
  {							\
    double     t;					\
    unsigned   i;					\
    mp_limb_t  q, r, d;					\
    mp_limb_t  dinv;					\
							\
    s->time_divisor = 10;				\
							\
    /* divisor from "r" parameter, or a default */	\
    d = s->r;						\
    if (d == 0)						\
      d = mp_bases[10].big_base;			\
							\
    if (normalize)					\
      {							\
	unsigned  norm;					\
	count_leading_zeros (norm, d);			\
	d <<= norm;					\
	invert_limb (dinv, d);				\
      }							\
							\
    q = s->xp[0];					\
    r = s->yp[0] % d;					\
							\
    speed_starttime ();					\
    i = s->reps;					\
    do							\
      {

/* Part B: close the timed loop and return, keeping q and r live.  */
#define SPEED_ROUTINE_UDIV_QRNND_B					\
      }									\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    /* stop the compiler optimizing away the whole calculation! */	\
    noop_1 (q);								\
    noop_1 (r);								\
									\
    return t;								\
  }

/* Latency of plain udiv_qrnnd: 10 dependent divisions per loop.  */
double
speed_udiv_qrnnd (struct speed_params *s)
{
  SPEED_ROUTINE_UDIV_QRNND_A (UDIV_NEEDS_NORMALIZATION);
  {
    udiv_qrnnd (q, r, r, q, d);
    udiv_qrnnd (q, r, r, q, d);
    udiv_qrnnd (q, r, r, q, d);
    udiv_qrnnd (q, r, r, q, d);
    udiv_qrnnd (q, r, r, q, d);
    udiv_qrnnd (q, r, r, q, d);
    udiv_qrnnd (q, r, r, q, d);
    udiv_qrnnd (q, r, r, q, d);
    udiv_qrnnd (q, r, r, q, d);
    udiv_qrnnd (q, r, r, q, d);
  }
  SPEED_ROUTINE_UDIV_QRNND_B;
}

/* Pre-inverted division, style 1 (always normalized).  */
double
speed_udiv_qrnnd_preinv1 (struct speed_params *s)
{
  SPEED_ROUTINE_UDIV_QRNND_A (1);
  {
    udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
  }
  SPEED_ROUTINE_UDIV_QRNND_B;
}

/* Pre-inverted division, style 2 (always normalized).  */
double
speed_udiv_qrnnd_preinv2 (struct speed_params *s)
{
  SPEED_ROUTINE_UDIV_QRNND_A (1);
  {
    udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
    udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
  }
  SPEED_ROUTINE_UDIV_QRNND_B;
}

/* The portable C fallback __udiv_qrnnd_c from longlong.h.  */
double
speed_udiv_qrnnd_c (struct speed_params *s)
{
  SPEED_ROUTINE_UDIV_QRNND_A (1);
  {
    __udiv_qrnnd_c (q, r,
		    r, q, d);
    __udiv_qrnnd_c (q, r, r, q, d);
    __udiv_qrnnd_c (q, r, r, q, d);
    __udiv_qrnnd_c (q, r, r, q, d);
    __udiv_qrnnd_c (q, r, r, q, d);
    __udiv_qrnnd_c (q, r, r, q, d);
    __udiv_qrnnd_c (q, r, r, q, d);
    __udiv_qrnnd_c (q, r, r, q, d);
    __udiv_qrnnd_c (q, r, r, q, d);
    __udiv_qrnnd_c (q, r, r, q, d);
  }
  SPEED_ROUTINE_UDIV_QRNND_B;
}

/* Native single-limb division, when available.  */
#if HAVE_NATIVE_mpn_udiv_qrnnd
double
speed_mpn_udiv_qrnnd (struct speed_params *s)
{
  SPEED_ROUTINE_UDIV_QRNND_A (1);
  {
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
    q = mpn_udiv_qrnnd (&r, r, q, d);
  }
  SPEED_ROUTINE_UDIV_QRNND_B;
}
#endif

/* Native single-limb division, "_r" variant (remainder pointer last).  */
#if HAVE_NATIVE_mpn_udiv_qrnnd_r
double
speed_mpn_udiv_qrnnd_r (struct speed_params *s)
{
  SPEED_ROUTINE_UDIV_QRNND_A (1);
  {
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
    q = mpn_udiv_qrnnd_r (r, q, d, &r);
  }
  SPEED_ROUTINE_UDIV_QRNND_B;
}
#endif


/* Time invert_limb itself.  */
double
speed_invert_limb (struct speed_params *s)
{
  SPEED_ROUTINE_INVERT_LIMB_CALL (invert_limb (dinv, d));
}


/* xp[0] might not be particularly random, but should give an indication how
   "/" runs.  Same for speed_operator_mod below.
 */
/* Latency of the C "/" operator on limbs: 10 dependent divides per loop,
   with an xor between each to keep the chain dependent.  */
double
speed_operator_div (struct speed_params *s)
{
  double     t;
  unsigned   i;
  mp_limb_t  x, q, d;

  s->time_divisor = 10;

  /* divisor from "r" parameter, or a default */
  d = s->r;
  if (d == 0)
    d = mp_bases[10].big_base;

  x = s->xp[0];
  q = 0;

  speed_starttime ();
  i = s->reps;
  do
    {
      q ^= x; q /= d;
      q ^= x; q /= d;
      q ^= x; q /= d;
      q ^= x; q /= d;
      q ^= x; q /= d;
      q ^= x; q /= d;
      q ^= x; q /= d;
      q ^= x; q /= d;
      q ^= x; q /= d;
      q ^= x; q /= d;
    }
  while (--i != 0);
  t = speed_endtime ();

  /* stop the compiler optimizing away the whole calculation! */
  noop_1 (q);

  return t;
}

/* Latency of the C "%" operator on limbs, same scheme as above.  */
double
speed_operator_mod (struct speed_params *s)
{
  double     t;
  unsigned   i;
  mp_limb_t  x, r, d;

  s->time_divisor = 10;

  /* divisor from "r" parameter, or a default */
  d = s->r;
  if (d == 0)
    d = mp_bases[10].big_base;

  x = s->xp[0];
  r = 0;

  speed_starttime ();
  i = s->reps;
  do
    {
      r ^= x; r %= d;
      r ^= x; r %= d;
      r ^= x; r %= d;
      r ^= x; r %= d;
      r ^= x; r %= d;
      r ^= x; r %= d;
      r ^= x; r %= d;
      r ^= x; r %= d;
      r ^= x; r %= d;
      r ^= x; r %= d;
    }
  while (--i != 0);
  t = speed_endtime ();

  /* stop the compiler optimizing away the whole calculation! */
  noop_1 (r);

  return t;
}


/* r==0 measures on data with the values uniformly distributed.  This will
   be typical for count_trailing_zeros in a GCD etc.

   r==1 measures on data with the resultant count uniformly distributed
   between 0 and GMP_LIMB_BITS-1.  This is probably sensible for
   count_leading_zeros on the high limbs of divisors.
 */

/* Prepare SPEED_BLOCK_SIZE limbs in xp for a count_leading_zeros or
   count_trailing_zeros measurement, per the r==0 / r==1 schemes above.
   Returns 0 for an unsupported s->r value, else 1.  */
int
speed_routine_count_zeros_setup (struct speed_params *s,
				 mp_ptr xp, int leading, int zero)
{
  int  i, c;
  mp_limb_t  n;

  if (s->r == 0)
    {
      /* Make uniformly distributed data.  If zero isn't allowed then change
	 it to 1 for leading, or 0x800..00 for trailing.  */
      MPN_COPY (xp, s->xp_block, SPEED_BLOCK_SIZE);
      if (! zero)
	for (i = 0; i < SPEED_BLOCK_SIZE; i++)
	  if (xp[i] == 0)
	    xp[i] = leading ? 1 : GMP_LIMB_HIGHBIT;
    }
  else if (s->r == 1)
    {
      /* Make counts uniformly distributed.  A randomly chosen bit is set,
	 and for leading the rest above it are cleared, or for trailing then
	 the rest below.  */
      for (i = 0; i < SPEED_BLOCK_SIZE; i++)
	{
	  mp_limb_t  set = CNST_LIMB(1) << (s->yp_block[i] % GMP_LIMB_BITS);
	  mp_limb_t  keep_below = set-1;
	  mp_limb_t  keep_above = MP_LIMB_T_MAX ^ keep_below;
	  mp_limb_t  keep = (leading ? keep_below : keep_above);
	  xp[i] = (s->xp_block[i] & keep) | set;
	}
    }
  else
    {
      /* unrecognized s->r */
      return 0;
    }

  /* Account for the effect of n^=c.
   */
  c = 0;
  /* pre-apply the xor with the previous count that the measuring loop
     will perform, so the values seen there are the intended ones */
  for (i = 0; i < SPEED_BLOCK_SIZE; i++)
    {
      n = xp[i];
      xp[i] ^= c;

      if (leading)
	count_leading_zeros (c, n);
      else
	count_trailing_zeros (c, n);
    }

  return 1;
}

double
speed_count_leading_zeros (struct speed_params *s)
{
  /* a zero operand is only permitted when the macro defines its result */
#ifdef COUNT_LEADING_ZEROS_0
#define COUNT_LEADING_ZEROS_0_ALLOWED   1
#else
#define COUNT_LEADING_ZEROS_0_ALLOWED   0
#endif

  SPEED_ROUTINE_COUNT_ZEROS_A (1, COUNT_LEADING_ZEROS_0_ALLOWED);
  count_leading_zeros (c, n);
  SPEED_ROUTINE_COUNT_ZEROS_B ();
}
double
speed_count_trailing_zeros (struct speed_params *s)
{
  SPEED_ROUTINE_COUNT_ZEROS_A (0, 0);
  count_trailing_zeros (c, n);
  SPEED_ROUTINE_COUNT_ZEROS_B ();
}


/* Radix conversion, both directions, plus the set_str basecase.  */
double
speed_mpn_get_str (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_GET_STR (mpn_get_str);
}

double
speed_mpn_set_str (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_set_str (wp, xp, s->size, base));
}
double
speed_mpn_bc_set_str (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_bc_set_str (wp, xp, s->size, base));
}

double
speed_MPN_ZERO (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_ZERO_CALL (MPN_ZERO (wp, s->size));
}


/* Initialize rstate according to s->r: 0 = default algorithm, 1 =
   Mersenne Twister, otherwise an LC scheme of size s->r bits.
   Returns non-zero on success.  */
int
speed_randinit (struct speed_params *s, gmp_randstate_ptr rstate)
{
  if (s->r == 0)
    gmp_randinit_default (rstate);
  else if (s->r == 1)
    gmp_randinit_mt (rstate);
  else
    {
      return gmp_randinit_lc_2exp_size (rstate, s->r);
    }
  return 1;
}

/* Time gmp_randseed with an s->size bit seed.  */
double
speed_gmp_randseed (struct speed_params *s)
{
  gmp_randstate_t  rstate;
  unsigned  i;
  double    t;
  mpz_t     x;

  SPEED_RESTRICT_COND (s->size >= 1);
  SPEED_RESTRICT_COND (speed_randinit (s, rstate));

  /* s->size bits of seed */
  mpz_init_set_n (x, s->xp, s->size);
  mpz_fdiv_r_2exp
    (x, x, (unsigned long) s->size);

  /* cache priming */
  gmp_randseed (rstate, x);

  speed_starttime ();
  i = s->reps;
  do
    gmp_randseed (rstate, x);
  while (--i != 0);
  t = speed_endtime ();

  gmp_randclear (rstate);
  mpz_clear (x);
  return t;
}

/* Time gmp_randseed_ui, cycling through the random block for seeds.  */
double
speed_gmp_randseed_ui (struct speed_params *s)
{
  gmp_randstate_t  rstate;
  unsigned  i, j;
  double    t;

  SPEED_RESTRICT_COND (speed_randinit (s, rstate));

  /* cache priming */
  gmp_randseed_ui (rstate, 123L);

  speed_starttime ();
  i = s->reps;
  j = 0;
  do
    {
      gmp_randseed_ui (rstate, (unsigned long) s->xp_block[j]);
      /* wrap around within the block of prepared seed values */
      j++;
      if (j >= SPEED_BLOCK_SIZE)
	j = 0;
    }
  while (--i != 0);
  t = speed_endtime ();

  gmp_randclear (rstate);
  return t;
}

/* Time mpz_urandomb producing s->size bits per call.  */
double
speed_mpz_urandomb (struct speed_params *s)
{
  gmp_randstate_t  rstate;
  mpz_t  z;
  unsigned  i;
  double    t;

  SPEED_RESTRICT_COND (s->size >= 0);
  SPEED_RESTRICT_COND (speed_randinit (s, rstate));

  mpz_init (z);

  /* cache priming */
  mpz_urandomb (z, rstate, (unsigned long) s->size);
  mpz_urandomb (z, rstate, (unsigned long) s->size);

  speed_starttime ();
  i = s->reps;
  do
    mpz_urandomb (z, rstate, (unsigned long) s->size);
  while (--i != 0);
  t = speed_endtime ();

  mpz_clear (z);
  gmp_randclear (rstate);
  return t;
}