1/* Tune various threshold of MPFR 2 3Copyright 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. 4Contributed by the AriC and Caramel projects, INRIA. 5 6This file is part of the GNU MPFR Library. 7 8The GNU MPFR Library is free software; you can redistribute it and/or modify 9it under the terms of the GNU Lesser General Public License as published by 10the Free Software Foundation; either version 3 of the License, or (at your 11option) any later version. 12 13The GNU MPFR Library is distributed in the hope that it will be useful, but 14WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 15or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 16License for more details. 17 18You should have received a copy of the GNU Lesser General Public License 19along with the GNU MPFR Library; see the file COPYING.LESSER. If not, see 20http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 2151 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ 22 23#include <stdlib.h> 24#include <time.h> 25 26#define MPFR_NEED_LONGLONG_H 27#include "mpfr-impl.h" 28 29#undef _PROTO 30#define _PROTO __GMP_PROTO 31#include "speed.h" 32 33int verbose; 34 35/* template for an unary function */ 36/* s->size: precision of both input and output 37 s->xp : Mantissa of first input 38 s->yp : mantissa of second input */ 39#define SPEED_MPFR_FUNC(mean_fun) \ 40 do \ 41 { \ 42 unsigned i; \ 43 mpfr_limb_ptr wp; \ 44 double t; \ 45 mpfr_t w, x; \ 46 mp_size_t size; \ 47 MPFR_TMP_DECL (marker); \ 48 \ 49 SPEED_RESTRICT_COND (s->size >= MPFR_PREC_MIN); \ 50 SPEED_RESTRICT_COND (s->size <= MPFR_PREC_MAX); \ 51 MPFR_TMP_MARK (marker); \ 52 \ 53 size = (s->size-1)/GMP_NUMB_BITS+1; \ 54 s->xp[size-1] |= MPFR_LIMB_HIGHBIT; \ 55 MPFR_TMP_INIT1 (s->xp, x, s->size); \ 56 MPFR_SET_EXP (x, 0); \ 57 \ 58 MPFR_TMP_INIT (wp, w, s->size, size); \ 59 \ 60 speed_operand_src (s, s->xp, size); \ 61 speed_operand_dst (s, wp, size); \ 62 speed_cache_fill (s); \ 63 \ 64 speed_starttime (); \ 65 i = s->reps; \ 66 do \ 67 mean_fun (w, x, MPFR_RNDN); \ 68 while (--i != 0); \ 69 t = speed_endtime (); \ 70 \ 71 MPFR_TMP_FREE (marker); \ 72 return t; \ 73 } \ 74 while (0) 75 76/* same as SPEED_MPFR_FUNC, but for say mpfr_sin_cos (y, z, x, r) */ 77#define SPEED_MPFR_FUNC2(mean_fun) \ 78 do \ 79 { \ 80 unsigned i; \ 81 mpfr_limb_ptr vp, wp; \ 82 double t; \ 83 mpfr_t v, w, x; \ 84 mp_size_t size; \ 85 MPFR_TMP_DECL (marker); \ 86 \ 87 SPEED_RESTRICT_COND (s->size >= MPFR_PREC_MIN); \ 88 SPEED_RESTRICT_COND (s->size <= MPFR_PREC_MAX); \ 89 MPFR_TMP_MARK (marker); \ 90 \ 91 size = (s->size-1)/GMP_NUMB_BITS+1; \ 92 s->xp[size-1] |= MPFR_LIMB_HIGHBIT; \ 93 MPFR_TMP_INIT1 (s->xp, x, s->size); \ 94 MPFR_SET_EXP (x, 0); \ 95 \ 96 MPFR_TMP_INIT (vp, v, s->size, size); \ 97 MPFR_TMP_INIT (wp, w, s->size, size); \ 98 \ 99 speed_operand_src (s, s->xp, size); \ 100 speed_operand_dst (s, vp, size); \ 101 speed_operand_dst (s, wp, size); \ 102 speed_cache_fill (s); \ 103 \ 104 speed_starttime (); \ 105 i = s->reps; \ 106 do \ 107 mean_fun (v, w, x, MPFR_RNDN); \ 108 while (--i != 0); \ 109 t = speed_endtime (); \ 110 \ 111 MPFR_TMP_FREE (marker); \ 112 return t; \ 113 } \ 114 while (0) 115 116/* template for a function like mpfr_mul */ 117#define SPEED_MPFR_OP(mean_fun) \ 118 do \ 119 { \ 120 unsigned i; \ 121 mpfr_limb_ptr wp; \ 122 double t; \ 123 mpfr_t w, x, y; \ 124 mp_size_t size; \ 125 MPFR_TMP_DECL (marker); \ 126 \ 127 SPEED_RESTRICT_COND (s->size >= MPFR_PREC_MIN); \ 128 SPEED_RESTRICT_COND (s->size <= MPFR_PREC_MAX); \ 129 MPFR_TMP_MARK (marker); \ 130 \ 131 size = (s->size-1)/GMP_NUMB_BITS+1; \ 132 s->xp[size-1] |= MPFR_LIMB_HIGHBIT; \ 133 MPFR_TMP_INIT1 (s->xp, x, s->size); \ 134 MPFR_SET_EXP (x, 0); \ 135 s->yp[size-1] |= MPFR_LIMB_HIGHBIT; \ 136 MPFR_TMP_INIT1 (s->yp, y, s->size); \ 137 MPFR_SET_EXP (y, 0); \ 138 \ 139 MPFR_TMP_INIT (wp, w, s->size, size); \ 140 \ 141 speed_operand_src (s, s->xp, size); \ 142 speed_operand_src (s, s->yp, size); \ 143 speed_operand_dst (s, wp, size); \ 144 speed_cache_fill (s); \ 145 \ 146 speed_starttime (); \ 147 i = s->reps; \ 148 do \ 149 mean_fun (w, x, y, MPFR_RNDN); \ 150 while (--i != 0); \ 151 t = speed_endtime (); \ 152 \ 153 MPFR_TMP_FREE (marker); \ 154 return t; \ 155 } \ 156 while (0) 157 158/* special template for mpfr_mul(a,b,b) */ 159#define SPEED_MPFR_SQR(mean_fun) \ 160 do \ 161 { \ 162 unsigned i; \ 163 mpfr_limb_ptr wp; \ 164 double t; \ 165 mpfr_t w, x; \ 166 mp_size_t size; \ 167 MPFR_TMP_DECL (marker); \ 168 \ 169 SPEED_RESTRICT_COND (s->size >= MPFR_PREC_MIN); \ 170 SPEED_RESTRICT_COND (s->size <= MPFR_PREC_MAX); \ 171 MPFR_TMP_MARK (marker); \ 172 \ 173 size = (s->size-1)/GMP_NUMB_BITS+1; \ 174 s->xp[size-1] |= MPFR_LIMB_HIGHBIT; \ 175 MPFR_TMP_INIT1 (s->xp, x, s->size); \ 176 MPFR_SET_EXP (x, 0); \ 177 \ 178 MPFR_TMP_INIT (wp, w, s->size, size); \ 179 \ 180 speed_operand_src (s, s->xp, size); \ 181 speed_operand_dst (s, wp, size); \ 182 speed_cache_fill (s); \ 183 \ 184 speed_starttime (); \ 185 i = s->reps; \ 186 do \ 187 mean_fun (w, x, x, MPFR_RNDN); \ 188 while (--i != 0); \ 189 t = speed_endtime (); \ 190 \ 191 MPFR_TMP_FREE (marker); \ 192 return t; \ 193 } \ 194 while (0) 195 196/* s->size: precision of both input and output 197 s->xp : Mantissa of first input 198 s->r : exponent 199 s->align_xp : sign (1 means positive, 2 means negative) 200*/ 201#define SPEED_MPFR_FUNC_WITH_EXPONENT(mean_fun) \ 202 do \ 203 { \ 204 unsigned i; \ 205 mpfr_limb_ptr wp; \ 206 double t; \ 207 mpfr_t w, x; \ 208 mp_size_t size; \ 209 MPFR_TMP_DECL (marker); \ 210 \ 211 SPEED_RESTRICT_COND (s->size >= MPFR_PREC_MIN); \ 212 SPEED_RESTRICT_COND (s->size <= MPFR_PREC_MAX); \ 213 MPFR_TMP_MARK (marker); \ 214 \ 215 size = (s->size-1)/GMP_NUMB_BITS+1; \ 216 s->xp[size-1] |= MPFR_LIMB_HIGHBIT; \ 217 MPFR_TMP_INIT1 (s->xp, x, s->size); \ 218 MPFR_SET_EXP (x, s->r); \ 219 if (s->align_xp == 2) MPFR_SET_NEG (x); \ 220 \ 221 MPFR_TMP_INIT (wp, w, s->size, size); \ 222 \ 223 speed_operand_src (s, s->xp, size); \ 224 speed_operand_dst (s, wp, size); \ 225 speed_cache_fill (s); \ 226 \ 227 speed_starttime (); \ 228 i = s->reps; \ 229 do \ 230 mean_fun (w, x, MPFR_RNDN); \ 231 while (--i != 0); \ 232 t = speed_endtime (); \ 233 \ 234 MPFR_TMP_FREE (marker); \ 235 return t; \ 236 } \ 237 while (0) 238 239/* First we include all the functions we want to tune inside this program. 240 We can't use the GNU MPFR library since the thresholds are fixed macros. */ 241 242/* Setup mpfr_exp_2 */ 243mpfr_prec_t mpfr_exp_2_threshold; 244#undef MPFR_EXP_2_THRESHOLD 245#define MPFR_EXP_2_THRESHOLD mpfr_exp_2_threshold 246#include "exp_2.c" 247static double 248speed_mpfr_exp_2 (struct speed_params *s) 249{ 250 SPEED_MPFR_FUNC (mpfr_exp_2); 251} 252 253/* Setup mpfr_exp */ 254mpfr_prec_t mpfr_exp_threshold; 255#undef MPFR_EXP_THRESHOLD 256#define MPFR_EXP_THRESHOLD mpfr_exp_threshold 257#include "exp.c" 258static double 259speed_mpfr_exp (struct speed_params *s) 260{ 261 SPEED_MPFR_FUNC (mpfr_exp); 262} 263 264/* Setup mpfr_sin_cos */ 265mpfr_prec_t mpfr_sincos_threshold; 266#undef MPFR_SINCOS_THRESHOLD 267#define MPFR_SINCOS_THRESHOLD mpfr_sincos_threshold 268#include "sin_cos.c" 269#include "cos.c" 270static double 271speed_mpfr_sincos (struct speed_params *s) 272{ 273 SPEED_MPFR_FUNC2 (mpfr_sin_cos); 274} 275 276/* Setup mpfr_mul, mpfr_sqr and mpfr_div */ 277mpfr_prec_t mpfr_mul_threshold; 278mpfr_prec_t mpfr_sqr_threshold; 279mpfr_prec_t mpfr_div_threshold; 280#undef MPFR_MUL_THRESHOLD 281#define MPFR_MUL_THRESHOLD mpfr_mul_threshold 282#undef MPFR_SQR_THRESHOLD 283#define MPFR_SQR_THRESHOLD mpfr_sqr_threshold 284#undef MPFR_DIV_THRESHOLD 285#define MPFR_DIV_THRESHOLD mpfr_div_threshold 286#include "mul.c" 287#include "div.c" 288static double 289speed_mpfr_mul (struct speed_params *s) 290{ 291 SPEED_MPFR_OP (mpfr_mul); 292} 293static double 294speed_mpfr_sqr (struct speed_params *s) 295{ 296 SPEED_MPFR_SQR (mpfr_mul); 297} 298static double 299speed_mpfr_div (struct speed_params *s) 300{ 301 SPEED_MPFR_OP (mpfr_div); 302} 303 304/************************************************ 305 * Common functions (inspired by GMP function) * 306 ************************************************/ 307static int 308analyze_data (double *dat, int ndat) 309{ 310 double x, min_x; 311 int j, min_j; 312 313 x = 0.0; 314 for (j = 0; j < ndat; j++) 315 if (dat[j] > 0.0) 316 x += dat[j]; 317 318 min_x = x; 319 min_j = 0; 320 321 for (j = 0; j < ndat; x -= dat[j], j++) 322 { 323 if (x < min_x) 324 { 325 min_x = x; 326 min_j = j; 327 } 328 } 329 return min_j; 330} 331 332static double 333mpfr_speed_measure (speed_function_t fun, struct speed_params *s, char *m) 334{ 335 double t = -1.0; 336 int i; 337 int number_of_iterations = 30; 338 for (i = 1; i <= number_of_iterations && t == -1.0; i++) 339 { 340 t = speed_measure (fun, s); 341 if ( (t == -1.0) && (i+1 <= number_of_iterations) ) 342 printf("speed_measure failed for size %lu. Trying again... (%d/%d)\n", 343 s->size, i+1, number_of_iterations); 344 } 345 if (t == -1.0) 346 { 347 fprintf (stderr, "Failed to measure %s!\n", m); 348 fprintf (stderr, "If CPU frequency scaling is enabled, please disable it:\n"); 349 fprintf (stderr, " under Linux: cpufreq-selector -g performance\n"); 350 fprintf (stderr, "On a multi-core processor, you might also try to load all the cores\n"); 351 abort (); 352 } 353 return t; 354} 355 356#define THRESHOLD_WINDOW 16 357#define THRESHOLD_FINAL_WINDOW 128 358static double 359domeasure (mpfr_prec_t *threshold, 360 double (*func) (struct speed_params *), 361 mpfr_prec_t p) 362{ 363 struct speed_params s; 364 mp_size_t size; 365 double t1, t2, d; 366 367 s.align_xp = s.align_yp = s.align_wp = 64; 368 s.size = p; 369 size = (p - 1)/GMP_NUMB_BITS+1; 370 s.xp = malloc (2*size*sizeof (mp_limb_t)); 371 if (s.xp == NULL) 372 { 373 fprintf (stderr, "Can't allocate memory.\n"); 374 abort (); 375 } 376 mpn_random (s.xp, size); 377 s.yp = s.xp + size; 378 mpn_random (s.yp, size); 379 *threshold = MPFR_PREC_MAX; 380 t1 = mpfr_speed_measure (func, &s, "function 1"); 381 *threshold = 1; 382 t2 = mpfr_speed_measure (func, &s, "function 2"); 383 free (s.xp); 384 /* t1 is the time of the first algo (used for low prec) */ 385 if (t2 >= t1) 386 d = (t2 - t1) / t2; 387 else 388 d = (t2 - t1) / t1; 389 /* d > 0 if we have to use algo 1. 390 d < 0 if we have to use algo 2 */ 391 return d; 392} 393 394/* Performs measures when both the precision and the point of evaluation 395 shall vary. s.yp is ignored and not initialized. 396 It assumes that func depends on three thresholds with a boundary of the 397 form threshold1*x + threshold2*p = some scaling factor, if x<0, 398 and threshold3*x + threshold2*p = some scaling factor, if x>=0. 399*/ 400static double 401domeasure2 (long int *threshold1, long int *threshold2, long int *threshold3, 402 double (*func) (struct speed_params *), 403 mpfr_prec_t p, 404 mpfr_t x) 405{ 406 struct speed_params s; 407 mp_size_t size; 408 double t1, t2, d; 409 mpfr_t xtmp; 410 411 if (MPFR_IS_SINGULAR (x)) 412 { 413 mpfr_fprintf (stderr, "x=%RNf is not a regular number.\n"); 414 abort (); 415 } 416 if (MPFR_IS_NEG (x)) 417 s.align_xp = 2; 418 else 419 s.align_xp = 1; 420 421 s.align_yp = s.align_wp = 64; 422 s.size = p; 423 size = (p - 1)/GMP_NUMB_BITS+1; 424 425 mpfr_init2 (xtmp, p); 426 mpn_random (xtmp->_mpfr_d, size); 427 xtmp->_mpfr_d[size-1] |= MPFR_LIMB_HIGHBIT; 428 MPFR_SET_EXP (xtmp, -53); 429 mpfr_add_ui (xtmp, xtmp, 1, MPFR_RNDN); 430 mpfr_mul (xtmp, xtmp, x, MPFR_RNDN); /* xtmp = x*(1+perturb) */ 431 /* where perturb ~ 2^(-53) is */ 432 /* randomly chosen. */ 433 s.xp = xtmp->_mpfr_d; 434 s.r = MPFR_GET_EXP (xtmp); 435 436 *threshold1 = 0; 437 *threshold2 = 0; 438 *threshold3 = 0; 439 t1 = mpfr_speed_measure (func, &s, "function 1"); 440 441 if (MPFR_IS_NEG (x)) 442 *threshold1 = INT_MIN; 443 else 444 *threshold3 = INT_MAX; 445 *threshold2 = INT_MAX; 446 t2 = mpfr_speed_measure (func, &s, "function 2"); 447 448 /* t1 is the time of the first algo (used for low prec) */ 449 if (t2 >= t1) 450 d = (t2 - t1) / t2; 451 else 452 d = (t2 - t1) / t1; 453 /* d > 0 if we have to use algo 1. 454 d < 0 if we have to use algo 2 */ 455 mpfr_clear (xtmp); 456 return d; 457} 458 459/* Tune a function with a simple THRESHOLD 460 The function doesn't depend on another threshold. 461 It assumes that it uses algo1 if p < THRESHOLD 462 and algo2 otherwise. 463 if algo2 is better for low prec, and algo1 better for high prec, 464 the behaviour of this function is undefined. */ 465static void 466tune_simple_func (mpfr_prec_t *threshold, 467 double (*func) (struct speed_params *), 468 mpfr_prec_t pstart) 469{ 470 double measure[THRESHOLD_FINAL_WINDOW+1]; 471 double d; 472 mpfr_prec_t pstep; 473 int i, numpos, numneg, try; 474 mpfr_prec_t pmin, pmax, p; 475 476 /* first look for a lower bound within 10% */ 477 pmin = p = pstart; 478 d = domeasure (threshold, func, pmin); 479 if (d < 0.0) 480 { 481 if (verbose) 482 printf ("Oops: even for %lu, algo 2 seems to be faster!\n", 483 (unsigned long) pmin); 484 *threshold = MPFR_PREC_MIN; 485 return; 486 } 487 if (d >= 1.00) 488 for (;;) 489 { 490 d = domeasure (threshold, func, pmin); 491 if (d < 1.00) 492 break; 493 p = pmin; 494 pmin += pmin/2; 495 } 496 pmin = p; 497 for (;;) 498 { 499 d = domeasure (threshold, func, pmin); 500 if (d < 0.10) 501 break; 502 pmin += GMP_NUMB_BITS; 503 } 504 505 /* then look for an upper bound within 20% */ 506 pmax = pmin * 2; 507 for (;;) 508 { 509 d = domeasure (threshold, func, pmax); 510 if (d < -0.20) 511 break; 512 pmax += pmin / 2; /* don't increase too rapidly */ 513 } 514 515 /* The threshold is between pmin and pmax. Affine them */ 516 try = 0; 517 while ((pmax-pmin) >= THRESHOLD_FINAL_WINDOW) 518 { 519 pstep = MAX(MIN(GMP_NUMB_BITS/2,(pmax-pmin)/(2*THRESHOLD_WINDOW)),1); 520 if (verbose) 521 printf ("Pmin = %8lu Pmax = %8lu Pstep=%lu\n", pmin, pmax, pstep); 522 p = (pmin + pmax) / 2; 523 for (i = numpos = numneg = 0 ; i < THRESHOLD_WINDOW + 1 ; i++) 524 { 525 measure[i] = domeasure (threshold, func, 526 p+(i-THRESHOLD_WINDOW/2)*pstep); 527 if (measure[i] > 0) 528 numpos ++; 529 else if (measure[i] < 0) 530 numneg ++; 531 } 532 if (numpos > numneg) 533 /* We use more often algo 1 than algo 2 */ 534 pmin = p - THRESHOLD_WINDOW/2*pstep; 535 else if (numpos < numneg) 536 pmax = p + THRESHOLD_WINDOW/2*pstep; 537 else 538 /* numpos == numneg ... */ 539 if (++ try > 2) 540 { 541 *threshold = p; 542 if (verbose) 543 printf ("Quick find: %lu\n", *threshold); 544 return ; 545 } 546 } 547 548 /* Final tune... */ 549 if (verbose) 550 printf ("Finalizing in [%lu, %lu]... ", pmin, pmax); 551 for (i = 0 ; i < THRESHOLD_FINAL_WINDOW+1 ; i++) 552 measure[i] = domeasure (threshold, func, pmin+i); 553 i = analyze_data (measure, THRESHOLD_FINAL_WINDOW+1); 554 *threshold = pmin + i; 555 if (verbose) 556 printf ("%lu\n", *threshold); 557 return; 558} 559 560/* Tune a function which behavior depends on both p and x, 561 in a given direction. 562 It assumes that for (x,p) close to zero, algo1 is used 563 and algo2 is used when (x,p) is far from zero. 564 If algo2 is better for low prec, and algo1 better for high prec, 565 the behaviour of this function is undefined. 566 This tuning function tries couples (x,p) of the form (ell*dirx, ell*dirp) 567 until it finds a point on the boundary. It returns ell. 568 */ 569static void 570tune_simple_func_in_some_direction (long int *threshold1, 571 long int *threshold2, 572 long int *threshold3, 573 double (*func) (struct speed_params *), 574 mpfr_prec_t pstart, 575 int dirx, int dirp, 576 mpfr_t xres, mpfr_prec_t *pres) 577{ 578 double measure[THRESHOLD_FINAL_WINDOW+1]; 579 double d; 580 mpfr_prec_t pstep; 581 int i, numpos, numneg, try; 582 mpfr_prec_t pmin, pmax, p; 583 mpfr_t xmin, xmax, x; 584 mpfr_t ratio; 585 586 mpfr_init2 (ratio, MPFR_SMALL_PRECISION); 587 mpfr_set_si (ratio, dirx, MPFR_RNDN); 588 mpfr_div_si (ratio, ratio, dirp, MPFR_RNDN); 589 590 mpfr_init2 (xmin, MPFR_SMALL_PRECISION); 591 mpfr_init2 (xmax, MPFR_SMALL_PRECISION); 592 mpfr_init2 (x, MPFR_SMALL_PRECISION); 593 594 /* first look for a lower bound within 10% */ 595 pmin = p = pstart; 596 mpfr_mul_ui (xmin, ratio, (unsigned int)pmin, MPFR_RNDN); 597 mpfr_set (x, xmin, MPFR_RNDN); 598 599 d = domeasure2 (threshold1, threshold2, threshold3, func, pmin, xmin); 600 if (d < 0.0) 601 { 602 if (verbose) 603 printf ("Oops: even for %lu, algo 2 seems to be faster!\n", 604 (unsigned long) pmin); 605 *pres = MPFR_PREC_MIN; 606 mpfr_mul_ui (xres, ratio, (unsigned int)*pres, MPFR_RNDN); 607 mpfr_clear (ratio); mpfr_clear (x); mpfr_clear (xmin); mpfr_clear (xmax); 608 return; 609 } 610 if (d >= 1.00) 611 for (;;) 612 { 613 d = domeasure2 (threshold1, threshold2, threshold3, func, pmin, xmin); 614 if (d < 1.00) 615 break; 616 p = pmin; 617 mpfr_set (x, xmin, MPFR_RNDN); 618 pmin += pmin/2; 619 mpfr_mul_ui (xmin, ratio, (unsigned int)pmin, MPFR_RNDN); 620 } 621 pmin = p; 622 mpfr_set (xmin, x, MPFR_RNDN); 623 for (;;) 624 { 625 d = domeasure2 (threshold1, threshold2, threshold3, func, pmin, xmin); 626 if (d < 0.10) 627 break; 628 pmin += GMP_NUMB_BITS; 629 mpfr_mul_ui (xmin, ratio, (unsigned int)pmin, MPFR_RNDN); 630 } 631 632 /* then look for an upper bound within 20% */ 633 pmax = pmin * 2; 634 mpfr_mul_ui (xmax, ratio, (unsigned int)pmax, MPFR_RNDN); 635 for (;;) 636 { 637 d = domeasure2 (threshold1, threshold2, threshold3, func, pmax, xmax); 638 if (d < -0.20) 639 break; 640 pmax += pmin / 2; /* don't increase too rapidly */ 641 mpfr_mul_ui (xmax, ratio, (unsigned int)pmax, MPFR_RNDN); 642 } 643 644 /* The threshold is between pmin and pmax. Affine them */ 645 try = 0; 646 while ((pmax-pmin) >= THRESHOLD_FINAL_WINDOW) 647 { 648 pstep = MAX(MIN(GMP_NUMB_BITS/2,(pmax-pmin)/(2*THRESHOLD_WINDOW)),1); 649 if (verbose) 650 printf ("Pmin = %8lu Pmax = %8lu Pstep=%lu\n", pmin, pmax, pstep); 651 p = (pmin + pmax) / 2; 652 mpfr_mul_ui (x, ratio, (unsigned int)p, MPFR_RNDN); 653 for (i = numpos = numneg = 0 ; i < THRESHOLD_WINDOW + 1 ; i++) 654 { 655 *pres = p+(i-THRESHOLD_WINDOW/2)*pstep; 656 mpfr_mul_ui (xres, ratio, (unsigned int)*pres, MPFR_RNDN); 657 measure[i] = domeasure2 (threshold1, threshold2, threshold3, 658 func, *pres, xres); 659 if (measure[i] > 0) 660 numpos ++; 661 else if (measure[i] < 0) 662 numneg ++; 663 } 664 if (numpos > numneg) 665 { 666 /* We use more often algo 1 than algo 2 */ 667 pmin = p - THRESHOLD_WINDOW/2*pstep; 668 mpfr_mul_ui (xmin, ratio, (unsigned int)pmin, MPFR_RNDN); 669 } 670 else if (numpos < numneg) 671 { 672 pmax = p + THRESHOLD_WINDOW/2*pstep; 673 mpfr_mul_ui (xmax, ratio, (unsigned int)pmax, MPFR_RNDN); 674 } 675 else 676 /* numpos == numneg ... */ 677 if (++ try > 2) 678 { 679 *pres = p; 680 mpfr_mul_ui (xres, ratio, (unsigned int)*pres, MPFR_RNDN); 681 if (verbose) 682 printf ("Quick find: %lu\n", *pres); 683 mpfr_clear (ratio); 684 mpfr_clear (x); mpfr_clear (xmin); mpfr_clear (xmax); 685 return ; 686 } 687 } 688 689 /* Final tune... */ 690 if (verbose) 691 printf ("Finalizing in [%lu, %lu]... ", pmin, pmax); 692 for (i = 0 ; i < THRESHOLD_FINAL_WINDOW+1 ; i++) 693 { 694 *pres = pmin+i; 695 mpfr_mul_ui (xres, ratio, (unsigned int)*pres, MPFR_RNDN); 696 measure[i] = domeasure2 (threshold1, threshold2, threshold3, 697 func, *pres, xres); 698 } 699 i = analyze_data (measure, THRESHOLD_FINAL_WINDOW+1); 700 *pres = pmin + i; 701 mpfr_mul_ui (xres, ratio, (unsigned int)*pres, MPFR_RNDN); 702 if (verbose) 703 printf ("%lu\n", *pres); 704 mpfr_clear (ratio); mpfr_clear (x); mpfr_clear (xmin); mpfr_clear (xmax); 705 return; 706} 707 708/************************************ 709 * Tune Mulders' mulhigh function * 710 ************************************/ 711#define TOLERANCE 1.00 712#define MULDERS_TABLE_SIZE 1024 713#ifndef MPFR_MULHIGH_SIZE 714# define MPFR_MULHIGH_SIZE MULDERS_TABLE_SIZE 715#endif 716#ifndef MPFR_SQRHIGH_SIZE 717# define MPFR_SQRHIGH_SIZE MULDERS_TABLE_SIZE 718#endif 719#ifndef MPFR_DIVHIGH_SIZE 720# define MPFR_DIVHIGH_SIZE MULDERS_TABLE_SIZE 721#endif 722#define MPFR_MULHIGH_TAB_SIZE MPFR_MULHIGH_SIZE 723#define MPFR_SQRHIGH_TAB_SIZE MPFR_SQRHIGH_SIZE 724#define MPFR_DIVHIGH_TAB_SIZE MPFR_DIVHIGH_SIZE 725#include "mulders.c" 726 727static double 728speed_mpfr_mulhigh (struct speed_params *s) 729{ 730 SPEED_ROUTINE_MPN_MUL_N (mpfr_mulhigh_n); 731} 732 733static double 734speed_mpfr_sqrhigh (struct speed_params *s) 735{ 736 SPEED_ROUTINE_MPN_SQR (mpfr_sqrhigh_n); 737} 738 739static double 740speed_mpfr_divhigh (struct speed_params *s) 741{ 742 SPEED_ROUTINE_MPN_DC_DIVREM_CALL (mpfr_divhigh_n (q, a, d, s->size)); 743} 744 745#define MAX_STEPS 513 /* maximum number of values of k tried for a given n */ 746 747/* Tune mpfr_mulhigh_n for size n */ 748static mp_size_t 749tune_mul_mulders_upto (mp_size_t n) 750{ 751 struct speed_params s; 752 mp_size_t k, kbest, step; 753 double t, tbest; 754 MPFR_TMP_DECL (marker); 755 756 if (n == 0) 757 return -1; 758 759 MPFR_TMP_MARK (marker); 760 s.align_xp = s.align_yp = s.align_wp = 64; 761 s.size = n; 762 s.xp = MPFR_TMP_ALLOC (n * sizeof (mp_limb_t)); 763 s.yp = MPFR_TMP_ALLOC (n * sizeof (mp_limb_t)); 764 mpn_random (s.xp, n); 765 mpn_random (s.yp, n); 766 767 /* Check k == -1, mpn_mul_basecase */ 768 mulhigh_ktab[n] = -1; 769 kbest = -1; 770 tbest = mpfr_speed_measure (speed_mpfr_mulhigh, &s, "mpfr_mulhigh"); 771 772 /* Check k == 0, mpn_mulhigh_n_basecase */ 773 mulhigh_ktab[n] = 0; 774 t = mpfr_speed_measure (speed_mpfr_mulhigh, &s, "mpfr_mulhigh"); 775 if (t * TOLERANCE < tbest) 776 kbest = 0, tbest = t; 777 778 /* Check Mulders with cutoff point k */ 779 step = 1 + n / (2 * MAX_STEPS); 780 /* we need k >= (n+3)/2, which translates into k >= (n+4)/2 in C */ 781 for (k = (n + 4) / 2 ; k < n ; k += step) 782 { 783 mulhigh_ktab[n] = k; 784 t = mpfr_speed_measure (speed_mpfr_mulhigh, &s, "mpfr_mulhigh"); 785 if (t * TOLERANCE < tbest) 786 kbest = k, tbest = t; 787 } 788 789 mulhigh_ktab[n] = kbest; 790 791 MPFR_TMP_FREE (marker); 792 return kbest; 793} 794 795/* Tune mpfr_sqrhigh_n for size n */ 796static mp_size_t 797tune_sqr_mulders_upto (mp_size_t n) 798{ 799 struct speed_params s; 800 mp_size_t k, kbest, step; 801 double t, tbest; 802 MPFR_TMP_DECL (marker); 803 804 if (n == 0) 805 return -1; 806 807 MPFR_TMP_MARK (marker); 808 s.align_xp = s.align_wp = 64; 809 s.size = n; 810 s.xp = MPFR_TMP_ALLOC (n * sizeof (mp_limb_t)); 811 mpn_random (s.xp, n); 812 813 /* Check k == -1, mpn_sqr_basecase */ 814 sqrhigh_ktab[n] = -1; 815 kbest = -1; 816 tbest = mpfr_speed_measure (speed_mpfr_sqrhigh, &s, "mpfr_sqrhigh"); 817 818 /* Check k == 0, mpfr_mulhigh_n_basecase */ 819 sqrhigh_ktab[n] = 0; 820 t = mpfr_speed_measure (speed_mpfr_sqrhigh, &s, "mpfr_sqrhigh"); 821 if (t * TOLERANCE < tbest) 822 kbest = 0, tbest = t; 823 824 /* Check Mulders */ 825 step = 1 + n / (2 * MAX_STEPS); 826 /* we need k >= (n+3)/2, which translates into k >= (n+4)/2 in C */ 827 for (k = (n + 4) / 2 ; k < n ; k += step) 828 { 829 sqrhigh_ktab[n] = k; 830 t = mpfr_speed_measure (speed_mpfr_sqrhigh, &s, "mpfr_sqrhigh"); 831 if (t * TOLERANCE < tbest) 832 kbest = k, tbest = t; 833 } 834 835 sqrhigh_ktab[n] = kbest; 836 837 MPFR_TMP_FREE (marker); 838 return kbest; 839} 840 841/* Tune mpfr_divhigh_n for size n */ 842static mp_size_t 843tune_div_mulders_upto (mp_size_t n) 844{ 845 struct speed_params s; 846 mp_size_t k, kbest, step; 847 double t, tbest; 848 MPFR_TMP_DECL (marker); 849 850 if (n == 0) 851 return 0; 852 853 MPFR_TMP_MARK (marker); 854 s.align_xp = s.align_yp = s.align_wp = s.align_wp2 = 64; 855 s.size = n; 856 s.xp = MPFR_TMP_ALLOC (n * sizeof (mp_limb_t)); 857 s.yp = MPFR_TMP_ALLOC (n * sizeof (mp_limb_t)); 858 mpn_random (s.xp, n); 859 mpn_random (s.yp, n); 860 861 /* Check k == n, i.e., mpn_divrem */ 862 divhigh_ktab[n] = n; 863 kbest = n; 864 tbest = mpfr_speed_measure (speed_mpfr_divhigh, &s, "mpfr_divhigh"); 865 866 /* Check k == 0, i.e., mpfr_divhigh_n_basecase */ 867#if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q) 868 if (n > 2) /* mpn_sbpi1_divappr_q requires dn > 2 */ 869#endif 870 { 871 divhigh_ktab[n] = 0; 872 t = mpfr_speed_measure (speed_mpfr_divhigh, &s, "mpfr_divhigh"); 873 if (t * TOLERANCE < tbest) 874 kbest = 0, tbest = t; 875 } 876 877 /* Check Mulders */ 878 step = 1 + n / (2 * MAX_STEPS); 879 /* we should have (n+3)/2 <= k < n, which translates into 880 (n+4)/2 <= k < n in C */ 881 for (k = (n + 4) / 2 ; k < n ; k += step) 882 { 883 divhigh_ktab[n] = k; 884 t = mpfr_speed_measure (speed_mpfr_divhigh, &s, "mpfr_divhigh"); 885 if (t * TOLERANCE < tbest) 886 kbest = k, tbest = t; 887 } 888 889 divhigh_ktab[n] = kbest; 890 891 MPFR_TMP_FREE (marker); 892 893 return kbest; 894} 895 896static void 897tune_mul_mulders (FILE *f) 898{ 899 mp_size_t k; 900 901 if (verbose) 902 printf ("Tuning mpfr_mulhigh_n[%d]", (int) MPFR_MULHIGH_TAB_SIZE); 903 fprintf (f, "#define MPFR_MULHIGH_TAB \\\n "); 904 for (k = 0 ; k < MPFR_MULHIGH_TAB_SIZE ; k++) 905 { 906 fprintf (f, "%d", (int) tune_mul_mulders_upto (k)); 907 if (k != MPFR_MULHIGH_TAB_SIZE-1) 908 fputc (',', f); 909 if ((k+1) % 16 == 0) 910 fprintf (f, " \\\n "); 911 if (verbose) 912 putchar ('.'); 913 } 914 fprintf (f, " \n"); 915 if (verbose) 916 putchar ('\n'); 917} 918 919static void 920tune_sqr_mulders (FILE *f) 921{ 922 mp_size_t k; 923 924 if (verbose) 925 printf ("Tuning mpfr_sqrhigh_n[%d]", (int) MPFR_SQRHIGH_TAB_SIZE); 926 fprintf (f, "#define MPFR_SQRHIGH_TAB \\\n "); 927 for (k = 0 ; k < MPFR_SQRHIGH_TAB_SIZE ; k++) 928 { 929 fprintf (f, "%d", (int) tune_sqr_mulders_upto (k)); 930 if (k != MPFR_SQRHIGH_TAB_SIZE-1) 931 fputc (',', f); 932 if ((k+1) % 16 == 0) 933 fprintf (f, " \\\n "); 934 if (verbose) 935 putchar ('.'); 936 } 937 fprintf (f, " \n"); 938 if (verbose) 939 putchar ('\n'); 940} 941 942static void 943tune_div_mulders (FILE *f) 944{ 945 mp_size_t k; 946 947 if (verbose) 948 printf ("Tuning mpfr_divhigh_n[%d]", (int) MPFR_DIVHIGH_TAB_SIZE); 949 fprintf (f, "#define MPFR_DIVHIGH_TAB \\\n "); 950 for (k = 0 ; k < MPFR_DIVHIGH_TAB_SIZE ; k++) 951 { 952 fprintf (f, "%d", (int) tune_div_mulders_upto (k)); 953 if (k != MPFR_DIVHIGH_TAB_SIZE - 1) 954 fputc (',', f); 955 if ((k+1) % 16 == 0) 956 fprintf (f, " /*%zu-%zu*/ \\\n ", k - 15, k); 957 if (verbose) 958 putchar ('.'); 959 } 960 fprintf (f, " \n"); 961 if (verbose) 962 putchar ('\n'); 963} 964 965/******************************************************* 966 * Tuning functions for mpfr_ai * 967 *******************************************************/ 968 969long int mpfr_ai_threshold1; 970long int mpfr_ai_threshold2; 971long int mpfr_ai_threshold3; 972#undef MPFR_AI_THRESHOLD1 973#define MPFR_AI_THRESHOLD1 mpfr_ai_threshold1 974#undef MPFR_AI_THRESHOLD2 975#define MPFR_AI_THRESHOLD2 mpfr_ai_threshold2 976#undef MPFR_AI_THRESHOLD3 977#define MPFR_AI_THRESHOLD3 mpfr_ai_threshold3 978 979#include "ai.c" 980 981static double 982speed_mpfr_ai (struct speed_params *s) 983{ 984 SPEED_MPFR_FUNC_WITH_EXPONENT (mpfr_ai); 985} 986 987 988/******************************************************* 989 * Tune all the threshold of MPFR * 990 * Warning: tune the function in their dependent order!* 991 *******************************************************/ 992static void 993all (const char *filename) 994{ 995 FILE *f; 996 time_t start_time, end_time; 997 struct tm *tp; 998 mpfr_t x1, x2, x3, tmp1, tmp2; 999 mpfr_prec_t p1, p2, p3; 1000 1001 f = fopen (filename, "w"); 1002 if (f == NULL) 1003 { 1004 fprintf (stderr, "Can't open file '%s' for writing.\n", filename); 1005 abort (); 1006 } 1007 1008 speed_time_init (); 1009 if (verbose) 1010 { 1011 printf ("Using: %s\n", speed_time_string); 1012 printf ("speed_precision %d", speed_precision); 1013 if (speed_unittime == 1.0) 1014 printf (", speed_unittime 1 cycle"); 1015 else 1016 printf (", speed_unittime %.2e secs", speed_unittime); 1017 if (speed_cycletime == 1.0 || speed_cycletime == 0.0) 1018 printf (", CPU freq unknown\n"); 1019 else 1020 printf (", CPU freq %.2f MHz\n\n", 1e-6/speed_cycletime); 1021 } 1022 1023 time (&start_time); 1024 tp = localtime (&start_time); 1025 fprintf (f, "/* Generated by MPFR's tuneup.c, %d-%02d-%02d, ", 1026 tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday); 1027 1028#ifdef __ICC 1029 fprintf (f, "icc %d.%d.%d */\n", __ICC / 100, __ICC / 10 % 10, __ICC % 10); 1030#elif defined(__GNUC__) 1031#ifdef __GNUC_PATCHLEVEL__ 1032 fprintf (f, "gcc %d.%d.%d */\n", __GNUC__, __GNUC_MINOR__, 1033 __GNUC_PATCHLEVEL__); 1034#else 1035 fprintf (f, "gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__); 1036#endif 1037#elif defined (__SUNPRO_C) 1038 fprintf (f, "Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100); 1039#elif defined (__sgi) && defined (_COMPILER_VERSION) 1040 fprintf (f, "MIPSpro C %d.%d.%d */\n", 1041 _COMPILER_VERSION / 100, 1042 _COMPILER_VERSION / 10 % 10, 1043 _COMPILER_VERSION % 10); 1044#elif defined (__DECC) && defined (__DECC_VER) 1045 fprintf (f, "DEC C %d */\n", __DECC_VER); 1046#else 1047 fprintf (f, "system compiler */\n"); 1048#endif 1049 fprintf (f, "\n\n"); 1050 fprintf (f, "#ifndef MPFR_TUNE_CASE\n"); 1051 fprintf (f, "#define MPFR_TUNE_CASE \"src/mparam.h\"\n"); 1052 fprintf (f, "#endif\n\n"); 1053 1054 /* Tune mulhigh */ 1055 tune_mul_mulders (f); 1056 1057 /* Tune sqrhigh */ 1058 tune_sqr_mulders (f); 1059 1060 /* Tune divhigh */ 1061 tune_div_mulders (f); 1062 fflush (f); 1063 1064 /* Tune mpfr_mul (threshold is in limbs, but it doesn't matter too much) */ 1065 if (verbose) 1066 printf ("Tuning mpfr_mul...\n"); 1067 tune_simple_func (&mpfr_mul_threshold, speed_mpfr_mul, 1068 2*GMP_NUMB_BITS+1); 1069 fprintf (f, "#define MPFR_MUL_THRESHOLD %lu /* limbs */\n", 1070 (unsigned long) (mpfr_mul_threshold - 1) / GMP_NUMB_BITS + 1); 1071 1072 /* Tune mpfr_sqr (threshold is in limbs, but it doesn't matter too much) */ 1073 if (verbose) 1074 printf ("Tuning mpfr_sqr...\n"); 1075 tune_simple_func (&mpfr_sqr_threshold, speed_mpfr_sqr, 1076 2*GMP_NUMB_BITS+1); 1077 fprintf (f, "#define MPFR_SQR_THRESHOLD %lu /* limbs */\n", 1078 (unsigned long) (mpfr_sqr_threshold - 1) / GMP_NUMB_BITS + 1); 1079 1080 /* Tune mpfr_div (threshold is in limbs, but it doesn't matter too much) */ 1081 if (verbose) 1082 printf ("Tuning mpfr_div...\n"); 1083 tune_simple_func (&mpfr_div_threshold, speed_mpfr_div, 1084 2*GMP_NUMB_BITS+1); 1085 fprintf (f, "#define MPFR_DIV_THRESHOLD %lu /* limbs */\n", 1086 (unsigned long) (mpfr_div_threshold - 1) / GMP_NUMB_BITS + 1); 1087 1088 /* Tune mpfr_exp_2 */ 1089 if (verbose) 1090 printf ("Tuning mpfr_exp_2...\n"); 1091 tune_simple_func (&mpfr_exp_2_threshold, speed_mpfr_exp_2, GMP_NUMB_BITS); 1092 fprintf (f, "#define MPFR_EXP_2_THRESHOLD %lu /* bits */\n", 1093 (unsigned long) mpfr_exp_2_threshold); 1094 1095 /* Tune mpfr_exp */ 1096 if (verbose) 1097 printf ("Tuning mpfr_exp...\n"); 1098 tune_simple_func (&mpfr_exp_threshold, speed_mpfr_exp, 1099 MPFR_PREC_MIN+3*GMP_NUMB_BITS); 1100 fprintf (f, "#define MPFR_EXP_THRESHOLD %lu /* bits */\n", 1101 (unsigned long) mpfr_exp_threshold); 1102 1103 /* Tune mpfr_sin_cos */ 1104 if (verbose) 1105 printf ("Tuning mpfr_sin_cos...\n"); 1106 tune_simple_func (&mpfr_sincos_threshold, speed_mpfr_sincos, 1107 MPFR_PREC_MIN+3*GMP_NUMB_BITS); 1108 fprintf (f, "#define MPFR_SINCOS_THRESHOLD %lu /* bits */\n", 1109 (unsigned long) mpfr_sincos_threshold); 1110 1111 /* Tune mpfr_ai */ 1112 if (verbose) 1113 printf ("Tuning mpfr_ai...\n"); 1114 mpfr_init2 (x1, MPFR_SMALL_PRECISION); 1115 mpfr_init2 (x2, MPFR_SMALL_PRECISION); 1116 mpfr_init2 (x3, MPFR_SMALL_PRECISION); 1117 mpfr_init2 (tmp1, MPFR_SMALL_PRECISION); 1118 mpfr_init2 (tmp2, MPFR_SMALL_PRECISION); 1119 1120 tune_simple_func_in_some_direction (&mpfr_ai_threshold1, &mpfr_ai_threshold2, 1121 &mpfr_ai_threshold3, speed_mpfr_ai, 1122 MPFR_PREC_MIN+GMP_NUMB_BITS, 1123 -60, 200, x1, &p1); 1124 tune_simple_func_in_some_direction (&mpfr_ai_threshold1, &mpfr_ai_threshold2, 1125 &mpfr_ai_threshold3, speed_mpfr_ai, 1126 MPFR_PREC_MIN+GMP_NUMB_BITS, 1127 -20, 500, x2, &p2); 1128 tune_simple_func_in_some_direction (&mpfr_ai_threshold1, &mpfr_ai_threshold2, 1129 &mpfr_ai_threshold3, speed_mpfr_ai, 1130 MPFR_PREC_MIN+GMP_NUMB_BITS, 1131 40, 200, x3, &p3); 1132 1133 mpfr_mul_ui (tmp1, x2, (unsigned long)p1, MPFR_RNDN); 1134 mpfr_mul_ui (tmp2, x1, (unsigned long)p2, MPFR_RNDN); 1135 mpfr_sub (tmp1, tmp1, tmp2, MPFR_RNDN); 1136 mpfr_div_ui (tmp1, tmp1, MPFR_AI_SCALE, MPFR_RNDN); 1137 1138 mpfr_set_ui (tmp2, (unsigned long)p1, MPFR_RNDN); 1139 mpfr_sub_ui (tmp2, tmp2, (unsigned long)p2, MPFR_RNDN); 1140 mpfr_div (tmp2, tmp2, tmp1, MPFR_RNDN); 1141 mpfr_ai_threshold1 = mpfr_get_si (tmp2, MPFR_RNDN); 1142 1143 mpfr_sub (tmp2, x2, x1, MPFR_RNDN); 1144 mpfr_div (tmp2, tmp2, tmp1, MPFR_RNDN); 1145 mpfr_ai_threshold2 = mpfr_get_si (tmp2, MPFR_RNDN); 1146 1147 mpfr_set_ui (tmp1, (unsigned long)p3, MPFR_RNDN); 1148 mpfr_mul_si (tmp1, tmp1, mpfr_ai_threshold2, MPFR_RNDN); 1149 mpfr_ui_sub (tmp1, MPFR_AI_SCALE, tmp1, MPFR_RNDN); 1150 mpfr_div (tmp1, tmp1, x3, MPFR_RNDN); 1151 mpfr_ai_threshold3 = mpfr_get_si (tmp1, MPFR_RNDN); 1152 1153 fprintf (f, "#define MPFR_AI_THRESHOLD1 %ld /* threshold for negative input of mpfr_ai */\n", mpfr_ai_threshold1); 1154 fprintf (f, "#define MPFR_AI_THRESHOLD2 %ld\n", mpfr_ai_threshold2); 1155 fprintf (f, "#define MPFR_AI_THRESHOLD3 %ld\n", mpfr_ai_threshold3); 1156 1157 mpfr_clear (x1); mpfr_clear (x2); mpfr_clear (x3); 1158 mpfr_clear (tmp1); mpfr_clear (tmp2); 1159 1160 /* End of tuning */ 1161 time (&end_time); 1162 fprintf (f, "/* Tuneup completed successfully, took %ld seconds */\n", 1163 (long) (end_time - start_time)); 1164 if (verbose) 1165 printf ("Complete (took %ld seconds).\n", (long) (end_time - start_time)); 1166 1167 fclose (f); 1168} 1169 1170 1171/* Main function */ 1172int main (int argc, char *argv[]) 1173{ 1174 /* Unbuffered so if output is redirected to a file it isn't lost if the 1175 program is killed part way through. */ 1176 setbuf (stdout, NULL); 1177 setbuf (stderr, NULL); 1178 1179 verbose = argc > 1; 1180 1181 if (verbose) 1182 printf ("Tuning MPFR (Coffee time?)...\n"); 1183 1184 all ("mparam.h"); 1185 1186 return 0; 1187} 1188