1 /* Create tuned thresholds for various algorithms. 2 3 Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010, 4 2011, 2012 Free Software Foundation, Inc. 5 6 This file is part of the GNU MP Library. 7 8 The GNU MP Library is free software; you can redistribute it and/or modify 9 it under the terms of the GNU Lesser General Public License as published by 10 the Free Software Foundation; either version 3 of the License, or (at your 11 option) any later version. 12 13 The GNU MP Library is distributed in the hope that it will be useful, but 14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 16 License for more details. 17 18 You should have received a copy of the GNU Lesser General Public License 19 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 20 21 22 /* Usage: tuneup [-t] [-t] [-p precision] 23 24 -t turns on some diagnostic traces, a second -t turns on more traces. 25 26 Notes: 27 28 The code here isn't a vision of loveliness, mainly because it's subject 29 to ongoing changes according to new things wanting to be tuned, and 30 practical requirements of systems tested. 31 32 Sometimes running the program twice produces slightly different results. 33 This is probably because there's so little separating algorithms near 34 their crossover, and on that basis it should make little or no difference 35 to the final speed of the relevant routines, but nothing has been done to 36 check that carefully. 37 38 Algorithm: 39 40 The thresholds are determined as follows. A crossover may not be a 41 single size but rather a range where it oscillates between method A or 42 method B faster. If the threshold is set making B used where A is faster 43 (or vice versa) that's bad. Badness is the percentage time lost and 44 total badness is the sum of this over all sizes measured. The threshold 45 is set to minimize total badness. 46 47 Suppose, as sizes increase, method B becomes faster than method A. The 48 effect of the rule is that, as you look at increasing sizes, isolated 49 points where B is faster are ignored, but when it's consistently faster, 50 or faster on balance, then the threshold is set there. The same result 51 is obtained thinking in the other direction of A becoming faster at 52 smaller sizes. 53 54 In practice the thresholds tend to be chosen to bring on the next 55 algorithm fairly quickly. 56 57 This rule is attractive because it's got a basis in reason and is fairly 58 easy to implement, but no work has been done to actually compare it in 59 absolute terms to other possibilities. 60 61 Implementation: 62 63 In a normal library build the thresholds are constants. To tune them 64 selected objects are recompiled with the thresholds as global variables 65 instead. #define TUNE_PROGRAM_BUILD does this, with help from code at 66 the end of gmp-impl.h, and rules in tune/Makefile.am. 67 68 MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n. The 69 threshold is set to "size+1" to avoid karatsuba, or to "size" to use one 70 level, but recurse into the basecase. 71 72 MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value. 73 Other routines in turn will make use of both of those. Naturally the 74 dependants must be tuned first. 75 76 In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling, 77 just a threshold based on comparing two routines (mpn_divrem_1 and 78 mpn_divexact_1), and no further use of the value determined. 79 80 Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being 81 just comparisons between certain routines on representative data. 82 83 Shortcuts are applied when native (assembler) versions of routines exist. 84 For instance a native mpn_sqr_basecase is assumed to be always faster 85 than mpn_mul_basecase, with no measuring. 86 87 No attempt is made to tune within assembler routines, for instance 88 DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be 89 written and tuned all by hand. Assembler routines that might have hard 90 limits are recompiled though, to make them accept a bigger range of sizes 91 than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr. 92 93 Limitations: 94 95 The FFTs aren't subject to the same badness rule as the other thresholds, 96 so each k is probably being brought on a touch early. This isn't likely 97 to make a difference, and the simpler probing means fewer tests. 98 99 */ 100 101 #define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */ 102 103 #include "config.h" 104 105 #include <math.h> 106 #include <stdio.h> 107 #include <stdlib.h> 108 #include <time.h> 109 #if HAVE_UNISTD_H 110 #include <unistd.h> 111 #endif 112 113 #include "gmp.h" 114 #include "gmp-impl.h" 115 #include "longlong.h" 116 117 #include "tests.h" 118 #include "speed.h" 119 120 #if !HAVE_DECL_OPTARG 121 extern char *optarg; 122 extern int optind, opterr; 123 #endif 124 125 126 #define DEFAULT_MAX_SIZE 1000 /* limbs */ 127 128 #if WANT_FFT 129 mp_size_t option_fft_max_size = 50000; /* limbs */ 130 #else 131 mp_size_t option_fft_max_size = 0; 132 #endif 133 int option_trace = 0; 134 int option_fft_trace = 0; 135 struct speed_params s; 136 137 struct dat_t { 138 mp_size_t size; 139 double d; 140 } *dat = NULL; 141 int ndat = 0; 142 int allocdat = 0; 143 144 /* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that 145 case use zero here, which for params.max_size means no limit. */ 146 #ifndef TUNE_SQR_TOOM2_MAX 147 #define TUNE_SQR_TOOM2_MAX 0 148 #endif 149 150 mp_size_t mul_toom22_threshold = MP_SIZE_T_MAX; 151 mp_size_t mul_toom33_threshold = MUL_TOOM33_THRESHOLD_LIMIT; 152 mp_size_t mul_toom44_threshold = MUL_TOOM44_THRESHOLD_LIMIT; 153 mp_size_t mul_toom6h_threshold = MUL_TOOM6H_THRESHOLD_LIMIT; 154 mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT; 155 mp_size_t mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX; 156 mp_size_t mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX; 157 mp_size_t mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX; 158 mp_size_t mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX; 159 mp_size_t mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX; 160 mp_size_t mul_fft_threshold = MP_SIZE_T_MAX; 161 mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX; 162 mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX; 163 mp_size_t sqr_toom2_threshold 164 = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX); 165 mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT; 166 mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT; 167 mp_size_t sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT; 168 mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT; 169 mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX; 170 mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX; 171 mp_size_t mullo_basecase_threshold = MP_SIZE_T_MAX; 172 mp_size_t mullo_dc_threshold = MP_SIZE_T_MAX; 173 mp_size_t mullo_mul_n_threshold = MP_SIZE_T_MAX; 174 mp_size_t mulmid_toom42_threshold = MP_SIZE_T_MAX; 175 mp_size_t mulmod_bnm1_threshold = MP_SIZE_T_MAX; 176 mp_size_t sqrmod_bnm1_threshold = MP_SIZE_T_MAX; 177 mp_size_t div_qr_2_pi2_threshold = MP_SIZE_T_MAX; 178 mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX; 179 mp_size_t dc_divappr_q_threshold = MP_SIZE_T_MAX; 180 mp_size_t mu_div_qr_threshold = MP_SIZE_T_MAX; 181 mp_size_t mu_divappr_q_threshold = MP_SIZE_T_MAX; 182 mp_size_t mupi_div_qr_threshold = MP_SIZE_T_MAX; 183 mp_size_t mu_div_q_threshold = MP_SIZE_T_MAX; 184 mp_size_t dc_bdiv_qr_threshold = MP_SIZE_T_MAX; 185 mp_size_t dc_bdiv_q_threshold = MP_SIZE_T_MAX; 186 mp_size_t mu_bdiv_qr_threshold = MP_SIZE_T_MAX; 187 mp_size_t mu_bdiv_q_threshold = MP_SIZE_T_MAX; 188 mp_size_t inv_mulmod_bnm1_threshold = MP_SIZE_T_MAX; 189 mp_size_t inv_newton_threshold = MP_SIZE_T_MAX; 190 mp_size_t inv_appr_threshold = MP_SIZE_T_MAX; 191 mp_size_t binv_newton_threshold = MP_SIZE_T_MAX; 192 mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX; 193 mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX; 194 mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX; 195 mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX; 196 mp_size_t hgcd_threshold = MP_SIZE_T_MAX; 197 mp_size_t hgcd_appr_threshold = MP_SIZE_T_MAX; 198 mp_size_t hgcd_reduce_threshold = MP_SIZE_T_MAX; 199 mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX; 200 mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX; 201 mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX; 202 mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX; 203 mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX; 204 mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX; 205 int mod_1_1p_method = 0; 206 mp_size_t mod_1n_to_mod_1_1_threshold = MP_SIZE_T_MAX; 207 mp_size_t mod_1u_to_mod_1_1_threshold = MP_SIZE_T_MAX; 208 mp_size_t mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX; 209 mp_size_t mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX; 210 mp_size_t preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX; 211 mp_size_t divrem_2_threshold = MP_SIZE_T_MAX; 212 mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX; 213 mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX; 214 mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX; 215 mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX; 216 mp_size_t fac_odd_threshold = 0; 217 mp_size_t fac_dsc_threshold = FAC_DSC_THRESHOLD_LIMIT; 218 219 mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX; 220 mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX; 221 222 struct param_t { 223 const char *name; 224 speed_function_t function; 225 speed_function_t function2; 226 double step_factor; /* how much to step relatively */ 227 int step; /* how much to step absolutely */ 228 double function_fudge; /* multiplier for "function" speeds */ 229 int stop_since_change; 230 double stop_factor; 231 mp_size_t min_size; 232 int min_is_always; 233 mp_size_t max_size; 234 mp_size_t check_size; 235 mp_size_t size_extra; 236 237 #define DATA_HIGH_LT_R 1 238 #define DATA_HIGH_GE_R 2 239 int data_high; 240 241 int noprint; 242 }; 243 244 245 /* These are normally undefined when false, which suits "#if" fine. 246 But give them zero values so they can be used in plain C "if"s. */ 247 #ifndef UDIV_PREINV_ALWAYS 248 #define UDIV_PREINV_ALWAYS 0 249 #endif 250 #ifndef HAVE_NATIVE_mpn_divexact_1 251 #define HAVE_NATIVE_mpn_divexact_1 0 252 #endif 253 #ifndef HAVE_NATIVE_mpn_divrem_1 254 #define HAVE_NATIVE_mpn_divrem_1 0 255 #endif 256 #ifndef HAVE_NATIVE_mpn_divrem_2 257 #define HAVE_NATIVE_mpn_divrem_2 0 258 #endif 259 #ifndef HAVE_NATIVE_mpn_mod_1 260 #define HAVE_NATIVE_mpn_mod_1 0 261 #endif 262 #ifndef HAVE_NATIVE_mpn_mod_1_1p 263 #define HAVE_NATIVE_mpn_mod_1_1p 0 264 #endif 265 #ifndef HAVE_NATIVE_mpn_modexact_1_odd 266 #define HAVE_NATIVE_mpn_modexact_1_odd 0 267 #endif 268 #ifndef HAVE_NATIVE_mpn_preinv_divrem_1 269 #define HAVE_NATIVE_mpn_preinv_divrem_1 0 270 #endif 271 #ifndef HAVE_NATIVE_mpn_preinv_mod_1 272 #define HAVE_NATIVE_mpn_preinv_mod_1 0 273 #endif 274 #ifndef HAVE_NATIVE_mpn_sqr_basecase 275 #define HAVE_NATIVE_mpn_sqr_basecase 0 276 #endif 277 278 279 #define MAX3(a,b,c) MAX (MAX (a, b), c) 280 281 mp_limb_t 282 randlimb_norm (void) 283 { 284 mp_limb_t n; 285 mpn_random (&n, 1); 286 n |= GMP_NUMB_HIGHBIT; 287 return n; 288 } 289 290 #define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1) 291 292 mp_limb_t 293 randlimb_half (void) 294 { 295 mp_limb_t n; 296 mpn_random (&n, 1); 297 n &= GMP_NUMB_HALFMASK; 298 n += (n==0); 299 return n; 300 } 301 302 303 /* Add an entry to the end of the dat[] array, reallocing to make it bigger 304 if necessary. */ 305 void 306 add_dat (mp_size_t size, double d) 307 { 308 #define ALLOCDAT_STEP 500 309 310 ASSERT_ALWAYS (ndat <= allocdat); 311 312 if (ndat == allocdat) 313 { 314 dat = (struct dat_t *) __gmp_allocate_or_reallocate 315 (dat, allocdat * sizeof(dat[0]), 316 (allocdat+ALLOCDAT_STEP) * sizeof(dat[0])); 317 allocdat += ALLOCDAT_STEP; 318 } 319 320 dat[ndat].size = size; 321 dat[ndat].d = d; 322 ndat++; 323 } 324 325 326 /* Return the threshold size based on the data accumulated. */ 327 mp_size_t 328 analyze_dat (int final) 329 { 330 double x, min_x; 331 int j, min_j; 332 333 /* If the threshold is set at dat[0].size, any positive values are bad. */ 334 x = 0.0; 335 for (j = 0; j < ndat; j++) 336 if (dat[j].d > 0.0) 337 x += dat[j].d; 338 339 if (option_trace >= 2 && final) 340 { 341 printf ("\n"); 342 printf ("x is the sum of the badness from setting thresh at given size\n"); 343 printf (" (minimum x is sought)\n"); 344 printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x); 345 } 346 347 min_x = x; 348 min_j = 0; 349 350 351 /* When stepping to the next dat[j].size, positive values are no longer 352 bad (so subtracted), negative values become bad (so add the absolute 353 value, meaning subtract). */ 354 for (j = 0; j < ndat; x -= dat[j].d, j++) 355 { 356 if (option_trace >= 2 && final) 357 printf ("size=%ld x=%.4f\n", (long) dat[j].size, x); 358 359 if (x < min_x) 360 { 361 min_x = x; 362 min_j = j; 363 } 364 } 365 366 return min_j; 367 } 368 369 370 /* Measuring for recompiled mpn/generic/divrem_1.c, mpn/generic/mod_1.c 371 * and mpz/fac_ui.c */ 372 373 mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t); 374 mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t); 375 void mpz_fac_ui_tune (mpz_ptr, unsigned long); 376 377 double 378 speed_mpn_mod_1_tune (struct speed_params *s) 379 { 380 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune); 381 } 382 double 383 speed_mpn_divrem_1_tune (struct speed_params *s) 384 { 385 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune); 386 } 387 double 388 speed_mpz_fac_ui_tune (struct speed_params *s) 389 { 390 SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune); 391 } 392 393 394 double 395 tuneup_measure (speed_function_t fun, 396 const struct param_t *param, 397 struct speed_params *s) 398 { 399 static struct param_t dummy; 400 double t; 401 TMP_DECL; 402 403 if (! param) 404 param = &dummy; 405 406 s->size += param->size_extra; 407 408 TMP_MARK; 409 SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0); 410 SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0); 411 412 mpn_random (s->xp, s->size); 413 mpn_random (s->yp, s->size); 414 415 switch (param->data_high) { 416 case DATA_HIGH_LT_R: 417 s->xp[s->size-1] %= s->r; 418 s->yp[s->size-1] %= s->r; 419 break; 420 case DATA_HIGH_GE_R: 421 s->xp[s->size-1] |= s->r; 422 s->yp[s->size-1] |= s->r; 423 break; 424 } 425 426 t = speed_measure (fun, s); 427 428 s->size -= param->size_extra; 429 430 TMP_FREE; 431 return t; 432 } 433 434 435 #define PRINT_WIDTH 31 436 437 void 438 print_define_start (const char *name) 439 { 440 printf ("#define %-*s ", PRINT_WIDTH, name); 441 if (option_trace) 442 printf ("...\n"); 443 } 444 445 void 446 print_define_end_remark (const char *name, mp_size_t value, const char *remark) 447 { 448 if (option_trace) 449 printf ("#define %-*s ", PRINT_WIDTH, name); 450 451 if (value == MP_SIZE_T_MAX) 452 printf ("MP_SIZE_T_MAX"); 453 else 454 printf ("%5ld", (long) value); 455 456 if (remark != NULL) 457 printf (" /* %s */", remark); 458 printf ("\n"); 459 fflush (stdout); 460 } 461 462 void 463 print_define_end (const char *name, mp_size_t value) 464 { 465 const char *remark; 466 if (value == MP_SIZE_T_MAX) 467 remark = "never"; 468 else if (value == 0) 469 remark = "always"; 470 else 471 remark = NULL; 472 print_define_end_remark (name, value, remark); 473 } 474 475 void 476 print_define (const char *name, mp_size_t value) 477 { 478 print_define_start (name); 479 print_define_end (name, value); 480 } 481 482 void 483 print_define_remark (const char *name, mp_size_t value, const char *remark) 484 { 485 print_define_start (name); 486 print_define_end_remark (name, value, remark); 487 } 488 489 490 void 491 one (mp_size_t *threshold, struct param_t *param) 492 { 493 int since_positive, since_thresh_change; 494 int thresh_idx, new_thresh_idx; 495 496 #define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0) 497 498 DEFAULT (param->function_fudge, 1.0); 499 DEFAULT (param->function2, param->function); 500 DEFAULT (param->step_factor, 0.01); /* small steps by default */ 501 DEFAULT (param->step, 1); /* small steps by default */ 502 DEFAULT (param->stop_since_change, 80); 503 DEFAULT (param->stop_factor, 1.2); 504 DEFAULT (param->min_size, 10); 505 DEFAULT (param->max_size, DEFAULT_MAX_SIZE); 506 507 if (param->check_size != 0) 508 { 509 double t1, t2; 510 s.size = param->check_size; 511 512 *threshold = s.size+1; 513 t1 = tuneup_measure (param->function, param, &s); 514 515 *threshold = s.size; 516 t2 = tuneup_measure (param->function2, param, &s); 517 if (t1 == -1.0 || t2 == -1.0) 518 { 519 printf ("Oops, can't run both functions at size %ld\n", 520 (long) s.size); 521 abort (); 522 } 523 t1 *= param->function_fudge; 524 525 /* ask that t2 is at least 4% below t1 */ 526 if (t1 < t2*1.04) 527 { 528 if (option_trace) 529 printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2); 530 *threshold = MP_SIZE_T_MAX; 531 if (! param->noprint) 532 print_define (param->name, *threshold); 533 return; 534 } 535 536 if (option_trace >= 2) 537 printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n", 538 (long) s.size, t1, t2); 539 } 540 541 if (! param->noprint || option_trace) 542 print_define_start (param->name); 543 544 ndat = 0; 545 since_positive = 0; 546 since_thresh_change = 0; 547 thresh_idx = 0; 548 549 if (option_trace >= 2) 550 { 551 printf (" algorithm-A algorithm-B ratio possible\n"); 552 printf (" (seconds) (seconds) diff thresh\n"); 553 } 554 555 for (s.size = param->min_size; 556 s.size < param->max_size; 557 s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step)) 558 { 559 double ti, tiplus1, d; 560 561 /* 562 FIXME: check minimum size requirements are met, possibly by just 563 checking for the -1 returns from the speed functions. 564 */ 565 566 /* using method A at this size */ 567 *threshold = s.size+1; 568 ti = tuneup_measure (param->function, param, &s); 569 if (ti == -1.0) 570 abort (); 571 ti *= param->function_fudge; 572 573 /* using method B at this size */ 574 *threshold = s.size; 575 tiplus1 = tuneup_measure (param->function2, param, &s); 576 if (tiplus1 == -1.0) 577 abort (); 578 579 /* Calculate the fraction by which the one or the other routine is 580 slower. */ 581 if (tiplus1 >= ti) 582 d = (tiplus1 - ti) / tiplus1; /* negative */ 583 else 584 d = (tiplus1 - ti) / ti; /* positive */ 585 586 add_dat (s.size, d); 587 588 new_thresh_idx = analyze_dat (0); 589 590 if (option_trace >= 2) 591 printf ("size=%ld %.9f %.9f % .4f %c %ld\n", 592 (long) s.size, ti, tiplus1, d, 593 ti > tiplus1 ? '#' : ' ', 594 (long) dat[new_thresh_idx].size); 595 596 /* Stop if the last time method i was faster was more than a 597 certain number of measurements ago. */ 598 #define STOP_SINCE_POSITIVE 200 599 if (d >= 0) 600 since_positive = 0; 601 else 602 if (++since_positive > STOP_SINCE_POSITIVE) 603 { 604 if (option_trace >= 1) 605 printf ("stopped due to since_positive (%d)\n", 606 STOP_SINCE_POSITIVE); 607 break; 608 } 609 610 /* Stop if method A has become slower by a certain factor. */ 611 if (ti >= tiplus1 * param->stop_factor) 612 { 613 if (option_trace >= 1) 614 printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n", 615 param->stop_factor); 616 break; 617 } 618 619 /* Stop if the threshold implied hasn't changed in a certain 620 number of measurements. (It's this condition that usually 621 stops the loop.) */ 622 if (thresh_idx != new_thresh_idx) 623 since_thresh_change = 0, thresh_idx = new_thresh_idx; 624 else 625 if (++since_thresh_change > param->stop_since_change) 626 { 627 if (option_trace >= 1) 628 printf ("stopped due to since_thresh_change (%d)\n", 629 param->stop_since_change); 630 break; 631 } 632 633 /* Stop if the threshold implied is more than a certain number of 634 measurements ago. */ 635 #define STOP_SINCE_AFTER 500 636 if (ndat - thresh_idx > STOP_SINCE_AFTER) 637 { 638 if (option_trace >= 1) 639 printf ("stopped due to ndat - thresh_idx > amount (%d)\n", 640 STOP_SINCE_AFTER); 641 break; 642 } 643 644 /* Stop when the size limit is reached before the end of the 645 crossover, but only show this as an error for >= the default max 646 size. FIXME: Maybe should make it a param choice whether this is 647 an error. */ 648 if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE) 649 { 650 fprintf (stderr, "%s\n", param->name); 651 fprintf (stderr, "sizes %ld to %ld total %d measurements\n", 652 (long) dat[0].size, (long) dat[ndat-1].size, ndat); 653 fprintf (stderr, " max size reached before end of crossover\n"); 654 break; 655 } 656 } 657 658 if (option_trace >= 1) 659 printf ("sizes %ld to %ld total %d measurements\n", 660 (long) dat[0].size, (long) dat[ndat-1].size, ndat); 661 662 *threshold = dat[analyze_dat (1)].size; 663 664 if (param->min_is_always) 665 { 666 if (*threshold == param->min_size) 667 *threshold = 0; 668 } 669 670 if (! param->noprint || option_trace) 671 print_define_end (param->name, *threshold); 672 } 673 674 675 /* Special probing for the fft thresholds. The size restrictions on the 676 FFTs mean the graph of time vs size has a step effect. See this for 677 example using 678 679 ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9 680 gnuplot foo.gnuplot 681 682 The current approach is to compare routines at the midpoint of relevant 683 steps. Arguably a more sophisticated system of threshold data is wanted 684 if this step effect remains. */ 685 686 struct fft_param_t { 687 const char *table_name; 688 const char *threshold_name; 689 const char *modf_threshold_name; 690 mp_size_t *p_threshold; 691 mp_size_t *p_modf_threshold; 692 mp_size_t first_size; 693 mp_size_t max_size; 694 speed_function_t function; 695 speed_function_t mul_modf_function; 696 speed_function_t mul_function; 697 mp_size_t sqr; 698 }; 699 700 701 /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with 702 N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple 703 of 2^(k-1) bits. */ 704 705 mp_size_t 706 fft_step_size (int k) 707 { 708 mp_size_t step; 709 710 step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS; 711 step *= (mp_size_t) 1 << k; 712 713 if (step <= 0) 714 { 715 printf ("Can't handle k=%d\n", k); 716 abort (); 717 } 718 719 return step; 720 } 721 722 mp_size_t 723 fft_next_size (mp_size_t pl, int k) 724 { 725 mp_size_t m = fft_step_size (k); 726 727 /* printf ("[k=%d %ld] %ld ->", k, m, pl); */ 728 729 if (pl == 0 || (pl & (m-1)) != 0) 730 pl = (pl | (m-1)) + 1; 731 732 /* printf (" %ld\n", pl); */ 733 return pl; 734 } 735 736 #define NMAX_DEFAULT 1000000 737 #define MAX_REPS 25 738 #define MIN_REPS 5 739 740 static inline size_t 741 mpn_mul_fft_lcm (size_t a, unsigned int k) 742 { 743 unsigned int l = k; 744 745 while (a % 2 == 0 && k > 0) 746 { 747 a >>= 1; 748 k--; 749 } 750 return a << l; 751 } 752 753 mp_size_t 754 fftfill (mp_size_t pl, int k, int sqr) 755 { 756 mp_size_t maxLK; 757 mp_bitcnt_t N, Nprime, nprime, M; 758 759 N = pl * GMP_NUMB_BITS; 760 M = N >> k; 761 762 maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k); 763 764 Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK; 765 nprime = Nprime / GMP_NUMB_BITS; 766 if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) 767 { 768 size_t K2; 769 for (;;) 770 { 771 K2 = 1L << mpn_fft_best_k (nprime, sqr); 772 if ((nprime & (K2 - 1)) == 0) 773 break; 774 nprime = (nprime + K2 - 1) & -K2; 775 Nprime = nprime * GMP_LIMB_BITS; 776 } 777 } 778 ASSERT_ALWAYS (nprime < pl); 779 780 return Nprime; 781 } 782 783 static int 784 compare_double (const void *ap, const void *bp) 785 { 786 double a = * (const double *) ap; 787 double b = * (const double *) bp; 788 789 if (a < b) 790 return -1; 791 else if (a > b) 792 return 1; 793 else 794 return 0; 795 } 796 797 double 798 median (double *times, int n) 799 { 800 qsort (times, n, sizeof (double), compare_double); 801 return times[n/2]; 802 } 803 804 #define FFT_CACHE_SIZE 25 805 typedef struct fft_cache 806 { 807 mp_size_t n; 808 double time; 809 } fft_cache_t; 810 811 fft_cache_t fft_cache[FFT_CACHE_SIZE]; 812 813 double 814 cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k, 815 int n_measurements) 816 { 817 int i; 818 double t, ttab[MAX_REPS]; 819 820 if (fft_cache[k].n == n) 821 return fft_cache[k].time; 822 823 for (i = 0; i < n_measurements; i++) 824 { 825 speed_starttime (); 826 mpn_mul_fft (rp, n, ap, n, bp, n, k); 827 ttab[i] = speed_endtime (); 828 } 829 830 t = median (ttab, n_measurements); 831 fft_cache[k].n = n; 832 fft_cache[k].time = t; 833 return t; 834 } 835 836 #define INSERT_FFTTAB(idx, nval, kval) \ 837 do { \ 838 fft_tab[idx].n = nval; \ 839 fft_tab[idx].k = kval; \ 840 fft_tab[idx+1].n = -1; /* sentinel */ \ 841 fft_tab[idx+1].k = -1; \ 842 } while (0) 843 844 int 845 fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print) 846 { 847 mp_size_t n, n1, prev_n1; 848 int k, best_k, last_best_k, kmax; 849 int eff, prev_eff; 850 double t0, t1; 851 int n_measurements; 852 mp_limb_t *ap, *bp, *rp; 853 mp_size_t alloc; 854 char *linepref; 855 struct fft_table_nk *fft_tab; 856 857 fft_tab = mpn_fft_table3[p->sqr]; 858 859 for (k = 0; k < FFT_CACHE_SIZE; k++) 860 fft_cache[k].n = 0; 861 862 if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) 863 { 864 nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD); 865 } 866 867 if (print) 868 printf ("#define %s%*s", p->table_name, 38, ""); 869 870 if (idx == 0) 871 { 872 INSERT_FFTTAB (0, nmin, initial_k); 873 874 if (print) 875 { 876 printf ("\\\n { "); 877 printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k); 878 linepref = " "; 879 } 880 881 idx = 1; 882 } 883 884 ap = malloc (sizeof (mp_limb_t)); 885 if (p->sqr) 886 bp = ap; 887 else 888 bp = malloc (sizeof (mp_limb_t)); 889 rp = malloc (sizeof (mp_limb_t)); 890 alloc = 1; 891 892 /* Round n to comply to initial k value */ 893 n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k); 894 895 n_measurements = (18 - initial_k) | 1; 896 n_measurements = MAX (n_measurements, MIN_REPS); 897 n_measurements = MIN (n_measurements, MAX_REPS); 898 899 last_best_k = initial_k; 900 best_k = initial_k; 901 902 while (n < nmax) 903 { 904 int start_k, end_k; 905 906 /* Assume the current best k is best until we hit its next FFT step. */ 907 t0 = 99999; 908 909 prev_n1 = n + 1; 910 911 start_k = MAX (4, best_k - 4); 912 end_k = MIN (24, best_k + 4); 913 for (k = start_k; k <= end_k; k++) 914 { 915 n1 = mpn_fft_next_size (prev_n1, k); 916 917 eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr); 918 919 if (eff < 70) /* avoid measuring too slow fft:s */ 920 continue; 921 922 if (n1 > alloc) 923 { 924 alloc = n1; 925 if (p->sqr) 926 { 927 ap = realloc (ap, sizeof (mp_limb_t)); 928 rp = realloc (rp, sizeof (mp_limb_t)); 929 ap = bp = realloc (ap, alloc * sizeof (mp_limb_t)); 930 mpn_random (ap, alloc); 931 rp = realloc (rp, alloc * sizeof (mp_limb_t)); 932 } 933 else 934 { 935 ap = realloc (ap, sizeof (mp_limb_t)); 936 bp = realloc (bp, sizeof (mp_limb_t)); 937 rp = realloc (rp, sizeof (mp_limb_t)); 938 ap = realloc (ap, alloc * sizeof (mp_limb_t)); 939 mpn_random (ap, alloc); 940 bp = realloc (bp, alloc * sizeof (mp_limb_t)); 941 mpn_random (bp, alloc); 942 rp = realloc (rp, alloc * sizeof (mp_limb_t)); 943 } 944 } 945 946 t1 = cached_measure (rp, ap, bp, n1, k, n_measurements); 947 948 if (t1 * n_measurements > 0.3) 949 n_measurements -= 2; 950 n_measurements = MAX (n_measurements, MIN_REPS); 951 952 if (t1 < t0) 953 { 954 best_k = k; 955 t0 = t1; 956 } 957 } 958 959 n1 = mpn_fft_next_size (prev_n1, best_k); 960 961 if (last_best_k != best_k) 962 { 963 ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1); 964 965 if (idx >= FFT_TABLE3_SIZE) 966 { 967 printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n"); 968 abort (); 969 } 970 INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k); 971 972 if (print) 973 { 974 printf (", "); 975 if (idx % 4 == 0) 976 printf ("\\\n "); 977 printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k); 978 } 979 980 if (option_trace >= 2) 981 { 982 printf ("{%lu,%u}\n", prev_n1, best_k); 983 fflush (stdout); 984 } 985 986 last_best_k = best_k; 987 idx++; 988 } 989 990 for (;;) 991 { 992 prev_n1 = n1; 993 prev_eff = fftfill (prev_n1, best_k, p->sqr); 994 n1 = mpn_fft_next_size (prev_n1 + 1, best_k); 995 eff = fftfill (n1, best_k, p->sqr); 996 997 if (eff != prev_eff) 998 break; 999 } 1000 1001 n = prev_n1; 1002 } 1003 1004 kmax = sizeof (mp_size_t) * 4; /* GMP_MP_SIZE_T_BITS / 2 */ 1005 kmax = MIN (kmax, 25-1); 1006 for (k = last_best_k + 1; k <= kmax; k++) 1007 { 1008 if (idx >= FFT_TABLE3_SIZE) 1009 { 1010 printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n"); 1011 abort (); 1012 } 1013 INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k); 1014 1015 if (print) 1016 { 1017 printf (", "); 1018 if (idx % 4 == 0) 1019 printf ("\\\n "); 1020 printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k); 1021 } 1022 1023 idx++; 1024 } 1025 1026 if (print) 1027 printf (" }\n"); 1028 1029 free (ap); 1030 if (! p->sqr) 1031 free (bp); 1032 free (rp); 1033 1034 return idx; 1035 } 1036 1037 void 1038 fft (struct fft_param_t *p) 1039 { 1040 mp_size_t size; 1041 int k, idx, initial_k; 1042 1043 /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/ 1044 1045 #if 1 1046 { 1047 /* Use plain one() mechanism, for some reasonable initial values of k. The 1048 advantage is that we don't depend on mpn_fft_table3, which can therefore 1049 leave it completely uninitialized. */ 1050 1051 static struct param_t param; 1052 mp_size_t thres, best_thres; 1053 int best_k; 1054 char buf[20]; 1055 1056 best_thres = MP_SIZE_T_MAX; 1057 best_k = -1; 1058 1059 for (k = 5; k <= 7; k++) 1060 { 1061 param.name = p->modf_threshold_name; 1062 param.min_size = 100; 1063 param.max_size = 2000; 1064 param.function = p->mul_function; 1065 param.step_factor = 0.0; 1066 param.step = 4; 1067 param.function2 = p->mul_modf_function; 1068 param.noprint = 1; 1069 s.r = k; 1070 one (&thres, ¶m); 1071 if (thres < best_thres) 1072 { 1073 best_thres = thres; 1074 best_k = k; 1075 } 1076 } 1077 1078 *(p->p_modf_threshold) = best_thres; 1079 sprintf (buf, "k = %d", best_k); 1080 print_define_remark (p->modf_threshold_name, best_thres, buf); 1081 initial_k = best_k; 1082 } 1083 #else 1084 size = p->first_size; 1085 for (;;) 1086 { 1087 double tk, tm; 1088 1089 size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr)); 1090 k = mpn_fft_best_k (size, p->sqr); 1091 1092 if (size >= p->max_size) 1093 break; 1094 1095 s.size = size + fft_step_size (k) / 2; 1096 s.r = k; 1097 tk = tuneup_measure (p->mul_modf_function, NULL, &s); 1098 if (tk == -1.0) 1099 abort (); 1100 1101 tm = tuneup_measure (p->mul_function, NULL, &s); 1102 if (tm == -1.0) 1103 abort (); 1104 1105 if (option_trace >= 2) 1106 printf ("at %ld size=%ld k=%d %.9f size=%ld modf %.9f\n", 1107 (long) size, 1108 (long) size + fft_step_size (k) / 2, k, tk, 1109 (long) s.size, tm); 1110 1111 if (tk < tm) 1112 { 1113 *p->p_modf_threshold = s.size; 1114 print_define (p->modf_threshold_name, *p->p_modf_threshold); 1115 break; 1116 } 1117 } 1118 initial_k = ?; 1119 #endif 1120 1121 /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/ 1122 1123 idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1); 1124 printf ("#define %s_SIZE %d\n", p->table_name, idx); 1125 1126 /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/ 1127 1128 size = 2 * *p->p_modf_threshold; /* OK? */ 1129 for (;;) 1130 { 1131 double tk, tm; 1132 mp_size_t mulmod_size, mul_size;; 1133 1134 if (size >= p->max_size) 1135 break; 1136 1137 mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2; 1138 mul_size = (size + mulmod_size) / 2; /* middle of step */ 1139 1140 s.size = mulmod_size; 1141 tk = tuneup_measure (p->function, NULL, &s); 1142 if (tk == -1.0) 1143 abort (); 1144 1145 s.size = mul_size; 1146 tm = tuneup_measure (p->mul_function, NULL, &s); 1147 if (tm == -1.0) 1148 abort (); 1149 1150 if (option_trace >= 2) 1151 printf ("at %ld size=%ld %.9f size=%ld mul %.9f\n", 1152 (long) size, 1153 (long) mulmod_size, tk, 1154 (long) mul_size, tm); 1155 1156 size = mulmod_size; 1157 1158 if (tk < tm) 1159 { 1160 *p->p_threshold = s.size; 1161 print_define (p->threshold_name, *p->p_threshold); 1162 break; 1163 } 1164 } 1165 } 1166 1167 1168 1169 /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2, 1170 giving wrong results. */ 1171 void 1172 tune_mul_n (void) 1173 { 1174 static struct param_t param; 1175 mp_size_t next_toom_start; 1176 int something_changed; 1177 1178 param.function = speed_mpn_mul_n; 1179 1180 param.name = "MUL_TOOM22_THRESHOLD"; 1181 param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE); 1182 param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1; 1183 one (&mul_toom22_threshold, ¶m); 1184 1185 param.noprint = 1; 1186 1187 /* Threshold sequence loop. Disable functions that would be used in a very 1188 narrow range, re-measuring things when that happens. */ 1189 something_changed = 1; 1190 while (something_changed) 1191 { 1192 something_changed = 0; 1193 1194 next_toom_start = mul_toom22_threshold; 1195 1196 if (mul_toom33_threshold != 0) 1197 { 1198 param.name = "MUL_TOOM33_THRESHOLD"; 1199 param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE); 1200 param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1; 1201 one (&mul_toom33_threshold, ¶m); 1202 1203 if (next_toom_start * 1.05 >= mul_toom33_threshold) 1204 { 1205 mul_toom33_threshold = 0; 1206 something_changed = 1; 1207 } 1208 } 1209 1210 next_toom_start = MAX (next_toom_start, mul_toom33_threshold); 1211 1212 if (mul_toom44_threshold != 0) 1213 { 1214 param.name = "MUL_TOOM44_THRESHOLD"; 1215 param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE); 1216 param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1; 1217 one (&mul_toom44_threshold, ¶m); 1218 1219 if (next_toom_start * 1.05 >= mul_toom44_threshold) 1220 { 1221 mul_toom44_threshold = 0; 1222 something_changed = 1; 1223 } 1224 } 1225 1226 next_toom_start = MAX (next_toom_start, mul_toom44_threshold); 1227 1228 if (mul_toom6h_threshold != 0) 1229 { 1230 param.name = "MUL_TOOM6H_THRESHOLD"; 1231 param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE); 1232 param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1; 1233 one (&mul_toom6h_threshold, ¶m); 1234 1235 if (next_toom_start * 1.05 >= mul_toom6h_threshold) 1236 { 1237 mul_toom6h_threshold = 0; 1238 something_changed = 1; 1239 } 1240 } 1241 1242 next_toom_start = MAX (next_toom_start, mul_toom6h_threshold); 1243 1244 if (mul_toom8h_threshold != 0) 1245 { 1246 param.name = "MUL_TOOM8H_THRESHOLD"; 1247 param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE); 1248 param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1; 1249 one (&mul_toom8h_threshold, ¶m); 1250 1251 if (next_toom_start * 1.05 >= mul_toom8h_threshold) 1252 { 1253 mul_toom8h_threshold = 0; 1254 something_changed = 1; 1255 } 1256 } 1257 } 1258 1259 print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD); 1260 print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD); 1261 print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD); 1262 print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD); 1263 1264 /* disabled until tuned */ 1265 MUL_FFT_THRESHOLD = MP_SIZE_T_MAX; 1266 } 1267 1268 void 1269 tune_mul (void) 1270 { 1271 static struct param_t param; 1272 mp_size_t thres; 1273 1274 param.noprint = 1; 1275 1276 param.function = speed_mpn_toom32_for_toom43_mul; 1277 param.function2 = speed_mpn_toom43_for_toom32_mul; 1278 param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD"; 1279 param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17; 1280 one (&thres, ¶m); 1281 mul_toom32_to_toom43_threshold = thres * 17 / 24; 1282 print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold); 1283 1284 param.function = speed_mpn_toom32_for_toom53_mul; 1285 param.function2 = speed_mpn_toom53_for_toom32_mul; 1286 param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD"; 1287 param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19; 1288 one (&thres, ¶m); 1289 mul_toom32_to_toom53_threshold = thres * 19 / 30; 1290 print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold); 1291 1292 param.function = speed_mpn_toom42_for_toom53_mul; 1293 param.function2 = speed_mpn_toom53_for_toom42_mul; 1294 param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD"; 1295 param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11; 1296 one (&thres, ¶m); 1297 mul_toom42_to_toom53_threshold = thres * 11 / 20; 1298 print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold); 1299 1300 param.function = speed_mpn_toom42_mul; 1301 param.function2 = speed_mpn_toom63_mul; 1302 param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD"; 1303 param.min_size = MPN_TOOM63_MUL_MINSIZE * 2; 1304 one (&thres, ¶m); 1305 mul_toom42_to_toom63_threshold = thres / 2; 1306 print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold); 1307 1308 /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */ 1309 param.function = speed_mpn_toom43_for_toom54_mul; 1310 param.function2 = speed_mpn_toom54_for_toom43_mul; 1311 param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD"; 1312 param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5; 1313 one (&thres, ¶m); 1314 mul_toom43_to_toom54_threshold = thres * 5 / 6; 1315 print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold); 1316 } 1317 1318 1319 void 1320 tune_mullo (void) 1321 { 1322 static struct param_t param; 1323 1324 param.function = speed_mpn_mullo_n; 1325 1326 param.name = "MULLO_BASECASE_THRESHOLD"; 1327 param.min_size = 1; 1328 param.min_is_always = 1; 1329 param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1; 1330 param.stop_factor = 1.5; 1331 param.noprint = 1; 1332 one (&mullo_basecase_threshold, ¶m); 1333 1334 param.name = "MULLO_DC_THRESHOLD"; 1335 param.min_size = 8; 1336 param.min_is_always = 0; 1337 param.max_size = 1000; 1338 one (&mullo_dc_threshold, ¶m); 1339 1340 if (mullo_basecase_threshold >= mullo_dc_threshold) 1341 { 1342 print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold); 1343 print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase"); 1344 } 1345 else 1346 { 1347 print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold); 1348 print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold); 1349 } 1350 1351 #if WANT_FFT 1352 param.name = "MULLO_MUL_N_THRESHOLD"; 1353 param.min_size = mullo_dc_threshold; 1354 param.max_size = 2 * mul_fft_threshold; 1355 param.noprint = 0; 1356 param.step_factor = 0.03; 1357 one (&mullo_mul_n_threshold, ¶m); 1358 #else 1359 print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX, 1360 "without FFT use mullo forever"); 1361 #endif 1362 } 1363 1364 void 1365 tune_mulmid (void) 1366 { 1367 static struct param_t param; 1368 1369 param.name = "MULMID_TOOM42_THRESHOLD"; 1370 param.function = speed_mpn_mulmid_n; 1371 param.min_size = 4; 1372 param.max_size = 100; 1373 one (&mulmid_toom42_threshold, ¶m); 1374 } 1375 1376 void 1377 tune_mulmod_bnm1 (void) 1378 { 1379 static struct param_t param; 1380 1381 param.name = "MULMOD_BNM1_THRESHOLD"; 1382 param.function = speed_mpn_mulmod_bnm1; 1383 param.min_size = 4; 1384 param.max_size = 100; 1385 one (&mulmod_bnm1_threshold, ¶m); 1386 } 1387 1388 void 1389 tune_sqrmod_bnm1 (void) 1390 { 1391 static struct param_t param; 1392 1393 param.name = "SQRMOD_BNM1_THRESHOLD"; 1394 param.function = speed_mpn_sqrmod_bnm1; 1395 param.min_size = 4; 1396 param.max_size = 100; 1397 one (&sqrmod_bnm1_threshold, ¶m); 1398 } 1399 1400 1401 /* Start the basecase from 3, since 1 is a special case, and if mul_basecase 1402 is faster only at size==2 then we don't want to bother with extra code 1403 just for that. Start karatsuba from 4 same as MUL above. */ 1404 1405 void 1406 tune_sqr (void) 1407 { 1408 /* disabled until tuned */ 1409 SQR_FFT_THRESHOLD = MP_SIZE_T_MAX; 1410 1411 if (HAVE_NATIVE_mpn_sqr_basecase) 1412 { 1413 print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)"); 1414 sqr_basecase_threshold = 0; 1415 } 1416 else 1417 { 1418 static struct param_t param; 1419 param.name = "SQR_BASECASE_THRESHOLD"; 1420 param.function = speed_mpn_sqr; 1421 param.min_size = 3; 1422 param.min_is_always = 1; 1423 param.max_size = TUNE_SQR_TOOM2_MAX; 1424 param.noprint = 1; 1425 one (&sqr_basecase_threshold, ¶m); 1426 } 1427 1428 { 1429 static struct param_t param; 1430 param.name = "SQR_TOOM2_THRESHOLD"; 1431 param.function = speed_mpn_sqr; 1432 param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE); 1433 param.max_size = TUNE_SQR_TOOM2_MAX; 1434 param.noprint = 1; 1435 one (&sqr_toom2_threshold, ¶m); 1436 1437 if (! HAVE_NATIVE_mpn_sqr_basecase 1438 && sqr_toom2_threshold < sqr_basecase_threshold) 1439 { 1440 /* Karatsuba becomes faster than mul_basecase before 1441 sqr_basecase does. Arrange for the expression 1442 "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which 1443 selects mpn_sqr_basecase in mpn_sqr to be false, by setting 1444 SQR_TOOM2_THRESHOLD to zero, making 1445 SQR_BASECASE_THRESHOLD the toom2 threshold. */ 1446 1447 sqr_basecase_threshold = SQR_TOOM2_THRESHOLD; 1448 SQR_TOOM2_THRESHOLD = 0; 1449 1450 print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold, 1451 "toom2"); 1452 print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD, 1453 "never sqr_basecase"); 1454 } 1455 else 1456 { 1457 if (! HAVE_NATIVE_mpn_sqr_basecase) 1458 print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold); 1459 print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD); 1460 } 1461 } 1462 1463 { 1464 static struct param_t param; 1465 mp_size_t next_toom_start; 1466 int something_changed; 1467 1468 param.function = speed_mpn_sqr; 1469 param.noprint = 1; 1470 1471 /* Threshold sequence loop. Disable functions that would be used in a very 1472 narrow range, re-measuring things when that happens. */ 1473 something_changed = 1; 1474 while (something_changed) 1475 { 1476 something_changed = 0; 1477 1478 next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold); 1479 1480 sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT; 1481 param.name = "SQR_TOOM3_THRESHOLD"; 1482 param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE); 1483 param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1; 1484 one (&sqr_toom3_threshold, ¶m); 1485 1486 next_toom_start = MAX (next_toom_start, sqr_toom3_threshold); 1487 1488 if (sqr_toom4_threshold != 0) 1489 { 1490 param.name = "SQR_TOOM4_THRESHOLD"; 1491 sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT; 1492 param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE); 1493 param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1; 1494 one (&sqr_toom4_threshold, ¶m); 1495 1496 if (next_toom_start * 1.05 >= sqr_toom4_threshold) 1497 { 1498 sqr_toom4_threshold = 0; 1499 something_changed = 1; 1500 } 1501 } 1502 1503 next_toom_start = MAX (next_toom_start, sqr_toom4_threshold); 1504 1505 if (sqr_toom6_threshold != 0) 1506 { 1507 param.name = "SQR_TOOM6_THRESHOLD"; 1508 sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT; 1509 param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE); 1510 param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1; 1511 one (&sqr_toom6_threshold, ¶m); 1512 1513 if (next_toom_start * 1.05 >= sqr_toom6_threshold) 1514 { 1515 sqr_toom6_threshold = 0; 1516 something_changed = 1; 1517 } 1518 } 1519 1520 next_toom_start = MAX (next_toom_start, sqr_toom6_threshold); 1521 1522 if (sqr_toom8_threshold != 0) 1523 { 1524 param.name = "SQR_TOOM8_THRESHOLD"; 1525 sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT; 1526 param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE); 1527 param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1; 1528 one (&sqr_toom8_threshold, ¶m); 1529 1530 if (next_toom_start * 1.05 >= sqr_toom8_threshold) 1531 { 1532 sqr_toom8_threshold = 0; 1533 something_changed = 1; 1534 } 1535 } 1536 } 1537 1538 print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD); 1539 print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD); 1540 print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD); 1541 print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD); 1542 } 1543 } 1544 1545 1546 void 1547 tune_dc_div (void) 1548 { 1549 s.r = 0; /* clear to make speed function do 2n/n */ 1550 { 1551 static struct param_t param; 1552 param.name = "DC_DIV_QR_THRESHOLD"; 1553 param.function = speed_mpn_sbpi1_div_qr; 1554 param.function2 = speed_mpn_dcpi1_div_qr; 1555 param.min_size = 6; 1556 one (&dc_div_qr_threshold, ¶m); 1557 } 1558 { 1559 static struct param_t param; 1560 param.name = "DC_DIVAPPR_Q_THRESHOLD"; 1561 param.function = speed_mpn_sbpi1_divappr_q; 1562 param.function2 = speed_mpn_dcpi1_divappr_q; 1563 param.min_size = 6; 1564 one (&dc_divappr_q_threshold, ¶m); 1565 } 1566 } 1567 1568 static double 1569 speed_mpn_sbordcpi1_div_qr (struct speed_params *s) 1570 { 1571 if (s->size < DC_DIV_QR_THRESHOLD) 1572 return speed_mpn_sbpi1_div_qr (s); 1573 else 1574 return speed_mpn_dcpi1_div_qr (s); 1575 } 1576 1577 void 1578 tune_mu_div (void) 1579 { 1580 s.r = 0; /* clear to make speed function do 2n/n */ 1581 { 1582 static struct param_t param; 1583 param.name = "MU_DIV_QR_THRESHOLD"; 1584 param.function = speed_mpn_dcpi1_div_qr; 1585 param.function2 = speed_mpn_mu_div_qr; 1586 param.min_size = mul_toom22_threshold; 1587 param.max_size = 5000; 1588 param.step_factor = 0.02; 1589 one (&mu_div_qr_threshold, ¶m); 1590 } 1591 { 1592 static struct param_t param; 1593 param.name = "MU_DIVAPPR_Q_THRESHOLD"; 1594 param.function = speed_mpn_dcpi1_divappr_q; 1595 param.function2 = speed_mpn_mu_divappr_q; 1596 param.min_size = mul_toom22_threshold; 1597 param.max_size = 5000; 1598 param.step_factor = 0.02; 1599 one (&mu_divappr_q_threshold, ¶m); 1600 } 1601 { 1602 static struct param_t param; 1603 param.name = "MUPI_DIV_QR_THRESHOLD"; 1604 param.function = speed_mpn_sbordcpi1_div_qr; 1605 param.function2 = speed_mpn_mupi_div_qr; 1606 param.min_size = 6; 1607 param.min_is_always = 1; 1608 param.max_size = 1000; 1609 param.step_factor = 0.02; 1610 one (&mupi_div_qr_threshold, ¶m); 1611 } 1612 } 1613 1614 void 1615 tune_dc_bdiv (void) 1616 { 1617 s.r = 0; /* clear to make speed function do 2n/n*/ 1618 { 1619 static struct param_t param; 1620 param.name = "DC_BDIV_QR_THRESHOLD"; 1621 param.function = speed_mpn_sbpi1_bdiv_qr; 1622 param.function2 = speed_mpn_dcpi1_bdiv_qr; 1623 param.min_size = 4; 1624 one (&dc_bdiv_qr_threshold, ¶m); 1625 } 1626 { 1627 static struct param_t param; 1628 param.name = "DC_BDIV_Q_THRESHOLD"; 1629 param.function = speed_mpn_sbpi1_bdiv_q; 1630 param.function2 = speed_mpn_dcpi1_bdiv_q; 1631 param.min_size = 4; 1632 one (&dc_bdiv_q_threshold, ¶m); 1633 } 1634 } 1635 1636 void 1637 tune_mu_bdiv (void) 1638 { 1639 s.r = 0; /* clear to make speed function do 2n/n*/ 1640 { 1641 static struct param_t param; 1642 param.name = "MU_BDIV_QR_THRESHOLD"; 1643 param.function = speed_mpn_dcpi1_bdiv_qr; 1644 param.function2 = speed_mpn_mu_bdiv_qr; 1645 param.min_size = mul_toom22_threshold; 1646 param.max_size = 5000; 1647 param.step_factor = 0.02; 1648 one (&mu_bdiv_qr_threshold, ¶m); 1649 } 1650 { 1651 static struct param_t param; 1652 param.name = "MU_BDIV_Q_THRESHOLD"; 1653 param.function = speed_mpn_dcpi1_bdiv_q; 1654 param.function2 = speed_mpn_mu_bdiv_q; 1655 param.min_size = mul_toom22_threshold; 1656 param.max_size = 5000; 1657 param.step_factor = 0.02; 1658 one (&mu_bdiv_q_threshold, ¶m); 1659 } 1660 } 1661 1662 void 1663 tune_invertappr (void) 1664 { 1665 static struct param_t param; 1666 1667 param.function = speed_mpn_ni_invertappr; 1668 param.name = "INV_MULMOD_BNM1_THRESHOLD"; 1669 param.min_size = 4; 1670 one (&inv_mulmod_bnm1_threshold, ¶m); 1671 1672 param.function = speed_mpn_invertappr; 1673 param.name = "INV_NEWTON_THRESHOLD"; 1674 param.min_size = 3; 1675 one (&inv_newton_threshold, ¶m); 1676 } 1677 1678 void 1679 tune_invert (void) 1680 { 1681 static struct param_t param; 1682 1683 param.function = speed_mpn_invert; 1684 param.name = "INV_APPR_THRESHOLD"; 1685 param.min_size = 3; 1686 one (&inv_appr_threshold, ¶m); 1687 } 1688 1689 void 1690 tune_binvert (void) 1691 { 1692 static struct param_t param; 1693 1694 param.function = speed_mpn_binvert; 1695 param.name = "BINV_NEWTON_THRESHOLD"; 1696 param.min_size = 8; /* pointless with smaller operands */ 1697 one (&binv_newton_threshold, ¶m); 1698 } 1699 1700 void 1701 tune_redc (void) 1702 { 1703 #define TUNE_REDC_2_MAX 100 1704 #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2 1705 #define WANT_REDC_2 1 1706 #endif 1707 1708 #if WANT_REDC_2 1709 { 1710 static struct param_t param; 1711 param.name = "REDC_1_TO_REDC_2_THRESHOLD"; 1712 param.function = speed_mpn_redc_1; 1713 param.function2 = speed_mpn_redc_2; 1714 param.min_size = 1; 1715 param.min_is_always = 1; 1716 param.max_size = TUNE_REDC_2_MAX; 1717 param.noprint = 1; 1718 param.stop_factor = 1.5; 1719 one (&redc_1_to_redc_2_threshold, ¶m); 1720 } 1721 { 1722 static struct param_t param; 1723 param.name = "REDC_2_TO_REDC_N_THRESHOLD"; 1724 param.function = speed_mpn_redc_2; 1725 param.function2 = speed_mpn_redc_n; 1726 param.min_size = 16; 1727 param.noprint = 1; 1728 one (&redc_2_to_redc_n_threshold, ¶m); 1729 } 1730 if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold) 1731 { 1732 redc_2_to_redc_n_threshold = 0; /* disable redc_2 */ 1733 1734 /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as 1735 REDC_1_TO_REDC_2_THRESHOLD. */ 1736 { 1737 static struct param_t param; 1738 param.name = "REDC_1_TO_REDC_2_THRESHOLD"; 1739 param.function = speed_mpn_redc_1; 1740 param.function2 = speed_mpn_redc_n; 1741 param.min_size = 16; 1742 param.noprint = 1; 1743 one (&redc_1_to_redc_2_threshold, ¶m); 1744 } 1745 } 1746 print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD); 1747 print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD); 1748 #else 1749 { 1750 static struct param_t param; 1751 param.name = "REDC_1_TO_REDC_N_THRESHOLD"; 1752 param.function = speed_mpn_redc_1; 1753 param.function2 = speed_mpn_redc_n; 1754 param.min_size = 16; 1755 one (&redc_1_to_redc_n_threshold, ¶m); 1756 } 1757 #endif 1758 } 1759 1760 void 1761 tune_matrix22_mul (void) 1762 { 1763 static struct param_t param; 1764 param.name = "MATRIX22_STRASSEN_THRESHOLD"; 1765 param.function = speed_mpn_matrix22_mul; 1766 param.min_size = 2; 1767 one (&matrix22_strassen_threshold, ¶m); 1768 } 1769 1770 void 1771 tune_hgcd (void) 1772 { 1773 static struct param_t param; 1774 param.name = "HGCD_THRESHOLD"; 1775 param.function = speed_mpn_hgcd; 1776 /* We seem to get strange results for small sizes */ 1777 param.min_size = 30; 1778 one (&hgcd_threshold, ¶m); 1779 } 1780 1781 void 1782 tune_hgcd_appr (void) 1783 { 1784 static struct param_t param; 1785 param.name = "HGCD_APPR_THRESHOLD"; 1786 param.function = speed_mpn_hgcd_appr; 1787 /* We seem to get strange results for small sizes */ 1788 param.min_size = 50; 1789 param.stop_since_change = 150; 1790 one (&hgcd_appr_threshold, ¶m); 1791 } 1792 1793 void 1794 tune_hgcd_reduce (void) 1795 { 1796 static struct param_t param; 1797 param.name = "HGCD_REDUCE_THRESHOLD"; 1798 param.function = speed_mpn_hgcd_reduce; 1799 param.min_size = 30; 1800 param.max_size = 7000; 1801 param.step_factor = 0.04; 1802 one (&hgcd_reduce_threshold, ¶m); 1803 } 1804 1805 void 1806 tune_gcd_dc (void) 1807 { 1808 static struct param_t param; 1809 param.name = "GCD_DC_THRESHOLD"; 1810 param.function = speed_mpn_gcd; 1811 param.min_size = hgcd_threshold; 1812 param.max_size = 3000; 1813 param.step_factor = 0.02; 1814 one (&gcd_dc_threshold, ¶m); 1815 } 1816 1817 void 1818 tune_gcdext_dc (void) 1819 { 1820 static struct param_t param; 1821 param.name = "GCDEXT_DC_THRESHOLD"; 1822 param.function = speed_mpn_gcdext; 1823 param.min_size = hgcd_threshold; 1824 param.max_size = 3000; 1825 param.step_factor = 0.02; 1826 one (&gcdext_dc_threshold, ¶m); 1827 } 1828 1829 /* In tune_powm_sec we compute the table used by the win_size function. The 1830 cutoff points are in exponent bits, disregarding other operand sizes. It is 1831 not possible to use the one framework since it currently uses a granilarity 1832 of full limbs. 1833 */ 1834 1835 /* This win_size replaces the variant in the powm code, allowing us to 1836 control k in the k-ary algorithms. */ 1837 int winsize; 1838 int 1839 win_size (mp_bitcnt_t eb) 1840 { 1841 return winsize; 1842 } 1843 1844 void 1845 tune_powm_sec (void) 1846 { 1847 mp_size_t n; 1848 int k, i; 1849 mp_size_t itch; 1850 mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff; 1851 const int n_max = 3000 / GMP_NUMB_BITS; 1852 const int n_measurements = 5; 1853 mp_ptr rp, bp, ep, mp, tp; 1854 double ttab[n_measurements], tk, tkp1; 1855 TMP_DECL; 1856 TMP_MARK; 1857 1858 possible_nbits_cutoff = 0; 1859 1860 k = 1; 1861 1862 winsize = 10; /* the itch function needs this */ 1863 itch = mpn_powm_sec_itch (n_max, n_max, n_max); 1864 1865 rp = TMP_ALLOC_LIMBS (n_max); 1866 bp = TMP_ALLOC_LIMBS (n_max); 1867 ep = TMP_ALLOC_LIMBS (n_max); 1868 mp = TMP_ALLOC_LIMBS (n_max); 1869 tp = TMP_ALLOC_LIMBS (itch); 1870 1871 mpn_random (bp, n_max); 1872 mpn_random (mp, n_max); 1873 mp[0] |= 1; 1874 1875 /* How about taking the M operand size into account? 1876 1877 An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming 1878 B = O(M)). 1879 1880 Using k-ary and no sliding window, the precomputation will need time 1881 O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) + 1882 O(log(E)/k*M(N)), for the squarings, multiplications, respectively. 1883 1884 An operation R=powm_sec(B,E,N) will take time like powm. 1885 1886 Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the 1887 main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) + 1888 O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full 1889 table reads, respectively. */ 1890 1891 printf ("#define POWM_SEC_TABLE "); 1892 1893 for (nbits = 1; nbits <= n_max * GMP_NUMB_BITS; ) 1894 { 1895 n = (nbits - 1) / GMP_NUMB_BITS + 1; 1896 1897 /* Generate E such that sliding-window for k and k+1 works equally 1898 well/poorly (but sliding is not used in powm_sec, of course). */ 1899 for (i = 0; i < n; i++) 1900 ep[i] = ~CNST_LIMB(0); 1901 1902 /* Truncate E to be exactly nbits large. */ 1903 if (nbits % GMP_NUMB_BITS != 0) 1904 mpn_rshift (ep, ep, n, GMP_NUMB_BITS - nbits % GMP_NUMB_BITS); 1905 ep[n - 1] |= CNST_LIMB(1) << (nbits - 1) % GMP_NUMB_BITS; 1906 1907 winsize = k; 1908 for (i = 0; i < n_measurements; i++) 1909 { 1910 speed_starttime (); 1911 mpn_powm_sec (rp, bp, n, ep, n, mp, n, tp); 1912 ttab[i] = speed_endtime (); 1913 } 1914 tk = median (ttab, n_measurements); 1915 1916 winsize = k + 1; 1917 speed_starttime (); 1918 for (i = 0; i < n_measurements; i++) 1919 { 1920 speed_starttime (); 1921 mpn_powm_sec (rp, bp, n, ep, n, mp, n, tp); 1922 ttab[i] = speed_endtime (); 1923 } 1924 tkp1 = median (ttab, n_measurements); 1925 /* 1926 printf ("testing: %ld, %d", nbits, k, ep[n-1]); 1927 printf (" %10.5f %10.5f\n", tk, tkp1); 1928 */ 1929 if (tkp1 < tk) 1930 { 1931 if (possible_nbits_cutoff) 1932 { 1933 /* Two consecutive sizes indicate k increase, obey. */ 1934 if (k > 1) 1935 printf (","); 1936 printf ("%ld", (long) possible_nbits_cutoff); 1937 k++; 1938 possible_nbits_cutoff = 0; 1939 } 1940 else 1941 { 1942 /* One measurement indicate k increase, save nbits for further 1943 consideration. */ 1944 possible_nbits_cutoff = nbits; 1945 } 1946 } 1947 else 1948 possible_nbits_cutoff = 0; 1949 1950 nbits_next = nbits * 65 / 64; 1951 nbits = nbits_next + (nbits_next == nbits); 1952 } 1953 printf ("\n"); 1954 TMP_FREE; 1955 } 1956 1957 1958 /* size_extra==1 reflects the fact that with high<divisor one division is 1959 always skipped. Forcing high<divisor while testing ensures consistency 1960 while stepping through sizes, ie. that size-1 divides will be done each 1961 time. 1962 1963 min_size==2 and min_is_always are used so that if plain division is only 1964 better at size==1 then don't bother including that code just for that 1965 case, instead go with preinv always and get a size saving. */ 1966 1967 #define DIV_1_PARAMS \ 1968 param.check_size = 256; \ 1969 param.min_size = 2; \ 1970 param.min_is_always = 1; \ 1971 param.data_high = DATA_HIGH_LT_R; \ 1972 param.size_extra = 1; \ 1973 param.stop_factor = 2.0; 1974 1975 1976 double (*tuned_speed_mpn_divrem_1) (struct speed_params *); 1977 1978 void 1979 tune_divrem_1 (void) 1980 { 1981 /* plain version by default */ 1982 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1; 1983 1984 /* No support for tuning native assembler code, do that by hand and put 1985 the results in the .asm file, there's no need for such thresholds to 1986 appear in gmp-mparam.h. */ 1987 if (HAVE_NATIVE_mpn_divrem_1) 1988 return; 1989 1990 if (GMP_NAIL_BITS != 0) 1991 { 1992 print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX, 1993 "no preinv with nails"); 1994 print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX, 1995 "no preinv with nails"); 1996 return; 1997 } 1998 1999 if (UDIV_PREINV_ALWAYS) 2000 { 2001 print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always"); 2002 print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L); 2003 return; 2004 } 2005 2006 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune; 2007 2008 /* Tune for the integer part of mpn_divrem_1. This will very possibly be 2009 a bit out for the fractional part, but that's too bad, the integer part 2010 is more important. */ 2011 { 2012 static struct param_t param; 2013 param.name = "DIVREM_1_NORM_THRESHOLD"; 2014 DIV_1_PARAMS; 2015 s.r = randlimb_norm (); 2016 param.function = speed_mpn_divrem_1_tune; 2017 one (&divrem_1_norm_threshold, ¶m); 2018 } 2019 { 2020 static struct param_t param; 2021 param.name = "DIVREM_1_UNNORM_THRESHOLD"; 2022 DIV_1_PARAMS; 2023 s.r = randlimb_half (); 2024 param.function = speed_mpn_divrem_1_tune; 2025 one (&divrem_1_unnorm_threshold, ¶m); 2026 } 2027 } 2028 2029 2030 void 2031 tune_mod_1 (void) 2032 { 2033 /* No support for tuning native assembler code, do that by hand and put 2034 the results in the .asm file, there's no need for such thresholds to 2035 appear in gmp-mparam.h. */ 2036 if (HAVE_NATIVE_mpn_mod_1) 2037 return; 2038 2039 if (GMP_NAIL_BITS != 0) 2040 { 2041 print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX, 2042 "no preinv with nails"); 2043 print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX, 2044 "no preinv with nails"); 2045 return; 2046 } 2047 2048 if (!HAVE_NATIVE_mpn_mod_1_1p) 2049 { 2050 static struct param_t param; 2051 double t1, t2; 2052 2053 s.size = 10; 2054 s.r = randlimb_half (); 2055 2056 t1 = tuneup_measure (speed_mpn_mod_1_1_1, ¶m, &s); 2057 t2 = tuneup_measure (speed_mpn_mod_1_1_2, ¶m, &s); 2058 2059 if (t1 == -1.0 || t2 == -1.0) 2060 { 2061 printf ("Oops, can't measure all mpn_mod_1_1 methods at %ld\n", 2062 (long) s.size); 2063 abort (); 2064 } 2065 mod_1_1p_method = (t1 < t2) ? 1 : 2; 2066 print_define ("MOD_1_1P_METHOD", mod_1_1p_method); 2067 } 2068 2069 if (UDIV_PREINV_ALWAYS) 2070 { 2071 print_define ("MOD_1_NORM_THRESHOLD", 0L); 2072 print_define ("MOD_1_UNNORM_THRESHOLD", 0L); 2073 } 2074 else 2075 { 2076 { 2077 static struct param_t param; 2078 param.name = "MOD_1_NORM_THRESHOLD"; 2079 DIV_1_PARAMS; 2080 s.r = randlimb_norm (); 2081 param.function = speed_mpn_mod_1_tune; 2082 one (&mod_1_norm_threshold, ¶m); 2083 } 2084 { 2085 static struct param_t param; 2086 param.name = "MOD_1_UNNORM_THRESHOLD"; 2087 DIV_1_PARAMS; 2088 s.r = randlimb_half (); 2089 param.function = speed_mpn_mod_1_tune; 2090 one (&mod_1_unnorm_threshold, ¶m); 2091 } 2092 } 2093 { 2094 static struct param_t param; 2095 2096 param.check_size = 256; 2097 2098 s.r = randlimb_norm (); 2099 param.function = speed_mpn_mod_1_tune; 2100 2101 param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD"; 2102 param.min_size = 2; 2103 one (&mod_1n_to_mod_1_1_threshold, ¶m); 2104 } 2105 2106 { 2107 static struct param_t param; 2108 2109 param.check_size = 256; 2110 s.r = randlimb_half (); 2111 param.noprint = 1; 2112 2113 param.function = speed_mpn_mod_1_1; 2114 param.function2 = speed_mpn_mod_1_2; 2115 param.min_is_always = 1; 2116 param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD"; 2117 param.min_size = 2; 2118 one (&mod_1_1_to_mod_1_2_threshold, ¶m); 2119 2120 param.function = speed_mpn_mod_1_2; 2121 param.function2 = speed_mpn_mod_1_4; 2122 param.min_is_always = 1; 2123 param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD"; 2124 param.min_size = 1; 2125 one (&mod_1_2_to_mod_1_4_threshold, ¶m); 2126 2127 if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold) 2128 { 2129 /* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */ 2130 mod_1_2_to_mod_1_4_threshold = 0; 2131 2132 param.function = speed_mpn_mod_1_1; 2133 param.function2 = speed_mpn_mod_1_4; 2134 param.min_is_always = 1; 2135 param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake"; 2136 param.min_size = 2; 2137 one (&mod_1_1_to_mod_1_2_threshold, ¶m); 2138 } 2139 2140 param.function = speed_mpn_mod_1_tune; 2141 param.function2 = NULL; 2142 param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD"; 2143 param.min_size = 2; 2144 param.min_is_always = 0; 2145 one (&mod_1u_to_mod_1_1_threshold, ¶m); 2146 2147 if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold) 2148 mod_1_1_to_mod_1_2_threshold = 0; 2149 if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold) 2150 mod_1_2_to_mod_1_4_threshold = 0; 2151 2152 print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL); 2153 print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold, 2154 mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL); 2155 print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold, 2156 mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL); 2157 } 2158 2159 { 2160 static struct param_t param; 2161 2162 param.check_size = 256; 2163 2164 param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD"; 2165 s.r = randlimb_norm (); 2166 param.function = speed_mpn_preinv_mod_1; 2167 param.function2 = speed_mpn_mod_1_tune; 2168 param.min_size = 1; 2169 one (&preinv_mod_1_to_mod_1_threshold, ¶m); 2170 } 2171 } 2172 2173 2174 /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would 2175 imply that udiv_qrnnd_preinv is worth using, but it seems most 2176 straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div 2177 directly. */ 2178 2179 void 2180 tune_preinv_divrem_1 (void) 2181 { 2182 static struct param_t param; 2183 speed_function_t divrem_1; 2184 const char *divrem_1_name; 2185 double t1, t2; 2186 2187 if (GMP_NAIL_BITS != 0) 2188 { 2189 print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails"); 2190 return; 2191 } 2192 2193 /* Any native version of mpn_preinv_divrem_1 is assumed to exist because 2194 it's faster than mpn_divrem_1. */ 2195 if (HAVE_NATIVE_mpn_preinv_divrem_1) 2196 { 2197 print_define_remark ("USE_PREINV_DIVREM_1", 1, "native"); 2198 return; 2199 } 2200 2201 /* If udiv_qrnnd_preinv is the only division method then of course 2202 mpn_preinv_divrem_1 should be used. */ 2203 if (UDIV_PREINV_ALWAYS) 2204 { 2205 print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always"); 2206 return; 2207 } 2208 2209 /* If we've got an assembler version of mpn_divrem_1, then compare against 2210 that, not the mpn_divrem_1_div generic C. */ 2211 if (HAVE_NATIVE_mpn_divrem_1) 2212 { 2213 divrem_1 = speed_mpn_divrem_1; 2214 divrem_1_name = "mpn_divrem_1"; 2215 } 2216 else 2217 { 2218 divrem_1 = speed_mpn_divrem_1_div; 2219 divrem_1_name = "mpn_divrem_1_div"; 2220 } 2221 2222 param.data_high = DATA_HIGH_LT_R; /* allow skip one division */ 2223 s.size = 200; /* generous but not too big */ 2224 /* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case, 2225 since in general that's probably most common, though in fact for a 2226 64-bit limb mp_bases[10].big_base is normalized. */ 2227 s.r = urandom() & (GMP_NUMB_MASK >> 4); 2228 if (s.r == 0) s.r = 123; 2229 2230 t1 = tuneup_measure (speed_mpn_preinv_divrem_1, ¶m, &s); 2231 t2 = tuneup_measure (divrem_1, ¶m, &s); 2232 if (t1 == -1.0 || t2 == -1.0) 2233 { 2234 printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n", 2235 divrem_1_name, (long) s.size); 2236 abort (); 2237 } 2238 if (option_trace >= 1) 2239 printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n", 2240 (long) s.size, t1, divrem_1_name, t2); 2241 2242 print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL); 2243 } 2244 2245 2246 2247 void 2248 tune_divrem_2 (void) 2249 { 2250 static struct param_t param; 2251 2252 /* No support for tuning native assembler code, do that by hand and put 2253 the results in the .asm file, and there's no need for such thresholds 2254 to appear in gmp-mparam.h. */ 2255 if (HAVE_NATIVE_mpn_divrem_2) 2256 return; 2257 2258 if (GMP_NAIL_BITS != 0) 2259 { 2260 print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX, 2261 "no preinv with nails"); 2262 return; 2263 } 2264 2265 if (UDIV_PREINV_ALWAYS) 2266 { 2267 print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always"); 2268 return; 2269 } 2270 2271 /* Tune for the integer part of mpn_divrem_2. This will very possibly be 2272 a bit out for the fractional part, but that's too bad, the integer part 2273 is more important. 2274 2275 min_size must be >=2 since nsize>=2 is required, but is set to 4 to save 2276 code space if plain division is better only at size==2 or size==3. */ 2277 param.name = "DIVREM_2_THRESHOLD"; 2278 param.check_size = 256; 2279 param.min_size = 4; 2280 param.min_is_always = 1; 2281 param.size_extra = 2; /* does qsize==nsize-2 divisions */ 2282 param.stop_factor = 2.0; 2283 2284 s.r = randlimb_norm (); 2285 param.function = speed_mpn_divrem_2; 2286 one (&divrem_2_threshold, ¶m); 2287 } 2288 2289 void 2290 tune_div_qr_2 (void) 2291 { 2292 static struct param_t param; 2293 param.name = "DIV_QR_2_PI2_THRESHOLD"; 2294 param.function = speed_mpn_div_qr_2n; 2295 param.check_size = 500; 2296 param.min_size = 4; 2297 one (&div_qr_2_pi2_threshold, ¶m); 2298 } 2299 2300 /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so 2301 tune for that. Its speed can differ on odd or even divisor, so take an 2302 average threshold for the two. 2303 2304 mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1 2305 might not vary that way, but don't test this since high<divisor isn't 2306 expected to occur often with small divisors. */ 2307 2308 void 2309 tune_divexact_1 (void) 2310 { 2311 static struct param_t param; 2312 mp_size_t thresh[2], average; 2313 int low, i; 2314 2315 /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a 2316 full mpn_divrem_1. */ 2317 if (HAVE_NATIVE_mpn_divexact_1) 2318 { 2319 print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)"); 2320 return; 2321 } 2322 2323 ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL); 2324 2325 param.name = "DIVEXACT_1_THRESHOLD"; 2326 param.data_high = DATA_HIGH_GE_R; 2327 param.check_size = 256; 2328 param.min_size = 2; 2329 param.stop_factor = 1.5; 2330 param.function = tuned_speed_mpn_divrem_1; 2331 param.function2 = speed_mpn_divexact_1; 2332 param.noprint = 1; 2333 2334 print_define_start (param.name); 2335 2336 for (low = 0; low <= 1; low++) 2337 { 2338 s.r = randlimb_half(); 2339 if (low == 0) 2340 s.r |= 1; 2341 else 2342 s.r &= ~CNST_LIMB(7); 2343 2344 one (&thresh[low], ¶m); 2345 if (option_trace) 2346 printf ("low=%d thresh %ld\n", low, (long) thresh[low]); 2347 2348 if (thresh[low] == MP_SIZE_T_MAX) 2349 { 2350 average = MP_SIZE_T_MAX; 2351 goto divexact_1_done; 2352 } 2353 } 2354 2355 if (option_trace) 2356 { 2357 printf ("average of:"); 2358 for (i = 0; i < numberof(thresh); i++) 2359 printf (" %ld", (long) thresh[i]); 2360 printf ("\n"); 2361 } 2362 2363 average = 0; 2364 for (i = 0; i < numberof(thresh); i++) 2365 average += thresh[i]; 2366 average /= numberof(thresh); 2367 2368 /* If divexact turns out to be better as early as 3 limbs, then use it 2369 always, so as to reduce code size and conditional jumps. */ 2370 if (average <= 3) 2371 average = 0; 2372 2373 divexact_1_done: 2374 print_define_end (param.name, average); 2375 } 2376 2377 2378 /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the 2379 same as mpn_mod_1, but this might not be true of an assembler 2380 implementation. The threshold used is an average based on data where a 2381 divide can be skipped and where it can't. 2382 2383 If modexact turns out to be better as early as 3 limbs, then use it 2384 always, so as to reduce code size and conditional jumps. */ 2385 2386 void 2387 tune_modexact_1_odd (void) 2388 { 2389 static struct param_t param; 2390 mp_size_t thresh_lt, thresh_ge, average; 2391 2392 #if 0 2393 /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed 2394 of a full mpn_mod_1. */ 2395 if (HAVE_NATIVE_mpn_modexact_1_odd) 2396 { 2397 print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1"); 2398 return; 2399 } 2400 #endif 2401 2402 param.name = "BMOD_1_TO_MOD_1_THRESHOLD"; 2403 param.check_size = 256; 2404 param.min_size = 2; 2405 param.stop_factor = 1.5; 2406 param.function = speed_mpn_modexact_1c_odd; 2407 param.function2 = speed_mpn_mod_1_tune; 2408 param.noprint = 1; 2409 s.r = randlimb_half () | 1; 2410 2411 print_define_start (param.name); 2412 2413 param.data_high = DATA_HIGH_LT_R; 2414 one (&thresh_lt, ¶m); 2415 if (option_trace) 2416 printf ("lt thresh %ld\n", (long) thresh_lt); 2417 2418 average = thresh_lt; 2419 if (thresh_lt != MP_SIZE_T_MAX) 2420 { 2421 param.data_high = DATA_HIGH_GE_R; 2422 one (&thresh_ge, ¶m); 2423 if (option_trace) 2424 printf ("ge thresh %ld\n", (long) thresh_ge); 2425 2426 if (thresh_ge != MP_SIZE_T_MAX) 2427 { 2428 average = (thresh_ge + thresh_lt) / 2; 2429 if (thresh_ge <= 3) 2430 average = 0; 2431 } 2432 } 2433 2434 print_define_end (param.name, average); 2435 } 2436 2437 2438 void 2439 tune_jacobi_base (void) 2440 { 2441 static struct param_t param; 2442 double t1, t2, t3, t4; 2443 int method; 2444 2445 s.size = GMP_LIMB_BITS * 3 / 4; 2446 2447 t1 = tuneup_measure (speed_mpn_jacobi_base_1, ¶m, &s); 2448 if (option_trace >= 1) 2449 printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1); 2450 2451 t2 = tuneup_measure (speed_mpn_jacobi_base_2, ¶m, &s); 2452 if (option_trace >= 1) 2453 printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2); 2454 2455 t3 = tuneup_measure (speed_mpn_jacobi_base_3, ¶m, &s); 2456 if (option_trace >= 1) 2457 printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3); 2458 2459 t4 = tuneup_measure (speed_mpn_jacobi_base_4, ¶m, &s); 2460 if (option_trace >= 1) 2461 printf ("size=%ld, mpn_jacobi_base_4 %.9f\n", (long) s.size, t4); 2462 2463 if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0 || t4 == -1.0) 2464 { 2465 printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n", 2466 (long) s.size); 2467 abort (); 2468 } 2469 2470 if (t1 < t2 && t1 < t3 && t1 < t4) 2471 method = 1; 2472 else if (t2 < t3 && t2 < t4) 2473 method = 2; 2474 else if (t3 < t4) 2475 method = 3; 2476 else 2477 method = 4; 2478 2479 print_define ("JACOBI_BASE_METHOD", method); 2480 } 2481 2482 2483 void 2484 tune_get_str (void) 2485 { 2486 /* Tune for decimal, it being most common. Some rough testing suggests 2487 other bases are different, but not by very much. */ 2488 s.r = 10; 2489 { 2490 static struct param_t param; 2491 GET_STR_PRECOMPUTE_THRESHOLD = 0; 2492 param.name = "GET_STR_DC_THRESHOLD"; 2493 param.function = speed_mpn_get_str; 2494 param.min_size = 4; 2495 param.max_size = GET_STR_THRESHOLD_LIMIT; 2496 one (&get_str_dc_threshold, ¶m); 2497 } 2498 { 2499 static struct param_t param; 2500 param.name = "GET_STR_PRECOMPUTE_THRESHOLD"; 2501 param.function = speed_mpn_get_str; 2502 param.min_size = GET_STR_DC_THRESHOLD; 2503 param.max_size = GET_STR_THRESHOLD_LIMIT; 2504 one (&get_str_precompute_threshold, ¶m); 2505 } 2506 } 2507 2508 2509 double 2510 speed_mpn_pre_set_str (struct speed_params *s) 2511 { 2512 unsigned char *str; 2513 mp_ptr wp; 2514 mp_size_t wn; 2515 unsigned i; 2516 int base; 2517 double t; 2518 mp_ptr powtab_mem, tp; 2519 powers_t powtab[GMP_LIMB_BITS]; 2520 mp_size_t un; 2521 int chars_per_limb; 2522 TMP_DECL; 2523 2524 SPEED_RESTRICT_COND (s->size >= 1); 2525 2526 base = s->r == 0 ? 10 : s->r; 2527 SPEED_RESTRICT_COND (base >= 2 && base <= 256); 2528 2529 TMP_MARK; 2530 2531 str = TMP_ALLOC (s->size); 2532 for (i = 0; i < s->size; i++) 2533 str[i] = s->xp[i] % base; 2534 2535 LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base); 2536 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); 2537 2538 /* use this during development to check wn is big enough */ 2539 /* 2540 ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn); 2541 */ 2542 2543 speed_operand_src (s, (mp_ptr) str, s->size/BYTES_PER_MP_LIMB); 2544 speed_operand_dst (s, wp, wn); 2545 speed_cache_fill (s); 2546 2547 chars_per_limb = mp_bases[base].chars_per_limb; 2548 un = s->size / chars_per_limb + 1; 2549 powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un)); 2550 mpn_set_str_compute_powtab (powtab, powtab_mem, un, base); 2551 tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un)); 2552 2553 speed_starttime (); 2554 i = s->reps; 2555 do 2556 { 2557 mpn_pre_set_str (wp, str, s->size, powtab, tp); 2558 } 2559 while (--i != 0); 2560 t = speed_endtime (); 2561 2562 TMP_FREE; 2563 return t; 2564 } 2565 2566 void 2567 tune_set_str (void) 2568 { 2569 s.r = 10; /* decimal */ 2570 { 2571 static struct param_t param; 2572 SET_STR_PRECOMPUTE_THRESHOLD = 0; 2573 param.step_factor = 0.01; 2574 param.name = "SET_STR_DC_THRESHOLD"; 2575 param.function = speed_mpn_pre_set_str; 2576 param.min_size = 100; 2577 param.max_size = 50000; 2578 one (&set_str_dc_threshold, ¶m); 2579 } 2580 { 2581 static struct param_t param; 2582 param.step_factor = 0.02; 2583 param.name = "SET_STR_PRECOMPUTE_THRESHOLD"; 2584 param.function = speed_mpn_set_str; 2585 param.min_size = SET_STR_DC_THRESHOLD; 2586 param.max_size = 100000; 2587 one (&set_str_precompute_threshold, ¶m); 2588 } 2589 } 2590 2591 2592 void 2593 tune_fft_mul (void) 2594 { 2595 static struct fft_param_t param; 2596 2597 if (option_fft_max_size == 0) 2598 return; 2599 2600 param.table_name = "MUL_FFT_TABLE3"; 2601 param.threshold_name = "MUL_FFT_THRESHOLD"; 2602 param.p_threshold = &mul_fft_threshold; 2603 param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD"; 2604 param.p_modf_threshold = &mul_fft_modf_threshold; 2605 param.first_size = MUL_TOOM33_THRESHOLD / 2; 2606 param.max_size = option_fft_max_size; 2607 param.function = speed_mpn_fft_mul; 2608 param.mul_modf_function = speed_mpn_mul_fft; 2609 param.mul_function = speed_mpn_mul_n; 2610 param.sqr = 0; 2611 fft (¶m); 2612 } 2613 2614 2615 void 2616 tune_fft_sqr (void) 2617 { 2618 static struct fft_param_t param; 2619 2620 if (option_fft_max_size == 0) 2621 return; 2622 2623 param.table_name = "SQR_FFT_TABLE3"; 2624 param.threshold_name = "SQR_FFT_THRESHOLD"; 2625 param.p_threshold = &sqr_fft_threshold; 2626 param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD"; 2627 param.p_modf_threshold = &sqr_fft_modf_threshold; 2628 param.first_size = SQR_TOOM3_THRESHOLD / 2; 2629 param.max_size = option_fft_max_size; 2630 param.function = speed_mpn_fft_sqr; 2631 param.mul_modf_function = speed_mpn_mul_fft_sqr; 2632 param.mul_function = speed_mpn_sqr; 2633 param.sqr = 1; 2634 fft (¶m); 2635 } 2636 2637 void 2638 tune_fac_ui (void) 2639 { 2640 static struct param_t param; 2641 2642 param.function = speed_mpz_fac_ui_tune; 2643 2644 param.name = "FAC_DSC_THRESHOLD"; 2645 param.min_size = 70; 2646 param.max_size = FAC_DSC_THRESHOLD_LIMIT; 2647 one (&fac_dsc_threshold, ¶m); 2648 2649 param.name = "FAC_ODD_THRESHOLD"; 2650 param.min_size = 22; 2651 param.stop_factor = 1.7; 2652 param.min_is_always = 1; 2653 one (&fac_odd_threshold, ¶m); 2654 } 2655 2656 void 2657 all (void) 2658 { 2659 time_t start_time, end_time; 2660 TMP_DECL; 2661 2662 TMP_MARK; 2663 SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0); 2664 SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0); 2665 2666 mpn_random (s.xp_block, SPEED_BLOCK_SIZE); 2667 mpn_random (s.yp_block, SPEED_BLOCK_SIZE); 2668 2669 fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST); 2670 2671 speed_time_init (); 2672 fprintf (stderr, "Using: %s\n", speed_time_string); 2673 2674 fprintf (stderr, "speed_precision %d", speed_precision); 2675 if (speed_unittime == 1.0) 2676 fprintf (stderr, ", speed_unittime 1 cycle"); 2677 else 2678 fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime); 2679 if (speed_cycletime == 1.0 || speed_cycletime == 0.0) 2680 fprintf (stderr, ", CPU freq unknown\n"); 2681 else 2682 fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime); 2683 2684 fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n", 2685 DEFAULT_MAX_SIZE, (long) option_fft_max_size); 2686 fprintf (stderr, "\n"); 2687 2688 time (&start_time); 2689 { 2690 struct tm *tp; 2691 tp = localtime (&start_time); 2692 printf ("/* Generated by tuneup.c, %d-%02d-%02d, ", 2693 tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday); 2694 2695 #ifdef __GNUC__ 2696 /* gcc sub-minor version doesn't seem to come through as a define */ 2697 printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__); 2698 #define PRINTED_COMPILER 2699 #endif 2700 #if defined (__SUNPRO_C) 2701 printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100); 2702 #define PRINTED_COMPILER 2703 #endif 2704 #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION) 2705 /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */ 2706 printf ("MIPSpro C %d.%d.%d */\n", 2707 _COMPILER_VERSION / 100, 2708 _COMPILER_VERSION / 10 % 10, 2709 _COMPILER_VERSION % 10); 2710 #define PRINTED_COMPILER 2711 #endif 2712 #if defined (__DECC) && defined (__DECC_VER) 2713 printf ("DEC C %d */\n", __DECC_VER); 2714 #define PRINTED_COMPILER 2715 #endif 2716 #if ! defined (PRINTED_COMPILER) 2717 printf ("system compiler */\n"); 2718 #endif 2719 } 2720 printf ("\n"); 2721 2722 tune_divrem_1 (); 2723 tune_mod_1 (); 2724 tune_preinv_divrem_1 (); 2725 #if 0 2726 tune_divrem_2 (); 2727 #endif 2728 tune_div_qr_2 (); 2729 tune_divexact_1 (); 2730 tune_modexact_1_odd (); 2731 printf("\n"); 2732 2733 tune_mul_n (); 2734 printf("\n"); 2735 2736 tune_mul (); 2737 printf("\n"); 2738 2739 tune_sqr (); 2740 printf("\n"); 2741 2742 tune_mulmid (); 2743 printf("\n"); 2744 2745 tune_mulmod_bnm1 (); 2746 tune_sqrmod_bnm1 (); 2747 printf("\n"); 2748 2749 tune_fft_mul (); 2750 printf("\n"); 2751 2752 tune_fft_sqr (); 2753 printf ("\n"); 2754 2755 tune_mullo (); 2756 printf("\n"); 2757 2758 tune_dc_div (); 2759 tune_dc_bdiv (); 2760 2761 printf("\n"); 2762 tune_invertappr (); 2763 tune_invert (); 2764 printf("\n"); 2765 2766 tune_binvert (); 2767 tune_redc (); 2768 printf("\n"); 2769 2770 tune_mu_div (); 2771 tune_mu_bdiv (); 2772 printf("\n"); 2773 2774 tune_powm_sec (); 2775 printf("\n"); 2776 2777 tune_matrix22_mul (); 2778 tune_hgcd (); 2779 tune_hgcd_appr (); 2780 tune_hgcd_reduce(); 2781 tune_gcd_dc (); 2782 tune_gcdext_dc (); 2783 tune_jacobi_base (); 2784 printf("\n"); 2785 2786 tune_get_str (); 2787 tune_set_str (); 2788 printf("\n"); 2789 2790 tune_fac_ui (); 2791 printf("\n"); 2792 2793 time (&end_time); 2794 printf ("/* Tuneup completed successfully, took %ld seconds */\n", 2795 (long) (end_time - start_time)); 2796 2797 TMP_FREE; 2798 } 2799 2800 2801 int 2802 main (int argc, char *argv[]) 2803 { 2804 int opt; 2805 2806 /* Unbuffered so if output is redirected to a file it isn't lost if the 2807 program is killed part way through. */ 2808 setbuf (stdout, NULL); 2809 setbuf (stderr, NULL); 2810 2811 while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF) 2812 { 2813 switch (opt) { 2814 case 'f': 2815 if (optarg[0] == 't') 2816 option_fft_trace = 2; 2817 else 2818 option_fft_max_size = atol (optarg); 2819 break; 2820 case 'o': 2821 speed_option_set (optarg); 2822 break; 2823 case 'p': 2824 speed_precision = atoi (optarg); 2825 break; 2826 case 't': 2827 option_trace++; 2828 break; 2829 case '?': 2830 exit(1); 2831 } 2832 } 2833 2834 all (); 2835 exit (0); 2836 } 2837