1 /* Create tuned thresholds for various algorithms. 2 3 Copyright 1999-2003, 2005, 2006, 2008-2017 Free Software Foundation, Inc. 4 5 This file is part of the GNU MP Library. 6 7 The GNU MP Library is free software; you can redistribute it and/or modify 8 it under the terms of either: 9 10 * the GNU Lesser General Public License as published by the Free 11 Software Foundation; either version 3 of the License, or (at your 12 option) any later version. 13 14 or 15 16 * the GNU General Public License as published by the Free Software 17 Foundation; either version 2 of the License, or (at your option) any 18 later version. 19 20 or both in parallel, as here. 21 22 The GNU MP Library is distributed in the hope that it will be useful, but 23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 for more details. 26 27 You should have received copies of the GNU General Public License and the 28 GNU Lesser General Public License along with the GNU MP Library. If not, 29 see https://www.gnu.org/licenses/. */ 30 31 32 /* Usage: tuneup [-t] [-t] [-p precision] 33 34 -t turns on some diagnostic traces, a second -t turns on more traces. 35 36 Notes: 37 38 The code here isn't a vision of loveliness, mainly because it's subject 39 to ongoing changes according to new things wanting to be tuned, and 40 practical requirements of systems tested. 41 42 Sometimes running the program twice produces slightly different results. 43 This is probably because there's so little separating algorithms near 44 their crossover, and on that basis it should make little or no difference 45 to the final speed of the relevant routines, but nothing has been done to 46 check that carefully. 47 48 Algorithm: 49 50 The thresholds are determined as follows. A crossover may not be a 51 single size but rather a range where it oscillates between method A or 52 method B faster. If the threshold is set making B used where A is faster 53 (or vice versa) that's bad. Badness is the percentage time lost and 54 total badness is the sum of this over all sizes measured. The threshold 55 is set to minimize total badness. 56 57 Suppose, as sizes increase, method B becomes faster than method A. The 58 effect of the rule is that, as you look at increasing sizes, isolated 59 points where B is faster are ignored, but when it's consistently faster, 60 or faster on balance, then the threshold is set there. The same result 61 is obtained thinking in the other direction of A becoming faster at 62 smaller sizes. 63 64 In practice the thresholds tend to be chosen to bring on the next 65 algorithm fairly quickly. 66 67 This rule is attractive because it's got a basis in reason and is fairly 68 easy to implement, but no work has been done to actually compare it in 69 absolute terms to other possibilities. 70 71 Implementation: 72 73 In a normal library build the thresholds are constants. To tune them 74 selected objects are recompiled with the thresholds as global variables 75 instead. #define TUNE_PROGRAM_BUILD does this, with help from code at 76 the end of gmp-impl.h, and rules in tune/Makefile.am. 77 78 MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n. The 79 threshold is set to "size+1" to avoid karatsuba, or to "size" to use one 80 level, but recurse into the basecase. 81 82 MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value. 83 Other routines in turn will make use of both of those. Naturally the 84 dependants must be tuned first. 85 86 In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling, 87 just a threshold based on comparing two routines (mpn_divrem_1 and 88 mpn_divexact_1), and no further use of the value determined. 89 90 Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being 91 just comparisons between certain routines on representative data. 92 93 Shortcuts are applied when native (assembler) versions of routines exist. 94 For instance a native mpn_sqr_basecase is assumed to be always faster 95 than mpn_mul_basecase, with no measuring. 96 97 No attempt is made to tune within assembler routines, for instance 98 DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be 99 written and tuned all by hand. Assembler routines that might have hard 100 limits are recompiled though, to make them accept a bigger range of sizes 101 than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr. 102 103 Limitations: 104 105 The FFTs aren't subject to the same badness rule as the other thresholds, 106 so each k is probably being brought on a touch early. This isn't likely 107 to make a difference, and the simpler probing means fewer tests. 108 109 */ 110 111 #define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */ 112 113 #include "config.h" 114 115 #include <math.h> 116 #include <stdio.h> 117 #include <stdlib.h> 118 #include <time.h> 119 #if HAVE_UNISTD_H 120 #include <unistd.h> 121 #endif 122 123 #include "gmp-impl.h" 124 #include "longlong.h" 125 126 #include "tests.h" 127 #include "speed.h" 128 129 #if !HAVE_DECL_OPTARG 130 extern char *optarg; 131 extern int optind, opterr; 132 #endif 133 134 135 #define DEFAULT_MAX_SIZE 1000 /* limbs */ 136 137 #if WANT_FFT 138 mp_size_t option_fft_max_size = 50000; /* limbs */ 139 #else 140 mp_size_t option_fft_max_size = 0; 141 #endif 142 int option_trace = 0; 143 int option_fft_trace = 0; 144 struct speed_params s; 145 146 struct dat_t { 147 mp_size_t size; 148 double d; 149 } *dat = NULL; 150 int ndat = 0; 151 int allocdat = 0; 152 153 /* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that 154 case use zero here, which for params.max_size means no limit. */ 155 #ifndef TUNE_SQR_TOOM2_MAX 156 #define TUNE_SQR_TOOM2_MAX 0 157 #endif 158 159 mp_size_t mul_toom22_threshold = MP_SIZE_T_MAX; 160 mp_size_t mul_toom33_threshold = MUL_TOOM33_THRESHOLD_LIMIT; 161 mp_size_t mul_toom44_threshold = MUL_TOOM44_THRESHOLD_LIMIT; 162 mp_size_t mul_toom6h_threshold = MUL_TOOM6H_THRESHOLD_LIMIT; 163 mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT; 164 mp_size_t mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX; 165 mp_size_t mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX; 166 mp_size_t mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX; 167 mp_size_t mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX; 168 mp_size_t mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX; 169 mp_size_t mul_fft_threshold = MP_SIZE_T_MAX; 170 mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX; 171 mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX; 172 mp_size_t sqr_toom2_threshold 173 = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX); 174 mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT; 175 mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT; 176 mp_size_t sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT; 177 mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT; 178 mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX; 179 mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX; 180 mp_size_t mullo_basecase_threshold = MP_SIZE_T_MAX; 181 mp_size_t mullo_dc_threshold = MP_SIZE_T_MAX; 182 mp_size_t mullo_mul_n_threshold = MP_SIZE_T_MAX; 183 mp_size_t sqrlo_basecase_threshold = MP_SIZE_T_MAX; 184 mp_size_t sqrlo_dc_threshold = MP_SIZE_T_MAX; 185 mp_size_t sqrlo_sqr_threshold = MP_SIZE_T_MAX; 186 mp_size_t mulmid_toom42_threshold = MP_SIZE_T_MAX; 187 mp_size_t mulmod_bnm1_threshold = MP_SIZE_T_MAX; 188 mp_size_t sqrmod_bnm1_threshold = MP_SIZE_T_MAX; 189 mp_size_t div_qr_2_pi2_threshold = MP_SIZE_T_MAX; 190 mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX; 191 mp_size_t dc_divappr_q_threshold = MP_SIZE_T_MAX; 192 mp_size_t mu_div_qr_threshold = MP_SIZE_T_MAX; 193 mp_size_t mu_divappr_q_threshold = MP_SIZE_T_MAX; 194 mp_size_t mupi_div_qr_threshold = MP_SIZE_T_MAX; 195 mp_size_t mu_div_q_threshold = MP_SIZE_T_MAX; 196 mp_size_t dc_bdiv_qr_threshold = MP_SIZE_T_MAX; 197 mp_size_t dc_bdiv_q_threshold = MP_SIZE_T_MAX; 198 mp_size_t mu_bdiv_qr_threshold = MP_SIZE_T_MAX; 199 mp_size_t mu_bdiv_q_threshold = MP_SIZE_T_MAX; 200 mp_size_t inv_mulmod_bnm1_threshold = MP_SIZE_T_MAX; 201 mp_size_t inv_newton_threshold = MP_SIZE_T_MAX; 202 mp_size_t inv_appr_threshold = MP_SIZE_T_MAX; 203 mp_size_t binv_newton_threshold = MP_SIZE_T_MAX; 204 mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX; 205 mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX; 206 mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX; 207 mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX; 208 mp_size_t hgcd_threshold = MP_SIZE_T_MAX; 209 mp_size_t hgcd_appr_threshold = MP_SIZE_T_MAX; 210 mp_size_t hgcd_reduce_threshold = MP_SIZE_T_MAX; 211 mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX; 212 mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX; 213 int div_qr_1n_pi1_method = 0; 214 mp_size_t div_qr_1_norm_threshold = MP_SIZE_T_MAX; 215 mp_size_t div_qr_1_unnorm_threshold = MP_SIZE_T_MAX; 216 mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX; 217 mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX; 218 mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX; 219 mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX; 220 int mod_1_1p_method = 0; 221 mp_size_t mod_1n_to_mod_1_1_threshold = MP_SIZE_T_MAX; 222 mp_size_t mod_1u_to_mod_1_1_threshold = MP_SIZE_T_MAX; 223 mp_size_t mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX; 224 mp_size_t mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX; 225 mp_size_t preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX; 226 mp_size_t divrem_2_threshold = MP_SIZE_T_MAX; 227 mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX; 228 mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX; 229 mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX; 230 mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX; 231 mp_size_t fac_odd_threshold = 0; 232 mp_size_t fac_dsc_threshold = FAC_DSC_THRESHOLD_LIMIT; 233 234 mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX; 235 mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX; 236 237 struct param_t { 238 const char *name; 239 speed_function_t function; 240 speed_function_t function2; 241 double step_factor; /* how much to step relatively */ 242 int step; /* how much to step absolutely */ 243 double function_fudge; /* multiplier for "function" speeds */ 244 int stop_since_change; 245 double stop_factor; 246 mp_size_t min_size; 247 int min_is_always; 248 mp_size_t max_size; 249 mp_size_t check_size; 250 mp_size_t size_extra; 251 252 #define DATA_HIGH_LT_R 1 253 #define DATA_HIGH_GE_R 2 254 int data_high; 255 256 int noprint; 257 }; 258 259 260 /* These are normally undefined when false, which suits "#if" fine. 261 But give them zero values so they can be used in plain C "if"s. */ 262 #ifndef UDIV_PREINV_ALWAYS 263 #define UDIV_PREINV_ALWAYS 0 264 #endif 265 #ifndef HAVE_NATIVE_mpn_divexact_1 266 #define HAVE_NATIVE_mpn_divexact_1 0 267 #endif 268 #ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1 269 #define HAVE_NATIVE_mpn_div_qr_1n_pi1 0 270 #endif 271 #ifndef HAVE_NATIVE_mpn_divrem_1 272 #define HAVE_NATIVE_mpn_divrem_1 0 273 #endif 274 #ifndef HAVE_NATIVE_mpn_divrem_2 275 #define HAVE_NATIVE_mpn_divrem_2 0 276 #endif 277 #ifndef HAVE_NATIVE_mpn_mod_1 278 #define HAVE_NATIVE_mpn_mod_1 0 279 #endif 280 #ifndef HAVE_NATIVE_mpn_mod_1_1p 281 #define HAVE_NATIVE_mpn_mod_1_1p 0 282 #endif 283 #ifndef HAVE_NATIVE_mpn_modexact_1_odd 284 #define HAVE_NATIVE_mpn_modexact_1_odd 0 285 #endif 286 #ifndef HAVE_NATIVE_mpn_preinv_divrem_1 287 #define HAVE_NATIVE_mpn_preinv_divrem_1 0 288 #endif 289 #ifndef HAVE_NATIVE_mpn_preinv_mod_1 290 #define HAVE_NATIVE_mpn_preinv_mod_1 0 291 #endif 292 #ifndef HAVE_NATIVE_mpn_sqr_basecase 293 #define HAVE_NATIVE_mpn_sqr_basecase 0 294 #endif 295 296 297 #define MAX3(a,b,c) MAX (MAX (a, b), c) 298 299 mp_limb_t 300 randlimb_norm (void) 301 { 302 mp_limb_t n; 303 mpn_random (&n, 1); 304 n |= GMP_NUMB_HIGHBIT; 305 return n; 306 } 307 308 #define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1) 309 310 mp_limb_t 311 randlimb_half (void) 312 { 313 mp_limb_t n; 314 mpn_random (&n, 1); 315 n &= GMP_NUMB_HALFMASK; 316 n += (n==0); 317 return n; 318 } 319 320 321 /* Add an entry to the end of the dat[] array, reallocing to make it bigger 322 if necessary. */ 323 void 324 add_dat (mp_size_t size, double d) 325 { 326 #define ALLOCDAT_STEP 500 327 328 ASSERT_ALWAYS (ndat <= allocdat); 329 330 if (ndat == allocdat) 331 { 332 dat = (struct dat_t *) __gmp_allocate_or_reallocate 333 (dat, allocdat * sizeof(dat[0]), 334 (allocdat+ALLOCDAT_STEP) * sizeof(dat[0])); 335 allocdat += ALLOCDAT_STEP; 336 } 337 338 dat[ndat].size = size; 339 dat[ndat].d = d; 340 ndat++; 341 } 342 343 344 /* Return the threshold size based on the data accumulated. */ 345 mp_size_t 346 analyze_dat (int final) 347 { 348 double x, min_x; 349 int j, min_j; 350 351 /* If the threshold is set at dat[0].size, any positive values are bad. */ 352 x = 0.0; 353 for (j = 0; j < ndat; j++) 354 if (dat[j].d > 0.0) 355 x += dat[j].d; 356 357 if (option_trace >= 2 && final) 358 { 359 printf ("\n"); 360 printf ("x is the sum of the badness from setting thresh at given size\n"); 361 printf (" (minimum x is sought)\n"); 362 printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x); 363 } 364 365 min_x = x; 366 min_j = 0; 367 368 369 /* When stepping to the next dat[j].size, positive values are no longer 370 bad (so subtracted), negative values become bad (so add the absolute 371 value, meaning subtract). */ 372 for (j = 0; j < ndat; x -= dat[j].d, j++) 373 { 374 if (option_trace >= 2 && final) 375 printf ("size=%ld x=%.4f\n", (long) dat[j].size, x); 376 377 if (x < min_x) 378 { 379 min_x = x; 380 min_j = j; 381 } 382 } 383 384 return min_j; 385 } 386 387 388 /* Measuring for recompiled mpn/generic/div_qr_1.c, 389 * mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */ 390 391 mp_limb_t mpn_div_qr_1_tune (mp_ptr, mp_limb_t *, mp_srcptr, mp_size_t, mp_limb_t); 392 393 #if defined (__cplusplus) 394 extern "C" { 395 #endif 396 397 mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t); 398 mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t); 399 void mpz_fac_ui_tune (mpz_ptr, unsigned long); 400 401 #if defined (__cplusplus) 402 } 403 #endif 404 405 double 406 speed_mpn_mod_1_tune (struct speed_params *s) 407 { 408 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune); 409 } 410 double 411 speed_mpn_divrem_1_tune (struct speed_params *s) 412 { 413 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune); 414 } 415 double 416 speed_mpz_fac_ui_tune (struct speed_params *s) 417 { 418 SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune); 419 } 420 double 421 speed_mpn_div_qr_1_tune (struct speed_params *s) 422 { 423 SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1_tune); 424 } 425 426 double 427 tuneup_measure (speed_function_t fun, 428 const struct param_t *param, 429 struct speed_params *s) 430 { 431 static struct param_t dummy; 432 double t; 433 TMP_DECL; 434 435 if (! param) 436 param = &dummy; 437 438 s->size += param->size_extra; 439 440 TMP_MARK; 441 SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0); 442 SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0); 443 444 mpn_random (s->xp, s->size); 445 mpn_random (s->yp, s->size); 446 447 switch (param->data_high) { 448 case DATA_HIGH_LT_R: 449 s->xp[s->size-1] %= s->r; 450 s->yp[s->size-1] %= s->r; 451 break; 452 case DATA_HIGH_GE_R: 453 s->xp[s->size-1] |= s->r; 454 s->yp[s->size-1] |= s->r; 455 break; 456 } 457 458 t = speed_measure (fun, s); 459 460 s->size -= param->size_extra; 461 462 TMP_FREE; 463 return t; 464 } 465 466 467 #define PRINT_WIDTH 31 468 469 void 470 print_define_start (const char *name) 471 { 472 printf ("#define %-*s ", PRINT_WIDTH, name); 473 if (option_trace) 474 printf ("...\n"); 475 } 476 477 void 478 print_define_end_remark (const char *name, mp_size_t value, const char *remark) 479 { 480 if (option_trace) 481 printf ("#define %-*s ", PRINT_WIDTH, name); 482 483 if (value == MP_SIZE_T_MAX) 484 printf ("MP_SIZE_T_MAX"); 485 else 486 printf ("%5ld", (long) value); 487 488 if (remark != NULL) 489 printf (" /* %s */", remark); 490 printf ("\n"); 491 fflush (stdout); 492 } 493 494 void 495 print_define_end (const char *name, mp_size_t value) 496 { 497 const char *remark; 498 if (value == MP_SIZE_T_MAX) 499 remark = "never"; 500 else if (value == 0) 501 remark = "always"; 502 else 503 remark = NULL; 504 print_define_end_remark (name, value, remark); 505 } 506 507 void 508 print_define (const char *name, mp_size_t value) 509 { 510 print_define_start (name); 511 print_define_end (name, value); 512 } 513 514 void 515 print_define_remark (const char *name, mp_size_t value, const char *remark) 516 { 517 print_define_start (name); 518 print_define_end_remark (name, value, remark); 519 } 520 521 void 522 print_define_with_speedup (const char *name, mp_size_t value, 523 mp_size_t runner_up, double speedup) 524 { 525 char buf[100]; 526 #if __STDC_VERSION__ >= 199901L 527 snprintf (buf, sizeof buf, "%.2f%% faster than %ld", 528 100.0 * (speedup - 1), runner_up); 529 #else 530 sprintf (buf, "%.2f%% faster than %ld", 531 100.0 * (speedup - 1), runner_up); 532 #endif 533 print_define_remark (name, value, buf); 534 } 535 536 void 537 one (mp_size_t *threshold, struct param_t *param) 538 { 539 int since_positive, since_thresh_change; 540 int thresh_idx, new_thresh_idx; 541 542 #define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0) 543 544 DEFAULT (param->function_fudge, 1.0); 545 DEFAULT (param->function2, param->function); 546 DEFAULT (param->step_factor, 0.01); /* small steps by default */ 547 DEFAULT (param->step, 1); /* small steps by default */ 548 DEFAULT (param->stop_since_change, 80); 549 DEFAULT (param->stop_factor, 1.2); 550 DEFAULT (param->min_size, 10); 551 DEFAULT (param->max_size, DEFAULT_MAX_SIZE); 552 553 if (param->check_size != 0) 554 { 555 double t1, t2; 556 s.size = param->check_size; 557 558 *threshold = s.size+1; 559 t1 = tuneup_measure (param->function, param, &s); 560 561 *threshold = s.size; 562 t2 = tuneup_measure (param->function2, param, &s); 563 if (t1 == -1.0 || t2 == -1.0) 564 { 565 printf ("Oops, can't run both functions at size %ld\n", 566 (long) s.size); 567 abort (); 568 } 569 t1 *= param->function_fudge; 570 571 /* ask that t2 is at least 4% below t1 */ 572 if (t1 < t2*1.04) 573 { 574 if (option_trace) 575 printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2); 576 *threshold = MP_SIZE_T_MAX; 577 if (! param->noprint) 578 print_define (param->name, *threshold); 579 return; 580 } 581 582 if (option_trace >= 2) 583 printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n", 584 (long) s.size, t1, t2); 585 } 586 587 if (! param->noprint || option_trace) 588 print_define_start (param->name); 589 590 ndat = 0; 591 since_positive = 0; 592 since_thresh_change = 0; 593 thresh_idx = 0; 594 595 if (option_trace >= 2) 596 { 597 printf (" algorithm-A algorithm-B ratio possible\n"); 598 printf (" (seconds) (seconds) diff thresh\n"); 599 } 600 601 for (s.size = param->min_size; 602 s.size < param->max_size; 603 s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step)) 604 { 605 double ti, tiplus1, d; 606 607 /* 608 FIXME: check minimum size requirements are met, possibly by just 609 checking for the -1 returns from the speed functions. 610 */ 611 612 /* using method A at this size */ 613 *threshold = s.size+1; 614 ti = tuneup_measure (param->function, param, &s); 615 if (ti == -1.0) 616 abort (); 617 ti *= param->function_fudge; 618 619 /* using method B at this size */ 620 *threshold = s.size; 621 tiplus1 = tuneup_measure (param->function2, param, &s); 622 if (tiplus1 == -1.0) 623 abort (); 624 625 /* Calculate the fraction by which the one or the other routine is 626 slower. */ 627 if (tiplus1 >= ti) 628 d = (tiplus1 - ti) / tiplus1; /* negative */ 629 else 630 d = (tiplus1 - ti) / ti; /* positive */ 631 632 add_dat (s.size, d); 633 634 new_thresh_idx = analyze_dat (0); 635 636 if (option_trace >= 2) 637 printf ("size=%ld %.9f %.9f % .4f %c %ld\n", 638 (long) s.size, ti, tiplus1, d, 639 ti > tiplus1 ? '#' : ' ', 640 (long) dat[new_thresh_idx].size); 641 642 /* Stop if the last time method i was faster was more than a 643 certain number of measurements ago. */ 644 #define STOP_SINCE_POSITIVE 200 645 if (d >= 0) 646 since_positive = 0; 647 else 648 if (++since_positive > STOP_SINCE_POSITIVE) 649 { 650 if (option_trace >= 1) 651 printf ("stopped due to since_positive (%d)\n", 652 STOP_SINCE_POSITIVE); 653 break; 654 } 655 656 /* Stop if method A has become slower by a certain factor. */ 657 if (ti >= tiplus1 * param->stop_factor) 658 { 659 if (option_trace >= 1) 660 printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n", 661 param->stop_factor); 662 break; 663 } 664 665 /* Stop if the threshold implied hasn't changed in a certain 666 number of measurements. (It's this condition that usually 667 stops the loop.) */ 668 if (thresh_idx != new_thresh_idx) 669 since_thresh_change = 0, thresh_idx = new_thresh_idx; 670 else 671 if (++since_thresh_change > param->stop_since_change) 672 { 673 if (option_trace >= 1) 674 printf ("stopped due to since_thresh_change (%d)\n", 675 param->stop_since_change); 676 break; 677 } 678 679 /* Stop if the threshold implied is more than a certain number of 680 measurements ago. */ 681 #define STOP_SINCE_AFTER 500 682 if (ndat - thresh_idx > STOP_SINCE_AFTER) 683 { 684 if (option_trace >= 1) 685 printf ("stopped due to ndat - thresh_idx > amount (%d)\n", 686 STOP_SINCE_AFTER); 687 break; 688 } 689 690 /* Stop when the size limit is reached before the end of the 691 crossover, but only show this as an error for >= the default max 692 size. FIXME: Maybe should make it a param choice whether this is 693 an error. */ 694 if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE) 695 { 696 fprintf (stderr, "%s\n", param->name); 697 fprintf (stderr, "sizes %ld to %ld total %d measurements\n", 698 (long) dat[0].size, (long) dat[ndat-1].size, ndat); 699 fprintf (stderr, " max size reached before end of crossover\n"); 700 break; 701 } 702 } 703 704 if (option_trace >= 1) 705 printf ("sizes %ld to %ld total %d measurements\n", 706 (long) dat[0].size, (long) dat[ndat-1].size, ndat); 707 708 *threshold = dat[analyze_dat (1)].size; 709 710 if (param->min_is_always) 711 { 712 if (*threshold == param->min_size) 713 *threshold = 0; 714 } 715 716 if (! param->noprint || option_trace) 717 print_define_end (param->name, *threshold); 718 } 719 720 /* Time N different FUNCTIONS with the same parameters and size, to 721 select the fastest. Since *_METHOD defines start numbering from 722 one, if functions[i] is fastest, the value of the define is i+1. 723 Also output a comment with speedup compared to the next fastest 724 function. The NAME argument is used only for trace output. 725 726 Returns the index of the fastest function. 727 */ 728 int 729 one_method (int n, speed_function_t *functions, 730 const char *name, const char *define, 731 const struct param_t *param) 732 { 733 double *t; 734 int i; 735 int method; 736 int method_runner_up; 737 738 TMP_DECL; 739 TMP_MARK; 740 t = (double*) TMP_ALLOC (n * sizeof (*t)); 741 742 for (i = 0; i < n; i++) 743 { 744 t[i] = tuneup_measure (functions[i], param, &s); 745 if (option_trace >= 1) 746 printf ("size=%ld, %s, method %d %.9f\n", 747 (long) s.size, name, i + 1, t[i]); 748 if (t[i] == -1.0) 749 { 750 printf ("Oops, can't measure all %s methods\n", name); 751 abort (); 752 } 753 } 754 method = 0; 755 for (i = 1; i < n; i++) 756 if (t[i] < t[method]) 757 method = i; 758 759 method_runner_up = (method == 0); 760 for (i = 0; i < n; i++) 761 if (i != method && t[i] < t[method_runner_up]) 762 method_runner_up = i; 763 764 print_define_with_speedup (define, method + 1, method_runner_up + 1, 765 t[method_runner_up] / t[method]); 766 767 TMP_FREE; 768 return method; 769 } 770 771 772 /* Special probing for the fft thresholds. The size restrictions on the 773 FFTs mean the graph of time vs size has a step effect. See this for 774 example using 775 776 ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9 777 gnuplot foo.gnuplot 778 779 The current approach is to compare routines at the midpoint of relevant 780 steps. Arguably a more sophisticated system of threshold data is wanted 781 if this step effect remains. */ 782 783 struct fft_param_t { 784 const char *table_name; 785 const char *threshold_name; 786 const char *modf_threshold_name; 787 mp_size_t *p_threshold; 788 mp_size_t *p_modf_threshold; 789 mp_size_t first_size; 790 mp_size_t max_size; 791 speed_function_t function; 792 speed_function_t mul_modf_function; 793 speed_function_t mul_function; 794 mp_size_t sqr; 795 }; 796 797 798 /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with 799 N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple 800 of 2^(k-1) bits. */ 801 802 mp_size_t 803 fft_step_size (int k) 804 { 805 mp_size_t step; 806 807 step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS; 808 step *= (mp_size_t) 1 << k; 809 810 if (step <= 0) 811 { 812 printf ("Can't handle k=%d\n", k); 813 abort (); 814 } 815 816 return step; 817 } 818 819 mp_size_t 820 fft_next_size (mp_size_t pl, int k) 821 { 822 mp_size_t m = fft_step_size (k); 823 824 /* printf ("[k=%d %ld] %ld ->", k, m, pl); */ 825 826 if (pl == 0 || (pl & (m-1)) != 0) 827 pl = (pl | (m-1)) + 1; 828 829 /* printf (" %ld\n", pl); */ 830 return pl; 831 } 832 833 #define NMAX_DEFAULT 1000000 834 #define MAX_REPS 25 835 #define MIN_REPS 5 836 837 static inline size_t 838 mpn_mul_fft_lcm (size_t a, unsigned int k) 839 { 840 unsigned int l = k; 841 842 while (a % 2 == 0 && k > 0) 843 { 844 a >>= 1; 845 k--; 846 } 847 return a << l; 848 } 849 850 mp_size_t 851 fftfill (mp_size_t pl, int k, int sqr) 852 { 853 mp_size_t maxLK; 854 mp_bitcnt_t N, Nprime, nprime, M; 855 856 N = pl * GMP_NUMB_BITS; 857 M = N >> k; 858 859 maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k); 860 861 Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK; 862 nprime = Nprime / GMP_NUMB_BITS; 863 if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) 864 { 865 size_t K2; 866 for (;;) 867 { 868 K2 = 1L << mpn_fft_best_k (nprime, sqr); 869 if ((nprime & (K2 - 1)) == 0) 870 break; 871 nprime = (nprime + K2 - 1) & -K2; 872 Nprime = nprime * GMP_LIMB_BITS; 873 } 874 } 875 ASSERT_ALWAYS (nprime < pl); 876 877 return Nprime; 878 } 879 880 static int 881 compare_double (const void *ap, const void *bp) 882 { 883 double a = * (const double *) ap; 884 double b = * (const double *) bp; 885 886 if (a < b) 887 return -1; 888 else if (a > b) 889 return 1; 890 else 891 return 0; 892 } 893 894 double 895 median (double *times, int n) 896 { 897 qsort (times, n, sizeof (double), compare_double); 898 return times[n/2]; 899 } 900 901 #define FFT_CACHE_SIZE 25 902 typedef struct fft_cache 903 { 904 mp_size_t n; 905 double time; 906 } fft_cache_t; 907 908 fft_cache_t fft_cache[FFT_CACHE_SIZE]; 909 910 double 911 cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k, 912 int n_measurements) 913 { 914 int i; 915 double t, ttab[MAX_REPS]; 916 917 if (fft_cache[k].n == n) 918 return fft_cache[k].time; 919 920 for (i = 0; i < n_measurements; i++) 921 { 922 speed_starttime (); 923 mpn_mul_fft (rp, n, ap, n, bp, n, k); 924 ttab[i] = speed_endtime (); 925 } 926 927 t = median (ttab, n_measurements); 928 fft_cache[k].n = n; 929 fft_cache[k].time = t; 930 return t; 931 } 932 933 #define INSERT_FFTTAB(idx, nval, kval) \ 934 do { \ 935 fft_tab[idx].n = nval; \ 936 fft_tab[idx].k = kval; \ 937 fft_tab[idx+1].n = (1 << 27) - 1; /* sentinel, 27b wide field */ \ 938 fft_tab[idx+1].k = (1 << 5) - 1; \ 939 } while (0) 940 941 int 942 fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print) 943 { 944 mp_size_t n, n1, prev_n1; 945 int k, best_k, last_best_k, kmax; 946 int eff, prev_eff; 947 double t0, t1; 948 int n_measurements; 949 mp_limb_t *ap, *bp, *rp; 950 mp_size_t alloc; 951 struct fft_table_nk *fft_tab; 952 953 fft_tab = mpn_fft_table3[p->sqr]; 954 955 for (k = 0; k < FFT_CACHE_SIZE; k++) 956 fft_cache[k].n = 0; 957 958 if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) 959 { 960 nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD); 961 } 962 963 if (print) 964 printf ("#define %s%*s", p->table_name, 38, ""); 965 966 if (idx == 0) 967 { 968 INSERT_FFTTAB (0, nmin, initial_k); 969 970 if (print) 971 { 972 printf ("\\\n { "); 973 printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k); 974 } 975 976 idx = 1; 977 } 978 979 ap = (mp_ptr) malloc (sizeof (mp_limb_t)); 980 if (p->sqr) 981 bp = ap; 982 else 983 bp = (mp_ptr) malloc (sizeof (mp_limb_t)); 984 rp = (mp_ptr) malloc (sizeof (mp_limb_t)); 985 alloc = 1; 986 987 /* Round n to comply to initial k value */ 988 n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k); 989 990 n_measurements = (18 - initial_k) | 1; 991 n_measurements = MAX (n_measurements, MIN_REPS); 992 n_measurements = MIN (n_measurements, MAX_REPS); 993 994 last_best_k = initial_k; 995 best_k = initial_k; 996 997 while (n < nmax) 998 { 999 int start_k, end_k; 1000 1001 /* Assume the current best k is best until we hit its next FFT step. */ 1002 t0 = 99999; 1003 1004 prev_n1 = n + 1; 1005 1006 start_k = MAX (4, best_k - 4); 1007 end_k = MIN (24, best_k + 4); 1008 for (k = start_k; k <= end_k; k++) 1009 { 1010 n1 = mpn_fft_next_size (prev_n1, k); 1011 1012 eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr); 1013 1014 if (eff < 70) /* avoid measuring too slow fft:s */ 1015 continue; 1016 1017 if (n1 > alloc) 1018 { 1019 alloc = n1; 1020 if (p->sqr) 1021 { 1022 ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t)); 1023 rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t)); 1024 ap = bp = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t)); 1025 mpn_random (ap, alloc); 1026 rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t)); 1027 } 1028 else 1029 { 1030 ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t)); 1031 bp = (mp_ptr) realloc (bp, sizeof (mp_limb_t)); 1032 rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t)); 1033 ap = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t)); 1034 mpn_random (ap, alloc); 1035 bp = (mp_ptr) realloc (bp, alloc * sizeof (mp_limb_t)); 1036 mpn_random (bp, alloc); 1037 rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t)); 1038 } 1039 } 1040 1041 t1 = cached_measure (rp, ap, bp, n1, k, n_measurements); 1042 1043 if (t1 * n_measurements > 0.3) 1044 n_measurements -= 2; 1045 n_measurements = MAX (n_measurements, MIN_REPS); 1046 1047 if (t1 < t0) 1048 { 1049 best_k = k; 1050 t0 = t1; 1051 } 1052 } 1053 1054 n1 = mpn_fft_next_size (prev_n1, best_k); 1055 1056 if (last_best_k != best_k) 1057 { 1058 ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1); 1059 1060 if (idx >= FFT_TABLE3_SIZE) 1061 { 1062 printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n"); 1063 abort (); 1064 } 1065 INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k); 1066 1067 if (print) 1068 { 1069 printf (", "); 1070 if (idx % 4 == 0) 1071 printf ("\\\n "); 1072 printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k); 1073 } 1074 1075 if (option_trace >= 2) 1076 { 1077 printf ("{%lu,%u}\n", prev_n1, best_k); 1078 fflush (stdout); 1079 } 1080 1081 last_best_k = best_k; 1082 idx++; 1083 } 1084 1085 for (;;) 1086 { 1087 prev_n1 = n1; 1088 prev_eff = fftfill (prev_n1, best_k, p->sqr); 1089 n1 = mpn_fft_next_size (prev_n1 + 1, best_k); 1090 eff = fftfill (n1, best_k, p->sqr); 1091 1092 if (eff != prev_eff) 1093 break; 1094 } 1095 1096 n = prev_n1; 1097 } 1098 1099 kmax = sizeof (mp_size_t) * 4; /* GMP_MP_SIZE_T_BITS / 2 */ 1100 kmax = MIN (kmax, 25-1); 1101 for (k = last_best_k + 1; k <= kmax; k++) 1102 { 1103 if (idx >= FFT_TABLE3_SIZE) 1104 { 1105 printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n"); 1106 abort (); 1107 } 1108 INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k); 1109 1110 if (print) 1111 { 1112 printf (", "); 1113 if (idx % 4 == 0) 1114 printf ("\\\n "); 1115 printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k); 1116 } 1117 1118 idx++; 1119 } 1120 1121 if (print) 1122 printf (" }\n"); 1123 1124 free (ap); 1125 if (! p->sqr) 1126 free (bp); 1127 free (rp); 1128 1129 return idx; 1130 } 1131 1132 void 1133 fft (struct fft_param_t *p) 1134 { 1135 mp_size_t size; 1136 int k, idx, initial_k; 1137 1138 /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/ 1139 1140 #if 1 1141 { 1142 /* Use plain one() mechanism, for some reasonable initial values of k. The 1143 advantage is that we don't depend on mpn_fft_table3, which can therefore 1144 leave it completely uninitialized. */ 1145 1146 static struct param_t param; 1147 mp_size_t thres, best_thres; 1148 int best_k; 1149 char buf[20]; 1150 1151 best_thres = MP_SIZE_T_MAX; 1152 best_k = -1; 1153 1154 for (k = 5; k <= 7; k++) 1155 { 1156 param.name = p->modf_threshold_name; 1157 param.min_size = 100; 1158 param.max_size = 2000; 1159 param.function = p->mul_function; 1160 param.step_factor = 0.0; 1161 param.step = 4; 1162 param.function2 = p->mul_modf_function; 1163 param.noprint = 1; 1164 s.r = k; 1165 one (&thres, ¶m); 1166 if (thres < best_thres) 1167 { 1168 best_thres = thres; 1169 best_k = k; 1170 } 1171 } 1172 1173 *(p->p_modf_threshold) = best_thres; 1174 sprintf (buf, "k = %d", best_k); 1175 print_define_remark (p->modf_threshold_name, best_thres, buf); 1176 initial_k = best_k; 1177 } 1178 #else 1179 size = p->first_size; 1180 for (;;) 1181 { 1182 double tk, tm; 1183 1184 size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr)); 1185 k = mpn_fft_best_k (size, p->sqr); 1186 1187 if (size >= p->max_size) 1188 break; 1189 1190 s.size = size + fft_step_size (k) / 2; 1191 s.r = k; 1192 tk = tuneup_measure (p->mul_modf_function, NULL, &s); 1193 if (tk == -1.0) 1194 abort (); 1195 1196 tm = tuneup_measure (p->mul_function, NULL, &s); 1197 if (tm == -1.0) 1198 abort (); 1199 1200 if (option_trace >= 2) 1201 printf ("at %ld size=%ld k=%d %.9f size=%ld modf %.9f\n", 1202 (long) size, 1203 (long) size + fft_step_size (k) / 2, k, tk, 1204 (long) s.size, tm); 1205 1206 if (tk < tm) 1207 { 1208 *p->p_modf_threshold = s.size; 1209 print_define (p->modf_threshold_name, *p->p_modf_threshold); 1210 break; 1211 } 1212 } 1213 initial_k = ?; 1214 #endif 1215 1216 /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/ 1217 1218 idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1); 1219 printf ("#define %s_SIZE %d\n", p->table_name, idx); 1220 1221 /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/ 1222 1223 size = 2 * *p->p_modf_threshold; /* OK? */ 1224 for (;;) 1225 { 1226 double tk, tm; 1227 mp_size_t mulmod_size, mul_size;; 1228 1229 if (size >= p->max_size) 1230 break; 1231 1232 mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2; 1233 mul_size = (size + mulmod_size) / 2; /* middle of step */ 1234 1235 s.size = mulmod_size; 1236 tk = tuneup_measure (p->function, NULL, &s); 1237 if (tk == -1.0) 1238 abort (); 1239 1240 s.size = mul_size; 1241 tm = tuneup_measure (p->mul_function, NULL, &s); 1242 if (tm == -1.0) 1243 abort (); 1244 1245 if (option_trace >= 2) 1246 printf ("at %ld size=%ld %.9f size=%ld mul %.9f\n", 1247 (long) size, 1248 (long) mulmod_size, tk, 1249 (long) mul_size, tm); 1250 1251 size = mulmod_size; 1252 1253 if (tk < tm) 1254 { 1255 *p->p_threshold = s.size; 1256 print_define (p->threshold_name, *p->p_threshold); 1257 break; 1258 } 1259 } 1260 } 1261 1262 /* Compare mpn_mul_1 to whatever fast exact single-limb division we have. This 1263 is currently mpn_divexact_1, but will become mpn_bdiv_1_qr_pi2 or somesuch. 1264 This is used in get_str and set_str. */ 1265 void 1266 relspeed_div_1_vs_mul_1 (void) 1267 { 1268 #define max_opsize 100 1269 mp_size_t n; 1270 long j; 1271 mp_limb_t rp[max_opsize]; 1272 mp_limb_t ap[max_opsize]; 1273 double multime, divtime; 1274 1275 mpn_random (ap, max_opsize); 1276 1277 multime = 0; 1278 for (n = max_opsize; n > 1; n--) 1279 { 1280 mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10); 1281 speed_starttime (); 1282 for (j = speed_precision; j != 0 ; j--) 1283 mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10); 1284 multime += speed_endtime () / n; 1285 } 1286 1287 divtime = 0; 1288 for (n = max_opsize; n > 1; n--) 1289 { 1290 /* Make input divisible for good measure. */ 1291 ap[n - 1] = mpn_mul_1 (ap, ap, n - 1, MP_BASES_BIG_BASE_10); 1292 1293 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1 1294 mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10, 1295 MP_BASES_BIG_BASE_BINVERTED_10, 1296 MP_BASES_BIG_BASE_CTZ_10); 1297 #else 1298 mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10); 1299 #endif 1300 speed_starttime (); 1301 for (j = speed_precision; j != 0 ; j--) 1302 { 1303 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1 1304 mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10, 1305 MP_BASES_BIG_BASE_BINVERTED_10, 1306 MP_BASES_BIG_BASE_CTZ_10); 1307 #else 1308 mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10); 1309 #endif 1310 } 1311 divtime += speed_endtime () / n; 1312 } 1313 1314 print_define ("DIV_1_VS_MUL_1_PERCENT", (int) (100 * divtime/multime)); 1315 } 1316 1317 1318 /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2, 1319 giving wrong results. */ 1320 void 1321 tune_mul_n (void) 1322 { 1323 static struct param_t param; 1324 mp_size_t next_toom_start; 1325 int something_changed; 1326 1327 param.function = speed_mpn_mul_n; 1328 1329 param.name = "MUL_TOOM22_THRESHOLD"; 1330 param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE); 1331 param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1; 1332 one (&mul_toom22_threshold, ¶m); 1333 1334 param.noprint = 1; 1335 1336 /* Threshold sequence loop. Disable functions that would be used in a very 1337 narrow range, re-measuring things when that happens. */ 1338 something_changed = 1; 1339 while (something_changed) 1340 { 1341 something_changed = 0; 1342 1343 next_toom_start = mul_toom22_threshold; 1344 1345 if (mul_toom33_threshold != 0) 1346 { 1347 param.name = "MUL_TOOM33_THRESHOLD"; 1348 param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE); 1349 param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1; 1350 one (&mul_toom33_threshold, ¶m); 1351 1352 if (next_toom_start * 1.05 >= mul_toom33_threshold) 1353 { 1354 mul_toom33_threshold = 0; 1355 something_changed = 1; 1356 } 1357 } 1358 1359 next_toom_start = MAX (next_toom_start, mul_toom33_threshold); 1360 1361 if (mul_toom44_threshold != 0) 1362 { 1363 param.name = "MUL_TOOM44_THRESHOLD"; 1364 param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE); 1365 param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1; 1366 one (&mul_toom44_threshold, ¶m); 1367 1368 if (next_toom_start * 1.05 >= mul_toom44_threshold) 1369 { 1370 mul_toom44_threshold = 0; 1371 something_changed = 1; 1372 } 1373 } 1374 1375 next_toom_start = MAX (next_toom_start, mul_toom44_threshold); 1376 1377 if (mul_toom6h_threshold != 0) 1378 { 1379 param.name = "MUL_TOOM6H_THRESHOLD"; 1380 param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE); 1381 param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1; 1382 one (&mul_toom6h_threshold, ¶m); 1383 1384 if (next_toom_start * 1.05 >= mul_toom6h_threshold) 1385 { 1386 mul_toom6h_threshold = 0; 1387 something_changed = 1; 1388 } 1389 } 1390 1391 next_toom_start = MAX (next_toom_start, mul_toom6h_threshold); 1392 1393 if (mul_toom8h_threshold != 0) 1394 { 1395 param.name = "MUL_TOOM8H_THRESHOLD"; 1396 param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE); 1397 param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1; 1398 one (&mul_toom8h_threshold, ¶m); 1399 1400 if (next_toom_start * 1.05 >= mul_toom8h_threshold) 1401 { 1402 mul_toom8h_threshold = 0; 1403 something_changed = 1; 1404 } 1405 } 1406 } 1407 1408 print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD); 1409 print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD); 1410 print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD); 1411 print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD); 1412 1413 /* disabled until tuned */ 1414 MUL_FFT_THRESHOLD = MP_SIZE_T_MAX; 1415 } 1416 1417 void 1418 tune_mul (void) 1419 { 1420 static struct param_t param; 1421 mp_size_t thres; 1422 1423 param.noprint = 1; 1424 1425 param.function = speed_mpn_toom32_for_toom43_mul; 1426 param.function2 = speed_mpn_toom43_for_toom32_mul; 1427 param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD"; 1428 param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17; 1429 one (&thres, ¶m); 1430 mul_toom32_to_toom43_threshold = thres * 17 / 24; 1431 print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold); 1432 1433 param.function = speed_mpn_toom32_for_toom53_mul; 1434 param.function2 = speed_mpn_toom53_for_toom32_mul; 1435 param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD"; 1436 param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19; 1437 one (&thres, ¶m); 1438 mul_toom32_to_toom53_threshold = thres * 19 / 30; 1439 print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold); 1440 1441 param.function = speed_mpn_toom42_for_toom53_mul; 1442 param.function2 = speed_mpn_toom53_for_toom42_mul; 1443 param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD"; 1444 param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11; 1445 one (&thres, ¶m); 1446 mul_toom42_to_toom53_threshold = thres * 11 / 20; 1447 print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold); 1448 1449 param.function = speed_mpn_toom42_mul; 1450 param.function2 = speed_mpn_toom63_mul; 1451 param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD"; 1452 param.min_size = MPN_TOOM63_MUL_MINSIZE * 2; 1453 one (&thres, ¶m); 1454 mul_toom42_to_toom63_threshold = thres / 2; 1455 print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold); 1456 1457 /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */ 1458 param.function = speed_mpn_toom43_for_toom54_mul; 1459 param.function2 = speed_mpn_toom54_for_toom43_mul; 1460 param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD"; 1461 param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5; 1462 one (&thres, ¶m); 1463 mul_toom43_to_toom54_threshold = thres * 5 / 6; 1464 print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold); 1465 } 1466 1467 1468 void 1469 tune_mullo (void) 1470 { 1471 static struct param_t param; 1472 1473 param.function = speed_mpn_mullo_n; 1474 1475 param.name = "MULLO_BASECASE_THRESHOLD"; 1476 param.min_size = 2; 1477 param.min_is_always = 1; 1478 param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1; 1479 param.stop_factor = 1.5; 1480 param.noprint = 1; 1481 one (&mullo_basecase_threshold, ¶m); 1482 1483 param.name = "MULLO_DC_THRESHOLD"; 1484 param.min_size = 8; 1485 param.min_is_always = 0; 1486 param.max_size = 1000; 1487 one (&mullo_dc_threshold, ¶m); 1488 1489 if (mullo_basecase_threshold >= mullo_dc_threshold) 1490 { 1491 print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold); 1492 print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase"); 1493 } 1494 else 1495 { 1496 print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold); 1497 print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold); 1498 } 1499 1500 if (WANT_FFT && mul_fft_threshold < MP_SIZE_T_MAX / 2) 1501 { 1502 param.name = "MULLO_MUL_N_THRESHOLD"; 1503 param.min_size = mullo_dc_threshold; 1504 param.max_size = 2 * mul_fft_threshold; 1505 param.noprint = 0; 1506 param.step_factor = 0.03; 1507 one (&mullo_mul_n_threshold, ¶m); 1508 } 1509 else 1510 print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX, 1511 "without FFT use mullo forever"); 1512 } 1513 1514 void 1515 tune_sqrlo (void) 1516 { 1517 static struct param_t param; 1518 1519 param.function = speed_mpn_sqrlo; 1520 1521 param.name = "SQRLO_BASECASE_THRESHOLD"; 1522 param.min_size = 2; 1523 param.min_is_always = 1; 1524 param.max_size = SQRLO_BASECASE_THRESHOLD_LIMIT-1; 1525 param.stop_factor = 1.5; 1526 param.noprint = 1; 1527 one (&sqrlo_basecase_threshold, ¶m); 1528 1529 param.name = "SQRLO_DC_THRESHOLD"; 1530 param.min_size = 8; 1531 param.min_is_always = 0; 1532 param.max_size = SQRLO_DC_THRESHOLD_LIMIT-1; 1533 one (&sqrlo_dc_threshold, ¶m); 1534 1535 if (sqrlo_basecase_threshold >= sqrlo_dc_threshold) 1536 { 1537 print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_dc_threshold); 1538 print_define_remark ("SQRLO_DC_THRESHOLD", 0, "never mpn_sqrlo_basecase"); 1539 } 1540 else 1541 { 1542 print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_basecase_threshold); 1543 print_define ("SQRLO_DC_THRESHOLD", sqrlo_dc_threshold); 1544 } 1545 1546 if (WANT_FFT && sqr_fft_threshold < MP_SIZE_T_MAX / 2) 1547 { 1548 param.name = "SQRLO_SQR_THRESHOLD"; 1549 param.min_size = sqrlo_dc_threshold; 1550 param.max_size = 2 * sqr_fft_threshold; 1551 param.noprint = 0; 1552 param.step_factor = 0.03; 1553 one (&sqrlo_sqr_threshold, ¶m); 1554 } 1555 else 1556 print_define_remark ("SQRLO_SQR_THRESHOLD", MP_SIZE_T_MAX, 1557 "without FFT use sqrlo forever"); 1558 } 1559 1560 void 1561 tune_mulmid (void) 1562 { 1563 static struct param_t param; 1564 1565 param.name = "MULMID_TOOM42_THRESHOLD"; 1566 param.function = speed_mpn_mulmid_n; 1567 param.min_size = 4; 1568 param.max_size = 100; 1569 one (&mulmid_toom42_threshold, ¶m); 1570 } 1571 1572 void 1573 tune_mulmod_bnm1 (void) 1574 { 1575 static struct param_t param; 1576 1577 param.name = "MULMOD_BNM1_THRESHOLD"; 1578 param.function = speed_mpn_mulmod_bnm1; 1579 param.min_size = 4; 1580 param.max_size = 100; 1581 one (&mulmod_bnm1_threshold, ¶m); 1582 } 1583 1584 void 1585 tune_sqrmod_bnm1 (void) 1586 { 1587 static struct param_t param; 1588 1589 param.name = "SQRMOD_BNM1_THRESHOLD"; 1590 param.function = speed_mpn_sqrmod_bnm1; 1591 param.min_size = 4; 1592 param.max_size = 100; 1593 one (&sqrmod_bnm1_threshold, ¶m); 1594 } 1595 1596 1597 /* Start the basecase from 3, since 1 is a special case, and if mul_basecase 1598 is faster only at size==2 then we don't want to bother with extra code 1599 just for that. Start karatsuba from 4 same as MUL above. */ 1600 1601 void 1602 tune_sqr (void) 1603 { 1604 /* disabled until tuned */ 1605 SQR_FFT_THRESHOLD = MP_SIZE_T_MAX; 1606 1607 if (HAVE_NATIVE_mpn_sqr_basecase) 1608 { 1609 print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)"); 1610 sqr_basecase_threshold = 0; 1611 } 1612 else 1613 { 1614 static struct param_t param; 1615 param.name = "SQR_BASECASE_THRESHOLD"; 1616 param.function = speed_mpn_sqr; 1617 param.min_size = 3; 1618 param.min_is_always = 1; 1619 param.max_size = TUNE_SQR_TOOM2_MAX; 1620 param.noprint = 1; 1621 one (&sqr_basecase_threshold, ¶m); 1622 } 1623 1624 { 1625 static struct param_t param; 1626 param.name = "SQR_TOOM2_THRESHOLD"; 1627 param.function = speed_mpn_sqr; 1628 param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE); 1629 param.max_size = TUNE_SQR_TOOM2_MAX; 1630 param.noprint = 1; 1631 one (&sqr_toom2_threshold, ¶m); 1632 1633 if (! HAVE_NATIVE_mpn_sqr_basecase 1634 && sqr_toom2_threshold < sqr_basecase_threshold) 1635 { 1636 /* Karatsuba becomes faster than mul_basecase before 1637 sqr_basecase does. Arrange for the expression 1638 "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which 1639 selects mpn_sqr_basecase in mpn_sqr to be false, by setting 1640 SQR_TOOM2_THRESHOLD to zero, making 1641 SQR_BASECASE_THRESHOLD the toom2 threshold. */ 1642 1643 sqr_basecase_threshold = SQR_TOOM2_THRESHOLD; 1644 SQR_TOOM2_THRESHOLD = 0; 1645 1646 print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold, 1647 "toom2"); 1648 print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD, 1649 "never sqr_basecase"); 1650 } 1651 else 1652 { 1653 if (! HAVE_NATIVE_mpn_sqr_basecase) 1654 print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold); 1655 print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD); 1656 } 1657 } 1658 1659 { 1660 static struct param_t param; 1661 mp_size_t next_toom_start; 1662 int something_changed; 1663 1664 param.function = speed_mpn_sqr; 1665 param.noprint = 1; 1666 1667 /* Threshold sequence loop. Disable functions that would be used in a very 1668 narrow range, re-measuring things when that happens. */ 1669 something_changed = 1; 1670 while (something_changed) 1671 { 1672 something_changed = 0; 1673 1674 next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold); 1675 1676 sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT; 1677 param.name = "SQR_TOOM3_THRESHOLD"; 1678 param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE); 1679 param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1; 1680 one (&sqr_toom3_threshold, ¶m); 1681 1682 next_toom_start = MAX (next_toom_start, sqr_toom3_threshold); 1683 1684 if (sqr_toom4_threshold != 0) 1685 { 1686 param.name = "SQR_TOOM4_THRESHOLD"; 1687 sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT; 1688 param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE); 1689 param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1; 1690 one (&sqr_toom4_threshold, ¶m); 1691 1692 if (next_toom_start * 1.05 >= sqr_toom4_threshold) 1693 { 1694 sqr_toom4_threshold = 0; 1695 something_changed = 1; 1696 } 1697 } 1698 1699 next_toom_start = MAX (next_toom_start, sqr_toom4_threshold); 1700 1701 if (sqr_toom6_threshold != 0) 1702 { 1703 param.name = "SQR_TOOM6_THRESHOLD"; 1704 sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT; 1705 param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE); 1706 param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1; 1707 one (&sqr_toom6_threshold, ¶m); 1708 1709 if (next_toom_start * 1.05 >= sqr_toom6_threshold) 1710 { 1711 sqr_toom6_threshold = 0; 1712 something_changed = 1; 1713 } 1714 } 1715 1716 next_toom_start = MAX (next_toom_start, sqr_toom6_threshold); 1717 1718 if (sqr_toom8_threshold != 0) 1719 { 1720 param.name = "SQR_TOOM8_THRESHOLD"; 1721 sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT; 1722 param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE); 1723 param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1; 1724 one (&sqr_toom8_threshold, ¶m); 1725 1726 if (next_toom_start * 1.05 >= sqr_toom8_threshold) 1727 { 1728 sqr_toom8_threshold = 0; 1729 something_changed = 1; 1730 } 1731 } 1732 } 1733 1734 print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD); 1735 print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD); 1736 print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD); 1737 print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD); 1738 } 1739 } 1740 1741 1742 void 1743 tune_dc_div (void) 1744 { 1745 s.r = 0; /* clear to make speed function do 2n/n */ 1746 { 1747 static struct param_t param; 1748 param.name = "DC_DIV_QR_THRESHOLD"; 1749 param.function = speed_mpn_sbpi1_div_qr; 1750 param.function2 = speed_mpn_dcpi1_div_qr; 1751 param.min_size = 6; 1752 one (&dc_div_qr_threshold, ¶m); 1753 } 1754 { 1755 static struct param_t param; 1756 param.name = "DC_DIVAPPR_Q_THRESHOLD"; 1757 param.function = speed_mpn_sbpi1_divappr_q; 1758 param.function2 = speed_mpn_dcpi1_divappr_q; 1759 param.min_size = 6; 1760 one (&dc_divappr_q_threshold, ¶m); 1761 } 1762 } 1763 1764 static double 1765 speed_mpn_sbordcpi1_div_qr (struct speed_params *s) 1766 { 1767 if (s->size < DC_DIV_QR_THRESHOLD) 1768 return speed_mpn_sbpi1_div_qr (s); 1769 else 1770 return speed_mpn_dcpi1_div_qr (s); 1771 } 1772 1773 void 1774 tune_mu_div (void) 1775 { 1776 s.r = 0; /* clear to make speed function do 2n/n */ 1777 { 1778 static struct param_t param; 1779 param.name = "MU_DIV_QR_THRESHOLD"; 1780 param.function = speed_mpn_dcpi1_div_qr; 1781 param.function2 = speed_mpn_mu_div_qr; 1782 param.min_size = mul_toom22_threshold; 1783 param.max_size = 5000; 1784 param.step_factor = 0.02; 1785 one (&mu_div_qr_threshold, ¶m); 1786 } 1787 { 1788 static struct param_t param; 1789 param.name = "MU_DIVAPPR_Q_THRESHOLD"; 1790 param.function = speed_mpn_dcpi1_divappr_q; 1791 param.function2 = speed_mpn_mu_divappr_q; 1792 param.min_size = mul_toom22_threshold; 1793 param.max_size = 5000; 1794 param.step_factor = 0.02; 1795 one (&mu_divappr_q_threshold, ¶m); 1796 } 1797 { 1798 static struct param_t param; 1799 param.name = "MUPI_DIV_QR_THRESHOLD"; 1800 param.function = speed_mpn_sbordcpi1_div_qr; 1801 param.function2 = speed_mpn_mupi_div_qr; 1802 param.min_size = 6; 1803 param.min_is_always = 1; 1804 param.max_size = 1000; 1805 param.step_factor = 0.02; 1806 one (&mupi_div_qr_threshold, ¶m); 1807 } 1808 } 1809 1810 void 1811 tune_dc_bdiv (void) 1812 { 1813 s.r = 0; /* clear to make speed function do 2n/n*/ 1814 { 1815 static struct param_t param; 1816 param.name = "DC_BDIV_QR_THRESHOLD"; 1817 param.function = speed_mpn_sbpi1_bdiv_qr; 1818 param.function2 = speed_mpn_dcpi1_bdiv_qr; 1819 param.min_size = 4; 1820 one (&dc_bdiv_qr_threshold, ¶m); 1821 } 1822 { 1823 static struct param_t param; 1824 param.name = "DC_BDIV_Q_THRESHOLD"; 1825 param.function = speed_mpn_sbpi1_bdiv_q; 1826 param.function2 = speed_mpn_dcpi1_bdiv_q; 1827 param.min_size = 4; 1828 one (&dc_bdiv_q_threshold, ¶m); 1829 } 1830 } 1831 1832 void 1833 tune_mu_bdiv (void) 1834 { 1835 s.r = 0; /* clear to make speed function do 2n/n*/ 1836 { 1837 static struct param_t param; 1838 param.name = "MU_BDIV_QR_THRESHOLD"; 1839 param.function = speed_mpn_dcpi1_bdiv_qr; 1840 param.function2 = speed_mpn_mu_bdiv_qr; 1841 param.min_size = dc_bdiv_qr_threshold; 1842 param.max_size = 5000; 1843 param.step_factor = 0.02; 1844 one (&mu_bdiv_qr_threshold, ¶m); 1845 } 1846 { 1847 static struct param_t param; 1848 param.name = "MU_BDIV_Q_THRESHOLD"; 1849 param.function = speed_mpn_dcpi1_bdiv_q; 1850 param.function2 = speed_mpn_mu_bdiv_q; 1851 param.min_size = dc_bdiv_q_threshold; 1852 param.max_size = 5000; 1853 param.step_factor = 0.02; 1854 one (&mu_bdiv_q_threshold, ¶m); 1855 } 1856 } 1857 1858 void 1859 tune_invertappr (void) 1860 { 1861 static struct param_t param; 1862 1863 param.function = speed_mpn_ni_invertappr; 1864 param.name = "INV_MULMOD_BNM1_THRESHOLD"; 1865 param.min_size = 5; 1866 one (&inv_mulmod_bnm1_threshold, ¶m); 1867 1868 param.function = speed_mpn_invertappr; 1869 param.name = "INV_NEWTON_THRESHOLD"; 1870 param.min_size = 5; 1871 one (&inv_newton_threshold, ¶m); 1872 } 1873 1874 void 1875 tune_invert (void) 1876 { 1877 static struct param_t param; 1878 1879 param.function = speed_mpn_invert; 1880 param.name = "INV_APPR_THRESHOLD"; 1881 param.min_size = 5; 1882 one (&inv_appr_threshold, ¶m); 1883 } 1884 1885 void 1886 tune_binvert (void) 1887 { 1888 static struct param_t param; 1889 1890 param.function = speed_mpn_binvert; 1891 param.name = "BINV_NEWTON_THRESHOLD"; 1892 param.min_size = 8; /* pointless with smaller operands */ 1893 one (&binv_newton_threshold, ¶m); 1894 } 1895 1896 void 1897 tune_redc (void) 1898 { 1899 #define TUNE_REDC_2_MAX 100 1900 #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2 1901 #define WANT_REDC_2 1 1902 #endif 1903 1904 #if WANT_REDC_2 1905 { 1906 static struct param_t param; 1907 param.name = "REDC_1_TO_REDC_2_THRESHOLD"; 1908 param.function = speed_mpn_redc_1; 1909 param.function2 = speed_mpn_redc_2; 1910 param.min_size = 1; 1911 param.min_is_always = 1; 1912 param.max_size = TUNE_REDC_2_MAX; 1913 param.noprint = 1; 1914 param.stop_factor = 1.5; 1915 one (&redc_1_to_redc_2_threshold, ¶m); 1916 } 1917 { 1918 static struct param_t param; 1919 param.name = "REDC_2_TO_REDC_N_THRESHOLD"; 1920 param.function = speed_mpn_redc_2; 1921 param.function2 = speed_mpn_redc_n; 1922 param.min_size = 16; 1923 param.noprint = 1; 1924 one (&redc_2_to_redc_n_threshold, ¶m); 1925 } 1926 if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold) 1927 { 1928 redc_2_to_redc_n_threshold = 0; /* disable redc_2 */ 1929 1930 /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as 1931 REDC_1_TO_REDC_2_THRESHOLD. */ 1932 { 1933 static struct param_t param; 1934 param.name = "REDC_1_TO_REDC_2_THRESHOLD"; 1935 param.function = speed_mpn_redc_1; 1936 param.function2 = speed_mpn_redc_n; 1937 param.min_size = 16; 1938 param.noprint = 1; 1939 one (&redc_1_to_redc_2_threshold, ¶m); 1940 } 1941 } 1942 print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD); 1943 print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD); 1944 #else 1945 { 1946 static struct param_t param; 1947 param.name = "REDC_1_TO_REDC_N_THRESHOLD"; 1948 param.function = speed_mpn_redc_1; 1949 param.function2 = speed_mpn_redc_n; 1950 param.min_size = 16; 1951 one (&redc_1_to_redc_n_threshold, ¶m); 1952 } 1953 #endif 1954 } 1955 1956 void 1957 tune_matrix22_mul (void) 1958 { 1959 static struct param_t param; 1960 param.name = "MATRIX22_STRASSEN_THRESHOLD"; 1961 param.function = speed_mpn_matrix22_mul; 1962 param.min_size = 2; 1963 one (&matrix22_strassen_threshold, ¶m); 1964 } 1965 1966 void 1967 tune_hgcd2 (void) 1968 { 1969 static struct param_t param; 1970 hgcd2_func_t *f[5] = 1971 { mpn_hgcd2_1, 1972 mpn_hgcd2_2, 1973 mpn_hgcd2_3, 1974 mpn_hgcd2_4, 1975 mpn_hgcd2_5 }; 1976 speed_function_t speed_f[5] = 1977 { speed_mpn_hgcd2_1, 1978 speed_mpn_hgcd2_2, 1979 speed_mpn_hgcd2_3, 1980 speed_mpn_hgcd2_4, 1981 speed_mpn_hgcd2_5 }; 1982 int best; 1983 1984 s.size = 1; 1985 best = one_method (5, speed_f, "mpn_hgcd2", "HGCD2_DIV1_METHOD", ¶m); 1986 1987 /* Use selected function when tuning hgcd and gcd */ 1988 hgcd2_func = f[best]; 1989 } 1990 1991 void 1992 tune_hgcd (void) 1993 { 1994 static struct param_t param; 1995 param.name = "HGCD_THRESHOLD"; 1996 param.function = speed_mpn_hgcd; 1997 /* We seem to get strange results for small sizes */ 1998 param.min_size = 30; 1999 one (&hgcd_threshold, ¶m); 2000 } 2001 2002 void 2003 tune_hgcd_appr (void) 2004 { 2005 static struct param_t param; 2006 param.name = "HGCD_APPR_THRESHOLD"; 2007 param.function = speed_mpn_hgcd_appr; 2008 /* We seem to get strange results for small sizes */ 2009 param.min_size = 50; 2010 param.stop_since_change = 150; 2011 one (&hgcd_appr_threshold, ¶m); 2012 } 2013 2014 void 2015 tune_hgcd_reduce (void) 2016 { 2017 static struct param_t param; 2018 param.name = "HGCD_REDUCE_THRESHOLD"; 2019 param.function = speed_mpn_hgcd_reduce; 2020 param.min_size = 30; 2021 param.max_size = 7000; 2022 param.step_factor = 0.04; 2023 one (&hgcd_reduce_threshold, ¶m); 2024 } 2025 2026 void 2027 tune_gcd_dc (void) 2028 { 2029 static struct param_t param; 2030 param.name = "GCD_DC_THRESHOLD"; 2031 param.function = speed_mpn_gcd; 2032 param.min_size = hgcd_threshold; 2033 param.max_size = 3000; 2034 param.step_factor = 0.02; 2035 one (&gcd_dc_threshold, ¶m); 2036 } 2037 2038 void 2039 tune_gcdext_dc (void) 2040 { 2041 static struct param_t param; 2042 param.name = "GCDEXT_DC_THRESHOLD"; 2043 param.function = speed_mpn_gcdext; 2044 param.min_size = hgcd_threshold; 2045 param.max_size = 3000; 2046 param.step_factor = 0.02; 2047 one (&gcdext_dc_threshold, ¶m); 2048 } 2049 2050 /* In tune_powm_sec we compute the table used by the win_size function. The 2051 cutoff points are in exponent bits, disregarding other operand sizes. It is 2052 not possible to use the one framework since it currently uses a granularity 2053 of full limbs. 2054 */ 2055 2056 /* This win_size replaces the variant in the powm code, allowing us to 2057 control k in the k-ary algorithms. */ 2058 int winsize; 2059 int 2060 win_size (mp_bitcnt_t eb) 2061 { 2062 return winsize; 2063 } 2064 2065 void 2066 tune_powm_sec (void) 2067 { 2068 mp_size_t n; 2069 int k, i; 2070 mp_size_t itch; 2071 mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff; 2072 const int n_max = 3000 / GMP_NUMB_BITS; 2073 #define n_measurements 5 2074 mp_ptr rp, bp, ep, mp, tp; 2075 double ttab[n_measurements], tk, tkp1; 2076 TMP_DECL; 2077 TMP_MARK; 2078 2079 possible_nbits_cutoff = 0; 2080 2081 k = 1; 2082 2083 winsize = 10; /* the itch function needs this */ 2084 itch = mpn_sec_powm_itch (n_max, n_max * GMP_NUMB_BITS, n_max); 2085 2086 rp = TMP_ALLOC_LIMBS (n_max); 2087 bp = TMP_ALLOC_LIMBS (n_max); 2088 ep = TMP_ALLOC_LIMBS (n_max); 2089 mp = TMP_ALLOC_LIMBS (n_max); 2090 tp = TMP_ALLOC_LIMBS (itch); 2091 2092 mpn_random (bp, n_max); 2093 mpn_random (mp, n_max); 2094 mp[0] |= 1; 2095 2096 /* How about taking the M operand size into account? 2097 2098 An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming 2099 B = O(M)). 2100 2101 Using k-ary and no sliding window, the precomputation will need time 2102 O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) + 2103 O(log(E)/k*M(N)), for the squarings, multiplications, respectively. 2104 2105 An operation R=powm_sec(B,E,N) will take time like powm. 2106 2107 Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the 2108 main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) + 2109 O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full 2110 table reads, respectively. */ 2111 2112 printf ("#define POWM_SEC_TABLE "); 2113 2114 /* For nbits == 1, we should always use k == 1, so no need to tune 2115 that. Starting with nbits == 2 also ensure that nbits always is 2116 larger than the windowsize k+1. */ 2117 for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; ) 2118 { 2119 n = (nbits - 1) / GMP_NUMB_BITS + 1; 2120 2121 /* Generate E such that sliding-window for k and k+1 works equally 2122 well/poorly (but sliding is not used in powm_sec, of course). */ 2123 for (i = 0; i < n; i++) 2124 ep[i] = ~CNST_LIMB(0); 2125 2126 winsize = k; 2127 for (i = 0; i < n_measurements; i++) 2128 { 2129 speed_starttime (); 2130 mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp); 2131 ttab[i] = speed_endtime (); 2132 } 2133 tk = median (ttab, n_measurements); 2134 2135 winsize = k + 1; 2136 speed_starttime (); 2137 for (i = 0; i < n_measurements; i++) 2138 { 2139 speed_starttime (); 2140 mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp); 2141 ttab[i] = speed_endtime (); 2142 } 2143 tkp1 = median (ttab, n_measurements); 2144 /* 2145 printf ("testing: %ld, %d", nbits, k, ep[n-1]); 2146 printf (" %10.5f %10.5f\n", tk, tkp1); 2147 */ 2148 if (tkp1 < tk) 2149 { 2150 if (possible_nbits_cutoff) 2151 { 2152 /* Two consecutive sizes indicate k increase, obey. */ 2153 2154 /* Must always have x[k] >= k */ 2155 ASSERT_ALWAYS (possible_nbits_cutoff >= k); 2156 2157 if (k > 1) 2158 printf (","); 2159 printf ("%ld", (long) possible_nbits_cutoff); 2160 k++; 2161 possible_nbits_cutoff = 0; 2162 } 2163 else 2164 { 2165 /* One measurement indicate k increase, save nbits for further 2166 consideration. */ 2167 /* The new larger k gets used for sizes > the cutoff 2168 value, hence the cutoff should be one less than the 2169 smallest size where it gives a speedup. */ 2170 possible_nbits_cutoff = nbits - 1; 2171 } 2172 } 2173 else 2174 possible_nbits_cutoff = 0; 2175 2176 nbits_next = nbits * 65 / 64; 2177 nbits = nbits_next + (nbits_next == nbits); 2178 } 2179 printf ("\n"); 2180 TMP_FREE; 2181 } 2182 2183 2184 /* size_extra==1 reflects the fact that with high<divisor one division is 2185 always skipped. Forcing high<divisor while testing ensures consistency 2186 while stepping through sizes, ie. that size-1 divides will be done each 2187 time. 2188 2189 min_size==2 and min_is_always are used so that if plain division is only 2190 better at size==1 then don't bother including that code just for that 2191 case, instead go with preinv always and get a size saving. */ 2192 2193 #define DIV_1_PARAMS \ 2194 param.check_size = 256; \ 2195 param.min_size = 2; \ 2196 param.min_is_always = 1; \ 2197 param.data_high = DATA_HIGH_LT_R; \ 2198 param.size_extra = 1; \ 2199 param.stop_factor = 2.0; 2200 2201 2202 double (*tuned_speed_mpn_divrem_1) (struct speed_params *); 2203 2204 void 2205 tune_divrem_1 (void) 2206 { 2207 /* plain version by default */ 2208 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1; 2209 2210 /* No support for tuning native assembler code, do that by hand and put 2211 the results in the .asm file, there's no need for such thresholds to 2212 appear in gmp-mparam.h. */ 2213 if (HAVE_NATIVE_mpn_divrem_1) 2214 return; 2215 2216 if (GMP_NAIL_BITS != 0) 2217 { 2218 print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX, 2219 "no preinv with nails"); 2220 print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX, 2221 "no preinv with nails"); 2222 return; 2223 } 2224 2225 if (UDIV_PREINV_ALWAYS) 2226 { 2227 print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always"); 2228 print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L); 2229 return; 2230 } 2231 2232 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune; 2233 2234 /* Tune for the integer part of mpn_divrem_1. This will very possibly be 2235 a bit out for the fractional part, but that's too bad, the integer part 2236 is more important. */ 2237 { 2238 static struct param_t param; 2239 param.name = "DIVREM_1_NORM_THRESHOLD"; 2240 DIV_1_PARAMS; 2241 s.r = randlimb_norm (); 2242 param.function = speed_mpn_divrem_1_tune; 2243 one (&divrem_1_norm_threshold, ¶m); 2244 } 2245 { 2246 static struct param_t param; 2247 param.name = "DIVREM_1_UNNORM_THRESHOLD"; 2248 DIV_1_PARAMS; 2249 s.r = randlimb_half (); 2250 param.function = speed_mpn_divrem_1_tune; 2251 one (&divrem_1_unnorm_threshold, ¶m); 2252 } 2253 } 2254 2255 void 2256 tune_div_qr_1 (void) 2257 { 2258 if (!HAVE_NATIVE_mpn_div_qr_1n_pi1) 2259 { 2260 static struct param_t param; 2261 speed_function_t f[2] = 2262 { 2263 speed_mpn_div_qr_1n_pi1_1, 2264 speed_mpn_div_qr_1n_pi1_2, 2265 }; 2266 2267 s.size = 10; 2268 s.r = randlimb_norm (); 2269 2270 one_method (2, f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", ¶m); 2271 } 2272 2273 { 2274 static struct param_t param; 2275 param.name = "DIV_QR_1_NORM_THRESHOLD"; 2276 DIV_1_PARAMS; 2277 param.min_size = 1; 2278 param.min_is_always = 0; 2279 s.r = randlimb_norm (); 2280 param.function = speed_mpn_div_qr_1_tune; 2281 one (&div_qr_1_norm_threshold, ¶m); 2282 } 2283 { 2284 static struct param_t param; 2285 param.name = "DIV_QR_1_UNNORM_THRESHOLD"; 2286 DIV_1_PARAMS; 2287 param.min_size = 1; 2288 param.min_is_always = 0; 2289 s.r = randlimb_half(); 2290 param.function = speed_mpn_div_qr_1_tune; 2291 one (&div_qr_1_unnorm_threshold, ¶m); 2292 } 2293 } 2294 2295 2296 void 2297 tune_mod_1 (void) 2298 { 2299 /* No support for tuning native assembler code, do that by hand and put 2300 the results in the .asm file, there's no need for such thresholds to 2301 appear in gmp-mparam.h. */ 2302 if (HAVE_NATIVE_mpn_mod_1) 2303 return; 2304 2305 if (GMP_NAIL_BITS != 0) 2306 { 2307 print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX, 2308 "no preinv with nails"); 2309 print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX, 2310 "no preinv with nails"); 2311 return; 2312 } 2313 2314 if (!HAVE_NATIVE_mpn_mod_1_1p) 2315 { 2316 static struct param_t param; 2317 speed_function_t f[2] = 2318 { 2319 speed_mpn_mod_1_1_1, 2320 speed_mpn_mod_1_1_2, 2321 }; 2322 2323 s.size = 10; 2324 s.r = randlimb_half (); 2325 one_method (2, f, "mpn_mod_1_1", "MOD_1_1P_METHOD", ¶m); 2326 } 2327 2328 if (UDIV_PREINV_ALWAYS) 2329 { 2330 print_define ("MOD_1_NORM_THRESHOLD", 0L); 2331 print_define ("MOD_1_UNNORM_THRESHOLD", 0L); 2332 } 2333 else 2334 { 2335 { 2336 static struct param_t param; 2337 param.name = "MOD_1_NORM_THRESHOLD"; 2338 DIV_1_PARAMS; 2339 s.r = randlimb_norm (); 2340 param.function = speed_mpn_mod_1_tune; 2341 one (&mod_1_norm_threshold, ¶m); 2342 } 2343 { 2344 static struct param_t param; 2345 param.name = "MOD_1_UNNORM_THRESHOLD"; 2346 DIV_1_PARAMS; 2347 s.r = randlimb_half (); 2348 param.function = speed_mpn_mod_1_tune; 2349 one (&mod_1_unnorm_threshold, ¶m); 2350 } 2351 } 2352 { 2353 static struct param_t param; 2354 2355 param.check_size = 256; 2356 2357 s.r = randlimb_norm (); 2358 param.function = speed_mpn_mod_1_tune; 2359 2360 param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD"; 2361 param.min_size = 2; 2362 one (&mod_1n_to_mod_1_1_threshold, ¶m); 2363 } 2364 2365 { 2366 static struct param_t param; 2367 2368 param.check_size = 256; 2369 s.r = randlimb_half (); 2370 param.noprint = 1; 2371 2372 param.function = speed_mpn_mod_1_1; 2373 param.function2 = speed_mpn_mod_1_2; 2374 param.min_is_always = 1; 2375 param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD"; 2376 param.min_size = 2; 2377 one (&mod_1_1_to_mod_1_2_threshold, ¶m); 2378 2379 param.function = speed_mpn_mod_1_2; 2380 param.function2 = speed_mpn_mod_1_4; 2381 param.min_is_always = 1; 2382 param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD"; 2383 param.min_size = 1; 2384 one (&mod_1_2_to_mod_1_4_threshold, ¶m); 2385 2386 if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold) 2387 { 2388 /* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */ 2389 mod_1_2_to_mod_1_4_threshold = 0; 2390 2391 param.function = speed_mpn_mod_1_1; 2392 param.function2 = speed_mpn_mod_1_4; 2393 param.min_is_always = 1; 2394 param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake"; 2395 param.min_size = 2; 2396 one (&mod_1_1_to_mod_1_2_threshold, ¶m); 2397 } 2398 2399 param.function = speed_mpn_mod_1_tune; 2400 param.function2 = NULL; 2401 param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD"; 2402 param.min_size = 2; 2403 param.min_is_always = 0; 2404 one (&mod_1u_to_mod_1_1_threshold, ¶m); 2405 2406 if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold) 2407 mod_1_1_to_mod_1_2_threshold = 0; 2408 if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold) 2409 mod_1_2_to_mod_1_4_threshold = 0; 2410 2411 print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL); 2412 print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold, 2413 mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL); 2414 print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold, 2415 mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL); 2416 } 2417 2418 { 2419 static struct param_t param; 2420 2421 param.check_size = 256; 2422 2423 param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD"; 2424 s.r = randlimb_norm (); 2425 param.function = speed_mpn_preinv_mod_1; 2426 param.function2 = speed_mpn_mod_1_tune; 2427 param.min_size = 1; 2428 one (&preinv_mod_1_to_mod_1_threshold, ¶m); 2429 } 2430 } 2431 2432 2433 /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would 2434 imply that udiv_qrnnd_preinv is worth using, but it seems most 2435 straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div 2436 directly. */ 2437 2438 void 2439 tune_preinv_divrem_1 (void) 2440 { 2441 static struct param_t param; 2442 speed_function_t divrem_1; 2443 const char *divrem_1_name; 2444 double t1, t2; 2445 2446 if (GMP_NAIL_BITS != 0) 2447 { 2448 print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails"); 2449 return; 2450 } 2451 2452 /* Any native version of mpn_preinv_divrem_1 is assumed to exist because 2453 it's faster than mpn_divrem_1. */ 2454 if (HAVE_NATIVE_mpn_preinv_divrem_1) 2455 { 2456 print_define_remark ("USE_PREINV_DIVREM_1", 1, "native"); 2457 return; 2458 } 2459 2460 /* If udiv_qrnnd_preinv is the only division method then of course 2461 mpn_preinv_divrem_1 should be used. */ 2462 if (UDIV_PREINV_ALWAYS) 2463 { 2464 print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always"); 2465 return; 2466 } 2467 2468 /* If we've got an assembler version of mpn_divrem_1, then compare against 2469 that, not the mpn_divrem_1_div generic C. */ 2470 if (HAVE_NATIVE_mpn_divrem_1) 2471 { 2472 divrem_1 = speed_mpn_divrem_1; 2473 divrem_1_name = "mpn_divrem_1"; 2474 } 2475 else 2476 { 2477 divrem_1 = speed_mpn_divrem_1_div; 2478 divrem_1_name = "mpn_divrem_1_div"; 2479 } 2480 2481 param.data_high = DATA_HIGH_LT_R; /* allow skip one division */ 2482 s.size = 200; /* generous but not too big */ 2483 /* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case, 2484 since in general that's probably most common, though in fact for a 2485 64-bit limb mp_bases[10].big_base is normalized. */ 2486 s.r = urandom() & (GMP_NUMB_MASK >> 4); 2487 if (s.r == 0) s.r = 123; 2488 2489 t1 = tuneup_measure (speed_mpn_preinv_divrem_1, ¶m, &s); 2490 t2 = tuneup_measure (divrem_1, ¶m, &s); 2491 if (t1 == -1.0 || t2 == -1.0) 2492 { 2493 printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n", 2494 divrem_1_name, (long) s.size); 2495 abort (); 2496 } 2497 if (option_trace >= 1) 2498 printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n", 2499 (long) s.size, t1, divrem_1_name, t2); 2500 2501 print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL); 2502 } 2503 2504 2505 2506 void 2507 tune_divrem_2 (void) 2508 { 2509 static struct param_t param; 2510 2511 /* No support for tuning native assembler code, do that by hand and put 2512 the results in the .asm file, and there's no need for such thresholds 2513 to appear in gmp-mparam.h. */ 2514 if (HAVE_NATIVE_mpn_divrem_2) 2515 return; 2516 2517 if (GMP_NAIL_BITS != 0) 2518 { 2519 print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX, 2520 "no preinv with nails"); 2521 return; 2522 } 2523 2524 if (UDIV_PREINV_ALWAYS) 2525 { 2526 print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always"); 2527 return; 2528 } 2529 2530 /* Tune for the integer part of mpn_divrem_2. This will very possibly be 2531 a bit out for the fractional part, but that's too bad, the integer part 2532 is more important. 2533 2534 min_size must be >=2 since nsize>=2 is required, but is set to 4 to save 2535 code space if plain division is better only at size==2 or size==3. */ 2536 param.name = "DIVREM_2_THRESHOLD"; 2537 param.check_size = 256; 2538 param.min_size = 4; 2539 param.min_is_always = 1; 2540 param.size_extra = 2; /* does qsize==nsize-2 divisions */ 2541 param.stop_factor = 2.0; 2542 2543 s.r = randlimb_norm (); 2544 param.function = speed_mpn_divrem_2; 2545 one (&divrem_2_threshold, ¶m); 2546 } 2547 2548 void 2549 tune_div_qr_2 (void) 2550 { 2551 static struct param_t param; 2552 param.name = "DIV_QR_2_PI2_THRESHOLD"; 2553 param.function = speed_mpn_div_qr_2n; 2554 param.check_size = 500; 2555 param.min_size = 4; 2556 one (&div_qr_2_pi2_threshold, ¶m); 2557 } 2558 2559 /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so 2560 tune for that. Its speed can differ on odd or even divisor, so take an 2561 average threshold for the two. 2562 2563 mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1 2564 might not vary that way, but don't test this since high<divisor isn't 2565 expected to occur often with small divisors. */ 2566 2567 void 2568 tune_divexact_1 (void) 2569 { 2570 static struct param_t param; 2571 mp_size_t thresh[2], average; 2572 int low, i; 2573 2574 /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a 2575 full mpn_divrem_1. */ 2576 if (HAVE_NATIVE_mpn_divexact_1) 2577 { 2578 print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)"); 2579 return; 2580 } 2581 2582 ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL); 2583 2584 param.name = "DIVEXACT_1_THRESHOLD"; 2585 param.data_high = DATA_HIGH_GE_R; 2586 param.check_size = 256; 2587 param.min_size = 2; 2588 param.stop_factor = 1.5; 2589 param.function = tuned_speed_mpn_divrem_1; 2590 param.function2 = speed_mpn_divexact_1; 2591 param.noprint = 1; 2592 2593 print_define_start (param.name); 2594 2595 for (low = 0; low <= 1; low++) 2596 { 2597 s.r = randlimb_half(); 2598 if (low == 0) 2599 s.r |= 1; 2600 else 2601 s.r &= ~CNST_LIMB(7); 2602 2603 one (&thresh[low], ¶m); 2604 if (option_trace) 2605 printf ("low=%d thresh %ld\n", low, (long) thresh[low]); 2606 2607 if (thresh[low] == MP_SIZE_T_MAX) 2608 { 2609 average = MP_SIZE_T_MAX; 2610 goto divexact_1_done; 2611 } 2612 } 2613 2614 if (option_trace) 2615 { 2616 printf ("average of:"); 2617 for (i = 0; i < numberof(thresh); i++) 2618 printf (" %ld", (long) thresh[i]); 2619 printf ("\n"); 2620 } 2621 2622 average = 0; 2623 for (i = 0; i < numberof(thresh); i++) 2624 average += thresh[i]; 2625 average /= numberof(thresh); 2626 2627 /* If divexact turns out to be better as early as 3 limbs, then use it 2628 always, so as to reduce code size and conditional jumps. */ 2629 if (average <= 3) 2630 average = 0; 2631 2632 divexact_1_done: 2633 print_define_end (param.name, average); 2634 } 2635 2636 2637 /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the 2638 same as mpn_mod_1, but this might not be true of an assembler 2639 implementation. The threshold used is an average based on data where a 2640 divide can be skipped and where it can't. 2641 2642 If modexact turns out to be better as early as 3 limbs, then use it 2643 always, so as to reduce code size and conditional jumps. */ 2644 2645 void 2646 tune_modexact_1_odd (void) 2647 { 2648 static struct param_t param; 2649 mp_size_t thresh_lt, thresh_ge, average; 2650 2651 #if 0 2652 /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed 2653 of a full mpn_mod_1. */ 2654 if (HAVE_NATIVE_mpn_modexact_1_odd) 2655 { 2656 print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1"); 2657 return; 2658 } 2659 #endif 2660 2661 param.name = "BMOD_1_TO_MOD_1_THRESHOLD"; 2662 param.check_size = 256; 2663 param.min_size = 2; 2664 param.stop_factor = 1.5; 2665 param.function = speed_mpn_modexact_1c_odd; 2666 param.function2 = speed_mpn_mod_1_tune; 2667 param.noprint = 1; 2668 s.r = randlimb_half () | 1; 2669 2670 print_define_start (param.name); 2671 2672 param.data_high = DATA_HIGH_LT_R; 2673 one (&thresh_lt, ¶m); 2674 if (option_trace) 2675 printf ("lt thresh %ld\n", (long) thresh_lt); 2676 2677 average = thresh_lt; 2678 if (thresh_lt != MP_SIZE_T_MAX) 2679 { 2680 param.data_high = DATA_HIGH_GE_R; 2681 one (&thresh_ge, ¶m); 2682 if (option_trace) 2683 printf ("ge thresh %ld\n", (long) thresh_ge); 2684 2685 if (thresh_ge != MP_SIZE_T_MAX) 2686 { 2687 average = (thresh_ge + thresh_lt) / 2; 2688 if (thresh_ge <= 3) 2689 average = 0; 2690 } 2691 } 2692 2693 print_define_end (param.name, average); 2694 } 2695 2696 2697 void 2698 tune_jacobi_base (void) 2699 { 2700 static struct param_t param; 2701 speed_function_t f[4] = 2702 { 2703 speed_mpn_jacobi_base_1, 2704 speed_mpn_jacobi_base_2, 2705 speed_mpn_jacobi_base_3, 2706 speed_mpn_jacobi_base_4, 2707 }; 2708 2709 s.size = GMP_LIMB_BITS * 3 / 4; 2710 2711 one_method (4, f, "mpn_jacobi_base", "JACOBI_BASE_METHOD", ¶m); 2712 } 2713 2714 2715 void 2716 tune_get_str (void) 2717 { 2718 /* Tune for decimal, it being most common. Some rough testing suggests 2719 other bases are different, but not by very much. */ 2720 s.r = 10; 2721 { 2722 static struct param_t param; 2723 GET_STR_PRECOMPUTE_THRESHOLD = 0; 2724 param.name = "GET_STR_DC_THRESHOLD"; 2725 param.function = speed_mpn_get_str; 2726 param.min_size = 4; 2727 param.max_size = GET_STR_THRESHOLD_LIMIT; 2728 one (&get_str_dc_threshold, ¶m); 2729 } 2730 { 2731 static struct param_t param; 2732 param.name = "GET_STR_PRECOMPUTE_THRESHOLD"; 2733 param.function = speed_mpn_get_str; 2734 param.min_size = GET_STR_DC_THRESHOLD; 2735 param.max_size = GET_STR_THRESHOLD_LIMIT; 2736 one (&get_str_precompute_threshold, ¶m); 2737 } 2738 } 2739 2740 2741 double 2742 speed_mpn_pre_set_str (struct speed_params *s) 2743 { 2744 unsigned char *str; 2745 mp_ptr wp; 2746 mp_size_t wn; 2747 unsigned i; 2748 int base; 2749 double t; 2750 mp_ptr powtab_mem, tp; 2751 powers_t powtab[GMP_LIMB_BITS]; 2752 mp_size_t un; 2753 int chars_per_limb; 2754 size_t n_pows; 2755 powers_t *pt; 2756 TMP_DECL; 2757 2758 SPEED_RESTRICT_COND (s->size >= 1); 2759 2760 base = s->r == 0 ? 10 : s->r; 2761 SPEED_RESTRICT_COND (base >= 2 && base <= 256); 2762 2763 TMP_MARK; 2764 2765 str = (unsigned char *) TMP_ALLOC (s->size); 2766 for (i = 0; i < s->size; i++) 2767 str[i] = s->xp[i] % base; 2768 2769 LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base); 2770 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); 2771 2772 /* use this during development to check wn is big enough */ 2773 /* 2774 ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn); 2775 */ 2776 2777 speed_operand_src (s, (mp_ptr) str, s->size/GMP_LIMB_BYTES); 2778 speed_operand_dst (s, wp, wn); 2779 speed_cache_fill (s); 2780 2781 chars_per_limb = mp_bases[base].chars_per_limb; 2782 un = s->size / chars_per_limb + 1; 2783 powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un)); 2784 n_pows = mpn_compute_powtab (powtab, powtab_mem, un, base); 2785 pt = powtab + n_pows; 2786 tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un)); 2787 2788 speed_starttime (); 2789 i = s->reps; 2790 do 2791 { 2792 mpn_pre_set_str (wp, str, s->size, pt, tp); 2793 } 2794 while (--i != 0); 2795 t = speed_endtime (); 2796 2797 TMP_FREE; 2798 return t; 2799 } 2800 2801 void 2802 tune_set_str (void) 2803 { 2804 s.r = 10; /* decimal */ 2805 { 2806 static struct param_t param; 2807 SET_STR_PRECOMPUTE_THRESHOLD = 0; 2808 param.step_factor = 0.01; 2809 param.name = "SET_STR_DC_THRESHOLD"; 2810 param.function = speed_mpn_pre_set_str; 2811 param.min_size = 100; 2812 param.max_size = 50000; 2813 one (&set_str_dc_threshold, ¶m); 2814 } 2815 { 2816 static struct param_t param; 2817 param.step_factor = 0.02; 2818 param.name = "SET_STR_PRECOMPUTE_THRESHOLD"; 2819 param.function = speed_mpn_set_str; 2820 param.min_size = SET_STR_DC_THRESHOLD; 2821 param.max_size = 100000; 2822 one (&set_str_precompute_threshold, ¶m); 2823 } 2824 } 2825 2826 2827 void 2828 tune_fft_mul (void) 2829 { 2830 static struct fft_param_t param; 2831 2832 if (option_fft_max_size == 0) 2833 return; 2834 2835 param.table_name = "MUL_FFT_TABLE3"; 2836 param.threshold_name = "MUL_FFT_THRESHOLD"; 2837 param.p_threshold = &mul_fft_threshold; 2838 param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD"; 2839 param.p_modf_threshold = &mul_fft_modf_threshold; 2840 param.first_size = MUL_TOOM33_THRESHOLD / 2; 2841 param.max_size = option_fft_max_size; 2842 param.function = speed_mpn_fft_mul; 2843 param.mul_modf_function = speed_mpn_mul_fft; 2844 param.mul_function = speed_mpn_mul_n; 2845 param.sqr = 0; 2846 fft (¶m); 2847 } 2848 2849 2850 void 2851 tune_fft_sqr (void) 2852 { 2853 static struct fft_param_t param; 2854 2855 if (option_fft_max_size == 0) 2856 return; 2857 2858 param.table_name = "SQR_FFT_TABLE3"; 2859 param.threshold_name = "SQR_FFT_THRESHOLD"; 2860 param.p_threshold = &sqr_fft_threshold; 2861 param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD"; 2862 param.p_modf_threshold = &sqr_fft_modf_threshold; 2863 param.first_size = SQR_TOOM3_THRESHOLD / 2; 2864 param.max_size = option_fft_max_size; 2865 param.function = speed_mpn_fft_sqr; 2866 param.mul_modf_function = speed_mpn_mul_fft_sqr; 2867 param.mul_function = speed_mpn_sqr; 2868 param.sqr = 1; 2869 fft (¶m); 2870 } 2871 2872 void 2873 tune_fac_ui (void) 2874 { 2875 static struct param_t param; 2876 2877 param.function = speed_mpz_fac_ui_tune; 2878 2879 param.name = "FAC_DSC_THRESHOLD"; 2880 param.min_size = 70; 2881 param.max_size = FAC_DSC_THRESHOLD_LIMIT; 2882 one (&fac_dsc_threshold, ¶m); 2883 2884 param.name = "FAC_ODD_THRESHOLD"; 2885 param.min_size = 22; 2886 param.stop_factor = 1.7; 2887 param.min_is_always = 1; 2888 one (&fac_odd_threshold, ¶m); 2889 } 2890 2891 void 2892 all (void) 2893 { 2894 time_t start_time, end_time; 2895 TMP_DECL; 2896 2897 TMP_MARK; 2898 SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0); 2899 SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0); 2900 2901 mpn_random (s.xp_block, SPEED_BLOCK_SIZE); 2902 mpn_random (s.yp_block, SPEED_BLOCK_SIZE); 2903 2904 fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST); 2905 2906 speed_time_init (); 2907 fprintf (stderr, "Using: %s\n", speed_time_string); 2908 2909 fprintf (stderr, "speed_precision %d", speed_precision); 2910 if (speed_unittime == 1.0) 2911 fprintf (stderr, ", speed_unittime 1 cycle"); 2912 else 2913 fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime); 2914 if (speed_cycletime == 1.0 || speed_cycletime == 0.0) 2915 fprintf (stderr, ", CPU freq unknown\n"); 2916 else 2917 fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime); 2918 2919 fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n", 2920 DEFAULT_MAX_SIZE, (long) option_fft_max_size); 2921 fprintf (stderr, "\n"); 2922 2923 time (&start_time); 2924 { 2925 struct tm *tp; 2926 tp = localtime (&start_time); 2927 printf ("/* Generated by tuneup.c, %d-%02d-%02d, ", 2928 tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday); 2929 2930 #ifdef __GNUC__ 2931 /* gcc sub-minor version doesn't seem to come through as a define */ 2932 printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__); 2933 #define PRINTED_COMPILER 2934 #endif 2935 #if defined (__SUNPRO_C) 2936 printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100); 2937 #define PRINTED_COMPILER 2938 #endif 2939 #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION) 2940 /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */ 2941 printf ("MIPSpro C %d.%d.%d */\n", 2942 _COMPILER_VERSION / 100, 2943 _COMPILER_VERSION / 10 % 10, 2944 _COMPILER_VERSION % 10); 2945 #define PRINTED_COMPILER 2946 #endif 2947 #if defined (__DECC) && defined (__DECC_VER) 2948 printf ("DEC C %d */\n", __DECC_VER); 2949 #define PRINTED_COMPILER 2950 #endif 2951 #if ! defined (PRINTED_COMPILER) 2952 printf ("system compiler */\n"); 2953 #endif 2954 } 2955 printf ("\n"); 2956 2957 tune_divrem_1 (); 2958 tune_mod_1 (); 2959 tune_preinv_divrem_1 (); 2960 tune_div_qr_1 (); 2961 #if 0 2962 tune_divrem_2 (); 2963 #endif 2964 tune_div_qr_2 (); 2965 tune_divexact_1 (); 2966 tune_modexact_1_odd (); 2967 printf("\n"); 2968 2969 relspeed_div_1_vs_mul_1 (); 2970 printf("\n"); 2971 2972 tune_mul_n (); 2973 printf("\n"); 2974 2975 tune_mul (); 2976 printf("\n"); 2977 2978 tune_sqr (); 2979 printf("\n"); 2980 2981 tune_mulmid (); 2982 printf("\n"); 2983 2984 tune_mulmod_bnm1 (); 2985 tune_sqrmod_bnm1 (); 2986 printf("\n"); 2987 2988 tune_fft_mul (); 2989 printf("\n"); 2990 2991 tune_fft_sqr (); 2992 printf ("\n"); 2993 2994 tune_mullo (); 2995 tune_sqrlo (); 2996 printf("\n"); 2997 2998 tune_dc_div (); 2999 tune_dc_bdiv (); 3000 3001 printf("\n"); 3002 tune_invertappr (); 3003 tune_invert (); 3004 printf("\n"); 3005 3006 tune_binvert (); 3007 tune_redc (); 3008 printf("\n"); 3009 3010 tune_mu_div (); 3011 tune_mu_bdiv (); 3012 printf("\n"); 3013 3014 tune_powm_sec (); 3015 printf("\n"); 3016 3017 tune_get_str (); 3018 tune_set_str (); 3019 printf("\n"); 3020 3021 tune_fac_ui (); 3022 printf("\n"); 3023 3024 tune_matrix22_mul (); 3025 tune_hgcd2 (); 3026 tune_hgcd (); 3027 tune_hgcd_appr (); 3028 tune_hgcd_reduce(); 3029 tune_gcd_dc (); 3030 tune_gcdext_dc (); 3031 tune_jacobi_base (); 3032 printf("\n"); 3033 3034 time (&end_time); 3035 printf ("/* Tuneup completed successfully, took %ld seconds */\n", 3036 (long) (end_time - start_time)); 3037 3038 TMP_FREE; 3039 } 3040 3041 3042 int 3043 main (int argc, char *argv[]) 3044 { 3045 int opt; 3046 3047 /* Unbuffered so if output is redirected to a file it isn't lost if the 3048 program is killed part way through. */ 3049 setbuf (stdout, NULL); 3050 setbuf (stderr, NULL); 3051 3052 while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF) 3053 { 3054 switch (opt) { 3055 case 'f': 3056 if (optarg[0] == 't') 3057 option_fft_trace = 2; 3058 else 3059 option_fft_max_size = atol (optarg); 3060 break; 3061 case 'o': 3062 speed_option_set (optarg); 3063 break; 3064 case 'p': 3065 speed_precision = atoi (optarg); 3066 break; 3067 case 't': 3068 option_trace++; 3069 break; 3070 case '?': 3071 exit(1); 3072 } 3073 } 3074 3075 all (); 3076 exit (0); 3077 } 3078