1 /* Create tuned thresholds for various algorithms. 2 3 Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010 Free 4 Software Foundation, Inc. 5 6 This file is part of the GNU MP Library. 7 8 The GNU MP Library is free software; you can redistribute it and/or modify 9 it under the terms of the GNU Lesser General Public License as published by 10 the Free Software Foundation; either version 3 of the License, or (at your 11 option) any later version. 12 13 The GNU MP Library is distributed in the hope that it will be useful, but 14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 16 License for more details. 17 18 You should have received a copy of the GNU Lesser General Public License 19 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 20 21 22 /* Usage: tuneup [-t] [-t] [-p precision] 23 24 -t turns on some diagnostic traces, a second -t turns on more traces. 25 26 Notes: 27 28 The code here isn't a vision of loveliness, mainly because it's subject 29 to ongoing changes according to new things wanting to be tuned, and 30 practical requirements of systems tested. 31 32 Sometimes running the program twice produces slightly different results. 33 This is probably because there's so little separating algorithms near 34 their crossover, and on that basis it should make little or no difference 35 to the final speed of the relevant routines, but nothing has been done to 36 check that carefully. 37 38 Algorithm: 39 40 The thresholds are determined as follows. A crossover may not be a 41 single size but rather a range where it oscillates between method A or 42 method B faster. If the threshold is set making B used where A is faster 43 (or vice versa) that's bad. Badness is the percentage time lost and 44 total badness is the sum of this over all sizes measured. The threshold 45 is set to minimize total badness. 46 47 Suppose, as sizes increase, method B becomes faster than method A. The 48 effect of the rule is that, as you look at increasing sizes, isolated 49 points where B is faster are ignored, but when it's consistently faster, 50 or faster on balance, then the threshold is set there. The same result 51 is obtained thinking in the other direction of A becoming faster at 52 smaller sizes. 53 54 In practice the thresholds tend to be chosen to bring on the next 55 algorithm fairly quickly. 56 57 This rule is attractive because it's got a basis in reason and is fairly 58 easy to implement, but no work has been done to actually compare it in 59 absolute terms to other possibilities. 60 61 Implementation: 62 63 In a normal library build the thresholds are constants. To tune them 64 selected objects are recompiled with the thresholds as global variables 65 instead. #define TUNE_PROGRAM_BUILD does this, with help from code at 66 the end of gmp-impl.h, and rules in tune/Makefile.am. 67 68 MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n. The 69 threshold is set to "size+1" to avoid karatsuba, or to "size" to use one 70 level, but recurse into the basecase. 71 72 MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value. 73 Other routines in turn will make use of both of those. Naturally the 74 dependants must be tuned first. 75 76 In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling, 77 just a threshold based on comparing two routines (mpn_divrem_1 and 78 mpn_divexact_1), and no further use of the value determined. 79 80 Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being 81 just comparisons between certain routines on representative data. 82 83 Shortcuts are applied when native (assembler) versions of routines exist. 84 For instance a native mpn_sqr_basecase is assumed to be always faster 85 than mpn_mul_basecase, with no measuring. 86 87 No attempt is made to tune within assembler routines, for instance 88 DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be 89 written and tuned all by hand. Assembler routines that might have hard 90 limits are recompiled though, to make them accept a bigger range of sizes 91 than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr. 92 93 Limitations: 94 95 The FFTs aren't subject to the same badness rule as the other thresholds, 96 so each k is probably being brought on a touch early. This isn't likely 97 to make a difference, and the simpler probing means fewer tests. 98 99 */ 100 101 #define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */ 102 103 #include "config.h" 104 105 #include <math.h> 106 #include <stdio.h> 107 #include <stdlib.h> 108 #include <time.h> 109 #if HAVE_UNISTD_H 110 #include <unistd.h> 111 #endif 112 113 #include "gmp.h" 114 #include "gmp-impl.h" 115 #include "longlong.h" 116 117 #include "tests.h" 118 #include "speed.h" 119 120 #if !HAVE_DECL_OPTARG 121 extern char *optarg; 122 extern int optind, opterr; 123 #endif 124 125 126 #define DEFAULT_MAX_SIZE 1000 /* limbs */ 127 128 #if WANT_FFT 129 mp_size_t option_fft_max_size = 50000; /* limbs */ 130 #else 131 mp_size_t option_fft_max_size = 0; 132 #endif 133 int option_trace = 0; 134 int option_fft_trace = 0; 135 struct speed_params s; 136 137 struct dat_t { 138 mp_size_t size; 139 double d; 140 } *dat = NULL; 141 int ndat = 0; 142 int allocdat = 0; 143 144 /* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that 145 case use zero here, which for params.max_size means no limit. */ 146 #ifndef TUNE_SQR_TOOM2_MAX 147 #define TUNE_SQR_TOOM2_MAX 0 148 #endif 149 150 mp_size_t mul_toom22_threshold = MP_SIZE_T_MAX; 151 mp_size_t mul_toom33_threshold = MUL_TOOM33_THRESHOLD_LIMIT; 152 mp_size_t mul_toom44_threshold = MUL_TOOM44_THRESHOLD_LIMIT; 153 mp_size_t mul_toom6h_threshold = MUL_TOOM6H_THRESHOLD_LIMIT; 154 mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT; 155 mp_size_t mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX; 156 mp_size_t mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX; 157 mp_size_t mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX; 158 mp_size_t mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX; 159 mp_size_t mul_fft_threshold = MP_SIZE_T_MAX; 160 mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX; 161 mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX; 162 mp_size_t sqr_toom2_threshold 163 = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX); 164 mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT; 165 mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT; 166 mp_size_t sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT; 167 mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT; 168 mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX; 169 mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX; 170 mp_size_t mullo_basecase_threshold = MP_SIZE_T_MAX; 171 mp_size_t mullo_dc_threshold = MP_SIZE_T_MAX; 172 mp_size_t mullo_mul_n_threshold = MP_SIZE_T_MAX; 173 mp_size_t mulmod_bnm1_threshold = MP_SIZE_T_MAX; 174 mp_size_t sqrmod_bnm1_threshold = MP_SIZE_T_MAX; 175 mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX; 176 mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX; 177 mp_size_t dc_divappr_q_threshold = MP_SIZE_T_MAX; 178 mp_size_t mu_div_qr_threshold = MP_SIZE_T_MAX; 179 mp_size_t mu_divappr_q_threshold = MP_SIZE_T_MAX; 180 mp_size_t mupi_div_qr_threshold = MP_SIZE_T_MAX; 181 mp_size_t mu_div_q_threshold = MP_SIZE_T_MAX; 182 mp_size_t dc_bdiv_qr_threshold = MP_SIZE_T_MAX; 183 mp_size_t dc_bdiv_q_threshold = MP_SIZE_T_MAX; 184 mp_size_t mu_bdiv_qr_threshold = MP_SIZE_T_MAX; 185 mp_size_t mu_bdiv_q_threshold = MP_SIZE_T_MAX; 186 mp_size_t inv_mulmod_bnm1_threshold = MP_SIZE_T_MAX; 187 mp_size_t inv_newton_threshold = MP_SIZE_T_MAX; 188 mp_size_t inv_appr_threshold = MP_SIZE_T_MAX; 189 mp_size_t binv_newton_threshold = MP_SIZE_T_MAX; 190 mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX; 191 mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX; 192 mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX; 193 mp_size_t powm_threshold = MP_SIZE_T_MAX; 194 mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX; 195 mp_size_t hgcd_threshold = MP_SIZE_T_MAX; 196 mp_size_t gcd_accel_threshold = MP_SIZE_T_MAX; 197 mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX; 198 mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX; 199 mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX; 200 mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX; 201 mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX; 202 mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX; 203 mp_size_t mod_1n_to_mod_1_1_threshold = MP_SIZE_T_MAX; 204 mp_size_t mod_1u_to_mod_1_1_threshold = MP_SIZE_T_MAX; 205 mp_size_t mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX; 206 mp_size_t mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX; 207 mp_size_t preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX; 208 mp_size_t divrem_2_threshold = MP_SIZE_T_MAX; 209 mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX; 210 mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX; 211 mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX; 212 mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX; 213 214 mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX; 215 mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX; 216 217 struct param_t { 218 const char *name; 219 speed_function_t function; 220 speed_function_t function2; 221 double step_factor; /* how much to step relatively */ 222 int step; /* how much to step absolutely */ 223 double function_fudge; /* multiplier for "function" speeds */ 224 int stop_since_change; 225 double stop_factor; 226 mp_size_t min_size; 227 int min_is_always; 228 mp_size_t max_size; 229 mp_size_t check_size; 230 mp_size_t size_extra; 231 232 #define DATA_HIGH_LT_R 1 233 #define DATA_HIGH_GE_R 2 234 int data_high; 235 236 int noprint; 237 }; 238 239 240 /* These are normally undefined when false, which suits "#if" fine. 241 But give them zero values so they can be used in plain C "if"s. */ 242 #ifndef UDIV_PREINV_ALWAYS 243 #define UDIV_PREINV_ALWAYS 0 244 #endif 245 #ifndef HAVE_NATIVE_mpn_divexact_1 246 #define HAVE_NATIVE_mpn_divexact_1 0 247 #endif 248 #ifndef HAVE_NATIVE_mpn_divrem_1 249 #define HAVE_NATIVE_mpn_divrem_1 0 250 #endif 251 #ifndef HAVE_NATIVE_mpn_divrem_2 252 #define HAVE_NATIVE_mpn_divrem_2 0 253 #endif 254 #ifndef HAVE_NATIVE_mpn_mod_1 255 #define HAVE_NATIVE_mpn_mod_1 0 256 #endif 257 #ifndef HAVE_NATIVE_mpn_modexact_1_odd 258 #define HAVE_NATIVE_mpn_modexact_1_odd 0 259 #endif 260 #ifndef HAVE_NATIVE_mpn_preinv_divrem_1 261 #define HAVE_NATIVE_mpn_preinv_divrem_1 0 262 #endif 263 #ifndef HAVE_NATIVE_mpn_preinv_mod_1 264 #define HAVE_NATIVE_mpn_preinv_mod_1 0 265 #endif 266 #ifndef HAVE_NATIVE_mpn_sqr_basecase 267 #define HAVE_NATIVE_mpn_sqr_basecase 0 268 #endif 269 270 271 #define MAX3(a,b,c) MAX (MAX (a, b), c) 272 273 mp_limb_t 274 randlimb_norm (void) 275 { 276 mp_limb_t n; 277 mpn_random (&n, 1); 278 n |= GMP_NUMB_HIGHBIT; 279 return n; 280 } 281 282 #define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1) 283 284 mp_limb_t 285 randlimb_half (void) 286 { 287 mp_limb_t n; 288 mpn_random (&n, 1); 289 n &= GMP_NUMB_HALFMASK; 290 n += (n==0); 291 return n; 292 } 293 294 295 /* Add an entry to the end of the dat[] array, reallocing to make it bigger 296 if necessary. */ 297 void 298 add_dat (mp_size_t size, double d) 299 { 300 #define ALLOCDAT_STEP 500 301 302 ASSERT_ALWAYS (ndat <= allocdat); 303 304 if (ndat == allocdat) 305 { 306 dat = (struct dat_t *) __gmp_allocate_or_reallocate 307 (dat, allocdat * sizeof(dat[0]), 308 (allocdat+ALLOCDAT_STEP) * sizeof(dat[0])); 309 allocdat += ALLOCDAT_STEP; 310 } 311 312 dat[ndat].size = size; 313 dat[ndat].d = d; 314 ndat++; 315 } 316 317 318 /* Return the threshold size based on the data accumulated. */ 319 mp_size_t 320 analyze_dat (int final) 321 { 322 double x, min_x; 323 int j, min_j; 324 325 /* If the threshold is set at dat[0].size, any positive values are bad. */ 326 x = 0.0; 327 for (j = 0; j < ndat; j++) 328 if (dat[j].d > 0.0) 329 x += dat[j].d; 330 331 if (option_trace >= 2 && final) 332 { 333 printf ("\n"); 334 printf ("x is the sum of the badness from setting thresh at given size\n"); 335 printf (" (minimum x is sought)\n"); 336 printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x); 337 } 338 339 min_x = x; 340 min_j = 0; 341 342 343 /* When stepping to the next dat[j].size, positive values are no longer 344 bad (so subtracted), negative values become bad (so add the absolute 345 value, meaning subtract). */ 346 for (j = 0; j < ndat; x -= dat[j].d, j++) 347 { 348 if (option_trace >= 2 && final) 349 printf ("size=%ld x=%.4f\n", (long) dat[j].size, x); 350 351 if (x < min_x) 352 { 353 min_x = x; 354 min_j = j; 355 } 356 } 357 358 return min_j; 359 } 360 361 362 /* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */ 363 364 mp_limb_t mpn_divrem_1_tune 365 __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t)); 366 mp_limb_t mpn_mod_1_tune 367 __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t)); 368 369 double 370 speed_mpn_mod_1_tune (struct speed_params *s) 371 { 372 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune); 373 } 374 double 375 speed_mpn_divrem_1_tune (struct speed_params *s) 376 { 377 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune); 378 } 379 380 381 double 382 tuneup_measure (speed_function_t fun, 383 const struct param_t *param, 384 struct speed_params *s) 385 { 386 static struct param_t dummy; 387 double t; 388 TMP_DECL; 389 390 if (! param) 391 param = &dummy; 392 393 s->size += param->size_extra; 394 395 TMP_MARK; 396 SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0); 397 SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0); 398 399 mpn_random (s->xp, s->size); 400 mpn_random (s->yp, s->size); 401 402 switch (param->data_high) { 403 case DATA_HIGH_LT_R: 404 s->xp[s->size-1] %= s->r; 405 s->yp[s->size-1] %= s->r; 406 break; 407 case DATA_HIGH_GE_R: 408 s->xp[s->size-1] |= s->r; 409 s->yp[s->size-1] |= s->r; 410 break; 411 } 412 413 t = speed_measure (fun, s); 414 415 s->size -= param->size_extra; 416 417 TMP_FREE; 418 return t; 419 } 420 421 422 #define PRINT_WIDTH 31 423 424 void 425 print_define_start (const char *name) 426 { 427 printf ("#define %-*s ", PRINT_WIDTH, name); 428 if (option_trace) 429 printf ("...\n"); 430 } 431 432 void 433 print_define_end_remark (const char *name, mp_size_t value, const char *remark) 434 { 435 if (option_trace) 436 printf ("#define %-*s ", PRINT_WIDTH, name); 437 438 if (value == MP_SIZE_T_MAX) 439 printf ("MP_SIZE_T_MAX"); 440 else 441 printf ("%5ld", (long) value); 442 443 if (remark != NULL) 444 printf (" /* %s */", remark); 445 printf ("\n"); 446 fflush (stdout); 447 } 448 449 void 450 print_define_end (const char *name, mp_size_t value) 451 { 452 const char *remark; 453 if (value == MP_SIZE_T_MAX) 454 remark = "never"; 455 else if (value == 0) 456 remark = "always"; 457 else 458 remark = NULL; 459 print_define_end_remark (name, value, remark); 460 } 461 462 void 463 print_define (const char *name, mp_size_t value) 464 { 465 print_define_start (name); 466 print_define_end (name, value); 467 } 468 469 void 470 print_define_remark (const char *name, mp_size_t value, const char *remark) 471 { 472 print_define_start (name); 473 print_define_end_remark (name, value, remark); 474 } 475 476 477 void 478 one (mp_size_t *threshold, struct param_t *param) 479 { 480 int since_positive, since_thresh_change; 481 int thresh_idx, new_thresh_idx; 482 483 #define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0) 484 485 DEFAULT (param->function_fudge, 1.0); 486 DEFAULT (param->function2, param->function); 487 DEFAULT (param->step_factor, 0.01); /* small steps by default */ 488 DEFAULT (param->step, 1); /* small steps by default */ 489 DEFAULT (param->stop_since_change, 80); 490 DEFAULT (param->stop_factor, 1.2); 491 DEFAULT (param->min_size, 10); 492 DEFAULT (param->max_size, DEFAULT_MAX_SIZE); 493 494 if (param->check_size != 0) 495 { 496 double t1, t2; 497 s.size = param->check_size; 498 499 *threshold = s.size+1; 500 t1 = tuneup_measure (param->function, param, &s); 501 502 *threshold = s.size; 503 t2 = tuneup_measure (param->function2, param, &s); 504 if (t1 == -1.0 || t2 == -1.0) 505 { 506 printf ("Oops, can't run both functions at size %ld\n", 507 (long) s.size); 508 abort (); 509 } 510 t1 *= param->function_fudge; 511 512 /* ask that t2 is at least 4% below t1 */ 513 if (t1 < t2*1.04) 514 { 515 if (option_trace) 516 printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2); 517 *threshold = MP_SIZE_T_MAX; 518 if (! param->noprint) 519 print_define (param->name, *threshold); 520 return; 521 } 522 523 if (option_trace >= 2) 524 printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n", 525 (long) s.size, t1, t2); 526 } 527 528 if (! param->noprint || option_trace) 529 print_define_start (param->name); 530 531 ndat = 0; 532 since_positive = 0; 533 since_thresh_change = 0; 534 thresh_idx = 0; 535 536 if (option_trace >= 2) 537 { 538 printf (" algorithm-A algorithm-B ratio possible\n"); 539 printf (" (seconds) (seconds) diff thresh\n"); 540 } 541 542 for (s.size = param->min_size; 543 s.size < param->max_size; 544 s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step)) 545 { 546 double ti, tiplus1, d; 547 548 /* 549 FIXME: check minimum size requirements are met, possibly by just 550 checking for the -1 returns from the speed functions. 551 */ 552 553 /* using method A at this size */ 554 *threshold = s.size+1; 555 ti = tuneup_measure (param->function, param, &s); 556 if (ti == -1.0) 557 abort (); 558 ti *= param->function_fudge; 559 560 /* using method B at this size */ 561 *threshold = s.size; 562 tiplus1 = tuneup_measure (param->function2, param, &s); 563 if (tiplus1 == -1.0) 564 abort (); 565 566 /* Calculate the fraction by which the one or the other routine is 567 slower. */ 568 if (tiplus1 >= ti) 569 d = (tiplus1 - ti) / tiplus1; /* negative */ 570 else 571 d = (tiplus1 - ti) / ti; /* positive */ 572 573 add_dat (s.size, d); 574 575 new_thresh_idx = analyze_dat (0); 576 577 if (option_trace >= 2) 578 printf ("size=%ld %.9f %.9f % .4f %c %ld\n", 579 (long) s.size, ti, tiplus1, d, 580 ti > tiplus1 ? '#' : ' ', 581 (long) dat[new_thresh_idx].size); 582 583 /* Stop if the last time method i was faster was more than a 584 certain number of measurements ago. */ 585 #define STOP_SINCE_POSITIVE 200 586 if (d >= 0) 587 since_positive = 0; 588 else 589 if (++since_positive > STOP_SINCE_POSITIVE) 590 { 591 if (option_trace >= 1) 592 printf ("stopped due to since_positive (%d)\n", 593 STOP_SINCE_POSITIVE); 594 break; 595 } 596 597 /* Stop if method A has become slower by a certain factor. */ 598 if (ti >= tiplus1 * param->stop_factor) 599 { 600 if (option_trace >= 1) 601 printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n", 602 param->stop_factor); 603 break; 604 } 605 606 /* Stop if the threshold implied hasn't changed in a certain 607 number of measurements. (It's this condition that usually 608 stops the loop.) */ 609 if (thresh_idx != new_thresh_idx) 610 since_thresh_change = 0, thresh_idx = new_thresh_idx; 611 else 612 if (++since_thresh_change > param->stop_since_change) 613 { 614 if (option_trace >= 1) 615 printf ("stopped due to since_thresh_change (%d)\n", 616 param->stop_since_change); 617 break; 618 } 619 620 /* Stop if the threshold implied is more than a certain number of 621 measurements ago. */ 622 #define STOP_SINCE_AFTER 500 623 if (ndat - thresh_idx > STOP_SINCE_AFTER) 624 { 625 if (option_trace >= 1) 626 printf ("stopped due to ndat - thresh_idx > amount (%d)\n", 627 STOP_SINCE_AFTER); 628 break; 629 } 630 631 /* Stop when the size limit is reached before the end of the 632 crossover, but only show this as an error for >= the default max 633 size. FIXME: Maybe should make it a param choice whether this is 634 an error. */ 635 if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE) 636 { 637 fprintf (stderr, "%s\n", param->name); 638 fprintf (stderr, "sizes %ld to %ld total %d measurements\n", 639 (long) dat[0].size, (long) dat[ndat-1].size, ndat); 640 fprintf (stderr, " max size reached before end of crossover\n"); 641 break; 642 } 643 } 644 645 if (option_trace >= 1) 646 printf ("sizes %ld to %ld total %d measurements\n", 647 (long) dat[0].size, (long) dat[ndat-1].size, ndat); 648 649 *threshold = dat[analyze_dat (1)].size; 650 651 if (param->min_is_always) 652 { 653 if (*threshold == param->min_size) 654 *threshold = 0; 655 } 656 657 if (! param->noprint || option_trace) 658 print_define_end (param->name, *threshold); 659 } 660 661 662 /* Special probing for the fft thresholds. The size restrictions on the 663 FFTs mean the graph of time vs size has a step effect. See this for 664 example using 665 666 ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9 667 gnuplot foo.gnuplot 668 669 The current approach is to compare routines at the midpoint of relevant 670 steps. Arguably a more sophisticated system of threshold data is wanted 671 if this step effect remains. */ 672 673 struct fft_param_t { 674 const char *table_name; 675 const char *threshold_name; 676 const char *modf_threshold_name; 677 mp_size_t *p_threshold; 678 mp_size_t *p_modf_threshold; 679 mp_size_t first_size; 680 mp_size_t max_size; 681 speed_function_t function; 682 speed_function_t mul_modf_function; 683 speed_function_t mul_function; 684 mp_size_t sqr; 685 }; 686 687 688 /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with 689 N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple 690 of 2^(k-1) bits. */ 691 692 mp_size_t 693 fft_step_size (int k) 694 { 695 mp_size_t step; 696 697 step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS; 698 step *= (mp_size_t) 1 << k; 699 700 if (step <= 0) 701 { 702 printf ("Can't handle k=%d\n", k); 703 abort (); 704 } 705 706 return step; 707 } 708 709 mp_size_t 710 fft_next_size (mp_size_t pl, int k) 711 { 712 mp_size_t m = fft_step_size (k); 713 714 /* printf ("[k=%d %ld] %ld ->", k, m, pl); */ 715 716 if (pl == 0 || (pl & (m-1)) != 0) 717 pl = (pl | (m-1)) + 1; 718 719 /* printf (" %ld\n", pl); */ 720 return pl; 721 } 722 723 #define NMAX_DEFAULT 1000000 724 #define MAX_REPS 25 725 #define MIN_REPS 5 726 727 static inline size_t 728 mpn_mul_fft_lcm (size_t a, unsigned int k) 729 { 730 unsigned int l = k; 731 732 while (a % 2 == 0 && k > 0) 733 { 734 a >>= 1; 735 k--; 736 } 737 return a << l; 738 } 739 740 mp_size_t 741 fftfill (mp_size_t pl, int k, int sqr) 742 { 743 mp_size_t maxLK; 744 mp_bitcnt_t N, Nprime, nprime, M; 745 746 N = pl * GMP_NUMB_BITS; 747 M = N >> k; 748 749 maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k); 750 751 Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK; 752 nprime = Nprime / GMP_NUMB_BITS; 753 if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) 754 { 755 size_t K2; 756 for (;;) 757 { 758 K2 = 1L << mpn_fft_best_k (nprime, sqr); 759 if ((nprime & (K2 - 1)) == 0) 760 break; 761 nprime = (nprime + K2 - 1) & -K2; 762 Nprime = nprime * GMP_LIMB_BITS; 763 } 764 } 765 ASSERT_ALWAYS (nprime < pl); 766 767 return Nprime; 768 } 769 770 static int 771 compare_double (const void *ap, const void *bp) 772 { 773 double a = * (const double *) ap; 774 double b = * (const double *) bp; 775 776 if (a < b) 777 return -1; 778 else if (a > b) 779 return 1; 780 else 781 return 0; 782 } 783 784 double 785 median (double *times, int n) 786 { 787 qsort (times, n, sizeof (double), compare_double); 788 return times[n/2]; 789 } 790 791 #define FFT_CACHE_SIZE 25 792 typedef struct fft_cache 793 { 794 mp_size_t n; 795 double time; 796 } fft_cache_t; 797 798 fft_cache_t fft_cache[FFT_CACHE_SIZE]; 799 800 double 801 cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k, 802 int n_measurements) 803 { 804 int i; 805 double t, ttab[MAX_REPS]; 806 807 if (fft_cache[k].n == n) 808 return fft_cache[k].time; 809 810 for (i = 0; i < n_measurements; i++) 811 { 812 speed_starttime (); 813 mpn_mul_fft (rp, n, ap, n, bp, n, k); 814 ttab[i] = speed_endtime (); 815 } 816 817 t = median (ttab, n_measurements); 818 fft_cache[k].n = n; 819 fft_cache[k].time = t; 820 return t; 821 } 822 823 #define INSERT_FFTTAB(idx, nval, kval) \ 824 do { \ 825 fft_tab[idx].n = nval; \ 826 fft_tab[idx].k = kval; \ 827 fft_tab[idx+1].n = -1; /* sentinel */ \ 828 fft_tab[idx+1].k = -1; \ 829 } while (0) 830 831 int 832 fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print) 833 { 834 mp_size_t n, n1, prev_n1; 835 int k, best_k, last_best_k, kmax; 836 int eff, prev_eff; 837 double t0, t1; 838 int n_measurements; 839 mp_limb_t *ap, *bp, *rp; 840 mp_size_t alloc; 841 char *linepref; 842 struct fft_table_nk *fft_tab; 843 844 fft_tab = mpn_fft_table3[p->sqr]; 845 846 for (k = 0; k < FFT_CACHE_SIZE; k++) 847 fft_cache[k].n = 0; 848 849 if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) 850 { 851 nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD); 852 } 853 854 if (print) 855 printf ("#define %s%*s", p->table_name, 38, ""); 856 857 if (idx == 0) 858 { 859 INSERT_FFTTAB (0, nmin, initial_k); 860 861 if (print) 862 { 863 printf ("\\\n { "); 864 printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k); 865 linepref = " "; 866 } 867 868 idx = 1; 869 } 870 871 ap = malloc (sizeof (mp_limb_t)); 872 if (p->sqr) 873 bp = ap; 874 else 875 bp = malloc (sizeof (mp_limb_t)); 876 rp = malloc (sizeof (mp_limb_t)); 877 alloc = 1; 878 879 /* Round n to comply to initial k value */ 880 n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k); 881 882 n_measurements = (18 - initial_k) | 1; 883 n_measurements = MAX (n_measurements, MIN_REPS); 884 n_measurements = MIN (n_measurements, MAX_REPS); 885 886 last_best_k = initial_k; 887 best_k = initial_k; 888 889 while (n < nmax) 890 { 891 int start_k, end_k; 892 893 /* Assume the current best k is best until we hit its next FFT step. */ 894 t0 = 99999; 895 896 prev_n1 = n + 1; 897 898 start_k = MAX (4, best_k - 4); 899 end_k = MIN (24, best_k + 4); 900 for (k = start_k; k <= end_k; k++) 901 { 902 n1 = mpn_fft_next_size (prev_n1, k); 903 904 eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr); 905 906 if (eff < 70) /* avoid measuring too slow fft:s */ 907 continue; 908 909 if (n1 > alloc) 910 { 911 alloc = n1; 912 if (p->sqr) 913 { 914 ap = realloc (ap, sizeof (mp_limb_t)); 915 rp = realloc (rp, sizeof (mp_limb_t)); 916 ap = bp = realloc (ap, alloc * sizeof (mp_limb_t)); 917 mpn_random (ap, alloc); 918 rp = realloc (rp, alloc * sizeof (mp_limb_t)); 919 } 920 else 921 { 922 ap = realloc (ap, sizeof (mp_limb_t)); 923 bp = realloc (bp, sizeof (mp_limb_t)); 924 rp = realloc (rp, sizeof (mp_limb_t)); 925 ap = realloc (ap, alloc * sizeof (mp_limb_t)); 926 mpn_random (ap, alloc); 927 bp = realloc (bp, alloc * sizeof (mp_limb_t)); 928 mpn_random (bp, alloc); 929 rp = realloc (rp, alloc * sizeof (mp_limb_t)); 930 } 931 } 932 933 t1 = cached_measure (rp, ap, bp, n1, k, n_measurements); 934 935 if (t1 * n_measurements > 0.3) 936 n_measurements -= 2; 937 n_measurements = MAX (n_measurements, MIN_REPS); 938 939 if (t1 < t0) 940 { 941 best_k = k; 942 t0 = t1; 943 } 944 } 945 946 n1 = mpn_fft_next_size (prev_n1, best_k); 947 948 if (last_best_k != best_k) 949 { 950 ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1); 951 952 if (idx >= FFT_TABLE3_SIZE) 953 { 954 printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n"); 955 abort (); 956 } 957 INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k); 958 959 if (print) 960 { 961 printf (", "); 962 if (idx % 4 == 0) 963 printf ("\\\n "); 964 printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k); 965 } 966 967 if (option_trace >= 2) 968 { 969 printf ("{%lu,%u}\n", prev_n1, best_k); 970 fflush (stdout); 971 } 972 973 last_best_k = best_k; 974 idx++; 975 } 976 977 for (;;) 978 { 979 prev_n1 = n1; 980 prev_eff = fftfill (prev_n1, best_k, p->sqr); 981 n1 = mpn_fft_next_size (prev_n1 + 1, best_k); 982 eff = fftfill (n1, best_k, p->sqr); 983 984 if (eff != prev_eff) 985 break; 986 } 987 988 n = prev_n1; 989 } 990 991 kmax = sizeof (mp_size_t) * 4; /* GMP_MP_SIZE_T_BITS / 2 */ 992 kmax = MIN (kmax, 25-1); 993 for (k = last_best_k + 1; k <= kmax; k++) 994 { 995 if (idx >= FFT_TABLE3_SIZE) 996 { 997 printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n"); 998 abort (); 999 } 1000 INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k); 1001 1002 if (print) 1003 { 1004 printf (", "); 1005 if (idx % 4 == 0) 1006 printf ("\\\n "); 1007 printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k); 1008 } 1009 1010 idx++; 1011 } 1012 1013 if (print) 1014 printf (" }\n"); 1015 1016 free (ap); 1017 if (! p->sqr) 1018 free (bp); 1019 free (rp); 1020 1021 return idx; 1022 } 1023 1024 void 1025 fft (struct fft_param_t *p) 1026 { 1027 mp_size_t size; 1028 int k, idx, initial_k; 1029 1030 /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/ 1031 1032 #if 1 1033 { 1034 /* Use plain one() mechanism, for some reasonable initial values of k. The 1035 advantage is that we don't depend on mpn_fft_table3, which can therefore 1036 leave it completely uninitialized. */ 1037 1038 static struct param_t param; 1039 mp_size_t thres, best_thres; 1040 int best_k; 1041 char buf[20]; 1042 1043 best_thres = MP_SIZE_T_MAX; 1044 best_k = -1; 1045 1046 for (k = 5; k <= 7; k++) 1047 { 1048 param.name = p->modf_threshold_name; 1049 param.min_size = 100; 1050 param.max_size = 2000; 1051 param.function = p->mul_function; 1052 param.step_factor = 0.0; 1053 param.step = 4; 1054 param.function2 = p->mul_modf_function; 1055 param.noprint = 1; 1056 s.r = k; 1057 one (&thres, ¶m); 1058 if (thres < best_thres) 1059 { 1060 best_thres = thres; 1061 best_k = k; 1062 } 1063 } 1064 1065 *(p->p_modf_threshold) = best_thres; 1066 sprintf (buf, "k = %d", best_k); 1067 print_define_remark (p->modf_threshold_name, best_thres, buf); 1068 initial_k = best_k; 1069 } 1070 #else 1071 size = p->first_size; 1072 for (;;) 1073 { 1074 double tk, tm; 1075 1076 size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr)); 1077 k = mpn_fft_best_k (size, p->sqr); 1078 1079 if (size >= p->max_size) 1080 break; 1081 1082 s.size = size + fft_step_size (k) / 2; 1083 s.r = k; 1084 tk = tuneup_measure (p->mul_modf_function, NULL, &s); 1085 if (tk == -1.0) 1086 abort (); 1087 1088 tm = tuneup_measure (p->mul_function, NULL, &s); 1089 if (tm == -1.0) 1090 abort (); 1091 1092 if (option_trace >= 2) 1093 printf ("at %ld size=%ld k=%d %.9f size=%ld modf %.9f\n", 1094 (long) size, 1095 (long) size + fft_step_size (k) / 2, k, tk, 1096 (long) s.size, tm); 1097 1098 if (tk < tm) 1099 { 1100 *p->p_modf_threshold = s.size; 1101 print_define (p->modf_threshold_name, *p->p_modf_threshold); 1102 break; 1103 } 1104 } 1105 initial_k = ?; 1106 #endif 1107 1108 /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/ 1109 1110 idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1); 1111 printf ("#define %s_SIZE %d\n", p->table_name, idx); 1112 1113 /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/ 1114 1115 size = 2 * *p->p_modf_threshold; /* OK? */ 1116 for (;;) 1117 { 1118 double tk, tm; 1119 mp_size_t mulmod_size, mul_size;; 1120 1121 if (size >= p->max_size) 1122 break; 1123 1124 mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2; 1125 mul_size = (size + mulmod_size) / 2; /* middle of step */ 1126 1127 s.size = mulmod_size; 1128 tk = tuneup_measure (p->function, NULL, &s); 1129 if (tk == -1.0) 1130 abort (); 1131 1132 s.size = mul_size; 1133 tm = tuneup_measure (p->mul_function, NULL, &s); 1134 if (tm == -1.0) 1135 abort (); 1136 1137 if (option_trace >= 2) 1138 printf ("at %ld size=%ld %.9f size=%ld mul %.9f\n", 1139 (long) size, 1140 (long) mulmod_size, tk, 1141 (long) mul_size, tm); 1142 1143 size = mulmod_size; 1144 1145 if (tk < tm) 1146 { 1147 *p->p_threshold = s.size; 1148 print_define (p->threshold_name, *p->p_threshold); 1149 break; 1150 } 1151 } 1152 } 1153 1154 1155 1156 /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2, 1157 giving wrong results. */ 1158 void 1159 tune_mul_n (void) 1160 { 1161 static struct param_t param; 1162 1163 param.function = speed_mpn_mul_n; 1164 1165 param.name = "MUL_TOOM22_THRESHOLD"; 1166 param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE); 1167 param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1; 1168 one (&mul_toom22_threshold, ¶m); 1169 1170 param.name = "MUL_TOOM33_THRESHOLD"; 1171 param.min_size = MAX (mul_toom22_threshold, MPN_TOOM33_MUL_MINSIZE); 1172 param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1; 1173 one (&mul_toom33_threshold, ¶m); 1174 1175 param.name = "MUL_TOOM44_THRESHOLD"; 1176 param.min_size = MAX (mul_toom33_threshold, MPN_TOOM44_MUL_MINSIZE); 1177 param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1; 1178 one (&mul_toom44_threshold, ¶m); 1179 1180 param.name = "MUL_TOOM6H_THRESHOLD"; 1181 param.min_size = MAX (mul_toom44_threshold, MPN_TOOM6H_MUL_MINSIZE); 1182 param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1; 1183 one (&mul_toom6h_threshold, ¶m); 1184 1185 param.name = "MUL_TOOM8H_THRESHOLD"; 1186 param.min_size = MAX (mul_toom6h_threshold, MPN_TOOM8H_MUL_MINSIZE); 1187 param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1; 1188 one (&mul_toom8h_threshold, ¶m); 1189 1190 /* disabled until tuned */ 1191 MUL_FFT_THRESHOLD = MP_SIZE_T_MAX; 1192 } 1193 1194 void 1195 tune_mul (void) 1196 { 1197 static struct param_t param; 1198 mp_size_t thres; 1199 1200 param.noprint = 1; 1201 1202 param.function = speed_mpn_toom32_for_toom43_mul; 1203 param.function2 = speed_mpn_toom43_for_toom32_mul; 1204 param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD"; 1205 param.min_size = MPN_TOOM43_MUL_MINSIZE; 1206 one (&thres, ¶m); 1207 mul_toom32_to_toom43_threshold = 17*thres/24; 1208 print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold); 1209 1210 param.function = speed_mpn_toom32_for_toom53_mul; 1211 param.function2 = speed_mpn_toom53_for_toom32_mul; 1212 param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD"; 1213 param.min_size = MPN_TOOM53_MUL_MINSIZE; 1214 one (&thres, ¶m); 1215 mul_toom32_to_toom53_threshold = 19*thres/30; 1216 print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold); 1217 1218 param.function = speed_mpn_toom42_for_toom53_mul; 1219 param.function2 = speed_mpn_toom53_for_toom42_mul; 1220 param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD"; 1221 param.min_size = MPN_TOOM53_MUL_MINSIZE; 1222 one (&thres, ¶m); 1223 mul_toom42_to_toom53_threshold = 11*thres/20; 1224 print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold); 1225 1226 param.function = speed_mpn_toom42_mul; 1227 param.function2 = speed_mpn_toom63_mul; 1228 param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD"; 1229 param.min_size = MPN_TOOM63_MUL_MINSIZE; 1230 one (&thres, ¶m); 1231 mul_toom42_to_toom63_threshold = thres/2; 1232 print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold); 1233 } 1234 1235 1236 void 1237 tune_mullo (void) 1238 { 1239 static struct param_t param; 1240 1241 param.function = speed_mpn_mullo_n; 1242 1243 param.name = "MULLO_BASECASE_THRESHOLD"; 1244 param.min_size = 1; 1245 param.min_is_always = 1; 1246 param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1; 1247 param.stop_factor = 1.5; 1248 param.noprint = 1; 1249 one (&mullo_basecase_threshold, ¶m); 1250 1251 param.name = "MULLO_DC_THRESHOLD"; 1252 param.min_size = 8; 1253 param.min_is_always = 0; 1254 param.max_size = 1000; 1255 one (&mullo_dc_threshold, ¶m); 1256 1257 if (mullo_basecase_threshold >= mullo_dc_threshold) 1258 { 1259 print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold); 1260 print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase"); 1261 } 1262 else 1263 { 1264 print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold); 1265 print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold); 1266 } 1267 1268 #if WANT_FFT 1269 param.name = "MULLO_MUL_N_THRESHOLD"; 1270 param.min_size = mullo_dc_threshold; 1271 param.max_size = 2 * mul_fft_threshold; 1272 param.noprint = 0; 1273 param.step_factor = 0.03; 1274 one (&mullo_mul_n_threshold, ¶m); 1275 #else 1276 print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX, 1277 "without FFT use mullo forever"); 1278 #endif 1279 } 1280 1281 void 1282 tune_mulmod_bnm1 (void) 1283 { 1284 static struct param_t param; 1285 1286 param.name = "MULMOD_BNM1_THRESHOLD"; 1287 param.function = speed_mpn_mulmod_bnm1; 1288 param.min_size = 4; 1289 param.max_size = 100; 1290 one (&mulmod_bnm1_threshold, ¶m); 1291 } 1292 1293 void 1294 tune_sqrmod_bnm1 (void) 1295 { 1296 static struct param_t param; 1297 1298 param.name = "SQRMOD_BNM1_THRESHOLD"; 1299 param.function = speed_mpn_sqrmod_bnm1; 1300 param.min_size = 4; 1301 param.max_size = 100; 1302 one (&sqrmod_bnm1_threshold, ¶m); 1303 } 1304 1305 1306 /* Start the basecase from 3, since 1 is a special case, and if mul_basecase 1307 is faster only at size==2 then we don't want to bother with extra code 1308 just for that. Start karatsuba from 4 same as MUL above. */ 1309 1310 void 1311 tune_sqr (void) 1312 { 1313 /* disabled until tuned */ 1314 SQR_FFT_THRESHOLD = MP_SIZE_T_MAX; 1315 1316 if (HAVE_NATIVE_mpn_sqr_basecase) 1317 { 1318 print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)"); 1319 sqr_basecase_threshold = 0; 1320 } 1321 else 1322 { 1323 static struct param_t param; 1324 param.name = "SQR_BASECASE_THRESHOLD"; 1325 param.function = speed_mpn_sqr; 1326 param.min_size = 3; 1327 param.min_is_always = 1; 1328 param.max_size = TUNE_SQR_TOOM2_MAX; 1329 param.noprint = 1; 1330 one (&sqr_basecase_threshold, ¶m); 1331 } 1332 1333 { 1334 static struct param_t param; 1335 param.name = "SQR_TOOM2_THRESHOLD"; 1336 param.function = speed_mpn_sqr; 1337 param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE); 1338 param.max_size = TUNE_SQR_TOOM2_MAX; 1339 param.noprint = 1; 1340 one (&sqr_toom2_threshold, ¶m); 1341 1342 if (! HAVE_NATIVE_mpn_sqr_basecase 1343 && sqr_toom2_threshold < sqr_basecase_threshold) 1344 { 1345 /* Karatsuba becomes faster than mul_basecase before 1346 sqr_basecase does. Arrange for the expression 1347 "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which 1348 selects mpn_sqr_basecase in mpn_sqr to be false, by setting 1349 SQR_TOOM2_THRESHOLD to zero, making 1350 SQR_BASECASE_THRESHOLD the toom2 threshold. */ 1351 1352 sqr_basecase_threshold = SQR_TOOM2_THRESHOLD; 1353 SQR_TOOM2_THRESHOLD = 0; 1354 1355 print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold, 1356 "toom2"); 1357 print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD, 1358 "never sqr_basecase"); 1359 } 1360 else 1361 { 1362 if (! HAVE_NATIVE_mpn_sqr_basecase) 1363 print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold); 1364 print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD); 1365 } 1366 } 1367 1368 { 1369 static struct param_t param; 1370 mp_size_t toom3_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold); 1371 1372 param.function = speed_mpn_sqr; 1373 1374 param.name = "SQR_TOOM3_THRESHOLD"; 1375 param.min_size = MAX (toom3_start, MPN_TOOM3_SQR_MINSIZE); 1376 param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1; 1377 one (&sqr_toom3_threshold, ¶m); 1378 1379 param.name = "SQR_TOOM4_THRESHOLD"; 1380 param.min_size = MAX (sqr_toom3_threshold, MPN_TOOM4_SQR_MINSIZE); 1381 param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1; 1382 one (&sqr_toom4_threshold, ¶m); 1383 1384 param.name = "SQR_TOOM6_THRESHOLD"; 1385 param.min_size = MAX (sqr_toom4_threshold, MPN_TOOM6_SQR_MINSIZE); 1386 param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1; 1387 one (&sqr_toom6_threshold, ¶m); 1388 1389 param.name = "SQR_TOOM8_THRESHOLD"; 1390 param.min_size = MAX (sqr_toom6_threshold, MPN_TOOM8_SQR_MINSIZE); 1391 param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1; 1392 one (&sqr_toom8_threshold, ¶m); 1393 } 1394 } 1395 1396 1397 void 1398 tune_dc_div (void) 1399 { 1400 s.r = 0; /* clear to make speed function do 2n/n */ 1401 { 1402 static struct param_t param; 1403 param.name = "DC_DIV_QR_THRESHOLD"; 1404 param.function = speed_mpn_sbpi1_div_qr; 1405 param.function2 = speed_mpn_dcpi1_div_qr; 1406 param.min_size = 6; 1407 one (&dc_div_qr_threshold, ¶m); 1408 } 1409 { 1410 static struct param_t param; 1411 param.name = "DC_DIVAPPR_Q_THRESHOLD"; 1412 param.function = speed_mpn_sbpi1_divappr_q; 1413 param.function2 = speed_mpn_dcpi1_divappr_q; 1414 param.min_size = 6; 1415 one (&dc_divappr_q_threshold, ¶m); 1416 } 1417 } 1418 1419 static double 1420 speed_mpn_sbordcpi1_div_qr (struct speed_params *s) 1421 { 1422 if (s->size < DC_DIV_QR_THRESHOLD) 1423 return speed_mpn_sbpi1_div_qr (s); 1424 else 1425 return speed_mpn_dcpi1_div_qr (s); 1426 } 1427 1428 void 1429 tune_mu_div (void) 1430 { 1431 s.r = 0; /* clear to make speed function do 2n/n */ 1432 { 1433 static struct param_t param; 1434 param.name = "MU_DIV_QR_THRESHOLD"; 1435 param.function = speed_mpn_dcpi1_div_qr; 1436 param.function2 = speed_mpn_mu_div_qr; 1437 param.min_size = 6; 1438 param.max_size = 5000; 1439 param.step_factor = 0.02; 1440 one (&mu_div_qr_threshold, ¶m); 1441 } 1442 { 1443 static struct param_t param; 1444 param.name = "MU_DIVAPPR_Q_THRESHOLD"; 1445 param.function = speed_mpn_dcpi1_divappr_q; 1446 param.function2 = speed_mpn_mu_divappr_q; 1447 param.min_size = 6; 1448 param.max_size = 5000; 1449 param.step_factor = 0.02; 1450 one (&mu_divappr_q_threshold, ¶m); 1451 } 1452 { 1453 static struct param_t param; 1454 param.name = "MUPI_DIV_QR_THRESHOLD"; 1455 param.function = speed_mpn_sbordcpi1_div_qr; 1456 param.function2 = speed_mpn_mupi_div_qr; 1457 param.min_size = 6; 1458 param.min_is_always = 1; 1459 param.max_size = 1000; 1460 param.step_factor = 0.02; 1461 one (&mupi_div_qr_threshold, ¶m); 1462 } 1463 } 1464 1465 void 1466 tune_dc_bdiv (void) 1467 { 1468 s.r = 0; /* clear to make speed function do 2n/n*/ 1469 { 1470 static struct param_t param; 1471 param.name = "DC_BDIV_QR_THRESHOLD"; 1472 param.function = speed_mpn_sbpi1_bdiv_qr; 1473 param.function2 = speed_mpn_dcpi1_bdiv_qr; 1474 param.min_size = 4; 1475 one (&dc_bdiv_qr_threshold, ¶m); 1476 } 1477 { 1478 static struct param_t param; 1479 param.name = "DC_BDIV_Q_THRESHOLD"; 1480 param.function = speed_mpn_sbpi1_bdiv_q; 1481 param.function2 = speed_mpn_dcpi1_bdiv_q; 1482 param.min_size = 4; 1483 one (&dc_bdiv_q_threshold, ¶m); 1484 } 1485 } 1486 1487 void 1488 tune_mu_bdiv (void) 1489 { 1490 s.r = 0; /* clear to make speed function do 2n/n*/ 1491 { 1492 static struct param_t param; 1493 param.name = "MU_BDIV_QR_THRESHOLD"; 1494 param.function = speed_mpn_dcpi1_bdiv_qr; 1495 param.function2 = speed_mpn_mu_bdiv_qr; 1496 param.min_size = 4; 1497 param.max_size = 5000; 1498 param.step_factor = 0.02; 1499 one (&mu_bdiv_qr_threshold, ¶m); 1500 } 1501 { 1502 static struct param_t param; 1503 param.name = "MU_BDIV_Q_THRESHOLD"; 1504 param.function = speed_mpn_dcpi1_bdiv_q; 1505 param.function2 = speed_mpn_mu_bdiv_q; 1506 param.min_size = 4; 1507 param.max_size = 5000; 1508 param.step_factor = 0.02; 1509 one (&mu_bdiv_q_threshold, ¶m); 1510 } 1511 } 1512 1513 void 1514 tune_invertappr (void) 1515 { 1516 static struct param_t param; 1517 1518 param.function = speed_mpn_ni_invertappr; 1519 param.name = "INV_MULMOD_BNM1_THRESHOLD"; 1520 param.min_size = 4; 1521 one (&inv_mulmod_bnm1_threshold, ¶m); 1522 1523 param.function = speed_mpn_invertappr; 1524 param.name = "INV_NEWTON_THRESHOLD"; 1525 param.min_size = 3; 1526 one (&inv_newton_threshold, ¶m); 1527 } 1528 1529 void 1530 tune_invert (void) 1531 { 1532 static struct param_t param; 1533 1534 param.function = speed_mpn_invert; 1535 param.name = "INV_APPR_THRESHOLD"; 1536 param.min_size = 3; 1537 one (&inv_appr_threshold, ¶m); 1538 } 1539 1540 void 1541 tune_binvert (void) 1542 { 1543 static struct param_t param; 1544 1545 param.function = speed_mpn_binvert; 1546 param.name = "BINV_NEWTON_THRESHOLD"; 1547 param.min_size = 8; /* pointless with smaller operands */ 1548 one (&binv_newton_threshold, ¶m); 1549 } 1550 1551 void 1552 tune_redc (void) 1553 { 1554 #define TUNE_REDC_2_MAX 100 1555 #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2 1556 #define WANT_REDC_2 1 1557 #endif 1558 1559 #if WANT_REDC_2 1560 { 1561 static struct param_t param; 1562 param.name = "REDC_1_TO_REDC_2_THRESHOLD"; 1563 param.function = speed_mpn_redc_1; 1564 param.function2 = speed_mpn_redc_2; 1565 param.min_size = 1; 1566 param.min_is_always = 1; 1567 param.max_size = TUNE_REDC_2_MAX; 1568 param.noprint = 1; 1569 one (&redc_1_to_redc_2_threshold, ¶m); 1570 } 1571 { 1572 static struct param_t param; 1573 param.name = "REDC_2_TO_REDC_N_THRESHOLD"; 1574 param.function = speed_mpn_redc_2; 1575 param.function2 = speed_mpn_redc_n; 1576 param.min_size = 16; 1577 param.noprint = 1; 1578 one (&redc_2_to_redc_n_threshold, ¶m); 1579 } 1580 if (redc_1_to_redc_2_threshold >= TUNE_REDC_2_MAX - 1) 1581 { 1582 /* Disable REDC_2. This is not supposed to happen. */ 1583 print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD); 1584 print_define_remark ("REDC_2_TO_REDC_N_THRESHOLD", 0, "anomaly: never REDC_2"); 1585 } 1586 else 1587 { 1588 print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD); 1589 print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD); 1590 } 1591 #else 1592 { 1593 static struct param_t param; 1594 param.name = "REDC_1_TO_REDC_N_THRESHOLD"; 1595 param.function = speed_mpn_redc_1; 1596 param.function2 = speed_mpn_redc_n; 1597 param.min_size = 16; 1598 one (&redc_1_to_redc_n_threshold, ¶m); 1599 } 1600 #endif 1601 } 1602 1603 void 1604 tune_matrix22_mul (void) 1605 { 1606 static struct param_t param; 1607 param.name = "MATRIX22_STRASSEN_THRESHOLD"; 1608 param.function = speed_mpn_matrix22_mul; 1609 param.min_size = 2; 1610 one (&matrix22_strassen_threshold, ¶m); 1611 } 1612 1613 void 1614 tune_hgcd (void) 1615 { 1616 static struct param_t param; 1617 param.name = "HGCD_THRESHOLD"; 1618 param.function = speed_mpn_hgcd; 1619 /* We seem to get strange results for small sizes */ 1620 param.min_size = 30; 1621 one (&hgcd_threshold, ¶m); 1622 } 1623 1624 void 1625 tune_gcd_dc (void) 1626 { 1627 static struct param_t param; 1628 param.name = "GCD_DC_THRESHOLD"; 1629 param.function = speed_mpn_gcd; 1630 param.min_size = hgcd_threshold; 1631 param.max_size = 3000; 1632 param.step_factor = 0.02; 1633 one (&gcd_dc_threshold, ¶m); 1634 } 1635 1636 void 1637 tune_gcdext_dc (void) 1638 { 1639 static struct param_t param; 1640 param.name = "GCDEXT_DC_THRESHOLD"; 1641 param.function = speed_mpn_gcdext; 1642 param.min_size = hgcd_threshold; 1643 param.max_size = 3000; 1644 param.step_factor = 0.02; 1645 one (&gcdext_dc_threshold, ¶m); 1646 } 1647 1648 1649 /* size_extra==1 reflects the fact that with high<divisor one division is 1650 always skipped. Forcing high<divisor while testing ensures consistency 1651 while stepping through sizes, ie. that size-1 divides will be done each 1652 time. 1653 1654 min_size==2 and min_is_always are used so that if plain division is only 1655 better at size==1 then don't bother including that code just for that 1656 case, instead go with preinv always and get a size saving. */ 1657 1658 #define DIV_1_PARAMS \ 1659 param.check_size = 256; \ 1660 param.min_size = 2; \ 1661 param.min_is_always = 1; \ 1662 param.data_high = DATA_HIGH_LT_R; \ 1663 param.size_extra = 1; \ 1664 param.stop_factor = 2.0; 1665 1666 1667 double (*tuned_speed_mpn_divrem_1) __GMP_PROTO ((struct speed_params *)); 1668 1669 void 1670 tune_divrem_1 (void) 1671 { 1672 /* plain version by default */ 1673 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1; 1674 1675 /* No support for tuning native assembler code, do that by hand and put 1676 the results in the .asm file, there's no need for such thresholds to 1677 appear in gmp-mparam.h. */ 1678 if (HAVE_NATIVE_mpn_divrem_1) 1679 return; 1680 1681 if (GMP_NAIL_BITS != 0) 1682 { 1683 print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX, 1684 "no preinv with nails"); 1685 print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX, 1686 "no preinv with nails"); 1687 return; 1688 } 1689 1690 if (UDIV_PREINV_ALWAYS) 1691 { 1692 print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always"); 1693 print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L); 1694 return; 1695 } 1696 1697 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune; 1698 1699 /* Tune for the integer part of mpn_divrem_1. This will very possibly be 1700 a bit out for the fractional part, but that's too bad, the integer part 1701 is more important. */ 1702 { 1703 static struct param_t param; 1704 param.name = "DIVREM_1_NORM_THRESHOLD"; 1705 DIV_1_PARAMS; 1706 s.r = randlimb_norm (); 1707 param.function = speed_mpn_divrem_1_tune; 1708 one (&divrem_1_norm_threshold, ¶m); 1709 } 1710 { 1711 static struct param_t param; 1712 param.name = "DIVREM_1_UNNORM_THRESHOLD"; 1713 DIV_1_PARAMS; 1714 s.r = randlimb_half (); 1715 param.function = speed_mpn_divrem_1_tune; 1716 one (&divrem_1_unnorm_threshold, ¶m); 1717 } 1718 } 1719 1720 1721 void 1722 tune_mod_1 (void) 1723 { 1724 /* No support for tuning native assembler code, do that by hand and put 1725 the results in the .asm file, there's no need for such thresholds to 1726 appear in gmp-mparam.h. */ 1727 if (HAVE_NATIVE_mpn_mod_1) 1728 return; 1729 1730 if (GMP_NAIL_BITS != 0) 1731 { 1732 print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX, 1733 "no preinv with nails"); 1734 print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX, 1735 "no preinv with nails"); 1736 return; 1737 } 1738 1739 if (UDIV_PREINV_ALWAYS) 1740 { 1741 print_define ("MOD_1_NORM_THRESHOLD", 0L); 1742 print_define ("MOD_1_UNNORM_THRESHOLD", 0L); 1743 } 1744 else 1745 { 1746 { 1747 static struct param_t param; 1748 param.name = "MOD_1_NORM_THRESHOLD"; 1749 DIV_1_PARAMS; 1750 s.r = randlimb_norm (); 1751 param.function = speed_mpn_mod_1_tune; 1752 one (&mod_1_norm_threshold, ¶m); 1753 } 1754 { 1755 static struct param_t param; 1756 param.name = "MOD_1_UNNORM_THRESHOLD"; 1757 DIV_1_PARAMS; 1758 s.r = randlimb_half (); 1759 param.function = speed_mpn_mod_1_tune; 1760 one (&mod_1_unnorm_threshold, ¶m); 1761 } 1762 } 1763 { 1764 static struct param_t param; 1765 1766 param.check_size = 256; 1767 1768 s.r = randlimb_norm (); 1769 param.function = speed_mpn_mod_1_tune; 1770 1771 param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD"; 1772 param.min_size = 2; 1773 one (&mod_1n_to_mod_1_1_threshold, ¶m); 1774 } 1775 { 1776 static struct param_t param; 1777 1778 param.check_size = 256; 1779 1780 s.r = randlimb_norm () / 5; 1781 param.function = speed_mpn_mod_1_tune; 1782 param.noprint = 1; 1783 1784 param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD"; 1785 param.min_size = 2; 1786 one (&mod_1u_to_mod_1_1_threshold, ¶m); 1787 1788 param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD"; 1789 param.min_size = mod_1u_to_mod_1_1_threshold; 1790 one (&mod_1_1_to_mod_1_2_threshold, ¶m); 1791 1792 if (mod_1u_to_mod_1_1_threshold + 2 >= mod_1_1_to_mod_1_2_threshold) 1793 { 1794 /* Disable mod_1_1, mod_1_2 is always faster. Measure when to switch 1795 (from mod_1_unnorm) to mod_1_2. */ 1796 mod_1_1_to_mod_1_2_threshold = 0; 1797 1798 /* This really measures mod_1u -> mod_1_2 */ 1799 param.min_size = 1; 1800 one (&mod_1u_to_mod_1_1_threshold, ¶m); 1801 } 1802 print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL); 1803 1804 param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD"; 1805 param.min_size = mod_1_1_to_mod_1_2_threshold; 1806 one (&mod_1_2_to_mod_1_4_threshold, ¶m); 1807 1808 if (mod_1_1_to_mod_1_2_threshold + 2 >= mod_1_2_to_mod_1_4_threshold) 1809 { 1810 /* Disable mod_1_2, mod_1_4 is always faster. Measure when to switch 1811 (from mod_1_unnorm or mod_1_1) to mod_1_4. */ 1812 mod_1_2_to_mod_1_4_threshold = 0; 1813 1814 param.min_size = 1; 1815 one (&mod_1_1_to_mod_1_2_threshold, ¶m); 1816 } 1817 print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold, NULL); 1818 print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold, NULL); 1819 } 1820 1821 { 1822 static struct param_t param; 1823 1824 param.check_size = 256; 1825 1826 param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD"; 1827 s.r = randlimb_norm (); 1828 param.function = speed_mpn_preinv_mod_1; 1829 param.function2 = speed_mpn_mod_1_tune; 1830 param.min_size = 1; 1831 one (&preinv_mod_1_to_mod_1_threshold, ¶m); 1832 } 1833 } 1834 1835 1836 /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would 1837 imply that udiv_qrnnd_preinv is worth using, but it seems most 1838 straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div 1839 directly. */ 1840 1841 void 1842 tune_preinv_divrem_1 (void) 1843 { 1844 static struct param_t param; 1845 speed_function_t divrem_1; 1846 const char *divrem_1_name; 1847 double t1, t2; 1848 1849 if (GMP_NAIL_BITS != 0) 1850 { 1851 print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails"); 1852 return; 1853 } 1854 1855 /* Any native version of mpn_preinv_divrem_1 is assumed to exist because 1856 it's faster than mpn_divrem_1. */ 1857 if (HAVE_NATIVE_mpn_preinv_divrem_1) 1858 { 1859 print_define_remark ("USE_PREINV_DIVREM_1", 1, "native"); 1860 return; 1861 } 1862 1863 /* If udiv_qrnnd_preinv is the only division method then of course 1864 mpn_preinv_divrem_1 should be used. */ 1865 if (UDIV_PREINV_ALWAYS) 1866 { 1867 print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always"); 1868 return; 1869 } 1870 1871 /* If we've got an assembler version of mpn_divrem_1, then compare against 1872 that, not the mpn_divrem_1_div generic C. */ 1873 if (HAVE_NATIVE_mpn_divrem_1) 1874 { 1875 divrem_1 = speed_mpn_divrem_1; 1876 divrem_1_name = "mpn_divrem_1"; 1877 } 1878 else 1879 { 1880 divrem_1 = speed_mpn_divrem_1_div; 1881 divrem_1_name = "mpn_divrem_1_div"; 1882 } 1883 1884 param.data_high = DATA_HIGH_LT_R; /* allow skip one division */ 1885 s.size = 200; /* generous but not too big */ 1886 /* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case, 1887 since in general that's probably most common, though in fact for a 1888 64-bit limb mp_bases[10].big_base is normalized. */ 1889 s.r = urandom() & (GMP_NUMB_MASK >> 4); 1890 if (s.r == 0) s.r = 123; 1891 1892 t1 = tuneup_measure (speed_mpn_preinv_divrem_1, ¶m, &s); 1893 t2 = tuneup_measure (divrem_1, ¶m, &s); 1894 if (t1 == -1.0 || t2 == -1.0) 1895 { 1896 printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n", 1897 divrem_1_name, (long) s.size); 1898 abort (); 1899 } 1900 if (option_trace >= 1) 1901 printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n", 1902 (long) s.size, t1, divrem_1_name, t2); 1903 1904 print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL); 1905 } 1906 1907 1908 1909 void 1910 tune_divrem_2 (void) 1911 { 1912 static struct param_t param; 1913 1914 /* No support for tuning native assembler code, do that by hand and put 1915 the results in the .asm file, and there's no need for such thresholds 1916 to appear in gmp-mparam.h. */ 1917 if (HAVE_NATIVE_mpn_divrem_2) 1918 return; 1919 1920 if (GMP_NAIL_BITS != 0) 1921 { 1922 print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX, 1923 "no preinv with nails"); 1924 return; 1925 } 1926 1927 if (UDIV_PREINV_ALWAYS) 1928 { 1929 print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always"); 1930 return; 1931 } 1932 1933 /* Tune for the integer part of mpn_divrem_2. This will very possibly be 1934 a bit out for the fractional part, but that's too bad, the integer part 1935 is more important. 1936 1937 min_size must be >=2 since nsize>=2 is required, but is set to 4 to save 1938 code space if plain division is better only at size==2 or size==3. */ 1939 param.name = "DIVREM_2_THRESHOLD"; 1940 param.check_size = 256; 1941 param.min_size = 4; 1942 param.min_is_always = 1; 1943 param.size_extra = 2; /* does qsize==nsize-2 divisions */ 1944 param.stop_factor = 2.0; 1945 1946 s.r = randlimb_norm (); 1947 param.function = speed_mpn_divrem_2; 1948 one (&divrem_2_threshold, ¶m); 1949 } 1950 1951 1952 /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so 1953 tune for that. Its speed can differ on odd or even divisor, so take an 1954 average threshold for the two. 1955 1956 mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1 1957 might not vary that way, but don't test this since high<divisor isn't 1958 expected to occur often with small divisors. */ 1959 1960 void 1961 tune_divexact_1 (void) 1962 { 1963 static struct param_t param; 1964 mp_size_t thresh[2], average; 1965 int low, i; 1966 1967 /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a 1968 full mpn_divrem_1. */ 1969 if (HAVE_NATIVE_mpn_divexact_1) 1970 { 1971 print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)"); 1972 return; 1973 } 1974 1975 ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL); 1976 1977 param.name = "DIVEXACT_1_THRESHOLD"; 1978 param.data_high = DATA_HIGH_GE_R; 1979 param.check_size = 256; 1980 param.min_size = 2; 1981 param.stop_factor = 1.5; 1982 param.function = tuned_speed_mpn_divrem_1; 1983 param.function2 = speed_mpn_divexact_1; 1984 param.noprint = 1; 1985 1986 print_define_start (param.name); 1987 1988 for (low = 0; low <= 1; low++) 1989 { 1990 s.r = randlimb_half(); 1991 if (low == 0) 1992 s.r |= 1; 1993 else 1994 s.r &= ~CNST_LIMB(7); 1995 1996 one (&thresh[low], ¶m); 1997 if (option_trace) 1998 printf ("low=%d thresh %ld\n", low, (long) thresh[low]); 1999 2000 if (thresh[low] == MP_SIZE_T_MAX) 2001 { 2002 average = MP_SIZE_T_MAX; 2003 goto divexact_1_done; 2004 } 2005 } 2006 2007 if (option_trace) 2008 { 2009 printf ("average of:"); 2010 for (i = 0; i < numberof(thresh); i++) 2011 printf (" %ld", (long) thresh[i]); 2012 printf ("\n"); 2013 } 2014 2015 average = 0; 2016 for (i = 0; i < numberof(thresh); i++) 2017 average += thresh[i]; 2018 average /= numberof(thresh); 2019 2020 /* If divexact turns out to be better as early as 3 limbs, then use it 2021 always, so as to reduce code size and conditional jumps. */ 2022 if (average <= 3) 2023 average = 0; 2024 2025 divexact_1_done: 2026 print_define_end (param.name, average); 2027 } 2028 2029 2030 /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the 2031 same as mpn_mod_1, but this might not be true of an assembler 2032 implementation. The threshold used is an average based on data where a 2033 divide can be skipped and where it can't. 2034 2035 If modexact turns out to be better as early as 3 limbs, then use it 2036 always, so as to reduce code size and conditional jumps. */ 2037 2038 void 2039 tune_modexact_1_odd (void) 2040 { 2041 static struct param_t param; 2042 mp_size_t thresh_lt, thresh_ge, average; 2043 2044 #if 0 2045 /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed 2046 of a full mpn_mod_1. */ 2047 if (HAVE_NATIVE_mpn_modexact_1_odd) 2048 { 2049 print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1"); 2050 return; 2051 } 2052 #endif 2053 2054 param.name = "BMOD_1_TO_MOD_1_THRESHOLD"; 2055 param.check_size = 256; 2056 param.min_size = 2; 2057 param.stop_factor = 1.5; 2058 param.function = speed_mpn_modexact_1c_odd; 2059 param.function2 = speed_mpn_mod_1_tune; 2060 param.noprint = 1; 2061 s.r = randlimb_half () | 1; 2062 2063 print_define_start (param.name); 2064 2065 param.data_high = DATA_HIGH_LT_R; 2066 one (&thresh_lt, ¶m); 2067 if (option_trace) 2068 printf ("lt thresh %ld\n", (long) thresh_lt); 2069 2070 average = thresh_lt; 2071 if (thresh_lt != MP_SIZE_T_MAX) 2072 { 2073 param.data_high = DATA_HIGH_GE_R; 2074 one (&thresh_ge, ¶m); 2075 if (option_trace) 2076 printf ("ge thresh %ld\n", (long) thresh_ge); 2077 2078 if (thresh_ge != MP_SIZE_T_MAX) 2079 { 2080 average = (thresh_ge + thresh_lt) / 2; 2081 if (thresh_ge <= 3) 2082 average = 0; 2083 } 2084 } 2085 2086 print_define_end (param.name, average); 2087 } 2088 2089 2090 void 2091 tune_jacobi_base (void) 2092 { 2093 static struct param_t param; 2094 double t1, t2, t3; 2095 int method; 2096 2097 s.size = GMP_LIMB_BITS * 3 / 4; 2098 2099 t1 = tuneup_measure (speed_mpn_jacobi_base_1, ¶m, &s); 2100 if (option_trace >= 1) 2101 printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1); 2102 2103 t2 = tuneup_measure (speed_mpn_jacobi_base_2, ¶m, &s); 2104 if (option_trace >= 1) 2105 printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2); 2106 2107 t3 = tuneup_measure (speed_mpn_jacobi_base_3, ¶m, &s); 2108 if (option_trace >= 1) 2109 printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3); 2110 2111 if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0) 2112 { 2113 printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n", 2114 (long) s.size); 2115 abort (); 2116 } 2117 2118 if (t1 < t2 && t1 < t3) 2119 method = 1; 2120 else if (t2 < t3) 2121 method = 2; 2122 else 2123 method = 3; 2124 2125 print_define ("JACOBI_BASE_METHOD", method); 2126 } 2127 2128 2129 void 2130 tune_get_str (void) 2131 { 2132 /* Tune for decimal, it being most common. Some rough testing suggests 2133 other bases are different, but not by very much. */ 2134 s.r = 10; 2135 { 2136 static struct param_t param; 2137 GET_STR_PRECOMPUTE_THRESHOLD = 0; 2138 param.name = "GET_STR_DC_THRESHOLD"; 2139 param.function = speed_mpn_get_str; 2140 param.min_size = 4; 2141 param.max_size = GET_STR_THRESHOLD_LIMIT; 2142 one (&get_str_dc_threshold, ¶m); 2143 } 2144 { 2145 static struct param_t param; 2146 param.name = "GET_STR_PRECOMPUTE_THRESHOLD"; 2147 param.function = speed_mpn_get_str; 2148 param.min_size = GET_STR_DC_THRESHOLD; 2149 param.max_size = GET_STR_THRESHOLD_LIMIT; 2150 one (&get_str_precompute_threshold, ¶m); 2151 } 2152 } 2153 2154 2155 double 2156 speed_mpn_pre_set_str (struct speed_params *s) 2157 { 2158 unsigned char *str; 2159 mp_ptr wp; 2160 mp_size_t wn; 2161 unsigned i; 2162 int base; 2163 double t; 2164 mp_ptr powtab_mem, tp; 2165 powers_t powtab[GMP_LIMB_BITS]; 2166 mp_size_t un; 2167 int chars_per_limb; 2168 TMP_DECL; 2169 2170 SPEED_RESTRICT_COND (s->size >= 1); 2171 2172 base = s->r == 0 ? 10 : s->r; 2173 SPEED_RESTRICT_COND (base >= 2 && base <= 256); 2174 2175 TMP_MARK; 2176 2177 str = TMP_ALLOC (s->size); 2178 for (i = 0; i < s->size; i++) 2179 str[i] = s->xp[i] % base; 2180 2181 wn = ((mp_size_t) (s->size / mp_bases[base].chars_per_bit_exactly)) 2182 / GMP_LIMB_BITS + 2; 2183 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); 2184 2185 /* use this during development to check wn is big enough */ 2186 /* 2187 ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn); 2188 */ 2189 2190 speed_operand_src (s, (mp_ptr) str, s->size/BYTES_PER_MP_LIMB); 2191 speed_operand_dst (s, wp, wn); 2192 speed_cache_fill (s); 2193 2194 chars_per_limb = mp_bases[base].chars_per_limb; 2195 un = s->size / chars_per_limb + 1; 2196 powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un)); 2197 mpn_set_str_compute_powtab (powtab, powtab_mem, un, base); 2198 tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un)); 2199 2200 speed_starttime (); 2201 i = s->reps; 2202 do 2203 { 2204 mpn_pre_set_str (wp, str, s->size, powtab, tp); 2205 } 2206 while (--i != 0); 2207 t = speed_endtime (); 2208 2209 TMP_FREE; 2210 return t; 2211 } 2212 2213 void 2214 tune_set_str (void) 2215 { 2216 s.r = 10; /* decimal */ 2217 { 2218 static struct param_t param; 2219 SET_STR_PRECOMPUTE_THRESHOLD = 0; 2220 param.step_factor = 0.01; 2221 param.name = "SET_STR_DC_THRESHOLD"; 2222 param.function = speed_mpn_pre_set_str; 2223 param.min_size = 100; 2224 param.max_size = 50000; 2225 one (&set_str_dc_threshold, ¶m); 2226 } 2227 { 2228 static struct param_t param; 2229 param.step_factor = 0.02; 2230 param.name = "SET_STR_PRECOMPUTE_THRESHOLD"; 2231 param.function = speed_mpn_set_str; 2232 param.min_size = SET_STR_DC_THRESHOLD; 2233 param.max_size = 100000; 2234 one (&set_str_precompute_threshold, ¶m); 2235 } 2236 } 2237 2238 2239 void 2240 tune_fft_mul (void) 2241 { 2242 static struct fft_param_t param; 2243 2244 if (option_fft_max_size == 0) 2245 return; 2246 2247 param.table_name = "MUL_FFT_TABLE3"; 2248 param.threshold_name = "MUL_FFT_THRESHOLD"; 2249 param.p_threshold = &mul_fft_threshold; 2250 param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD"; 2251 param.p_modf_threshold = &mul_fft_modf_threshold; 2252 param.first_size = MUL_TOOM33_THRESHOLD / 2; 2253 param.max_size = option_fft_max_size; 2254 param.function = speed_mpn_fft_mul; 2255 param.mul_modf_function = speed_mpn_mul_fft; 2256 param.mul_function = speed_mpn_mul_n; 2257 param.sqr = 0; 2258 fft (¶m); 2259 } 2260 2261 2262 void 2263 tune_fft_sqr (void) 2264 { 2265 static struct fft_param_t param; 2266 2267 if (option_fft_max_size == 0) 2268 return; 2269 2270 param.table_name = "SQR_FFT_TABLE3"; 2271 param.threshold_name = "SQR_FFT_THRESHOLD"; 2272 param.p_threshold = &sqr_fft_threshold; 2273 param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD"; 2274 param.p_modf_threshold = &sqr_fft_modf_threshold; 2275 param.first_size = SQR_TOOM3_THRESHOLD / 2; 2276 param.max_size = option_fft_max_size; 2277 param.function = speed_mpn_fft_sqr; 2278 param.mul_modf_function = speed_mpn_mul_fft_sqr; 2279 param.mul_function = speed_mpn_sqr; 2280 param.sqr = 1; 2281 fft (¶m); 2282 } 2283 2284 void 2285 all (void) 2286 { 2287 time_t start_time, end_time; 2288 TMP_DECL; 2289 2290 TMP_MARK; 2291 SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0); 2292 SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0); 2293 2294 mpn_random (s.xp_block, SPEED_BLOCK_SIZE); 2295 mpn_random (s.yp_block, SPEED_BLOCK_SIZE); 2296 2297 fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST); 2298 2299 speed_time_init (); 2300 fprintf (stderr, "Using: %s\n", speed_time_string); 2301 2302 fprintf (stderr, "speed_precision %d", speed_precision); 2303 if (speed_unittime == 1.0) 2304 fprintf (stderr, ", speed_unittime 1 cycle"); 2305 else 2306 fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime); 2307 if (speed_cycletime == 1.0 || speed_cycletime == 0.0) 2308 fprintf (stderr, ", CPU freq unknown\n"); 2309 else 2310 fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime); 2311 2312 fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n", 2313 DEFAULT_MAX_SIZE, (long) option_fft_max_size); 2314 fprintf (stderr, "\n"); 2315 2316 time (&start_time); 2317 { 2318 struct tm *tp; 2319 tp = localtime (&start_time); 2320 printf ("/* Generated by tuneup.c, %d-%02d-%02d, ", 2321 tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday); 2322 2323 #ifdef __GNUC__ 2324 /* gcc sub-minor version doesn't seem to come through as a define */ 2325 printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__); 2326 #define PRINTED_COMPILER 2327 #endif 2328 #if defined (__SUNPRO_C) 2329 printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100); 2330 #define PRINTED_COMPILER 2331 #endif 2332 #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION) 2333 /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */ 2334 printf ("MIPSpro C %d.%d.%d */\n", 2335 _COMPILER_VERSION / 100, 2336 _COMPILER_VERSION / 10 % 10, 2337 _COMPILER_VERSION % 10); 2338 #define PRINTED_COMPILER 2339 #endif 2340 #if defined (__DECC) && defined (__DECC_VER) 2341 printf ("DEC C %d */\n", __DECC_VER); 2342 #define PRINTED_COMPILER 2343 #endif 2344 #if ! defined (PRINTED_COMPILER) 2345 printf ("system compiler */\n"); 2346 #endif 2347 } 2348 printf ("\n"); 2349 2350 tune_divrem_1 (); 2351 tune_mod_1 (); 2352 tune_preinv_divrem_1 (); 2353 tune_divrem_2 (); 2354 tune_divexact_1 (); 2355 tune_modexact_1_odd (); 2356 printf("\n"); 2357 2358 tune_mul_n (); 2359 printf("\n"); 2360 2361 tune_mul (); 2362 printf("\n"); 2363 2364 tune_sqr (); 2365 printf("\n"); 2366 2367 tune_mulmod_bnm1 (); 2368 tune_sqrmod_bnm1 (); 2369 printf("\n"); 2370 2371 tune_fft_mul (); 2372 printf("\n"); 2373 2374 tune_fft_sqr (); 2375 printf ("\n"); 2376 2377 tune_mullo (); 2378 printf("\n"); 2379 2380 tune_dc_div (); 2381 tune_dc_bdiv (); 2382 2383 printf("\n"); 2384 tune_invertappr (); 2385 tune_invert (); 2386 printf("\n"); 2387 2388 tune_binvert (); 2389 tune_redc (); 2390 printf("\n"); 2391 2392 tune_mu_div (); 2393 tune_mu_bdiv (); 2394 printf("\n"); 2395 2396 tune_matrix22_mul (); 2397 tune_hgcd (); 2398 tune_gcd_dc (); 2399 tune_gcdext_dc (); 2400 tune_jacobi_base (); 2401 printf("\n"); 2402 2403 tune_get_str (); 2404 tune_set_str (); 2405 printf("\n"); 2406 2407 time (&end_time); 2408 printf ("/* Tuneup completed successfully, took %ld seconds */\n", 2409 (long) (end_time - start_time)); 2410 2411 TMP_FREE; 2412 } 2413 2414 2415 int 2416 main (int argc, char *argv[]) 2417 { 2418 int opt; 2419 2420 /* Unbuffered so if output is redirected to a file it isn't lost if the 2421 program is killed part way through. */ 2422 setbuf (stdout, NULL); 2423 setbuf (stderr, NULL); 2424 2425 while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF) 2426 { 2427 switch (opt) { 2428 case 'f': 2429 if (optarg[0] == 't') 2430 option_fft_trace = 2; 2431 else 2432 option_fft_max_size = atol (optarg); 2433 break; 2434 case 'o': 2435 speed_option_set (optarg); 2436 break; 2437 case 'p': 2438 speed_precision = atoi (optarg); 2439 break; 2440 case 't': 2441 option_trace++; 2442 break; 2443 case '?': 2444 exit(1); 2445 } 2446 } 2447 2448 all (); 2449 exit (0); 2450 } 2451