xref: /netbsd-src/external/lgpl3/gmp/dist/tune/tuneup.c (revision 82ad575716605df31379cf04a2f3efbc97b8a6f5)
1 /* Create tuned thresholds for various algorithms.
2 
3 Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010 Free
4 Software Foundation, Inc.
5 
6 This file is part of the GNU MP Library.
7 
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12 
13 The GNU MP Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16 License for more details.
17 
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
20 
21 
22 /* Usage: tuneup [-t] [-t] [-p precision]
23 
24    -t turns on some diagnostic traces, a second -t turns on more traces.
25 
26    Notes:
27 
28    The code here isn't a vision of loveliness, mainly because it's subject
29    to ongoing changes according to new things wanting to be tuned, and
30    practical requirements of systems tested.
31 
32    Sometimes running the program twice produces slightly different results.
33    This is probably because there's so little separating algorithms near
34    their crossover, and on that basis it should make little or no difference
35    to the final speed of the relevant routines, but nothing has been done to
36    check that carefully.
37 
38    Algorithm:
39 
40    The thresholds are determined as follows.  A crossover may not be a
41    single size but rather a range where it oscillates between method A or
42    method B faster.  If the threshold is set making B used where A is faster
43    (or vice versa) that's bad.  Badness is the percentage time lost and
44    total badness is the sum of this over all sizes measured.  The threshold
45    is set to minimize total badness.
46 
47    Suppose, as sizes increase, method B becomes faster than method A.  The
48    effect of the rule is that, as you look at increasing sizes, isolated
49    points where B is faster are ignored, but when it's consistently faster,
50    or faster on balance, then the threshold is set there.  The same result
51    is obtained thinking in the other direction of A becoming faster at
52    smaller sizes.
53 
54    In practice the thresholds tend to be chosen to bring on the next
55    algorithm fairly quickly.
56 
57    This rule is attractive because it's got a basis in reason and is fairly
58    easy to implement, but no work has been done to actually compare it in
59    absolute terms to other possibilities.
60 
61    Implementation:
62 
63    In a normal library build the thresholds are constants.  To tune them
64    selected objects are recompiled with the thresholds as global variables
65    instead.  #define TUNE_PROGRAM_BUILD does this, with help from code at
66    the end of gmp-impl.h, and rules in tune/Makefile.am.
67 
68    MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n.  The
69    threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
70    level, but recurse into the basecase.
71 
72    MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
73    Other routines in turn will make use of both of those.  Naturally the
74    dependants must be tuned first.
75 
76    In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
77    just a threshold based on comparing two routines (mpn_divrem_1 and
78    mpn_divexact_1), and no further use of the value determined.
79 
80    Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
81    just comparisons between certain routines on representative data.
82 
83    Shortcuts are applied when native (assembler) versions of routines exist.
84    For instance a native mpn_sqr_basecase is assumed to be always faster
85    than mpn_mul_basecase, with no measuring.
86 
87    No attempt is made to tune within assembler routines, for instance
88    DIVREM_1_NORM_THRESHOLD.  An assembler mpn_divrem_1 is expected to be
89    written and tuned all by hand.  Assembler routines that might have hard
90    limits are recompiled though, to make them accept a bigger range of sizes
91    than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
92 
93    Limitations:
94 
95    The FFTs aren't subject to the same badness rule as the other thresholds,
96    so each k is probably being brought on a touch early.  This isn't likely
97    to make a difference, and the simpler probing means fewer tests.
98 
99 */
100 
101 #define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
102 
103 #include "config.h"
104 
105 #include <math.h>
106 #include <stdio.h>
107 #include <stdlib.h>
108 #include <time.h>
109 #if HAVE_UNISTD_H
110 #include <unistd.h>
111 #endif
112 
113 #include "gmp.h"
114 #include "gmp-impl.h"
115 #include "longlong.h"
116 
117 #include "tests.h"
118 #include "speed.h"
119 
120 #if !HAVE_DECL_OPTARG
121 extern char *optarg;
122 extern int optind, opterr;
123 #endif
124 
125 
126 #define DEFAULT_MAX_SIZE   1000  /* limbs */
127 
128 #if WANT_FFT
129 mp_size_t  option_fft_max_size = 50000;  /* limbs */
130 #else
131 mp_size_t  option_fft_max_size = 0;
132 #endif
133 int        option_trace = 0;
134 int        option_fft_trace = 0;
135 struct speed_params  s;
136 
137 struct dat_t {
138   mp_size_t  size;
139   double     d;
140 } *dat = NULL;
141 int  ndat = 0;
142 int  allocdat = 0;
143 
144 /* This is not defined if mpn_sqr_basecase doesn't declare a limit.  In that
145    case use zero here, which for params.max_size means no limit.  */
146 #ifndef TUNE_SQR_TOOM2_MAX
147 #define TUNE_SQR_TOOM2_MAX  0
148 #endif
149 
150 mp_size_t  mul_toom22_threshold         = MP_SIZE_T_MAX;
151 mp_size_t  mul_toom33_threshold         = MUL_TOOM33_THRESHOLD_LIMIT;
152 mp_size_t  mul_toom44_threshold         = MUL_TOOM44_THRESHOLD_LIMIT;
153 mp_size_t  mul_toom6h_threshold         = MUL_TOOM6H_THRESHOLD_LIMIT;
154 mp_size_t  mul_toom8h_threshold         = MUL_TOOM8H_THRESHOLD_LIMIT;
155 mp_size_t  mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
156 mp_size_t  mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
157 mp_size_t  mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
158 mp_size_t  mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
159 mp_size_t  mul_fft_threshold            = MP_SIZE_T_MAX;
160 mp_size_t  mul_fft_modf_threshold       = MP_SIZE_T_MAX;
161 mp_size_t  sqr_basecase_threshold       = MP_SIZE_T_MAX;
162 mp_size_t  sqr_toom2_threshold
163   = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
164 mp_size_t  sqr_toom3_threshold          = SQR_TOOM3_THRESHOLD_LIMIT;
165 mp_size_t  sqr_toom4_threshold          = SQR_TOOM4_THRESHOLD_LIMIT;
166 mp_size_t  sqr_toom6_threshold          = SQR_TOOM6_THRESHOLD_LIMIT;
167 mp_size_t  sqr_toom8_threshold          = SQR_TOOM8_THRESHOLD_LIMIT;
168 mp_size_t  sqr_fft_threshold            = MP_SIZE_T_MAX;
169 mp_size_t  sqr_fft_modf_threshold       = MP_SIZE_T_MAX;
170 mp_size_t  mullo_basecase_threshold     = MP_SIZE_T_MAX;
171 mp_size_t  mullo_dc_threshold           = MP_SIZE_T_MAX;
172 mp_size_t  mullo_mul_n_threshold        = MP_SIZE_T_MAX;
173 mp_size_t  mulmod_bnm1_threshold        = MP_SIZE_T_MAX;
174 mp_size_t  sqrmod_bnm1_threshold        = MP_SIZE_T_MAX;
175 mp_size_t  div_sb_preinv_threshold      = MP_SIZE_T_MAX;
176 mp_size_t  dc_div_qr_threshold          = MP_SIZE_T_MAX;
177 mp_size_t  dc_divappr_q_threshold       = MP_SIZE_T_MAX;
178 mp_size_t  mu_div_qr_threshold          = MP_SIZE_T_MAX;
179 mp_size_t  mu_divappr_q_threshold       = MP_SIZE_T_MAX;
180 mp_size_t  mupi_div_qr_threshold        = MP_SIZE_T_MAX;
181 mp_size_t  mu_div_q_threshold           = MP_SIZE_T_MAX;
182 mp_size_t  dc_bdiv_qr_threshold         = MP_SIZE_T_MAX;
183 mp_size_t  dc_bdiv_q_threshold          = MP_SIZE_T_MAX;
184 mp_size_t  mu_bdiv_qr_threshold         = MP_SIZE_T_MAX;
185 mp_size_t  mu_bdiv_q_threshold          = MP_SIZE_T_MAX;
186 mp_size_t  inv_mulmod_bnm1_threshold    = MP_SIZE_T_MAX;
187 mp_size_t  inv_newton_threshold         = MP_SIZE_T_MAX;
188 mp_size_t  inv_appr_threshold           = MP_SIZE_T_MAX;
189 mp_size_t  binv_newton_threshold        = MP_SIZE_T_MAX;
190 mp_size_t  redc_1_to_redc_2_threshold   = MP_SIZE_T_MAX;
191 mp_size_t  redc_1_to_redc_n_threshold   = MP_SIZE_T_MAX;
192 mp_size_t  redc_2_to_redc_n_threshold   = MP_SIZE_T_MAX;
193 mp_size_t  powm_threshold               = MP_SIZE_T_MAX;
194 mp_size_t  matrix22_strassen_threshold  = MP_SIZE_T_MAX;
195 mp_size_t  hgcd_threshold               = MP_SIZE_T_MAX;
196 mp_size_t  gcd_accel_threshold          = MP_SIZE_T_MAX;
197 mp_size_t  gcd_dc_threshold             = MP_SIZE_T_MAX;
198 mp_size_t  gcdext_dc_threshold          = MP_SIZE_T_MAX;
199 mp_size_t  divrem_1_norm_threshold      = MP_SIZE_T_MAX;
200 mp_size_t  divrem_1_unnorm_threshold    = MP_SIZE_T_MAX;
201 mp_size_t  mod_1_norm_threshold         = MP_SIZE_T_MAX;
202 mp_size_t  mod_1_unnorm_threshold       = MP_SIZE_T_MAX;
203 mp_size_t  mod_1n_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
204 mp_size_t  mod_1u_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
205 mp_size_t  mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
206 mp_size_t  mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
207 mp_size_t  preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
208 mp_size_t  divrem_2_threshold           = MP_SIZE_T_MAX;
209 mp_size_t  get_str_dc_threshold         = MP_SIZE_T_MAX;
210 mp_size_t  get_str_precompute_threshold = MP_SIZE_T_MAX;
211 mp_size_t  set_str_dc_threshold         = MP_SIZE_T_MAX;
212 mp_size_t  set_str_precompute_threshold = MP_SIZE_T_MAX;
213 
214 mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
215 mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
216 
217 struct param_t {
218   const char        *name;
219   speed_function_t  function;
220   speed_function_t  function2;
221   double            step_factor;    /* how much to step relatively */
222   int               step;           /* how much to step absolutely */
223   double            function_fudge; /* multiplier for "function" speeds */
224   int               stop_since_change;
225   double            stop_factor;
226   mp_size_t         min_size;
227   int               min_is_always;
228   mp_size_t         max_size;
229   mp_size_t         check_size;
230   mp_size_t         size_extra;
231 
232 #define DATA_HIGH_LT_R  1
233 #define DATA_HIGH_GE_R  2
234   int               data_high;
235 
236   int               noprint;
237 };
238 
239 
240 /* These are normally undefined when false, which suits "#if" fine.
241    But give them zero values so they can be used in plain C "if"s.  */
242 #ifndef UDIV_PREINV_ALWAYS
243 #define UDIV_PREINV_ALWAYS 0
244 #endif
245 #ifndef HAVE_NATIVE_mpn_divexact_1
246 #define HAVE_NATIVE_mpn_divexact_1 0
247 #endif
248 #ifndef HAVE_NATIVE_mpn_divrem_1
249 #define HAVE_NATIVE_mpn_divrem_1 0
250 #endif
251 #ifndef HAVE_NATIVE_mpn_divrem_2
252 #define HAVE_NATIVE_mpn_divrem_2 0
253 #endif
254 #ifndef HAVE_NATIVE_mpn_mod_1
255 #define HAVE_NATIVE_mpn_mod_1 0
256 #endif
257 #ifndef HAVE_NATIVE_mpn_modexact_1_odd
258 #define HAVE_NATIVE_mpn_modexact_1_odd 0
259 #endif
260 #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
261 #define HAVE_NATIVE_mpn_preinv_divrem_1 0
262 #endif
263 #ifndef HAVE_NATIVE_mpn_preinv_mod_1
264 #define HAVE_NATIVE_mpn_preinv_mod_1 0
265 #endif
266 #ifndef HAVE_NATIVE_mpn_sqr_basecase
267 #define HAVE_NATIVE_mpn_sqr_basecase 0
268 #endif
269 
270 
271 #define MAX3(a,b,c)  MAX (MAX (a, b), c)
272 
273 mp_limb_t
274 randlimb_norm (void)
275 {
276   mp_limb_t  n;
277   mpn_random (&n, 1);
278   n |= GMP_NUMB_HIGHBIT;
279   return n;
280 }
281 
282 #define GMP_NUMB_HALFMASK  ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
283 
284 mp_limb_t
285 randlimb_half (void)
286 {
287   mp_limb_t  n;
288   mpn_random (&n, 1);
289   n &= GMP_NUMB_HALFMASK;
290   n += (n==0);
291   return n;
292 }
293 
294 
295 /* Add an entry to the end of the dat[] array, reallocing to make it bigger
296    if necessary.  */
297 void
298 add_dat (mp_size_t size, double d)
299 {
300 #define ALLOCDAT_STEP  500
301 
302   ASSERT_ALWAYS (ndat <= allocdat);
303 
304   if (ndat == allocdat)
305     {
306       dat = (struct dat_t *) __gmp_allocate_or_reallocate
307         (dat, allocdat * sizeof(dat[0]),
308          (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
309       allocdat += ALLOCDAT_STEP;
310     }
311 
312   dat[ndat].size = size;
313   dat[ndat].d = d;
314   ndat++;
315 }
316 
317 
318 /* Return the threshold size based on the data accumulated. */
319 mp_size_t
320 analyze_dat (int final)
321 {
322   double  x, min_x;
323   int     j, min_j;
324 
325   /* If the threshold is set at dat[0].size, any positive values are bad. */
326   x = 0.0;
327   for (j = 0; j < ndat; j++)
328     if (dat[j].d > 0.0)
329       x += dat[j].d;
330 
331   if (option_trace >= 2 && final)
332     {
333       printf ("\n");
334       printf ("x is the sum of the badness from setting thresh at given size\n");
335       printf ("  (minimum x is sought)\n");
336       printf ("size=%ld  first x=%.4f\n", (long) dat[j].size, x);
337     }
338 
339   min_x = x;
340   min_j = 0;
341 
342 
343   /* When stepping to the next dat[j].size, positive values are no longer
344      bad (so subtracted), negative values become bad (so add the absolute
345      value, meaning subtract). */
346   for (j = 0; j < ndat; x -= dat[j].d, j++)
347     {
348       if (option_trace >= 2 && final)
349         printf ("size=%ld  x=%.4f\n", (long) dat[j].size, x);
350 
351       if (x < min_x)
352         {
353           min_x = x;
354           min_j = j;
355         }
356     }
357 
358   return min_j;
359 }
360 
361 
362 /* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */
363 
364 mp_limb_t mpn_divrem_1_tune
365   __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t));
366 mp_limb_t mpn_mod_1_tune
367    __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t));
368 
369 double
370 speed_mpn_mod_1_tune (struct speed_params *s)
371 {
372   SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
373 }
374 double
375 speed_mpn_divrem_1_tune (struct speed_params *s)
376 {
377   SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
378 }
379 
380 
381 double
382 tuneup_measure (speed_function_t fun,
383                 const struct param_t *param,
384                 struct speed_params *s)
385 {
386   static struct param_t  dummy;
387   double   t;
388   TMP_DECL;
389 
390   if (! param)
391     param = &dummy;
392 
393   s->size += param->size_extra;
394 
395   TMP_MARK;
396   SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
397   SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
398 
399   mpn_random (s->xp, s->size);
400   mpn_random (s->yp, s->size);
401 
402   switch (param->data_high) {
403   case DATA_HIGH_LT_R:
404     s->xp[s->size-1] %= s->r;
405     s->yp[s->size-1] %= s->r;
406     break;
407   case DATA_HIGH_GE_R:
408     s->xp[s->size-1] |= s->r;
409     s->yp[s->size-1] |= s->r;
410     break;
411   }
412 
413   t = speed_measure (fun, s);
414 
415   s->size -= param->size_extra;
416 
417   TMP_FREE;
418   return t;
419 }
420 
421 
422 #define PRINT_WIDTH  31
423 
424 void
425 print_define_start (const char *name)
426 {
427   printf ("#define %-*s  ", PRINT_WIDTH, name);
428   if (option_trace)
429     printf ("...\n");
430 }
431 
432 void
433 print_define_end_remark (const char *name, mp_size_t value, const char *remark)
434 {
435   if (option_trace)
436     printf ("#define %-*s  ", PRINT_WIDTH, name);
437 
438   if (value == MP_SIZE_T_MAX)
439     printf ("MP_SIZE_T_MAX");
440   else
441     printf ("%5ld", (long) value);
442 
443   if (remark != NULL)
444     printf ("  /* %s */", remark);
445   printf ("\n");
446   fflush (stdout);
447 }
448 
449 void
450 print_define_end (const char *name, mp_size_t value)
451 {
452   const char  *remark;
453   if (value == MP_SIZE_T_MAX)
454     remark = "never";
455   else if (value == 0)
456     remark = "always";
457   else
458     remark = NULL;
459   print_define_end_remark (name, value, remark);
460 }
461 
462 void
463 print_define (const char *name, mp_size_t value)
464 {
465   print_define_start (name);
466   print_define_end (name, value);
467 }
468 
469 void
470 print_define_remark (const char *name, mp_size_t value, const char *remark)
471 {
472   print_define_start (name);
473   print_define_end_remark (name, value, remark);
474 }
475 
476 
477 void
478 one (mp_size_t *threshold, struct param_t *param)
479 {
480   int  since_positive, since_thresh_change;
481   int  thresh_idx, new_thresh_idx;
482 
483 #define DEFAULT(x,n)  do { if (! (x))  (x) = (n); } while (0)
484 
485   DEFAULT (param->function_fudge, 1.0);
486   DEFAULT (param->function2, param->function);
487   DEFAULT (param->step_factor, 0.01);  /* small steps by default */
488   DEFAULT (param->step, 1);            /* small steps by default */
489   DEFAULT (param->stop_since_change, 80);
490   DEFAULT (param->stop_factor, 1.2);
491   DEFAULT (param->min_size, 10);
492   DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
493 
494   if (param->check_size != 0)
495     {
496       double   t1, t2;
497       s.size = param->check_size;
498 
499       *threshold = s.size+1;
500       t1 = tuneup_measure (param->function, param, &s);
501 
502       *threshold = s.size;
503       t2 = tuneup_measure (param->function2, param, &s);
504       if (t1 == -1.0 || t2 == -1.0)
505         {
506           printf ("Oops, can't run both functions at size %ld\n",
507                   (long) s.size);
508           abort ();
509         }
510       t1 *= param->function_fudge;
511 
512       /* ask that t2 is at least 4% below t1 */
513       if (t1 < t2*1.04)
514         {
515           if (option_trace)
516             printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
517           *threshold = MP_SIZE_T_MAX;
518           if (! param->noprint)
519             print_define (param->name, *threshold);
520           return;
521         }
522 
523       if (option_trace >= 2)
524         printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
525                 (long) s.size, t1, t2);
526     }
527 
528   if (! param->noprint || option_trace)
529     print_define_start (param->name);
530 
531   ndat = 0;
532   since_positive = 0;
533   since_thresh_change = 0;
534   thresh_idx = 0;
535 
536   if (option_trace >= 2)
537     {
538       printf ("             algorithm-A  algorithm-B   ratio  possible\n");
539       printf ("              (seconds)    (seconds)    diff    thresh\n");
540     }
541 
542   for (s.size = param->min_size;
543        s.size < param->max_size;
544        s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
545     {
546       double   ti, tiplus1, d;
547 
548       /*
549         FIXME: check minimum size requirements are met, possibly by just
550         checking for the -1 returns from the speed functions.
551       */
552 
553       /* using method A at this size */
554       *threshold = s.size+1;
555       ti = tuneup_measure (param->function, param, &s);
556       if (ti == -1.0)
557         abort ();
558       ti *= param->function_fudge;
559 
560       /* using method B at this size */
561       *threshold = s.size;
562       tiplus1 = tuneup_measure (param->function2, param, &s);
563       if (tiplus1 == -1.0)
564         abort ();
565 
566       /* Calculate the fraction by which the one or the other routine is
567          slower.  */
568       if (tiplus1 >= ti)
569         d = (tiplus1 - ti) / tiplus1;  /* negative */
570       else
571         d = (tiplus1 - ti) / ti;       /* positive */
572 
573       add_dat (s.size, d);
574 
575       new_thresh_idx = analyze_dat (0);
576 
577       if (option_trace >= 2)
578         printf ("size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
579                 (long) s.size, ti, tiplus1, d,
580                 ti > tiplus1 ? '#' : ' ',
581                 (long) dat[new_thresh_idx].size);
582 
583       /* Stop if the last time method i was faster was more than a
584          certain number of measurements ago.  */
585 #define STOP_SINCE_POSITIVE  200
586       if (d >= 0)
587         since_positive = 0;
588       else
589         if (++since_positive > STOP_SINCE_POSITIVE)
590           {
591             if (option_trace >= 1)
592               printf ("stopped due to since_positive (%d)\n",
593                       STOP_SINCE_POSITIVE);
594             break;
595           }
596 
597       /* Stop if method A has become slower by a certain factor. */
598       if (ti >= tiplus1 * param->stop_factor)
599         {
600           if (option_trace >= 1)
601             printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
602                     param->stop_factor);
603           break;
604         }
605 
606       /* Stop if the threshold implied hasn't changed in a certain
607          number of measurements.  (It's this condition that usually
608          stops the loop.) */
609       if (thresh_idx != new_thresh_idx)
610         since_thresh_change = 0, thresh_idx = new_thresh_idx;
611       else
612         if (++since_thresh_change > param->stop_since_change)
613           {
614             if (option_trace >= 1)
615               printf ("stopped due to since_thresh_change (%d)\n",
616                       param->stop_since_change);
617             break;
618           }
619 
620       /* Stop if the threshold implied is more than a certain number of
621          measurements ago.  */
622 #define STOP_SINCE_AFTER   500
623       if (ndat - thresh_idx > STOP_SINCE_AFTER)
624         {
625           if (option_trace >= 1)
626             printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
627                     STOP_SINCE_AFTER);
628           break;
629         }
630 
631       /* Stop when the size limit is reached before the end of the
632          crossover, but only show this as an error for >= the default max
633          size.  FIXME: Maybe should make it a param choice whether this is
634          an error.  */
635       if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
636         {
637           fprintf (stderr, "%s\n", param->name);
638           fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
639                    (long) dat[0].size, (long) dat[ndat-1].size, ndat);
640           fprintf (stderr, "    max size reached before end of crossover\n");
641           break;
642         }
643     }
644 
645   if (option_trace >= 1)
646     printf ("sizes %ld to %ld total %d measurements\n",
647             (long) dat[0].size, (long) dat[ndat-1].size, ndat);
648 
649   *threshold = dat[analyze_dat (1)].size;
650 
651   if (param->min_is_always)
652     {
653       if (*threshold == param->min_size)
654         *threshold = 0;
655     }
656 
657   if (! param->noprint || option_trace)
658     print_define_end (param->name, *threshold);
659 }
660 
661 
662 /* Special probing for the fft thresholds.  The size restrictions on the
663    FFTs mean the graph of time vs size has a step effect.  See this for
664    example using
665 
666        ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
667        gnuplot foo.gnuplot
668 
669    The current approach is to compare routines at the midpoint of relevant
670    steps.  Arguably a more sophisticated system of threshold data is wanted
671    if this step effect remains. */
672 
673 struct fft_param_t {
674   const char        *table_name;
675   const char        *threshold_name;
676   const char        *modf_threshold_name;
677   mp_size_t         *p_threshold;
678   mp_size_t         *p_modf_threshold;
679   mp_size_t         first_size;
680   mp_size_t         max_size;
681   speed_function_t  function;
682   speed_function_t  mul_modf_function;
683   speed_function_t  mul_function;
684   mp_size_t         sqr;
685 };
686 
687 
688 /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
689    N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
690    of 2^(k-1) bits. */
691 
692 mp_size_t
693 fft_step_size (int k)
694 {
695   mp_size_t  step;
696 
697   step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
698   step *= (mp_size_t) 1 << k;
699 
700   if (step <= 0)
701     {
702       printf ("Can't handle k=%d\n", k);
703       abort ();
704     }
705 
706   return step;
707 }
708 
709 mp_size_t
710 fft_next_size (mp_size_t pl, int k)
711 {
712   mp_size_t  m = fft_step_size (k);
713 
714 /*    printf ("[k=%d %ld] %ld ->", k, m, pl); */
715 
716   if (pl == 0 || (pl & (m-1)) != 0)
717     pl = (pl | (m-1)) + 1;
718 
719 /*    printf (" %ld\n", pl); */
720   return pl;
721 }
722 
723 #define NMAX_DEFAULT 1000000
724 #define MAX_REPS 25
725 #define MIN_REPS 5
726 
727 static inline size_t
728 mpn_mul_fft_lcm (size_t a, unsigned int k)
729 {
730   unsigned int l = k;
731 
732   while (a % 2 == 0 && k > 0)
733     {
734       a >>= 1;
735       k--;
736     }
737   return a << l;
738 }
739 
740 mp_size_t
741 fftfill (mp_size_t pl, int k, int sqr)
742 {
743   mp_size_t maxLK;
744   mp_bitcnt_t N, Nprime, nprime, M;
745 
746   N = pl * GMP_NUMB_BITS;
747   M = N >> k;
748 
749   maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
750 
751   Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
752   nprime = Nprime / GMP_NUMB_BITS;
753   if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
754     {
755       size_t K2;
756       for (;;)
757 	{
758 	  K2 = 1L << mpn_fft_best_k (nprime, sqr);
759 	  if ((nprime & (K2 - 1)) == 0)
760 	    break;
761 	  nprime = (nprime + K2 - 1) & -K2;
762 	  Nprime = nprime * GMP_LIMB_BITS;
763 	}
764     }
765   ASSERT_ALWAYS (nprime < pl);
766 
767   return Nprime;
768 }
769 
770 static int
771 compare_double (const void *ap, const void *bp)
772 {
773   double a = * (const double *) ap;
774   double b = * (const double *) bp;
775 
776   if (a < b)
777     return -1;
778   else if (a > b)
779     return 1;
780   else
781     return 0;
782 }
783 
784 double
785 median (double *times, int n)
786 {
787   qsort (times, n, sizeof (double), compare_double);
788   return times[n/2];
789 }
790 
791 #define FFT_CACHE_SIZE 25
792 typedef struct fft_cache
793 {
794   mp_size_t n;
795   double time;
796 } fft_cache_t;
797 
798 fft_cache_t fft_cache[FFT_CACHE_SIZE];
799 
800 double
801 cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
802 		int n_measurements)
803 {
804   int i;
805   double t, ttab[MAX_REPS];
806 
807   if (fft_cache[k].n == n)
808     return fft_cache[k].time;
809 
810   for (i = 0; i < n_measurements; i++)
811     {
812       speed_starttime ();
813       mpn_mul_fft (rp, n, ap, n, bp, n, k);
814       ttab[i] = speed_endtime ();
815     }
816 
817   t = median (ttab, n_measurements);
818   fft_cache[k].n = n;
819   fft_cache[k].time = t;
820   return t;
821 }
822 
823 #define INSERT_FFTTAB(idx, nval, kval)					\
824   do {									\
825     fft_tab[idx].n = nval;						\
826     fft_tab[idx].k = kval;						\
827     fft_tab[idx+1].n = -1;	/* sentinel */				\
828     fft_tab[idx+1].k = -1;						\
829   } while (0)
830 
831 int
832 fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
833 {
834   mp_size_t n, n1, prev_n1;
835   int k, best_k, last_best_k, kmax;
836   int eff, prev_eff;
837   double t0, t1;
838   int n_measurements;
839   mp_limb_t *ap, *bp, *rp;
840   mp_size_t alloc;
841   char *linepref;
842   struct fft_table_nk *fft_tab;
843 
844   fft_tab = mpn_fft_table3[p->sqr];
845 
846   for (k = 0; k < FFT_CACHE_SIZE; k++)
847     fft_cache[k].n = 0;
848 
849   if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
850     {
851       nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
852     }
853 
854   if (print)
855     printf ("#define %s%*s", p->table_name, 38, "");
856 
857   if (idx == 0)
858     {
859       INSERT_FFTTAB (0, nmin, initial_k);
860 
861       if (print)
862 	{
863 	  printf ("\\\n  { ");
864 	  printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
865 	  linepref = "    ";
866 	}
867 
868       idx = 1;
869     }
870 
871   ap = malloc (sizeof (mp_limb_t));
872   if (p->sqr)
873     bp = ap;
874   else
875     bp = malloc (sizeof (mp_limb_t));
876   rp = malloc (sizeof (mp_limb_t));
877   alloc = 1;
878 
879   /* Round n to comply to initial k value */
880   n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
881 
882   n_measurements = (18 - initial_k) | 1;
883   n_measurements = MAX (n_measurements, MIN_REPS);
884   n_measurements = MIN (n_measurements, MAX_REPS);
885 
886   last_best_k = initial_k;
887   best_k = initial_k;
888 
889   while (n < nmax)
890     {
891       int start_k, end_k;
892 
893       /* Assume the current best k is best until we hit its next FFT step.  */
894       t0 = 99999;
895 
896       prev_n1 = n + 1;
897 
898       start_k = MAX (4, best_k - 4);
899       end_k = MIN (24, best_k + 4);
900       for (k = start_k; k <= end_k; k++)
901 	{
902           n1 = mpn_fft_next_size (prev_n1, k);
903 
904 	  eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
905 
906 	  if (eff < 70)		/* avoid measuring too slow fft:s */
907 	    continue;
908 
909 	  if (n1 > alloc)
910 	    {
911 	      alloc = n1;
912 	      if (p->sqr)
913 		{
914 		  ap = realloc (ap, sizeof (mp_limb_t));
915 		  rp = realloc (rp, sizeof (mp_limb_t));
916 		  ap = bp = realloc (ap, alloc * sizeof (mp_limb_t));
917 		  mpn_random (ap, alloc);
918 		  rp = realloc (rp, alloc * sizeof (mp_limb_t));
919 		}
920 	      else
921 		{
922 		  ap = realloc (ap, sizeof (mp_limb_t));
923 		  bp = realloc (bp, sizeof (mp_limb_t));
924 		  rp = realloc (rp, sizeof (mp_limb_t));
925 		  ap = realloc (ap, alloc * sizeof (mp_limb_t));
926 		  mpn_random (ap, alloc);
927 		  bp = realloc (bp, alloc * sizeof (mp_limb_t));
928 		  mpn_random (bp, alloc);
929 		  rp = realloc (rp, alloc * sizeof (mp_limb_t));
930 		}
931 	    }
932 
933 	  t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
934 
935 	  if (t1 * n_measurements > 0.3)
936 	    n_measurements -= 2;
937 	  n_measurements = MAX (n_measurements, MIN_REPS);
938 
939 	  if (t1 < t0)
940 	    {
941 	      best_k = k;
942 	      t0 = t1;
943 	    }
944 	}
945 
946       n1 = mpn_fft_next_size (prev_n1, best_k);
947 
948       if (last_best_k != best_k)
949 	{
950 	  ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
951 
952 	  if (idx >= FFT_TABLE3_SIZE)
953 	    {
954 	      printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
955 	      abort ();
956 	    }
957 	  INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
958 
959 	  if (print)
960 	    {
961 	      printf (", ");
962 	      if (idx % 4 == 0)
963 		printf ("\\\n    ");
964 	      printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
965 	    }
966 
967 	  if (option_trace >= 2)
968 	    {
969 	      printf ("{%lu,%u}\n", prev_n1, best_k);
970 	      fflush (stdout);
971 	    }
972 
973 	  last_best_k = best_k;
974 	  idx++;
975 	}
976 
977       for (;;)
978 	{
979 	  prev_n1 = n1;
980 	  prev_eff = fftfill (prev_n1, best_k, p->sqr);
981 	  n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
982 	  eff = fftfill (n1, best_k, p->sqr);
983 
984 	  if (eff != prev_eff)
985 	    break;
986 	}
987 
988       n = prev_n1;
989     }
990 
991   kmax = sizeof (mp_size_t) * 4;	/* GMP_MP_SIZE_T_BITS / 2 */
992   kmax = MIN (kmax, 25-1);
993   for (k = last_best_k + 1; k <= kmax; k++)
994     {
995       if (idx >= FFT_TABLE3_SIZE)
996 	{
997 	  printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
998 	  abort ();
999 	}
1000       INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
1001 
1002       if (print)
1003 	{
1004 	  printf (", ");
1005 	  if (idx % 4 == 0)
1006 	    printf ("\\\n    ");
1007 	  printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
1008 	}
1009 
1010       idx++;
1011     }
1012 
1013   if (print)
1014     printf (" }\n");
1015 
1016   free (ap);
1017   if (! p->sqr)
1018     free (bp);
1019   free (rp);
1020 
1021   return idx;
1022 }
1023 
1024 void
1025 fft (struct fft_param_t *p)
1026 {
1027   mp_size_t  size;
1028   int        k, idx, initial_k;
1029 
1030   /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
1031 
1032 #if 1
1033   {
1034     /* Use plain one() mechanism, for some reasonable initial values of k.  The
1035        advantage is that we don't depend on mpn_fft_table3, which can therefore
1036        leave it completely uninitialized.  */
1037 
1038     static struct param_t param;
1039     mp_size_t thres, best_thres;
1040     int best_k;
1041     char buf[20];
1042 
1043     best_thres = MP_SIZE_T_MAX;
1044     best_k = -1;
1045 
1046     for (k = 5; k <= 7; k++)
1047       {
1048 	param.name = p->modf_threshold_name;
1049 	param.min_size = 100;
1050 	param.max_size = 2000;
1051 	param.function  = p->mul_function;
1052 	param.step_factor = 0.0;
1053 	param.step = 4;
1054 	param.function2 = p->mul_modf_function;
1055 	param.noprint = 1;
1056 	s.r = k;
1057 	one (&thres, &param);
1058 	if (thres < best_thres)
1059 	  {
1060 	    best_thres = thres;
1061 	    best_k = k;
1062 	  }
1063       }
1064 
1065     *(p->p_modf_threshold) = best_thres;
1066     sprintf (buf, "k = %d", best_k);
1067     print_define_remark (p->modf_threshold_name, best_thres, buf);
1068     initial_k = best_k;
1069   }
1070 #else
1071   size = p->first_size;
1072   for (;;)
1073     {
1074       double  tk, tm;
1075 
1076       size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
1077       k = mpn_fft_best_k (size, p->sqr);
1078 
1079       if (size >= p->max_size)
1080         break;
1081 
1082       s.size = size + fft_step_size (k) / 2;
1083       s.r = k;
1084       tk = tuneup_measure (p->mul_modf_function, NULL, &s);
1085       if (tk == -1.0)
1086         abort ();
1087 
1088       tm = tuneup_measure (p->mul_function, NULL, &s);
1089       if (tm == -1.0)
1090         abort ();
1091 
1092       if (option_trace >= 2)
1093         printf ("at %ld   size=%ld  k=%d  %.9f   size=%ld modf %.9f\n",
1094                 (long) size,
1095                 (long) size + fft_step_size (k) / 2, k, tk,
1096                 (long) s.size, tm);
1097 
1098       if (tk < tm)
1099         {
1100 	  *p->p_modf_threshold = s.size;
1101 	  print_define (p->modf_threshold_name, *p->p_modf_threshold);
1102 	  break;
1103         }
1104     }
1105   initial_k = ?;
1106 #endif
1107 
1108   /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
1109 
1110   idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
1111   printf ("#define %s_SIZE %d\n", p->table_name, idx);
1112 
1113   /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
1114 
1115   size = 2 * *p->p_modf_threshold;	/* OK? */
1116   for (;;)
1117     {
1118       double  tk, tm;
1119       mp_size_t mulmod_size, mul_size;;
1120 
1121       if (size >= p->max_size)
1122         break;
1123 
1124       mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
1125       mul_size = (size + mulmod_size) / 2;	/* middle of step */
1126 
1127       s.size = mulmod_size;
1128       tk = tuneup_measure (p->function, NULL, &s);
1129       if (tk == -1.0)
1130         abort ();
1131 
1132       s.size = mul_size;
1133       tm = tuneup_measure (p->mul_function, NULL, &s);
1134       if (tm == -1.0)
1135         abort ();
1136 
1137       if (option_trace >= 2)
1138         printf ("at %ld   size=%ld  %.9f   size=%ld mul %.9f\n",
1139                 (long) size,
1140                 (long) mulmod_size, tk,
1141                 (long) mul_size, tm);
1142 
1143       size = mulmod_size;
1144 
1145       if (tk < tm)
1146         {
1147 	  *p->p_threshold = s.size;
1148 	  print_define (p->threshold_name, *p->p_threshold);
1149 	  break;
1150         }
1151     }
1152 }
1153 
1154 
1155 
1156 /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
1157    giving wrong results.  */
1158 void
1159 tune_mul_n (void)
1160 {
1161   static struct param_t  param;
1162 
1163   param.function = speed_mpn_mul_n;
1164 
1165   param.name = "MUL_TOOM22_THRESHOLD";
1166   param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
1167   param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
1168   one (&mul_toom22_threshold, &param);
1169 
1170   param.name = "MUL_TOOM33_THRESHOLD";
1171   param.min_size = MAX (mul_toom22_threshold, MPN_TOOM33_MUL_MINSIZE);
1172   param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
1173   one (&mul_toom33_threshold, &param);
1174 
1175   param.name = "MUL_TOOM44_THRESHOLD";
1176   param.min_size = MAX (mul_toom33_threshold, MPN_TOOM44_MUL_MINSIZE);
1177   param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
1178   one (&mul_toom44_threshold, &param);
1179 
1180   param.name = "MUL_TOOM6H_THRESHOLD";
1181   param.min_size = MAX (mul_toom44_threshold, MPN_TOOM6H_MUL_MINSIZE);
1182   param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
1183   one (&mul_toom6h_threshold, &param);
1184 
1185   param.name = "MUL_TOOM8H_THRESHOLD";
1186   param.min_size = MAX (mul_toom6h_threshold, MPN_TOOM8H_MUL_MINSIZE);
1187   param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
1188   one (&mul_toom8h_threshold, &param);
1189 
1190   /* disabled until tuned */
1191   MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
1192 }
1193 
1194 void
1195 tune_mul (void)
1196 {
1197   static struct param_t  param;
1198   mp_size_t thres;
1199 
1200   param.noprint = 1;
1201 
1202   param.function = speed_mpn_toom32_for_toom43_mul;
1203   param.function2 = speed_mpn_toom43_for_toom32_mul;
1204   param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
1205   param.min_size = MPN_TOOM43_MUL_MINSIZE;
1206   one (&thres, &param);
1207   mul_toom32_to_toom43_threshold = 17*thres/24;
1208   print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
1209 
1210   param.function = speed_mpn_toom32_for_toom53_mul;
1211   param.function2 = speed_mpn_toom53_for_toom32_mul;
1212   param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
1213   param.min_size = MPN_TOOM53_MUL_MINSIZE;
1214   one (&thres, &param);
1215   mul_toom32_to_toom53_threshold = 19*thres/30;
1216   print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
1217 
1218   param.function = speed_mpn_toom42_for_toom53_mul;
1219   param.function2 = speed_mpn_toom53_for_toom42_mul;
1220   param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
1221   param.min_size = MPN_TOOM53_MUL_MINSIZE;
1222   one (&thres, &param);
1223   mul_toom42_to_toom53_threshold = 11*thres/20;
1224   print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
1225 
1226   param.function = speed_mpn_toom42_mul;
1227   param.function2 = speed_mpn_toom63_mul;
1228   param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
1229   param.min_size = MPN_TOOM63_MUL_MINSIZE;
1230   one (&thres, &param);
1231   mul_toom42_to_toom63_threshold = thres/2;
1232   print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
1233 }
1234 
1235 
1236 void
1237 tune_mullo (void)
1238 {
1239   static struct param_t  param;
1240 
1241   param.function = speed_mpn_mullo_n;
1242 
1243   param.name = "MULLO_BASECASE_THRESHOLD";
1244   param.min_size = 1;
1245   param.min_is_always = 1;
1246   param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
1247   param.stop_factor = 1.5;
1248   param.noprint = 1;
1249   one (&mullo_basecase_threshold, &param);
1250 
1251   param.name = "MULLO_DC_THRESHOLD";
1252   param.min_size = 8;
1253   param.min_is_always = 0;
1254   param.max_size = 1000;
1255   one (&mullo_dc_threshold, &param);
1256 
1257   if (mullo_basecase_threshold >= mullo_dc_threshold)
1258     {
1259       print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
1260       print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
1261     }
1262   else
1263     {
1264       print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
1265       print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
1266     }
1267 
1268 #if WANT_FFT
1269   param.name = "MULLO_MUL_N_THRESHOLD";
1270   param.min_size = mullo_dc_threshold;
1271   param.max_size = 2 * mul_fft_threshold;
1272   param.noprint = 0;
1273   param.step_factor = 0.03;
1274   one (&mullo_mul_n_threshold, &param);
1275 #else
1276   print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
1277                            "without FFT use mullo forever");
1278 #endif
1279 }
1280 
1281 void
1282 tune_mulmod_bnm1 (void)
1283 {
1284   static struct param_t  param;
1285 
1286   param.name = "MULMOD_BNM1_THRESHOLD";
1287   param.function = speed_mpn_mulmod_bnm1;
1288   param.min_size = 4;
1289   param.max_size = 100;
1290   one (&mulmod_bnm1_threshold, &param);
1291 }
1292 
1293 void
1294 tune_sqrmod_bnm1 (void)
1295 {
1296   static struct param_t  param;
1297 
1298   param.name = "SQRMOD_BNM1_THRESHOLD";
1299   param.function = speed_mpn_sqrmod_bnm1;
1300   param.min_size = 4;
1301   param.max_size = 100;
1302   one (&sqrmod_bnm1_threshold, &param);
1303 }
1304 
1305 
1306 /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
1307    is faster only at size==2 then we don't want to bother with extra code
1308    just for that.  Start karatsuba from 4 same as MUL above.  */
1309 
1310 void
1311 tune_sqr (void)
1312 {
1313   /* disabled until tuned */
1314   SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
1315 
1316   if (HAVE_NATIVE_mpn_sqr_basecase)
1317     {
1318       print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
1319       sqr_basecase_threshold = 0;
1320     }
1321   else
1322     {
1323       static struct param_t  param;
1324       param.name = "SQR_BASECASE_THRESHOLD";
1325       param.function = speed_mpn_sqr;
1326       param.min_size = 3;
1327       param.min_is_always = 1;
1328       param.max_size = TUNE_SQR_TOOM2_MAX;
1329       param.noprint = 1;
1330       one (&sqr_basecase_threshold, &param);
1331     }
1332 
1333   {
1334     static struct param_t  param;
1335     param.name = "SQR_TOOM2_THRESHOLD";
1336     param.function = speed_mpn_sqr;
1337     param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
1338     param.max_size = TUNE_SQR_TOOM2_MAX;
1339     param.noprint = 1;
1340     one (&sqr_toom2_threshold, &param);
1341 
1342     if (! HAVE_NATIVE_mpn_sqr_basecase
1343         && sqr_toom2_threshold < sqr_basecase_threshold)
1344       {
1345         /* Karatsuba becomes faster than mul_basecase before
1346            sqr_basecase does.  Arrange for the expression
1347            "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
1348            selects mpn_sqr_basecase in mpn_sqr to be false, by setting
1349            SQR_TOOM2_THRESHOLD to zero, making
1350            SQR_BASECASE_THRESHOLD the toom2 threshold.  */
1351 
1352         sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
1353         SQR_TOOM2_THRESHOLD = 0;
1354 
1355         print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
1356                              "toom2");
1357         print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
1358                              "never sqr_basecase");
1359       }
1360     else
1361       {
1362         if (! HAVE_NATIVE_mpn_sqr_basecase)
1363           print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
1364         print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
1365       }
1366   }
1367 
1368   {
1369     static struct param_t  param;
1370     mp_size_t toom3_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
1371 
1372     param.function = speed_mpn_sqr;
1373 
1374     param.name = "SQR_TOOM3_THRESHOLD";
1375     param.min_size = MAX (toom3_start, MPN_TOOM3_SQR_MINSIZE);
1376     param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
1377     one (&sqr_toom3_threshold, &param);
1378 
1379     param.name = "SQR_TOOM4_THRESHOLD";
1380     param.min_size = MAX (sqr_toom3_threshold, MPN_TOOM4_SQR_MINSIZE);
1381     param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
1382     one (&sqr_toom4_threshold, &param);
1383 
1384     param.name = "SQR_TOOM6_THRESHOLD";
1385     param.min_size = MAX (sqr_toom4_threshold, MPN_TOOM6_SQR_MINSIZE);
1386     param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
1387     one (&sqr_toom6_threshold, &param);
1388 
1389     param.name = "SQR_TOOM8_THRESHOLD";
1390     param.min_size = MAX (sqr_toom6_threshold, MPN_TOOM8_SQR_MINSIZE);
1391     param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
1392     one (&sqr_toom8_threshold, &param);
1393   }
1394 }
1395 
1396 
1397 void
1398 tune_dc_div (void)
1399 {
1400   s.r = 0;		/* clear to make speed function do 2n/n */
1401   {
1402     static struct param_t  param;
1403     param.name = "DC_DIV_QR_THRESHOLD";
1404     param.function = speed_mpn_sbpi1_div_qr;
1405     param.function2 = speed_mpn_dcpi1_div_qr;
1406     param.min_size = 6;
1407     one (&dc_div_qr_threshold, &param);
1408   }
1409   {
1410     static struct param_t  param;
1411     param.name = "DC_DIVAPPR_Q_THRESHOLD";
1412     param.function = speed_mpn_sbpi1_divappr_q;
1413     param.function2 = speed_mpn_dcpi1_divappr_q;
1414     param.min_size = 6;
1415     one (&dc_divappr_q_threshold, &param);
1416   }
1417 }
1418 
1419 static double
1420 speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
1421 {
1422   if (s->size < DC_DIV_QR_THRESHOLD)
1423     return speed_mpn_sbpi1_div_qr (s);
1424   else
1425     return speed_mpn_dcpi1_div_qr (s);
1426 }
1427 
1428 void
1429 tune_mu_div (void)
1430 {
1431   s.r = 0;		/* clear to make speed function do 2n/n */
1432   {
1433     static struct param_t  param;
1434     param.name = "MU_DIV_QR_THRESHOLD";
1435     param.function = speed_mpn_dcpi1_div_qr;
1436     param.function2 = speed_mpn_mu_div_qr;
1437     param.min_size = 6;
1438     param.max_size = 5000;
1439     param.step_factor = 0.02;
1440     one (&mu_div_qr_threshold, &param);
1441   }
1442   {
1443     static struct param_t  param;
1444     param.name = "MU_DIVAPPR_Q_THRESHOLD";
1445     param.function = speed_mpn_dcpi1_divappr_q;
1446     param.function2 = speed_mpn_mu_divappr_q;
1447     param.min_size = 6;
1448     param.max_size = 5000;
1449     param.step_factor = 0.02;
1450     one (&mu_divappr_q_threshold, &param);
1451   }
1452   {
1453     static struct param_t  param;
1454     param.name = "MUPI_DIV_QR_THRESHOLD";
1455     param.function = speed_mpn_sbordcpi1_div_qr;
1456     param.function2 = speed_mpn_mupi_div_qr;
1457     param.min_size = 6;
1458     param.min_is_always = 1;
1459     param.max_size = 1000;
1460     param.step_factor = 0.02;
1461     one (&mupi_div_qr_threshold, &param);
1462   }
1463 }
1464 
1465 void
1466 tune_dc_bdiv (void)
1467 {
1468   s.r = 0;		/* clear to make speed function do 2n/n*/
1469   {
1470     static struct param_t  param;
1471     param.name = "DC_BDIV_QR_THRESHOLD";
1472     param.function = speed_mpn_sbpi1_bdiv_qr;
1473     param.function2 = speed_mpn_dcpi1_bdiv_qr;
1474     param.min_size = 4;
1475     one (&dc_bdiv_qr_threshold, &param);
1476   }
1477   {
1478     static struct param_t  param;
1479     param.name = "DC_BDIV_Q_THRESHOLD";
1480     param.function = speed_mpn_sbpi1_bdiv_q;
1481     param.function2 = speed_mpn_dcpi1_bdiv_q;
1482     param.min_size = 4;
1483     one (&dc_bdiv_q_threshold, &param);
1484   }
1485 }
1486 
1487 void
1488 tune_mu_bdiv (void)
1489 {
1490   s.r = 0;		/* clear to make speed function do 2n/n*/
1491   {
1492     static struct param_t  param;
1493     param.name = "MU_BDIV_QR_THRESHOLD";
1494     param.function = speed_mpn_dcpi1_bdiv_qr;
1495     param.function2 = speed_mpn_mu_bdiv_qr;
1496     param.min_size = 4;
1497     param.max_size = 5000;
1498     param.step_factor = 0.02;
1499     one (&mu_bdiv_qr_threshold, &param);
1500   }
1501   {
1502     static struct param_t  param;
1503     param.name = "MU_BDIV_Q_THRESHOLD";
1504     param.function = speed_mpn_dcpi1_bdiv_q;
1505     param.function2 = speed_mpn_mu_bdiv_q;
1506     param.min_size = 4;
1507     param.max_size = 5000;
1508     param.step_factor = 0.02;
1509     one (&mu_bdiv_q_threshold, &param);
1510   }
1511 }
1512 
1513 void
1514 tune_invertappr (void)
1515 {
1516   static struct param_t  param;
1517 
1518   param.function = speed_mpn_ni_invertappr;
1519   param.name = "INV_MULMOD_BNM1_THRESHOLD";
1520   param.min_size = 4;
1521   one (&inv_mulmod_bnm1_threshold, &param);
1522 
1523   param.function = speed_mpn_invertappr;
1524   param.name = "INV_NEWTON_THRESHOLD";
1525   param.min_size = 3;
1526   one (&inv_newton_threshold, &param);
1527 }
1528 
1529 void
1530 tune_invert (void)
1531 {
1532   static struct param_t  param;
1533 
1534   param.function = speed_mpn_invert;
1535   param.name = "INV_APPR_THRESHOLD";
1536   param.min_size = 3;
1537   one (&inv_appr_threshold, &param);
1538 }
1539 
1540 void
1541 tune_binvert (void)
1542 {
1543   static struct param_t  param;
1544 
1545   param.function = speed_mpn_binvert;
1546   param.name = "BINV_NEWTON_THRESHOLD";
1547   param.min_size = 8;		/* pointless with smaller operands */
1548   one (&binv_newton_threshold, &param);
1549 }
1550 
1551 void
1552 tune_redc (void)
1553 {
1554 #define TUNE_REDC_2_MAX 100
1555 #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
1556 #define WANT_REDC_2 1
1557 #endif
1558 
1559 #if WANT_REDC_2
1560   {
1561     static struct param_t  param;
1562     param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1563     param.function = speed_mpn_redc_1;
1564     param.function2 = speed_mpn_redc_2;
1565     param.min_size = 1;
1566     param.min_is_always = 1;
1567     param.max_size = TUNE_REDC_2_MAX;
1568     param.noprint = 1;
1569     one (&redc_1_to_redc_2_threshold, &param);
1570   }
1571   {
1572     static struct param_t  param;
1573     param.name = "REDC_2_TO_REDC_N_THRESHOLD";
1574     param.function = speed_mpn_redc_2;
1575     param.function2 = speed_mpn_redc_n;
1576     param.min_size = 16;
1577     param.noprint = 1;
1578     one (&redc_2_to_redc_n_threshold, &param);
1579   }
1580   if (redc_1_to_redc_2_threshold >= TUNE_REDC_2_MAX - 1)
1581     {
1582       /* Disable REDC_2.  This is not supposed to happen.  */
1583       print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
1584       print_define_remark ("REDC_2_TO_REDC_N_THRESHOLD", 0, "anomaly: never REDC_2");
1585     }
1586   else
1587     {
1588       print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
1589       print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
1590     }
1591 #else
1592   {
1593     static struct param_t  param;
1594     param.name = "REDC_1_TO_REDC_N_THRESHOLD";
1595     param.function = speed_mpn_redc_1;
1596     param.function2 = speed_mpn_redc_n;
1597     param.min_size = 16;
1598     one (&redc_1_to_redc_n_threshold, &param);
1599   }
1600 #endif
1601 }
1602 
1603 void
1604 tune_matrix22_mul (void)
1605 {
1606   static struct param_t  param;
1607   param.name = "MATRIX22_STRASSEN_THRESHOLD";
1608   param.function = speed_mpn_matrix22_mul;
1609   param.min_size = 2;
1610   one (&matrix22_strassen_threshold, &param);
1611 }
1612 
1613 void
1614 tune_hgcd (void)
1615 {
1616   static struct param_t  param;
1617   param.name = "HGCD_THRESHOLD";
1618   param.function = speed_mpn_hgcd;
1619   /* We seem to get strange results for small sizes */
1620   param.min_size = 30;
1621   one (&hgcd_threshold, &param);
1622 }
1623 
1624 void
1625 tune_gcd_dc (void)
1626 {
1627   static struct param_t  param;
1628   param.name = "GCD_DC_THRESHOLD";
1629   param.function = speed_mpn_gcd;
1630   param.min_size = hgcd_threshold;
1631   param.max_size = 3000;
1632   param.step_factor = 0.02;
1633   one (&gcd_dc_threshold, &param);
1634 }
1635 
1636 void
1637 tune_gcdext_dc (void)
1638 {
1639   static struct param_t  param;
1640   param.name = "GCDEXT_DC_THRESHOLD";
1641   param.function = speed_mpn_gcdext;
1642   param.min_size = hgcd_threshold;
1643   param.max_size = 3000;
1644   param.step_factor = 0.02;
1645   one (&gcdext_dc_threshold, &param);
1646 }
1647 
1648 
1649 /* size_extra==1 reflects the fact that with high<divisor one division is
1650    always skipped.  Forcing high<divisor while testing ensures consistency
1651    while stepping through sizes, ie. that size-1 divides will be done each
1652    time.
1653 
1654    min_size==2 and min_is_always are used so that if plain division is only
1655    better at size==1 then don't bother including that code just for that
1656    case, instead go with preinv always and get a size saving.  */
1657 
1658 #define DIV_1_PARAMS                    \
1659   param.check_size = 256;               \
1660   param.min_size = 2;                   \
1661   param.min_is_always = 1;              \
1662   param.data_high = DATA_HIGH_LT_R;     \
1663   param.size_extra = 1;                 \
1664   param.stop_factor = 2.0;
1665 
1666 
1667 double (*tuned_speed_mpn_divrem_1) __GMP_PROTO ((struct speed_params *));
1668 
1669 void
1670 tune_divrem_1 (void)
1671 {
1672   /* plain version by default */
1673   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
1674 
1675   /* No support for tuning native assembler code, do that by hand and put
1676      the results in the .asm file, there's no need for such thresholds to
1677      appear in gmp-mparam.h.  */
1678   if (HAVE_NATIVE_mpn_divrem_1)
1679     return;
1680 
1681   if (GMP_NAIL_BITS != 0)
1682     {
1683       print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
1684                            "no preinv with nails");
1685       print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
1686                            "no preinv with nails");
1687       return;
1688     }
1689 
1690   if (UDIV_PREINV_ALWAYS)
1691     {
1692       print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
1693       print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
1694       return;
1695     }
1696 
1697   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
1698 
1699   /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
1700      a bit out for the fractional part, but that's too bad, the integer part
1701      is more important. */
1702   {
1703     static struct param_t  param;
1704     param.name = "DIVREM_1_NORM_THRESHOLD";
1705     DIV_1_PARAMS;
1706     s.r = randlimb_norm ();
1707     param.function = speed_mpn_divrem_1_tune;
1708     one (&divrem_1_norm_threshold, &param);
1709   }
1710   {
1711     static struct param_t  param;
1712     param.name = "DIVREM_1_UNNORM_THRESHOLD";
1713     DIV_1_PARAMS;
1714     s.r = randlimb_half ();
1715     param.function = speed_mpn_divrem_1_tune;
1716     one (&divrem_1_unnorm_threshold, &param);
1717   }
1718 }
1719 
1720 
1721 void
1722 tune_mod_1 (void)
1723 {
1724   /* No support for tuning native assembler code, do that by hand and put
1725      the results in the .asm file, there's no need for such thresholds to
1726      appear in gmp-mparam.h.  */
1727   if (HAVE_NATIVE_mpn_mod_1)
1728     return;
1729 
1730   if (GMP_NAIL_BITS != 0)
1731     {
1732       print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
1733                            "no preinv with nails");
1734       print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
1735                            "no preinv with nails");
1736       return;
1737     }
1738 
1739   if (UDIV_PREINV_ALWAYS)
1740     {
1741       print_define ("MOD_1_NORM_THRESHOLD", 0L);
1742       print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
1743     }
1744   else
1745     {
1746       {
1747 	static struct param_t  param;
1748 	param.name = "MOD_1_NORM_THRESHOLD";
1749 	DIV_1_PARAMS;
1750 	s.r = randlimb_norm ();
1751 	param.function = speed_mpn_mod_1_tune;
1752 	one (&mod_1_norm_threshold, &param);
1753       }
1754       {
1755 	static struct param_t  param;
1756 	param.name = "MOD_1_UNNORM_THRESHOLD";
1757 	DIV_1_PARAMS;
1758 	s.r = randlimb_half ();
1759 	param.function = speed_mpn_mod_1_tune;
1760 	one (&mod_1_unnorm_threshold, &param);
1761       }
1762     }
1763   {
1764     static struct param_t  param;
1765 
1766     param.check_size = 256;
1767 
1768     s.r = randlimb_norm ();
1769     param.function = speed_mpn_mod_1_tune;
1770 
1771     param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
1772     param.min_size = 2;
1773     one (&mod_1n_to_mod_1_1_threshold, &param);
1774   }
1775   {
1776     static struct param_t  param;
1777 
1778     param.check_size = 256;
1779 
1780     s.r = randlimb_norm () / 5;
1781     param.function = speed_mpn_mod_1_tune;
1782     param.noprint = 1;
1783 
1784     param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
1785     param.min_size = 2;
1786     one (&mod_1u_to_mod_1_1_threshold, &param);
1787 
1788     param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
1789     param.min_size = mod_1u_to_mod_1_1_threshold;
1790     one (&mod_1_1_to_mod_1_2_threshold, &param);
1791 
1792     if (mod_1u_to_mod_1_1_threshold + 2 >= mod_1_1_to_mod_1_2_threshold)
1793       {
1794 	/* Disable mod_1_1, mod_1_2 is always faster.  Measure when to switch
1795 	   (from mod_1_unnorm) to mod_1_2.  */
1796 	mod_1_1_to_mod_1_2_threshold = 0;
1797 
1798 	/* This really measures mod_1u -> mod_1_2 */
1799 	param.min_size = 1;
1800 	one (&mod_1u_to_mod_1_1_threshold, &param);
1801       }
1802     print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
1803 
1804     param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
1805     param.min_size = mod_1_1_to_mod_1_2_threshold;
1806     one (&mod_1_2_to_mod_1_4_threshold, &param);
1807 
1808     if (mod_1_1_to_mod_1_2_threshold + 2 >= mod_1_2_to_mod_1_4_threshold)
1809       {
1810 	/* Disable mod_1_2, mod_1_4 is always faster.  Measure when to switch
1811 	   (from mod_1_unnorm or mod_1_1) to mod_1_4.  */
1812 	mod_1_2_to_mod_1_4_threshold = 0;
1813 
1814 	param.min_size = 1;
1815 	one (&mod_1_1_to_mod_1_2_threshold, &param);
1816       }
1817     print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold, NULL);
1818     print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold, NULL);
1819   }
1820 
1821   {
1822     static struct param_t  param;
1823 
1824     param.check_size = 256;
1825 
1826     param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
1827     s.r = randlimb_norm ();
1828     param.function = speed_mpn_preinv_mod_1;
1829     param.function2 = speed_mpn_mod_1_tune;
1830     param.min_size = 1;
1831     one (&preinv_mod_1_to_mod_1_threshold, &param);
1832   }
1833 }
1834 
1835 
1836 /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
1837    imply that udiv_qrnnd_preinv is worth using, but it seems most
1838    straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
1839    directly.  */
1840 
1841 void
1842 tune_preinv_divrem_1 (void)
1843 {
1844   static struct param_t  param;
1845   speed_function_t  divrem_1;
1846   const char        *divrem_1_name;
1847   double            t1, t2;
1848 
1849   if (GMP_NAIL_BITS != 0)
1850     {
1851       print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
1852       return;
1853     }
1854 
1855   /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
1856      it's faster than mpn_divrem_1.  */
1857   if (HAVE_NATIVE_mpn_preinv_divrem_1)
1858     {
1859       print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
1860       return;
1861     }
1862 
1863   /* If udiv_qrnnd_preinv is the only division method then of course
1864      mpn_preinv_divrem_1 should be used.  */
1865   if (UDIV_PREINV_ALWAYS)
1866     {
1867       print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
1868       return;
1869     }
1870 
1871   /* If we've got an assembler version of mpn_divrem_1, then compare against
1872      that, not the mpn_divrem_1_div generic C.  */
1873   if (HAVE_NATIVE_mpn_divrem_1)
1874     {
1875       divrem_1 = speed_mpn_divrem_1;
1876       divrem_1_name = "mpn_divrem_1";
1877     }
1878   else
1879     {
1880       divrem_1 = speed_mpn_divrem_1_div;
1881       divrem_1_name = "mpn_divrem_1_div";
1882     }
1883 
1884   param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
1885   s.size = 200;                     /* generous but not too big */
1886   /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
1887      since in general that's probably most common, though in fact for a
1888      64-bit limb mp_bases[10].big_base is normalized.  */
1889   s.r = urandom() & (GMP_NUMB_MASK >> 4);
1890   if (s.r == 0) s.r = 123;
1891 
1892   t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
1893   t2 = tuneup_measure (divrem_1, &param, &s);
1894   if (t1 == -1.0 || t2 == -1.0)
1895     {
1896       printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
1897               divrem_1_name, (long) s.size);
1898       abort ();
1899     }
1900   if (option_trace >= 1)
1901     printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
1902             (long) s.size, t1, divrem_1_name, t2);
1903 
1904   print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
1905 }
1906 
1907 
1908 
1909 void
1910 tune_divrem_2 (void)
1911 {
1912   static struct param_t  param;
1913 
1914   /* No support for tuning native assembler code, do that by hand and put
1915      the results in the .asm file, and there's no need for such thresholds
1916      to appear in gmp-mparam.h.  */
1917   if (HAVE_NATIVE_mpn_divrem_2)
1918     return;
1919 
1920   if (GMP_NAIL_BITS != 0)
1921     {
1922       print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
1923                            "no preinv with nails");
1924       return;
1925     }
1926 
1927   if (UDIV_PREINV_ALWAYS)
1928     {
1929       print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
1930       return;
1931     }
1932 
1933   /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
1934      a bit out for the fractional part, but that's too bad, the integer part
1935      is more important.
1936 
1937      min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
1938      code space if plain division is better only at size==2 or size==3. */
1939   param.name = "DIVREM_2_THRESHOLD";
1940   param.check_size = 256;
1941   param.min_size = 4;
1942   param.min_is_always = 1;
1943   param.size_extra = 2;      /* does qsize==nsize-2 divisions */
1944   param.stop_factor = 2.0;
1945 
1946   s.r = randlimb_norm ();
1947   param.function = speed_mpn_divrem_2;
1948   one (&divrem_2_threshold, &param);
1949 }
1950 
1951 
1952 /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
1953    tune for that.  Its speed can differ on odd or even divisor, so take an
1954    average threshold for the two.
1955 
1956    mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
1957    might not vary that way, but don't test this since high<divisor isn't
1958    expected to occur often with small divisors.  */
1959 
1960 void
1961 tune_divexact_1 (void)
1962 {
1963   static struct param_t  param;
1964   mp_size_t  thresh[2], average;
1965   int        low, i;
1966 
1967   /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
1968      full mpn_divrem_1.  */
1969   if (HAVE_NATIVE_mpn_divexact_1)
1970     {
1971       print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
1972       return;
1973     }
1974 
1975   ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
1976 
1977   param.name = "DIVEXACT_1_THRESHOLD";
1978   param.data_high = DATA_HIGH_GE_R;
1979   param.check_size = 256;
1980   param.min_size = 2;
1981   param.stop_factor = 1.5;
1982   param.function  = tuned_speed_mpn_divrem_1;
1983   param.function2 = speed_mpn_divexact_1;
1984   param.noprint = 1;
1985 
1986   print_define_start (param.name);
1987 
1988   for (low = 0; low <= 1; low++)
1989     {
1990       s.r = randlimb_half();
1991       if (low == 0)
1992         s.r |= 1;
1993       else
1994         s.r &= ~CNST_LIMB(7);
1995 
1996       one (&thresh[low], &param);
1997       if (option_trace)
1998         printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
1999 
2000       if (thresh[low] == MP_SIZE_T_MAX)
2001         {
2002           average = MP_SIZE_T_MAX;
2003           goto divexact_1_done;
2004         }
2005     }
2006 
2007   if (option_trace)
2008     {
2009       printf ("average of:");
2010       for (i = 0; i < numberof(thresh); i++)
2011         printf (" %ld", (long) thresh[i]);
2012       printf ("\n");
2013     }
2014 
2015   average = 0;
2016   for (i = 0; i < numberof(thresh); i++)
2017     average += thresh[i];
2018   average /= numberof(thresh);
2019 
2020   /* If divexact turns out to be better as early as 3 limbs, then use it
2021      always, so as to reduce code size and conditional jumps.  */
2022   if (average <= 3)
2023     average = 0;
2024 
2025  divexact_1_done:
2026   print_define_end (param.name, average);
2027 }
2028 
2029 
2030 /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
2031    same as mpn_mod_1, but this might not be true of an assembler
2032    implementation.  The threshold used is an average based on data where a
2033    divide can be skipped and where it can't.
2034 
2035    If modexact turns out to be better as early as 3 limbs, then use it
2036    always, so as to reduce code size and conditional jumps.  */
2037 
2038 void
2039 tune_modexact_1_odd (void)
2040 {
2041   static struct param_t  param;
2042   mp_size_t  thresh_lt, thresh_ge, average;
2043 
2044 #if 0
2045   /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
2046      of a full mpn_mod_1.  */
2047   if (HAVE_NATIVE_mpn_modexact_1_odd)
2048     {
2049       print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
2050       return;
2051     }
2052 #endif
2053 
2054   param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
2055   param.check_size = 256;
2056   param.min_size = 2;
2057   param.stop_factor = 1.5;
2058   param.function  = speed_mpn_modexact_1c_odd;
2059   param.function2 = speed_mpn_mod_1_tune;
2060   param.noprint = 1;
2061   s.r = randlimb_half () | 1;
2062 
2063   print_define_start (param.name);
2064 
2065   param.data_high = DATA_HIGH_LT_R;
2066   one (&thresh_lt, &param);
2067   if (option_trace)
2068     printf ("lt thresh %ld\n", (long) thresh_lt);
2069 
2070   average = thresh_lt;
2071   if (thresh_lt != MP_SIZE_T_MAX)
2072     {
2073       param.data_high = DATA_HIGH_GE_R;
2074       one (&thresh_ge, &param);
2075       if (option_trace)
2076         printf ("ge thresh %ld\n", (long) thresh_ge);
2077 
2078       if (thresh_ge != MP_SIZE_T_MAX)
2079         {
2080           average = (thresh_ge + thresh_lt) / 2;
2081           if (thresh_ge <= 3)
2082             average = 0;
2083         }
2084     }
2085 
2086   print_define_end (param.name, average);
2087 }
2088 
2089 
2090 void
2091 tune_jacobi_base (void)
2092 {
2093   static struct param_t  param;
2094   double   t1, t2, t3;
2095   int      method;
2096 
2097   s.size = GMP_LIMB_BITS * 3 / 4;
2098 
2099   t1 = tuneup_measure (speed_mpn_jacobi_base_1, &param, &s);
2100   if (option_trace >= 1)
2101     printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1);
2102 
2103   t2 = tuneup_measure (speed_mpn_jacobi_base_2, &param, &s);
2104   if (option_trace >= 1)
2105     printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2);
2106 
2107   t3 = tuneup_measure (speed_mpn_jacobi_base_3, &param, &s);
2108   if (option_trace >= 1)
2109     printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3);
2110 
2111   if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0)
2112     {
2113       printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
2114               (long) s.size);
2115       abort ();
2116     }
2117 
2118   if (t1 < t2 && t1 < t3)
2119     method = 1;
2120   else if (t2 < t3)
2121     method = 2;
2122   else
2123     method = 3;
2124 
2125   print_define ("JACOBI_BASE_METHOD", method);
2126 }
2127 
2128 
2129 void
2130 tune_get_str (void)
2131 {
2132   /* Tune for decimal, it being most common.  Some rough testing suggests
2133      other bases are different, but not by very much.  */
2134   s.r = 10;
2135   {
2136     static struct param_t  param;
2137     GET_STR_PRECOMPUTE_THRESHOLD = 0;
2138     param.name = "GET_STR_DC_THRESHOLD";
2139     param.function = speed_mpn_get_str;
2140     param.min_size = 4;
2141     param.max_size = GET_STR_THRESHOLD_LIMIT;
2142     one (&get_str_dc_threshold, &param);
2143   }
2144   {
2145     static struct param_t  param;
2146     param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
2147     param.function = speed_mpn_get_str;
2148     param.min_size = GET_STR_DC_THRESHOLD;
2149     param.max_size = GET_STR_THRESHOLD_LIMIT;
2150     one (&get_str_precompute_threshold, &param);
2151   }
2152 }
2153 
2154 
2155 double
2156 speed_mpn_pre_set_str (struct speed_params *s)
2157 {
2158   unsigned char *str;
2159   mp_ptr     wp;
2160   mp_size_t  wn;
2161   unsigned   i;
2162   int        base;
2163   double     t;
2164   mp_ptr powtab_mem, tp;
2165   powers_t powtab[GMP_LIMB_BITS];
2166   mp_size_t un;
2167   int chars_per_limb;
2168   TMP_DECL;
2169 
2170   SPEED_RESTRICT_COND (s->size >= 1);
2171 
2172   base = s->r == 0 ? 10 : s->r;
2173   SPEED_RESTRICT_COND (base >= 2 && base <= 256);
2174 
2175   TMP_MARK;
2176 
2177   str = TMP_ALLOC (s->size);
2178   for (i = 0; i < s->size; i++)
2179     str[i] = s->xp[i] % base;
2180 
2181   wn = ((mp_size_t) (s->size / mp_bases[base].chars_per_bit_exactly))
2182     / GMP_LIMB_BITS + 2;
2183   SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
2184 
2185   /* use this during development to check wn is big enough */
2186   /*
2187   ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
2188   */
2189 
2190   speed_operand_src (s, (mp_ptr) str, s->size/BYTES_PER_MP_LIMB);
2191   speed_operand_dst (s, wp, wn);
2192   speed_cache_fill (s);
2193 
2194   chars_per_limb = mp_bases[base].chars_per_limb;
2195   un = s->size / chars_per_limb + 1;
2196   powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un));
2197   mpn_set_str_compute_powtab (powtab, powtab_mem, un, base);
2198   tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
2199 
2200   speed_starttime ();
2201   i = s->reps;
2202   do
2203     {
2204       mpn_pre_set_str (wp, str, s->size, powtab, tp);
2205     }
2206   while (--i != 0);
2207   t = speed_endtime ();
2208 
2209   TMP_FREE;
2210   return t;
2211 }
2212 
2213 void
2214 tune_set_str (void)
2215 {
2216   s.r = 10;  /* decimal */
2217   {
2218     static struct param_t  param;
2219     SET_STR_PRECOMPUTE_THRESHOLD = 0;
2220     param.step_factor = 0.01;
2221     param.name = "SET_STR_DC_THRESHOLD";
2222     param.function = speed_mpn_pre_set_str;
2223     param.min_size = 100;
2224     param.max_size = 50000;
2225     one (&set_str_dc_threshold, &param);
2226   }
2227   {
2228     static struct param_t  param;
2229     param.step_factor = 0.02;
2230     param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
2231     param.function = speed_mpn_set_str;
2232     param.min_size = SET_STR_DC_THRESHOLD;
2233     param.max_size = 100000;
2234     one (&set_str_precompute_threshold, &param);
2235   }
2236 }
2237 
2238 
2239 void
2240 tune_fft_mul (void)
2241 {
2242   static struct fft_param_t  param;
2243 
2244   if (option_fft_max_size == 0)
2245     return;
2246 
2247   param.table_name          = "MUL_FFT_TABLE3";
2248   param.threshold_name      = "MUL_FFT_THRESHOLD";
2249   param.p_threshold         = &mul_fft_threshold;
2250   param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
2251   param.p_modf_threshold    = &mul_fft_modf_threshold;
2252   param.first_size          = MUL_TOOM33_THRESHOLD / 2;
2253   param.max_size            = option_fft_max_size;
2254   param.function            = speed_mpn_fft_mul;
2255   param.mul_modf_function   = speed_mpn_mul_fft;
2256   param.mul_function        = speed_mpn_mul_n;
2257   param.sqr = 0;
2258   fft (&param);
2259 }
2260 
2261 
2262 void
2263 tune_fft_sqr (void)
2264 {
2265   static struct fft_param_t  param;
2266 
2267   if (option_fft_max_size == 0)
2268     return;
2269 
2270   param.table_name          = "SQR_FFT_TABLE3";
2271   param.threshold_name      = "SQR_FFT_THRESHOLD";
2272   param.p_threshold         = &sqr_fft_threshold;
2273   param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
2274   param.p_modf_threshold    = &sqr_fft_modf_threshold;
2275   param.first_size          = SQR_TOOM3_THRESHOLD / 2;
2276   param.max_size            = option_fft_max_size;
2277   param.function            = speed_mpn_fft_sqr;
2278   param.mul_modf_function   = speed_mpn_mul_fft_sqr;
2279   param.mul_function        = speed_mpn_sqr;
2280   param.sqr = 1;
2281   fft (&param);
2282 }
2283 
2284 void
2285 all (void)
2286 {
2287   time_t  start_time, end_time;
2288   TMP_DECL;
2289 
2290   TMP_MARK;
2291   SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
2292   SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
2293 
2294   mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
2295   mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
2296 
2297   fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
2298 
2299   speed_time_init ();
2300   fprintf (stderr, "Using: %s\n", speed_time_string);
2301 
2302   fprintf (stderr, "speed_precision %d", speed_precision);
2303   if (speed_unittime == 1.0)
2304     fprintf (stderr, ", speed_unittime 1 cycle");
2305   else
2306     fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
2307   if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
2308     fprintf (stderr, ", CPU freq unknown\n");
2309   else
2310     fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
2311 
2312   fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
2313            DEFAULT_MAX_SIZE, (long) option_fft_max_size);
2314   fprintf (stderr, "\n");
2315 
2316   time (&start_time);
2317   {
2318     struct tm  *tp;
2319     tp = localtime (&start_time);
2320     printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
2321             tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
2322 
2323 #ifdef __GNUC__
2324     /* gcc sub-minor version doesn't seem to come through as a define */
2325     printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
2326 #define PRINTED_COMPILER
2327 #endif
2328 #if defined (__SUNPRO_C)
2329     printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
2330 #define PRINTED_COMPILER
2331 #endif
2332 #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
2333     /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
2334     printf ("MIPSpro C %d.%d.%d */\n",
2335 	    _COMPILER_VERSION / 100,
2336 	    _COMPILER_VERSION / 10 % 10,
2337 	    _COMPILER_VERSION % 10);
2338 #define PRINTED_COMPILER
2339 #endif
2340 #if defined (__DECC) && defined (__DECC_VER)
2341     printf ("DEC C %d */\n", __DECC_VER);
2342 #define PRINTED_COMPILER
2343 #endif
2344 #if ! defined (PRINTED_COMPILER)
2345     printf ("system compiler */\n");
2346 #endif
2347   }
2348   printf ("\n");
2349 
2350   tune_divrem_1 ();
2351   tune_mod_1 ();
2352   tune_preinv_divrem_1 ();
2353   tune_divrem_2 ();
2354   tune_divexact_1 ();
2355   tune_modexact_1_odd ();
2356   printf("\n");
2357 
2358   tune_mul_n ();
2359   printf("\n");
2360 
2361   tune_mul ();
2362   printf("\n");
2363 
2364   tune_sqr ();
2365   printf("\n");
2366 
2367   tune_mulmod_bnm1 ();
2368   tune_sqrmod_bnm1 ();
2369   printf("\n");
2370 
2371   tune_fft_mul ();
2372   printf("\n");
2373 
2374   tune_fft_sqr ();
2375   printf ("\n");
2376 
2377   tune_mullo ();
2378   printf("\n");
2379 
2380   tune_dc_div ();
2381   tune_dc_bdiv ();
2382 
2383   printf("\n");
2384   tune_invertappr ();
2385   tune_invert ();
2386   printf("\n");
2387 
2388   tune_binvert ();
2389   tune_redc ();
2390   printf("\n");
2391 
2392   tune_mu_div ();
2393   tune_mu_bdiv ();
2394   printf("\n");
2395 
2396   tune_matrix22_mul ();
2397   tune_hgcd ();
2398   tune_gcd_dc ();
2399   tune_gcdext_dc ();
2400   tune_jacobi_base ();
2401   printf("\n");
2402 
2403   tune_get_str ();
2404   tune_set_str ();
2405   printf("\n");
2406 
2407   time (&end_time);
2408   printf ("/* Tuneup completed successfully, took %ld seconds */\n",
2409           (long) (end_time - start_time));
2410 
2411   TMP_FREE;
2412 }
2413 
2414 
2415 int
2416 main (int argc, char *argv[])
2417 {
2418   int  opt;
2419 
2420   /* Unbuffered so if output is redirected to a file it isn't lost if the
2421      program is killed part way through.  */
2422   setbuf (stdout, NULL);
2423   setbuf (stderr, NULL);
2424 
2425   while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
2426     {
2427       switch (opt) {
2428       case 'f':
2429         if (optarg[0] == 't')
2430           option_fft_trace = 2;
2431         else
2432           option_fft_max_size = atol (optarg);
2433         break;
2434       case 'o':
2435         speed_option_set (optarg);
2436         break;
2437       case 'p':
2438         speed_precision = atoi (optarg);
2439         break;
2440       case 't':
2441         option_trace++;
2442         break;
2443       case '?':
2444         exit(1);
2445       }
2446     }
2447 
2448   all ();
2449   exit (0);
2450 }
2451