xref: /netbsd-src/external/lgpl3/gmp/dist/tune/tuneup.c (revision c3ab26950fe8540fb553d1d1dcae454bc98e5a25)
1 /* Create tuned thresholds for various algorithms.
2 
3 Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010,
4 2011, 2012 Free Software Foundation, Inc.
5 
6 This file is part of the GNU MP Library.
7 
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12 
13 The GNU MP Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16 License for more details.
17 
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
20 
21 
22 /* Usage: tuneup [-t] [-t] [-p precision]
23 
24    -t turns on some diagnostic traces, a second -t turns on more traces.
25 
26    Notes:
27 
28    The code here isn't a vision of loveliness, mainly because it's subject
29    to ongoing changes according to new things wanting to be tuned, and
30    practical requirements of systems tested.
31 
32    Sometimes running the program twice produces slightly different results.
33    This is probably because there's so little separating algorithms near
34    their crossover, and on that basis it should make little or no difference
35    to the final speed of the relevant routines, but nothing has been done to
36    check that carefully.
37 
38    Algorithm:
39 
40    The thresholds are determined as follows.  A crossover may not be a
41    single size but rather a range where it oscillates between method A or
42    method B faster.  If the threshold is set making B used where A is faster
43    (or vice versa) that's bad.  Badness is the percentage time lost and
44    total badness is the sum of this over all sizes measured.  The threshold
45    is set to minimize total badness.
46 
47    Suppose, as sizes increase, method B becomes faster than method A.  The
48    effect of the rule is that, as you look at increasing sizes, isolated
49    points where B is faster are ignored, but when it's consistently faster,
50    or faster on balance, then the threshold is set there.  The same result
51    is obtained thinking in the other direction of A becoming faster at
52    smaller sizes.
53 
54    In practice the thresholds tend to be chosen to bring on the next
55    algorithm fairly quickly.
56 
57    This rule is attractive because it's got a basis in reason and is fairly
58    easy to implement, but no work has been done to actually compare it in
59    absolute terms to other possibilities.
60 
61    Implementation:
62 
63    In a normal library build the thresholds are constants.  To tune them
64    selected objects are recompiled with the thresholds as global variables
65    instead.  #define TUNE_PROGRAM_BUILD does this, with help from code at
66    the end of gmp-impl.h, and rules in tune/Makefile.am.
67 
68    MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n.  The
69    threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
70    level, but recurse into the basecase.
71 
72    MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
73    Other routines in turn will make use of both of those.  Naturally the
74    dependants must be tuned first.
75 
76    In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
77    just a threshold based on comparing two routines (mpn_divrem_1 and
78    mpn_divexact_1), and no further use of the value determined.
79 
80    Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
81    just comparisons between certain routines on representative data.
82 
83    Shortcuts are applied when native (assembler) versions of routines exist.
84    For instance a native mpn_sqr_basecase is assumed to be always faster
85    than mpn_mul_basecase, with no measuring.
86 
87    No attempt is made to tune within assembler routines, for instance
88    DIVREM_1_NORM_THRESHOLD.  An assembler mpn_divrem_1 is expected to be
89    written and tuned all by hand.  Assembler routines that might have hard
90    limits are recompiled though, to make them accept a bigger range of sizes
91    than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
92 
93    Limitations:
94 
95    The FFTs aren't subject to the same badness rule as the other thresholds,
96    so each k is probably being brought on a touch early.  This isn't likely
97    to make a difference, and the simpler probing means fewer tests.
98 
99 */
100 
101 #define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
102 
103 #include "config.h"
104 
105 #include <math.h>
106 #include <stdio.h>
107 #include <stdlib.h>
108 #include <time.h>
109 #if HAVE_UNISTD_H
110 #include <unistd.h>
111 #endif
112 
113 #include "gmp.h"
114 #include "gmp-impl.h"
115 #include "longlong.h"
116 
117 #include "tests.h"
118 #include "speed.h"
119 
120 #if !HAVE_DECL_OPTARG
121 extern char *optarg;
122 extern int optind, opterr;
123 #endif
124 
125 
126 #define DEFAULT_MAX_SIZE   1000  /* limbs */
127 
128 #if WANT_FFT
129 mp_size_t  option_fft_max_size = 50000;  /* limbs */
130 #else
131 mp_size_t  option_fft_max_size = 0;
132 #endif
133 int        option_trace = 0;
134 int        option_fft_trace = 0;
135 struct speed_params  s;
136 
137 struct dat_t {
138   mp_size_t  size;
139   double     d;
140 } *dat = NULL;
141 int  ndat = 0;
142 int  allocdat = 0;
143 
144 /* This is not defined if mpn_sqr_basecase doesn't declare a limit.  In that
145    case use zero here, which for params.max_size means no limit.  */
146 #ifndef TUNE_SQR_TOOM2_MAX
147 #define TUNE_SQR_TOOM2_MAX  0
148 #endif
149 
150 mp_size_t  mul_toom22_threshold         = MP_SIZE_T_MAX;
151 mp_size_t  mul_toom33_threshold         = MUL_TOOM33_THRESHOLD_LIMIT;
152 mp_size_t  mul_toom44_threshold         = MUL_TOOM44_THRESHOLD_LIMIT;
153 mp_size_t  mul_toom6h_threshold         = MUL_TOOM6H_THRESHOLD_LIMIT;
154 mp_size_t  mul_toom8h_threshold         = MUL_TOOM8H_THRESHOLD_LIMIT;
155 mp_size_t  mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
156 mp_size_t  mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
157 mp_size_t  mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
158 mp_size_t  mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
159 mp_size_t  mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX;
160 mp_size_t  mul_fft_threshold            = MP_SIZE_T_MAX;
161 mp_size_t  mul_fft_modf_threshold       = MP_SIZE_T_MAX;
162 mp_size_t  sqr_basecase_threshold       = MP_SIZE_T_MAX;
163 mp_size_t  sqr_toom2_threshold
164   = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
165 mp_size_t  sqr_toom3_threshold          = SQR_TOOM3_THRESHOLD_LIMIT;
166 mp_size_t  sqr_toom4_threshold          = SQR_TOOM4_THRESHOLD_LIMIT;
167 mp_size_t  sqr_toom6_threshold          = SQR_TOOM6_THRESHOLD_LIMIT;
168 mp_size_t  sqr_toom8_threshold          = SQR_TOOM8_THRESHOLD_LIMIT;
169 mp_size_t  sqr_fft_threshold            = MP_SIZE_T_MAX;
170 mp_size_t  sqr_fft_modf_threshold       = MP_SIZE_T_MAX;
171 mp_size_t  mullo_basecase_threshold     = MP_SIZE_T_MAX;
172 mp_size_t  mullo_dc_threshold           = MP_SIZE_T_MAX;
173 mp_size_t  mullo_mul_n_threshold        = MP_SIZE_T_MAX;
174 mp_size_t  mulmid_toom42_threshold      = MP_SIZE_T_MAX;
175 mp_size_t  mulmod_bnm1_threshold        = MP_SIZE_T_MAX;
176 mp_size_t  sqrmod_bnm1_threshold        = MP_SIZE_T_MAX;
177 mp_size_t  div_qr_2_pi2_threshold       = MP_SIZE_T_MAX;
178 mp_size_t  dc_div_qr_threshold          = MP_SIZE_T_MAX;
179 mp_size_t  dc_divappr_q_threshold       = MP_SIZE_T_MAX;
180 mp_size_t  mu_div_qr_threshold          = MP_SIZE_T_MAX;
181 mp_size_t  mu_divappr_q_threshold       = MP_SIZE_T_MAX;
182 mp_size_t  mupi_div_qr_threshold        = MP_SIZE_T_MAX;
183 mp_size_t  mu_div_q_threshold           = MP_SIZE_T_MAX;
184 mp_size_t  dc_bdiv_qr_threshold         = MP_SIZE_T_MAX;
185 mp_size_t  dc_bdiv_q_threshold          = MP_SIZE_T_MAX;
186 mp_size_t  mu_bdiv_qr_threshold         = MP_SIZE_T_MAX;
187 mp_size_t  mu_bdiv_q_threshold          = MP_SIZE_T_MAX;
188 mp_size_t  inv_mulmod_bnm1_threshold    = MP_SIZE_T_MAX;
189 mp_size_t  inv_newton_threshold         = MP_SIZE_T_MAX;
190 mp_size_t  inv_appr_threshold           = MP_SIZE_T_MAX;
191 mp_size_t  binv_newton_threshold        = MP_SIZE_T_MAX;
192 mp_size_t  redc_1_to_redc_2_threshold   = MP_SIZE_T_MAX;
193 mp_size_t  redc_1_to_redc_n_threshold   = MP_SIZE_T_MAX;
194 mp_size_t  redc_2_to_redc_n_threshold   = MP_SIZE_T_MAX;
195 mp_size_t  matrix22_strassen_threshold  = MP_SIZE_T_MAX;
196 mp_size_t  hgcd_threshold               = MP_SIZE_T_MAX;
197 mp_size_t  hgcd_appr_threshold          = MP_SIZE_T_MAX;
198 mp_size_t  hgcd_reduce_threshold        = MP_SIZE_T_MAX;
199 mp_size_t  gcd_dc_threshold             = MP_SIZE_T_MAX;
200 mp_size_t  gcdext_dc_threshold          = MP_SIZE_T_MAX;
201 mp_size_t  divrem_1_norm_threshold      = MP_SIZE_T_MAX;
202 mp_size_t  divrem_1_unnorm_threshold    = MP_SIZE_T_MAX;
203 mp_size_t  mod_1_norm_threshold         = MP_SIZE_T_MAX;
204 mp_size_t  mod_1_unnorm_threshold       = MP_SIZE_T_MAX;
205 int	   mod_1_1p_method		= 0;
206 mp_size_t  mod_1n_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
207 mp_size_t  mod_1u_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
208 mp_size_t  mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
209 mp_size_t  mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
210 mp_size_t  preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
211 mp_size_t  divrem_2_threshold           = MP_SIZE_T_MAX;
212 mp_size_t  get_str_dc_threshold         = MP_SIZE_T_MAX;
213 mp_size_t  get_str_precompute_threshold = MP_SIZE_T_MAX;
214 mp_size_t  set_str_dc_threshold         = MP_SIZE_T_MAX;
215 mp_size_t  set_str_precompute_threshold = MP_SIZE_T_MAX;
216 mp_size_t  fac_odd_threshold            = 0;
217 mp_size_t  fac_dsc_threshold            = FAC_DSC_THRESHOLD_LIMIT;
218 
219 mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
220 mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
221 
222 struct param_t {
223   const char        *name;
224   speed_function_t  function;
225   speed_function_t  function2;
226   double            step_factor;    /* how much to step relatively */
227   int               step;           /* how much to step absolutely */
228   double            function_fudge; /* multiplier for "function" speeds */
229   int               stop_since_change;
230   double            stop_factor;
231   mp_size_t         min_size;
232   int               min_is_always;
233   mp_size_t         max_size;
234   mp_size_t         check_size;
235   mp_size_t         size_extra;
236 
237 #define DATA_HIGH_LT_R  1
238 #define DATA_HIGH_GE_R  2
239   int               data_high;
240 
241   int               noprint;
242 };
243 
244 
245 /* These are normally undefined when false, which suits "#if" fine.
246    But give them zero values so they can be used in plain C "if"s.  */
247 #ifndef UDIV_PREINV_ALWAYS
248 #define UDIV_PREINV_ALWAYS 0
249 #endif
250 #ifndef HAVE_NATIVE_mpn_divexact_1
251 #define HAVE_NATIVE_mpn_divexact_1 0
252 #endif
253 #ifndef HAVE_NATIVE_mpn_divrem_1
254 #define HAVE_NATIVE_mpn_divrem_1 0
255 #endif
256 #ifndef HAVE_NATIVE_mpn_divrem_2
257 #define HAVE_NATIVE_mpn_divrem_2 0
258 #endif
259 #ifndef HAVE_NATIVE_mpn_mod_1
260 #define HAVE_NATIVE_mpn_mod_1 0
261 #endif
262 #ifndef HAVE_NATIVE_mpn_mod_1_1p
263 #define HAVE_NATIVE_mpn_mod_1_1p 0
264 #endif
265 #ifndef HAVE_NATIVE_mpn_modexact_1_odd
266 #define HAVE_NATIVE_mpn_modexact_1_odd 0
267 #endif
268 #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
269 #define HAVE_NATIVE_mpn_preinv_divrem_1 0
270 #endif
271 #ifndef HAVE_NATIVE_mpn_preinv_mod_1
272 #define HAVE_NATIVE_mpn_preinv_mod_1 0
273 #endif
274 #ifndef HAVE_NATIVE_mpn_sqr_basecase
275 #define HAVE_NATIVE_mpn_sqr_basecase 0
276 #endif
277 
278 
279 #define MAX3(a,b,c)  MAX (MAX (a, b), c)
280 
281 mp_limb_t
282 randlimb_norm (void)
283 {
284   mp_limb_t  n;
285   mpn_random (&n, 1);
286   n |= GMP_NUMB_HIGHBIT;
287   return n;
288 }
289 
290 #define GMP_NUMB_HALFMASK  ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
291 
292 mp_limb_t
293 randlimb_half (void)
294 {
295   mp_limb_t  n;
296   mpn_random (&n, 1);
297   n &= GMP_NUMB_HALFMASK;
298   n += (n==0);
299   return n;
300 }
301 
302 
303 /* Add an entry to the end of the dat[] array, reallocing to make it bigger
304    if necessary.  */
305 void
306 add_dat (mp_size_t size, double d)
307 {
308 #define ALLOCDAT_STEP  500
309 
310   ASSERT_ALWAYS (ndat <= allocdat);
311 
312   if (ndat == allocdat)
313     {
314       dat = (struct dat_t *) __gmp_allocate_or_reallocate
315         (dat, allocdat * sizeof(dat[0]),
316          (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
317       allocdat += ALLOCDAT_STEP;
318     }
319 
320   dat[ndat].size = size;
321   dat[ndat].d = d;
322   ndat++;
323 }
324 
325 
326 /* Return the threshold size based on the data accumulated. */
327 mp_size_t
328 analyze_dat (int final)
329 {
330   double  x, min_x;
331   int     j, min_j;
332 
333   /* If the threshold is set at dat[0].size, any positive values are bad. */
334   x = 0.0;
335   for (j = 0; j < ndat; j++)
336     if (dat[j].d > 0.0)
337       x += dat[j].d;
338 
339   if (option_trace >= 2 && final)
340     {
341       printf ("\n");
342       printf ("x is the sum of the badness from setting thresh at given size\n");
343       printf ("  (minimum x is sought)\n");
344       printf ("size=%ld  first x=%.4f\n", (long) dat[j].size, x);
345     }
346 
347   min_x = x;
348   min_j = 0;
349 
350 
351   /* When stepping to the next dat[j].size, positive values are no longer
352      bad (so subtracted), negative values become bad (so add the absolute
353      value, meaning subtract). */
354   for (j = 0; j < ndat; x -= dat[j].d, j++)
355     {
356       if (option_trace >= 2 && final)
357         printf ("size=%ld  x=%.4f\n", (long) dat[j].size, x);
358 
359       if (x < min_x)
360         {
361           min_x = x;
362           min_j = j;
363         }
364     }
365 
366   return min_j;
367 }
368 
369 
370 /* Measuring for recompiled mpn/generic/divrem_1.c, mpn/generic/mod_1.c
371  * and mpz/fac_ui.c */
372 
373 mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
374 mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t);
375 void mpz_fac_ui_tune (mpz_ptr, unsigned long);
376 
377 double
378 speed_mpn_mod_1_tune (struct speed_params *s)
379 {
380   SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
381 }
382 double
383 speed_mpn_divrem_1_tune (struct speed_params *s)
384 {
385   SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
386 }
387 double
388 speed_mpz_fac_ui_tune (struct speed_params *s)
389 {
390   SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
391 }
392 
393 
394 double
395 tuneup_measure (speed_function_t fun,
396                 const struct param_t *param,
397                 struct speed_params *s)
398 {
399   static struct param_t  dummy;
400   double   t;
401   TMP_DECL;
402 
403   if (! param)
404     param = &dummy;
405 
406   s->size += param->size_extra;
407 
408   TMP_MARK;
409   SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
410   SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
411 
412   mpn_random (s->xp, s->size);
413   mpn_random (s->yp, s->size);
414 
415   switch (param->data_high) {
416   case DATA_HIGH_LT_R:
417     s->xp[s->size-1] %= s->r;
418     s->yp[s->size-1] %= s->r;
419     break;
420   case DATA_HIGH_GE_R:
421     s->xp[s->size-1] |= s->r;
422     s->yp[s->size-1] |= s->r;
423     break;
424   }
425 
426   t = speed_measure (fun, s);
427 
428   s->size -= param->size_extra;
429 
430   TMP_FREE;
431   return t;
432 }
433 
434 
435 #define PRINT_WIDTH  31
436 
437 void
438 print_define_start (const char *name)
439 {
440   printf ("#define %-*s  ", PRINT_WIDTH, name);
441   if (option_trace)
442     printf ("...\n");
443 }
444 
445 void
446 print_define_end_remark (const char *name, mp_size_t value, const char *remark)
447 {
448   if (option_trace)
449     printf ("#define %-*s  ", PRINT_WIDTH, name);
450 
451   if (value == MP_SIZE_T_MAX)
452     printf ("MP_SIZE_T_MAX");
453   else
454     printf ("%5ld", (long) value);
455 
456   if (remark != NULL)
457     printf ("  /* %s */", remark);
458   printf ("\n");
459   fflush (stdout);
460 }
461 
462 void
463 print_define_end (const char *name, mp_size_t value)
464 {
465   const char  *remark;
466   if (value == MP_SIZE_T_MAX)
467     remark = "never";
468   else if (value == 0)
469     remark = "always";
470   else
471     remark = NULL;
472   print_define_end_remark (name, value, remark);
473 }
474 
475 void
476 print_define (const char *name, mp_size_t value)
477 {
478   print_define_start (name);
479   print_define_end (name, value);
480 }
481 
482 void
483 print_define_remark (const char *name, mp_size_t value, const char *remark)
484 {
485   print_define_start (name);
486   print_define_end_remark (name, value, remark);
487 }
488 
489 
490 void
491 one (mp_size_t *threshold, struct param_t *param)
492 {
493   int  since_positive, since_thresh_change;
494   int  thresh_idx, new_thresh_idx;
495 
496 #define DEFAULT(x,n)  do { if (! (x))  (x) = (n); } while (0)
497 
498   DEFAULT (param->function_fudge, 1.0);
499   DEFAULT (param->function2, param->function);
500   DEFAULT (param->step_factor, 0.01);  /* small steps by default */
501   DEFAULT (param->step, 1);            /* small steps by default */
502   DEFAULT (param->stop_since_change, 80);
503   DEFAULT (param->stop_factor, 1.2);
504   DEFAULT (param->min_size, 10);
505   DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
506 
507   if (param->check_size != 0)
508     {
509       double   t1, t2;
510       s.size = param->check_size;
511 
512       *threshold = s.size+1;
513       t1 = tuneup_measure (param->function, param, &s);
514 
515       *threshold = s.size;
516       t2 = tuneup_measure (param->function2, param, &s);
517       if (t1 == -1.0 || t2 == -1.0)
518         {
519           printf ("Oops, can't run both functions at size %ld\n",
520                   (long) s.size);
521           abort ();
522         }
523       t1 *= param->function_fudge;
524 
525       /* ask that t2 is at least 4% below t1 */
526       if (t1 < t2*1.04)
527         {
528           if (option_trace)
529             printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
530           *threshold = MP_SIZE_T_MAX;
531           if (! param->noprint)
532             print_define (param->name, *threshold);
533           return;
534         }
535 
536       if (option_trace >= 2)
537         printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
538                 (long) s.size, t1, t2);
539     }
540 
541   if (! param->noprint || option_trace)
542     print_define_start (param->name);
543 
544   ndat = 0;
545   since_positive = 0;
546   since_thresh_change = 0;
547   thresh_idx = 0;
548 
549   if (option_trace >= 2)
550     {
551       printf ("             algorithm-A  algorithm-B   ratio  possible\n");
552       printf ("              (seconds)    (seconds)    diff    thresh\n");
553     }
554 
555   for (s.size = param->min_size;
556        s.size < param->max_size;
557        s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
558     {
559       double   ti, tiplus1, d;
560 
561       /*
562         FIXME: check minimum size requirements are met, possibly by just
563         checking for the -1 returns from the speed functions.
564       */
565 
566       /* using method A at this size */
567       *threshold = s.size+1;
568       ti = tuneup_measure (param->function, param, &s);
569       if (ti == -1.0)
570         abort ();
571       ti *= param->function_fudge;
572 
573       /* using method B at this size */
574       *threshold = s.size;
575       tiplus1 = tuneup_measure (param->function2, param, &s);
576       if (tiplus1 == -1.0)
577         abort ();
578 
579       /* Calculate the fraction by which the one or the other routine is
580          slower.  */
581       if (tiplus1 >= ti)
582         d = (tiplus1 - ti) / tiplus1;  /* negative */
583       else
584         d = (tiplus1 - ti) / ti;       /* positive */
585 
586       add_dat (s.size, d);
587 
588       new_thresh_idx = analyze_dat (0);
589 
590       if (option_trace >= 2)
591         printf ("size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
592                 (long) s.size, ti, tiplus1, d,
593                 ti > tiplus1 ? '#' : ' ',
594                 (long) dat[new_thresh_idx].size);
595 
596       /* Stop if the last time method i was faster was more than a
597          certain number of measurements ago.  */
598 #define STOP_SINCE_POSITIVE  200
599       if (d >= 0)
600         since_positive = 0;
601       else
602         if (++since_positive > STOP_SINCE_POSITIVE)
603           {
604             if (option_trace >= 1)
605               printf ("stopped due to since_positive (%d)\n",
606                       STOP_SINCE_POSITIVE);
607             break;
608           }
609 
610       /* Stop if method A has become slower by a certain factor. */
611       if (ti >= tiplus1 * param->stop_factor)
612         {
613           if (option_trace >= 1)
614             printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
615                     param->stop_factor);
616           break;
617         }
618 
619       /* Stop if the threshold implied hasn't changed in a certain
620          number of measurements.  (It's this condition that usually
621          stops the loop.) */
622       if (thresh_idx != new_thresh_idx)
623         since_thresh_change = 0, thresh_idx = new_thresh_idx;
624       else
625         if (++since_thresh_change > param->stop_since_change)
626           {
627             if (option_trace >= 1)
628               printf ("stopped due to since_thresh_change (%d)\n",
629                       param->stop_since_change);
630             break;
631           }
632 
633       /* Stop if the threshold implied is more than a certain number of
634          measurements ago.  */
635 #define STOP_SINCE_AFTER   500
636       if (ndat - thresh_idx > STOP_SINCE_AFTER)
637         {
638           if (option_trace >= 1)
639             printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
640                     STOP_SINCE_AFTER);
641           break;
642         }
643 
644       /* Stop when the size limit is reached before the end of the
645          crossover, but only show this as an error for >= the default max
646          size.  FIXME: Maybe should make it a param choice whether this is
647          an error.  */
648       if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
649         {
650           fprintf (stderr, "%s\n", param->name);
651           fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
652                    (long) dat[0].size, (long) dat[ndat-1].size, ndat);
653           fprintf (stderr, "    max size reached before end of crossover\n");
654           break;
655         }
656     }
657 
658   if (option_trace >= 1)
659     printf ("sizes %ld to %ld total %d measurements\n",
660             (long) dat[0].size, (long) dat[ndat-1].size, ndat);
661 
662   *threshold = dat[analyze_dat (1)].size;
663 
664   if (param->min_is_always)
665     {
666       if (*threshold == param->min_size)
667         *threshold = 0;
668     }
669 
670   if (! param->noprint || option_trace)
671     print_define_end (param->name, *threshold);
672 }
673 
674 
675 /* Special probing for the fft thresholds.  The size restrictions on the
676    FFTs mean the graph of time vs size has a step effect.  See this for
677    example using
678 
679        ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
680        gnuplot foo.gnuplot
681 
682    The current approach is to compare routines at the midpoint of relevant
683    steps.  Arguably a more sophisticated system of threshold data is wanted
684    if this step effect remains. */
685 
686 struct fft_param_t {
687   const char        *table_name;
688   const char        *threshold_name;
689   const char        *modf_threshold_name;
690   mp_size_t         *p_threshold;
691   mp_size_t         *p_modf_threshold;
692   mp_size_t         first_size;
693   mp_size_t         max_size;
694   speed_function_t  function;
695   speed_function_t  mul_modf_function;
696   speed_function_t  mul_function;
697   mp_size_t         sqr;
698 };
699 
700 
701 /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
702    N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
703    of 2^(k-1) bits. */
704 
705 mp_size_t
706 fft_step_size (int k)
707 {
708   mp_size_t  step;
709 
710   step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
711   step *= (mp_size_t) 1 << k;
712 
713   if (step <= 0)
714     {
715       printf ("Can't handle k=%d\n", k);
716       abort ();
717     }
718 
719   return step;
720 }
721 
722 mp_size_t
723 fft_next_size (mp_size_t pl, int k)
724 {
725   mp_size_t  m = fft_step_size (k);
726 
727 /*    printf ("[k=%d %ld] %ld ->", k, m, pl); */
728 
729   if (pl == 0 || (pl & (m-1)) != 0)
730     pl = (pl | (m-1)) + 1;
731 
732 /*    printf (" %ld\n", pl); */
733   return pl;
734 }
735 
736 #define NMAX_DEFAULT 1000000
737 #define MAX_REPS 25
738 #define MIN_REPS 5
739 
740 static inline size_t
741 mpn_mul_fft_lcm (size_t a, unsigned int k)
742 {
743   unsigned int l = k;
744 
745   while (a % 2 == 0 && k > 0)
746     {
747       a >>= 1;
748       k--;
749     }
750   return a << l;
751 }
752 
753 mp_size_t
754 fftfill (mp_size_t pl, int k, int sqr)
755 {
756   mp_size_t maxLK;
757   mp_bitcnt_t N, Nprime, nprime, M;
758 
759   N = pl * GMP_NUMB_BITS;
760   M = N >> k;
761 
762   maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
763 
764   Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
765   nprime = Nprime / GMP_NUMB_BITS;
766   if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
767     {
768       size_t K2;
769       for (;;)
770 	{
771 	  K2 = 1L << mpn_fft_best_k (nprime, sqr);
772 	  if ((nprime & (K2 - 1)) == 0)
773 	    break;
774 	  nprime = (nprime + K2 - 1) & -K2;
775 	  Nprime = nprime * GMP_LIMB_BITS;
776 	}
777     }
778   ASSERT_ALWAYS (nprime < pl);
779 
780   return Nprime;
781 }
782 
783 static int
784 compare_double (const void *ap, const void *bp)
785 {
786   double a = * (const double *) ap;
787   double b = * (const double *) bp;
788 
789   if (a < b)
790     return -1;
791   else if (a > b)
792     return 1;
793   else
794     return 0;
795 }
796 
797 double
798 median (double *times, int n)
799 {
800   qsort (times, n, sizeof (double), compare_double);
801   return times[n/2];
802 }
803 
804 #define FFT_CACHE_SIZE 25
805 typedef struct fft_cache
806 {
807   mp_size_t n;
808   double time;
809 } fft_cache_t;
810 
811 fft_cache_t fft_cache[FFT_CACHE_SIZE];
812 
813 double
814 cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
815 		int n_measurements)
816 {
817   int i;
818   double t, ttab[MAX_REPS];
819 
820   if (fft_cache[k].n == n)
821     return fft_cache[k].time;
822 
823   for (i = 0; i < n_measurements; i++)
824     {
825       speed_starttime ();
826       mpn_mul_fft (rp, n, ap, n, bp, n, k);
827       ttab[i] = speed_endtime ();
828     }
829 
830   t = median (ttab, n_measurements);
831   fft_cache[k].n = n;
832   fft_cache[k].time = t;
833   return t;
834 }
835 
836 #define INSERT_FFTTAB(idx, nval, kval)					\
837   do {									\
838     fft_tab[idx].n = nval;						\
839     fft_tab[idx].k = kval;						\
840     fft_tab[idx+1].n = -1;	/* sentinel */				\
841     fft_tab[idx+1].k = -1;						\
842   } while (0)
843 
844 int
845 fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
846 {
847   mp_size_t n, n1, prev_n1;
848   int k, best_k, last_best_k, kmax;
849   int eff, prev_eff;
850   double t0, t1;
851   int n_measurements;
852   mp_limb_t *ap, *bp, *rp;
853   mp_size_t alloc;
854   char *linepref;
855   struct fft_table_nk *fft_tab;
856 
857   fft_tab = mpn_fft_table3[p->sqr];
858 
859   for (k = 0; k < FFT_CACHE_SIZE; k++)
860     fft_cache[k].n = 0;
861 
862   if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
863     {
864       nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
865     }
866 
867   if (print)
868     printf ("#define %s%*s", p->table_name, 38, "");
869 
870   if (idx == 0)
871     {
872       INSERT_FFTTAB (0, nmin, initial_k);
873 
874       if (print)
875 	{
876 	  printf ("\\\n  { ");
877 	  printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
878 	  linepref = "    ";
879 	}
880 
881       idx = 1;
882     }
883 
884   ap = malloc (sizeof (mp_limb_t));
885   if (p->sqr)
886     bp = ap;
887   else
888     bp = malloc (sizeof (mp_limb_t));
889   rp = malloc (sizeof (mp_limb_t));
890   alloc = 1;
891 
892   /* Round n to comply to initial k value */
893   n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
894 
895   n_measurements = (18 - initial_k) | 1;
896   n_measurements = MAX (n_measurements, MIN_REPS);
897   n_measurements = MIN (n_measurements, MAX_REPS);
898 
899   last_best_k = initial_k;
900   best_k = initial_k;
901 
902   while (n < nmax)
903     {
904       int start_k, end_k;
905 
906       /* Assume the current best k is best until we hit its next FFT step.  */
907       t0 = 99999;
908 
909       prev_n1 = n + 1;
910 
911       start_k = MAX (4, best_k - 4);
912       end_k = MIN (24, best_k + 4);
913       for (k = start_k; k <= end_k; k++)
914 	{
915           n1 = mpn_fft_next_size (prev_n1, k);
916 
917 	  eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
918 
919 	  if (eff < 70)		/* avoid measuring too slow fft:s */
920 	    continue;
921 
922 	  if (n1 > alloc)
923 	    {
924 	      alloc = n1;
925 	      if (p->sqr)
926 		{
927 		  ap = realloc (ap, sizeof (mp_limb_t));
928 		  rp = realloc (rp, sizeof (mp_limb_t));
929 		  ap = bp = realloc (ap, alloc * sizeof (mp_limb_t));
930 		  mpn_random (ap, alloc);
931 		  rp = realloc (rp, alloc * sizeof (mp_limb_t));
932 		}
933 	      else
934 		{
935 		  ap = realloc (ap, sizeof (mp_limb_t));
936 		  bp = realloc (bp, sizeof (mp_limb_t));
937 		  rp = realloc (rp, sizeof (mp_limb_t));
938 		  ap = realloc (ap, alloc * sizeof (mp_limb_t));
939 		  mpn_random (ap, alloc);
940 		  bp = realloc (bp, alloc * sizeof (mp_limb_t));
941 		  mpn_random (bp, alloc);
942 		  rp = realloc (rp, alloc * sizeof (mp_limb_t));
943 		}
944 	    }
945 
946 	  t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
947 
948 	  if (t1 * n_measurements > 0.3)
949 	    n_measurements -= 2;
950 	  n_measurements = MAX (n_measurements, MIN_REPS);
951 
952 	  if (t1 < t0)
953 	    {
954 	      best_k = k;
955 	      t0 = t1;
956 	    }
957 	}
958 
959       n1 = mpn_fft_next_size (prev_n1, best_k);
960 
961       if (last_best_k != best_k)
962 	{
963 	  ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
964 
965 	  if (idx >= FFT_TABLE3_SIZE)
966 	    {
967 	      printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
968 	      abort ();
969 	    }
970 	  INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
971 
972 	  if (print)
973 	    {
974 	      printf (", ");
975 	      if (idx % 4 == 0)
976 		printf ("\\\n    ");
977 	      printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
978 	    }
979 
980 	  if (option_trace >= 2)
981 	    {
982 	      printf ("{%lu,%u}\n", prev_n1, best_k);
983 	      fflush (stdout);
984 	    }
985 
986 	  last_best_k = best_k;
987 	  idx++;
988 	}
989 
990       for (;;)
991 	{
992 	  prev_n1 = n1;
993 	  prev_eff = fftfill (prev_n1, best_k, p->sqr);
994 	  n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
995 	  eff = fftfill (n1, best_k, p->sqr);
996 
997 	  if (eff != prev_eff)
998 	    break;
999 	}
1000 
1001       n = prev_n1;
1002     }
1003 
1004   kmax = sizeof (mp_size_t) * 4;	/* GMP_MP_SIZE_T_BITS / 2 */
1005   kmax = MIN (kmax, 25-1);
1006   for (k = last_best_k + 1; k <= kmax; k++)
1007     {
1008       if (idx >= FFT_TABLE3_SIZE)
1009 	{
1010 	  printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
1011 	  abort ();
1012 	}
1013       INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
1014 
1015       if (print)
1016 	{
1017 	  printf (", ");
1018 	  if (idx % 4 == 0)
1019 	    printf ("\\\n    ");
1020 	  printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
1021 	}
1022 
1023       idx++;
1024     }
1025 
1026   if (print)
1027     printf (" }\n");
1028 
1029   free (ap);
1030   if (! p->sqr)
1031     free (bp);
1032   free (rp);
1033 
1034   return idx;
1035 }
1036 
1037 void
1038 fft (struct fft_param_t *p)
1039 {
1040   mp_size_t  size;
1041   int        k, idx, initial_k;
1042 
1043   /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
1044 
1045 #if 1
1046   {
1047     /* Use plain one() mechanism, for some reasonable initial values of k.  The
1048        advantage is that we don't depend on mpn_fft_table3, which can therefore
1049        leave it completely uninitialized.  */
1050 
1051     static struct param_t param;
1052     mp_size_t thres, best_thres;
1053     int best_k;
1054     char buf[20];
1055 
1056     best_thres = MP_SIZE_T_MAX;
1057     best_k = -1;
1058 
1059     for (k = 5; k <= 7; k++)
1060       {
1061 	param.name = p->modf_threshold_name;
1062 	param.min_size = 100;
1063 	param.max_size = 2000;
1064 	param.function  = p->mul_function;
1065 	param.step_factor = 0.0;
1066 	param.step = 4;
1067 	param.function2 = p->mul_modf_function;
1068 	param.noprint = 1;
1069 	s.r = k;
1070 	one (&thres, &param);
1071 	if (thres < best_thres)
1072 	  {
1073 	    best_thres = thres;
1074 	    best_k = k;
1075 	  }
1076       }
1077 
1078     *(p->p_modf_threshold) = best_thres;
1079     sprintf (buf, "k = %d", best_k);
1080     print_define_remark (p->modf_threshold_name, best_thres, buf);
1081     initial_k = best_k;
1082   }
1083 #else
1084   size = p->first_size;
1085   for (;;)
1086     {
1087       double  tk, tm;
1088 
1089       size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
1090       k = mpn_fft_best_k (size, p->sqr);
1091 
1092       if (size >= p->max_size)
1093         break;
1094 
1095       s.size = size + fft_step_size (k) / 2;
1096       s.r = k;
1097       tk = tuneup_measure (p->mul_modf_function, NULL, &s);
1098       if (tk == -1.0)
1099         abort ();
1100 
1101       tm = tuneup_measure (p->mul_function, NULL, &s);
1102       if (tm == -1.0)
1103         abort ();
1104 
1105       if (option_trace >= 2)
1106         printf ("at %ld   size=%ld  k=%d  %.9f   size=%ld modf %.9f\n",
1107                 (long) size,
1108                 (long) size + fft_step_size (k) / 2, k, tk,
1109                 (long) s.size, tm);
1110 
1111       if (tk < tm)
1112         {
1113 	  *p->p_modf_threshold = s.size;
1114 	  print_define (p->modf_threshold_name, *p->p_modf_threshold);
1115 	  break;
1116         }
1117     }
1118   initial_k = ?;
1119 #endif
1120 
1121   /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
1122 
1123   idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
1124   printf ("#define %s_SIZE %d\n", p->table_name, idx);
1125 
1126   /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
1127 
1128   size = 2 * *p->p_modf_threshold;	/* OK? */
1129   for (;;)
1130     {
1131       double  tk, tm;
1132       mp_size_t mulmod_size, mul_size;;
1133 
1134       if (size >= p->max_size)
1135         break;
1136 
1137       mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
1138       mul_size = (size + mulmod_size) / 2;	/* middle of step */
1139 
1140       s.size = mulmod_size;
1141       tk = tuneup_measure (p->function, NULL, &s);
1142       if (tk == -1.0)
1143         abort ();
1144 
1145       s.size = mul_size;
1146       tm = tuneup_measure (p->mul_function, NULL, &s);
1147       if (tm == -1.0)
1148         abort ();
1149 
1150       if (option_trace >= 2)
1151         printf ("at %ld   size=%ld  %.9f   size=%ld mul %.9f\n",
1152                 (long) size,
1153                 (long) mulmod_size, tk,
1154                 (long) mul_size, tm);
1155 
1156       size = mulmod_size;
1157 
1158       if (tk < tm)
1159         {
1160 	  *p->p_threshold = s.size;
1161 	  print_define (p->threshold_name, *p->p_threshold);
1162 	  break;
1163         }
1164     }
1165 }
1166 
1167 
1168 
1169 /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
1170    giving wrong results.  */
1171 void
1172 tune_mul_n (void)
1173 {
1174   static struct param_t  param;
1175   mp_size_t next_toom_start;
1176   int something_changed;
1177 
1178   param.function = speed_mpn_mul_n;
1179 
1180   param.name = "MUL_TOOM22_THRESHOLD";
1181   param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
1182   param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
1183   one (&mul_toom22_threshold, &param);
1184 
1185   param.noprint = 1;
1186 
1187   /* Threshold sequence loop.  Disable functions that would be used in a very
1188      narrow range, re-measuring things when that happens.  */
1189   something_changed = 1;
1190   while (something_changed)
1191     {
1192       something_changed = 0;
1193 
1194 	next_toom_start = mul_toom22_threshold;
1195 
1196 	if (mul_toom33_threshold != 0)
1197 	  {
1198 	    param.name = "MUL_TOOM33_THRESHOLD";
1199 	    param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE);
1200 	    param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
1201 	    one (&mul_toom33_threshold, &param);
1202 
1203 	    if (next_toom_start * 1.05 >= mul_toom33_threshold)
1204 	      {
1205 		mul_toom33_threshold = 0;
1206 		something_changed = 1;
1207 	      }
1208 	  }
1209 
1210 	next_toom_start = MAX (next_toom_start, mul_toom33_threshold);
1211 
1212 	if (mul_toom44_threshold != 0)
1213 	  {
1214 	    param.name = "MUL_TOOM44_THRESHOLD";
1215 	    param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE);
1216 	    param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
1217 	    one (&mul_toom44_threshold, &param);
1218 
1219 	    if (next_toom_start * 1.05 >= mul_toom44_threshold)
1220 	      {
1221 		mul_toom44_threshold = 0;
1222 		something_changed = 1;
1223 	      }
1224 	  }
1225 
1226 	next_toom_start = MAX (next_toom_start, mul_toom44_threshold);
1227 
1228 	if (mul_toom6h_threshold != 0)
1229 	  {
1230 	    param.name = "MUL_TOOM6H_THRESHOLD";
1231 	    param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE);
1232 	    param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
1233 	    one (&mul_toom6h_threshold, &param);
1234 
1235 	    if (next_toom_start * 1.05 >= mul_toom6h_threshold)
1236 	      {
1237 		mul_toom6h_threshold = 0;
1238 		something_changed = 1;
1239 	      }
1240 	  }
1241 
1242 	next_toom_start = MAX (next_toom_start, mul_toom6h_threshold);
1243 
1244 	if (mul_toom8h_threshold != 0)
1245 	  {
1246 	    param.name = "MUL_TOOM8H_THRESHOLD";
1247 	    param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE);
1248 	    param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
1249 	    one (&mul_toom8h_threshold, &param);
1250 
1251 	    if (next_toom_start * 1.05 >= mul_toom8h_threshold)
1252 	      {
1253 		mul_toom8h_threshold = 0;
1254 		something_changed = 1;
1255 	      }
1256 	  }
1257     }
1258 
1259     print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD);
1260     print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD);
1261     print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD);
1262     print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD);
1263 
1264   /* disabled until tuned */
1265   MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
1266 }
1267 
1268 void
1269 tune_mul (void)
1270 {
1271   static struct param_t  param;
1272   mp_size_t thres;
1273 
1274   param.noprint = 1;
1275 
1276   param.function = speed_mpn_toom32_for_toom43_mul;
1277   param.function2 = speed_mpn_toom43_for_toom32_mul;
1278   param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
1279   param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17;
1280   one (&thres, &param);
1281   mul_toom32_to_toom43_threshold = thres * 17 / 24;
1282   print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
1283 
1284   param.function = speed_mpn_toom32_for_toom53_mul;
1285   param.function2 = speed_mpn_toom53_for_toom32_mul;
1286   param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
1287   param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19;
1288   one (&thres, &param);
1289   mul_toom32_to_toom53_threshold = thres * 19 / 30;
1290   print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
1291 
1292   param.function = speed_mpn_toom42_for_toom53_mul;
1293   param.function2 = speed_mpn_toom53_for_toom42_mul;
1294   param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
1295   param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11;
1296   one (&thres, &param);
1297   mul_toom42_to_toom53_threshold = thres * 11 / 20;
1298   print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
1299 
1300   param.function = speed_mpn_toom42_mul;
1301   param.function2 = speed_mpn_toom63_mul;
1302   param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
1303   param.min_size = MPN_TOOM63_MUL_MINSIZE * 2;
1304   one (&thres, &param);
1305   mul_toom42_to_toom63_threshold = thres / 2;
1306   print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
1307 
1308   /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */
1309   param.function = speed_mpn_toom43_for_toom54_mul;
1310   param.function2 = speed_mpn_toom54_for_toom43_mul;
1311   param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD";
1312   param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5;
1313   one (&thres, &param);
1314   mul_toom43_to_toom54_threshold = thres * 5 / 6;
1315   print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold);
1316 }
1317 
1318 
1319 void
1320 tune_mullo (void)
1321 {
1322   static struct param_t  param;
1323 
1324   param.function = speed_mpn_mullo_n;
1325 
1326   param.name = "MULLO_BASECASE_THRESHOLD";
1327   param.min_size = 1;
1328   param.min_is_always = 1;
1329   param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
1330   param.stop_factor = 1.5;
1331   param.noprint = 1;
1332   one (&mullo_basecase_threshold, &param);
1333 
1334   param.name = "MULLO_DC_THRESHOLD";
1335   param.min_size = 8;
1336   param.min_is_always = 0;
1337   param.max_size = 1000;
1338   one (&mullo_dc_threshold, &param);
1339 
1340   if (mullo_basecase_threshold >= mullo_dc_threshold)
1341     {
1342       print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
1343       print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
1344     }
1345   else
1346     {
1347       print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
1348       print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
1349     }
1350 
1351 #if WANT_FFT
1352   param.name = "MULLO_MUL_N_THRESHOLD";
1353   param.min_size = mullo_dc_threshold;
1354   param.max_size = 2 * mul_fft_threshold;
1355   param.noprint = 0;
1356   param.step_factor = 0.03;
1357   one (&mullo_mul_n_threshold, &param);
1358 #else
1359   print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
1360                            "without FFT use mullo forever");
1361 #endif
1362 }
1363 
1364 void
1365 tune_mulmid (void)
1366 {
1367   static struct param_t  param;
1368 
1369   param.name = "MULMID_TOOM42_THRESHOLD";
1370   param.function = speed_mpn_mulmid_n;
1371   param.min_size = 4;
1372   param.max_size = 100;
1373   one (&mulmid_toom42_threshold, &param);
1374 }
1375 
1376 void
1377 tune_mulmod_bnm1 (void)
1378 {
1379   static struct param_t  param;
1380 
1381   param.name = "MULMOD_BNM1_THRESHOLD";
1382   param.function = speed_mpn_mulmod_bnm1;
1383   param.min_size = 4;
1384   param.max_size = 100;
1385   one (&mulmod_bnm1_threshold, &param);
1386 }
1387 
1388 void
1389 tune_sqrmod_bnm1 (void)
1390 {
1391   static struct param_t  param;
1392 
1393   param.name = "SQRMOD_BNM1_THRESHOLD";
1394   param.function = speed_mpn_sqrmod_bnm1;
1395   param.min_size = 4;
1396   param.max_size = 100;
1397   one (&sqrmod_bnm1_threshold, &param);
1398 }
1399 
1400 
1401 /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
1402    is faster only at size==2 then we don't want to bother with extra code
1403    just for that.  Start karatsuba from 4 same as MUL above.  */
1404 
1405 void
1406 tune_sqr (void)
1407 {
1408   /* disabled until tuned */
1409   SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
1410 
1411   if (HAVE_NATIVE_mpn_sqr_basecase)
1412     {
1413       print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
1414       sqr_basecase_threshold = 0;
1415     }
1416   else
1417     {
1418       static struct param_t  param;
1419       param.name = "SQR_BASECASE_THRESHOLD";
1420       param.function = speed_mpn_sqr;
1421       param.min_size = 3;
1422       param.min_is_always = 1;
1423       param.max_size = TUNE_SQR_TOOM2_MAX;
1424       param.noprint = 1;
1425       one (&sqr_basecase_threshold, &param);
1426     }
1427 
1428   {
1429     static struct param_t  param;
1430     param.name = "SQR_TOOM2_THRESHOLD";
1431     param.function = speed_mpn_sqr;
1432     param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
1433     param.max_size = TUNE_SQR_TOOM2_MAX;
1434     param.noprint = 1;
1435     one (&sqr_toom2_threshold, &param);
1436 
1437     if (! HAVE_NATIVE_mpn_sqr_basecase
1438         && sqr_toom2_threshold < sqr_basecase_threshold)
1439       {
1440         /* Karatsuba becomes faster than mul_basecase before
1441            sqr_basecase does.  Arrange for the expression
1442            "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
1443            selects mpn_sqr_basecase in mpn_sqr to be false, by setting
1444            SQR_TOOM2_THRESHOLD to zero, making
1445            SQR_BASECASE_THRESHOLD the toom2 threshold.  */
1446 
1447         sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
1448         SQR_TOOM2_THRESHOLD = 0;
1449 
1450         print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
1451                              "toom2");
1452         print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
1453                              "never sqr_basecase");
1454       }
1455     else
1456       {
1457         if (! HAVE_NATIVE_mpn_sqr_basecase)
1458           print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
1459         print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
1460       }
1461   }
1462 
1463   {
1464     static struct param_t  param;
1465     mp_size_t next_toom_start;
1466     int something_changed;
1467 
1468     param.function = speed_mpn_sqr;
1469     param.noprint = 1;
1470 
1471   /* Threshold sequence loop.  Disable functions that would be used in a very
1472      narrow range, re-measuring things when that happens.  */
1473     something_changed = 1;
1474     while (something_changed)
1475       {
1476 	something_changed = 0;
1477 
1478 	next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
1479 
1480 	sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
1481 	param.name = "SQR_TOOM3_THRESHOLD";
1482 	param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE);
1483 	param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
1484 	one (&sqr_toom3_threshold, &param);
1485 
1486 	next_toom_start = MAX (next_toom_start, sqr_toom3_threshold);
1487 
1488 	if (sqr_toom4_threshold != 0)
1489 	  {
1490 	    param.name = "SQR_TOOM4_THRESHOLD";
1491 	    sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
1492 	    param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE);
1493 	    param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
1494 	    one (&sqr_toom4_threshold, &param);
1495 
1496 	    if (next_toom_start * 1.05 >= sqr_toom4_threshold)
1497 	      {
1498 		sqr_toom4_threshold = 0;
1499 		something_changed = 1;
1500 	      }
1501 	  }
1502 
1503 	next_toom_start = MAX (next_toom_start, sqr_toom4_threshold);
1504 
1505 	if (sqr_toom6_threshold != 0)
1506 	  {
1507 	    param.name = "SQR_TOOM6_THRESHOLD";
1508 	    sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
1509 	    param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE);
1510 	    param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
1511 	    one (&sqr_toom6_threshold, &param);
1512 
1513 	    if (next_toom_start * 1.05 >= sqr_toom6_threshold)
1514 	      {
1515 		sqr_toom6_threshold = 0;
1516 		something_changed = 1;
1517 	      }
1518 	  }
1519 
1520 	next_toom_start = MAX (next_toom_start, sqr_toom6_threshold);
1521 
1522 	if (sqr_toom8_threshold != 0)
1523 	  {
1524 	    param.name = "SQR_TOOM8_THRESHOLD";
1525 	    sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
1526 	    param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE);
1527 	    param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
1528 	    one (&sqr_toom8_threshold, &param);
1529 
1530 	    if (next_toom_start * 1.05 >= sqr_toom8_threshold)
1531 	      {
1532 		sqr_toom8_threshold = 0;
1533 		something_changed = 1;
1534 	      }
1535 	  }
1536       }
1537 
1538     print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD);
1539     print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD);
1540     print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD);
1541     print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD);
1542   }
1543 }
1544 
1545 
1546 void
1547 tune_dc_div (void)
1548 {
1549   s.r = 0;		/* clear to make speed function do 2n/n */
1550   {
1551     static struct param_t  param;
1552     param.name = "DC_DIV_QR_THRESHOLD";
1553     param.function = speed_mpn_sbpi1_div_qr;
1554     param.function2 = speed_mpn_dcpi1_div_qr;
1555     param.min_size = 6;
1556     one (&dc_div_qr_threshold, &param);
1557   }
1558   {
1559     static struct param_t  param;
1560     param.name = "DC_DIVAPPR_Q_THRESHOLD";
1561     param.function = speed_mpn_sbpi1_divappr_q;
1562     param.function2 = speed_mpn_dcpi1_divappr_q;
1563     param.min_size = 6;
1564     one (&dc_divappr_q_threshold, &param);
1565   }
1566 }
1567 
1568 static double
1569 speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
1570 {
1571   if (s->size < DC_DIV_QR_THRESHOLD)
1572     return speed_mpn_sbpi1_div_qr (s);
1573   else
1574     return speed_mpn_dcpi1_div_qr (s);
1575 }
1576 
1577 void
1578 tune_mu_div (void)
1579 {
1580   s.r = 0;		/* clear to make speed function do 2n/n */
1581   {
1582     static struct param_t  param;
1583     param.name = "MU_DIV_QR_THRESHOLD";
1584     param.function = speed_mpn_dcpi1_div_qr;
1585     param.function2 = speed_mpn_mu_div_qr;
1586     param.min_size = mul_toom22_threshold;
1587     param.max_size = 5000;
1588     param.step_factor = 0.02;
1589     one (&mu_div_qr_threshold, &param);
1590   }
1591   {
1592     static struct param_t  param;
1593     param.name = "MU_DIVAPPR_Q_THRESHOLD";
1594     param.function = speed_mpn_dcpi1_divappr_q;
1595     param.function2 = speed_mpn_mu_divappr_q;
1596     param.min_size = mul_toom22_threshold;
1597     param.max_size = 5000;
1598     param.step_factor = 0.02;
1599     one (&mu_divappr_q_threshold, &param);
1600   }
1601   {
1602     static struct param_t  param;
1603     param.name = "MUPI_DIV_QR_THRESHOLD";
1604     param.function = speed_mpn_sbordcpi1_div_qr;
1605     param.function2 = speed_mpn_mupi_div_qr;
1606     param.min_size = 6;
1607     param.min_is_always = 1;
1608     param.max_size = 1000;
1609     param.step_factor = 0.02;
1610     one (&mupi_div_qr_threshold, &param);
1611   }
1612 }
1613 
1614 void
1615 tune_dc_bdiv (void)
1616 {
1617   s.r = 0;		/* clear to make speed function do 2n/n*/
1618   {
1619     static struct param_t  param;
1620     param.name = "DC_BDIV_QR_THRESHOLD";
1621     param.function = speed_mpn_sbpi1_bdiv_qr;
1622     param.function2 = speed_mpn_dcpi1_bdiv_qr;
1623     param.min_size = 4;
1624     one (&dc_bdiv_qr_threshold, &param);
1625   }
1626   {
1627     static struct param_t  param;
1628     param.name = "DC_BDIV_Q_THRESHOLD";
1629     param.function = speed_mpn_sbpi1_bdiv_q;
1630     param.function2 = speed_mpn_dcpi1_bdiv_q;
1631     param.min_size = 4;
1632     one (&dc_bdiv_q_threshold, &param);
1633   }
1634 }
1635 
1636 void
1637 tune_mu_bdiv (void)
1638 {
1639   s.r = 0;		/* clear to make speed function do 2n/n*/
1640   {
1641     static struct param_t  param;
1642     param.name = "MU_BDIV_QR_THRESHOLD";
1643     param.function = speed_mpn_dcpi1_bdiv_qr;
1644     param.function2 = speed_mpn_mu_bdiv_qr;
1645     param.min_size = mul_toom22_threshold;
1646     param.max_size = 5000;
1647     param.step_factor = 0.02;
1648     one (&mu_bdiv_qr_threshold, &param);
1649   }
1650   {
1651     static struct param_t  param;
1652     param.name = "MU_BDIV_Q_THRESHOLD";
1653     param.function = speed_mpn_dcpi1_bdiv_q;
1654     param.function2 = speed_mpn_mu_bdiv_q;
1655     param.min_size = mul_toom22_threshold;
1656     param.max_size = 5000;
1657     param.step_factor = 0.02;
1658     one (&mu_bdiv_q_threshold, &param);
1659   }
1660 }
1661 
1662 void
1663 tune_invertappr (void)
1664 {
1665   static struct param_t  param;
1666 
1667   param.function = speed_mpn_ni_invertappr;
1668   param.name = "INV_MULMOD_BNM1_THRESHOLD";
1669   param.min_size = 4;
1670   one (&inv_mulmod_bnm1_threshold, &param);
1671 
1672   param.function = speed_mpn_invertappr;
1673   param.name = "INV_NEWTON_THRESHOLD";
1674   param.min_size = 3;
1675   one (&inv_newton_threshold, &param);
1676 }
1677 
1678 void
1679 tune_invert (void)
1680 {
1681   static struct param_t  param;
1682 
1683   param.function = speed_mpn_invert;
1684   param.name = "INV_APPR_THRESHOLD";
1685   param.min_size = 3;
1686   one (&inv_appr_threshold, &param);
1687 }
1688 
1689 void
1690 tune_binvert (void)
1691 {
1692   static struct param_t  param;
1693 
1694   param.function = speed_mpn_binvert;
1695   param.name = "BINV_NEWTON_THRESHOLD";
1696   param.min_size = 8;		/* pointless with smaller operands */
1697   one (&binv_newton_threshold, &param);
1698 }
1699 
1700 void
1701 tune_redc (void)
1702 {
1703 #define TUNE_REDC_2_MAX 100
1704 #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
1705 #define WANT_REDC_2 1
1706 #endif
1707 
1708 #if WANT_REDC_2
1709   {
1710     static struct param_t  param;
1711     param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1712     param.function = speed_mpn_redc_1;
1713     param.function2 = speed_mpn_redc_2;
1714     param.min_size = 1;
1715     param.min_is_always = 1;
1716     param.max_size = TUNE_REDC_2_MAX;
1717     param.noprint = 1;
1718     param.stop_factor = 1.5;
1719     one (&redc_1_to_redc_2_threshold, &param);
1720   }
1721   {
1722     static struct param_t  param;
1723     param.name = "REDC_2_TO_REDC_N_THRESHOLD";
1724     param.function = speed_mpn_redc_2;
1725     param.function2 = speed_mpn_redc_n;
1726     param.min_size = 16;
1727     param.noprint = 1;
1728     one (&redc_2_to_redc_n_threshold, &param);
1729   }
1730   if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
1731     {
1732       redc_2_to_redc_n_threshold = 0;	/* disable redc_2 */
1733 
1734       /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
1735 	 REDC_1_TO_REDC_2_THRESHOLD.  */
1736       {
1737 	static struct param_t  param;
1738 	param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1739 	param.function = speed_mpn_redc_1;
1740 	param.function2 = speed_mpn_redc_n;
1741 	param.min_size = 16;
1742 	param.noprint = 1;
1743 	one (&redc_1_to_redc_2_threshold, &param);
1744       }
1745     }
1746   print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
1747   print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
1748 #else
1749   {
1750     static struct param_t  param;
1751     param.name = "REDC_1_TO_REDC_N_THRESHOLD";
1752     param.function = speed_mpn_redc_1;
1753     param.function2 = speed_mpn_redc_n;
1754     param.min_size = 16;
1755     one (&redc_1_to_redc_n_threshold, &param);
1756   }
1757 #endif
1758 }
1759 
1760 void
1761 tune_matrix22_mul (void)
1762 {
1763   static struct param_t  param;
1764   param.name = "MATRIX22_STRASSEN_THRESHOLD";
1765   param.function = speed_mpn_matrix22_mul;
1766   param.min_size = 2;
1767   one (&matrix22_strassen_threshold, &param);
1768 }
1769 
1770 void
1771 tune_hgcd (void)
1772 {
1773   static struct param_t  param;
1774   param.name = "HGCD_THRESHOLD";
1775   param.function = speed_mpn_hgcd;
1776   /* We seem to get strange results for small sizes */
1777   param.min_size = 30;
1778   one (&hgcd_threshold, &param);
1779 }
1780 
1781 void
1782 tune_hgcd_appr (void)
1783 {
1784   static struct param_t  param;
1785   param.name = "HGCD_APPR_THRESHOLD";
1786   param.function = speed_mpn_hgcd_appr;
1787   /* We seem to get strange results for small sizes */
1788   param.min_size = 50;
1789   param.stop_since_change = 150;
1790   one (&hgcd_appr_threshold, &param);
1791 }
1792 
1793 void
1794 tune_hgcd_reduce (void)
1795 {
1796   static struct param_t  param;
1797   param.name = "HGCD_REDUCE_THRESHOLD";
1798   param.function = speed_mpn_hgcd_reduce;
1799   param.min_size = 30;
1800   param.max_size = 7000;
1801   param.step_factor = 0.04;
1802   one (&hgcd_reduce_threshold, &param);
1803 }
1804 
1805 void
1806 tune_gcd_dc (void)
1807 {
1808   static struct param_t  param;
1809   param.name = "GCD_DC_THRESHOLD";
1810   param.function = speed_mpn_gcd;
1811   param.min_size = hgcd_threshold;
1812   param.max_size = 3000;
1813   param.step_factor = 0.02;
1814   one (&gcd_dc_threshold, &param);
1815 }
1816 
1817 void
1818 tune_gcdext_dc (void)
1819 {
1820   static struct param_t  param;
1821   param.name = "GCDEXT_DC_THRESHOLD";
1822   param.function = speed_mpn_gcdext;
1823   param.min_size = hgcd_threshold;
1824   param.max_size = 3000;
1825   param.step_factor = 0.02;
1826   one (&gcdext_dc_threshold, &param);
1827 }
1828 
1829 /* In tune_powm_sec we compute the table used by the win_size function.  The
1830    cutoff points are in exponent bits, disregarding other operand sizes.  It is
1831    not possible to use the one framework since it currently uses a granilarity
1832    of full limbs.
1833 */
1834 
1835 /* This win_size replaces the variant in the powm code, allowing us to
1836    control k in the k-ary algorithms.  */
1837 int winsize;
1838 int
1839 win_size (mp_bitcnt_t eb)
1840 {
1841   return winsize;
1842 }
1843 
1844 void
1845 tune_powm_sec (void)
1846 {
1847   mp_size_t n;
1848   int k, i;
1849   mp_size_t itch;
1850   mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
1851   const int n_max = 3000 / GMP_NUMB_BITS;
1852   const int n_measurements = 5;
1853   mp_ptr rp, bp, ep, mp, tp;
1854   double ttab[n_measurements], tk, tkp1;
1855   TMP_DECL;
1856   TMP_MARK;
1857 
1858   possible_nbits_cutoff = 0;
1859 
1860   k = 1;
1861 
1862   winsize = 10;			/* the itch function needs this */
1863   itch = mpn_powm_sec_itch (n_max, n_max, n_max);
1864 
1865   rp = TMP_ALLOC_LIMBS (n_max);
1866   bp = TMP_ALLOC_LIMBS (n_max);
1867   ep = TMP_ALLOC_LIMBS (n_max);
1868   mp = TMP_ALLOC_LIMBS (n_max);
1869   tp = TMP_ALLOC_LIMBS (itch);
1870 
1871   mpn_random (bp, n_max);
1872   mpn_random (mp, n_max);
1873   mp[0] |= 1;
1874 
1875 /* How about taking the M operand size into account?
1876 
1877    An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
1878    B = O(M)).
1879 
1880    Using k-ary and no sliding window, the precomputation will need time
1881    O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) +
1882    O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
1883 
1884    An operation R=powm_sec(B,E,N) will take time like powm.
1885 
1886    Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
1887    main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) +
1888    O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
1889    table reads, respectively.  */
1890 
1891   printf ("#define POWM_SEC_TABLE  ");
1892 
1893   for (nbits = 1; nbits <= n_max * GMP_NUMB_BITS; )
1894     {
1895       n = (nbits - 1) / GMP_NUMB_BITS + 1;
1896 
1897       /* Generate E such that sliding-window for k and k+1 works equally
1898 	 well/poorly (but sliding is not used in powm_sec, of course). */
1899       for (i = 0; i < n; i++)
1900 	ep[i] = ~CNST_LIMB(0);
1901 
1902       /* Truncate E to be exactly nbits large.  */
1903       if (nbits % GMP_NUMB_BITS != 0)
1904 	mpn_rshift (ep, ep, n, GMP_NUMB_BITS - nbits % GMP_NUMB_BITS);
1905       ep[n - 1] |= CNST_LIMB(1) << (nbits - 1) % GMP_NUMB_BITS;
1906 
1907       winsize = k;
1908       for (i = 0; i < n_measurements; i++)
1909 	{
1910 	  speed_starttime ();
1911 	  mpn_powm_sec (rp, bp, n, ep, n, mp, n, tp);
1912 	  ttab[i] = speed_endtime ();
1913 	}
1914       tk = median (ttab, n_measurements);
1915 
1916       winsize = k + 1;
1917       speed_starttime ();
1918       for (i = 0; i < n_measurements; i++)
1919 	{
1920 	  speed_starttime ();
1921 	  mpn_powm_sec (rp, bp, n, ep, n, mp, n, tp);
1922 	  ttab[i] = speed_endtime ();
1923 	}
1924       tkp1 = median (ttab, n_measurements);
1925 /*
1926       printf ("testing: %ld, %d", nbits, k, ep[n-1]);
1927       printf ("   %10.5f  %10.5f\n", tk, tkp1);
1928 */
1929       if (tkp1 < tk)
1930 	{
1931 	  if (possible_nbits_cutoff)
1932 	    {
1933 	      /* Two consecutive sizes indicate k increase, obey.  */
1934 	      if (k > 1)
1935 		printf (",");
1936 	      printf ("%ld", (long) possible_nbits_cutoff);
1937 	      k++;
1938 	      possible_nbits_cutoff = 0;
1939 	    }
1940 	  else
1941 	    {
1942 	      /* One measurement indicate k increase, save nbits for further
1943 		 consideration.  */
1944 	      possible_nbits_cutoff = nbits;
1945 	    }
1946 	}
1947       else
1948 	possible_nbits_cutoff = 0;
1949 
1950       nbits_next = nbits * 65 / 64;
1951       nbits = nbits_next + (nbits_next == nbits);
1952     }
1953   printf ("\n");
1954   TMP_FREE;
1955 }
1956 
1957 
1958 /* size_extra==1 reflects the fact that with high<divisor one division is
1959    always skipped.  Forcing high<divisor while testing ensures consistency
1960    while stepping through sizes, ie. that size-1 divides will be done each
1961    time.
1962 
1963    min_size==2 and min_is_always are used so that if plain division is only
1964    better at size==1 then don't bother including that code just for that
1965    case, instead go with preinv always and get a size saving.  */
1966 
1967 #define DIV_1_PARAMS                    \
1968   param.check_size = 256;               \
1969   param.min_size = 2;                   \
1970   param.min_is_always = 1;              \
1971   param.data_high = DATA_HIGH_LT_R;     \
1972   param.size_extra = 1;                 \
1973   param.stop_factor = 2.0;
1974 
1975 
1976 double (*tuned_speed_mpn_divrem_1) (struct speed_params *);
1977 
1978 void
1979 tune_divrem_1 (void)
1980 {
1981   /* plain version by default */
1982   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
1983 
1984   /* No support for tuning native assembler code, do that by hand and put
1985      the results in the .asm file, there's no need for such thresholds to
1986      appear in gmp-mparam.h.  */
1987   if (HAVE_NATIVE_mpn_divrem_1)
1988     return;
1989 
1990   if (GMP_NAIL_BITS != 0)
1991     {
1992       print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
1993                            "no preinv with nails");
1994       print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
1995                            "no preinv with nails");
1996       return;
1997     }
1998 
1999   if (UDIV_PREINV_ALWAYS)
2000     {
2001       print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
2002       print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
2003       return;
2004     }
2005 
2006   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
2007 
2008   /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
2009      a bit out for the fractional part, but that's too bad, the integer part
2010      is more important. */
2011   {
2012     static struct param_t  param;
2013     param.name = "DIVREM_1_NORM_THRESHOLD";
2014     DIV_1_PARAMS;
2015     s.r = randlimb_norm ();
2016     param.function = speed_mpn_divrem_1_tune;
2017     one (&divrem_1_norm_threshold, &param);
2018   }
2019   {
2020     static struct param_t  param;
2021     param.name = "DIVREM_1_UNNORM_THRESHOLD";
2022     DIV_1_PARAMS;
2023     s.r = randlimb_half ();
2024     param.function = speed_mpn_divrem_1_tune;
2025     one (&divrem_1_unnorm_threshold, &param);
2026   }
2027 }
2028 
2029 
2030 void
2031 tune_mod_1 (void)
2032 {
2033   /* No support for tuning native assembler code, do that by hand and put
2034      the results in the .asm file, there's no need for such thresholds to
2035      appear in gmp-mparam.h.  */
2036   if (HAVE_NATIVE_mpn_mod_1)
2037     return;
2038 
2039   if (GMP_NAIL_BITS != 0)
2040     {
2041       print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
2042                            "no preinv with nails");
2043       print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
2044                            "no preinv with nails");
2045       return;
2046     }
2047 
2048   if (!HAVE_NATIVE_mpn_mod_1_1p)
2049     {
2050       static struct param_t  param;
2051       double   t1, t2;
2052 
2053       s.size = 10;
2054       s.r = randlimb_half ();
2055 
2056       t1 = tuneup_measure (speed_mpn_mod_1_1_1, &param, &s);
2057       t2 = tuneup_measure (speed_mpn_mod_1_1_2, &param, &s);
2058 
2059       if (t1 == -1.0 || t2 == -1.0)
2060 	{
2061 	  printf ("Oops, can't measure all mpn_mod_1_1 methods at %ld\n",
2062 		  (long) s.size);
2063 	  abort ();
2064 	}
2065       mod_1_1p_method = (t1 < t2) ? 1 : 2;
2066       print_define ("MOD_1_1P_METHOD", mod_1_1p_method);
2067     }
2068 
2069   if (UDIV_PREINV_ALWAYS)
2070     {
2071       print_define ("MOD_1_NORM_THRESHOLD", 0L);
2072       print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
2073     }
2074   else
2075     {
2076       {
2077 	static struct param_t  param;
2078 	param.name = "MOD_1_NORM_THRESHOLD";
2079 	DIV_1_PARAMS;
2080 	s.r = randlimb_norm ();
2081 	param.function = speed_mpn_mod_1_tune;
2082 	one (&mod_1_norm_threshold, &param);
2083       }
2084       {
2085 	static struct param_t  param;
2086 	param.name = "MOD_1_UNNORM_THRESHOLD";
2087 	DIV_1_PARAMS;
2088 	s.r = randlimb_half ();
2089 	param.function = speed_mpn_mod_1_tune;
2090 	one (&mod_1_unnorm_threshold, &param);
2091       }
2092     }
2093   {
2094     static struct param_t  param;
2095 
2096     param.check_size = 256;
2097 
2098     s.r = randlimb_norm ();
2099     param.function = speed_mpn_mod_1_tune;
2100 
2101     param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
2102     param.min_size = 2;
2103     one (&mod_1n_to_mod_1_1_threshold, &param);
2104   }
2105 
2106   {
2107     static struct param_t  param;
2108 
2109     param.check_size = 256;
2110     s.r = randlimb_half ();
2111     param.noprint = 1;
2112 
2113     param.function = speed_mpn_mod_1_1;
2114     param.function2 = speed_mpn_mod_1_2;
2115     param.min_is_always = 1;
2116     param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
2117     param.min_size = 2;
2118     one (&mod_1_1_to_mod_1_2_threshold, &param);
2119 
2120     param.function = speed_mpn_mod_1_2;
2121     param.function2 = speed_mpn_mod_1_4;
2122     param.min_is_always = 1;
2123     param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
2124     param.min_size = 1;
2125     one (&mod_1_2_to_mod_1_4_threshold, &param);
2126 
2127     if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold)
2128       {
2129 	/* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */
2130 	mod_1_2_to_mod_1_4_threshold = 0;
2131 
2132 	param.function = speed_mpn_mod_1_1;
2133 	param.function2 = speed_mpn_mod_1_4;
2134 	param.min_is_always = 1;
2135 	param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake";
2136 	param.min_size = 2;
2137 	one (&mod_1_1_to_mod_1_2_threshold, &param);
2138       }
2139 
2140     param.function = speed_mpn_mod_1_tune;
2141     param.function2 = NULL;
2142     param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
2143     param.min_size = 2;
2144     param.min_is_always = 0;
2145     one (&mod_1u_to_mod_1_1_threshold, &param);
2146 
2147     if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold)
2148       mod_1_1_to_mod_1_2_threshold = 0;
2149     if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold)
2150       mod_1_2_to_mod_1_4_threshold = 0;
2151 
2152     print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
2153     print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold,
2154 			 mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL);
2155     print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold,
2156 			 mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL);
2157   }
2158 
2159   {
2160     static struct param_t  param;
2161 
2162     param.check_size = 256;
2163 
2164     param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
2165     s.r = randlimb_norm ();
2166     param.function = speed_mpn_preinv_mod_1;
2167     param.function2 = speed_mpn_mod_1_tune;
2168     param.min_size = 1;
2169     one (&preinv_mod_1_to_mod_1_threshold, &param);
2170   }
2171 }
2172 
2173 
2174 /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
2175    imply that udiv_qrnnd_preinv is worth using, but it seems most
2176    straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
2177    directly.  */
2178 
2179 void
2180 tune_preinv_divrem_1 (void)
2181 {
2182   static struct param_t  param;
2183   speed_function_t  divrem_1;
2184   const char        *divrem_1_name;
2185   double            t1, t2;
2186 
2187   if (GMP_NAIL_BITS != 0)
2188     {
2189       print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
2190       return;
2191     }
2192 
2193   /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
2194      it's faster than mpn_divrem_1.  */
2195   if (HAVE_NATIVE_mpn_preinv_divrem_1)
2196     {
2197       print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
2198       return;
2199     }
2200 
2201   /* If udiv_qrnnd_preinv is the only division method then of course
2202      mpn_preinv_divrem_1 should be used.  */
2203   if (UDIV_PREINV_ALWAYS)
2204     {
2205       print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
2206       return;
2207     }
2208 
2209   /* If we've got an assembler version of mpn_divrem_1, then compare against
2210      that, not the mpn_divrem_1_div generic C.  */
2211   if (HAVE_NATIVE_mpn_divrem_1)
2212     {
2213       divrem_1 = speed_mpn_divrem_1;
2214       divrem_1_name = "mpn_divrem_1";
2215     }
2216   else
2217     {
2218       divrem_1 = speed_mpn_divrem_1_div;
2219       divrem_1_name = "mpn_divrem_1_div";
2220     }
2221 
2222   param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
2223   s.size = 200;                     /* generous but not too big */
2224   /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
2225      since in general that's probably most common, though in fact for a
2226      64-bit limb mp_bases[10].big_base is normalized.  */
2227   s.r = urandom() & (GMP_NUMB_MASK >> 4);
2228   if (s.r == 0) s.r = 123;
2229 
2230   t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
2231   t2 = tuneup_measure (divrem_1, &param, &s);
2232   if (t1 == -1.0 || t2 == -1.0)
2233     {
2234       printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
2235               divrem_1_name, (long) s.size);
2236       abort ();
2237     }
2238   if (option_trace >= 1)
2239     printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
2240             (long) s.size, t1, divrem_1_name, t2);
2241 
2242   print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
2243 }
2244 
2245 
2246 
2247 void
2248 tune_divrem_2 (void)
2249 {
2250   static struct param_t  param;
2251 
2252   /* No support for tuning native assembler code, do that by hand and put
2253      the results in the .asm file, and there's no need for such thresholds
2254      to appear in gmp-mparam.h.  */
2255   if (HAVE_NATIVE_mpn_divrem_2)
2256     return;
2257 
2258   if (GMP_NAIL_BITS != 0)
2259     {
2260       print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
2261                            "no preinv with nails");
2262       return;
2263     }
2264 
2265   if (UDIV_PREINV_ALWAYS)
2266     {
2267       print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
2268       return;
2269     }
2270 
2271   /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
2272      a bit out for the fractional part, but that's too bad, the integer part
2273      is more important.
2274 
2275      min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
2276      code space if plain division is better only at size==2 or size==3. */
2277   param.name = "DIVREM_2_THRESHOLD";
2278   param.check_size = 256;
2279   param.min_size = 4;
2280   param.min_is_always = 1;
2281   param.size_extra = 2;      /* does qsize==nsize-2 divisions */
2282   param.stop_factor = 2.0;
2283 
2284   s.r = randlimb_norm ();
2285   param.function = speed_mpn_divrem_2;
2286   one (&divrem_2_threshold, &param);
2287 }
2288 
2289 void
2290 tune_div_qr_2 (void)
2291 {
2292   static struct param_t  param;
2293   param.name = "DIV_QR_2_PI2_THRESHOLD";
2294   param.function = speed_mpn_div_qr_2n;
2295   param.check_size = 500;
2296   param.min_size = 4;
2297   one (&div_qr_2_pi2_threshold, &param);
2298 }
2299 
2300 /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
2301    tune for that.  Its speed can differ on odd or even divisor, so take an
2302    average threshold for the two.
2303 
2304    mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
2305    might not vary that way, but don't test this since high<divisor isn't
2306    expected to occur often with small divisors.  */
2307 
2308 void
2309 tune_divexact_1 (void)
2310 {
2311   static struct param_t  param;
2312   mp_size_t  thresh[2], average;
2313   int        low, i;
2314 
2315   /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
2316      full mpn_divrem_1.  */
2317   if (HAVE_NATIVE_mpn_divexact_1)
2318     {
2319       print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
2320       return;
2321     }
2322 
2323   ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
2324 
2325   param.name = "DIVEXACT_1_THRESHOLD";
2326   param.data_high = DATA_HIGH_GE_R;
2327   param.check_size = 256;
2328   param.min_size = 2;
2329   param.stop_factor = 1.5;
2330   param.function  = tuned_speed_mpn_divrem_1;
2331   param.function2 = speed_mpn_divexact_1;
2332   param.noprint = 1;
2333 
2334   print_define_start (param.name);
2335 
2336   for (low = 0; low <= 1; low++)
2337     {
2338       s.r = randlimb_half();
2339       if (low == 0)
2340         s.r |= 1;
2341       else
2342         s.r &= ~CNST_LIMB(7);
2343 
2344       one (&thresh[low], &param);
2345       if (option_trace)
2346         printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
2347 
2348       if (thresh[low] == MP_SIZE_T_MAX)
2349         {
2350           average = MP_SIZE_T_MAX;
2351           goto divexact_1_done;
2352         }
2353     }
2354 
2355   if (option_trace)
2356     {
2357       printf ("average of:");
2358       for (i = 0; i < numberof(thresh); i++)
2359         printf (" %ld", (long) thresh[i]);
2360       printf ("\n");
2361     }
2362 
2363   average = 0;
2364   for (i = 0; i < numberof(thresh); i++)
2365     average += thresh[i];
2366   average /= numberof(thresh);
2367 
2368   /* If divexact turns out to be better as early as 3 limbs, then use it
2369      always, so as to reduce code size and conditional jumps.  */
2370   if (average <= 3)
2371     average = 0;
2372 
2373  divexact_1_done:
2374   print_define_end (param.name, average);
2375 }
2376 
2377 
2378 /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
2379    same as mpn_mod_1, but this might not be true of an assembler
2380    implementation.  The threshold used is an average based on data where a
2381    divide can be skipped and where it can't.
2382 
2383    If modexact turns out to be better as early as 3 limbs, then use it
2384    always, so as to reduce code size and conditional jumps.  */
2385 
2386 void
2387 tune_modexact_1_odd (void)
2388 {
2389   static struct param_t  param;
2390   mp_size_t  thresh_lt, thresh_ge, average;
2391 
2392 #if 0
2393   /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
2394      of a full mpn_mod_1.  */
2395   if (HAVE_NATIVE_mpn_modexact_1_odd)
2396     {
2397       print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
2398       return;
2399     }
2400 #endif
2401 
2402   param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
2403   param.check_size = 256;
2404   param.min_size = 2;
2405   param.stop_factor = 1.5;
2406   param.function  = speed_mpn_modexact_1c_odd;
2407   param.function2 = speed_mpn_mod_1_tune;
2408   param.noprint = 1;
2409   s.r = randlimb_half () | 1;
2410 
2411   print_define_start (param.name);
2412 
2413   param.data_high = DATA_HIGH_LT_R;
2414   one (&thresh_lt, &param);
2415   if (option_trace)
2416     printf ("lt thresh %ld\n", (long) thresh_lt);
2417 
2418   average = thresh_lt;
2419   if (thresh_lt != MP_SIZE_T_MAX)
2420     {
2421       param.data_high = DATA_HIGH_GE_R;
2422       one (&thresh_ge, &param);
2423       if (option_trace)
2424         printf ("ge thresh %ld\n", (long) thresh_ge);
2425 
2426       if (thresh_ge != MP_SIZE_T_MAX)
2427         {
2428           average = (thresh_ge + thresh_lt) / 2;
2429           if (thresh_ge <= 3)
2430             average = 0;
2431         }
2432     }
2433 
2434   print_define_end (param.name, average);
2435 }
2436 
2437 
2438 void
2439 tune_jacobi_base (void)
2440 {
2441   static struct param_t  param;
2442   double   t1, t2, t3, t4;
2443   int      method;
2444 
2445   s.size = GMP_LIMB_BITS * 3 / 4;
2446 
2447   t1 = tuneup_measure (speed_mpn_jacobi_base_1, &param, &s);
2448   if (option_trace >= 1)
2449     printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1);
2450 
2451   t2 = tuneup_measure (speed_mpn_jacobi_base_2, &param, &s);
2452   if (option_trace >= 1)
2453     printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2);
2454 
2455   t3 = tuneup_measure (speed_mpn_jacobi_base_3, &param, &s);
2456   if (option_trace >= 1)
2457     printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3);
2458 
2459   t4 = tuneup_measure (speed_mpn_jacobi_base_4, &param, &s);
2460   if (option_trace >= 1)
2461     printf ("size=%ld, mpn_jacobi_base_4 %.9f\n", (long) s.size, t4);
2462 
2463   if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0 || t4 == -1.0)
2464     {
2465       printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
2466               (long) s.size);
2467       abort ();
2468     }
2469 
2470   if (t1 < t2 && t1 < t3 && t1 < t4)
2471     method = 1;
2472   else if (t2 < t3 && t2 < t4)
2473     method = 2;
2474   else if (t3 < t4)
2475     method = 3;
2476   else
2477     method = 4;
2478 
2479   print_define ("JACOBI_BASE_METHOD", method);
2480 }
2481 
2482 
2483 void
2484 tune_get_str (void)
2485 {
2486   /* Tune for decimal, it being most common.  Some rough testing suggests
2487      other bases are different, but not by very much.  */
2488   s.r = 10;
2489   {
2490     static struct param_t  param;
2491     GET_STR_PRECOMPUTE_THRESHOLD = 0;
2492     param.name = "GET_STR_DC_THRESHOLD";
2493     param.function = speed_mpn_get_str;
2494     param.min_size = 4;
2495     param.max_size = GET_STR_THRESHOLD_LIMIT;
2496     one (&get_str_dc_threshold, &param);
2497   }
2498   {
2499     static struct param_t  param;
2500     param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
2501     param.function = speed_mpn_get_str;
2502     param.min_size = GET_STR_DC_THRESHOLD;
2503     param.max_size = GET_STR_THRESHOLD_LIMIT;
2504     one (&get_str_precompute_threshold, &param);
2505   }
2506 }
2507 
2508 
2509 double
2510 speed_mpn_pre_set_str (struct speed_params *s)
2511 {
2512   unsigned char *str;
2513   mp_ptr     wp;
2514   mp_size_t  wn;
2515   unsigned   i;
2516   int        base;
2517   double     t;
2518   mp_ptr powtab_mem, tp;
2519   powers_t powtab[GMP_LIMB_BITS];
2520   mp_size_t un;
2521   int chars_per_limb;
2522   TMP_DECL;
2523 
2524   SPEED_RESTRICT_COND (s->size >= 1);
2525 
2526   base = s->r == 0 ? 10 : s->r;
2527   SPEED_RESTRICT_COND (base >= 2 && base <= 256);
2528 
2529   TMP_MARK;
2530 
2531   str = TMP_ALLOC (s->size);
2532   for (i = 0; i < s->size; i++)
2533     str[i] = s->xp[i] % base;
2534 
2535   LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);
2536   SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
2537 
2538   /* use this during development to check wn is big enough */
2539   /*
2540   ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
2541   */
2542 
2543   speed_operand_src (s, (mp_ptr) str, s->size/BYTES_PER_MP_LIMB);
2544   speed_operand_dst (s, wp, wn);
2545   speed_cache_fill (s);
2546 
2547   chars_per_limb = mp_bases[base].chars_per_limb;
2548   un = s->size / chars_per_limb + 1;
2549   powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un));
2550   mpn_set_str_compute_powtab (powtab, powtab_mem, un, base);
2551   tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
2552 
2553   speed_starttime ();
2554   i = s->reps;
2555   do
2556     {
2557       mpn_pre_set_str (wp, str, s->size, powtab, tp);
2558     }
2559   while (--i != 0);
2560   t = speed_endtime ();
2561 
2562   TMP_FREE;
2563   return t;
2564 }
2565 
2566 void
2567 tune_set_str (void)
2568 {
2569   s.r = 10;  /* decimal */
2570   {
2571     static struct param_t  param;
2572     SET_STR_PRECOMPUTE_THRESHOLD = 0;
2573     param.step_factor = 0.01;
2574     param.name = "SET_STR_DC_THRESHOLD";
2575     param.function = speed_mpn_pre_set_str;
2576     param.min_size = 100;
2577     param.max_size = 50000;
2578     one (&set_str_dc_threshold, &param);
2579   }
2580   {
2581     static struct param_t  param;
2582     param.step_factor = 0.02;
2583     param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
2584     param.function = speed_mpn_set_str;
2585     param.min_size = SET_STR_DC_THRESHOLD;
2586     param.max_size = 100000;
2587     one (&set_str_precompute_threshold, &param);
2588   }
2589 }
2590 
2591 
2592 void
2593 tune_fft_mul (void)
2594 {
2595   static struct fft_param_t  param;
2596 
2597   if (option_fft_max_size == 0)
2598     return;
2599 
2600   param.table_name          = "MUL_FFT_TABLE3";
2601   param.threshold_name      = "MUL_FFT_THRESHOLD";
2602   param.p_threshold         = &mul_fft_threshold;
2603   param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
2604   param.p_modf_threshold    = &mul_fft_modf_threshold;
2605   param.first_size          = MUL_TOOM33_THRESHOLD / 2;
2606   param.max_size            = option_fft_max_size;
2607   param.function            = speed_mpn_fft_mul;
2608   param.mul_modf_function   = speed_mpn_mul_fft;
2609   param.mul_function        = speed_mpn_mul_n;
2610   param.sqr = 0;
2611   fft (&param);
2612 }
2613 
2614 
2615 void
2616 tune_fft_sqr (void)
2617 {
2618   static struct fft_param_t  param;
2619 
2620   if (option_fft_max_size == 0)
2621     return;
2622 
2623   param.table_name          = "SQR_FFT_TABLE3";
2624   param.threshold_name      = "SQR_FFT_THRESHOLD";
2625   param.p_threshold         = &sqr_fft_threshold;
2626   param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
2627   param.p_modf_threshold    = &sqr_fft_modf_threshold;
2628   param.first_size          = SQR_TOOM3_THRESHOLD / 2;
2629   param.max_size            = option_fft_max_size;
2630   param.function            = speed_mpn_fft_sqr;
2631   param.mul_modf_function   = speed_mpn_mul_fft_sqr;
2632   param.mul_function        = speed_mpn_sqr;
2633   param.sqr = 1;
2634   fft (&param);
2635 }
2636 
2637 void
2638 tune_fac_ui (void)
2639 {
2640   static struct param_t  param;
2641 
2642   param.function = speed_mpz_fac_ui_tune;
2643 
2644   param.name = "FAC_DSC_THRESHOLD";
2645   param.min_size = 70;
2646   param.max_size = FAC_DSC_THRESHOLD_LIMIT;
2647   one (&fac_dsc_threshold, &param);
2648 
2649   param.name = "FAC_ODD_THRESHOLD";
2650   param.min_size = 22;
2651   param.stop_factor = 1.7;
2652   param.min_is_always = 1;
2653   one (&fac_odd_threshold, &param);
2654 }
2655 
2656 void
2657 all (void)
2658 {
2659   time_t  start_time, end_time;
2660   TMP_DECL;
2661 
2662   TMP_MARK;
2663   SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
2664   SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
2665 
2666   mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
2667   mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
2668 
2669   fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
2670 
2671   speed_time_init ();
2672   fprintf (stderr, "Using: %s\n", speed_time_string);
2673 
2674   fprintf (stderr, "speed_precision %d", speed_precision);
2675   if (speed_unittime == 1.0)
2676     fprintf (stderr, ", speed_unittime 1 cycle");
2677   else
2678     fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
2679   if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
2680     fprintf (stderr, ", CPU freq unknown\n");
2681   else
2682     fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
2683 
2684   fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
2685            DEFAULT_MAX_SIZE, (long) option_fft_max_size);
2686   fprintf (stderr, "\n");
2687 
2688   time (&start_time);
2689   {
2690     struct tm  *tp;
2691     tp = localtime (&start_time);
2692     printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
2693             tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
2694 
2695 #ifdef __GNUC__
2696     /* gcc sub-minor version doesn't seem to come through as a define */
2697     printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
2698 #define PRINTED_COMPILER
2699 #endif
2700 #if defined (__SUNPRO_C)
2701     printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
2702 #define PRINTED_COMPILER
2703 #endif
2704 #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
2705     /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
2706     printf ("MIPSpro C %d.%d.%d */\n",
2707 	    _COMPILER_VERSION / 100,
2708 	    _COMPILER_VERSION / 10 % 10,
2709 	    _COMPILER_VERSION % 10);
2710 #define PRINTED_COMPILER
2711 #endif
2712 #if defined (__DECC) && defined (__DECC_VER)
2713     printf ("DEC C %d */\n", __DECC_VER);
2714 #define PRINTED_COMPILER
2715 #endif
2716 #if ! defined (PRINTED_COMPILER)
2717     printf ("system compiler */\n");
2718 #endif
2719   }
2720   printf ("\n");
2721 
2722   tune_divrem_1 ();
2723   tune_mod_1 ();
2724   tune_preinv_divrem_1 ();
2725 #if 0
2726   tune_divrem_2 ();
2727 #endif
2728   tune_div_qr_2 ();
2729   tune_divexact_1 ();
2730   tune_modexact_1_odd ();
2731   printf("\n");
2732 
2733   tune_mul_n ();
2734   printf("\n");
2735 
2736   tune_mul ();
2737   printf("\n");
2738 
2739   tune_sqr ();
2740   printf("\n");
2741 
2742   tune_mulmid ();
2743   printf("\n");
2744 
2745   tune_mulmod_bnm1 ();
2746   tune_sqrmod_bnm1 ();
2747   printf("\n");
2748 
2749   tune_fft_mul ();
2750   printf("\n");
2751 
2752   tune_fft_sqr ();
2753   printf ("\n");
2754 
2755   tune_mullo ();
2756   printf("\n");
2757 
2758   tune_dc_div ();
2759   tune_dc_bdiv ();
2760 
2761   printf("\n");
2762   tune_invertappr ();
2763   tune_invert ();
2764   printf("\n");
2765 
2766   tune_binvert ();
2767   tune_redc ();
2768   printf("\n");
2769 
2770   tune_mu_div ();
2771   tune_mu_bdiv ();
2772   printf("\n");
2773 
2774   tune_powm_sec ();
2775   printf("\n");
2776 
2777   tune_matrix22_mul ();
2778   tune_hgcd ();
2779   tune_hgcd_appr ();
2780   tune_hgcd_reduce();
2781   tune_gcd_dc ();
2782   tune_gcdext_dc ();
2783   tune_jacobi_base ();
2784   printf("\n");
2785 
2786   tune_get_str ();
2787   tune_set_str ();
2788   printf("\n");
2789 
2790   tune_fac_ui ();
2791   printf("\n");
2792 
2793   time (&end_time);
2794   printf ("/* Tuneup completed successfully, took %ld seconds */\n",
2795           (long) (end_time - start_time));
2796 
2797   TMP_FREE;
2798 }
2799 
2800 
2801 int
2802 main (int argc, char *argv[])
2803 {
2804   int  opt;
2805 
2806   /* Unbuffered so if output is redirected to a file it isn't lost if the
2807      program is killed part way through.  */
2808   setbuf (stdout, NULL);
2809   setbuf (stderr, NULL);
2810 
2811   while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
2812     {
2813       switch (opt) {
2814       case 'f':
2815         if (optarg[0] == 't')
2816           option_fft_trace = 2;
2817         else
2818           option_fft_max_size = atol (optarg);
2819         break;
2820       case 'o':
2821         speed_option_set (optarg);
2822         break;
2823       case 'p':
2824         speed_precision = atoi (optarg);
2825         break;
2826       case 't':
2827         option_trace++;
2828         break;
2829       case '?':
2830         exit(1);
2831       }
2832     }
2833 
2834   all ();
2835   exit (0);
2836 }
2837