xref: /netbsd-src/external/lgpl3/gmp/dist/tune/speed.c (revision 1daf83e636cd998f45e5597a8f995a540e2d5b4a)
1 /* Speed measuring program.
2 
3 Copyright 1999-2003, 2005, 2006, 2008-2019 Free Software Foundation, Inc.
4 
5 This file is part of the GNU MP Library.
6 
7 The GNU MP Library is free software; you can redistribute it and/or modify
8 it under the terms of either:
9 
10   * the GNU Lesser General Public License as published by the Free
11     Software Foundation; either version 3 of the License, or (at your
12     option) any later version.
13 
14 or
15 
16   * the GNU General Public License as published by the Free Software
17     Foundation; either version 2 of the License, or (at your option) any
18     later version.
19 
20 or both in parallel, as here.
21 
22 The GNU MP Library is distributed in the hope that it will be useful, but
23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25 for more details.
26 
27 You should have received copies of the GNU General Public License and the
28 GNU Lesser General Public License along with the GNU MP Library.  If not,
29 see https://www.gnu.org/licenses/.  */
30 
31 /* Usage message is in the code below, run with no arguments to print it.
32    See README for interesting applications.
33 
34    To add a new routine foo(), create a speed_foo() function in the style of
35    the existing ones and add an entry in the routine[] array.  Put FLAG_R if
36    speed_foo() wants an "r" parameter.
37 
38    The routines don't have help messages or descriptions, but most have
39    suggestive names.  See the source code for full details. */
40 
41 #include "config.h"
42 
43 #include <limits.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 
48 #if HAVE_UNISTD_H
49 #include <unistd.h>  /* for getpid, R_OK */
50 #endif
51 
52 #if TIME_WITH_SYS_TIME
53 # include <sys/time.h>  /* for struct timeval */
54 # include <time.h>
55 #else
56 # if HAVE_SYS_TIME_H
57 #  include <sys/time.h>
58 # else
59 #  include <time.h>
60 # endif
61 #endif
62 
63 #if HAVE_SYS_RESOURCE_H
64 #include <sys/resource.h>  /* for getrusage() */
65 #endif
66 
67 
68 #include "gmp-impl.h"
69 #include "longlong.h"  /* for the benefit of speed-many.c */
70 #include "tests.h"
71 #include "speed.h"
72 
73 
74 #if !HAVE_DECL_OPTARG
75 extern char *optarg;
76 extern int optind, opterr;
77 #endif
78 
79 #if !HAVE_STRTOUL
80 #define strtoul(p,e,b)  (unsigned long) strtol(p,e,b)
81 #endif
82 
83 #ifdef SPEED_EXTRA_PROTOS
84 SPEED_EXTRA_PROTOS
85 #endif
86 #ifdef SPEED_EXTRA_PROTOS2
87 SPEED_EXTRA_PROTOS2
88 #endif
89 
90 
91 #if GMP_LIMB_BITS == 32
92 #define GMP_NUMB_0xAA  (CNST_LIMB(0xAAAAAAAA) & GMP_NUMB_MASK)
93 #endif
94 #if GMP_LIMB_BITS == 64
95 #define GMP_NUMB_0xAA  (CNST_LIMB(0xAAAAAAAAAAAAAAAA) & GMP_NUMB_MASK)
96 #endif
97 
98 
/* How each routine's time is presented.  As applied in run_one():
   CMP_RATIO divides every routine after the first by the first routine's
   time, CMP_DIFFERENCE subtracts the first routine's time from the others,
   and CMP_DIFFPREV subtracts the same routine's time at the previous
   size.  CMP_ABSOLUTE applies none of these.  */
#define CMP_ABSOLUTE     1
#define CMP_RATIO        2
#define CMP_DIFFERENCE   3
#define CMP_DIFFPREV     4
int  option_cmp = CMP_ABSOLUTE;

/* Units of the printed times.  run_one() divides by speed_cycletime for
   UNIT_CYCLES, and additionally by SIZE_TO_DIVISOR of the size for
   UNIT_CYCLESPERLIMB.  */
#define UNIT_SECONDS        1
#define UNIT_CYCLES         2
#define UNIT_CYCLESPERLIMB  3
int  option_unit = UNIT_SECONDS;

/* Operand pattern produced by data_fill(): mpn_random, mpn_random2, all
   zero limbs, limbs of GMP_NUMB_0xAA, limbs of GMP_NUMB_MAX, or limbs of
   GMP_NUMB_MAX with 2 subtracted from the lowest limb.  */
#define DATA_RANDOM   1
#define DATA_RANDOM2  2
#define DATA_ZEROS    3
#define DATA_AAS      4
#define DATA_FFS      5
#define DATA_2FD      6
int  option_data = DATA_RANDOM;

/* Selects the per-limb divisor in SIZE_TO_DIVISOR: 1 gives n*n, 2 gives
   n*(n+1)/2, anything else gives n.  */
int        option_square = 0;
/* When non-zero, run_all() advances the size by (size*factor - size)
   rather than by 1, subject to the option_step minimum.  */
double     option_factor = 0.0;
/* Smallest size increment run_all() will take.  */
mp_size_t  option_step = 1;
/* Non-zero makes run_one() write plain numeric columns instead of the
   free-form table; the basename is used by run_gnuplot() to build the
   ".gnuplot" and ".data" file names.  */
int        option_gnuplot = 0;
char      *option_gnuplot_basename;
/* Size ranges measured by run_all(): for each of the size_num entries it
   starts at .start and stops once the size exceeds .end.  */
struct size_array_t {
  mp_size_t start, end;
} *size_array = NULL;
mp_size_t  size_num = 0;
mp_size_t  size_allocnum = 0;  /* presumably the allocated capacity of size_array -- TODO confirm */
int        option_resource_usage = 0;
long       option_seed = 123456789;

/* Measurement parameters; run_all() sets .size and passes &sp to
   run_one().  */
struct speed_params  sp;

#define COLUMN_WIDTH  13  /* for the free-form output */

/* Bits for routine_t.flag.  */
#define FLAG_R            (1<<0)  /* require ".r" */
#define FLAG_R_OPTIONAL   (1<<1)  /* optional ".r" */
#define FLAG_RSIZE        (1<<2)
#define FLAG_NODATA       (1<<3)  /* don't alloc xp, yp */
139 
/* Table of measurable routines.  Each entry gives the routine's name, the
   speed_xxx() function that run_one() hands to speed_measure(), and an
   optional set of FLAG_* bits.  Entries written without a third
   initializer get flag == 0.  Entries inside #if blocks exist only when
   the corresponding configuration macro is set.  */
const struct routine_t {
  /* constants */
  const char        *name;   /* routine name as a string */
  speed_function_t  fun;     /* measuring function */
  int               flag;    /* FLAG_* bits, 0 if none */
} routine[] = {

  { "noop",              speed_noop                 },
  { "noop_wxs",          speed_noop_wxs             },
  { "noop_wxys",         speed_noop_wxys            },

  { "mpn_add_n",         speed_mpn_add_n,     FLAG_R_OPTIONAL },
  { "mpn_sub_n",         speed_mpn_sub_n,     FLAG_R_OPTIONAL },
  { "mpn_add_1",         speed_mpn_add_1,     FLAG_R },
  { "mpn_add_1_inplace", speed_mpn_add_1_inplace, FLAG_R },
  { "mpn_sub_1",         speed_mpn_sub_1,     FLAG_R },
  { "mpn_sub_1_inplace", speed_mpn_sub_1_inplace, FLAG_R },

  { "mpn_add_err1_n",    speed_mpn_add_err1_n    },
  { "mpn_add_err2_n",    speed_mpn_add_err2_n    },
  { "mpn_add_err3_n",    speed_mpn_add_err3_n    },
  { "mpn_sub_err1_n",    speed_mpn_sub_err1_n    },
  { "mpn_sub_err2_n",    speed_mpn_sub_err2_n    },
  { "mpn_sub_err3_n",    speed_mpn_sub_err3_n    },

#if HAVE_NATIVE_mpn_add_n_sub_n
  { "mpn_add_n_sub_n",      speed_mpn_add_n_sub_n,     FLAG_R_OPTIONAL },
#endif

  { "mpn_addmul_1",      speed_mpn_addmul_1,  FLAG_R },
  { "mpn_submul_1",      speed_mpn_submul_1,  FLAG_R },
#if HAVE_NATIVE_mpn_addmul_2
  { "mpn_addmul_2",      speed_mpn_addmul_2,  FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_3
  { "mpn_addmul_3",      speed_mpn_addmul_3,  FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_4
  { "mpn_addmul_4",      speed_mpn_addmul_4,  FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_5
  { "mpn_addmul_5",      speed_mpn_addmul_5,  FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_6
  { "mpn_addmul_6",      speed_mpn_addmul_6,  FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_7
  { "mpn_addmul_7",      speed_mpn_addmul_7,  FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_8
  { "mpn_addmul_8",      speed_mpn_addmul_8,  FLAG_R_OPTIONAL },
#endif
  { "mpn_mul_1",         speed_mpn_mul_1,     FLAG_R },
  { "mpn_mul_1_inplace", speed_mpn_mul_1_inplace, FLAG_R },
#if HAVE_NATIVE_mpn_mul_2
  { "mpn_mul_2",         speed_mpn_mul_2,     FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_mul_3
  { "mpn_mul_3",         speed_mpn_mul_3,     FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_mul_4
  { "mpn_mul_4",         speed_mpn_mul_4,     FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_mul_5
  { "mpn_mul_5",         speed_mpn_mul_5,     FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_mul_6
  { "mpn_mul_6",         speed_mpn_mul_6,     FLAG_R_OPTIONAL },
#endif

  { "mpn_divrem_1",      speed_mpn_divrem_1,  FLAG_R },
  { "mpn_divrem_1f",     speed_mpn_divrem_1f, FLAG_R },
#if HAVE_NATIVE_mpn_divrem_1c
  { "mpn_divrem_1c",     speed_mpn_divrem_1c, FLAG_R },
  { "mpn_divrem_1cf",    speed_mpn_divrem_1cf,FLAG_R },
#endif
  { "mpn_mod_1",         speed_mpn_mod_1,     FLAG_R },
#if HAVE_NATIVE_mpn_mod_1c
  { "mpn_mod_1c",        speed_mpn_mod_1c,    FLAG_R },
#endif
  { "mpn_preinv_divrem_1",  speed_mpn_preinv_divrem_1,  FLAG_R },
  { "mpn_preinv_divrem_1f", speed_mpn_preinv_divrem_1f, FLAG_R },
  { "mpn_preinv_mod_1",  speed_mpn_preinv_mod_1, FLAG_R },

  { "mpn_mod_1_1",       speed_mpn_mod_1_1,       FLAG_R },
  { "mpn_mod_1_1_1",     speed_mpn_mod_1_1_1,     FLAG_R },
  { "mpn_mod_1_1_2",     speed_mpn_mod_1_1_2,     FLAG_R },
  { "mpn_mod_1s_2",      speed_mpn_mod_1_2,       FLAG_R },
  { "mpn_mod_1s_3",      speed_mpn_mod_1_3,       FLAG_R },
  { "mpn_mod_1s_4",      speed_mpn_mod_1_4,       FLAG_R },

  { "mpn_divrem_1_div",  speed_mpn_divrem_1_div,  FLAG_R },
  { "mpn_divrem_1_inv",  speed_mpn_divrem_1_inv,  FLAG_R },
  { "mpn_divrem_1f_div", speed_mpn_divrem_1f_div, FLAG_R },
  { "mpn_divrem_1f_inv", speed_mpn_divrem_1f_inv, FLAG_R },
  { "mpn_mod_1_div",     speed_mpn_mod_1_div,     FLAG_R },
  { "mpn_mod_1_inv",     speed_mpn_mod_1_inv,     FLAG_R },

  { "mpn_divrem_2",      speed_mpn_divrem_2,        },
  { "mpn_divrem_2_div",  speed_mpn_divrem_2_div,    },
  { "mpn_divrem_2_inv",  speed_mpn_divrem_2_inv,    },

  { "mpn_div_qr_1n_pi1", speed_mpn_div_qr_1n_pi1, FLAG_R  },
  { "mpn_div_qr_1n_pi1_1",speed_mpn_div_qr_1n_pi1_1, FLAG_R  },
  { "mpn_div_qr_1n_pi1_2",speed_mpn_div_qr_1n_pi1_2, FLAG_R  },
  { "mpn_div_qr_1",      speed_mpn_div_qr_1,      FLAG_R },

  { "mpn_div_qr_2n",     speed_mpn_div_qr_2n,       },
  { "mpn_div_qr_2u",     speed_mpn_div_qr_2u,       },

  { "mpn_divexact_1",    speed_mpn_divexact_1,    FLAG_R },
  { "mpn_divexact_by3",  speed_mpn_divexact_by3          },

  { "mpn_bdiv_q_1",      speed_mpn_bdiv_q_1,      FLAG_R },
  { "mpn_pi1_bdiv_q_1",  speed_mpn_pi1_bdiv_q_1,  FLAG_R_OPTIONAL },
  { "mpn_bdiv_dbm1c",    speed_mpn_bdiv_dbm1c,    FLAG_R_OPTIONAL },

#if HAVE_NATIVE_mpn_modexact_1_odd
  { "mpn_modexact_1_odd",  speed_mpn_modexact_1_odd,  FLAG_R },
#endif
  { "mpn_modexact_1c_odd", speed_mpn_modexact_1c_odd, FLAG_R },

#if GMP_NUMB_BITS % 4 == 0
  { "mpn_mod_34lsub1",   speed_mpn_mod_34lsub1 },
#endif

  { "mpn_lshift",        speed_mpn_lshift, FLAG_R   },
  { "mpn_lshiftc",       speed_mpn_lshiftc, FLAG_R   },
  { "mpn_rshift",        speed_mpn_rshift, FLAG_R   },

  { "mpn_and_n",         speed_mpn_and_n,  FLAG_R_OPTIONAL },
  { "mpn_andn_n",        speed_mpn_andn_n, FLAG_R_OPTIONAL },
  { "mpn_nand_n",        speed_mpn_nand_n, FLAG_R_OPTIONAL },
  { "mpn_ior_n",         speed_mpn_ior_n,  FLAG_R_OPTIONAL },
  { "mpn_iorn_n",        speed_mpn_iorn_n, FLAG_R_OPTIONAL },
  { "mpn_nior_n",        speed_mpn_nior_n, FLAG_R_OPTIONAL },
  { "mpn_xor_n",         speed_mpn_xor_n,  FLAG_R_OPTIONAL },
  { "mpn_xnor_n",        speed_mpn_xnor_n, FLAG_R_OPTIONAL },
  { "mpn_com",           speed_mpn_com              },
  { "mpn_neg",           speed_mpn_neg              },

  { "mpn_popcount",      speed_mpn_popcount         },
  { "mpn_hamdist",       speed_mpn_hamdist          },

  { "mpn_matrix22_mul",  speed_mpn_matrix22_mul     },

  { "mpn_hgcd2",         speed_mpn_hgcd2, FLAG_NODATA },
  { "mpn_hgcd2_1",       speed_mpn_hgcd2_1, FLAG_NODATA },
  { "mpn_hgcd2_2",       speed_mpn_hgcd2_2, FLAG_NODATA },
  { "mpn_hgcd2_3",       speed_mpn_hgcd2_3, FLAG_NODATA },
  { "mpn_hgcd2_4",       speed_mpn_hgcd2_4, FLAG_NODATA },
  { "mpn_hgcd2_5",       speed_mpn_hgcd2_5, FLAG_NODATA },
  { "mpn_hgcd",          speed_mpn_hgcd             },
  { "mpn_hgcd_lehmer",   speed_mpn_hgcd_lehmer      },
  { "mpn_hgcd_appr",     speed_mpn_hgcd_appr        },
  { "mpn_hgcd_appr_lehmer", speed_mpn_hgcd_appr_lehmer },

  { "mpn_hgcd_reduce",   speed_mpn_hgcd_reduce      },
  { "mpn_hgcd_reduce_1", speed_mpn_hgcd_reduce_1    },
  { "mpn_hgcd_reduce_2", speed_mpn_hgcd_reduce_2    },

  { "mpn_gcd_1",         speed_mpn_gcd_1,  FLAG_R_OPTIONAL },
  { "mpn_gcd_11",        speed_mpn_gcd_11, FLAG_R_OPTIONAL },
  { "mpn_gcd_1N",        speed_mpn_gcd_1N, FLAG_R_OPTIONAL },
  { "mpn_gcd_22",        speed_mpn_gcd_22, FLAG_R_OPTIONAL },

  { "mpn_gcd",           speed_mpn_gcd                    },

  { "mpn_gcdext",            speed_mpn_gcdext            },
  { "mpn_gcdext_single",     speed_mpn_gcdext_single     },
  { "mpn_gcdext_double",     speed_mpn_gcdext_double     },
  { "mpn_gcdext_one_single", speed_mpn_gcdext_one_single },
  { "mpn_gcdext_one_double", speed_mpn_gcdext_one_double },
#if 0
  { "mpn_gcdext_lehmer",     speed_mpn_gcdext_lehmer     },
#endif

  { "mpz_nextprime",     speed_mpz_nextprime        },

  { "mpz_jacobi",        speed_mpz_jacobi           },
  { "mpn_jacobi_base",   speed_mpn_jacobi_base      },
  { "mpn_jacobi_base_1", speed_mpn_jacobi_base_1    },
  { "mpn_jacobi_base_2", speed_mpn_jacobi_base_2    },
  { "mpn_jacobi_base_3", speed_mpn_jacobi_base_3    },
  { "mpn_jacobi_base_4", speed_mpn_jacobi_base_4    },

  { "mpn_mul",           speed_mpn_mul,         FLAG_R_OPTIONAL },
  { "mpn_mul_basecase",  speed_mpn_mul_basecase,FLAG_R_OPTIONAL },
  { "mpn_sqr_basecase",  speed_mpn_sqr_basecase     },
#if HAVE_NATIVE_mpn_sqr_diagonal
  { "mpn_sqr_diagonal",  speed_mpn_sqr_diagonal     },
#endif
#if HAVE_NATIVE_mpn_sqr_diag_addlsh1
  { "mpn_sqr_diag_addlsh1", speed_mpn_sqr_diag_addlsh1 },
#endif

  { "mpn_mul_n",         speed_mpn_mul_n            },
  { "mpn_sqr",           speed_mpn_sqr              },

  { "mpn_toom2_sqr",     speed_mpn_toom2_sqr        },
  { "mpn_toom3_sqr",     speed_mpn_toom3_sqr        },
  { "mpn_toom4_sqr",     speed_mpn_toom4_sqr        },
  { "mpn_toom6_sqr",     speed_mpn_toom6_sqr        },
  { "mpn_toom8_sqr",     speed_mpn_toom8_sqr        },
  { "mpn_toom22_mul",    speed_mpn_toom22_mul       },
  { "mpn_toom33_mul",    speed_mpn_toom33_mul       },
  { "mpn_toom44_mul",    speed_mpn_toom44_mul       },
  { "mpn_toom6h_mul",    speed_mpn_toom6h_mul       },
  { "mpn_toom8h_mul",    speed_mpn_toom8h_mul       },
  { "mpn_toom32_mul",    speed_mpn_toom32_mul       },
  { "mpn_toom42_mul",    speed_mpn_toom42_mul       },
  { "mpn_toom43_mul",    speed_mpn_toom43_mul       },
  { "mpn_toom63_mul",    speed_mpn_toom63_mul       },
  { "mpn_nussbaumer_mul",    speed_mpn_nussbaumer_mul    },
  { "mpn_nussbaumer_mul_sqr",speed_mpn_nussbaumer_mul_sqr},
#if WANT_OLD_FFT_FULL
  { "mpn_mul_fft_full",      speed_mpn_mul_fft_full      },
  { "mpn_mul_fft_full_sqr",  speed_mpn_mul_fft_full_sqr  },
#endif
  { "mpn_mul_fft",       speed_mpn_mul_fft,     FLAG_R_OPTIONAL },
  { "mpn_mul_fft_sqr",   speed_mpn_mul_fft_sqr, FLAG_R_OPTIONAL },

  { "mpn_sqrlo",          speed_mpn_sqrlo           },
  { "mpn_sqrlo_basecase", speed_mpn_sqrlo_basecase  },
  { "mpn_mullo_n",        speed_mpn_mullo_n         },
  { "mpn_mullo_basecase", speed_mpn_mullo_basecase  },

  { "mpn_mulmid_basecase",  speed_mpn_mulmid_basecase, FLAG_R_OPTIONAL },
  { "mpn_toom42_mulmid",    speed_mpn_toom42_mulmid },
  { "mpn_mulmid_n",         speed_mpn_mulmid_n },
  { "mpn_mulmid",           speed_mpn_mulmid, FLAG_R_OPTIONAL },

  { "mpn_bc_mulmod_bnm1",      speed_mpn_bc_mulmod_bnm1      },
  { "mpn_mulmod_bnm1",         speed_mpn_mulmod_bnm1         },
  { "mpn_mulmod_bnm1_rounded", speed_mpn_mulmod_bnm1_rounded },
  { "mpn_sqrmod_bnm1",         speed_mpn_sqrmod_bnm1         },

  { "mpn_invert",              speed_mpn_invert              },
  { "mpn_invertappr",          speed_mpn_invertappr          },
  { "mpn_ni_invertappr",       speed_mpn_ni_invertappr       },
  { "mpn_binvert",             speed_mpn_binvert             },
  { "mpn_sec_invert",          speed_mpn_sec_invert          },

  { "mpn_sbpi1_div_qr",        speed_mpn_sbpi1_div_qr,    FLAG_R_OPTIONAL},
  { "mpn_dcpi1_div_qr",        speed_mpn_dcpi1_div_qr,    FLAG_R_OPTIONAL},
  { "mpn_mu_div_qr",           speed_mpn_mu_div_qr,       FLAG_R_OPTIONAL},
  { "mpn_mupi_div_qr",         speed_mpn_mupi_div_qr,     FLAG_R_OPTIONAL},
  { "mpn_sbpi1_divappr_q",     speed_mpn_sbpi1_divappr_q, FLAG_R_OPTIONAL},
  { "mpn_dcpi1_divappr_q",     speed_mpn_dcpi1_divappr_q, FLAG_R_OPTIONAL},

  { "mpn_sbpi1_bdiv_qr",       speed_mpn_sbpi1_bdiv_qr       },
  { "mpn_dcpi1_bdiv_qr",       speed_mpn_dcpi1_bdiv_qr       },
  { "mpn_sbpi1_bdiv_q",        speed_mpn_sbpi1_bdiv_q        },
  { "mpn_dcpi1_bdiv_q",        speed_mpn_dcpi1_bdiv_q        },
  { "mpn_sbpi1_bdiv_r",        speed_mpn_sbpi1_bdiv_r        },

  { "mpn_broot",               speed_mpn_broot,    FLAG_R },
  { "mpn_broot_invm1",         speed_mpn_broot_invm1, FLAG_R },
  { "mpn_brootinv",            speed_mpn_brootinv, FLAG_R },

  { "mpn_get_str",          speed_mpn_get_str,     FLAG_R_OPTIONAL },
  { "mpn_set_str",          speed_mpn_set_str,     FLAG_R_OPTIONAL },
  { "mpn_set_str_basecase", speed_mpn_bc_set_str,  FLAG_R_OPTIONAL },

  { "mpn_sqrtrem",       speed_mpn_sqrtrem          },
  { "mpn_rootrem",       speed_mpn_rootrem, FLAG_R  },
  { "mpn_sqrt",          speed_mpn_sqrt             },
  { "mpn_root",          speed_mpn_root, FLAG_R     },

  { "mpn_perfect_power_p",  speed_mpn_perfect_power_p,       },
  { "mpn_perfect_square_p", speed_mpn_perfect_square_p,      },

  { "mpn_fib2_ui",       speed_mpn_fib2_ui,    FLAG_NODATA },
  { "mpz_fib_ui",        speed_mpz_fib_ui,     FLAG_NODATA },
  { "mpz_fib2_ui",       speed_mpz_fib2_ui,    FLAG_NODATA },
  { "mpz_lucnum_ui",     speed_mpz_lucnum_ui,  FLAG_NODATA },
  { "mpz_lucnum2_ui",    speed_mpz_lucnum2_ui, FLAG_NODATA },

  { "mpz_add",           speed_mpz_add              },
  { "mpz_invert",        speed_mpz_invert,   FLAG_R_OPTIONAL },
  { "mpz_bin_uiui",      speed_mpz_bin_uiui, FLAG_NODATA | FLAG_R_OPTIONAL },
  { "mpz_bin_ui",        speed_mpz_bin_ui,   FLAG_NODATA | FLAG_R_OPTIONAL },
  { "mpz_fac_ui",        speed_mpz_fac_ui,   FLAG_NODATA   },
  { "mpz_2fac_ui",       speed_mpz_2fac_ui,  FLAG_NODATA   },
  { "mpz_mfac_uiui",     speed_mpz_mfac_uiui,  FLAG_NODATA | FLAG_R_OPTIONAL },
  { "mpz_primorial_ui",  speed_mpz_primorial_ui, FLAG_NODATA },
  { "mpz_powm",          speed_mpz_powm,     FLAG_R_OPTIONAL },
  { "mpz_powm_mod",      speed_mpz_powm_mod         },
  { "mpz_powm_redc",     speed_mpz_powm_redc        },
  { "mpz_powm_sec",      speed_mpz_powm_sec        },
  { "mpz_powm_ui",       speed_mpz_powm_ui,  FLAG_R_OPTIONAL },

  { "mpz_mod",           speed_mpz_mod              },
  { "mpn_redc_1",        speed_mpn_redc_1           },
  { "mpn_redc_2",        speed_mpn_redc_2           },
  { "mpn_redc_n",        speed_mpn_redc_n           },

  { "MPN_COPY",          speed_MPN_COPY             },
  { "MPN_COPY_INCR",     speed_MPN_COPY_INCR        },
  { "MPN_COPY_DECR",     speed_MPN_COPY_DECR        },
  { "memcpy",            speed_memcpy               },
#if HAVE_NATIVE_mpn_copyi
  { "mpn_copyi",         speed_mpn_copyi            },
#endif
#if HAVE_NATIVE_mpn_copyd
  { "mpn_copyd",         speed_mpn_copyd            },
#endif
  { "mpn_sec_tabselect", speed_mpn_sec_tabselect, FLAG_R_OPTIONAL },
#if HAVE_NATIVE_mpn_addlsh1_n == 1
  { "mpn_addlsh1_n",     speed_mpn_addlsh1_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_sublsh1_n == 1
  { "mpn_sublsh1_n",     speed_mpn_sublsh1_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
  { "mpn_addlsh1_n_ip1", speed_mpn_addlsh1_n_ip1    },
#endif
#if HAVE_NATIVE_mpn_addlsh1_n_ip2
  { "mpn_addlsh1_n_ip2", speed_mpn_addlsh1_n_ip2    },
#endif
#if HAVE_NATIVE_mpn_sublsh1_n_ip1
  { "mpn_sublsh1_n_ip1", speed_mpn_sublsh1_n_ip1    },
#endif
#if HAVE_NATIVE_mpn_rsblsh1_n == 1
  { "mpn_rsblsh1_n",     speed_mpn_rsblsh1_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh2_n == 1
  { "mpn_addlsh2_n",     speed_mpn_addlsh2_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_sublsh2_n == 1
  { "mpn_sublsh2_n",     speed_mpn_sublsh2_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh2_n_ip1
  { "mpn_addlsh2_n_ip1", speed_mpn_addlsh2_n_ip1    },
#endif
#if HAVE_NATIVE_mpn_addlsh2_n_ip2
  { "mpn_addlsh2_n_ip2", speed_mpn_addlsh2_n_ip2    },
#endif
#if HAVE_NATIVE_mpn_sublsh2_n_ip1
  { "mpn_sublsh2_n_ip1", speed_mpn_sublsh2_n_ip1    },
#endif
#if HAVE_NATIVE_mpn_rsblsh2_n == 1
  { "mpn_rsblsh2_n",     speed_mpn_rsblsh2_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh_n
  { "mpn_addlsh_n",     speed_mpn_addlsh_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_sublsh_n
  { "mpn_sublsh_n",     speed_mpn_sublsh_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh_n_ip1
  { "mpn_addlsh_n_ip1", speed_mpn_addlsh_n_ip1    },
#endif
#if HAVE_NATIVE_mpn_addlsh_n_ip2
  { "mpn_addlsh_n_ip2", speed_mpn_addlsh_n_ip2    },
#endif
#if HAVE_NATIVE_mpn_sublsh_n_ip1
  { "mpn_sublsh_n_ip1", speed_mpn_sublsh_n_ip1    },
#endif
#if HAVE_NATIVE_mpn_rsblsh_n
  { "mpn_rsblsh_n",     speed_mpn_rsblsh_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_rsh1add_n
  { "mpn_rsh1add_n",     speed_mpn_rsh1add_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_rsh1sub_n
  { "mpn_rsh1sub_n",     speed_mpn_rsh1sub_n, FLAG_R_OPTIONAL },
#endif

  { "mpn_cnd_add_n",     speed_mpn_cnd_add_n, FLAG_R_OPTIONAL },
  { "mpn_cnd_sub_n",     speed_mpn_cnd_sub_n, FLAG_R_OPTIONAL },

  { "MPN_ZERO",          speed_MPN_ZERO             },

  { "binvert_limb",       speed_binvert_limb,       FLAG_NODATA },
  { "binvert_limb_mul1",  speed_binvert_limb_mul1,  FLAG_NODATA },
  { "binvert_limb_loop",  speed_binvert_limb_loop,  FLAG_NODATA },
  { "binvert_limb_cond",  speed_binvert_limb_cond,  FLAG_NODATA },
  { "binvert_limb_arith", speed_binvert_limb_arith, FLAG_NODATA },

  { "malloc_free",                  speed_malloc_free                  },
  { "malloc_realloc_free",          speed_malloc_realloc_free          },
  { "gmp_allocate_free",            speed_gmp_allocate_free            },
  { "gmp_allocate_reallocate_free", speed_gmp_allocate_reallocate_free },
  { "mpz_init_clear",               speed_mpz_init_clear               },
  { "mpq_init_clear",               speed_mpq_init_clear               },
  { "mpf_init_clear",               speed_mpf_init_clear               },
  { "mpz_init_realloc_clear",       speed_mpz_init_realloc_clear       },

  { "umul_ppmm",         speed_umul_ppmm,     FLAG_R_OPTIONAL },
#if HAVE_NATIVE_mpn_umul_ppmm
  { "mpn_umul_ppmm",     speed_mpn_umul_ppmm, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_umul_ppmm_r
  { "mpn_umul_ppmm_r",   speed_mpn_umul_ppmm_r, FLAG_R_OPTIONAL },
#endif

  { "count_leading_zeros",  speed_count_leading_zeros,  FLAG_NODATA | FLAG_R_OPTIONAL },
  { "count_trailing_zeros", speed_count_trailing_zeros, FLAG_NODATA | FLAG_R_OPTIONAL },

  { "udiv_qrnnd",             speed_udiv_qrnnd,             FLAG_R_OPTIONAL },
  { "udiv_qrnnd_c",           speed_udiv_qrnnd_c,           FLAG_R_OPTIONAL },
#if HAVE_NATIVE_mpn_udiv_qrnnd
  { "mpn_udiv_qrnnd",         speed_mpn_udiv_qrnnd,         FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_udiv_qrnnd_r
  { "mpn_udiv_qrnnd_r",       speed_mpn_udiv_qrnnd_r,       FLAG_R_OPTIONAL },
#endif
  { "invert_limb",            speed_invert_limb,            FLAG_R_OPTIONAL },

  { "operator_div",           speed_operator_div,           FLAG_R_OPTIONAL },
  { "operator_mod",           speed_operator_mod,           FLAG_R_OPTIONAL },

  { "gmp_randseed",    speed_gmp_randseed,    FLAG_R_OPTIONAL               },
  { "gmp_randseed_ui", speed_gmp_randseed_ui, FLAG_R_OPTIONAL | FLAG_NODATA },
  { "mpz_urandomb",    speed_mpz_urandomb,    FLAG_R_OPTIONAL | FLAG_NODATA },

#ifdef SPEED_EXTRA_ROUTINES
  SPEED_EXTRA_ROUTINES
#endif
#ifdef SPEED_EXTRA_ROUTINES2
  SPEED_EXTRA_ROUTINES2
#endif
};
564 
565 
/* One routine selected for measurement, with its latest result.  run_one()
   fills in time, no_time and prev_time for each size.  */
struct choice_t {
  const struct routine_t  *p;         /* entry in routine[] being measured */
  mp_limb_t               r;          /* copied to speed_params.r before measuring */
  double                  scale;      /* multiplier applied to the measured time */
  double                  time;       /* result for the current size, after the
                                         option_cmp / option_unit adjustments */
  int                     no_time;    /* non-zero when there is no result to
                                         show (printed as "n/a") */
  double                  prev_time;  /* scaled time at the previous size,
                                         before any differences; -1.0 if none */
  const char              *name;
};
struct choice_t  *choice;   /* array of num_choices entries */
int  num_choices = 0;
577 
578 
579 void
data_fill(mp_ptr ptr,mp_size_t size)580 data_fill (mp_ptr ptr, mp_size_t size)
581 {
582   switch (option_data) {
583   case DATA_RANDOM:
584     mpn_random (ptr, size);
585     break;
586   case DATA_RANDOM2:
587     mpn_random2 (ptr, size);
588     break;
589   case DATA_ZEROS:
590     MPN_ZERO (ptr, size);
591     break;
592   case DATA_AAS:
593     MPN_FILL (ptr, size, GMP_NUMB_0xAA);
594     break;
595   case DATA_FFS:
596     MPN_FILL (ptr, size, GMP_NUMB_MAX);
597     break;
598   case DATA_2FD:
599     MPN_FILL (ptr, size, GMP_NUMB_MAX);
600     ptr[0] -= 2;
601     break;
602   default:
603     abort();
604     /*NOTREACHED*/
605   }
606 }
607 
608 /* The code here handling the various combinations of output options isn't
609    too attractive, but it works and is fairly clean.  */
610 
/* Divisor used to turn a time into a per-limb figure: n*n when
   option_square is 1, n*(n+1)/2 when it is 2, otherwise n.

   The products are formed in double rather than in the integer type of n,
   so that n*n cannot overflow a 32-bit mp_size_t (n > 46340).  Every use
   is in floating point arithmetic, and the results are exact for any
   value below 2^53.  */
#define SIZE_TO_DIVISOR(n)                                              \
  (option_square == 1 ? (double) (n) * (double) (n)                     \
  : option_square == 2 ? (double) (n) * ((double) (n) + 1.0) / 2.0      \
  : (double) (n))
615 
616 void
run_one(FILE * fp,struct speed_params * s,mp_size_t prev_size)617 run_one (FILE *fp, struct speed_params *s, mp_size_t prev_size)
618 {
619   const char  *first_open_fastest, *first_open_notfastest, *first_close;
620   int         i, fastest, want_data;
621   double      fastest_time;
622   TMP_DECL;
623 
624   TMP_MARK;
625 
626   /* allocate data, unless all routines are NODATA */
627   want_data = 0;
628   for (i = 0; i < num_choices; i++)
629     want_data |= ((choice[i].p->flag & FLAG_NODATA) == 0);
630 
631   if (want_data)
632     {
633       SPEED_TMP_ALLOC_LIMBS (sp.xp, s->size, s->align_xp);
634       SPEED_TMP_ALLOC_LIMBS (sp.yp, s->size, s->align_yp);
635 
636       data_fill (s->xp, s->size);
637       data_fill (s->yp, s->size);
638     }
639   else
640     {
641       sp.xp = NULL;
642       sp.yp = NULL;
643     }
644 
645   if (prev_size == -1 && option_cmp == CMP_DIFFPREV)
646     {
647       first_open_fastest = "(#";
648       first_open_notfastest = " (";
649       first_close = ")";
650     }
651   else
652     {
653       first_open_fastest = "#";
654       first_open_notfastest = " ";
655       first_close = "";
656     }
657 
658   fastest = -1;
659   fastest_time = -1.0;
660   for (i = 0; i < num_choices; i++)
661     {
662       s->r = choice[i].r;
663       choice[i].time = speed_measure (choice[i].p->fun, s);
664       choice[i].no_time = (choice[i].time == -1.0);
665       if (! choice[i].no_time)
666         choice[i].time *= choice[i].scale;
667 
668       /* Apply the effect of CMP_DIFFPREV, but the new choice[i].prev_time
669          is before any differences.  */
670       {
671         double     t;
672         t = choice[i].time;
673         if (t != -1.0 && option_cmp == CMP_DIFFPREV && prev_size != -1)
674           {
675             if (choice[i].prev_time == -1.0)
676               choice[i].no_time = 1;
677             else
678               choice[i].time = choice[i].time - choice[i].prev_time;
679           }
680         choice[i].prev_time = t;
681       }
682 
683       if (choice[i].no_time)
684         continue;
685 
686       /* Look for the fastest after CMP_DIFFPREV has been applied, but
687          before CMP_RATIO or CMP_DIFFERENCE.  There's only a fastest shown
688          if there's more than one routine.  */
689       if (num_choices > 1 && (fastest == -1 || choice[i].time < fastest_time))
690         {
691           fastest = i;
692           fastest_time = choice[i].time;
693         }
694 
695       if (option_cmp == CMP_DIFFPREV)
696         {
697           /* Conversion for UNIT_CYCLESPERLIMB differs in CMP_DIFFPREV. */
698           if (option_unit == UNIT_CYCLES)
699             choice[i].time /= speed_cycletime;
700           else if (option_unit == UNIT_CYCLESPERLIMB)
701             {
702               if (prev_size == -1)
703                 choice[i].time /= speed_cycletime;
704               else
705                 choice[i].time /=  (speed_cycletime
706                                     * (SIZE_TO_DIVISOR(s->size)
707                                        - SIZE_TO_DIVISOR(prev_size)));
708             }
709         }
710       else
711         {
712           if (option_unit == UNIT_CYCLES)
713             choice[i].time /= speed_cycletime;
714           else if (option_unit == UNIT_CYCLESPERLIMB)
715             choice[i].time /= (speed_cycletime * SIZE_TO_DIVISOR(s->size));
716 
717           if (option_cmp == CMP_RATIO && i > 0)
718             {
719               /* A ratio isn't affected by the units chosen. */
720               if (choice[0].no_time || choice[0].time == 0.0)
721                 choice[i].no_time = 1;
722               else
723                 choice[i].time /= choice[0].time;
724             }
725           else if (option_cmp == CMP_DIFFERENCE && i > 0)
726             {
727               if (choice[0].no_time)
728                 {
729                   choice[i].no_time = 1;
730                   continue;
731                 }
732               choice[i].time -= choice[0].time;
733             }
734         }
735     }
736 
737   if (option_gnuplot)
738     {
739       /* In CMP_DIFFPREV, don't print anything for the first size, start
740          with the second where an actual difference is available.
741 
742          In CMP_RATIO, print the first column as 1.0.
743 
744          The 9 decimals printed is much more than the expected precision of
745          the measurements actually. */
746 
747       if (! (option_cmp == CMP_DIFFPREV && prev_size == -1))
748         {
749           fprintf (fp, "%-6ld ", s->size);
750           for (i = 0; i < num_choices; i++)
751             fprintf (fp, "  %.9e",
752                      choice[i].no_time ? 0.0
753                      : (option_cmp == CMP_RATIO && i == 0) ? 1.0
754                      : choice[i].time);
755           fprintf (fp, "\n");
756         }
757     }
758   else
759     {
760       fprintf (fp, "%-6ld ", s->size);
761       for (i = 0; i < num_choices; i++)
762         {
763           char  buf[128];
764           int   decimals;
765 
766           if (choice[i].no_time)
767             {
768               fprintf (fp, " %*s", COLUMN_WIDTH, "n/a");
769             }
770           else
771             {if (option_unit == UNIT_CYCLESPERLIMB
772                  || (option_cmp == CMP_RATIO && i > 0))
773                 decimals = 4;
774               else if (option_unit == UNIT_CYCLES)
775                 decimals = 2;
776               else
777                 decimals = 9;
778 
779               sprintf (buf, "%s%.*f%s",
780                        i == fastest ? first_open_fastest : first_open_notfastest,
781                        decimals, choice[i].time, first_close);
782               fprintf (fp, " %*s", COLUMN_WIDTH, buf);
783             }
784         }
785       fprintf (fp, "\n");
786     }
787 
788   TMP_FREE;
789 }
790 
791 void
run_all(FILE * fp)792 run_all (FILE *fp)
793 {
794   mp_size_t  prev_size;
795   int        i;
796   TMP_DECL;
797 
798   TMP_MARK;
799   SPEED_TMP_ALLOC_LIMBS (sp.xp_block, SPEED_BLOCK_SIZE, sp.align_xp);
800   SPEED_TMP_ALLOC_LIMBS (sp.yp_block, SPEED_BLOCK_SIZE, sp.align_yp);
801 
802   data_fill (sp.xp_block, SPEED_BLOCK_SIZE);
803   data_fill (sp.yp_block, SPEED_BLOCK_SIZE);
804 
805   for (i = 0; i < size_num; i++)
806     {
807       sp.size = size_array[i].start;
808       prev_size = -1;
809       for (;;)
810         {
811           mp_size_t  step;
812 
813           if (option_data == DATA_2FD && sp.size >= 2)
814             sp.xp[sp.size-1] = 2;
815 
816           run_one (fp, &sp, prev_size);
817           prev_size = sp.size;
818 
819           if (option_data == DATA_2FD && sp.size >= 2)
820             sp.xp[sp.size-1] = MP_LIMB_T_MAX;
821 
822           if (option_factor != 0.0)
823             {
824               step = (mp_size_t) (sp.size * option_factor - sp.size);
825               if (step < 1)
826                 step = 1;
827             }
828           else
829             step = 1;
830           if (step < option_step)
831             step = option_step;
832 
833           sp.size += step;
834           if (sp.size > size_array[i].end)
835             break;
836         }
837     }
838 
839   TMP_FREE;
840 }
841 
842 
/* Open FILENAME for writing, truncating or creating it.  Never returns NULL:
   if the file cannot be opened a message is printed to stderr and the
   program exits with status 1.  */
FILE *
fopen_for_write (const char *filename)
{
  FILE  *stream = fopen (filename, "w");

  if (stream == NULL)
    {
      fprintf (stderr, "Cannot create %s\n", filename);
      exit (1);
    }

  return stream;
}
854 
/* Close FP, which was being written as FILENAME.  If the stream's error
   indicator was set, or the close itself fails, print a message to stderr
   and exit with status 1.  */
void
fclose_written (FILE *fp, const char *filename)
{
  /* The error indicator must be sampled before the stream is closed.  */
  const int  write_failed = ferror (fp);
  const int  close_failed = fclose (fp);

  if (write_failed == 0 && close_failed == 0)
    return;

  fprintf (stderr, "Error writing %s\n", filename);
  exit (1);
}
869 
870 
871 void
run_gnuplot(int argc,char * argv[])872 run_gnuplot (int argc, char *argv[])
873 {
874   char  *plot_filename;
875   char  *data_filename;
876   FILE  *fp;
877   int   i;
878 
879   plot_filename = (char *) (*__gmp_allocate_func)
880     (strlen (option_gnuplot_basename) + 20);
881   data_filename = (char *) (*__gmp_allocate_func)
882     (strlen (option_gnuplot_basename) + 20);
883 
884   sprintf (plot_filename, "%s.gnuplot", option_gnuplot_basename);
885   sprintf (data_filename, "%s.data",    option_gnuplot_basename);
886 
887   fp = fopen_for_write (plot_filename);
888 
889   fprintf (fp, "# Generated with:\n");
890   fprintf (fp, "#");
891   for (i = 0; i < argc; i++)
892     fprintf (fp, " %s", argv[i]);
893   fprintf (fp, "\n");
894   fprintf (fp, "\n");
895 
896   fprintf (fp, "reset\n");
897 
898   /* Putting the key at the top left is usually good, and you can change it
899      interactively if it's not. */
900   fprintf (fp, "set key left\n");
901 
902   /* write underscores, not subscripts */
903   fprintf (fp, "set termoption noenhanced\n");
904 
905   /* designed to make it possible to see crossovers easily */
906   fprintf (fp, "set style data lines\n");
907 
908   fprintf (fp, "plot ");
909   for (i = 0; i < num_choices; i++)
910     {
911       fprintf (fp, " \"%s\" using 1:%d", data_filename, i+2);
912       fprintf (fp, " title \"%s\"", choice[i].name);
913 
914       if (i != num_choices-1)
915         fprintf (fp, ", \\");
916       fprintf (fp, "\n");
917     }
918 
919   fprintf (fp, "load \"-\"\n");
920   fclose_written (fp, plot_filename);
921 
922   fp = fopen_for_write (data_filename);
923 
924   /* Unbuffered so you can see where the program was up to if it crashes or
925      you kill it. */
926   setbuf (fp, NULL);
927 
928   run_all (fp);
929   fclose_written (fp, data_filename);
930 }
931 
932 
/* LIMB_ONES(n) is a limb whose n least significant bits are ones and whose
   remaining bits are zeros.  n == GMP_LIMB_BITS is treated specially since
   a shift by the full width of the type is not allowed in C.  n must be in
   the range 0 to GMP_LIMB_BITS inclusive.  */

#define LIMB_ONES(n)                            \
  ((n) == 0 ? CNST_LIMB(0)                      \
   : (n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX       \
   : (CNST_LIMB(1) << (n)) - 1)
939 
940 mp_limb_t
r_string(const char * s)941 r_string (const char *s)
942 {
943   const char  *s_orig = s;
944   long        n;
945 
946   if (strcmp (s, "aas") == 0)
947     return GMP_NUMB_0xAA;
948 
949   {
950     mpz_t      z;
951     mp_limb_t  l;
952     int        set, siz;
953 
954     mpz_init (z);
955     set = mpz_set_str (z, s, 0);
956     siz = SIZ(z);
957     l = (siz == 0 ? 0 : siz > 0 ? PTR(z)[0] : -PTR(z)[0]);
958     mpz_clear (z);
959     if (set == 0)
960       {
961         if (siz > 1 || siz < -1)
962           printf ("Warning, r parameter %s truncated to %d bits\n",
963                   s_orig, GMP_LIMB_BITS);
964         return l;
965       }
966   }
967 
968   if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X'))
969     n = strtoul (s+2, (char **) &s, 16);
970   else
971     n = strtol (s, (char **) &s, 10);
972 
973   if (strcmp (s, "bits") == 0)
974     {
975       mp_limb_t  l;
976       if (n > GMP_LIMB_BITS)
977         {
978           fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
979                    n, GMP_LIMB_BITS);
980           exit (1);
981         }
982       mpn_random (&l, 1);
983       return (l | (CNST_LIMB(1) << (n-1))) & LIMB_ONES(n);
984     }
985   else  if (strcmp (s, "ones") == 0)
986     {
987       if (n > GMP_LIMB_BITS)
988         {
989           fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
990                    n, GMP_LIMB_BITS);
991           exit (1);
992         }
993       return LIMB_ONES (n);
994     }
995   else if (*s != '\0')
996     {
997       fprintf (stderr, "invalid r parameter: %s\n", s_orig);
998       exit (1);
999     }
1000 
1001   return n;
1002 }
1003 
1004 
1005 void
routine_find(struct choice_t * c,const char * s_orig)1006 routine_find (struct choice_t *c, const char *s_orig)
1007 {
1008   const char  *s;
1009   int     i;
1010   size_t  nlen;
1011 
1012   c->name = s_orig;
1013   s = strchr (s_orig, '*');
1014   if (s != NULL)
1015     {
1016       c->scale = atof(s_orig);
1017       s++;
1018     }
1019   else
1020     {
1021       c->scale = 1.0;
1022       s = s_orig;
1023     }
1024 
1025   for (i = 0; i < numberof (routine); i++)
1026     {
1027       nlen = strlen (routine[i].name);
1028       if (memcmp (s, routine[i].name, nlen) != 0)
1029         continue;
1030 
1031       if (s[nlen] == '.')
1032         {
1033           /* match, with a .r parameter */
1034 
1035           if (! (routine[i].flag & (FLAG_R|FLAG_R_OPTIONAL)))
1036             {
1037               fprintf (stderr,
1038                        "Choice %s bad: doesn't take a \".<r>\" parameter\n",
1039                        s_orig);
1040               exit (1);
1041             }
1042 
1043           c->p = &routine[i];
1044           c->r = r_string (s + nlen + 1);
1045           return;
1046         }
1047 
1048       if (s[nlen] == '\0')
1049         {
1050           /* match, with no parameter */
1051 
1052           if (routine[i].flag & FLAG_R)
1053             {
1054               fprintf (stderr,
1055                        "Choice %s bad: needs a \".<r>\" parameter\n",
1056                        s_orig);
1057               exit (1);
1058             }
1059 
1060           c->p = &routine[i];
1061           c->r = 0;
1062           return;
1063         }
1064     }
1065 
1066   fprintf (stderr, "Choice %s unrecognised\n", s_orig);
1067   exit (1);
1068 }
1069 
1070 
1071 void
usage(void)1072 usage (void)
1073 {
1074   int  i;
1075 
1076   speed_time_init ();
1077 
1078   printf ("Usage: speed [-options] -s size <routine>...\n");
1079   printf ("Measure the speed of some routines.\n");
1080   printf ("Times are in seconds, accuracy is shown.\n");
1081   printf ("\n");
1082   printf ("   -p num     set precision as number of time units each routine must run\n");
1083   printf ("   -s size[-end][,size[-end]]...   sizes to measure\n");
1084   printf ("              single sizes or ranges, sep with comma or use multiple -s\n");
1085   printf ("   -t step    step through sizes by given amount\n");
1086   printf ("   -f factor  step through sizes by given factor (eg. 1.05)\n");
1087   printf ("   -r         show times as ratios of the first routine\n");
1088   printf ("   -d         show times as difference from the first routine\n");
1089   printf ("   -D         show times as difference from previous size shown\n");
1090   printf ("   -c         show times in CPU cycles\n");
1091   printf ("   -C         show times in cycles per limb\n");
1092   printf ("   -u         print resource usage (memory) at end\n");
1093   printf ("   -P name    output plot files \"name.gnuplot\" and \"name.data\"\n");
1094   printf ("   -a <type>  use given data: random(default), random2, zeros, aas, ffs, 2fd\n");
1095   printf ("   -x, -y, -w, -W <align>  specify data alignments, sources and dests\n");
1096   printf ("   -o addrs   print addresses of data blocks\n");
1097   printf ("\n");
1098   printf ("If both -t and -f are used, it means step by the factor or the step, whichever\n");
1099   printf ("is greater.\n");
1100   printf ("If both -C and -D are used, it means cycles per however many limbs between a\n");
1101   printf ("size and the previous size.\n");
1102   printf ("\n");
1103   printf ("After running with -P, plots can be viewed with Gnuplot or Quickplot.\n");
1104   printf ("\"gnuplot name.gnuplot\" (use \"set logscale xy; replot\" at the prompt for\n");
1105   printf ("a log/log plot).\n");
1106   printf ("\"quickplot -s name.data\" (has interactive zooming, and note -s is important\n");
1107   printf ("when viewing more than one routine, it means same axis scales for all data).\n");
1108   printf ("\n");
1109   printf ("The available routines are as follows.\n");
1110   printf ("\n");
1111 
1112   for (i = 0; i < numberof (routine); i++)
1113     {
1114       if (routine[i].flag & FLAG_R)
1115         printf ("\t%s.r\n", routine[i].name);
1116       else if (routine[i].flag & FLAG_R_OPTIONAL)
1117         printf ("\t%s (optional .r)\n", routine[i].name);
1118       else
1119         printf ("\t%s\n", routine[i].name);
1120     }
1121   printf ("\n");
1122   printf ("Routines with a \".r\" need an extra parameter, for example mpn_lshift.6\n");
1123   printf ("r should be in decimal, or use 0xN for hexadecimal.\n");
1124   printf ("\n");
1125   printf ("Special forms for r are \"<N>bits\" for a random N bit number, \"<N>ones\" for\n");
1126   printf ("N one bits, or \"aas\" for 0xAA..AA.\n");
1127   printf ("\n");
1128   printf ("Times for sizes out of the range accepted by a routine are shown as 0.\n");
1129   printf ("The fastest routine at each size is marked with a # (free form output only).\n");
1130   printf ("\n");
1131   printf ("%s", speed_time_string);
1132   printf ("\n");
1133   printf ("Gnuplot home page http://www.gnuplot.info/\n");
1134   printf ("Quickplot home page http://quickplot.sourceforge.net/\n");
1135 }
1136 
1137 void
check_align_option(const char * name,mp_size_t align)1138 check_align_option (const char *name, mp_size_t align)
1139 {
1140   if (align < 0 || align > SPEED_TMP_ALLOC_ADJUST_MASK)
1141     {
1142       fprintf (stderr, "Alignment request out of range: %s %ld\n",
1143                name, (long) align);
1144       fprintf (stderr, "  should be 0 to %d (limbs), inclusive\n",
1145                SPEED_TMP_ALLOC_ADJUST_MASK);
1146       exit (1);
1147     }
1148 }
1149 
1150 int
main(int argc,char * argv[])1151 main (int argc, char *argv[])
1152 {
1153   int  i;
1154   int  opt;
1155 
1156   /* Unbuffered so output goes straight out when directed to a pipe or file
1157      and isn't lost on killing the program half way.  */
1158   setbuf (stdout, NULL);
1159 
1160   for (;;)
1161     {
1162       opt = getopt(argc, argv, "a:CcDdEFf:o:p:P:rRs:t:ux:y:w:W:z");
1163       if (opt == EOF)
1164         break;
1165 
1166       switch (opt) {
1167       case 'a':
1168         if (strcmp (optarg, "random") == 0)       option_data = DATA_RANDOM;
1169         else if (strcmp (optarg, "random2") == 0) option_data = DATA_RANDOM2;
1170         else if (strcmp (optarg, "zeros") == 0)   option_data = DATA_ZEROS;
1171         else if (strcmp (optarg, "aas") == 0)     option_data = DATA_AAS;
1172         else if (strcmp (optarg, "ffs") == 0)     option_data = DATA_FFS;
1173         else if (strcmp (optarg, "2fd") == 0)     option_data = DATA_2FD;
1174         else
1175           {
1176             fprintf (stderr, "unrecognised data option: %s\n", optarg);
1177             exit (1);
1178           }
1179         break;
1180       case 'C':
1181         if (option_unit  != UNIT_SECONDS) goto bad_unit;
1182         option_unit = UNIT_CYCLESPERLIMB;
1183         break;
1184       case 'c':
1185         if (option_unit != UNIT_SECONDS)
1186           {
1187           bad_unit:
1188             fprintf (stderr, "cannot use more than one of -c, -C\n");
1189             exit (1);
1190           }
1191         option_unit = UNIT_CYCLES;
1192         break;
1193       case 'D':
1194         if (option_cmp != CMP_ABSOLUTE) goto bad_cmp;
1195         option_cmp = CMP_DIFFPREV;
1196         break;
1197       case 'd':
1198         if (option_cmp != CMP_ABSOLUTE)
1199           {
1200           bad_cmp:
1201             fprintf (stderr, "cannot use more than one of -d, -D, -r\n");
1202             exit (1);
1203           }
1204         option_cmp = CMP_DIFFERENCE;
1205         break;
1206       case 'E':
1207         option_square = 1;
1208         break;
1209       case 'F':
1210         option_square = 2;
1211         break;
1212       case 'f':
1213         option_factor = atof (optarg);
1214         if (option_factor <= 1.0)
1215           {
1216             fprintf (stderr, "-f factor must be > 1.0\n");
1217             exit (1);
1218           }
1219         break;
1220       case 'o':
1221         speed_option_set (optarg);
1222         break;
1223       case 'P':
1224         option_gnuplot = 1;
1225         option_gnuplot_basename = optarg;
1226         break;
1227       case 'p':
1228         speed_precision = atoi (optarg);
1229         break;
1230       case 'R':
1231         option_seed = time (NULL);
1232         break;
1233       case 'r':
1234         if (option_cmp != CMP_ABSOLUTE)
1235           goto bad_cmp;
1236         option_cmp = CMP_RATIO;
1237         break;
1238       case 's':
1239         {
1240           char  *s;
1241           for (s = strtok (optarg, ","); s != NULL; s = strtok (NULL, ","))
1242             {
1243               if (size_num == size_allocnum)
1244                 {
1245                   size_array = (struct size_array_t *)
1246                     __gmp_allocate_or_reallocate
1247                     (size_array,
1248                      size_allocnum * sizeof(size_array[0]),
1249                      (size_allocnum+10) * sizeof(size_array[0]));
1250                   size_allocnum += 10;
1251                 }
1252               if (sscanf (s, "%ld-%ld",
1253                           &size_array[size_num].start,
1254                           &size_array[size_num].end) != 2)
1255                 {
1256                   size_array[size_num].start = size_array[size_num].end
1257                     = atol (s);
1258                 }
1259 
1260               if (size_array[size_num].start < 0
1261                   || size_array[size_num].end < 0
1262                   || size_array[size_num].start > size_array[size_num].end)
1263                 {
1264                   fprintf (stderr, "invalid size parameter: %s\n", s);
1265                   exit (1);
1266                 }
1267 
1268               size_num++;
1269             }
1270         }
1271         break;
1272       case 't':
1273         option_step = atol (optarg);
1274         if (option_step < 1)
1275           {
1276             fprintf (stderr, "-t step must be >= 1\n");
1277             exit (1);
1278           }
1279         break;
1280       case 'u':
1281         option_resource_usage = 1;
1282         break;
1283       case 'z':
1284         sp.cache = 1;
1285         break;
1286       case 'x':
1287         sp.align_xp = atol (optarg);
1288         check_align_option ("-x", sp.align_xp);
1289         break;
1290       case 'y':
1291         sp.align_yp = atol (optarg);
1292         check_align_option ("-y", sp.align_yp);
1293         break;
1294       case 'w':
1295         sp.align_wp = atol (optarg);
1296         check_align_option ("-w", sp.align_wp);
1297         break;
1298       case 'W':
1299         sp.align_wp2 = atol (optarg);
1300         check_align_option ("-W", sp.align_wp2);
1301         break;
1302       case '?':
1303         exit(1);
1304       }
1305     }
1306 
1307   if (optind >= argc)
1308     {
1309       usage ();
1310       exit (1);
1311     }
1312 
1313   if (size_num == 0)
1314     {
1315       fprintf (stderr, "-s <size> must be specified\n");
1316       exit (1);
1317     }
1318 
1319   gmp_randinit_default (__gmp_rands);
1320   __gmp_rands_initialized = 1;
1321   gmp_randseed_ui (__gmp_rands, option_seed);
1322 
1323   choice = (struct choice_t *) (*__gmp_allocate_func)
1324     ((argc - optind) * sizeof(choice[0]));
1325   for ( ; optind < argc; optind++)
1326     {
1327       struct choice_t  c;
1328       routine_find (&c, argv[optind]);
1329       choice[num_choices] = c;
1330       num_choices++;
1331     }
1332 
1333   if ((option_cmp == CMP_RATIO || option_cmp == CMP_DIFFERENCE) &&
1334       num_choices < 2)
1335     {
1336       fprintf (stderr, "WARNING, -d or -r does nothing when only one routine requested\n");
1337     }
1338 
1339   speed_time_init ();
1340   if (option_unit == UNIT_CYCLES || option_unit == UNIT_CYCLESPERLIMB)
1341     speed_cycletime_need_cycles ();
1342   else
1343     speed_cycletime_need_seconds ();
1344 
1345   if (option_gnuplot)
1346     {
1347       run_gnuplot (argc, argv);
1348     }
1349   else
1350     {
1351       if (option_unit == UNIT_SECONDS)
1352         printf ("overhead %.9f secs", speed_measure (speed_noop, NULL));
1353       else
1354         printf ("overhead %.2f cycles",
1355                 speed_measure (speed_noop, NULL) / speed_cycletime);
1356       printf (", precision %d units of %.2e secs",
1357               speed_precision, speed_unittime);
1358 
1359       if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
1360         printf (", CPU freq unknown\n");
1361       else
1362         printf (", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
1363 
1364       printf ("       ");
1365       for (i = 0; i < num_choices; i++)
1366         printf (" %*s", COLUMN_WIDTH, choice[i].name);
1367       printf ("\n");
1368 
1369       run_all (stdout);
1370     }
1371 
1372   if (option_resource_usage)
1373     {
1374 #if HAVE_GETRUSAGE
1375       {
1376         /* This doesn't give data sizes on linux 2.0.x, only utime. */
1377         struct rusage  r;
1378         if (getrusage (RUSAGE_SELF, &r) != 0)
1379           perror ("getrusage");
1380         else
1381           printf ("getrusage(): utime %ld.%06ld data %ld stack %ld maxresident %ld\n",
1382                   (long) r.ru_utime.tv_sec, (long) r.ru_utime.tv_usec,
1383                   r.ru_idrss, r.ru_isrss, r.ru_ixrss);
1384       }
1385 #else
1386       printf ("getrusage() not available\n");
1387 #endif
1388 
1389       /* Linux kernel. */
1390       {
1391         char  buf[128];
1392         sprintf (buf, "/proc/%d/status", getpid());
1393         if (access (buf, R_OK) == 0)
1394           {
1395             sprintf (buf, "cat /proc/%d/status", getpid());
1396             system (buf);
1397           }
1398 
1399       }
1400     }
1401 
1402   return 0;
1403 }
1404